Index: projects/iosched/cddl/contrib/opensolaris/cmd/sgs/tools/common/sgsmsg.c =================================================================== --- projects/iosched/cddl/contrib/opensolaris/cmd/sgs/tools/common/sgsmsg.c (revision 287721) +++ projects/iosched/cddl/contrib/opensolaris/cmd/sgs/tools/common/sgsmsg.c (revision 287722) @@ -1,1210 +1,1212 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * sgsmsg generates several message files from an input template file. Messages * are constructed for use with gettext(3i) - the default - or catgets(3c). The * files generate are: * * msg.h a header file containing definitions for each message. The -h * option triggers the creation of these definitions and specifies * the name to use. * * msg.c a data array of message strings. The msg.h definitions are * offsets into this array. The -d option triggers the creation of * these definitions and specifies the name to use. * * messages a message file suitable for catgets(3c) or gettext(3i) use. The * -m option triggers this output and specifies the filename to be * used. * * The template file is processed based on the first character of each line: * * # or $ entries are copied (as is) to the message file (messages). * * @ token(s) entries are translated. Two translations are possible dependent * on whether one or more tokens are supplied: * * A single token is interpreted as one of two reserved message * output indicators, or a message identifier. The reserved output * indicator _START_ enables output to the message file - Note that * the occurance of any other @ token will also enable message * output. The reserved output indicator _END_ disables output to * the message file. The use of these two indicators provides for * only those message strings that require translation to be output * to the message file. * * Besides the reserved output indicators, a single token is taken * to be a message identifier which will be subsituted for a * `setid' for catgets(3c) output, or a `domain' name for * gettext(3i) output. This value is determine by substituting the * token for the associated definition found in the message * identifier file (specified with the -i option). * * Multiple tokens are taken to be a message definition followed by * the associated message string. The message string is copied to * the data array being built in msg.c. The index into this array * becomes the `message' identifier created in the msg.h file. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include <_string_table.h> /* * Define any error message strings. */ static const char * Errmsg_malt = "sgsmsg: file %s: line %d: malformed input " "at line\n", * Errmsg_nmem = "sgsmsg: memory allocation failed: %s\n", * Errmsg_opne = "sgsmsg: file %s: open failed: %s\n", * Errmsg_wrte = "sgsmsg: file %s: write failed: %s\n", * Errmsg_read = "sgsmsg: file %s: read failed %s\n", * Errmsg_stnw = "sgsmsg: st_new(): failed: %s\n", * Errmsg_stin = "sgsmsg: Str_tbl insert failed: %s\n", * Errmsg_mnfn = "sgsmsg: message not found in Str_tbl: %s\n", * Errmsg_use = "usage: sgsmsg [-clv] [-d mesgdata] [-h mesgdefs] " "[-m messages] [-n name] [-i mesgident] file ...\n"; /* * Define all output filenames and associated descriptors. */ static FILE *fddefs, *fddata, *fdmsgs, *fdmids, *fddesc; static char *fldefs, *fldata, *flmsgs, *flmids, *fldesc; static FILE *fdlint; static char fllint[MAXPATHLEN]; static uint_t vflag; /* verbose flag */ static Str_tbl *stp; /* string table */ /* * Define any default strings. */ static const char *nmlint = "/tmp/sgsmsg.lint", *interface = "sgs_msg", *start = "_START_", *end = "_END_"; /* * Define any default flags and data items. */ static int cflag = 0, lflag = 0, prtmsgs = 0, line, ptr = 1, msgid = 0; static char *mesgid = 0, *setid = 0, *domain = 0; typedef struct msg_string { char *ms_defn; char *ms_message; struct msg_string *ms_next; } msg_string; static msg_string *msg_head; static msg_string *msg_tail; +int aok; + /* * message_append() is responsible for both inserting strings into * the master Str_tbl as well as maintaining a list of the * DEFINITIONS associated with each string. * * The list of strings is traversed at the end once the full * Str_tbl has been constructed - and string offsets can be * assigned. */ static void message_append(const char *defn, const char *message) { msg_string *msg; if ((msg = calloc(sizeof (msg_string), 1)) == 0) { (void) fprintf(stderr, Errmsg_nmem, strerror(errno)); exit(1); } /* * Initialize the string table. */ if ((stp == 0) && ((stp = st_new(FLG_STNEW_COMPRESS)) == NULL)) { (void) fprintf(stderr, Errmsg_stnw, strerror(errno)); exit(1); } if ((msg->ms_defn = strdup(defn)) == 0) { (void) fprintf(stderr, Errmsg_nmem, strerror(errno)); exit(1); } if ((msg->ms_message = strdup(message)) == 0) { (void) fprintf(stderr, Errmsg_nmem, strerror(errno)); exit(1); } if (st_insert(stp, msg->ms_message) == -1) { (void) fprintf(stderr, Errmsg_stin, message); exit(1); } if (msg_head == 0) { msg_head = msg_tail = msg; return; } msg_tail->ms_next = msg; msg_tail = msg; } /* * Initialize a setid value. Given a setid definition determine its numeric * value from the specified message identifier file (specified with the -i * option). Return a pointer to the numeric string. */ static int getmesgid(char *id) { char *buffer, *token, *_mesgid = 0, *_setid = 0, *_domain = 0; /* * If we're being asked to interpret a message id but the user didn't * provide the required message identifier file (-i option) we're in * trouble. */ if (flmids == 0) { (void) fprintf(stderr, "sgsmsg: file %s: line %d: mesgid %s: " "unable to process mesgid\n\t" "no message identifier file specified " "(see -i option)\n", fldesc, line, id); return (1); } if ((buffer = malloc(LINE_MAX)) == 0) { (void) fprintf(stderr, Errmsg_nmem, strerror(errno)); return (1); } /* * Read the message identifier file and locate the required mesgid. */ rewind(fdmids); while (fgets(buffer, LINE_MAX, fdmids) != NULL) { if ((token = strstr(buffer, id)) == NULL) continue; /* * Establish individual strings for the mesgid, setid and domain * values. */ _mesgid = token; while (!(isspace(*token))) token++; *token++ = 0; while (isspace(*token)) token++; _setid = token; while (!(isspace(*token))) token++; *token++ = 0; while (isspace(*token)) token++; _domain = token; while (!(isspace(*token))) token++; *token = 0; break; } /* * Did we find a match? */ if ((_mesgid == 0) || (_setid == 0) || (_domain == 0)) { (void) fprintf(stderr, "sgsmsg: file %s: line %d: mesgid %s: " "unable to process mesgid\n\t" "identifier does not exist in file %s\n", fldesc, line, id, flmids); return (1); } /* * Have we been here before? */ if (mesgid) { if (cflag == 1) { /* * If we're being asked to process more than one mesgid * warn the user that only one mesgid can be used for * the catgets(3c) call. */ (void) fprintf(stderr, "sgsmsg: file %s: line %d: " "setid %s: warning: multiple mesgids " "encountered\n\t" "last setting used in messaging code\n", fldesc, line, id); } } mesgid = _mesgid; setid = _setid; domain = _domain; /* * Generate the message file output (insure output flag is enabled). */ if (prtmsgs != -1) prtmsgs = 1; if (fdmsgs && (prtmsgs == 1)) { if (cflag == 1) { if (fprintf(fdmsgs, "$quote \"\n$set %s\n", setid) < 0) { (void) fprintf(stderr, Errmsg_wrte, flmsgs, strerror(errno)); return (1); } } else { if (fprintf(fdmsgs, "domain\t\"%s\"\n", domain) < 0) { (void) fprintf(stderr, Errmsg_wrte, flmsgs, strerror(errno)); return (1); } } } /* * For catgets(3c) output generate a setid definition in the message * definition file. */ if (fddefs && (cflag == 1) && (fprintf(fddefs, "#define\t%s\t%s\n\n", mesgid, setid) < 0)) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } return (0); } /* * Dump contents of String Table to standard out */ static void dump_stringtab(Str_tbl *stp) { uint_t cnt; if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) { (void) printf("string table full size: %ld: uncompressed\n", stp->st_fullstrsize); return; } (void) printf("string table full size: %ld compressed down to: %ld\n\n", stp->st_fullstrsize, stp->st_strsize); (void) printf("string table compression information [%d buckets]:\n", stp->st_hbckcnt); for (cnt = 0; cnt < stp->st_hbckcnt; cnt++) { Str_hash *sthash = stp->st_hashbcks[cnt]; if (sthash == 0) continue; (void) printf(" bucket: [%d]\n", cnt); while (sthash) { size_t stroff = sthash->hi_mstr->sm_strlen - sthash->hi_strlen; if (stroff == 0) { (void) printf(" [%ld]: '%s' \n", sthash->hi_refcnt, sthash->hi_mstr->sm_str); } else { (void) printf(" [%ld]: '%s' \n", sthash->hi_refcnt, &sthash->hi_mstr->sm_str[stroff], sthash->hi_mstr->sm_str); } sthash = sthash->hi_next; } } } /* * Initialize the message definition header file stream. */ static int init_defs(void) { static char guard[FILENAME_MAX + 6]; char *optr; const char *iptr, *_ptr; /* * Establish a header guard name using the files basename. */ for (iptr = 0, _ptr = fldefs; _ptr && (*_ptr != '\0'); _ptr++) { if (*_ptr == '/') iptr = _ptr + 1; } if (iptr == 0) iptr = fldefs; optr = guard; for (*optr++ = '_'; iptr && (*iptr != '\0'); iptr++, optr++) { if (*iptr == '.') { *optr++ = '_'; *optr++ = 'D'; *optr++ = 'O'; *optr++ = 'T'; *optr = '_'; } else *optr = toupper(*iptr); } if (fprintf(fddefs, "#ifndef\t%s\n#define\t%s\n\n", guard, guard) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } if (fprintf(fddefs, "#ifndef\t__lint\n\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } /* * add "typedef int Msg;" */ if (fprintf(fddefs, "typedef int\tMsg;\n\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } /* * If the associated data array is global define a prototype. * Define a macro to access the array elements. */ if (lflag == 0) { if (fprintf(fddefs, "extern\tconst char\t__%s[];\n\n", interface) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } } if (fprintf(fddefs, "#define\tMSG_ORIG(x)\t&__%s[x]\n\n", interface) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } /* * Generate a prototype to access the associated data array. */ if (fprintf(fddefs, "extern\tconst char *\t_%s(Msg);\n\n", interface) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } if (fprintf(fddefs, "#define\tMSG_INTL(x)\t_%s(x)\n\n", interface) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } return (0); } /* * Finish the message definition header file. */ static int fini_defs(void) { if (fprintf(fddefs, "\n#else\t/* __lint */\n\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } /* * When __lint is defined, Msg is a char *. This allows lint to * check our format strings against it's arguments. */ if (fprintf(fddefs, "\ntypedef char *\tMsg;\n\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } if (fprintf(fddefs, "extern\tconst char *\t_%s(Msg);\n\n", interface) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } if (lflag == 0) { if (fprintf(fddefs, "extern\tconst char\t__%s[];\n\n", interface) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } } if (fprintf(fddefs, "#define MSG_ORIG(x)\tx\n#define MSG_INTL(x)\tx\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } /* * Copy the temporary lint defs file into the new header. */ if (fdlint) { long size; char *buf; size = ftell(fdlint); (void) rewind(fdlint); if ((buf = malloc(size)) == 0) { (void) fprintf(stderr, Errmsg_nmem, strerror(errno)); return (1); } if (fread(buf, size, 1, fdlint) == 0) { (void) fprintf(stderr, Errmsg_read, fllint, strerror(errno)); return (1); } if (fwrite(buf, size, 1, fddefs) == 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } (void) free(buf); } if (fprintf(fddefs, "\n#endif\t/* __lint */\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } if (fprintf(fddefs, "\n#endif\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } return (0); } /* * The entire messaging file has been scanned - and all strings have been * inserted into the string_table. We can now walk the message queue * and create the '#define ' for each string - with the strings * assigned offset into the string_table. */ static int output_defs(void) { msg_string *msg; size_t stbufsize; char *stbuf; stbufsize = st_getstrtab_sz(stp); if ((stbuf = malloc(stbufsize)) == 0) { (void) fprintf(stderr, Errmsg_nmem, strerror(errno)); exit(1); } (void) st_setstrbuf(stp, stbuf, stbufsize); for (msg = msg_head; msg; msg = msg->ms_next) { size_t stoff; if ((st_setstring(stp, msg->ms_message, &stoff)) == -1) { (void) fprintf(stderr, Errmsg_mnfn, msg->ms_message); return (1); } if (fprintf(fddefs, "\n#define\t%s\t%ld\n", msg->ms_defn, stoff) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } if (fddefs && fprintf(fddefs, "#define\t%s_SIZE\t%d\n", msg->ms_defn, strlen(msg->ms_message)) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldefs, strerror(errno)); return (1); } } return (0); } /* * Finish off the data structure definition. */ static int output_data(void) { size_t stbufsize; size_t ndx; size_t column = 1; const char *stbuf; const char *fmtstr; stbufsize = st_getstrtab_sz(stp); stbuf = st_getstrbuf(stp); assert(stbuf); /* * Determine from the local flag whether the data declaration should * be static. */ if (lflag) fmtstr = (const char *)"static const"; else fmtstr = (const char *)"const"; if (fprintf(fddata, "\n%s char __%s[%ld] = { ", fmtstr, interface, stbufsize) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldata, strerror(errno)); return (1); } for (ndx = 0; ndx < (stbufsize - 1); ndx++) { if (column == 1) { if (fddata && fprintf(fddata, "\n/* %4ld */ 0x%.2x,", ndx, (unsigned char)stbuf[ndx]) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldata, strerror(errno)); return (1); } } else { if (fddata && fprintf(fddata, " 0x%.2x,", (unsigned char)stbuf[ndx]) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldata, strerror(errno)); return (1); } } if (column++ == 10) column = 1; } if (column == 1) fmtstr = "\n\t0x%.2x };\n"; else fmtstr = " 0x%.2x };\n"; if (fprintf(fddata, fmtstr, (unsigned char)stbuf[stbufsize - 1]) < 0) { (void) fprintf(stderr, Errmsg_wrte, fldata, strerror(errno)); return (1); } return (0); } static int file() { char buffer[LINE_MAX], * token; uint_t bufsize; char *token_buffer; int escape = 0; if ((token_buffer = malloc(LINE_MAX)) == 0) { (void) fprintf(stderr, Errmsg_nmem, strerror(errno)); return (1); } bufsize = LINE_MAX; line = 1; while ((token = fgets(buffer, LINE_MAX, fddesc)) != NULL) { char defn[PATH_MAX], * _defn, * str; int len; switch (*token) { case '#': case '$': if (escape) { (void) fprintf(stderr, Errmsg_malt, fldesc, line); return (1); } /* * If a msgid has been output a msgstr must follow * before we digest the new token. A msgid is only set * if fdmsgs is in use. */ if (msgid) { msgid = 0; if (fprintf(fdmsgs, "msgstr\t\"\"\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, flmsgs, strerror(errno)); return (1); } } /* * Pass lines directly through to the output message * file. */ if (fdmsgs && (prtmsgs == 1)) { char comment; if (cflag == 0) comment = '#'; else comment = '$'; if (fprintf(fdmsgs, "%c%s", comment, ++token) < 0) { (void) fprintf(stderr, Errmsg_wrte, flmsgs, strerror(errno)); return (1); } } break; case '@': if (escape) { (void) fprintf(stderr, Errmsg_malt, fldesc, line); return (1); } /* * If a msgid has been output a msgstr must follow * before we digest the new token. */ if (msgid) { msgid = 0; if (fprintf(fdmsgs, "msgstr\t\"\"\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, flmsgs, strerror(errno)); return (1); } } /* * Determine whether we have one or more tokens. */ token++; while (isspace(*token)) /* rid any whitespace */ token++; _defn = token; /* definition start */ while (!(isspace(*token))) token++; *token++ = 0; while (isspace(*token)) /* rid any whitespace */ token++; /* * Determine whether the single token is one of the * reserved message output delimiters otherwise * translate it as a message identifier. */ if (*token == 0) { if (strcmp(_defn, start) == 0) prtmsgs = 1; else if (strcmp(_defn, end) == 0) prtmsgs = -1; else if (getmesgid(_defn) == 1) return (1); break; } /* * Multiple tokens are translated by taking the first * token as the message definition, and the rest of the * line as the message itself. A message line ending * with an escape ('\') is expected to be continued on * the next line. */ if (prtmsgs != -1) prtmsgs = 1; if (fdmsgs && (prtmsgs == 1)) { /* * For catgets(3c) make sure a message * identifier has been established (this is * normally a domain for gettext(3i), but for * sgsmsg use this could be argued as being * redundent). Also make sure that the message * definitions haven't exceeeded the maximum * value allowed by gencat(1) before generating * any message file entries. */ if (cflag == 1) { if (setid == 0) { (void) fprintf(stderr, "file " "%s: no message identifier " "has been established\n", fldesc); return (1); } if (ptr > NL_MSGMAX) { (void) fprintf(stderr, "file " "%s: message definition " "(%d) exceeds allowable " "limit (NL_MSGMAX)\n", fldesc, ptr); return (1); } } /* * For catgets(3c) write the definition and the * message string to the message file. For * gettext(3i) write the message string as a * mesgid - indicate a mesgid has been output * so that a msgstr can follow. */ if (cflag == 1) { if (fprintf(fdmsgs, "%d\t%s", ptr, token) < 0) { (void) fprintf(stderr, Errmsg_wrte, flmsgs, strerror(errno)); return (1); } } else { if (fprintf(fdmsgs, "msgid\t\"") < 0) { (void) fprintf(stderr, Errmsg_wrte, flmsgs, strerror(errno)); return (1); } msgid = 1; } } /* * The message itself is a quoted string as this makes * embedding spaces at the start (or the end) of the * string very easy. */ if (*token != '"') { (void) fprintf(stderr, Errmsg_malt, fldesc, line); return (1); } (void) strcpy(defn, _defn); /* * Write the tag to the lint definitions. */ if (fdlint) { if (fprintf(fdlint, "\n#define\t%s\t", _defn) < 0) { (void) fprintf(stderr, Errmsg_wrte, fllint, strerror(errno)); return (1); } } len = 0; /* * Write each character of the message string to the * data array. Translate any escaped characters - use * the same specially recognized characters as defined * by gencat(1). */ message: if (*token == '"') { if (fdlint && (fprintf(fdlint, "%c", *token) < 0)) { (void) fprintf(stderr, Errmsg_wrte, fllint, strerror(errno)); return (1); } token++; } while (*token) { char _token; if ((*token == '\\') && (escape == 0)) { escape = 1; if (fdlint && (*(token + 1) != '\n') && fprintf(fdlint, "%c", *token) < 0) { (void) fprintf(stderr, Errmsg_wrte, fllint, strerror(errno)); return (1); } token++; continue; } if (escape) { if (*token == 'n') _token = '\n'; else if (*token == 't') _token = '\t'; else if (*token == 'v') _token = '\v'; else if (*token == 'b') _token = '\b'; else if (*token == 'f') _token = '\f'; else if (*token == '\\') _token = '\\'; else if (*token == '"') _token = '"'; else if (*token == '\n') break; else _token = *token; if (fdmsgs && (prtmsgs == 1) && (fprintf(fdmsgs, "\\") < 0)) { (void) fprintf(stderr, Errmsg_wrte, flmsgs, strerror(errno)); return (1); } } else { /* * If this is the trailing quote then * thats the last of the message string. * Eat up any remaining white space and * unless an escape character is found * terminate the data string with a 0. */ /* BEGIN CSTYLED */ if (*token == '"') { if (fdlint && (fprintf(fdlint, "%c", *token) < 0)) { (void) fprintf(stderr, Errmsg_wrte, fllint, strerror(errno)); return (1); } if (fdmsgs && (prtmsgs == 1) && (fprintf(fdmsgs, "%c", *token) < 0)) { (void) fprintf(stderr, Errmsg_wrte, flmsgs, strerror(errno)); return (1); } while (*++token) { if (*token == '\n') break; } _token = '\0'; } else _token = *token; /* END CSTYLED */ } if (fdmsgs && (prtmsgs == 1) && (fprintf(fdmsgs, "%c", *token) < 0)) { (void) fprintf(stderr, Errmsg_wrte, flmsgs, strerror(errno)); return (1); } if (fdlint && fprintf(fdlint, "%c", *token) < 0) { (void) fprintf(stderr, Errmsg_wrte, fllint, strerror(errno)); return (1); } if (len >= bufsize) { bufsize += LINE_MAX; if ((token_buffer = realloc( token_buffer, bufsize)) == 0) { (void) fprintf(stderr, Errmsg_nmem, strerror(errno)); return (1); } } token_buffer[len] = _token; ptr++, token++, len++; escape = 0; if (_token == '\0') break; } /* * After the complete message string has been processed * (including its continuation beyond one line), create * a string size definition. */ if (escape == 0) { const char *form = "#define\t%s_SIZE\t%d\n"; token_buffer[len] = '\0'; message_append(defn, token_buffer); if (fdlint && fprintf(fdlint, form, defn, (len - 1)) < 0) { (void) fprintf(stderr, Errmsg_wrte, fllint, strerror(errno)); return (1); } } break; default: /* * Empty lines are passed through to the message file. */ while (isspace(*token)) token++; if (*token == 0) { if (msgid || (fdmsgs && (prtmsgs == 1))) { /* * If a msgid has been output a msgstr * must follow before we digest the new * token. */ if (msgid) { msgid = 0; str = "msgstr\t\"\"\n\n"; } else str = "\n"; if (fprintf(fdmsgs, str) < 0) { (void) fprintf(stderr, Errmsg_wrte, flmsgs, strerror(errno)); return (1); } } break; } /* * If an escape is in effect then any tokens are taken * to be message continuations. */ if (escape) { escape = 0; goto message; } (void) fprintf(stderr, "file %s: line %d: invalid " "input does not start with #, $ or @\n", fldesc, line); return (1); } line++; } free(token_buffer); return (0); } int main(int argc, char ** argv) { opterr = 0; while ((line = getopt(argc, argv, "cd:h:lm:n:i:v")) != EOF) { switch (line) { case 'c': /* catgets instead of gettext */ cflag = 1; break; case 'd': /* new message data filename */ fldata = optarg; /* (msg.c is default) */ break; case 'h': /* new message defs filename */ fldefs = optarg; /* (msg.h is default) */ break; case 'i': /* input message ids from */ flmids = optarg; /* from this file */ break; case 'l': /* define message data arrays */ lflag = 1; /* to be local (static) */ break; case 'm': /* generate message database */ flmsgs = optarg; /* to this file */ break; case 'n': /* new data array and func */ interface = optarg; /* name (msg is default) */ break; case 'v': vflag = 1; /* set verbose flag */ break; case '?': (void) fprintf(stderr, Errmsg_use, argv[0]); exit(1); default: break; } } /* * Validate the we have been given at least one input file. */ if ((argc - optind) < 1) { (void) fprintf(stderr, Errmsg_use); exit(1); } /* * Open all the required output files. */ if (fldefs) { if ((fddefs = fopen(fldefs, "w+")) == NULL) { (void) fprintf(stderr, Errmsg_opne, fldefs, strerror(errno)); return (1); } } if (fldata) { if (fldefs && (strcmp(fldefs, fldata) == 0)) fddata = fddefs; else if ((fddata = fopen(fldata, "w+")) == NULL) { (void) fprintf(stderr, Errmsg_opne, fldata, strerror(errno)); return (1); } } if (fddefs && fddata) { (void) sprintf(fllint, "%s.%d", nmlint, (int)getpid()); if ((fdlint = fopen(fllint, "w+")) == NULL) { (void) fprintf(stderr, Errmsg_opne, fllint, strerror(errno)); return (1); } } if (flmsgs) { if ((fdmsgs = fopen(flmsgs, "w+")) == NULL) { (void) fprintf(stderr, Errmsg_opne, flmsgs, strerror(errno)); return (1); } } if (flmids) { if ((fdmids = fopen(flmids, "r")) == NULL) { (void) fprintf(stderr, Errmsg_opne, flmids, strerror(errno)); return (1); } } /* * Initialize the message definition and message data streams. */ if (fddefs) { if (init_defs()) return (1); } /* * Read the input message file, and for each line process accordingly. */ for (; optind < argc; optind++) { int err; fldesc = argv[optind]; if ((fddesc = fopen(fldesc, "r")) == NULL) { (void) fprintf(stderr, Errmsg_opne, fldesc, strerror(errno)); return (1); } err = file(); (void) fclose(fddesc); if (err != 0) return (1); } /* * If a msgid has been output a msgstr must follow before we end the * file. */ if (msgid) { msgid = 0; if (fprintf(fdmsgs, "msgstr\t\"\"\n") < 0) { (void) fprintf(stderr, Errmsg_wrte, flmsgs, strerror(errno)); return (1); } } if (fdmids) (void) fclose(fdmids); if (fdmsgs) (void) fclose(fdmsgs); if (fddefs) { if (output_defs()) return (1); } /* * Finish off any generated data and header file. */ if (fldata) { if (output_data()) return (1); } if (fddefs) { if (fini_defs()) return (1); } if (vflag) dump_stringtab(stp); /* * Close up everything and go home. */ if (fddata) (void) fclose(fddata); if (fddefs && (fddefs != fddata)) (void) fclose(fddefs); if (fddefs && fddata) { (void) fclose(fdlint); (void) unlink(fllint); } if (stp) st_destroy(stp); return (0); } Index: projects/iosched/cddl/contrib/opensolaris =================================================================== --- projects/iosched/cddl/contrib/opensolaris (revision 287721) +++ projects/iosched/cddl/contrib/opensolaris (revision 287722) Property changes on: projects/iosched/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl/contrib/opensolaris:r287701-287721 Index: projects/iosched/cddl =================================================================== --- projects/iosched/cddl (revision 287721) +++ projects/iosched/cddl (revision 287722) Property changes on: projects/iosched/cddl ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/cddl:r287701-287721 Index: projects/iosched/share/man/man4/ctl.4 =================================================================== --- projects/iosched/share/man/man4/ctl.4 (revision 287721) +++ projects/iosched/share/man/man4/ctl.4 (revision 287722) @@ -1,136 +1,189 @@ .\" Copyright (c) 2013 Edward Tomasz Napierala +.\" Copyright (c) 2015 Alexander Motin .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ -.Dd August 9, 2015 +.Dd September 12, 2015 .Dt CTL 4 .Os .Sh NAME .Nm ctl .Nd CAM Target Layer / iSCSI target .Sh SYNOPSIS To compile this driver into the kernel, place the following line in your kernel configuration file: .Bd -ragged -offset indent .Cd "device iscsi" .Cd "device ctl" .Ed .Pp Alternatively, to load the driver as a module at boot time, place the following line in .Xr loader.conf 5 : .Bd -literal -offset indent ctl_load="YES" .Ed .Sh DESCRIPTION The .Nm subsystem provides SCSI disk and processor emulation. It supports features such as: .Pp .Bl -bullet -compact .It Disk and processor device emulation .It Tagged queueing .It SCSI task attribute support (ordered, head of queue, simple tags) .It SCSI implicit command ordering support .It Full task management support (abort, LUN reset, target reset, etc.) .It Support for multiple ports .It Support for multiple simultaneous initiators .It Support for multiple simultaneous backing stores .It Support for VMWare VAAI: COMPARE AND WRITE, XCOPY, WRITE SAME, and UNMAP commands .It Support for Microsoft ODX: POPULATE TOKEN/WRITE USING TOKEN, WRITE SAME, and UNMAP commands .It Persistent reservation support .It Mode sense/select support .It Error injection support .It +High Availability clustering support with ALUA +.It All I/O handled in-kernel, no userland context switch overhead .El .Pp It also serves as a kernel component of the native iSCSI target. .Sh SYSCTL VARIABLES The following variables are available as both .Xr sysctl 8 variables and .Xr loader 8 tunables: .Bl -tag -width indent .It Va kern.cam.ctl.debug Bit mask of enabled CTL log levels: .Bl -tag -offset indent -compact .It 1 log commands with errors; .It 2 log all commands; .It 4 -log received data for commands except READ/WRITE. +log data for commands other then READ/WRITE. .El Defaults to 0. +.It Va kern.cam.ctl.ha_id +Specifies unique position of this node within High Availability cluster. +Default is 0 -- no HA, 1 and 2 -- HA enabled at specified position. +.It Va kern.cam.ctl.ha_mode +Specifies High Availability cluster operation mode: +.Bl -tag -offset indent -compact +.It 0 +Active/Standby -- primary node has backend access and processes requests, +while secondary can only do basic LUN discovery and reservation; +.It 1 +Active/Active -- both nodes have backend access and process requests, +while secondary node synchronizes processing with primary one; +.It 2 +Active/Active -- primary node has backend access and processes requests, +while secondary node forwards all requests and data to primary one; +.El +All above modes require established connection between HA cluster nodes. +If connection is not configured, secondary node will report Unavailable +state; if configured but not established -- Transitioning state. +Defaults to 0. +.It Va kern.cam.ctl.ha_peer +String value, specifying method to establish connection to peer HA node. +Can be "listen IP:port", "connect IP:port" or empty. +.It Va kern.cam.ctl.ha_link +Reports present state of connection between HA cluster nodes: +.Bl -tag -offset indent -compact +.It 0 +not configured; +.It 1 +configured but not established; +.It 2 +established. +.El +.It Va kern.cam.ctl.ha_role +Specifies default role of this node: +.Bl -tag -offset indent -compact +.It 0 +primary; +.It 1 +secondary. +.El +This role can be overriden on per-LUN basis using "ha_role" LUN option, +so that for one LUN one node is primary, while for another -- another. +Role change from primary to secondary for HA modes 0 and 2 closes backends, +the opposite change -- opens. +If there is no primary node (both nodes are secondary, or secondary node has +no connection to primary one), secondary node(s) report Transitioning state. +State with two primary nodes is illegal (split brain condition). .It Va kern.cam.ctl.iscsi.debug Verbosity level for log messages from the kernel part of iSCSI target. Set to 0 to disable logging or 1 to warn about potential problems. Larger values enable debugging output. Defaults to 1. .It Va kern.cam.ctl.iscsi.maxcmdsn_delta The number of outstanding commands to advertise to the iSCSI initiator. Technically, it is the difference between ExpCmdSN and MaxCmdSN fields in the iSCSI PDU. Defaults to 256. .It Va kern.cam.ctl.iscsi.ping_timeout The number of seconds to wait for the iSCSI initiator to respond to a NOP-In PDU. In the event that there is no response within that time the session gets forcibly terminated. Set to 0 to disable sending NOP-In PDUs. Defaults to 5. .El .Sh SEE ALSO .Xr ctladm 8 , .Xr ctld 8 , .Xr ctlstat 8 .Sh HISTORY The .Nm subsystem first appeared in .Fx 9.1 . .Sh AUTHORS The .Nm -subsystem was written by +subsystem was originally written by .An Kenneth Merry Aq Mt ken@FreeBSD.org . +Later work was done by +.An Alexander Motin Aq Mt mav@FreeBSD.org . Index: projects/iosched/share/man/man4/geom_fox.4 =================================================================== --- projects/iosched/share/man/man4/geom_fox.4 (revision 287721) +++ projects/iosched/share/man/man4/geom_fox.4 (revision 287722) @@ -1,214 +1,221 @@ .\" .\" Copyright (c) 2006 Wilko Bulte .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd January 2, 2005 +.Dd September 12, 2015 .Dt GEOM_FOX 4 .Os .Sh NAME .Nm geom_fox .Nd "GEOM based basic disk multipathing" .Sh SYNOPSIS To compile this driver into the kernel, place the following line in your kernel configuration file: .Bd -ragged -offset indent .Cd "options GEOM_FOX" .Ed .Pp Alternatively, to load the driver as a module at boot time, place the following line in .Xr loader.conf 5 : .Bd -literal -offset indent geom_fox_load="YES" .Ed .Sh DESCRIPTION +.Bf -symbolic +This driver is obsolete. +Users are advised to use +.Xr gmultipath 8 +instead. +.Ef +.Pp The intent of the .Nm framework is to provide basic multipathing support to access direct access devices. Basic in the above sentence should be read as: .Nm only provides path failover functionality, not load balancing over the available paths etc. Using suitable hardware like SCSI or FibreChannel disks it is possible to have multiple (typically 2) host bus adapters access the same physical disk drive. .Pp Without a multipathing driver the .Fx kernel would probe the disks multiple times, resulting in the creation of multiple .Pa /dev entries for the same underlying physical device. A unique label written in the GEOM label area allows .Nm to detect multiple paths. Using this information it creates a unique .Pa da#.fox device. .Pp The .Nm device is subsequently used by the .Fx kernel to access the disks. Multiple physical access paths ensure that even in case of a path failure the .Fx kernel can continue to access the data. .Pp The .Nm driver will disallow write operations to the underlying devices once the fox device has been opened for writing. .Sh EXAMPLES .Bl -bullet -compact .It .Nm needs a label on the disk as follows in order to work properly: .Bd -literal "0123456789abcdef0123456789abcdef" "GEOM::FOX <--unique--id-->" .Ed .Pp For the unique ID 16 bytes are available. The .Dq Li GEOM::FOX is the magic to mark a .Nm device. .Pp The actual labelling is accomplished by .Bd -literal echo "GEOM::FOX someid" | dd of=/dev/da2 conv=sync .Ed .Pp For FibreChannel devices it is suggested to use the Node World Wide Name (Node WWN) as this is guaranteed by the FibreChannel standard to be worldwide unique. The use of the Port WWN not recommended as each port of a given device has a different WWN, thereby confusing things. .Pp The Node WWN can be obtained from a verbose boot as in for example .Bd -literal isp1: Target 1 (Loop 0x1) Port ID 0xe8 (role Target) Arrived Port WWN 0x21000004cfc8aca2 Node WWN 0x20000004cfc8aca2 .Ed .Pp This Node WWN would then be used like so: .Bd -literal echo "GEOM::FOX 20000004cfc8aca2" | dd of=/dev/da2 conv=sync .Ed .Pp For non-FibreChannel devices you could for example use the serial number of the device. Regardless of what you use, make sure the label is unique. .Pp Once the labelling has been performed and assuming the .Nm module is loaded the kernel will inform you that it has found a new .Nm device with a message similar to .Bd -literal Creating new fox (da2) fox da2.fox lock 0xfffffc0000fdba20 .Ed .Pp .It To check which physical devices match a given .Nm device: .Bd -literal -offset indent # geom fox list Geom name: da2.fox Providers: 1. Name: da2.fox Mediasize: 73407865344 (68G) Sectorsize: 512 Mode: r0w0e0 Consumers: 1. Name: da2 Mediasize: 73407865856 (68G) Sectorsize: 512 Mode: r0w0e0 2. Name: da6 Mediasize: 73407865856 (68G) Sectorsize: 512 Mode: r0w0e0 .Ed .Pp .It To check the status of the .Nm components: .Bd -literal # geom fox status Name Status Components da2.fox N/A da2 da6 .Ed .El .Sh SEE ALSO .Xr GEOM 4 , .Xr geom 8 , .Xr gmultipath 8 .Sh AUTHORS .An -nosplit The .Nm driver was written by .An Poul-Henning Kamp Aq Mt phk@FreeBSD.org . This manual page was written by .An Wilko Bulte Aq Mt wilko@FreeBSD.org . .Sh CAVEATS The .Nm driver depends on the underlying hardware drivers to do the right thing in case of a path failure. If for example a hardware driver continues to retry forever, .Nm is not able to re-initiate the I/O to an alternative physical path. .Pp You have to be very sure to provide a unique label for each of the .Nm devices. Safety belts are not provided. For FibreChannel devices it is suggested to use the Port WWN of the device. The World Wide Name is guaranteed to be worldwide unique per the FibreChannel standard. .Sh BUGS The .Nm framework has only seen light testing. There definitely might be dragons here. .Pp The name .Nm is completely obscure. Just remember that any sly fox has multiple exits from its hole. .Pp The examples provided are too FibreChannel-centric. Index: projects/iosched/share/man/man4 =================================================================== --- projects/iosched/share/man/man4 (revision 287721) +++ projects/iosched/share/man/man4 (revision 287722) Property changes on: projects/iosched/share/man/man4 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/share/man/man4:r287701-287721 Index: projects/iosched/share =================================================================== --- projects/iosched/share (revision 287721) +++ projects/iosched/share (revision 287722) Property changes on: projects/iosched/share ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/share:r287701-287721 Index: projects/iosched/sys/cam/ctl/README.ctl.txt =================================================================== --- projects/iosched/sys/cam/ctl/README.ctl.txt (revision 287721) +++ projects/iosched/sys/cam/ctl/README.ctl.txt (revision 287722) @@ -1,403 +1,402 @@ /* $FreeBSD$ */ CTL - CAM Target Layer Description Revision 1.4 (December 29th, 2011) Ken Merry Table of Contents: ================= Introduction Features Configuring and Running CTL Revision 1.N Changes To Do List Code Roadmap Userland Commands Introduction: ============ CTL is a disk and processor device emulation subsystem originally written for Copan Systems under Linux starting in 2003. It has been shipping in Copan (now SGI) products since 2005. It was ported to FreeBSD in 2008, and thanks to an agreement between SGI (who acquired Copan's assets in 2010) and Spectra Logic in 2010, CTL is available under a BSD-style license. The intent behind the agreement was that Spectra would work to get CTL into the FreeBSD tree. Features: ======== - Disk and processor device emulation. - Tagged queueing - SCSI task attribute support (ordered, head of queue, simple tags) - SCSI implicit command ordering support. (e.g. if a read follows a mode select, the read will be blocked until the mode select completes.) - Full task management support (abort, LUN reset, target reset, etc.) - Support for multiple ports - Support for multiple simultaneous initiators - Support for multiple simultaneous backing stores + - Support for VMWare VAAI: COMPARE AND WRITE, XCOPY, WRITE SAME and + UNMAP commands + - Support for Microsoft ODX: POPULATE TOKEN/WRITE USING TOKEN, WRITE SAME + and UNMAP commands - Persistent reservation support - Mode sense/select support - Error injection support - - High Availability support + - High Availability clustering support with ALUA - All I/O handled in-kernel, no userland context switch overhead. Configuring and Running CTL: =========================== - - After applying the CTL patchset to your tree, build world and install it - on your target system. + - Add 'device ctl' to your kernel configuration file or load the module. - - Add 'device ctl' to your kernel configuration file. - - If you're running with a 8Gb or 4Gb Qlogic FC board, add - 'options ISP_TARGET_MODE' to your kernel config file. Keep in mind that - the isp(4) driver can run in target or initiator mode, but not both on - the same machine. 'device ispfw' or loading the ispfw module is also - recommended. + 'options ISP_TARGET_MODE' to your kernel config file. 'device ispfw' or + loading the ispfw module is also recommended. - Rebuild and install a new kernel. - Reboot with the new kernel. - To add a LUN with the RAM disk backend: ctladm create -b ramdisk -s 10485760000000000000 ctladm port -o on - You should now see the CTL disk LUN through camcontrol devlist: scbus6 on ctl2cam0 bus 0: at scbus6 target 1 lun 0 (da24,pass32) <> at scbus6 target -1 lun -1 () This is visible through the CTL CAM SIM. This allows using CTL without any physical hardware. You should be able to issue any normal SCSI commands to the device via the pass(4)/da(4) devices. If any target-capable HBAs are in the system (e.g. isp(4)), and have target mode enabled, you should now also be able to see the CTL LUNs via that target interface. Note that all CTL LUNs are presented to all frontends. There is no LUN masking, or separate, per-port configuration. - Note that the ramdisk backend is a "fake" ramdisk. That is, it is backed by a small amount of RAM that is used for all I/O requests. This is useful for performance testing, but not for any data integrity tests. - To add a LUN with the block/file backend: truncate -s +1T myfile ctladm create -b block -o file=myfile ctladm port -o on - You can also see a list of LUNs and their backends like this: # ctladm devlist LUN Backend Size (Blocks) BS Serial Number Device ID 0 block 2147483648 512 MYSERIAL 0 MYDEVID 0 1 block 2147483648 512 MYSERIAL 1 MYDEVID 1 2 block 2147483648 512 MYSERIAL 2 MYDEVID 2 3 block 2147483648 512 MYSERIAL 3 MYDEVID 3 4 block 2147483648 512 MYSERIAL 4 MYDEVID 4 5 block 2147483648 512 MYSERIAL 5 MYDEVID 5 6 block 2147483648 512 MYSERIAL 6 MYDEVID 6 7 block 2147483648 512 MYSERIAL 7 MYDEVID 7 8 block 2147483648 512 MYSERIAL 8 MYDEVID 8 9 block 2147483648 512 MYSERIAL 9 MYDEVID 9 10 block 2147483648 512 MYSERIAL 10 MYDEVID 10 11 block 2147483648 512 MYSERIAL 11 MYDEVID 11 - You can see the LUN type and backing store for block/file backend LUNs like this: # ctladm devlist -v LUN Backend Size (Blocks) BS Serial Number Device ID 0 block 2147483648 512 MYSERIAL 0 MYDEVID 0 lun_type=0 num_threads=14 file=testdisk0 1 block 2147483648 512 MYSERIAL 1 MYDEVID 1 lun_type=0 num_threads=14 file=testdisk1 2 block 2147483648 512 MYSERIAL 2 MYDEVID 2 lun_type=0 num_threads=14 file=testdisk2 3 block 2147483648 512 MYSERIAL 3 MYDEVID 3 lun_type=0 num_threads=14 file=testdisk3 4 block 2147483648 512 MYSERIAL 4 MYDEVID 4 lun_type=0 num_threads=14 file=testdisk4 5 block 2147483648 512 MYSERIAL 5 MYDEVID 5 lun_type=0 num_threads=14 file=testdisk5 6 block 2147483648 512 MYSERIAL 6 MYDEVID 6 lun_type=0 num_threads=14 file=testdisk6 7 block 2147483648 512 MYSERIAL 7 MYDEVID 7 lun_type=0 num_threads=14 file=testdisk7 8 block 2147483648 512 MYSERIAL 8 MYDEVID 8 lun_type=0 num_threads=14 file=testdisk8 9 block 2147483648 512 MYSERIAL 9 MYDEVID 9 lun_type=0 num_threads=14 file=testdisk9 10 ramdisk 0 0 MYSERIAL 0 MYDEVID 0 lun_type=3 11 ramdisk 204800000000000 512 MYSERIAL 1 MYDEVID 1 lun_type=0 Revision 1.4 Changes ==================== - Added in the second HA mode (where CTL does the data transfers instead of having data transfers done below CTL), and abstracted out the Copan HA API. - Fixed the phantom device problem in the CTL CAM SIM and improved the CAM SIM to automatically trigger a rescan when the port is enabled and disabled. - Made the number of threads in the block backend configurable via sysctl, loader tunable and the ctladm command line. (You can now specify -o num_threads=4 when creating a LUN with ctladm create.) - Fixed some LUN selection issues in ctlstat(8) and allowed for selection of LUN numbers up to 1023. - General cleanup. - This version intended for public release. Revision 1.3 Changes ==================== - Added descriptor sense support to CTL. It can be enabled through the control mode page (10), but is disabled by default. - Improved error injection support. The number of errors that can be injected with 'ctladm inject' has been increased, and any arbitrary sense data may now be injected as well. - The port infrastructure has been revamped. Individual ports and types of ports may now be enabled and disabled from the command line. ctladm now has the ability to set the WWNN and WWPN for each port. - The block backend can now send multiple I/Os to backing files. Multiple writes are only allowed for ZFS, but multiple readers are allowed for any filesystem. - The block and ramdisk backends now support setting the LUN blocksize. There are some restrictions when the backing device is a block device, but otherwise the blocksize may be set to anything. Revision 1.2 Changes ==================== - CTL initialization process has been revamped. Instead of using an ad-hoc method, it is now sequenced through SYSINIT() calls. - A block/file backend has been added. This allows using arbitrary files or block devices as a backing store. - The userland LUN configuration interface has been completely rewritten. Configuration is now done out of band. - The ctladm(8) command line interface has been revamped, and is now similar to camcontrol(8). To Do List: ========== - Use devstat(9) for CTL's statistics collection. CTL uses a home-grown statistics collection system that is similar to devstat(9). ctlstat should be retired in favor of iostat, etc., once aggregation modes are available in iostat to match the behavior of ctlstat -t and dump modes are available to match the behavior of ctlstat -d/ctlstat -J. - ZFS ARC backend for CTL. Since ZFS copies all I/O into the ARC (Adaptive Replacement Cache), running the block/file backend on top of a ZFS-backed zdev or file will involve an extra set of copies. The optimal solution for backing targets served by CTL with ZFS would be to allocate buffers out of the ARC directly, and DMA to/from them directly. That would eliminate an extra data buffer allocation and copy. - Switch CTL over to using CAM CCBs instead of its own union ctl_io. This will likely require a significant amount of work, but will eliminate another data structure in the stack, more memory allocations, etc. This will also require changes to the CAM CCB structure to support CTL. Code Roadmap: ============ CTL has the concept of pluggable frontend ports and backends. All frontends and backends can be active at the same time. You can have a ramdisk-backed LUN present along side a file backed LUN. ctl.c: ----- This is the core of CTL, where all of the command handlers and a lot of other things live. Yes, it is large. It started off small and grew to its current size over time. Perhaps it can be split into more files at some point. Here is a roadmap of some of the primary functions in ctl.c. Starting here and following the various leaf functions will show the command flow. ctl_queue() This is where commands from the frontend ports come in. ctl_queue_sense() This is only used for non-packetized SCSI. i.e. parallel SCSI prior to U320 and perhaps U160. ctl_work_thread() This is the primary work thread, and everything gets executed from there. ctl_scsiio_precheck() This where all of the initial checks are done, and I/O is either queued for execution or blocked. ctl_scsiio() This is where the command handler is actually executed. (See ctl_cmd_table.c for the mapping of SCSI opcode to command handler function.) ctl_done() This is the routine called (or ctl_done_lock()) to initiate the command completion process. ctl_process_done() This is where command completion actually happens. ctl.h: ----- Basic function declarations and data structures. ctl_backend.c, ctl_backend.h: ------------- These files define the basic CTL backend API. The comments in the header explain the API. ctl_backend_block.c ------------------- The block and file backend. This allows for using a disk or a file as the backing store for a LUN. Multiple threads are started to do I/O to the backing device, primarily because the VFS API requires that to get any concurrency. ctl_backend_ramdisk.c: --------------------- A "fake" ramdisk backend. It only allocates a small amount of memory to act as a source and sink for reads and writes from an initiator. Therefore it cannot be used for any real data, but it can be used to test for throughput. It can also be used to test initiators' support for extremely large LUNs. ctl_cmd_table.c: --------------- This is a table with all 256 possible SCSI opcodes, and command handler functions defined for supported opcodes. It is included in ctl.c. ctl_debug.h: ----------- Simplistic debugging support. ctl_error.c, ctl_error.h: ----------- CTL-specific wrappers around the CAM sense building functions. ctl_frontend.c, ctl_frontend.h: -------------- These files define the basic CTL frontend port API. The comments in the header explain the API. ctl_frontend_cam_sim.c: ---------------------- This is a CTL frontend port that is also a CAM SIM. The idea is that this frontend allows for using CTL without any target-capable hardware. So any LUNs you create in CTL are visible via this port. ctl_ha.c: ctl_ha.h: -------- This is a High Availability API and TCP-based interlink implementation. ctl_io.h: -------- This defines most of the core CTL I/O structures. union ctl_io is conceptually very similar to CAM's union ccb. ctl_ioctl.h: ----------- This defines all ioctls available through the CTL character device, and the data structures needed for those ioctls. ctl_private.h: ------------- Private data structres (e.g. CTL softc) and function prototypes. This also includes the SCSI vendor and product names used by CTL. ctl_scsi_all.c ctl_scsi_all.h: -------------- CTL wrappers around CAM sense printing functions. ctl_ser_table.c: --------------- Command serialization table. This defines what happens when one type of command is followed by another type of command. e.g., what do you do when you have a mode select followed by a write? You block the write until the mode select is complete. That is defined in this table. ctl_util.c ctl_util.h: ---------- CTL utility functions, primarily designed to be used from userland. See ctladm for the primary consumer of these functions. These include CDB building functions. scsi_ctl.c: ---------- CAM target peripheral driver and CTL frontend port. This is the path into CTL for commands from target-capable hardware/SIMs. Userland Commands: ================= ctladm(8) fills a role similar to camcontrol(8). It allow configuring LUNs, issuing commands, injecting errors and various other control functions. ctlstat(8) fills a role similar to iostat(8). It reports I/O statistics for CTL. Index: projects/iosched/sys/cam/ctl/ctl.c =================================================================== --- projects/iosched/sys/cam/ctl/ctl.c (revision 287721) +++ projects/iosched/sys/cam/ctl/ctl.c (revision 287722) @@ -1,13300 +1,13346 @@ /*- * Copyright (c) 2003-2009 Silicon Graphics International Corp. * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2015 Alexander Motin * All rights reserved. * * Portions of this software were developed by Edward Tomasz Napierala * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * $Id$ */ /* * CAM Target Layer, a SCSI device emulation subsystem. * * Author: Ken Merry */ #define _CTL_C #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct ctl_softc *control_softc = NULL; /* * Template mode pages. */ /* * Note that these are default values only. The actual values will be * filled in when the user does a mode sense. */ const static struct copan_debugconf_subpage debugconf_page_default = { DBGCNF_PAGE_CODE | SMPH_SPF, /* page_code */ DBGCNF_SUBPAGE_CODE, /* subpage */ {(sizeof(struct copan_debugconf_subpage) - 4) >> 8, (sizeof(struct copan_debugconf_subpage) - 4) >> 0}, /* page_length */ DBGCNF_VERSION, /* page_version */ {CTL_TIME_IO_DEFAULT_SECS>>8, CTL_TIME_IO_DEFAULT_SECS>>0}, /* ctl_time_io_secs */ }; const static struct copan_debugconf_subpage debugconf_page_changeable = { DBGCNF_PAGE_CODE | SMPH_SPF, /* page_code */ DBGCNF_SUBPAGE_CODE, /* subpage */ {(sizeof(struct copan_debugconf_subpage) - 4) >> 8, (sizeof(struct copan_debugconf_subpage) - 4) >> 0}, /* page_length */ 0, /* page_version */ {0xff,0xff}, /* ctl_time_io_secs */ }; const static struct scsi_da_rw_recovery_page rw_er_page_default = { /*page_code*/SMS_RW_ERROR_RECOVERY_PAGE, /*page_length*/sizeof(struct scsi_da_rw_recovery_page) - 2, /*byte3*/SMS_RWER_AWRE|SMS_RWER_ARRE, /*read_retry_count*/0, /*correction_span*/0, /*head_offset_count*/0, /*data_strobe_offset_cnt*/0, /*byte8*/SMS_RWER_LBPERE, /*write_retry_count*/0, /*reserved2*/0, /*recovery_time_limit*/{0, 0}, }; const static struct scsi_da_rw_recovery_page rw_er_page_changeable = { /*page_code*/SMS_RW_ERROR_RECOVERY_PAGE, /*page_length*/sizeof(struct scsi_da_rw_recovery_page) - 2, /*byte3*/0, /*read_retry_count*/0, /*correction_span*/0, /*head_offset_count*/0, /*data_strobe_offset_cnt*/0, /*byte8*/0, /*write_retry_count*/0, /*reserved2*/0, /*recovery_time_limit*/{0, 0}, }; const static struct scsi_format_page format_page_default = { /*page_code*/SMS_FORMAT_DEVICE_PAGE, /*page_length*/sizeof(struct scsi_format_page) - 2, /*tracks_per_zone*/ {0, 0}, /*alt_sectors_per_zone*/ {0, 0}, /*alt_tracks_per_zone*/ {0, 0}, /*alt_tracks_per_lun*/ {0, 0}, /*sectors_per_track*/ {(CTL_DEFAULT_SECTORS_PER_TRACK >> 8) & 0xff, CTL_DEFAULT_SECTORS_PER_TRACK & 0xff}, /*bytes_per_sector*/ {0, 0}, /*interleave*/ {0, 0}, /*track_skew*/ {0, 0}, /*cylinder_skew*/ {0, 0}, /*flags*/ SFP_HSEC, /*reserved*/ {0, 0, 0} }; const static struct scsi_format_page format_page_changeable = { /*page_code*/SMS_FORMAT_DEVICE_PAGE, /*page_length*/sizeof(struct scsi_format_page) - 2, /*tracks_per_zone*/ {0, 0}, /*alt_sectors_per_zone*/ {0, 0}, /*alt_tracks_per_zone*/ {0, 0}, /*alt_tracks_per_lun*/ {0, 0}, /*sectors_per_track*/ {0, 0}, /*bytes_per_sector*/ {0, 0}, /*interleave*/ {0, 0}, /*track_skew*/ {0, 0}, /*cylinder_skew*/ {0, 0}, /*flags*/ 0, /*reserved*/ {0, 0, 0} }; const static struct scsi_rigid_disk_page rigid_disk_page_default = { /*page_code*/SMS_RIGID_DISK_PAGE, /*page_length*/sizeof(struct scsi_rigid_disk_page) - 2, /*cylinders*/ {0, 0, 0}, /*heads*/ CTL_DEFAULT_HEADS, /*start_write_precomp*/ {0, 0, 0}, /*start_reduced_current*/ {0, 0, 0}, /*step_rate*/ {0, 0}, /*landing_zone_cylinder*/ {0, 0, 0}, /*rpl*/ SRDP_RPL_DISABLED, /*rotational_offset*/ 0, /*reserved1*/ 0, /*rotation_rate*/ {(CTL_DEFAULT_ROTATION_RATE >> 8) & 0xff, CTL_DEFAULT_ROTATION_RATE & 0xff}, /*reserved2*/ {0, 0} }; const static struct scsi_rigid_disk_page rigid_disk_page_changeable = { /*page_code*/SMS_RIGID_DISK_PAGE, /*page_length*/sizeof(struct scsi_rigid_disk_page) - 2, /*cylinders*/ {0, 0, 0}, /*heads*/ 0, /*start_write_precomp*/ {0, 0, 0}, /*start_reduced_current*/ {0, 0, 0}, /*step_rate*/ {0, 0}, /*landing_zone_cylinder*/ {0, 0, 0}, /*rpl*/ 0, /*rotational_offset*/ 0, /*reserved1*/ 0, /*rotation_rate*/ {0, 0}, /*reserved2*/ {0, 0} }; const static struct scsi_caching_page caching_page_default = { /*page_code*/SMS_CACHING_PAGE, /*page_length*/sizeof(struct scsi_caching_page) - 2, /*flags1*/ SCP_DISC | SCP_WCE, /*ret_priority*/ 0, /*disable_pf_transfer_len*/ {0xff, 0xff}, /*min_prefetch*/ {0, 0}, /*max_prefetch*/ {0xff, 0xff}, /*max_pf_ceiling*/ {0xff, 0xff}, /*flags2*/ 0, /*cache_segments*/ 0, /*cache_seg_size*/ {0, 0}, /*reserved*/ 0, /*non_cache_seg_size*/ {0, 0, 0} }; const static struct scsi_caching_page caching_page_changeable = { /*page_code*/SMS_CACHING_PAGE, /*page_length*/sizeof(struct scsi_caching_page) - 2, /*flags1*/ SCP_WCE | SCP_RCD, /*ret_priority*/ 0, /*disable_pf_transfer_len*/ {0, 0}, /*min_prefetch*/ {0, 0}, /*max_prefetch*/ {0, 0}, /*max_pf_ceiling*/ {0, 0}, /*flags2*/ 0, /*cache_segments*/ 0, /*cache_seg_size*/ {0, 0}, /*reserved*/ 0, /*non_cache_seg_size*/ {0, 0, 0} }; const static struct scsi_control_page control_page_default = { /*page_code*/SMS_CONTROL_MODE_PAGE, /*page_length*/sizeof(struct scsi_control_page) - 2, /*rlec*/0, /*queue_flags*/SCP_QUEUE_ALG_RESTRICTED, /*eca_and_aen*/0, /*flags4*/SCP_TAS, /*aen_holdoff_period*/{0, 0}, /*busy_timeout_period*/{0, 0}, /*extended_selftest_completion_time*/{0, 0} }; const static struct scsi_control_page control_page_changeable = { /*page_code*/SMS_CONTROL_MODE_PAGE, /*page_length*/sizeof(struct scsi_control_page) - 2, /*rlec*/SCP_DSENSE, /*queue_flags*/SCP_QUEUE_ALG_MASK, /*eca_and_aen*/SCP_SWP, /*flags4*/0, /*aen_holdoff_period*/{0, 0}, /*busy_timeout_period*/{0, 0}, /*extended_selftest_completion_time*/{0, 0} }; const static struct scsi_info_exceptions_page ie_page_default = { /*page_code*/SMS_INFO_EXCEPTIONS_PAGE, /*page_length*/sizeof(struct scsi_info_exceptions_page) - 2, /*info_flags*/SIEP_FLAGS_DEXCPT, /*mrie*/0, /*interval_timer*/{0, 0, 0, 0}, /*report_count*/{0, 0, 0, 0} }; const static struct scsi_info_exceptions_page ie_page_changeable = { /*page_code*/SMS_INFO_EXCEPTIONS_PAGE, /*page_length*/sizeof(struct scsi_info_exceptions_page) - 2, /*info_flags*/0, /*mrie*/0, /*interval_timer*/{0, 0, 0, 0}, /*report_count*/{0, 0, 0, 0} }; #define CTL_LBPM_LEN (sizeof(struct ctl_logical_block_provisioning_page) - 4) const static struct ctl_logical_block_provisioning_page lbp_page_default = {{ /*page_code*/SMS_INFO_EXCEPTIONS_PAGE | SMPH_SPF, /*subpage_code*/0x02, /*page_length*/{CTL_LBPM_LEN >> 8, CTL_LBPM_LEN}, /*flags*/0, /*reserved*/{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /*descr*/{}}, {{/*flags*/0, /*resource*/0x01, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}}, {/*flags*/0, /*resource*/0x02, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}}, {/*flags*/0, /*resource*/0xf1, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}}, {/*flags*/0, /*resource*/0xf2, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}} } }; const static struct ctl_logical_block_provisioning_page lbp_page_changeable = {{ /*page_code*/SMS_INFO_EXCEPTIONS_PAGE | SMPH_SPF, /*subpage_code*/0x02, /*page_length*/{CTL_LBPM_LEN >> 8, CTL_LBPM_LEN}, /*flags*/0, /*reserved*/{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /*descr*/{}}, {{/*flags*/0, /*resource*/0, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}}, {/*flags*/0, /*resource*/0, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}}, {/*flags*/0, /*resource*/0, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}}, {/*flags*/0, /*resource*/0, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}} } }; SYSCTL_NODE(_kern_cam, OID_AUTO, ctl, CTLFLAG_RD, 0, "CAM Target Layer"); static int worker_threads = -1; SYSCTL_INT(_kern_cam_ctl, OID_AUTO, worker_threads, CTLFLAG_RDTUN, &worker_threads, 1, "Number of worker threads"); static int ctl_debug = CTL_DEBUG_NONE; SYSCTL_INT(_kern_cam_ctl, OID_AUTO, debug, CTLFLAG_RWTUN, &ctl_debug, 0, "Enabled debug flags"); /* * Supported pages (0x00), Serial number (0x80), Device ID (0x83), * Extended INQUIRY Data (0x86), Mode Page Policy (0x87), * SCSI Ports (0x88), Third-party Copy (0x8F), Block limits (0xB0), * Block Device Characteristics (0xB1) and Logical Block Provisioning (0xB2) */ #define SCSI_EVPD_NUM_SUPPORTED_PAGES 10 static void ctl_isc_event_handler(ctl_ha_channel chanel, ctl_ha_event event, int param); static void ctl_copy_sense_data(union ctl_ha_msg *src, union ctl_io *dest); static void ctl_copy_sense_data_back(union ctl_io *src, union ctl_ha_msg *dest); static int ctl_init(void); void ctl_shutdown(void); static int ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td); static int ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td); static int ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio); static int ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num, struct ctl_ooa *ooa_hdr, struct ctl_ooa_entry *kern_entries); static int ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td); static int ctl_alloc_lun(struct ctl_softc *ctl_softc, struct ctl_lun *lun, struct ctl_be_lun *be_lun); static int ctl_free_lun(struct ctl_lun *lun); static void ctl_create_lun(struct ctl_be_lun *be_lun); static struct ctl_port * ctl_io_port(struct ctl_io_hdr *io_hdr); static int ctl_do_mode_select(union ctl_io *io); static int ctl_pro_preempt(struct ctl_softc *softc, struct ctl_lun *lun, uint64_t res_key, uint64_t sa_res_key, uint8_t type, uint32_t residx, struct ctl_scsiio *ctsio, struct scsi_per_res_out *cdb, struct scsi_per_res_out_parms* param); static void ctl_pro_preempt_other(struct ctl_lun *lun, union ctl_ha_msg *msg); static void ctl_hndl_per_res_out_on_other_sc(union ctl_ha_msg *msg); static int ctl_inquiry_evpd_supported(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_serial(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_devid(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_eid(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_mpp(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_scsi_ports(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_block_limits(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_bdc(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_lbp(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd(struct ctl_scsiio *ctsio); static int ctl_inquiry_std(struct ctl_scsiio *ctsio); static int ctl_get_lba_len(union ctl_io *io, uint64_t *lba, uint64_t *len); static ctl_action ctl_extent_check(union ctl_io *io1, union ctl_io *io2, bool seq); static ctl_action ctl_extent_check_seq(union ctl_io *io1, union ctl_io *io2); static ctl_action ctl_check_for_blockage(struct ctl_lun *lun, union ctl_io *pending_io, union ctl_io *ooa_io); static ctl_action ctl_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io, union ctl_io *starting_io); static int ctl_check_blocked(struct ctl_lun *lun); static int ctl_scsiio_lun_check(struct ctl_lun *lun, const struct ctl_cmd_entry *entry, struct ctl_scsiio *ctsio); static void ctl_failover_lun(struct ctl_lun *lun); static void ctl_est_ua(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua); static void ctl_est_ua_all(struct ctl_lun *lun, uint32_t except, ctl_ua_type ua); static void ctl_clr_ua(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua); static void ctl_clr_ua_all(struct ctl_lun *lun, uint32_t except, ctl_ua_type ua); static void ctl_clr_ua_allluns(struct ctl_softc *ctl_softc, uint32_t initidx, ctl_ua_type ua_type); static int ctl_scsiio_precheck(struct ctl_softc *ctl_softc, struct ctl_scsiio *ctsio); static int ctl_scsiio(struct ctl_scsiio *ctsio); static int ctl_bus_reset(struct ctl_softc *ctl_softc, union ctl_io *io); static int ctl_target_reset(struct ctl_softc *ctl_softc, union ctl_io *io, ctl_ua_type ua_type); static int ctl_lun_reset(struct ctl_lun *lun, union ctl_io *io, ctl_ua_type ua_type); static int ctl_abort_task(union ctl_io *io); static int ctl_abort_task_set(union ctl_io *io); static int ctl_i_t_nexus_reset(union ctl_io *io); static void ctl_run_task(union ctl_io *io); #ifdef CTL_IO_DELAY static void ctl_datamove_timer_wakeup(void *arg); static void ctl_done_timer_wakeup(void *arg); #endif /* CTL_IO_DELAY */ static void ctl_send_datamove_done(union ctl_io *io, int have_lock); static void ctl_datamove_remote_write_cb(struct ctl_ha_dt_req *rq); static int ctl_datamove_remote_dm_write_cb(union ctl_io *io); static void ctl_datamove_remote_write(union ctl_io *io); static int ctl_datamove_remote_dm_read_cb(union ctl_io *io); static void ctl_datamove_remote_read_cb(struct ctl_ha_dt_req *rq); static int ctl_datamove_remote_sgl_setup(union ctl_io *io); static int ctl_datamove_remote_xfer(union ctl_io *io, unsigned command, ctl_ha_dt_cb callback); static void ctl_datamove_remote_read(union ctl_io *io); static void ctl_datamove_remote(union ctl_io *io); static int ctl_process_done(union ctl_io *io); static void ctl_lun_thread(void *arg); static void ctl_thresh_thread(void *arg); static void ctl_work_thread(void *arg); static void ctl_enqueue_incoming(union ctl_io *io); static void ctl_enqueue_rtr(union ctl_io *io); static void ctl_enqueue_done(union ctl_io *io); static void ctl_enqueue_isc(union ctl_io *io); static const struct ctl_cmd_entry * ctl_get_cmd_entry(struct ctl_scsiio *ctsio, int *sa); static const struct ctl_cmd_entry * ctl_validate_command(struct ctl_scsiio *ctsio); static int ctl_cmd_applicable(uint8_t lun_type, const struct ctl_cmd_entry *entry); static uint64_t ctl_get_prkey(struct ctl_lun *lun, uint32_t residx); static void ctl_clr_prkey(struct ctl_lun *lun, uint32_t residx); static void ctl_alloc_prkey(struct ctl_lun *lun, uint32_t residx); static void ctl_set_prkey(struct ctl_lun *lun, uint32_t residx, uint64_t key); /* * Load the serialization table. This isn't very pretty, but is probably * the easiest way to do it. */ #include "ctl_ser_table.c" /* * We only need to define open, close and ioctl routines for this driver. */ static struct cdevsw ctl_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = ctl_open, .d_close = ctl_close, .d_ioctl = ctl_ioctl, .d_name = "ctl", }; MALLOC_DEFINE(M_CTL, "ctlmem", "Memory used for CTL"); static int ctl_module_event_handler(module_t, int /*modeventtype_t*/, void *); static moduledata_t ctl_moduledata = { "ctl", ctl_module_event_handler, NULL }; DECLARE_MODULE(ctl, ctl_moduledata, SI_SUB_CONFIGURE, SI_ORDER_THIRD); MODULE_VERSION(ctl, 1); static struct ctl_frontend ha_frontend = { .name = "ha", }; static void ctl_isc_handler_finish_xfer(struct ctl_softc *ctl_softc, union ctl_ha_msg *msg_info) { struct ctl_scsiio *ctsio; if (msg_info->hdr.original_sc == NULL) { printf("%s: original_sc == NULL!\n", __func__); /* XXX KDM now what? */ return; } ctsio = &msg_info->hdr.original_sc->scsiio; ctsio->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; ctsio->io_hdr.msg_type = CTL_MSG_FINISH_IO; ctsio->io_hdr.status = msg_info->hdr.status; ctsio->scsi_status = msg_info->scsi.scsi_status; ctsio->sense_len = msg_info->scsi.sense_len; ctsio->sense_residual = msg_info->scsi.sense_residual; ctsio->residual = msg_info->scsi.residual; memcpy(&ctsio->sense_data, &msg_info->scsi.sense_data, msg_info->scsi.sense_len); memcpy(&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN].bytes, &msg_info->scsi.lbalen, sizeof(msg_info->scsi.lbalen)); ctl_enqueue_isc((union ctl_io *)ctsio); } static void ctl_isc_handler_finish_ser_only(struct ctl_softc *ctl_softc, union ctl_ha_msg *msg_info) { struct ctl_scsiio *ctsio; if (msg_info->hdr.serializing_sc == NULL) { printf("%s: serializing_sc == NULL!\n", __func__); /* XXX KDM now what? */ return; } ctsio = &msg_info->hdr.serializing_sc->scsiio; ctsio->io_hdr.msg_type = CTL_MSG_FINISH_IO; ctl_enqueue_isc((union ctl_io *)ctsio); } void ctl_isc_announce_lun(struct ctl_lun *lun) { struct ctl_softc *softc = lun->ctl_softc; union ctl_ha_msg *msg; struct ctl_ha_msg_lun_pr_key pr_key; int i, k; if (softc->ha_link != CTL_HA_LINK_ONLINE) return; mtx_lock(&lun->lun_lock); i = sizeof(msg->lun); if (lun->lun_devid) i += lun->lun_devid->len; i += sizeof(pr_key) * lun->pr_key_count; alloc: mtx_unlock(&lun->lun_lock); msg = malloc(i, M_CTL, M_WAITOK); mtx_lock(&lun->lun_lock); k = sizeof(msg->lun); if (lun->lun_devid) k += lun->lun_devid->len; k += sizeof(pr_key) * lun->pr_key_count; if (i < k) { free(msg, M_CTL); i = k; goto alloc; } bzero(&msg->lun, sizeof(msg->lun)); msg->hdr.msg_type = CTL_MSG_LUN_SYNC; msg->hdr.nexus.targ_lun = lun->lun; msg->hdr.nexus.targ_mapped_lun = lun->lun; msg->lun.flags = lun->flags; msg->lun.pr_generation = lun->PRGeneration; msg->lun.pr_res_idx = lun->pr_res_idx; msg->lun.pr_res_type = lun->res_type; msg->lun.pr_key_count = lun->pr_key_count; i = 0; if (lun->lun_devid) { msg->lun.lun_devid_len = lun->lun_devid->len; memcpy(&msg->lun.data[i], lun->lun_devid->data, msg->lun.lun_devid_len); i += msg->lun.lun_devid_len; } for (k = 0; k < CTL_MAX_INITIATORS; k++) { if ((pr_key.pr_key = ctl_get_prkey(lun, k)) == 0) continue; pr_key.pr_iid = k; memcpy(&msg->lun.data[i], &pr_key, sizeof(pr_key)); i += sizeof(pr_key); } mtx_unlock(&lun->lun_lock); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg->port, sizeof(msg->port) + i, M_WAITOK); free(msg, M_CTL); } void ctl_isc_announce_port(struct ctl_port *port) { struct ctl_softc *softc = control_softc; union ctl_ha_msg *msg; int i; if (port->targ_port < softc->port_min || port->targ_port >= softc->port_max || softc->ha_link != CTL_HA_LINK_ONLINE) return; i = sizeof(msg->port) + strlen(port->port_name) + 1; if (port->lun_map) i += sizeof(uint32_t) * CTL_MAX_LUNS; if (port->port_devid) i += port->port_devid->len; if (port->target_devid) i += port->target_devid->len; msg = malloc(i, M_CTL, M_WAITOK); bzero(&msg->port, sizeof(msg->port)); msg->hdr.msg_type = CTL_MSG_PORT_SYNC; msg->hdr.nexus.targ_port = port->targ_port; msg->port.port_type = port->port_type; msg->port.physical_port = port->physical_port; msg->port.virtual_port = port->virtual_port; msg->port.status = port->status; i = 0; msg->port.name_len = sprintf(&msg->port.data[i], "%d:%s", softc->ha_id, port->port_name) + 1; i += msg->port.name_len; if (port->lun_map) { msg->port.lun_map_len = sizeof(uint32_t) * CTL_MAX_LUNS; memcpy(&msg->port.data[i], port->lun_map, msg->port.lun_map_len); i += msg->port.lun_map_len; } if (port->port_devid) { msg->port.port_devid_len = port->port_devid->len; memcpy(&msg->port.data[i], port->port_devid->data, msg->port.port_devid_len); i += msg->port.port_devid_len; } if (port->target_devid) { msg->port.target_devid_len = port->target_devid->len; memcpy(&msg->port.data[i], port->target_devid->data, msg->port.target_devid_len); i += msg->port.target_devid_len; } ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg->port, sizeof(msg->port) + i, M_WAITOK); free(msg, M_CTL); } static void ctl_isc_ha_link_up(struct ctl_softc *softc) { struct ctl_port *port; struct ctl_lun *lun; STAILQ_FOREACH(port, &softc->port_list, links) ctl_isc_announce_port(port); STAILQ_FOREACH(lun, &softc->lun_list, links) ctl_isc_announce_lun(lun); } static void ctl_isc_ha_link_down(struct ctl_softc *softc) { struct ctl_port *port; struct ctl_lun *lun; union ctl_io *io; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_lock(&lun->lun_lock); - lun->flags &= ~CTL_LUN_PEER_SC_PRIMARY; + if (lun->flags & CTL_LUN_PEER_SC_PRIMARY) { + lun->flags &= ~CTL_LUN_PEER_SC_PRIMARY; + ctl_est_ua_all(lun, -1, CTL_UA_ASYM_ACC_CHANGE); + } mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); io = ctl_alloc_io(softc->othersc_pool); mtx_lock(&softc->ctl_lock); ctl_zero_io(io); io->io_hdr.msg_type = CTL_MSG_FAILOVER; io->io_hdr.nexus.targ_mapped_lun = lun->lun; ctl_enqueue_isc(io); } STAILQ_FOREACH(port, &softc->port_list, links) { if (port->targ_port >= softc->port_min && port->targ_port < softc->port_max) continue; port->status &= ~CTL_PORT_STATUS_ONLINE; } mtx_unlock(&softc->ctl_lock); } static void ctl_isc_ua(struct ctl_softc *softc, union ctl_ha_msg *msg, int len) { struct ctl_lun *lun; uint32_t iid = ctl_get_initindex(&msg->hdr.nexus); + mtx_lock(&softc->ctl_lock); if (msg->hdr.nexus.targ_lun < CTL_MAX_LUNS && - (lun = softc->ctl_luns[msg->hdr.nexus.targ_lun]) != NULL) { + (lun = softc->ctl_luns[msg->hdr.nexus.targ_mapped_lun]) != NULL) { + mtx_lock(&lun->lun_lock); + mtx_unlock(&softc->ctl_lock); if (msg->ua.ua_all) { if (msg->ua.ua_set) ctl_est_ua_all(lun, iid, msg->ua.ua_type); else ctl_clr_ua_all(lun, iid, msg->ua.ua_type); } else { if (msg->ua.ua_set) ctl_est_ua(lun, iid, msg->ua.ua_type); else ctl_clr_ua(lun, iid, msg->ua.ua_type); } - } + mtx_unlock(&lun->lun_lock); + } else + mtx_unlock(&softc->ctl_lock); } static void ctl_isc_lun_sync(struct ctl_softc *softc, union ctl_ha_msg *msg, int len) { struct ctl_lun *lun; struct ctl_ha_msg_lun_pr_key pr_key; int i, k; + ctl_lun_flags oflags; + uint32_t targ_lun; - lun = softc->ctl_luns[msg->hdr.nexus.targ_lun]; - if (lun == NULL) { - CTL_DEBUG_PRINT(("%s: Unknown LUN %d\n", __func__, - msg->hdr.nexus.targ_lun)); + targ_lun = msg->hdr.nexus.targ_mapped_lun; + mtx_lock(&softc->ctl_lock); + if ((targ_lun >= CTL_MAX_LUNS) || + ((lun = softc->ctl_luns[targ_lun]) == NULL)) { + mtx_unlock(&softc->ctl_lock); + return; + } + mtx_lock(&lun->lun_lock); + mtx_unlock(&softc->ctl_lock); + if (lun->flags & CTL_LUN_DISABLED) { + mtx_unlock(&lun->lun_lock); + return; + } + i = (lun->lun_devid != NULL) ? lun->lun_devid->len : 0; + if (msg->lun.lun_devid_len != i || (i > 0 && + memcmp(&msg->lun.data[0], lun->lun_devid->data, i) != 0)) { + mtx_unlock(&lun->lun_lock); + printf("%s: Received conflicting HA LUN %d\n", + __func__, msg->hdr.nexus.targ_lun); + return; } else { - mtx_lock(&lun->lun_lock); - i = (lun->lun_devid != NULL) ? lun->lun_devid->len : 0; - if (msg->lun.lun_devid_len != i || (i > 0 && - memcmp(&msg->lun.data[0], lun->lun_devid->data, i) != 0)) { - mtx_unlock(&lun->lun_lock); - printf("%s: Received conflicting HA LUN %d\n", - __func__, msg->hdr.nexus.targ_lun); - return; - } else { - /* Record whether peer is primary. */ - if ((msg->lun.flags & CTL_LUN_PRIMARY_SC) && - (msg->lun.flags & CTL_LUN_DISABLED) == 0) - lun->flags |= CTL_LUN_PEER_SC_PRIMARY; - else - lun->flags &= ~CTL_LUN_PEER_SC_PRIMARY; + /* Record whether peer is primary. */ + oflags = lun->flags; + if ((msg->lun.flags & CTL_LUN_PRIMARY_SC) && + (msg->lun.flags & CTL_LUN_DISABLED) == 0) + lun->flags |= CTL_LUN_PEER_SC_PRIMARY; + else + lun->flags &= ~CTL_LUN_PEER_SC_PRIMARY; + if (oflags != lun->flags) + ctl_est_ua_all(lun, -1, CTL_UA_ASYM_ACC_CHANGE); - /* If peer is primary and we are not -- use data */ - if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0 && - (lun->flags & CTL_LUN_PEER_SC_PRIMARY)) { - lun->PRGeneration = msg->lun.pr_generation; - lun->pr_res_idx = msg->lun.pr_res_idx; - lun->res_type = msg->lun.pr_res_type; - lun->pr_key_count = msg->lun.pr_key_count; - for (k = 0; k < CTL_MAX_INITIATORS; k++) - ctl_clr_prkey(lun, k); - for (k = 0; k < msg->lun.pr_key_count; k++) { - memcpy(&pr_key, &msg->lun.data[i], - sizeof(pr_key)); - ctl_alloc_prkey(lun, pr_key.pr_iid); - ctl_set_prkey(lun, pr_key.pr_iid, - pr_key.pr_key); - i += sizeof(pr_key); - } + /* If peer is primary and we are not -- use data */ + if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0 && + (lun->flags & CTL_LUN_PEER_SC_PRIMARY)) { + lun->PRGeneration = msg->lun.pr_generation; + lun->pr_res_idx = msg->lun.pr_res_idx; + lun->res_type = msg->lun.pr_res_type; + lun->pr_key_count = msg->lun.pr_key_count; + for (k = 0; k < CTL_MAX_INITIATORS; k++) + ctl_clr_prkey(lun, k); + for (k = 0; k < msg->lun.pr_key_count; k++) { + memcpy(&pr_key, &msg->lun.data[i], + sizeof(pr_key)); + ctl_alloc_prkey(lun, pr_key.pr_iid); + ctl_set_prkey(lun, pr_key.pr_iid, + pr_key.pr_key); + i += sizeof(pr_key); } + } - mtx_unlock(&lun->lun_lock); - CTL_DEBUG_PRINT(("%s: Known LUN %d, peer is %s\n", - __func__, msg->hdr.nexus.targ_lun, - (msg->lun.flags & CTL_LUN_PRIMARY_SC) ? - "primary" : "secondary")); + mtx_unlock(&lun->lun_lock); + CTL_DEBUG_PRINT(("%s: Known LUN %d, peer is %s\n", + __func__, msg->hdr.nexus.targ_lun, + (msg->lun.flags & CTL_LUN_PRIMARY_SC) ? + "primary" : "secondary")); - /* If we are primary but peer doesn't know -- notify */ - if ((lun->flags & CTL_LUN_PRIMARY_SC) && - (msg->lun.flags & CTL_LUN_PEER_SC_PRIMARY) == 0) - ctl_isc_announce_lun(lun); - } + /* If we are primary but peer doesn't know -- notify */ + if ((lun->flags & CTL_LUN_PRIMARY_SC) && + (msg->lun.flags & CTL_LUN_PEER_SC_PRIMARY) == 0) + ctl_isc_announce_lun(lun); } } static void ctl_isc_port_sync(struct ctl_softc *softc, union ctl_ha_msg *msg, int len) { struct ctl_port *port; int i, new; port = softc->ctl_ports[msg->hdr.nexus.targ_port]; if (port == NULL) { CTL_DEBUG_PRINT(("%s: New port %d\n", __func__, msg->hdr.nexus.targ_port)); new = 1; port = malloc(sizeof(*port), M_CTL, M_WAITOK | M_ZERO); port->frontend = &ha_frontend; port->targ_port = msg->hdr.nexus.targ_port; } else if (port->frontend == &ha_frontend) { CTL_DEBUG_PRINT(("%s: Updated port %d\n", __func__, msg->hdr.nexus.targ_port)); new = 0; } else { printf("%s: Received conflicting HA port %d\n", __func__, msg->hdr.nexus.targ_port); return; } port->port_type = msg->port.port_type; port->physical_port = msg->port.physical_port; port->virtual_port = msg->port.virtual_port; port->status = msg->port.status; i = 0; free(port->port_name, M_CTL); port->port_name = strndup(&msg->port.data[i], msg->port.name_len, M_CTL); i += msg->port.name_len; if (msg->port.lun_map_len != 0) { if (port->lun_map == NULL) port->lun_map = malloc(sizeof(uint32_t) * CTL_MAX_LUNS, M_CTL, M_WAITOK); memcpy(port->lun_map, &msg->port.data[i], sizeof(uint32_t) * CTL_MAX_LUNS); i += msg->port.lun_map_len; } else { free(port->lun_map, M_CTL); port->lun_map = NULL; } if (msg->port.port_devid_len != 0) { if (port->port_devid == NULL || port->port_devid->len != msg->port.port_devid_len) { free(port->port_devid, M_CTL); port->port_devid = malloc(sizeof(struct ctl_devid) + msg->port.port_devid_len, M_CTL, M_WAITOK); } memcpy(port->port_devid->data, &msg->port.data[i], msg->port.port_devid_len); port->port_devid->len = msg->port.port_devid_len; i += msg->port.port_devid_len; } else { free(port->port_devid, M_CTL); port->port_devid = NULL; } if (msg->port.target_devid_len != 0) { if (port->target_devid == NULL || port->target_devid->len != msg->port.target_devid_len) { free(port->target_devid, M_CTL); port->target_devid = malloc(sizeof(struct ctl_devid) + msg->port.target_devid_len, M_CTL, M_WAITOK); } memcpy(port->target_devid->data, &msg->port.data[i], msg->port.target_devid_len); port->target_devid->len = msg->port.target_devid_len; i += msg->port.target_devid_len; } else { free(port->port_devid, M_CTL); port->port_devid = NULL; } if (new) { if (ctl_port_register(port) != 0) { printf("%s: ctl_port_register() failed with error\n", __func__); } } } /* * ISC (Inter Shelf Communication) event handler. Events from the HA * subsystem come in here. */ static void ctl_isc_event_handler(ctl_ha_channel channel, ctl_ha_event event, int param) { struct ctl_softc *softc; union ctl_io *io; struct ctl_prio *presio; ctl_ha_status isc_status; softc = control_softc; CTL_DEBUG_PRINT(("CTL: Isc Msg event %d\n", event)); if (event == CTL_HA_EVT_MSG_RECV) { union ctl_ha_msg *msg, msgbuf; if (param > sizeof(msgbuf)) msg = malloc(param, M_CTL, M_WAITOK); else msg = &msgbuf; isc_status = ctl_ha_msg_recv(CTL_HA_CHAN_CTL, msg, param, M_WAITOK); if (isc_status != CTL_HA_STATUS_SUCCESS) { printf("%s: Error receiving message: %d\n", __func__, isc_status); if (msg != &msgbuf) free(msg, M_CTL); return; } CTL_DEBUG_PRINT(("CTL: msg_type %d\n", msg->msg_type)); switch (msg->hdr.msg_type) { case CTL_MSG_SERIALIZE: io = ctl_alloc_io(softc->othersc_pool); ctl_zero_io(io); // populate ctsio from msg io->io_hdr.io_type = CTL_IO_SCSI; io->io_hdr.msg_type = CTL_MSG_SERIALIZE; io->io_hdr.original_sc = msg->hdr.original_sc; io->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC | CTL_FLAG_IO_ACTIVE; /* * If we're in serialization-only mode, we don't * want to go through full done processing. Thus * the COPY flag. * * XXX KDM add another flag that is more specific. */ if (softc->ha_mode != CTL_HA_MODE_XFER) io->io_hdr.flags |= CTL_FLAG_INT_COPY; io->io_hdr.nexus = msg->hdr.nexus; #if 0 printf("port %u, iid %u, lun %u\n", io->io_hdr.nexus.targ_port, io->io_hdr.nexus.initid, io->io_hdr.nexus.targ_lun); #endif io->scsiio.tag_num = msg->scsi.tag_num; io->scsiio.tag_type = msg->scsi.tag_type; #ifdef CTL_TIME_IO io->io_hdr.start_time = time_uptime; getbintime(&io->io_hdr.start_bt); #endif /* CTL_TIME_IO */ io->scsiio.cdb_len = msg->scsi.cdb_len; memcpy(io->scsiio.cdb, msg->scsi.cdb, CTL_MAX_CDBLEN); if (softc->ha_mode == CTL_HA_MODE_XFER) { const struct ctl_cmd_entry *entry; entry = ctl_get_cmd_entry(&io->scsiio, NULL); io->io_hdr.flags &= ~CTL_FLAG_DATA_MASK; io->io_hdr.flags |= entry->flags & CTL_FLAG_DATA_MASK; } ctl_enqueue_isc(io); break; /* Performed on the Originating SC, XFER mode only */ case CTL_MSG_DATAMOVE: { struct ctl_sg_entry *sgl; int i, j; io = msg->hdr.original_sc; if (io == NULL) { printf("%s: original_sc == NULL!\n", __func__); /* XXX KDM do something here */ break; } io->io_hdr.msg_type = CTL_MSG_DATAMOVE; io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; /* * Keep track of this, we need to send it back over * when the datamove is complete. */ io->io_hdr.serializing_sc = msg->hdr.serializing_sc; if (msg->dt.sg_sequence == 0) { i = msg->dt.kern_sg_entries + io->scsiio.kern_data_len / CTL_HA_DATAMOVE_SEGMENT + 1; sgl = malloc(sizeof(*sgl) * i, M_CTL, M_WAITOK | M_ZERO); io->io_hdr.remote_sglist = sgl; io->io_hdr.local_sglist = &sgl[msg->dt.kern_sg_entries]; io->scsiio.kern_data_ptr = (uint8_t *)sgl; io->scsiio.kern_sg_entries = msg->dt.kern_sg_entries; io->scsiio.rem_sg_entries = msg->dt.kern_sg_entries; io->scsiio.kern_data_len = msg->dt.kern_data_len; io->scsiio.kern_total_len = msg->dt.kern_total_len; io->scsiio.kern_data_resid = msg->dt.kern_data_resid; io->scsiio.kern_rel_offset = msg->dt.kern_rel_offset; io->io_hdr.flags &= ~CTL_FLAG_BUS_ADDR; io->io_hdr.flags |= msg->dt.flags & CTL_FLAG_BUS_ADDR; } else sgl = (struct ctl_sg_entry *) io->scsiio.kern_data_ptr; for (i = msg->dt.sent_sg_entries, j = 0; i < (msg->dt.sent_sg_entries + msg->dt.cur_sg_entries); i++, j++) { sgl[i].addr = msg->dt.sg_list[j].addr; sgl[i].len = msg->dt.sg_list[j].len; #if 0 printf("%s: L: %p,%d -> %p,%d j=%d, i=%d\n", __func__, msg->dt.sg_list[j].addr, msg->dt.sg_list[j].len, sgl[i].addr, sgl[i].len, j, i); #endif } /* * If this is the last piece of the I/O, we've got * the full S/G list. Queue processing in the thread. * Otherwise wait for the next piece. */ if (msg->dt.sg_last != 0) ctl_enqueue_isc(io); break; } /* Performed on the Serializing (primary) SC, XFER mode only */ case CTL_MSG_DATAMOVE_DONE: { if (msg->hdr.serializing_sc == NULL) { printf("%s: serializing_sc == NULL!\n", __func__); /* XXX KDM now what? */ break; } /* * We grab the sense information here in case * there was a failure, so we can return status * back to the initiator. */ io = msg->hdr.serializing_sc; io->io_hdr.msg_type = CTL_MSG_DATAMOVE_DONE; io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; io->io_hdr.port_status = msg->scsi.fetd_status; io->scsiio.residual = msg->scsi.residual; if (msg->hdr.status != CTL_STATUS_NONE) { io->io_hdr.status = msg->hdr.status; io->scsiio.scsi_status = msg->scsi.scsi_status; io->scsiio.sense_len = msg->scsi.sense_len; io->scsiio.sense_residual =msg->scsi.sense_residual; memcpy(&io->scsiio.sense_data, &msg->scsi.sense_data, msg->scsi.sense_len); } ctl_enqueue_isc(io); break; } /* Preformed on Originating SC, SER_ONLY mode */ case CTL_MSG_R2R: io = msg->hdr.original_sc; if (io == NULL) { printf("%s: original_sc == NULL!\n", __func__); break; } io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; io->io_hdr.msg_type = CTL_MSG_R2R; io->io_hdr.serializing_sc = msg->hdr.serializing_sc; ctl_enqueue_isc(io); break; /* * Performed on Serializing(i.e. primary SC) SC in SER_ONLY * mode. * Performed on the Originating (i.e. secondary) SC in XFER * mode */ case CTL_MSG_FINISH_IO: if (softc->ha_mode == CTL_HA_MODE_XFER) ctl_isc_handler_finish_xfer(softc, msg); else ctl_isc_handler_finish_ser_only(softc, msg); break; /* Preformed on Originating SC */ case CTL_MSG_BAD_JUJU: io = msg->hdr.original_sc; if (io == NULL) { printf("%s: Bad JUJU!, original_sc is NULL!\n", __func__); break; } ctl_copy_sense_data(msg, io); /* * IO should have already been cleaned up on other * SC so clear this flag so we won't send a message * back to finish the IO there. */ io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC; io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; /* io = msg->hdr.serializing_sc; */ io->io_hdr.msg_type = CTL_MSG_BAD_JUJU; ctl_enqueue_isc(io); break; /* Handle resets sent from the other side */ case CTL_MSG_MANAGE_TASKS: { struct ctl_taskio *taskio; taskio = (struct ctl_taskio *)ctl_alloc_io( softc->othersc_pool); ctl_zero_io((union ctl_io *)taskio); taskio->io_hdr.io_type = CTL_IO_TASK; taskio->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC; taskio->io_hdr.nexus = msg->hdr.nexus; taskio->task_action = msg->task.task_action; taskio->tag_num = msg->task.tag_num; taskio->tag_type = msg->task.tag_type; #ifdef CTL_TIME_IO taskio->io_hdr.start_time = time_uptime; getbintime(&taskio->io_hdr.start_bt); #endif /* CTL_TIME_IO */ ctl_run_task((union ctl_io *)taskio); break; } /* Persistent Reserve action which needs attention */ case CTL_MSG_PERS_ACTION: presio = (struct ctl_prio *)ctl_alloc_io( softc->othersc_pool); ctl_zero_io((union ctl_io *)presio); presio->io_hdr.msg_type = CTL_MSG_PERS_ACTION; presio->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC; presio->io_hdr.nexus = msg->hdr.nexus; presio->pr_msg = msg->pr; ctl_enqueue_isc((union ctl_io *)presio); break; case CTL_MSG_UA: ctl_isc_ua(softc, msg, param); break; case CTL_MSG_PORT_SYNC: ctl_isc_port_sync(softc, msg, param); break; case CTL_MSG_LUN_SYNC: ctl_isc_lun_sync(softc, msg, param); break; default: printf("Received HA message of unknown type %d\n", msg->hdr.msg_type); break; } if (msg != &msgbuf) free(msg, M_CTL); } else if (event == CTL_HA_EVT_LINK_CHANGE) { printf("CTL: HA link status changed from %d to %d\n", softc->ha_link, param); if (param == softc->ha_link) return; if (softc->ha_link == CTL_HA_LINK_ONLINE) { softc->ha_link = param; ctl_isc_ha_link_down(softc); } else { softc->ha_link = param; if (softc->ha_link == CTL_HA_LINK_ONLINE) ctl_isc_ha_link_up(softc); } return; } else { printf("ctl_isc_event_handler: Unknown event %d\n", event); return; } } static void ctl_copy_sense_data(union ctl_ha_msg *src, union ctl_io *dest) { memcpy(&dest->scsiio.sense_data, &src->scsi.sense_data, src->scsi.sense_len); dest->scsiio.scsi_status = src->scsi.scsi_status; dest->scsiio.sense_len = src->scsi.sense_len; dest->io_hdr.status = src->hdr.status; } static void ctl_copy_sense_data_back(union ctl_io *src, union ctl_ha_msg *dest) { memcpy(&dest->scsi.sense_data, &src->scsiio.sense_data, src->scsiio.sense_len); dest->scsi.scsi_status = src->scsiio.scsi_status; dest->scsi.sense_len = src->scsiio.sense_len; dest->hdr.status = src->io_hdr.status; } static void ctl_est_ua(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua) { struct ctl_softc *softc = lun->ctl_softc; ctl_ua_type *pu; if (initidx < softc->init_min || initidx >= softc->init_max) return; mtx_assert(&lun->lun_lock, MA_OWNED); pu = lun->pending_ua[initidx / CTL_MAX_INIT_PER_PORT]; if (pu == NULL) return; pu[initidx % CTL_MAX_INIT_PER_PORT] |= ua; } static void ctl_est_ua_all(struct ctl_lun *lun, uint32_t except, ctl_ua_type ua) { struct ctl_softc *softc = lun->ctl_softc; int i, j; mtx_assert(&lun->lun_lock, MA_OWNED); for (i = softc->port_min; i < softc->port_max; i++) { if (lun->pending_ua[i] == NULL) continue; for (j = 0; j < CTL_MAX_INIT_PER_PORT; j++) { if (i * CTL_MAX_INIT_PER_PORT + j == except) continue; lun->pending_ua[i][j] |= ua; } } } static void ctl_clr_ua(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua) { struct ctl_softc *softc = lun->ctl_softc; ctl_ua_type *pu; if (initidx < softc->init_min || initidx >= softc->init_max) return; mtx_assert(&lun->lun_lock, MA_OWNED); pu = lun->pending_ua[initidx / CTL_MAX_INIT_PER_PORT]; if (pu == NULL) return; pu[initidx % CTL_MAX_INIT_PER_PORT] &= ~ua; } static void ctl_clr_ua_all(struct ctl_lun *lun, uint32_t except, ctl_ua_type ua) { struct ctl_softc *softc = lun->ctl_softc; int i, j; mtx_assert(&lun->lun_lock, MA_OWNED); for (i = softc->port_min; i < softc->port_max; i++) { if (lun->pending_ua[i] == NULL) continue; for (j = 0; j < CTL_MAX_INIT_PER_PORT; j++) { if (i * CTL_MAX_INIT_PER_PORT + j == except) continue; lun->pending_ua[i][j] &= ~ua; } } } static void ctl_clr_ua_allluns(struct ctl_softc *ctl_softc, uint32_t initidx, ctl_ua_type ua_type) { struct ctl_lun *lun; mtx_assert(&ctl_softc->ctl_lock, MA_OWNED); STAILQ_FOREACH(lun, &ctl_softc->lun_list, links) { mtx_lock(&lun->lun_lock); ctl_clr_ua(lun, initidx, ua_type); mtx_unlock(&lun->lun_lock); } } static int ctl_ha_role_sysctl(SYSCTL_HANDLER_ARGS) { struct ctl_softc *softc = (struct ctl_softc *)arg1; struct ctl_lun *lun; struct ctl_lun_req ireq; int error, value; value = (softc->flags & CTL_FLAG_ACTIVE_SHELF) ? 0 : 1; error = sysctl_handle_int(oidp, &value, 0, req); if ((error != 0) || (req->newptr == NULL)) return (error); mtx_lock(&softc->ctl_lock); if (value == 0) softc->flags |= CTL_FLAG_ACTIVE_SHELF; else softc->flags &= ~CTL_FLAG_ACTIVE_SHELF; STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_unlock(&softc->ctl_lock); bzero(&ireq, sizeof(ireq)); ireq.reqtype = CTL_LUNREQ_MODIFY; ireq.reqdata.modify.lun_id = lun->lun; lun->backend->ioctl(NULL, CTL_LUN_REQ, (caddr_t)&ireq, 0, curthread); if (ireq.status != CTL_LUN_OK) { printf("%s: CTL_LUNREQ_MODIFY returned %d '%s'\n", __func__, ireq.status, ireq.error_str); } mtx_lock(&softc->ctl_lock); } mtx_unlock(&softc->ctl_lock); return (0); } static int ctl_init(void) { struct ctl_softc *softc; void *other_pool; int i, error, retval; retval = 0; control_softc = malloc(sizeof(*control_softc), M_DEVBUF, M_WAITOK | M_ZERO); softc = control_softc; softc->dev = make_dev(&ctl_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0600, "cam/ctl"); softc->dev->si_drv1 = softc; sysctl_ctx_init(&softc->sysctl_ctx); softc->sysctl_tree = SYSCTL_ADD_NODE(&softc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_cam), OID_AUTO, "ctl", CTLFLAG_RD, 0, "CAM Target Layer"); if (softc->sysctl_tree == NULL) { printf("%s: unable to allocate sysctl tree\n", __func__); destroy_dev(softc->dev); free(control_softc, M_DEVBUF); control_softc = NULL; return (ENOMEM); } mtx_init(&softc->ctl_lock, "CTL mutex", NULL, MTX_DEF); softc->io_zone = uma_zcreate("CTL IO", sizeof(union ctl_io), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); softc->open_count = 0; /* * Default to actually sending a SYNCHRONIZE CACHE command down to * the drive. */ softc->flags = CTL_FLAG_REAL_SYNC; SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "ha_mode", CTLFLAG_RDTUN, (int *)&softc->ha_mode, 0, "HA mode (0 - act/stby, 1 - serialize only, 2 - xfer)"); /* * In Copan's HA scheme, the "master" and "slave" roles are * figured out through the slot the controller is in. Although it * is an active/active system, someone has to be in charge. */ SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "ha_id", CTLFLAG_RDTUN, &softc->ha_id, 0, "HA head ID (0 - no HA)"); if (softc->ha_id == 0 || softc->ha_id > NUM_TARGET_PORT_GROUPS) { softc->flags |= CTL_FLAG_ACTIVE_SHELF; softc->is_single = 1; softc->port_cnt = CTL_MAX_PORTS; softc->port_min = 0; } else { softc->port_cnt = CTL_MAX_PORTS / NUM_TARGET_PORT_GROUPS; softc->port_min = (softc->ha_id - 1) * softc->port_cnt; } softc->port_max = softc->port_min + softc->port_cnt; softc->init_min = softc->port_min * CTL_MAX_INIT_PER_PORT; softc->init_max = softc->port_max * CTL_MAX_INIT_PER_PORT; SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "ha_link", CTLFLAG_RD, (int *)&softc->ha_link, 0, "HA link state (0 - offline, 1 - unknown, 2 - online)"); STAILQ_INIT(&softc->lun_list); STAILQ_INIT(&softc->pending_lun_queue); STAILQ_INIT(&softc->fe_list); STAILQ_INIT(&softc->port_list); STAILQ_INIT(&softc->be_list); ctl_tpc_init(softc); if (ctl_pool_create(softc, "othersc", CTL_POOL_ENTRIES_OTHER_SC, &other_pool) != 0) { printf("ctl: can't allocate %d entry other SC pool, " "exiting\n", CTL_POOL_ENTRIES_OTHER_SC); return (ENOMEM); } softc->othersc_pool = other_pool; if (worker_threads <= 0) worker_threads = max(1, mp_ncpus / 4); if (worker_threads > CTL_MAX_THREADS) worker_threads = CTL_MAX_THREADS; for (i = 0; i < worker_threads; i++) { struct ctl_thread *thr = &softc->threads[i]; mtx_init(&thr->queue_lock, "CTL queue mutex", NULL, MTX_DEF); thr->ctl_softc = softc; STAILQ_INIT(&thr->incoming_queue); STAILQ_INIT(&thr->rtr_queue); STAILQ_INIT(&thr->done_queue); STAILQ_INIT(&thr->isc_queue); error = kproc_kthread_add(ctl_work_thread, thr, &softc->ctl_proc, &thr->thread, 0, 0, "ctl", "work%d", i); if (error != 0) { printf("error creating CTL work thread!\n"); ctl_pool_free(other_pool); return (error); } } error = kproc_kthread_add(ctl_lun_thread, softc, &softc->ctl_proc, NULL, 0, 0, "ctl", "lun"); if (error != 0) { printf("error creating CTL lun thread!\n"); ctl_pool_free(other_pool); return (error); } error = kproc_kthread_add(ctl_thresh_thread, softc, &softc->ctl_proc, NULL, 0, 0, "ctl", "thresh"); if (error != 0) { printf("error creating CTL threshold thread!\n"); ctl_pool_free(other_pool); return (error); } SYSCTL_ADD_PROC(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "ha_role", CTLTYPE_INT | CTLFLAG_RWTUN, softc, 0, ctl_ha_role_sysctl, "I", "HA role for this head"); if (softc->is_single == 0) { ctl_frontend_register(&ha_frontend); if (ctl_ha_msg_init(softc) != CTL_HA_STATUS_SUCCESS) { printf("ctl_init: ctl_ha_msg_init failed.\n"); softc->is_single = 1; } else if (ctl_ha_msg_register(CTL_HA_CHAN_CTL, ctl_isc_event_handler) != CTL_HA_STATUS_SUCCESS) { printf("ctl_init: ctl_ha_msg_register failed.\n"); softc->is_single = 1; } } return (0); } void ctl_shutdown(void) { struct ctl_softc *softc; struct ctl_lun *lun, *next_lun; softc = (struct ctl_softc *)control_softc; if (softc->is_single == 0) { if (ctl_ha_msg_deregister(CTL_HA_CHAN_CTL) != CTL_HA_STATUS_SUCCESS) { printf("ctl_shutdown: ctl_ha_msg_deregister failed.\n"); } if (ctl_ha_msg_shutdown(softc) != CTL_HA_STATUS_SUCCESS) { printf("ctl_shutdown: ctl_ha_msg_shutdown failed.\n"); } ctl_frontend_deregister(&ha_frontend); } mtx_lock(&softc->ctl_lock); /* * Free up each LUN. */ for (lun = STAILQ_FIRST(&softc->lun_list); lun != NULL; lun = next_lun){ next_lun = STAILQ_NEXT(lun, links); ctl_free_lun(lun); } mtx_unlock(&softc->ctl_lock); #if 0 ctl_shutdown_thread(softc->work_thread); mtx_destroy(&softc->queue_lock); #endif ctl_tpc_shutdown(softc); uma_zdestroy(softc->io_zone); mtx_destroy(&softc->ctl_lock); destroy_dev(softc->dev); sysctl_ctx_free(&softc->sysctl_ctx); free(control_softc, M_DEVBUF); control_softc = NULL; } static int ctl_module_event_handler(module_t mod, int what, void *arg) { switch (what) { case MOD_LOAD: return (ctl_init()); case MOD_UNLOAD: return (EBUSY); default: return (EOPNOTSUPP); } } /* * XXX KDM should we do some access checks here? Bump a reference count to * prevent a CTL module from being unloaded while someone has it open? */ static int ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } static int ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } /* * Remove an initiator by port number and initiator ID. * Returns 0 for success, -1 for failure. */ int ctl_remove_initiator(struct ctl_port *port, int iid) { struct ctl_softc *softc = control_softc; mtx_assert(&softc->ctl_lock, MA_NOTOWNED); if (iid > CTL_MAX_INIT_PER_PORT) { printf("%s: initiator ID %u > maximun %u!\n", __func__, iid, CTL_MAX_INIT_PER_PORT); return (-1); } mtx_lock(&softc->ctl_lock); port->wwpn_iid[iid].in_use--; port->wwpn_iid[iid].last_use = time_uptime; mtx_unlock(&softc->ctl_lock); return (0); } /* * Add an initiator to the initiator map. * Returns iid for success, < 0 for failure. */ int ctl_add_initiator(struct ctl_port *port, int iid, uint64_t wwpn, char *name) { struct ctl_softc *softc = control_softc; time_t best_time; int i, best; mtx_assert(&softc->ctl_lock, MA_NOTOWNED); if (iid >= CTL_MAX_INIT_PER_PORT) { printf("%s: WWPN %#jx initiator ID %u > maximum %u!\n", __func__, wwpn, iid, CTL_MAX_INIT_PER_PORT); free(name, M_CTL); return (-1); } mtx_lock(&softc->ctl_lock); if (iid < 0 && (wwpn != 0 || name != NULL)) { for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) { if (wwpn != 0 && wwpn == port->wwpn_iid[i].wwpn) { iid = i; break; } if (name != NULL && port->wwpn_iid[i].name != NULL && strcmp(name, port->wwpn_iid[i].name) == 0) { iid = i; break; } } } if (iid < 0) { for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) { if (port->wwpn_iid[i].in_use == 0 && port->wwpn_iid[i].wwpn == 0 && port->wwpn_iid[i].name == NULL) { iid = i; break; } } } if (iid < 0) { best = -1; best_time = INT32_MAX; for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) { if (port->wwpn_iid[i].in_use == 0) { if (port->wwpn_iid[i].last_use < best_time) { best = i; best_time = port->wwpn_iid[i].last_use; } } } iid = best; } if (iid < 0) { mtx_unlock(&softc->ctl_lock); free(name, M_CTL); return (-2); } if (port->wwpn_iid[iid].in_use > 0 && (wwpn != 0 || name != NULL)) { /* * This is not an error yet. */ if (wwpn != 0 && wwpn == port->wwpn_iid[iid].wwpn) { #if 0 printf("%s: port %d iid %u WWPN %#jx arrived" " again\n", __func__, port->targ_port, iid, (uintmax_t)wwpn); #endif goto take; } if (name != NULL && port->wwpn_iid[iid].name != NULL && strcmp(name, port->wwpn_iid[iid].name) == 0) { #if 0 printf("%s: port %d iid %u name '%s' arrived" " again\n", __func__, port->targ_port, iid, name); #endif goto take; } /* * This is an error, but what do we do about it? The * driver is telling us we have a new WWPN for this * initiator ID, so we pretty much need to use it. */ printf("%s: port %d iid %u WWPN %#jx '%s' arrived," " but WWPN %#jx '%s' is still at that address\n", __func__, port->targ_port, iid, wwpn, name, (uintmax_t)port->wwpn_iid[iid].wwpn, port->wwpn_iid[iid].name); /* * XXX KDM clear have_ca and ua_pending on each LUN for * this initiator. */ } take: free(port->wwpn_iid[iid].name, M_CTL); port->wwpn_iid[iid].name = name; port->wwpn_iid[iid].wwpn = wwpn; port->wwpn_iid[iid].in_use++; mtx_unlock(&softc->ctl_lock); return (iid); } static int ctl_create_iid(struct ctl_port *port, int iid, uint8_t *buf) { int len; switch (port->port_type) { case CTL_PORT_FC: { struct scsi_transportid_fcp *id = (struct scsi_transportid_fcp *)buf; if (port->wwpn_iid[iid].wwpn == 0) return (0); memset(id, 0, sizeof(*id)); id->format_protocol = SCSI_PROTO_FC; scsi_u64to8b(port->wwpn_iid[iid].wwpn, id->n_port_name); return (sizeof(*id)); } case CTL_PORT_ISCSI: { struct scsi_transportid_iscsi_port *id = (struct scsi_transportid_iscsi_port *)buf; if (port->wwpn_iid[iid].name == NULL) return (0); memset(id, 0, 256); id->format_protocol = SCSI_TRN_ISCSI_FORMAT_PORT | SCSI_PROTO_ISCSI; len = strlcpy(id->iscsi_name, port->wwpn_iid[iid].name, 252) + 1; len = roundup2(min(len, 252), 4); scsi_ulto2b(len, id->additional_length); return (sizeof(*id) + len); } case CTL_PORT_SAS: { struct scsi_transportid_sas *id = (struct scsi_transportid_sas *)buf; if (port->wwpn_iid[iid].wwpn == 0) return (0); memset(id, 0, sizeof(*id)); id->format_protocol = SCSI_PROTO_SAS; scsi_u64to8b(port->wwpn_iid[iid].wwpn, id->sas_address); return (sizeof(*id)); } default: { struct scsi_transportid_spi *id = (struct scsi_transportid_spi *)buf; memset(id, 0, sizeof(*id)); id->format_protocol = SCSI_PROTO_SPI; scsi_ulto2b(iid, id->scsi_addr); scsi_ulto2b(port->targ_port, id->rel_trgt_port_id); return (sizeof(*id)); } } } /* * Serialize a command that went down the "wrong" side, and so was sent to * this controller for execution. The logic is a little different than the * standard case in ctl_scsiio_precheck(). Errors in this case need to get * sent back to the other side, but in the success case, we execute the * command on this side (XFER mode) or tell the other side to execute it * (SER_ONLY mode). */ static int ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio) { struct ctl_softc *softc; union ctl_ha_msg msg_info; struct ctl_lun *lun; const struct ctl_cmd_entry *entry; int retval = 0; uint32_t targ_lun; softc = control_softc; targ_lun = ctsio->io_hdr.nexus.targ_mapped_lun; + mtx_lock(&softc->ctl_lock); if ((targ_lun < CTL_MAX_LUNS) && ((lun = softc->ctl_luns[targ_lun]) != NULL)) { + mtx_lock(&lun->lun_lock); + mtx_unlock(&softc->ctl_lock); /* * If the LUN is invalid, pretend that it doesn't exist. * It will go away as soon as all pending I/O has been * completed. */ - mtx_lock(&lun->lun_lock); if (lun->flags & CTL_LUN_DISABLED) { mtx_unlock(&lun->lun_lock); lun = NULL; } - } else + } else { + mtx_unlock(&softc->ctl_lock); lun = NULL; + } if (lun == NULL) { /* * The other node would not send this request to us unless * received announce that we are primary node for this LUN. * If this LUN does not exist now, it is probably result of * a race, so respond to initiator in the most opaque way. */ ctl_set_busy(ctsio); ctl_copy_sense_data_back((union ctl_io *)ctsio, &msg_info); msg_info.hdr.original_sc = ctsio->io_hdr.original_sc; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.scsi), M_WAITOK); return(1); } entry = ctl_get_cmd_entry(ctsio, NULL); if (ctl_scsiio_lun_check(lun, entry, ctsio) != 0) { mtx_unlock(&lun->lun_lock); ctl_copy_sense_data_back((union ctl_io *)ctsio, &msg_info); msg_info.hdr.original_sc = ctsio->io_hdr.original_sc; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.scsi), M_WAITOK); return(1); } ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr = lun; ctsio->io_hdr.ctl_private[CTL_PRIV_BACKEND_LUN].ptr = lun->be_lun; /* * Every I/O goes into the OOA queue for a * particular LUN, and stays there until completion. */ #ifdef CTL_TIME_IO if (TAILQ_EMPTY(&lun->ooa_queue)) lun->idle_time += getsbinuptime() - lun->last_busy; #endif TAILQ_INSERT_TAIL(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); switch (ctl_check_ooa(lun, (union ctl_io *)ctsio, (union ctl_io *)TAILQ_PREV(&ctsio->io_hdr, ctl_ooaq, ooa_links))) { case CTL_ACTION_BLOCK: ctsio->io_hdr.flags |= CTL_FLAG_BLOCKED; TAILQ_INSERT_TAIL(&lun->blocked_queue, &ctsio->io_hdr, blocked_links); mtx_unlock(&lun->lun_lock); break; case CTL_ACTION_PASS: case CTL_ACTION_SKIP: if (softc->ha_mode == CTL_HA_MODE_XFER) { ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; ctl_enqueue_rtr((union ctl_io *)ctsio); mtx_unlock(&lun->lun_lock); } else { ctsio->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; mtx_unlock(&lun->lun_lock); /* send msg back to other side */ msg_info.hdr.original_sc = ctsio->io_hdr.original_sc; msg_info.hdr.serializing_sc = (union ctl_io *)ctsio; msg_info.hdr.msg_type = CTL_MSG_R2R; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.hdr), M_WAITOK); } break; case CTL_ACTION_OVERLAP: TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); mtx_unlock(&lun->lun_lock); retval = 1; ctl_set_overlapped_cmd(ctsio); ctl_copy_sense_data_back((union ctl_io *)ctsio, &msg_info); msg_info.hdr.original_sc = ctsio->io_hdr.original_sc; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.scsi), M_WAITOK); break; case CTL_ACTION_OVERLAP_TAG: TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); mtx_unlock(&lun->lun_lock); retval = 1; ctl_set_overlapped_tag(ctsio, ctsio->tag_num); ctl_copy_sense_data_back((union ctl_io *)ctsio, &msg_info); msg_info.hdr.original_sc = ctsio->io_hdr.original_sc; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.scsi), M_WAITOK); break; case CTL_ACTION_ERROR: default: TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); mtx_unlock(&lun->lun_lock); retval = 1; ctl_set_internal_failure(ctsio, /*sks_valid*/ 0, /*retry_count*/ 0); ctl_copy_sense_data_back((union ctl_io *)ctsio, &msg_info); msg_info.hdr.original_sc = ctsio->io_hdr.original_sc; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.scsi), M_WAITOK); break; } return (retval); } /* * Returns 0 for success, errno for failure. */ static int ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num, struct ctl_ooa *ooa_hdr, struct ctl_ooa_entry *kern_entries) { union ctl_io *io; int retval; retval = 0; mtx_lock(&lun->lun_lock); for (io = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); (io != NULL); (*cur_fill_num)++, io = (union ctl_io *)TAILQ_NEXT(&io->io_hdr, ooa_links)) { struct ctl_ooa_entry *entry; /* * If we've got more than we can fit, just count the * remaining entries. */ if (*cur_fill_num >= ooa_hdr->alloc_num) continue; entry = &kern_entries[*cur_fill_num]; entry->tag_num = io->scsiio.tag_num; entry->lun_num = lun->lun; #ifdef CTL_TIME_IO entry->start_bt = io->io_hdr.start_bt; #endif bcopy(io->scsiio.cdb, entry->cdb, io->scsiio.cdb_len); entry->cdb_len = io->scsiio.cdb_len; if (io->io_hdr.flags & CTL_FLAG_BLOCKED) entry->cmd_flags |= CTL_OOACMD_FLAG_BLOCKED; if (io->io_hdr.flags & CTL_FLAG_DMA_INPROG) entry->cmd_flags |= CTL_OOACMD_FLAG_DMA; if (io->io_hdr.flags & CTL_FLAG_ABORT) entry->cmd_flags |= CTL_OOACMD_FLAG_ABORT; if (io->io_hdr.flags & CTL_FLAG_IS_WAS_ON_RTR) entry->cmd_flags |= CTL_OOACMD_FLAG_RTR; if (io->io_hdr.flags & CTL_FLAG_DMA_QUEUED) entry->cmd_flags |= CTL_OOACMD_FLAG_DMA_QUEUED; } mtx_unlock(&lun->lun_lock); return (retval); } static void * ctl_copyin_alloc(void *user_addr, int len, char *error_str, size_t error_str_len) { void *kptr; kptr = malloc(len, M_CTL, M_WAITOK | M_ZERO); if (copyin(user_addr, kptr, len) != 0) { snprintf(error_str, error_str_len, "Error copying %d bytes " "from user address %p to kernel address %p", len, user_addr, kptr); free(kptr, M_CTL); return (NULL); } return (kptr); } static void ctl_free_args(int num_args, struct ctl_be_arg *args) { int i; if (args == NULL) return; for (i = 0; i < num_args; i++) { free(args[i].kname, M_CTL); free(args[i].kvalue, M_CTL); } free(args, M_CTL); } static struct ctl_be_arg * ctl_copyin_args(int num_args, struct ctl_be_arg *uargs, char *error_str, size_t error_str_len) { struct ctl_be_arg *args; int i; args = ctl_copyin_alloc(uargs, num_args * sizeof(*args), error_str, error_str_len); if (args == NULL) goto bailout; for (i = 0; i < num_args; i++) { args[i].kname = NULL; args[i].kvalue = NULL; } for (i = 0; i < num_args; i++) { uint8_t *tmpptr; args[i].kname = ctl_copyin_alloc(args[i].name, args[i].namelen, error_str, error_str_len); if (args[i].kname == NULL) goto bailout; if (args[i].kname[args[i].namelen - 1] != '\0') { snprintf(error_str, error_str_len, "Argument %d " "name is not NUL-terminated", i); goto bailout; } if (args[i].flags & CTL_BEARG_RD) { tmpptr = ctl_copyin_alloc(args[i].value, args[i].vallen, error_str, error_str_len); if (tmpptr == NULL) goto bailout; if ((args[i].flags & CTL_BEARG_ASCII) && (tmpptr[args[i].vallen - 1] != '\0')) { snprintf(error_str, error_str_len, "Argument " "%d value is not NUL-terminated", i); goto bailout; } args[i].kvalue = tmpptr; } else { args[i].kvalue = malloc(args[i].vallen, M_CTL, M_WAITOK | M_ZERO); } } return (args); bailout: ctl_free_args(num_args, args); return (NULL); } static void ctl_copyout_args(int num_args, struct ctl_be_arg *args) { int i; for (i = 0; i < num_args; i++) { if (args[i].flags & CTL_BEARG_WR) copyout(args[i].kvalue, args[i].value, args[i].vallen); } } /* * Escape characters that are illegal or not recommended in XML. */ int ctl_sbuf_printf_esc(struct sbuf *sb, char *str, int size) { char *end = str + size; int retval; retval = 0; for (; *str && str < end; str++) { switch (*str) { case '&': retval = sbuf_printf(sb, "&"); break; case '>': retval = sbuf_printf(sb, ">"); break; case '<': retval = sbuf_printf(sb, "<"); break; default: retval = sbuf_putc(sb, *str); break; } if (retval != 0) break; } return (retval); } static void ctl_id_sbuf(struct ctl_devid *id, struct sbuf *sb) { struct scsi_vpd_id_descriptor *desc; int i; if (id == NULL || id->len < 4) return; desc = (struct scsi_vpd_id_descriptor *)id->data; switch (desc->id_type & SVPD_ID_TYPE_MASK) { case SVPD_ID_TYPE_T10: sbuf_printf(sb, "t10."); break; case SVPD_ID_TYPE_EUI64: sbuf_printf(sb, "eui."); break; case SVPD_ID_TYPE_NAA: sbuf_printf(sb, "naa."); break; case SVPD_ID_TYPE_SCSI_NAME: break; } switch (desc->proto_codeset & SVPD_ID_CODESET_MASK) { case SVPD_ID_CODESET_BINARY: for (i = 0; i < desc->length; i++) sbuf_printf(sb, "%02x", desc->identifier[i]); break; case SVPD_ID_CODESET_ASCII: sbuf_printf(sb, "%.*s", (int)desc->length, (char *)desc->identifier); break; case SVPD_ID_CODESET_UTF8: sbuf_printf(sb, "%s", (char *)desc->identifier); break; } } static int ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct ctl_softc *softc; int retval; softc = control_softc; retval = 0; switch (cmd) { case CTL_IO: retval = ctl_ioctl_io(dev, cmd, addr, flag, td); break; case CTL_ENABLE_PORT: case CTL_DISABLE_PORT: case CTL_SET_PORT_WWNS: { struct ctl_port *port; struct ctl_port_entry *entry; entry = (struct ctl_port_entry *)addr; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { int action, done; if (port->targ_port < softc->port_min || port->targ_port >= softc->port_max) continue; action = 0; done = 0; if ((entry->port_type == CTL_PORT_NONE) && (entry->targ_port == port->targ_port)) { /* * If the user only wants to enable or * disable or set WWNs on a specific port, * do the operation and we're done. */ action = 1; done = 1; } else if (entry->port_type & port->port_type) { /* * Compare the user's type mask with the * particular frontend type to see if we * have a match. */ action = 1; done = 0; /* * Make sure the user isn't trying to set * WWNs on multiple ports at the same time. */ if (cmd == CTL_SET_PORT_WWNS) { printf("%s: Can't set WWNs on " "multiple ports\n", __func__); retval = EINVAL; break; } } if (action == 0) continue; /* * XXX KDM we have to drop the lock here, because * the online/offline operations can potentially * block. We need to reference count the frontends * so they can't go away, */ if (cmd == CTL_ENABLE_PORT) { mtx_unlock(&softc->ctl_lock); ctl_port_online(port); mtx_lock(&softc->ctl_lock); } else if (cmd == CTL_DISABLE_PORT) { mtx_unlock(&softc->ctl_lock); ctl_port_offline(port); mtx_lock(&softc->ctl_lock); } else if (cmd == CTL_SET_PORT_WWNS) { ctl_port_set_wwns(port, (entry->flags & CTL_PORT_WWNN_VALID) ? 1 : 0, entry->wwnn, (entry->flags & CTL_PORT_WWPN_VALID) ? 1 : 0, entry->wwpn); } if (done != 0) break; } mtx_unlock(&softc->ctl_lock); break; } case CTL_GET_PORT_LIST: { struct ctl_port *port; struct ctl_port_list *list; int i; list = (struct ctl_port_list *)addr; if (list->alloc_len != (list->alloc_num * sizeof(struct ctl_port_entry))) { printf("%s: CTL_GET_PORT_LIST: alloc_len %u != " "alloc_num %u * sizeof(struct ctl_port_entry) " "%zu\n", __func__, list->alloc_len, list->alloc_num, sizeof(struct ctl_port_entry)); retval = EINVAL; break; } list->fill_len = 0; list->fill_num = 0; list->dropped_num = 0; i = 0; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { struct ctl_port_entry entry, *list_entry; if (list->fill_num >= list->alloc_num) { list->dropped_num++; continue; } entry.port_type = port->port_type; strlcpy(entry.port_name, port->port_name, sizeof(entry.port_name)); entry.targ_port = port->targ_port; entry.physical_port = port->physical_port; entry.virtual_port = port->virtual_port; entry.wwnn = port->wwnn; entry.wwpn = port->wwpn; if (port->status & CTL_PORT_STATUS_ONLINE) entry.online = 1; else entry.online = 0; list_entry = &list->entries[i]; retval = copyout(&entry, list_entry, sizeof(entry)); if (retval != 0) { printf("%s: CTL_GET_PORT_LIST: copyout " "returned %d\n", __func__, retval); break; } i++; list->fill_num++; list->fill_len += sizeof(entry); } mtx_unlock(&softc->ctl_lock); /* * If this is non-zero, we had a copyout fault, so there's * probably no point in attempting to set the status inside * the structure. */ if (retval != 0) break; if (list->dropped_num > 0) list->status = CTL_PORT_LIST_NEED_MORE_SPACE; else list->status = CTL_PORT_LIST_OK; break; } case CTL_DUMP_OOA: { struct ctl_lun *lun; union ctl_io *io; char printbuf[128]; struct sbuf sb; mtx_lock(&softc->ctl_lock); printf("Dumping OOA queues:\n"); STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_lock(&lun->lun_lock); for (io = (union ctl_io *)TAILQ_FIRST( &lun->ooa_queue); io != NULL; io = (union ctl_io *)TAILQ_NEXT(&io->io_hdr, ooa_links)) { sbuf_new(&sb, printbuf, sizeof(printbuf), SBUF_FIXEDLEN); sbuf_printf(&sb, "LUN %jd tag 0x%04x%s%s%s%s: ", (intmax_t)lun->lun, io->scsiio.tag_num, (io->io_hdr.flags & CTL_FLAG_BLOCKED) ? "" : " BLOCKED", (io->io_hdr.flags & CTL_FLAG_DMA_INPROG) ? " DMA" : "", (io->io_hdr.flags & CTL_FLAG_ABORT) ? " ABORT" : "", (io->io_hdr.flags & CTL_FLAG_IS_WAS_ON_RTR) ? " RTR" : ""); ctl_scsi_command_string(&io->scsiio, NULL, &sb); sbuf_finish(&sb); printf("%s\n", sbuf_data(&sb)); } mtx_unlock(&lun->lun_lock); } printf("OOA queues dump done\n"); mtx_unlock(&softc->ctl_lock); break; } case CTL_GET_OOA: { struct ctl_lun *lun; struct ctl_ooa *ooa_hdr; struct ctl_ooa_entry *entries; uint32_t cur_fill_num; ooa_hdr = (struct ctl_ooa *)addr; if ((ooa_hdr->alloc_len == 0) || (ooa_hdr->alloc_num == 0)) { printf("%s: CTL_GET_OOA: alloc len %u and alloc num %u " "must be non-zero\n", __func__, ooa_hdr->alloc_len, ooa_hdr->alloc_num); retval = EINVAL; break; } if (ooa_hdr->alloc_len != (ooa_hdr->alloc_num * sizeof(struct ctl_ooa_entry))) { printf("%s: CTL_GET_OOA: alloc len %u must be alloc " "num %d * sizeof(struct ctl_ooa_entry) %zd\n", __func__, ooa_hdr->alloc_len, ooa_hdr->alloc_num,sizeof(struct ctl_ooa_entry)); retval = EINVAL; break; } entries = malloc(ooa_hdr->alloc_len, M_CTL, M_WAITOK | M_ZERO); if (entries == NULL) { printf("%s: could not allocate %d bytes for OOA " "dump\n", __func__, ooa_hdr->alloc_len); retval = ENOMEM; break; } mtx_lock(&softc->ctl_lock); if (((ooa_hdr->flags & CTL_OOA_FLAG_ALL_LUNS) == 0) && ((ooa_hdr->lun_num >= CTL_MAX_LUNS) || (softc->ctl_luns[ooa_hdr->lun_num] == NULL))) { mtx_unlock(&softc->ctl_lock); free(entries, M_CTL); printf("%s: CTL_GET_OOA: invalid LUN %ju\n", __func__, (uintmax_t)ooa_hdr->lun_num); retval = EINVAL; break; } cur_fill_num = 0; if (ooa_hdr->flags & CTL_OOA_FLAG_ALL_LUNS) { STAILQ_FOREACH(lun, &softc->lun_list, links) { retval = ctl_ioctl_fill_ooa(lun, &cur_fill_num, ooa_hdr, entries); if (retval != 0) break; } if (retval != 0) { mtx_unlock(&softc->ctl_lock); free(entries, M_CTL); break; } } else { lun = softc->ctl_luns[ooa_hdr->lun_num]; retval = ctl_ioctl_fill_ooa(lun, &cur_fill_num,ooa_hdr, entries); } mtx_unlock(&softc->ctl_lock); ooa_hdr->fill_num = min(cur_fill_num, ooa_hdr->alloc_num); ooa_hdr->fill_len = ooa_hdr->fill_num * sizeof(struct ctl_ooa_entry); retval = copyout(entries, ooa_hdr->entries, ooa_hdr->fill_len); if (retval != 0) { printf("%s: error copying out %d bytes for OOA dump\n", __func__, ooa_hdr->fill_len); } getbintime(&ooa_hdr->cur_bt); if (cur_fill_num > ooa_hdr->alloc_num) { ooa_hdr->dropped_num = cur_fill_num -ooa_hdr->alloc_num; ooa_hdr->status = CTL_OOA_NEED_MORE_SPACE; } else { ooa_hdr->dropped_num = 0; ooa_hdr->status = CTL_OOA_OK; } free(entries, M_CTL); break; } case CTL_CHECK_OOA: { union ctl_io *io; struct ctl_lun *lun; struct ctl_ooa_info *ooa_info; ooa_info = (struct ctl_ooa_info *)addr; if (ooa_info->lun_id >= CTL_MAX_LUNS) { ooa_info->status = CTL_OOA_INVALID_LUN; break; } mtx_lock(&softc->ctl_lock); lun = softc->ctl_luns[ooa_info->lun_id]; if (lun == NULL) { mtx_unlock(&softc->ctl_lock); ooa_info->status = CTL_OOA_INVALID_LUN; break; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); ooa_info->num_entries = 0; for (io = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); io != NULL; io = (union ctl_io *)TAILQ_NEXT( &io->io_hdr, ooa_links)) { ooa_info->num_entries++; } mtx_unlock(&lun->lun_lock); ooa_info->status = CTL_OOA_SUCCESS; break; } case CTL_DELAY_IO: { struct ctl_io_delay_info *delay_info; #ifdef CTL_IO_DELAY struct ctl_lun *lun; #endif /* CTL_IO_DELAY */ delay_info = (struct ctl_io_delay_info *)addr; #ifdef CTL_IO_DELAY mtx_lock(&softc->ctl_lock); if ((delay_info->lun_id >= CTL_MAX_LUNS) || (softc->ctl_luns[delay_info->lun_id] == NULL)) { delay_info->status = CTL_DELAY_STATUS_INVALID_LUN; } else { lun = softc->ctl_luns[delay_info->lun_id]; mtx_lock(&lun->lun_lock); delay_info->status = CTL_DELAY_STATUS_OK; switch (delay_info->delay_type) { case CTL_DELAY_TYPE_CONT: break; case CTL_DELAY_TYPE_ONESHOT: break; default: delay_info->status = CTL_DELAY_STATUS_INVALID_TYPE; break; } switch (delay_info->delay_loc) { case CTL_DELAY_LOC_DATAMOVE: lun->delay_info.datamove_type = delay_info->delay_type; lun->delay_info.datamove_delay = delay_info->delay_secs; break; case CTL_DELAY_LOC_DONE: lun->delay_info.done_type = delay_info->delay_type; lun->delay_info.done_delay = delay_info->delay_secs; break; default: delay_info->status = CTL_DELAY_STATUS_INVALID_LOC; break; } mtx_unlock(&lun->lun_lock); } mtx_unlock(&softc->ctl_lock); #else delay_info->status = CTL_DELAY_STATUS_NOT_IMPLEMENTED; #endif /* CTL_IO_DELAY */ break; } case CTL_REALSYNC_SET: { int *syncstate; syncstate = (int *)addr; mtx_lock(&softc->ctl_lock); switch (*syncstate) { case 0: softc->flags &= ~CTL_FLAG_REAL_SYNC; break; case 1: softc->flags |= CTL_FLAG_REAL_SYNC; break; default: retval = EINVAL; break; } mtx_unlock(&softc->ctl_lock); break; } case CTL_REALSYNC_GET: { int *syncstate; syncstate = (int*)addr; mtx_lock(&softc->ctl_lock); if (softc->flags & CTL_FLAG_REAL_SYNC) *syncstate = 1; else *syncstate = 0; mtx_unlock(&softc->ctl_lock); break; } case CTL_SETSYNC: case CTL_GETSYNC: { struct ctl_sync_info *sync_info; struct ctl_lun *lun; sync_info = (struct ctl_sync_info *)addr; mtx_lock(&softc->ctl_lock); lun = softc->ctl_luns[sync_info->lun_id]; if (lun == NULL) { mtx_unlock(&softc->ctl_lock); sync_info->status = CTL_GS_SYNC_NO_LUN; + break; } /* * Get or set the sync interval. We're not bounds checking * in the set case, hopefully the user won't do something * silly. */ mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if (cmd == CTL_GETSYNC) sync_info->sync_interval = lun->sync_interval; else lun->sync_interval = sync_info->sync_interval; mtx_unlock(&lun->lun_lock); sync_info->status = CTL_GS_SYNC_OK; break; } case CTL_GETSTATS: { struct ctl_stats *stats; struct ctl_lun *lun; int i; stats = (struct ctl_stats *)addr; if ((sizeof(struct ctl_lun_io_stats) * softc->num_luns) > stats->alloc_len) { stats->status = CTL_SS_NEED_MORE_SPACE; stats->num_luns = softc->num_luns; break; } /* * XXX KDM no locking here. If the LUN list changes, * things can blow up. */ for (i = 0, lun = STAILQ_FIRST(&softc->lun_list); lun != NULL; i++, lun = STAILQ_NEXT(lun, links)) { retval = copyout(&lun->stats, &stats->lun_stats[i], sizeof(lun->stats)); if (retval != 0) break; } stats->num_luns = softc->num_luns; stats->fill_len = sizeof(struct ctl_lun_io_stats) * softc->num_luns; stats->status = CTL_SS_OK; #ifdef CTL_TIME_IO stats->flags = CTL_STATS_FLAG_TIME_VALID; #else stats->flags = CTL_STATS_FLAG_NONE; #endif getnanouptime(&stats->timestamp); break; } case CTL_ERROR_INJECT: { struct ctl_error_desc *err_desc, *new_err_desc; struct ctl_lun *lun; err_desc = (struct ctl_error_desc *)addr; new_err_desc = malloc(sizeof(*new_err_desc), M_CTL, M_WAITOK | M_ZERO); bcopy(err_desc, new_err_desc, sizeof(*new_err_desc)); mtx_lock(&softc->ctl_lock); lun = softc->ctl_luns[err_desc->lun_id]; if (lun == NULL) { mtx_unlock(&softc->ctl_lock); free(new_err_desc, M_CTL); printf("%s: CTL_ERROR_INJECT: invalid LUN %ju\n", __func__, (uintmax_t)err_desc->lun_id); retval = EINVAL; break; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); /* * We could do some checking here to verify the validity * of the request, but given the complexity of error * injection requests, the checking logic would be fairly * complex. * * For now, if the request is invalid, it just won't get * executed and might get deleted. */ STAILQ_INSERT_TAIL(&lun->error_list, new_err_desc, links); /* * XXX KDM check to make sure the serial number is unique, * in case we somehow manage to wrap. That shouldn't * happen for a very long time, but it's the right thing to * do. */ new_err_desc->serial = lun->error_serial; err_desc->serial = lun->error_serial; lun->error_serial++; mtx_unlock(&lun->lun_lock); break; } case CTL_ERROR_INJECT_DELETE: { struct ctl_error_desc *delete_desc, *desc, *desc2; struct ctl_lun *lun; int delete_done; delete_desc = (struct ctl_error_desc *)addr; delete_done = 0; mtx_lock(&softc->ctl_lock); lun = softc->ctl_luns[delete_desc->lun_id]; if (lun == NULL) { mtx_unlock(&softc->ctl_lock); printf("%s: CTL_ERROR_INJECT_DELETE: invalid LUN %ju\n", __func__, (uintmax_t)delete_desc->lun_id); retval = EINVAL; break; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); STAILQ_FOREACH_SAFE(desc, &lun->error_list, links, desc2) { if (desc->serial != delete_desc->serial) continue; STAILQ_REMOVE(&lun->error_list, desc, ctl_error_desc, links); free(desc, M_CTL); delete_done = 1; } mtx_unlock(&lun->lun_lock); if (delete_done == 0) { printf("%s: CTL_ERROR_INJECT_DELETE: can't find " "error serial %ju on LUN %u\n", __func__, delete_desc->serial, delete_desc->lun_id); retval = EINVAL; break; } break; } case CTL_DUMP_STRUCTS: { int i, j, k; struct ctl_port *port; struct ctl_frontend *fe; mtx_lock(&softc->ctl_lock); printf("CTL Persistent Reservation information start:\n"); for (i = 0; i < CTL_MAX_LUNS; i++) { struct ctl_lun *lun; lun = softc->ctl_luns[i]; if ((lun == NULL) || ((lun->flags & CTL_LUN_DISABLED) != 0)) continue; for (j = 0; j < CTL_MAX_PORTS; j++) { if (lun->pr_keys[j] == NULL) continue; for (k = 0; k < CTL_MAX_INIT_PER_PORT; k++){ if (lun->pr_keys[j][k] == 0) continue; printf(" LUN %d port %d iid %d key " "%#jx\n", i, j, k, (uintmax_t)lun->pr_keys[j][k]); } } } printf("CTL Persistent Reservation information end\n"); printf("CTL Ports:\n"); STAILQ_FOREACH(port, &softc->port_list, links) { printf(" Port %d '%s' Frontend '%s' Type %u pp %d vp %d WWNN " "%#jx WWPN %#jx\n", port->targ_port, port->port_name, port->frontend->name, port->port_type, port->physical_port, port->virtual_port, (uintmax_t)port->wwnn, (uintmax_t)port->wwpn); for (j = 0; j < CTL_MAX_INIT_PER_PORT; j++) { if (port->wwpn_iid[j].in_use == 0 && port->wwpn_iid[j].wwpn == 0 && port->wwpn_iid[j].name == NULL) continue; printf(" iid %u use %d WWPN %#jx '%s'\n", j, port->wwpn_iid[j].in_use, (uintmax_t)port->wwpn_iid[j].wwpn, port->wwpn_iid[j].name); } } printf("CTL Port information end\n"); mtx_unlock(&softc->ctl_lock); /* * XXX KDM calling this without a lock. We'd likely want * to drop the lock before calling the frontend's dump * routine anyway. */ printf("CTL Frontends:\n"); STAILQ_FOREACH(fe, &softc->fe_list, links) { printf(" Frontend '%s'\n", fe->name); if (fe->fe_dump != NULL) fe->fe_dump(); } printf("CTL Frontend information end\n"); break; } case CTL_LUN_REQ: { struct ctl_lun_req *lun_req; struct ctl_backend_driver *backend; lun_req = (struct ctl_lun_req *)addr; backend = ctl_backend_find(lun_req->backend); if (backend == NULL) { lun_req->status = CTL_LUN_ERROR; snprintf(lun_req->error_str, sizeof(lun_req->error_str), "Backend \"%s\" not found.", lun_req->backend); break; } if (lun_req->num_be_args > 0) { lun_req->kern_be_args = ctl_copyin_args( lun_req->num_be_args, lun_req->be_args, lun_req->error_str, sizeof(lun_req->error_str)); if (lun_req->kern_be_args == NULL) { lun_req->status = CTL_LUN_ERROR; break; } } retval = backend->ioctl(dev, cmd, addr, flag, td); if (lun_req->num_be_args > 0) { ctl_copyout_args(lun_req->num_be_args, lun_req->kern_be_args); ctl_free_args(lun_req->num_be_args, lun_req->kern_be_args); } break; } case CTL_LUN_LIST: { struct sbuf *sb; struct ctl_lun *lun; struct ctl_lun_list *list; struct ctl_option *opt; list = (struct ctl_lun_list *)addr; /* * Allocate a fixed length sbuf here, based on the length * of the user's buffer. We could allocate an auto-extending * buffer, and then tell the user how much larger our * amount of data is than his buffer, but that presents * some problems: * * 1. The sbuf(9) routines use a blocking malloc, and so * we can't hold a lock while calling them with an * auto-extending buffer. * * 2. There is not currently a LUN reference counting * mechanism, outside of outstanding transactions on * the LUN's OOA queue. So a LUN could go away on us * while we're getting the LUN number, backend-specific * information, etc. Thus, given the way things * currently work, we need to hold the CTL lock while * grabbing LUN information. * * So, from the user's standpoint, the best thing to do is * allocate what he thinks is a reasonable buffer length, * and then if he gets a CTL_LUN_LIST_NEED_MORE_SPACE error, * double the buffer length and try again. (And repeat * that until he succeeds.) */ sb = sbuf_new(NULL, NULL, list->alloc_len, SBUF_FIXEDLEN); if (sb == NULL) { list->status = CTL_LUN_LIST_ERROR; snprintf(list->error_str, sizeof(list->error_str), "Unable to allocate %d bytes for LUN list", list->alloc_len); break; } sbuf_printf(sb, "\n"); mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_lock(&lun->lun_lock); retval = sbuf_printf(sb, "\n", (uintmax_t)lun->lun); /* * Bail out as soon as we see that we've overfilled * the buffer. */ if (retval != 0) break; retval = sbuf_printf(sb, "\t%s" "\n", (lun->backend == NULL) ? "none" : lun->backend->name); if (retval != 0) break; retval = sbuf_printf(sb, "\t%d\n", lun->be_lun->lun_type); if (retval != 0) break; if (lun->backend == NULL) { retval = sbuf_printf(sb, "\n"); if (retval != 0) break; continue; } retval = sbuf_printf(sb, "\t%ju\n", (lun->be_lun->maxlba > 0) ? lun->be_lun->maxlba + 1 : 0); if (retval != 0) break; retval = sbuf_printf(sb, "\t%u\n", lun->be_lun->blocksize); if (retval != 0) break; retval = sbuf_printf(sb, "\t"); if (retval != 0) break; retval = ctl_sbuf_printf_esc(sb, lun->be_lun->serial_num, sizeof(lun->be_lun->serial_num)); if (retval != 0) break; retval = sbuf_printf(sb, "\n"); if (retval != 0) break; retval = sbuf_printf(sb, "\t"); if (retval != 0) break; retval = ctl_sbuf_printf_esc(sb, lun->be_lun->device_id, sizeof(lun->be_lun->device_id)); if (retval != 0) break; retval = sbuf_printf(sb, "\n"); if (retval != 0) break; if (lun->backend->lun_info != NULL) { retval = lun->backend->lun_info(lun->be_lun->be_lun, sb); if (retval != 0) break; } STAILQ_FOREACH(opt, &lun->be_lun->options, links) { retval = sbuf_printf(sb, "\t<%s>%s\n", opt->name, opt->value, opt->name); if (retval != 0) break; } retval = sbuf_printf(sb, "\n"); if (retval != 0) break; mtx_unlock(&lun->lun_lock); } if (lun != NULL) mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if ((retval != 0) || ((retval = sbuf_printf(sb, "\n")) != 0)) { retval = 0; sbuf_delete(sb); list->status = CTL_LUN_LIST_NEED_MORE_SPACE; snprintf(list->error_str, sizeof(list->error_str), "Out of space, %d bytes is too small", list->alloc_len); break; } sbuf_finish(sb); retval = copyout(sbuf_data(sb), list->lun_xml, sbuf_len(sb) + 1); list->fill_len = sbuf_len(sb) + 1; list->status = CTL_LUN_LIST_OK; sbuf_delete(sb); break; } case CTL_ISCSI: { struct ctl_iscsi *ci; struct ctl_frontend *fe; ci = (struct ctl_iscsi *)addr; fe = ctl_frontend_find("iscsi"); if (fe == NULL) { ci->status = CTL_ISCSI_ERROR; snprintf(ci->error_str, sizeof(ci->error_str), "Frontend \"iscsi\" not found."); break; } retval = fe->ioctl(dev, cmd, addr, flag, td); break; } case CTL_PORT_REQ: { struct ctl_req *req; struct ctl_frontend *fe; req = (struct ctl_req *)addr; fe = ctl_frontend_find(req->driver); if (fe == NULL) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Frontend \"%s\" not found.", req->driver); break; } if (req->num_args > 0) { req->kern_args = ctl_copyin_args(req->num_args, req->args, req->error_str, sizeof(req->error_str)); if (req->kern_args == NULL) { req->status = CTL_LUN_ERROR; break; } } if (fe->ioctl) retval = fe->ioctl(dev, cmd, addr, flag, td); else retval = ENODEV; if (req->num_args > 0) { ctl_copyout_args(req->num_args, req->kern_args); ctl_free_args(req->num_args, req->kern_args); } break; } case CTL_PORT_LIST: { struct sbuf *sb; struct ctl_port *port; struct ctl_lun_list *list; struct ctl_option *opt; int j; uint32_t plun; list = (struct ctl_lun_list *)addr; sb = sbuf_new(NULL, NULL, list->alloc_len, SBUF_FIXEDLEN); if (sb == NULL) { list->status = CTL_LUN_LIST_ERROR; snprintf(list->error_str, sizeof(list->error_str), "Unable to allocate %d bytes for LUN list", list->alloc_len); break; } sbuf_printf(sb, "\n"); mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { retval = sbuf_printf(sb, "\n", (uintmax_t)port->targ_port); /* * Bail out as soon as we see that we've overfilled * the buffer. */ if (retval != 0) break; retval = sbuf_printf(sb, "\t%s" "\n", port->frontend->name); if (retval != 0) break; retval = sbuf_printf(sb, "\t%d\n", port->port_type); if (retval != 0) break; retval = sbuf_printf(sb, "\t%s\n", (port->status & CTL_PORT_STATUS_ONLINE) ? "YES" : "NO"); if (retval != 0) break; retval = sbuf_printf(sb, "\t%s\n", port->port_name); if (retval != 0) break; retval = sbuf_printf(sb, "\t%d\n", port->physical_port); if (retval != 0) break; retval = sbuf_printf(sb, "\t%d\n", port->virtual_port); if (retval != 0) break; if (port->target_devid != NULL) { sbuf_printf(sb, "\t"); ctl_id_sbuf(port->target_devid, sb); sbuf_printf(sb, "\n"); } if (port->port_devid != NULL) { sbuf_printf(sb, "\t"); ctl_id_sbuf(port->port_devid, sb); sbuf_printf(sb, "\n"); } if (port->port_info != NULL) { retval = port->port_info(port->onoff_arg, sb); if (retval != 0) break; } STAILQ_FOREACH(opt, &port->options, links) { retval = sbuf_printf(sb, "\t<%s>%s\n", opt->name, opt->value, opt->name); if (retval != 0) break; } if (port->lun_map != NULL) { sbuf_printf(sb, "\ton\n"); for (j = 0; j < CTL_MAX_LUNS; j++) { plun = ctl_lun_map_from_port(port, j); if (plun >= CTL_MAX_LUNS) continue; sbuf_printf(sb, "\t%u\n", j, plun); } } for (j = 0; j < CTL_MAX_INIT_PER_PORT; j++) { if (port->wwpn_iid[j].in_use == 0 || (port->wwpn_iid[j].wwpn == 0 && port->wwpn_iid[j].name == NULL)) continue; if (port->wwpn_iid[j].name != NULL) retval = sbuf_printf(sb, "\t%s\n", j, port->wwpn_iid[j].name); else retval = sbuf_printf(sb, "\tnaa.%08jx\n", j, port->wwpn_iid[j].wwpn); if (retval != 0) break; } if (retval != 0) break; retval = sbuf_printf(sb, "\n"); if (retval != 0) break; } mtx_unlock(&softc->ctl_lock); if ((retval != 0) || ((retval = sbuf_printf(sb, "\n")) != 0)) { retval = 0; sbuf_delete(sb); list->status = CTL_LUN_LIST_NEED_MORE_SPACE; snprintf(list->error_str, sizeof(list->error_str), "Out of space, %d bytes is too small", list->alloc_len); break; } sbuf_finish(sb); retval = copyout(sbuf_data(sb), list->lun_xml, sbuf_len(sb) + 1); list->fill_len = sbuf_len(sb) + 1; list->status = CTL_LUN_LIST_OK; sbuf_delete(sb); break; } case CTL_LUN_MAP: { struct ctl_lun_map *lm = (struct ctl_lun_map *)addr; struct ctl_port *port; mtx_lock(&softc->ctl_lock); if (lm->port < softc->port_min || lm->port >= softc->port_max || (port = softc->ctl_ports[lm->port]) == NULL) { mtx_unlock(&softc->ctl_lock); return (ENXIO); } mtx_unlock(&softc->ctl_lock); // XXX: port_enable sleeps if (lm->plun < CTL_MAX_LUNS) { if (lm->lun == UINT32_MAX) retval = ctl_lun_map_unset(port, lm->plun); else if (lm->lun < CTL_MAX_LUNS && softc->ctl_luns[lm->lun] != NULL) retval = ctl_lun_map_set(port, lm->plun, lm->lun); else return (ENXIO); } else if (lm->plun == UINT32_MAX) { if (lm->lun == UINT32_MAX) retval = ctl_lun_map_deinit(port); else retval = ctl_lun_map_init(port); } else return (ENXIO); break; } default: { /* XXX KDM should we fix this? */ #if 0 struct ctl_backend_driver *backend; unsigned int type; int found; found = 0; /* * We encode the backend type as the ioctl type for backend * ioctls. So parse it out here, and then search for a * backend of this type. */ type = _IOC_TYPE(cmd); STAILQ_FOREACH(backend, &softc->be_list, links) { if (backend->type == type) { found = 1; break; } } if (found == 0) { printf("ctl: unknown ioctl command %#lx or backend " "%d\n", cmd, type); retval = EINVAL; break; } retval = backend->ioctl(dev, cmd, addr, flag, td); #endif retval = ENOTTY; break; } } return (retval); } uint32_t ctl_get_initindex(struct ctl_nexus *nexus) { return (nexus->initid + (nexus->targ_port * CTL_MAX_INIT_PER_PORT)); } int ctl_lun_map_init(struct ctl_port *port) { struct ctl_softc *softc = control_softc; struct ctl_lun *lun; uint32_t i; if (port->lun_map == NULL) port->lun_map = malloc(sizeof(uint32_t) * CTL_MAX_LUNS, M_CTL, M_NOWAIT); if (port->lun_map == NULL) return (ENOMEM); for (i = 0; i < CTL_MAX_LUNS; i++) port->lun_map[i] = UINT32_MAX; if (port->status & CTL_PORT_STATUS_ONLINE) { if (port->lun_disable != NULL) { STAILQ_FOREACH(lun, &softc->lun_list, links) port->lun_disable(port->targ_lun_arg, lun->lun); } ctl_isc_announce_port(port); } return (0); } int ctl_lun_map_deinit(struct ctl_port *port) { struct ctl_softc *softc = control_softc; struct ctl_lun *lun; if (port->lun_map == NULL) return (0); free(port->lun_map, M_CTL); port->lun_map = NULL; if (port->status & CTL_PORT_STATUS_ONLINE) { if (port->lun_enable != NULL) { STAILQ_FOREACH(lun, &softc->lun_list, links) port->lun_enable(port->targ_lun_arg, lun->lun); } ctl_isc_announce_port(port); } return (0); } int ctl_lun_map_set(struct ctl_port *port, uint32_t plun, uint32_t glun) { int status; uint32_t old; if (port->lun_map == NULL) { status = ctl_lun_map_init(port); if (status != 0) return (status); } old = port->lun_map[plun]; port->lun_map[plun] = glun; if ((port->status & CTL_PORT_STATUS_ONLINE) && old >= CTL_MAX_LUNS) { if (port->lun_enable != NULL) port->lun_enable(port->targ_lun_arg, plun); ctl_isc_announce_port(port); } return (0); } int ctl_lun_map_unset(struct ctl_port *port, uint32_t plun) { uint32_t old; if (port->lun_map == NULL) return (0); old = port->lun_map[plun]; port->lun_map[plun] = UINT32_MAX; if ((port->status & CTL_PORT_STATUS_ONLINE) && old < CTL_MAX_LUNS) { if (port->lun_disable != NULL) port->lun_disable(port->targ_lun_arg, plun); ctl_isc_announce_port(port); } return (0); } uint32_t ctl_lun_map_from_port(struct ctl_port *port, uint32_t lun_id) { if (port == NULL) return (UINT32_MAX); if (port->lun_map == NULL || lun_id >= CTL_MAX_LUNS) return (lun_id); return (port->lun_map[lun_id]); } uint32_t ctl_lun_map_to_port(struct ctl_port *port, uint32_t lun_id) { uint32_t i; if (port == NULL) return (UINT32_MAX); if (port->lun_map == NULL) return (lun_id); for (i = 0; i < CTL_MAX_LUNS; i++) { if (port->lun_map[i] == lun_id) return (i); } return (UINT32_MAX); } static struct ctl_port * ctl_io_port(struct ctl_io_hdr *io_hdr) { return (control_softc->ctl_ports[io_hdr->nexus.targ_port]); } int ctl_ffz(uint32_t *mask, uint32_t first, uint32_t last) { int i; for (i = first; i < last; i++) { if ((mask[i / 32] & (1 << (i % 32))) == 0) return (i); } return (-1); } int ctl_set_mask(uint32_t *mask, uint32_t bit) { uint32_t chunk, piece; chunk = bit >> 5; piece = bit % (sizeof(uint32_t) * 8); if ((mask[chunk] & (1 << piece)) != 0) return (-1); else mask[chunk] |= (1 << piece); return (0); } int ctl_clear_mask(uint32_t *mask, uint32_t bit) { uint32_t chunk, piece; chunk = bit >> 5; piece = bit % (sizeof(uint32_t) * 8); if ((mask[chunk] & (1 << piece)) == 0) return (-1); else mask[chunk] &= ~(1 << piece); return (0); } int ctl_is_set(uint32_t *mask, uint32_t bit) { uint32_t chunk, piece; chunk = bit >> 5; piece = bit % (sizeof(uint32_t) * 8); if ((mask[chunk] & (1 << piece)) == 0) return (0); else return (1); } static uint64_t ctl_get_prkey(struct ctl_lun *lun, uint32_t residx) { uint64_t *t; t = lun->pr_keys[residx/CTL_MAX_INIT_PER_PORT]; if (t == NULL) return (0); return (t[residx % CTL_MAX_INIT_PER_PORT]); } static void ctl_clr_prkey(struct ctl_lun *lun, uint32_t residx) { uint64_t *t; t = lun->pr_keys[residx/CTL_MAX_INIT_PER_PORT]; if (t == NULL) return; t[residx % CTL_MAX_INIT_PER_PORT] = 0; } static void ctl_alloc_prkey(struct ctl_lun *lun, uint32_t residx) { uint64_t *p; u_int i; i = residx/CTL_MAX_INIT_PER_PORT; if (lun->pr_keys[i] != NULL) return; mtx_unlock(&lun->lun_lock); p = malloc(sizeof(uint64_t) * CTL_MAX_INIT_PER_PORT, M_CTL, M_WAITOK | M_ZERO); mtx_lock(&lun->lun_lock); if (lun->pr_keys[i] == NULL) lun->pr_keys[i] = p; else free(p, M_CTL); } static void ctl_set_prkey(struct ctl_lun *lun, uint32_t residx, uint64_t key) { uint64_t *t; t = lun->pr_keys[residx/CTL_MAX_INIT_PER_PORT]; KASSERT(t != NULL, ("prkey %d is not allocated", residx)); t[residx % CTL_MAX_INIT_PER_PORT] = key; } /* * ctl_softc, pool_name, total_ctl_io are passed in. * npool is passed out. */ int ctl_pool_create(struct ctl_softc *ctl_softc, const char *pool_name, uint32_t total_ctl_io, void **npool) { #ifdef IO_POOLS struct ctl_io_pool *pool; pool = (struct ctl_io_pool *)malloc(sizeof(*pool), M_CTL, M_NOWAIT | M_ZERO); if (pool == NULL) return (ENOMEM); snprintf(pool->name, sizeof(pool->name), "CTL IO %s", pool_name); pool->ctl_softc = ctl_softc; pool->zone = uma_zsecond_create(pool->name, NULL, NULL, NULL, NULL, ctl_softc->io_zone); /* uma_prealloc(pool->zone, total_ctl_io); */ *npool = pool; #else *npool = ctl_softc->io_zone; #endif return (0); } void ctl_pool_free(struct ctl_io_pool *pool) { if (pool == NULL) return; #ifdef IO_POOLS uma_zdestroy(pool->zone); free(pool, M_CTL); #endif } union ctl_io * ctl_alloc_io(void *pool_ref) { union ctl_io *io; #ifdef IO_POOLS struct ctl_io_pool *pool = (struct ctl_io_pool *)pool_ref; io = uma_zalloc(pool->zone, M_WAITOK); #else io = uma_zalloc((uma_zone_t)pool_ref, M_WAITOK); #endif if (io != NULL) io->io_hdr.pool = pool_ref; return (io); } union ctl_io * ctl_alloc_io_nowait(void *pool_ref) { union ctl_io *io; #ifdef IO_POOLS struct ctl_io_pool *pool = (struct ctl_io_pool *)pool_ref; io = uma_zalloc(pool->zone, M_NOWAIT); #else io = uma_zalloc((uma_zone_t)pool_ref, M_NOWAIT); #endif if (io != NULL) io->io_hdr.pool = pool_ref; return (io); } void ctl_free_io(union ctl_io *io) { #ifdef IO_POOLS struct ctl_io_pool *pool; #endif if (io == NULL) return; #ifdef IO_POOLS pool = (struct ctl_io_pool *)io->io_hdr.pool; uma_zfree(pool->zone, io); #else uma_zfree((uma_zone_t)io->io_hdr.pool, io); #endif } void ctl_zero_io(union ctl_io *io) { void *pool_ref; if (io == NULL) return; /* * May need to preserve linked list pointers at some point too. */ pool_ref = io->io_hdr.pool; memset(io, 0, sizeof(*io)); io->io_hdr.pool = pool_ref; } /* * This routine is currently used for internal copies of ctl_ios that need * to persist for some reason after we've already returned status to the * FETD. (Thus the flag set.) * * XXX XXX * Note that this makes a blind copy of all fields in the ctl_io, except * for the pool reference. This includes any memory that has been * allocated! That memory will no longer be valid after done has been * called, so this would be VERY DANGEROUS for command that actually does * any reads or writes. Right now (11/7/2005), this is only used for immediate * start and stop commands, which don't transfer any data, so this is not a * problem. If it is used for anything else, the caller would also need to * allocate data buffer space and this routine would need to be modified to * copy the data buffer(s) as well. */ void ctl_copy_io(union ctl_io *src, union ctl_io *dest) { void *pool_ref; if ((src == NULL) || (dest == NULL)) return; /* * May need to preserve linked list pointers at some point too. */ pool_ref = dest->io_hdr.pool; memcpy(dest, src, MIN(sizeof(*src), sizeof(*dest))); dest->io_hdr.pool = pool_ref; /* * We need to know that this is an internal copy, and doesn't need * to get passed back to the FETD that allocated it. */ dest->io_hdr.flags |= CTL_FLAG_INT_COPY; } int ctl_expand_number(const char *buf, uint64_t *num) { char *endptr; uint64_t number; unsigned shift; number = strtoq(buf, &endptr, 0); switch (tolower((unsigned char)*endptr)) { case 'e': shift = 60; break; case 'p': shift = 50; break; case 't': shift = 40; break; case 'g': shift = 30; break; case 'm': shift = 20; break; case 'k': shift = 10; break; case 'b': case '\0': /* No unit. */ *num = number; return (0); default: /* Unrecognized unit. */ return (-1); } if ((number << shift) >> shift != number) { /* Overflow */ return (-1); } *num = number << shift; return (0); } /* * This routine could be used in the future to load default and/or saved * mode page parameters for a particuar lun. */ static int ctl_init_page_index(struct ctl_lun *lun) { int i; struct ctl_page_index *page_index; const char *value; uint64_t ival; memcpy(&lun->mode_pages.index, page_index_template, sizeof(page_index_template)); for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { page_index = &lun->mode_pages.index[i]; /* * If this is a disk-only mode page, there's no point in * setting it up. For some pages, we have to have some * basic information about the disk in order to calculate the * mode page data. */ if ((lun->be_lun->lun_type != T_DIRECT) && (page_index->page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; switch (page_index->page_code & SMPH_PC_MASK) { case SMS_RW_ERROR_RECOVERY_PAGE: { if (page_index->subpage != SMS_SUBPAGE_PAGE_0) panic("subpage is incorrect!"); memcpy(&lun->mode_pages.rw_er_page[CTL_PAGE_CURRENT], &rw_er_page_default, sizeof(rw_er_page_default)); memcpy(&lun->mode_pages.rw_er_page[CTL_PAGE_CHANGEABLE], &rw_er_page_changeable, sizeof(rw_er_page_changeable)); memcpy(&lun->mode_pages.rw_er_page[CTL_PAGE_DEFAULT], &rw_er_page_default, sizeof(rw_er_page_default)); memcpy(&lun->mode_pages.rw_er_page[CTL_PAGE_SAVED], &rw_er_page_default, sizeof(rw_er_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.rw_er_page; break; } case SMS_FORMAT_DEVICE_PAGE: { struct scsi_format_page *format_page; if (page_index->subpage != SMS_SUBPAGE_PAGE_0) panic("subpage is incorrect!"); /* * Sectors per track are set above. Bytes per * sector need to be set here on a per-LUN basis. */ memcpy(&lun->mode_pages.format_page[CTL_PAGE_CURRENT], &format_page_default, sizeof(format_page_default)); memcpy(&lun->mode_pages.format_page[ CTL_PAGE_CHANGEABLE], &format_page_changeable, sizeof(format_page_changeable)); memcpy(&lun->mode_pages.format_page[CTL_PAGE_DEFAULT], &format_page_default, sizeof(format_page_default)); memcpy(&lun->mode_pages.format_page[CTL_PAGE_SAVED], &format_page_default, sizeof(format_page_default)); format_page = &lun->mode_pages.format_page[ CTL_PAGE_CURRENT]; scsi_ulto2b(lun->be_lun->blocksize, format_page->bytes_per_sector); format_page = &lun->mode_pages.format_page[ CTL_PAGE_DEFAULT]; scsi_ulto2b(lun->be_lun->blocksize, format_page->bytes_per_sector); format_page = &lun->mode_pages.format_page[ CTL_PAGE_SAVED]; scsi_ulto2b(lun->be_lun->blocksize, format_page->bytes_per_sector); page_index->page_data = (uint8_t *)lun->mode_pages.format_page; break; } case SMS_RIGID_DISK_PAGE: { struct scsi_rigid_disk_page *rigid_disk_page; uint32_t sectors_per_cylinder; uint64_t cylinders; #ifndef __XSCALE__ int shift; #endif /* !__XSCALE__ */ if (page_index->subpage != SMS_SUBPAGE_PAGE_0) panic("invalid subpage value %d", page_index->subpage); /* * Rotation rate and sectors per track are set * above. We calculate the cylinders here based on * capacity. Due to the number of heads and * sectors per track we're using, smaller arrays * may turn out to have 0 cylinders. Linux and * FreeBSD don't pay attention to these mode pages * to figure out capacity, but Solaris does. It * seems to deal with 0 cylinders just fine, and * works out a fake geometry based on the capacity. */ memcpy(&lun->mode_pages.rigid_disk_page[ CTL_PAGE_DEFAULT], &rigid_disk_page_default, sizeof(rigid_disk_page_default)); memcpy(&lun->mode_pages.rigid_disk_page[ CTL_PAGE_CHANGEABLE],&rigid_disk_page_changeable, sizeof(rigid_disk_page_changeable)); sectors_per_cylinder = CTL_DEFAULT_SECTORS_PER_TRACK * CTL_DEFAULT_HEADS; /* * The divide method here will be more accurate, * probably, but results in floating point being * used in the kernel on i386 (__udivdi3()). On the * XScale, though, __udivdi3() is implemented in * software. * * The shift method for cylinder calculation is * accurate if sectors_per_cylinder is a power of * 2. Otherwise it might be slightly off -- you * might have a bit of a truncation problem. */ #ifdef __XSCALE__ cylinders = (lun->be_lun->maxlba + 1) / sectors_per_cylinder; #else for (shift = 31; shift > 0; shift--) { if (sectors_per_cylinder & (1 << shift)) break; } cylinders = (lun->be_lun->maxlba + 1) >> shift; #endif /* * We've basically got 3 bytes, or 24 bits for the * cylinder size in the mode page. If we're over, * just round down to 2^24. */ if (cylinders > 0xffffff) cylinders = 0xffffff; rigid_disk_page = &lun->mode_pages.rigid_disk_page[ CTL_PAGE_DEFAULT]; scsi_ulto3b(cylinders, rigid_disk_page->cylinders); if ((value = ctl_get_opt(&lun->be_lun->options, "rpm")) != NULL) { scsi_ulto2b(strtol(value, NULL, 0), rigid_disk_page->rotation_rate); } memcpy(&lun->mode_pages.rigid_disk_page[CTL_PAGE_CURRENT], &lun->mode_pages.rigid_disk_page[CTL_PAGE_DEFAULT], sizeof(rigid_disk_page_default)); memcpy(&lun->mode_pages.rigid_disk_page[CTL_PAGE_SAVED], &lun->mode_pages.rigid_disk_page[CTL_PAGE_DEFAULT], sizeof(rigid_disk_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.rigid_disk_page; break; } case SMS_CACHING_PAGE: { struct scsi_caching_page *caching_page; if (page_index->subpage != SMS_SUBPAGE_PAGE_0) panic("invalid subpage value %d", page_index->subpage); memcpy(&lun->mode_pages.caching_page[CTL_PAGE_DEFAULT], &caching_page_default, sizeof(caching_page_default)); memcpy(&lun->mode_pages.caching_page[ CTL_PAGE_CHANGEABLE], &caching_page_changeable, sizeof(caching_page_changeable)); memcpy(&lun->mode_pages.caching_page[CTL_PAGE_SAVED], &caching_page_default, sizeof(caching_page_default)); caching_page = &lun->mode_pages.caching_page[ CTL_PAGE_SAVED]; value = ctl_get_opt(&lun->be_lun->options, "writecache"); if (value != NULL && strcmp(value, "off") == 0) caching_page->flags1 &= ~SCP_WCE; value = ctl_get_opt(&lun->be_lun->options, "readcache"); if (value != NULL && strcmp(value, "off") == 0) caching_page->flags1 |= SCP_RCD; memcpy(&lun->mode_pages.caching_page[CTL_PAGE_CURRENT], &lun->mode_pages.caching_page[CTL_PAGE_SAVED], sizeof(caching_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.caching_page; break; } case SMS_CONTROL_MODE_PAGE: { struct scsi_control_page *control_page; if (page_index->subpage != SMS_SUBPAGE_PAGE_0) panic("invalid subpage value %d", page_index->subpage); memcpy(&lun->mode_pages.control_page[CTL_PAGE_DEFAULT], &control_page_default, sizeof(control_page_default)); memcpy(&lun->mode_pages.control_page[ CTL_PAGE_CHANGEABLE], &control_page_changeable, sizeof(control_page_changeable)); memcpy(&lun->mode_pages.control_page[CTL_PAGE_SAVED], &control_page_default, sizeof(control_page_default)); control_page = &lun->mode_pages.control_page[ CTL_PAGE_SAVED]; value = ctl_get_opt(&lun->be_lun->options, "reordering"); if (value != NULL && strcmp(value, "unrestricted") == 0) { control_page->queue_flags &= ~SCP_QUEUE_ALG_MASK; control_page->queue_flags |= SCP_QUEUE_ALG_UNRESTRICTED; } memcpy(&lun->mode_pages.control_page[CTL_PAGE_CURRENT], &lun->mode_pages.control_page[CTL_PAGE_SAVED], sizeof(control_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.control_page; break; } case SMS_INFO_EXCEPTIONS_PAGE: { switch (page_index->subpage) { case SMS_SUBPAGE_PAGE_0: memcpy(&lun->mode_pages.ie_page[CTL_PAGE_CURRENT], &ie_page_default, sizeof(ie_page_default)); memcpy(&lun->mode_pages.ie_page[ CTL_PAGE_CHANGEABLE], &ie_page_changeable, sizeof(ie_page_changeable)); memcpy(&lun->mode_pages.ie_page[CTL_PAGE_DEFAULT], &ie_page_default, sizeof(ie_page_default)); memcpy(&lun->mode_pages.ie_page[CTL_PAGE_SAVED], &ie_page_default, sizeof(ie_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.ie_page; break; case 0x02: { struct ctl_logical_block_provisioning_page *page; memcpy(&lun->mode_pages.lbp_page[CTL_PAGE_DEFAULT], &lbp_page_default, sizeof(lbp_page_default)); memcpy(&lun->mode_pages.lbp_page[ CTL_PAGE_CHANGEABLE], &lbp_page_changeable, sizeof(lbp_page_changeable)); memcpy(&lun->mode_pages.lbp_page[CTL_PAGE_SAVED], &lbp_page_default, sizeof(lbp_page_default)); page = &lun->mode_pages.lbp_page[CTL_PAGE_SAVED]; value = ctl_get_opt(&lun->be_lun->options, "avail-threshold"); if (value != NULL && ctl_expand_number(value, &ival) == 0) { page->descr[0].flags |= SLBPPD_ENABLED | SLBPPD_ARMING_DEC; if (lun->be_lun->blocksize) ival /= lun->be_lun->blocksize; else ival /= 512; scsi_ulto4b(ival >> CTL_LBP_EXPONENT, page->descr[0].count); } value = ctl_get_opt(&lun->be_lun->options, "used-threshold"); if (value != NULL && ctl_expand_number(value, &ival) == 0) { page->descr[1].flags |= SLBPPD_ENABLED | SLBPPD_ARMING_INC; if (lun->be_lun->blocksize) ival /= lun->be_lun->blocksize; else ival /= 512; scsi_ulto4b(ival >> CTL_LBP_EXPONENT, page->descr[1].count); } value = ctl_get_opt(&lun->be_lun->options, "pool-avail-threshold"); if (value != NULL && ctl_expand_number(value, &ival) == 0) { page->descr[2].flags |= SLBPPD_ENABLED | SLBPPD_ARMING_DEC; if (lun->be_lun->blocksize) ival /= lun->be_lun->blocksize; else ival /= 512; scsi_ulto4b(ival >> CTL_LBP_EXPONENT, page->descr[2].count); } value = ctl_get_opt(&lun->be_lun->options, "pool-used-threshold"); if (value != NULL && ctl_expand_number(value, &ival) == 0) { page->descr[3].flags |= SLBPPD_ENABLED | SLBPPD_ARMING_INC; if (lun->be_lun->blocksize) ival /= lun->be_lun->blocksize; else ival /= 512; scsi_ulto4b(ival >> CTL_LBP_EXPONENT, page->descr[3].count); } memcpy(&lun->mode_pages.lbp_page[CTL_PAGE_CURRENT], &lun->mode_pages.lbp_page[CTL_PAGE_SAVED], sizeof(lbp_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.lbp_page; }} break; } case SMS_VENDOR_SPECIFIC_PAGE:{ switch (page_index->subpage) { case DBGCNF_SUBPAGE_CODE: { struct copan_debugconf_subpage *current_page, *saved_page; memcpy(&lun->mode_pages.debugconf_subpage[ CTL_PAGE_CURRENT], &debugconf_page_default, sizeof(debugconf_page_default)); memcpy(&lun->mode_pages.debugconf_subpage[ CTL_PAGE_CHANGEABLE], &debugconf_page_changeable, sizeof(debugconf_page_changeable)); memcpy(&lun->mode_pages.debugconf_subpage[ CTL_PAGE_DEFAULT], &debugconf_page_default, sizeof(debugconf_page_default)); memcpy(&lun->mode_pages.debugconf_subpage[ CTL_PAGE_SAVED], &debugconf_page_default, sizeof(debugconf_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.debugconf_subpage; current_page = (struct copan_debugconf_subpage *) (page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT)); saved_page = (struct copan_debugconf_subpage *) (page_index->page_data + (page_index->page_len * CTL_PAGE_SAVED)); break; } default: panic("invalid subpage value %d", page_index->subpage); break; } break; } default: panic("invalid page value %d", page_index->page_code & SMPH_PC_MASK); break; } } return (CTL_RETVAL_COMPLETE); } static int ctl_init_log_page_index(struct ctl_lun *lun) { struct ctl_page_index *page_index; int i, j, k, prev; memcpy(&lun->log_pages.index, log_page_index_template, sizeof(log_page_index_template)); prev = -1; for (i = 0, j = 0, k = 0; i < CTL_NUM_LOG_PAGES; i++) { page_index = &lun->log_pages.index[i]; /* * If this is a disk-only mode page, there's no point in * setting it up. For some pages, we have to have some * basic information about the disk in order to calculate the * mode page data. */ if ((lun->be_lun->lun_type != T_DIRECT) && (page_index->page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; if (page_index->page_code == SLS_LOGICAL_BLOCK_PROVISIONING && lun->backend->lun_attr == NULL) continue; if (page_index->page_code != prev) { lun->log_pages.pages_page[j] = page_index->page_code; prev = page_index->page_code; j++; } lun->log_pages.subpages_page[k*2] = page_index->page_code; lun->log_pages.subpages_page[k*2+1] = page_index->subpage; k++; } lun->log_pages.index[0].page_data = &lun->log_pages.pages_page[0]; lun->log_pages.index[0].page_len = j; lun->log_pages.index[1].page_data = &lun->log_pages.subpages_page[0]; lun->log_pages.index[1].page_len = k * 2; lun->log_pages.index[2].page_data = &lun->log_pages.lbp_page[0]; lun->log_pages.index[2].page_len = 12*CTL_NUM_LBP_PARAMS; lun->log_pages.index[3].page_data = (uint8_t *)&lun->log_pages.stat_page; lun->log_pages.index[3].page_len = sizeof(lun->log_pages.stat_page); return (CTL_RETVAL_COMPLETE); } static int hex2bin(const char *str, uint8_t *buf, int buf_size) { int i; u_char c; memset(buf, 0, buf_size); while (isspace(str[0])) str++; if (str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) str += 2; buf_size *= 2; for (i = 0; str[i] != 0 && i < buf_size; i++) { c = str[i]; if (isdigit(c)) c -= '0'; else if (isalpha(c)) c -= isupper(c) ? 'A' - 10 : 'a' - 10; else break; if (c >= 16) break; if ((i & 1) == 0) buf[i / 2] |= (c << 4); else buf[i / 2] |= c; } return ((i + 1) / 2); } /* * LUN allocation. * * Requirements: * - caller allocates and zeros LUN storage, or passes in a NULL LUN if he * wants us to allocate the LUN and he can block. * - ctl_softc is always set * - be_lun is set if the LUN has a backend (needed for disk LUNs) * * Returns 0 for success, non-zero (errno) for failure. */ static int ctl_alloc_lun(struct ctl_softc *ctl_softc, struct ctl_lun *ctl_lun, struct ctl_be_lun *const be_lun) { struct ctl_lun *nlun, *lun; struct scsi_vpd_id_descriptor *desc; struct scsi_vpd_id_t10 *t10id; const char *eui, *naa, *scsiname, *vendor; int lun_number, i, lun_malloced; int devidlen, idlen1, idlen2 = 0, len; if (be_lun == NULL) return (EINVAL); /* * We currently only support Direct Access or Processor LUN types. */ switch (be_lun->lun_type) { case T_DIRECT: break; case T_PROCESSOR: break; case T_SEQUENTIAL: case T_CHANGER: default: be_lun->lun_config_status(be_lun->be_lun, CTL_LUN_CONFIG_FAILURE); break; } if (ctl_lun == NULL) { lun = malloc(sizeof(*lun), M_CTL, M_WAITOK); lun_malloced = 1; } else { lun_malloced = 0; lun = ctl_lun; } memset(lun, 0, sizeof(*lun)); if (lun_malloced) lun->flags = CTL_LUN_MALLOCED; /* Generate LUN ID. */ devidlen = max(CTL_DEVID_MIN_LEN, strnlen(be_lun->device_id, CTL_DEVID_LEN)); idlen1 = sizeof(*t10id) + devidlen; len = sizeof(struct scsi_vpd_id_descriptor) + idlen1; scsiname = ctl_get_opt(&be_lun->options, "scsiname"); if (scsiname != NULL) { idlen2 = roundup2(strlen(scsiname) + 1, 4); len += sizeof(struct scsi_vpd_id_descriptor) + idlen2; } eui = ctl_get_opt(&be_lun->options, "eui"); if (eui != NULL) { len += sizeof(struct scsi_vpd_id_descriptor) + 16; } naa = ctl_get_opt(&be_lun->options, "naa"); if (naa != NULL) { len += sizeof(struct scsi_vpd_id_descriptor) + 16; } lun->lun_devid = malloc(sizeof(struct ctl_devid) + len, M_CTL, M_WAITOK | M_ZERO); desc = (struct scsi_vpd_id_descriptor *)lun->lun_devid->data; desc->proto_codeset = SVPD_ID_CODESET_ASCII; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_T10; desc->length = idlen1; t10id = (struct scsi_vpd_id_t10 *)&desc->identifier[0]; memset(t10id->vendor, ' ', sizeof(t10id->vendor)); if ((vendor = ctl_get_opt(&be_lun->options, "vendor")) == NULL) { strncpy((char *)t10id->vendor, CTL_VENDOR, sizeof(t10id->vendor)); } else { strncpy(t10id->vendor, vendor, min(sizeof(t10id->vendor), strlen(vendor))); } strncpy((char *)t10id->vendor_spec_id, (char *)be_lun->device_id, devidlen); if (scsiname != NULL) { desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + desc->length); desc->proto_codeset = SVPD_ID_CODESET_UTF8; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_SCSI_NAME; desc->length = idlen2; strlcpy(desc->identifier, scsiname, idlen2); } if (eui != NULL) { desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + desc->length); desc->proto_codeset = SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_EUI64; desc->length = hex2bin(eui, desc->identifier, 16); desc->length = desc->length > 12 ? 16 : (desc->length > 8 ? 12 : 8); len -= 16 - desc->length; } if (naa != NULL) { desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + desc->length); desc->proto_codeset = SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_NAA; desc->length = hex2bin(naa, desc->identifier, 16); desc->length = desc->length > 8 ? 16 : 8; len -= 16 - desc->length; } lun->lun_devid->len = len; mtx_lock(&ctl_softc->ctl_lock); /* * See if the caller requested a particular LUN number. If so, see * if it is available. Otherwise, allocate the first available LUN. */ if (be_lun->flags & CTL_LUN_FLAG_ID_REQ) { if ((be_lun->req_lun_id > (CTL_MAX_LUNS - 1)) || (ctl_is_set(ctl_softc->ctl_lun_mask, be_lun->req_lun_id))) { mtx_unlock(&ctl_softc->ctl_lock); if (be_lun->req_lun_id > (CTL_MAX_LUNS - 1)) { printf("ctl: requested LUN ID %d is higher " "than CTL_MAX_LUNS - 1 (%d)\n", be_lun->req_lun_id, CTL_MAX_LUNS - 1); } else { /* * XXX KDM return an error, or just assign * another LUN ID in this case?? */ printf("ctl: requested LUN ID %d is already " "in use\n", be_lun->req_lun_id); } if (lun->flags & CTL_LUN_MALLOCED) free(lun, M_CTL); be_lun->lun_config_status(be_lun->be_lun, CTL_LUN_CONFIG_FAILURE); return (ENOSPC); } lun_number = be_lun->req_lun_id; } else { lun_number = ctl_ffz(ctl_softc->ctl_lun_mask, 0, CTL_MAX_LUNS); if (lun_number == -1) { mtx_unlock(&ctl_softc->ctl_lock); printf("ctl: can't allocate LUN, out of LUNs\n"); if (lun->flags & CTL_LUN_MALLOCED) free(lun, M_CTL); be_lun->lun_config_status(be_lun->be_lun, CTL_LUN_CONFIG_FAILURE); return (ENOSPC); } } ctl_set_mask(ctl_softc->ctl_lun_mask, lun_number); mtx_init(&lun->lun_lock, "CTL LUN", NULL, MTX_DEF); lun->lun = lun_number; lun->be_lun = be_lun; /* * The processor LUN is always enabled. Disk LUNs come on line * disabled, and must be enabled by the backend. */ lun->flags |= CTL_LUN_DISABLED; lun->backend = be_lun->be; be_lun->ctl_lun = lun; be_lun->lun_id = lun_number; atomic_add_int(&be_lun->be->num_luns, 1); if (be_lun->flags & CTL_LUN_FLAG_OFFLINE) lun->flags |= CTL_LUN_OFFLINE; if (be_lun->flags & CTL_LUN_FLAG_POWERED_OFF) lun->flags |= CTL_LUN_STOPPED; if (be_lun->flags & CTL_LUN_FLAG_INOPERABLE) lun->flags |= CTL_LUN_INOPERABLE; if (be_lun->flags & CTL_LUN_FLAG_PRIMARY) lun->flags |= CTL_LUN_PRIMARY_SC; lun->ctl_softc = ctl_softc; #ifdef CTL_TIME_IO lun->last_busy = getsbinuptime(); #endif TAILQ_INIT(&lun->ooa_queue); TAILQ_INIT(&lun->blocked_queue); STAILQ_INIT(&lun->error_list); ctl_tpc_lun_init(lun); /* * Initialize the mode and log page index. */ ctl_init_page_index(lun); ctl_init_log_page_index(lun); /* * Now, before we insert this lun on the lun list, set the lun * inventory changed UA for all other luns. */ STAILQ_FOREACH(nlun, &ctl_softc->lun_list, links) { mtx_lock(&nlun->lun_lock); ctl_est_ua_all(nlun, -1, CTL_UA_LUN_CHANGE); mtx_unlock(&nlun->lun_lock); } STAILQ_INSERT_TAIL(&ctl_softc->lun_list, lun, links); ctl_softc->ctl_luns[lun_number] = lun; ctl_softc->num_luns++; /* Setup statistics gathering */ lun->stats.device_type = be_lun->lun_type; lun->stats.lun_number = lun_number; if (lun->stats.device_type == T_DIRECT) lun->stats.blocksize = be_lun->blocksize; else lun->stats.flags = CTL_LUN_STATS_NO_BLOCKSIZE; for (i = 0;i < CTL_MAX_PORTS;i++) lun->stats.ports[i].targ_port = i; mtx_unlock(&ctl_softc->ctl_lock); lun->be_lun->lun_config_status(lun->be_lun->be_lun, CTL_LUN_CONFIG_OK); return (0); } /* * Delete a LUN. * Assumptions: * - LUN has already been marked invalid and any pending I/O has been taken * care of. */ static int ctl_free_lun(struct ctl_lun *lun) { struct ctl_softc *softc; struct ctl_lun *nlun; int i; softc = lun->ctl_softc; mtx_assert(&softc->ctl_lock, MA_OWNED); STAILQ_REMOVE(&softc->lun_list, lun, ctl_lun, links); ctl_clear_mask(softc->ctl_lun_mask, lun->lun); softc->ctl_luns[lun->lun] = NULL; if (!TAILQ_EMPTY(&lun->ooa_queue)) panic("Freeing a LUN %p with outstanding I/O!!\n", lun); softc->num_luns--; /* * Tell the backend to free resources, if this LUN has a backend. */ atomic_subtract_int(&lun->be_lun->be->num_luns, 1); lun->be_lun->lun_shutdown(lun->be_lun->be_lun); ctl_tpc_lun_shutdown(lun); mtx_destroy(&lun->lun_lock); free(lun->lun_devid, M_CTL); for (i = 0; i < CTL_MAX_PORTS; i++) free(lun->pending_ua[i], M_CTL); for (i = 0; i < CTL_MAX_PORTS; i++) free(lun->pr_keys[i], M_CTL); free(lun->write_buffer, M_CTL); if (lun->flags & CTL_LUN_MALLOCED) free(lun, M_CTL); STAILQ_FOREACH(nlun, &softc->lun_list, links) { mtx_lock(&nlun->lun_lock); ctl_est_ua_all(nlun, -1, CTL_UA_LUN_CHANGE); mtx_unlock(&nlun->lun_lock); } return (0); } static void ctl_create_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *softc; softc = control_softc; /* * ctl_alloc_lun() should handle all potential failure cases. */ ctl_alloc_lun(softc, NULL, be_lun); } int ctl_add_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *softc = control_softc; mtx_lock(&softc->ctl_lock); STAILQ_INSERT_TAIL(&softc->pending_lun_queue, be_lun, links); mtx_unlock(&softc->ctl_lock); wakeup(&softc->pending_lun_queue); return (0); } int ctl_enable_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *softc; struct ctl_port *port, *nport; struct ctl_lun *lun; int retval; lun = (struct ctl_lun *)be_lun->ctl_lun; softc = lun->ctl_softc; mtx_lock(&softc->ctl_lock); mtx_lock(&lun->lun_lock); if ((lun->flags & CTL_LUN_DISABLED) == 0) { /* * eh? Why did we get called if the LUN is already * enabled? */ mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); return (0); } lun->flags &= ~CTL_LUN_DISABLED; mtx_unlock(&lun->lun_lock); for (port = STAILQ_FIRST(&softc->port_list); port != NULL; port = nport) { nport = STAILQ_NEXT(port, links); if ((port->status & CTL_PORT_STATUS_ONLINE) == 0 || port->lun_map != NULL || port->lun_enable == NULL) continue; /* * Drop the lock while we call the FETD's enable routine. * This can lead to a callback into CTL (at least in the * case of the internal initiator frontend. */ mtx_unlock(&softc->ctl_lock); retval = port->lun_enable(port->targ_lun_arg, lun->lun); mtx_lock(&softc->ctl_lock); if (retval != 0) { printf("%s: FETD %s port %d returned error " "%d for lun_enable on lun %jd\n", __func__, port->port_name, port->targ_port, retval, (intmax_t)lun->lun); } } mtx_unlock(&softc->ctl_lock); ctl_isc_announce_lun(lun); return (0); } int ctl_disable_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *softc; struct ctl_port *port; struct ctl_lun *lun; int retval; lun = (struct ctl_lun *)be_lun->ctl_lun; softc = lun->ctl_softc; mtx_lock(&softc->ctl_lock); mtx_lock(&lun->lun_lock); if (lun->flags & CTL_LUN_DISABLED) { mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); return (0); } lun->flags |= CTL_LUN_DISABLED; mtx_unlock(&lun->lun_lock); STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0 || port->lun_map != NULL || port->lun_disable == NULL) continue; /* * Drop the lock before we call the frontend's disable * routine, to avoid lock order reversals. * * XXX KDM what happens if the frontend list changes while * we're traversing it? It's unlikely, but should be handled. */ mtx_unlock(&softc->ctl_lock); retval = port->lun_disable(port->targ_lun_arg, lun->lun); mtx_lock(&softc->ctl_lock); if (retval != 0) { printf("%s: FETD %s port %d returned error " "%d for lun_disable on lun %jd\n", __func__, port->port_name, port->targ_port, retval, (intmax_t)lun->lun); } } mtx_unlock(&softc->ctl_lock); ctl_isc_announce_lun(lun); return (0); } int ctl_start_lun(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_STOPPED; mtx_unlock(&lun->lun_lock); return (0); } int ctl_stop_lun(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags |= CTL_LUN_STOPPED; mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_offline(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags |= CTL_LUN_OFFLINE; mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_online(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_OFFLINE; mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_primary(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags |= CTL_LUN_PRIMARY_SC; - mtx_unlock(&lun->lun_lock); ctl_est_ua_all(lun, -1, CTL_UA_ASYM_ACC_CHANGE); + mtx_unlock(&lun->lun_lock); ctl_isc_announce_lun(lun); return (0); } int ctl_lun_secondary(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_PRIMARY_SC; - mtx_unlock(&lun->lun_lock); ctl_est_ua_all(lun, -1, CTL_UA_ASYM_ACC_CHANGE); + mtx_unlock(&lun->lun_lock); ctl_isc_announce_lun(lun); return (0); } int ctl_invalidate_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *softc; struct ctl_lun *lun; lun = (struct ctl_lun *)be_lun->ctl_lun; softc = lun->ctl_softc; mtx_lock(&lun->lun_lock); /* * The LUN needs to be disabled before it can be marked invalid. */ if ((lun->flags & CTL_LUN_DISABLED) == 0) { mtx_unlock(&lun->lun_lock); return (-1); } /* * Mark the LUN invalid. */ lun->flags |= CTL_LUN_INVALID; /* * If there is nothing in the OOA queue, go ahead and free the LUN. * If we have something in the OOA queue, we'll free it when the * last I/O completes. */ if (TAILQ_EMPTY(&lun->ooa_queue)) { mtx_unlock(&lun->lun_lock); mtx_lock(&softc->ctl_lock); ctl_free_lun(lun); mtx_unlock(&softc->ctl_lock); } else mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_inoperable(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags |= CTL_LUN_INOPERABLE; mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_operable(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_INOPERABLE; mtx_unlock(&lun->lun_lock); return (0); } void ctl_lun_capacity_changed(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; union ctl_ha_msg msg; mtx_lock(&lun->lun_lock); ctl_est_ua_all(lun, -1, CTL_UA_CAPACITY_CHANGED); mtx_unlock(&lun->lun_lock); if (lun->ctl_softc->ha_mode == CTL_HA_MODE_XFER) { /* Send msg to other side. */ bzero(&msg.ua, sizeof(msg.ua)); msg.hdr.msg_type = CTL_MSG_UA; msg.hdr.nexus.initid = -1; msg.hdr.nexus.targ_port = -1; msg.hdr.nexus.targ_lun = lun->lun; msg.hdr.nexus.targ_mapped_lun = lun->lun; msg.ua.ua_all = 1; msg.ua.ua_set = 1; msg.ua.ua_type = CTL_UA_CAPACITY_CHANGED; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.ua), M_WAITOK); } } /* * Backend "memory move is complete" callback for requests that never * make it down to say RAIDCore's configuration code. */ int ctl_config_move_done(union ctl_io *io) { int retval; CTL_DEBUG_PRINT(("ctl_config_move_done\n")); KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Config I/O type isn't CTL_IO_SCSI (%d)!", io->io_hdr.io_type)); if ((io->io_hdr.port_status != 0) && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { /* * For hardware error sense keys, the sense key * specific value is defined to be a retry count, * but we use it to pass back an internal FETD * error code. XXX KDM Hopefully the FETD is only * using 16 bits for an error code, since that's * all the space we have in the sks field. */ ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ io->io_hdr.port_status); } if (ctl_debug & CTL_DEBUG_CDB_DATA) ctl_data_print(io); if (((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)) { /* * XXX KDM just assuming a single pointer here, and not a * S/G list. If we start using S/G lists for config data, * we'll need to know how to clean them up here as well. */ if (io->io_hdr.flags & CTL_FLAG_ALLOCATED) free(io->scsiio.kern_data_ptr, M_CTL); ctl_done(io); retval = CTL_RETVAL_COMPLETE; } else { /* * XXX KDM now we need to continue data movement. Some * options: * - call ctl_scsiio() again? We don't do this for data * writes, because for those at least we know ahead of * time where the write will go and how long it is. For * config writes, though, that information is largely * contained within the write itself, thus we need to * parse out the data again. * * - Call some other function once the data is in? */ /* * XXX KDM call ctl_scsiio() again for now, and check flag * bits to see whether we're allocated or not. */ retval = ctl_scsiio(&io->scsiio); } return (retval); } /* * This gets called by a backend driver when it is done with a * data_submit method. */ void ctl_data_submit_done(union ctl_io *io) { /* * If the IO_CONT flag is set, we need to call the supplied * function to continue processing the I/O, instead of completing * the I/O just yet. * * If there is an error, though, we don't want to keep processing. * Instead, just send status back to the initiator. */ if ((io->io_hdr.flags & CTL_FLAG_IO_CONT) && (io->io_hdr.flags & CTL_FLAG_ABORT) == 0 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { io->scsiio.io_cont(io); return; } ctl_done(io); } /* * This gets called by a backend driver when it is done with a * configuration write. */ void ctl_config_write_done(union ctl_io *io) { uint8_t *buf; /* * If the IO_CONT flag is set, we need to call the supplied * function to continue processing the I/O, instead of completing * the I/O just yet. * * If there is an error, though, we don't want to keep processing. * Instead, just send status back to the initiator. */ if ((io->io_hdr.flags & CTL_FLAG_IO_CONT) && (io->io_hdr.flags & CTL_FLAG_ABORT) == 0 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { io->scsiio.io_cont(io); return; } /* * Since a configuration write can be done for commands that actually * have data allocated, like write buffer, and commands that have * no data, like start/stop unit, we need to check here. */ if (io->io_hdr.flags & CTL_FLAG_ALLOCATED) buf = io->scsiio.kern_data_ptr; else buf = NULL; ctl_done(io); if (buf) free(buf, M_CTL); } void ctl_config_read_done(union ctl_io *io) { uint8_t *buf; /* * If there is some error -- we are done, skip data transfer. */ if ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) { if (io->io_hdr.flags & CTL_FLAG_ALLOCATED) buf = io->scsiio.kern_data_ptr; else buf = NULL; ctl_done(io); if (buf) free(buf, M_CTL); return; } /* * If the IO_CONT flag is set, we need to call the supplied * function to continue processing the I/O, instead of completing * the I/O just yet. */ if (io->io_hdr.flags & CTL_FLAG_IO_CONT) { io->scsiio.io_cont(io); return; } ctl_datamove(io); } /* * SCSI release command. */ int ctl_scsi_release(struct ctl_scsiio *ctsio) { int length, longid, thirdparty_id, resv_id; struct ctl_lun *lun; uint32_t residx; length = 0; resv_id = 0; CTL_DEBUG_PRINT(("ctl_scsi_release\n")); residx = ctl_get_initindex(&ctsio->io_hdr.nexus); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; switch (ctsio->cdb[0]) { case RELEASE_10: { struct scsi_release_10 *cdb; cdb = (struct scsi_release_10 *)ctsio->cdb; if (cdb->byte2 & SR10_LONGID) longid = 1; else thirdparty_id = cdb->thirdparty_id; resv_id = cdb->resv_id; length = scsi_2btoul(cdb->length); break; } } /* * XXX KDM right now, we only support LUN reservation. We don't * support 3rd party reservations, or extent reservations, which * might actually need the parameter list. If we've gotten this * far, we've got a LUN reservation. Anything else got kicked out * above. So, according to SPC, ignore the length. */ length = 0; if (((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) && (length > 0)) { ctsio->kern_data_ptr = malloc(length, M_CTL, M_WAITOK); ctsio->kern_data_len = length; ctsio->kern_total_len = length; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (length > 0) thirdparty_id = scsi_8btou64(ctsio->kern_data_ptr); mtx_lock(&lun->lun_lock); /* * According to SPC, it is not an error for an intiator to attempt * to release a reservation on a LUN that isn't reserved, or that * is reserved by another initiator. The reservation can only be * released, though, by the initiator who made it or by one of * several reset type events. */ if ((lun->flags & CTL_LUN_RESERVED) && (lun->res_idx == residx)) lun->flags &= ~CTL_LUN_RESERVED; mtx_unlock(&lun->lun_lock); if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_scsi_reserve(struct ctl_scsiio *ctsio) { int extent, thirdparty, longid; int resv_id, length; uint64_t thirdparty_id; struct ctl_lun *lun; uint32_t residx; extent = 0; thirdparty = 0; longid = 0; resv_id = 0; length = 0; thirdparty_id = 0; CTL_DEBUG_PRINT(("ctl_reserve\n")); residx = ctl_get_initindex(&ctsio->io_hdr.nexus); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; switch (ctsio->cdb[0]) { case RESERVE_10: { struct scsi_reserve_10 *cdb; cdb = (struct scsi_reserve_10 *)ctsio->cdb; if (cdb->byte2 & SR10_LONGID) longid = 1; else thirdparty_id = cdb->thirdparty_id; resv_id = cdb->resv_id; length = scsi_2btoul(cdb->length); break; } } /* * XXX KDM right now, we only support LUN reservation. We don't * support 3rd party reservations, or extent reservations, which * might actually need the parameter list. If we've gotten this * far, we've got a LUN reservation. Anything else got kicked out * above. So, according to SPC, ignore the length. */ length = 0; if (((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) && (length > 0)) { ctsio->kern_data_ptr = malloc(length, M_CTL, M_WAITOK); ctsio->kern_data_len = length; ctsio->kern_total_len = length; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (length > 0) thirdparty_id = scsi_8btou64(ctsio->kern_data_ptr); mtx_lock(&lun->lun_lock); if ((lun->flags & CTL_LUN_RESERVED) && (lun->res_idx != residx)) { ctl_set_reservation_conflict(ctsio); goto bailout; } lun->flags |= CTL_LUN_RESERVED; lun->res_idx = residx; ctl_set_success(ctsio); bailout: mtx_unlock(&lun->lun_lock); if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_start_stop(struct ctl_scsiio *ctsio) { struct scsi_start_stop_unit *cdb; struct ctl_lun *lun; int retval; CTL_DEBUG_PRINT(("ctl_start_stop\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; retval = 0; cdb = (struct scsi_start_stop_unit *)ctsio->cdb; /* * XXX KDM * We don't support the immediate bit on a stop unit. In order to * do that, we would need to code up a way to know that a stop is * pending, and hold off any new commands until it completes, one * way or another. Then we could accept or reject those commands * depending on its status. We would almost need to do the reverse * of what we do below for an immediate start -- return the copy of * the ctl_io to the FETD with status to send to the host (and to * free the copy!) and then free the original I/O once the stop * actually completes. That way, the OOA queue mechanism can work * to block commands that shouldn't proceed. Another alternative * would be to put the copy in the queue in place of the original, * and return the original back to the caller. That could be * slightly safer.. */ if ((cdb->byte2 & SSS_IMMED) && ((cdb->how & SSS_START) == 0)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if ((lun->flags & CTL_LUN_PR_RESERVED) && ((cdb->how & SSS_START)==0)) { uint32_t residx; residx = ctl_get_initindex(&ctsio->io_hdr.nexus); if (ctl_get_prkey(lun, residx) == 0 || (lun->pr_res_idx!=residx && lun->res_type < 4)) { ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } /* * If there is no backend on this device, we can't start or stop * it. In theory we shouldn't get any start/stop commands in the * first place at this level if the LUN doesn't have a backend. * That should get stopped by the command decode code. */ if (lun->backend == NULL) { ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * XXX KDM Copan-specific offline behavior. * Figure out a reasonable way to port this? */ #ifdef NEEDTOPORT mtx_lock(&lun->lun_lock); if (((cdb->byte2 & SSS_ONOFFLINE) == 0) && (lun->flags & CTL_LUN_OFFLINE)) { /* * If the LUN is offline, and the on/offline bit isn't set, * reject the start or stop. Otherwise, let it through. */ mtx_unlock(&lun->lun_lock); ctl_set_lun_not_ready(ctsio); ctl_done((union ctl_io *)ctsio); } else { mtx_unlock(&lun->lun_lock); #endif /* NEEDTOPORT */ /* * This could be a start or a stop when we're online, * or a stop/offline or start/online. A start or stop when * we're offline is covered in the case above. */ /* * In the non-immediate case, we send the request to * the backend and return status to the user when * it is done. * * In the immediate case, we allocate a new ctl_io * to hold a copy of the request, and send that to * the backend. We then set good status on the * user's request and return it immediately. */ if (cdb->byte2 & SSS_IMMED) { union ctl_io *new_io; new_io = ctl_alloc_io(ctsio->io_hdr.pool); ctl_copy_io((union ctl_io *)ctsio, new_io); retval = lun->backend->config_write(new_io); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); } else { retval = lun->backend->config_write( (union ctl_io *)ctsio); } #ifdef NEEDTOPORT } #endif return (retval); } /* * We support the SYNCHRONIZE CACHE command (10 and 16 byte versions), but * we don't really do anything with the LBA and length fields if the user * passes them in. Instead we'll just flush out the cache for the entire * LUN. */ int ctl_sync_cache(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct ctl_softc *softc; struct ctl_lba_len_flags *lbalen; uint64_t starting_lba; uint32_t block_count; int retval; uint8_t byte2; CTL_DEBUG_PRINT(("ctl_sync_cache\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; softc = lun->ctl_softc; retval = 0; switch (ctsio->cdb[0]) { case SYNCHRONIZE_CACHE: { struct scsi_sync_cache *cdb; cdb = (struct scsi_sync_cache *)ctsio->cdb; starting_lba = scsi_4btoul(cdb->begin_lba); block_count = scsi_2btoul(cdb->lb_count); byte2 = cdb->byte2; break; } case SYNCHRONIZE_CACHE_16: { struct scsi_sync_cache_16 *cdb; cdb = (struct scsi_sync_cache_16 *)ctsio->cdb; starting_lba = scsi_8btou64(cdb->begin_lba); block_count = scsi_4btoul(cdb->lb_count); byte2 = cdb->byte2; break; } default: ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); goto bailout; break; /* NOTREACHED */ } /* * We check the LBA and length, but don't do anything with them. * A SYNCHRONIZE CACHE will cause the entire cache for this lun to * get flushed. This check will just help satisfy anyone who wants * to see an error for an out of range LBA. */ if ((starting_lba + block_count) > (lun->be_lun->maxlba + 1)) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); goto bailout; } /* * If this LUN has no backend, we can't flush the cache anyway. */ if (lun->backend == NULL) { ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); goto bailout; } lbalen = (struct ctl_lba_len_flags *)&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = starting_lba; lbalen->len = block_count; lbalen->flags = byte2; /* * Check to see whether we're configured to send the SYNCHRONIZE * CACHE command directly to the back end. */ mtx_lock(&lun->lun_lock); if ((softc->flags & CTL_FLAG_REAL_SYNC) && (++(lun->sync_count) >= lun->sync_interval)) { lun->sync_count = 0; mtx_unlock(&lun->lun_lock); retval = lun->backend->config_write((union ctl_io *)ctsio); } else { mtx_unlock(&lun->lun_lock); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); } bailout: return (retval); } int ctl_format(struct ctl_scsiio *ctsio) { struct scsi_format *cdb; struct ctl_lun *lun; int length, defect_list_len; CTL_DEBUG_PRINT(("ctl_format\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_format *)ctsio->cdb; length = 0; if (cdb->byte2 & SF_FMTDATA) { if (cdb->byte2 & SF_LONGLIST) length = sizeof(struct scsi_format_header_long); else length = sizeof(struct scsi_format_header_short); } if (((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) && (length > 0)) { ctsio->kern_data_ptr = malloc(length, M_CTL, M_WAITOK); ctsio->kern_data_len = length; ctsio->kern_total_len = length; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } defect_list_len = 0; if (cdb->byte2 & SF_FMTDATA) { if (cdb->byte2 & SF_LONGLIST) { struct scsi_format_header_long *header; header = (struct scsi_format_header_long *) ctsio->kern_data_ptr; defect_list_len = scsi_4btoul(header->defect_list_len); if (defect_list_len != 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); goto bailout; } } else { struct scsi_format_header_short *header; header = (struct scsi_format_header_short *) ctsio->kern_data_ptr; defect_list_len = scsi_2btoul(header->defect_list_len); if (defect_list_len != 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); goto bailout; } } } /* * The format command will clear out the "Medium format corrupted" * status if set by the configuration code. That status is really * just a way to notify the host that we have lost the media, and * get them to issue a command that will basically make them think * they're blowing away the media. */ mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_INOPERABLE; mtx_unlock(&lun->lun_lock); ctl_set_success(ctsio); bailout: if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_read_buffer(struct ctl_scsiio *ctsio) { struct scsi_read_buffer *cdb; struct ctl_lun *lun; int buffer_offset, len; static uint8_t descr[4]; static uint8_t echo_descr[4] = { 0 }; CTL_DEBUG_PRINT(("ctl_read_buffer\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_read_buffer *)ctsio->cdb; if ((cdb->byte2 & RWB_MODE) != RWB_MODE_DATA && (cdb->byte2 & RWB_MODE) != RWB_MODE_ECHO_DESCR && (cdb->byte2 & RWB_MODE) != RWB_MODE_DESCR) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } len = scsi_3btoul(cdb->length); buffer_offset = scsi_3btoul(cdb->offset); if (buffer_offset + len > CTL_WRITE_BUFFER_SIZE) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 6, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if ((cdb->byte2 & RWB_MODE) == RWB_MODE_DESCR) { descr[0] = 0; scsi_ulto3b(CTL_WRITE_BUFFER_SIZE, &descr[1]); ctsio->kern_data_ptr = descr; len = min(len, sizeof(descr)); } else if ((cdb->byte2 & RWB_MODE) == RWB_MODE_ECHO_DESCR) { ctsio->kern_data_ptr = echo_descr; len = min(len, sizeof(echo_descr)); } else { if (lun->write_buffer == NULL) { lun->write_buffer = malloc(CTL_WRITE_BUFFER_SIZE, M_CTL, M_WAITOK); } ctsio->kern_data_ptr = lun->write_buffer + buffer_offset; } ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctl_set_success(ctsio); ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_write_buffer(struct ctl_scsiio *ctsio) { struct scsi_write_buffer *cdb; struct ctl_lun *lun; int buffer_offset, len; CTL_DEBUG_PRINT(("ctl_write_buffer\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_write_buffer *)ctsio->cdb; if ((cdb->byte2 & RWB_MODE) != RWB_MODE_DATA) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } len = scsi_3btoul(cdb->length); buffer_offset = scsi_3btoul(cdb->offset); if (buffer_offset + len > CTL_WRITE_BUFFER_SIZE) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 6, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { if (lun->write_buffer == NULL) { lun->write_buffer = malloc(CTL_WRITE_BUFFER_SIZE, M_CTL, M_WAITOK); } ctsio->kern_data_ptr = lun->write_buffer + buffer_offset; ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_write_same(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t num_blocks; int len, retval; uint8_t byte2; retval = CTL_RETVAL_COMPLETE; CTL_DEBUG_PRINT(("ctl_write_same\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; switch (ctsio->cdb[0]) { case WRITE_SAME_10: { struct scsi_write_same_10 *cdb; cdb = (struct scsi_write_same_10 *)ctsio->cdb; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_2btoul(cdb->length); byte2 = cdb->byte2; break; } case WRITE_SAME_16: { struct scsi_write_same_16 *cdb; cdb = (struct scsi_write_same_16 *)ctsio->cdb; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); byte2 = cdb->byte2; break; } default: /* * We got a command we don't support. This shouldn't * happen, commands should be filtered out above us. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* NDOB and ANCHOR flags can be used only together with UNMAP */ if ((byte2 & SWS_UNMAP) == 0 && (byte2 & (SWS_NDOB | SWS_ANCHOR)) != 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * The first check is to make sure we're in bounds, the second * check is to catch wrap-around problems. If the lba + num blocks * is less than the lba, then we've wrapped around and the block * range is invalid anyway. */ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* Zero number of blocks means "to the last logical block" */ if (num_blocks == 0) { if ((lun->be_lun->maxlba + 1) - lba > UINT32_MAX) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 0, /*command*/ 1, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } num_blocks = (lun->be_lun->maxlba + 1) - lba; } len = lun->be_lun->blocksize; /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((byte2 & SWS_NDOB) == 0 && (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK);; ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } lbalen = (struct ctl_lba_len_flags *)&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = num_blocks; lbalen->flags = byte2; retval = lun->backend->config_write((union ctl_io *)ctsio); return (retval); } int ctl_unmap(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct scsi_unmap *cdb; struct ctl_ptr_len_flags *ptrlen; struct scsi_unmap_header *hdr; struct scsi_unmap_desc *buf, *end, *endnz, *range; uint64_t lba; uint32_t num_blocks; int len, retval; uint8_t byte2; retval = CTL_RETVAL_COMPLETE; CTL_DEBUG_PRINT(("ctl_unmap\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_unmap *)ctsio->cdb; len = scsi_2btoul(cdb->length); byte2 = cdb->byte2; /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK);; ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } len = ctsio->kern_total_len - ctsio->kern_data_resid; hdr = (struct scsi_unmap_header *)ctsio->kern_data_ptr; if (len < sizeof (*hdr) || len < (scsi_2btoul(hdr->length) + sizeof(hdr->length)) || len < (scsi_2btoul(hdr->desc_length) + sizeof (*hdr)) || scsi_2btoul(hdr->desc_length) % sizeof(*buf) != 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 0, /*command*/ 0, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); goto done; } len = scsi_2btoul(hdr->desc_length); buf = (struct scsi_unmap_desc *)(hdr + 1); end = buf + len / sizeof(*buf); endnz = buf; for (range = buf; range < end; range++) { lba = scsi_8btou64(range->lba); num_blocks = scsi_4btoul(range->length); if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (num_blocks != 0) endnz = range + 1; } /* * Block backend can not handle zero last range. * Filter it out and return if there is nothing left. */ len = (uint8_t *)endnz - (uint8_t *)buf; if (len == 0) { ctl_set_success(ctsio); goto done; } mtx_lock(&lun->lun_lock); ptrlen = (struct ctl_ptr_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; ptrlen->ptr = (void *)buf; ptrlen->len = len; ptrlen->flags = byte2; ctl_check_blocked(lun); mtx_unlock(&lun->lun_lock); retval = lun->backend->config_write((union ctl_io *)ctsio); return (retval); done: if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Note that this function currently doesn't actually do anything inside * CTL to enforce things if the DQue bit is turned on. * * Also note that this function can't be used in the default case, because * the DQue bit isn't set in the changeable mask for the control mode page * anyway. This is just here as an example for how to implement a page * handler, and a placeholder in case we want to allow the user to turn * tagged queueing on and off. * * The D_SENSE bit handling is functional, however, and will turn * descriptor sense on and off for a given LUN. */ int ctl_control_page_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr) { struct scsi_control_page *current_cp, *saved_cp, *user_cp; struct ctl_lun *lun; int set_ua; uint32_t initidx; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); set_ua = 0; user_cp = (struct scsi_control_page *)page_ptr; current_cp = (struct scsi_control_page *) (page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT)); saved_cp = (struct scsi_control_page *) (page_index->page_data + (page_index->page_len * CTL_PAGE_SAVED)); mtx_lock(&lun->lun_lock); if (((current_cp->rlec & SCP_DSENSE) == 0) && ((user_cp->rlec & SCP_DSENSE) != 0)) { /* * Descriptor sense is currently turned off and the user * wants to turn it on. */ current_cp->rlec |= SCP_DSENSE; saved_cp->rlec |= SCP_DSENSE; lun->flags |= CTL_LUN_SENSE_DESC; set_ua = 1; } else if (((current_cp->rlec & SCP_DSENSE) != 0) && ((user_cp->rlec & SCP_DSENSE) == 0)) { /* * Descriptor sense is currently turned on, and the user * wants to turn it off. */ current_cp->rlec &= ~SCP_DSENSE; saved_cp->rlec &= ~SCP_DSENSE; lun->flags &= ~CTL_LUN_SENSE_DESC; set_ua = 1; } if ((current_cp->queue_flags & SCP_QUEUE_ALG_MASK) != (user_cp->queue_flags & SCP_QUEUE_ALG_MASK)) { current_cp->queue_flags &= ~SCP_QUEUE_ALG_MASK; current_cp->queue_flags |= user_cp->queue_flags & SCP_QUEUE_ALG_MASK; saved_cp->queue_flags &= ~SCP_QUEUE_ALG_MASK; saved_cp->queue_flags |= user_cp->queue_flags & SCP_QUEUE_ALG_MASK; set_ua = 1; } if ((current_cp->eca_and_aen & SCP_SWP) != (user_cp->eca_and_aen & SCP_SWP)) { current_cp->eca_and_aen &= ~SCP_SWP; current_cp->eca_and_aen |= user_cp->eca_and_aen & SCP_SWP; saved_cp->eca_and_aen &= ~SCP_SWP; saved_cp->eca_and_aen |= user_cp->eca_and_aen & SCP_SWP; set_ua = 1; } if (set_ua != 0) ctl_est_ua_all(lun, initidx, CTL_UA_MODE_CHANGE); mtx_unlock(&lun->lun_lock); return (0); } int ctl_caching_sp_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr) { struct scsi_caching_page *current_cp, *saved_cp, *user_cp; struct ctl_lun *lun; int set_ua; uint32_t initidx; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); set_ua = 0; user_cp = (struct scsi_caching_page *)page_ptr; current_cp = (struct scsi_caching_page *) (page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT)); saved_cp = (struct scsi_caching_page *) (page_index->page_data + (page_index->page_len * CTL_PAGE_SAVED)); mtx_lock(&lun->lun_lock); if ((current_cp->flags1 & (SCP_WCE | SCP_RCD)) != (user_cp->flags1 & (SCP_WCE | SCP_RCD))) { current_cp->flags1 &= ~(SCP_WCE | SCP_RCD); current_cp->flags1 |= user_cp->flags1 & (SCP_WCE | SCP_RCD); saved_cp->flags1 &= ~(SCP_WCE | SCP_RCD); saved_cp->flags1 |= user_cp->flags1 & (SCP_WCE | SCP_RCD); set_ua = 1; } if (set_ua != 0) ctl_est_ua_all(lun, initidx, CTL_UA_MODE_CHANGE); mtx_unlock(&lun->lun_lock); return (0); } int ctl_debugconf_sp_select_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr) { uint8_t *c; int i; c = ((struct copan_debugconf_subpage *)page_ptr)->ctl_time_io_secs; ctl_time_io_secs = (c[0] << 8) | (c[1] << 0) | 0; CTL_DEBUG_PRINT(("set ctl_time_io_secs to %d\n", ctl_time_io_secs)); printf("set ctl_time_io_secs to %d\n", ctl_time_io_secs); printf("page data:"); for (i=0; i<8; i++) printf(" %.2x",page_ptr[i]); printf("\n"); return (0); } int ctl_debugconf_sp_sense_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, int pc) { struct copan_debugconf_subpage *page; page = (struct copan_debugconf_subpage *)page_index->page_data + (page_index->page_len * pc); switch (pc) { case SMS_PAGE_CTRL_CHANGEABLE >> 6: case SMS_PAGE_CTRL_DEFAULT >> 6: case SMS_PAGE_CTRL_SAVED >> 6: /* * We don't update the changable or default bits for this page. */ break; case SMS_PAGE_CTRL_CURRENT >> 6: page->ctl_time_io_secs[0] = ctl_time_io_secs >> 8; page->ctl_time_io_secs[1] = ctl_time_io_secs >> 0; break; default: #ifdef NEEDTOPORT EPRINT(0, "Invalid PC %d!!", pc); #endif /* NEEDTOPORT */ break; } return (0); } static int ctl_do_mode_select(union ctl_io *io) { struct scsi_mode_page_header *page_header; struct ctl_page_index *page_index; struct ctl_scsiio *ctsio; int control_dev, page_len; int page_len_offset, page_len_size; union ctl_modepage_info *modepage_info; struct ctl_lun *lun; int *len_left, *len_used; int retval, i; ctsio = &io->scsiio; page_index = NULL; page_len = 0; retval = CTL_RETVAL_COMPLETE; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; if (lun->be_lun->lun_type != T_DIRECT) control_dev = 1; else control_dev = 0; modepage_info = (union ctl_modepage_info *) ctsio->io_hdr.ctl_private[CTL_PRIV_MODEPAGE].bytes; len_left = &modepage_info->header.len_left; len_used = &modepage_info->header.len_used; do_next_page: page_header = (struct scsi_mode_page_header *) (ctsio->kern_data_ptr + *len_used); if (*len_left == 0) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } else if (*len_left < sizeof(struct scsi_mode_page_header)) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } else if ((page_header->page_code & SMPH_SPF) && (*len_left < sizeof(struct scsi_mode_page_header_sp))) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * XXX KDM should we do something with the block descriptor? */ for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { if ((control_dev != 0) && (lun->mode_pages.index[i].page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; if ((lun->mode_pages.index[i].page_code & SMPH_PC_MASK) != (page_header->page_code & SMPH_PC_MASK)) continue; /* * If neither page has a subpage code, then we've got a * match. */ if (((lun->mode_pages.index[i].page_code & SMPH_SPF) == 0) && ((page_header->page_code & SMPH_SPF) == 0)) { page_index = &lun->mode_pages.index[i]; page_len = page_header->page_length; break; } /* * If both pages have subpages, then the subpage numbers * have to match. */ if ((lun->mode_pages.index[i].page_code & SMPH_SPF) && (page_header->page_code & SMPH_SPF)) { struct scsi_mode_page_header_sp *sph; sph = (struct scsi_mode_page_header_sp *)page_header; if (lun->mode_pages.index[i].subpage == sph->subpage) { page_index = &lun->mode_pages.index[i]; page_len = scsi_2btoul(sph->page_length); break; } } } /* * If we couldn't find the page, or if we don't have a mode select * handler for it, send back an error to the user. */ if ((page_index == NULL) || (page_index->select_handler == NULL)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ *len_used, /*bit_valid*/ 0, /*bit*/ 0); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (page_index->page_code & SMPH_SPF) { page_len_offset = 2; page_len_size = 2; } else { page_len_size = 1; page_len_offset = 1; } /* * If the length the initiator gives us isn't the one we specify in * the mode page header, or if they didn't specify enough data in * the CDB to avoid truncating this page, kick out the request. */ if ((page_len != (page_index->page_len - page_len_offset - page_len_size)) || (*len_left < page_index->page_len)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ *len_used + page_len_offset, /*bit_valid*/ 0, /*bit*/ 0); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Run through the mode page, checking to make sure that the bits * the user changed are actually legal for him to change. */ for (i = 0; i < page_index->page_len; i++) { uint8_t *user_byte, *change_mask, *current_byte; int bad_bit; int j; user_byte = (uint8_t *)page_header + i; change_mask = page_index->page_data + (page_index->page_len * CTL_PAGE_CHANGEABLE) + i; current_byte = page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT) + i; /* * Check to see whether the user set any bits in this byte * that he is not allowed to set. */ if ((*user_byte & ~(*change_mask)) == (*current_byte & ~(*change_mask))) continue; /* * Go through bit by bit to determine which one is illegal. */ bad_bit = 0; for (j = 7; j >= 0; j--) { if ((((1 << i) & ~(*change_mask)) & *user_byte) != (((1 << i) & ~(*change_mask)) & *current_byte)) { bad_bit = i; break; } } ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ *len_used + i, /*bit_valid*/ 1, /*bit*/ bad_bit); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Decrement these before we call the page handler, since we may * end up getting called back one way or another before the handler * returns to this context. */ *len_left -= page_index->page_len; *len_used += page_index->page_len; retval = page_index->select_handler(ctsio, page_index, (uint8_t *)page_header); /* * If the page handler returns CTL_RETVAL_QUEUED, then we need to * wait until this queued command completes to finish processing * the mode page. If it returns anything other than * CTL_RETVAL_COMPLETE (e.g. CTL_RETVAL_ERROR), then it should have * already set the sense information, freed the data pointer, and * completed the io for us. */ if (retval != CTL_RETVAL_COMPLETE) goto bailout_no_done; /* * If the initiator sent us more than one page, parse the next one. */ if (*len_left > 0) goto do_next_page; ctl_set_success(ctsio); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); bailout_no_done: return (CTL_RETVAL_COMPLETE); } int ctl_mode_select(struct ctl_scsiio *ctsio) { int param_len, pf, sp; int header_size, bd_len; int len_left, len_used; struct ctl_page_index *page_index; struct ctl_lun *lun; int control_dev, page_len; union ctl_modepage_info *modepage_info; int retval; pf = 0; sp = 0; page_len = 0; len_used = 0; len_left = 0; retval = 0; bd_len = 0; page_index = NULL; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; if (lun->be_lun->lun_type != T_DIRECT) control_dev = 1; else control_dev = 0; switch (ctsio->cdb[0]) { case MODE_SELECT_6: { struct scsi_mode_select_6 *cdb; cdb = (struct scsi_mode_select_6 *)ctsio->cdb; pf = (cdb->byte2 & SMS_PF) ? 1 : 0; sp = (cdb->byte2 & SMS_SP) ? 1 : 0; param_len = cdb->length; header_size = sizeof(struct scsi_mode_header_6); break; } case MODE_SELECT_10: { struct scsi_mode_select_10 *cdb; cdb = (struct scsi_mode_select_10 *)ctsio->cdb; pf = (cdb->byte2 & SMS_PF) ? 1 : 0; sp = (cdb->byte2 & SMS_SP) ? 1 : 0; param_len = scsi_2btoul(cdb->length); header_size = sizeof(struct scsi_mode_header_10); break; } default: ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* * From SPC-3: * "A parameter list length of zero indicates that the Data-Out Buffer * shall be empty. This condition shall not be considered as an error." */ if (param_len == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Since we'll hit this the first time through, prior to * allocation, we don't need to free a data buffer here. */ if (param_len < header_size) { ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Allocate the data buffer and grab the user's data. In theory, * we shouldn't have to sanity check the parameter list length here * because the maximum size is 64K. We should be able to malloc * that much without too many problems. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(param_len, M_CTL, M_WAITOK); ctsio->kern_data_len = param_len; ctsio->kern_total_len = param_len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } switch (ctsio->cdb[0]) { case MODE_SELECT_6: { struct scsi_mode_header_6 *mh6; mh6 = (struct scsi_mode_header_6 *)ctsio->kern_data_ptr; bd_len = mh6->blk_desc_len; break; } case MODE_SELECT_10: { struct scsi_mode_header_10 *mh10; mh10 = (struct scsi_mode_header_10 *)ctsio->kern_data_ptr; bd_len = scsi_2btoul(mh10->blk_desc_len); break; } default: panic("Invalid CDB type %#x", ctsio->cdb[0]); break; } if (param_len < (header_size + bd_len)) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Set the IO_CONT flag, so that if this I/O gets passed to * ctl_config_write_done(), it'll get passed back to * ctl_do_mode_select() for further processing, or completion if * we're all done. */ ctsio->io_hdr.flags |= CTL_FLAG_IO_CONT; ctsio->io_cont = ctl_do_mode_select; modepage_info = (union ctl_modepage_info *) ctsio->io_hdr.ctl_private[CTL_PRIV_MODEPAGE].bytes; memset(modepage_info, 0, sizeof(*modepage_info)); len_left = param_len - header_size - bd_len; len_used = header_size + bd_len; modepage_info->header.len_left = len_left; modepage_info->header.len_used = len_used; return (ctl_do_mode_select((union ctl_io *)ctsio)); } int ctl_mode_sense(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; int pc, page_code, dbd, llba, subpage; int alloc_len, page_len, header_len, total_len; struct scsi_mode_block_descr *block_desc; struct ctl_page_index *page_index; int control_dev; dbd = 0; llba = 0; block_desc = NULL; page_index = NULL; CTL_DEBUG_PRINT(("ctl_mode_sense\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; if (lun->be_lun->lun_type != T_DIRECT) control_dev = 1; else control_dev = 0; switch (ctsio->cdb[0]) { case MODE_SENSE_6: { struct scsi_mode_sense_6 *cdb; cdb = (struct scsi_mode_sense_6 *)ctsio->cdb; header_len = sizeof(struct scsi_mode_hdr_6); if (cdb->byte2 & SMS_DBD) dbd = 1; else header_len += sizeof(struct scsi_mode_block_descr); pc = (cdb->page & SMS_PAGE_CTRL_MASK) >> 6; page_code = cdb->page & SMS_PAGE_CODE; subpage = cdb->subpage; alloc_len = cdb->length; break; } case MODE_SENSE_10: { struct scsi_mode_sense_10 *cdb; cdb = (struct scsi_mode_sense_10 *)ctsio->cdb; header_len = sizeof(struct scsi_mode_hdr_10); if (cdb->byte2 & SMS_DBD) dbd = 1; else header_len += sizeof(struct scsi_mode_block_descr); if (cdb->byte2 & SMS10_LLBAA) llba = 1; pc = (cdb->page & SMS_PAGE_CTRL_MASK) >> 6; page_code = cdb->page & SMS_PAGE_CODE; subpage = cdb->subpage; alloc_len = scsi_2btoul(cdb->length); break; } default: ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* * We have to make a first pass through to calculate the size of * the pages that match the user's query. Then we allocate enough * memory to hold it, and actually copy the data into the buffer. */ switch (page_code) { case SMS_ALL_PAGES_PAGE: { int i; page_len = 0; /* * At the moment, values other than 0 and 0xff here are * reserved according to SPC-3. */ if ((subpage != SMS_SUBPAGE_PAGE_0) && (subpage != SMS_SUBPAGE_ALL)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 3, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { if ((control_dev != 0) && (lun->mode_pages.index[i].page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; /* * We don't use this subpage if the user didn't * request all subpages. */ if ((lun->mode_pages.index[i].subpage != 0) && (subpage == SMS_SUBPAGE_PAGE_0)) continue; #if 0 printf("found page %#x len %d\n", lun->mode_pages.index[i].page_code & SMPH_PC_MASK, lun->mode_pages.index[i].page_len); #endif page_len += lun->mode_pages.index[i].page_len; } break; } default: { int i; page_len = 0; for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { /* Look for the right page code */ if ((lun->mode_pages.index[i].page_code & SMPH_PC_MASK) != page_code) continue; /* Look for the right subpage or the subpage wildcard*/ if ((lun->mode_pages.index[i].subpage != subpage) && (subpage != SMS_SUBPAGE_ALL)) continue; /* Make sure the page is supported for this dev type */ if ((control_dev != 0) && (lun->mode_pages.index[i].page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; #if 0 printf("found page %#x len %d\n", lun->mode_pages.index[i].page_code & SMPH_PC_MASK, lun->mode_pages.index[i].page_len); #endif page_len += lun->mode_pages.index[i].page_len; } if (page_len == 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 5); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } break; } } total_len = header_len + page_len; #if 0 printf("header_len = %d, page_len = %d, total_len = %d\n", header_len, page_len, total_len); #endif ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } switch (ctsio->cdb[0]) { case MODE_SENSE_6: { struct scsi_mode_hdr_6 *header; header = (struct scsi_mode_hdr_6 *)ctsio->kern_data_ptr; header->datalen = MIN(total_len - 1, 254); if (control_dev == 0) { header->dev_specific = 0x10; /* DPOFUA */ if ((lun->be_lun->flags & CTL_LUN_FLAG_READONLY) || (lun->mode_pages.control_page[CTL_PAGE_CURRENT] .eca_and_aen & SCP_SWP) != 0) header->dev_specific |= 0x80; /* WP */ } if (dbd) header->block_descr_len = 0; else header->block_descr_len = sizeof(struct scsi_mode_block_descr); block_desc = (struct scsi_mode_block_descr *)&header[1]; break; } case MODE_SENSE_10: { struct scsi_mode_hdr_10 *header; int datalen; header = (struct scsi_mode_hdr_10 *)ctsio->kern_data_ptr; datalen = MIN(total_len - 2, 65533); scsi_ulto2b(datalen, header->datalen); if (control_dev == 0) { header->dev_specific = 0x10; /* DPOFUA */ if ((lun->be_lun->flags & CTL_LUN_FLAG_READONLY) || (lun->mode_pages.control_page[CTL_PAGE_CURRENT] .eca_and_aen & SCP_SWP) != 0) header->dev_specific |= 0x80; /* WP */ } if (dbd) scsi_ulto2b(0, header->block_descr_len); else scsi_ulto2b(sizeof(struct scsi_mode_block_descr), header->block_descr_len); block_desc = (struct scsi_mode_block_descr *)&header[1]; break; } default: panic("invalid CDB type %#x", ctsio->cdb[0]); break; /* NOTREACHED */ } /* * If we've got a disk, use its blocksize in the block * descriptor. Otherwise, just set it to 0. */ if (dbd == 0) { if (control_dev == 0) scsi_ulto3b(lun->be_lun->blocksize, block_desc->block_len); else scsi_ulto3b(0, block_desc->block_len); } switch (page_code) { case SMS_ALL_PAGES_PAGE: { int i, data_used; data_used = header_len; for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { struct ctl_page_index *page_index; page_index = &lun->mode_pages.index[i]; if ((control_dev != 0) && (page_index->page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; /* * We don't use this subpage if the user didn't * request all subpages. We already checked (above) * to make sure the user only specified a subpage * of 0 or 0xff in the SMS_ALL_PAGES_PAGE case. */ if ((page_index->subpage != 0) && (subpage == SMS_SUBPAGE_PAGE_0)) continue; /* * Call the handler, if it exists, to update the * page to the latest values. */ if (page_index->sense_handler != NULL) page_index->sense_handler(ctsio, page_index,pc); memcpy(ctsio->kern_data_ptr + data_used, page_index->page_data + (page_index->page_len * pc), page_index->page_len); data_used += page_index->page_len; } break; } default: { int i, data_used; data_used = header_len; for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { struct ctl_page_index *page_index; page_index = &lun->mode_pages.index[i]; /* Look for the right page code */ if ((page_index->page_code & SMPH_PC_MASK) != page_code) continue; /* Look for the right subpage or the subpage wildcard*/ if ((page_index->subpage != subpage) && (subpage != SMS_SUBPAGE_ALL)) continue; /* Make sure the page is supported for this dev type */ if ((control_dev != 0) && (page_index->page_flags & CTL_PAGE_FLAG_DISK_ONLY)) continue; /* * Call the handler, if it exists, to update the * page to the latest values. */ if (page_index->sense_handler != NULL) page_index->sense_handler(ctsio, page_index,pc); memcpy(ctsio->kern_data_ptr + data_used, page_index->page_data + (page_index->page_len * pc), page_index->page_len); data_used += page_index->page_len; } break; } } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_lbp_log_sense_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, int pc) { struct ctl_lun *lun; struct scsi_log_param_header *phdr; uint8_t *data; uint64_t val; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; data = page_index->page_data; if (lun->backend->lun_attr != NULL && (val = lun->backend->lun_attr(lun->be_lun->be_lun, "blocksavail")) != UINT64_MAX) { phdr = (struct scsi_log_param_header *)data; scsi_ulto2b(0x0001, phdr->param_code); phdr->param_control = SLP_LBIN | SLP_LP; phdr->param_len = 8; data = (uint8_t *)(phdr + 1); scsi_ulto4b(val >> CTL_LBP_EXPONENT, data); data[4] = 0x02; /* per-pool */ data += phdr->param_len; } if (lun->backend->lun_attr != NULL && (val = lun->backend->lun_attr(lun->be_lun->be_lun, "blocksused")) != UINT64_MAX) { phdr = (struct scsi_log_param_header *)data; scsi_ulto2b(0x0002, phdr->param_code); phdr->param_control = SLP_LBIN | SLP_LP; phdr->param_len = 8; data = (uint8_t *)(phdr + 1); scsi_ulto4b(val >> CTL_LBP_EXPONENT, data); data[4] = 0x01; /* per-LUN */ data += phdr->param_len; } if (lun->backend->lun_attr != NULL && (val = lun->backend->lun_attr(lun->be_lun->be_lun, "poolblocksavail")) != UINT64_MAX) { phdr = (struct scsi_log_param_header *)data; scsi_ulto2b(0x00f1, phdr->param_code); phdr->param_control = SLP_LBIN | SLP_LP; phdr->param_len = 8; data = (uint8_t *)(phdr + 1); scsi_ulto4b(val >> CTL_LBP_EXPONENT, data); data[4] = 0x02; /* per-pool */ data += phdr->param_len; } if (lun->backend->lun_attr != NULL && (val = lun->backend->lun_attr(lun->be_lun->be_lun, "poolblocksused")) != UINT64_MAX) { phdr = (struct scsi_log_param_header *)data; scsi_ulto2b(0x00f2, phdr->param_code); phdr->param_control = SLP_LBIN | SLP_LP; phdr->param_len = 8; data = (uint8_t *)(phdr + 1); scsi_ulto4b(val >> CTL_LBP_EXPONENT, data); data[4] = 0x02; /* per-pool */ data += phdr->param_len; } page_index->page_len = data - page_index->page_data; return (0); } int ctl_sap_log_sense_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, int pc) { struct ctl_lun *lun; struct stat_page *data; uint64_t rn, wn, rb, wb; struct bintime rt, wt; int i; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; data = (struct stat_page *)page_index->page_data; scsi_ulto2b(SLP_SAP, data->sap.hdr.param_code); data->sap.hdr.param_control = SLP_LBIN; data->sap.hdr.param_len = sizeof(struct scsi_log_stat_and_perf) - sizeof(struct scsi_log_param_header); rn = wn = rb = wb = 0; bintime_clear(&rt); bintime_clear(&wt); for (i = 0; i < CTL_MAX_PORTS; i++) { rn += lun->stats.ports[i].operations[CTL_STATS_READ]; wn += lun->stats.ports[i].operations[CTL_STATS_WRITE]; rb += lun->stats.ports[i].bytes[CTL_STATS_READ]; wb += lun->stats.ports[i].bytes[CTL_STATS_WRITE]; bintime_add(&rt, &lun->stats.ports[i].time[CTL_STATS_READ]); bintime_add(&wt, &lun->stats.ports[i].time[CTL_STATS_WRITE]); } scsi_u64to8b(rn, data->sap.read_num); scsi_u64to8b(wn, data->sap.write_num); if (lun->stats.blocksize > 0) { scsi_u64to8b(wb / lun->stats.blocksize, data->sap.recvieved_lba); scsi_u64to8b(rb / lun->stats.blocksize, data->sap.transmitted_lba); } scsi_u64to8b((uint64_t)rt.sec * 1000 + rt.frac / (UINT64_MAX / 1000), data->sap.read_int); scsi_u64to8b((uint64_t)wt.sec * 1000 + wt.frac / (UINT64_MAX / 1000), data->sap.write_int); scsi_u64to8b(0, data->sap.weighted_num); scsi_u64to8b(0, data->sap.weighted_int); scsi_ulto2b(SLP_IT, data->it.hdr.param_code); data->it.hdr.param_control = SLP_LBIN; data->it.hdr.param_len = sizeof(struct scsi_log_idle_time) - sizeof(struct scsi_log_param_header); #ifdef CTL_TIME_IO scsi_u64to8b(lun->idle_time / SBT_1MS, data->it.idle_int); #endif scsi_ulto2b(SLP_TI, data->ti.hdr.param_code); data->it.hdr.param_control = SLP_LBIN; data->ti.hdr.param_len = sizeof(struct scsi_log_time_interval) - sizeof(struct scsi_log_param_header); scsi_ulto4b(3, data->ti.exponent); scsi_ulto4b(1, data->ti.integer); page_index->page_len = sizeof(*data); return (0); } int ctl_log_sense(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; int i, pc, page_code, subpage; int alloc_len, total_len; struct ctl_page_index *page_index; struct scsi_log_sense *cdb; struct scsi_log_header *header; CTL_DEBUG_PRINT(("ctl_log_sense\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_log_sense *)ctsio->cdb; pc = (cdb->page & SLS_PAGE_CTRL_MASK) >> 6; page_code = cdb->page & SLS_PAGE_CODE; subpage = cdb->subpage; alloc_len = scsi_2btoul(cdb->length); page_index = NULL; for (i = 0; i < CTL_NUM_LOG_PAGES; i++) { page_index = &lun->log_pages.index[i]; /* Look for the right page code */ if ((page_index->page_code & SL_PAGE_CODE) != page_code) continue; /* Look for the right subpage or the subpage wildcard*/ if (page_index->subpage != subpage) continue; break; } if (i >= CTL_NUM_LOG_PAGES) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } total_len = sizeof(struct scsi_log_header) + page_index->page_len; ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } header = (struct scsi_log_header *)ctsio->kern_data_ptr; header->page = page_index->page_code; if (page_index->subpage) { header->page |= SL_SPF; header->subpage = page_index->subpage; } scsi_ulto2b(page_index->page_len, header->datalen); /* * Call the handler, if it exists, to update the * page to the latest values. */ if (page_index->sense_handler != NULL) page_index->sense_handler(ctsio, page_index, pc); memcpy(header + 1, page_index->page_data, page_index->page_len); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_read_capacity(struct ctl_scsiio *ctsio) { struct scsi_read_capacity *cdb; struct scsi_read_capacity_data *data; struct ctl_lun *lun; uint32_t lba; CTL_DEBUG_PRINT(("ctl_read_capacity\n")); cdb = (struct scsi_read_capacity *)ctsio->cdb; lba = scsi_4btoul(cdb->addr); if (((cdb->pmi & SRC_PMI) == 0) && (lba != 0)) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctsio->kern_data_ptr = malloc(sizeof(*data), M_CTL, M_WAITOK | M_ZERO); data = (struct scsi_read_capacity_data *)ctsio->kern_data_ptr; ctsio->residual = 0; ctsio->kern_data_len = sizeof(*data); ctsio->kern_total_len = sizeof(*data); ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * If the maximum LBA is greater than 0xfffffffe, the user must * issue a SERVICE ACTION IN (16) command, with the read capacity * serivce action set. */ if (lun->be_lun->maxlba > 0xfffffffe) scsi_ulto4b(0xffffffff, data->addr); else scsi_ulto4b(lun->be_lun->maxlba, data->addr); /* * XXX KDM this may not be 512 bytes... */ scsi_ulto4b(lun->be_lun->blocksize, data->length); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_read_capacity_16(struct ctl_scsiio *ctsio) { struct scsi_read_capacity_16 *cdb; struct scsi_read_capacity_data_long *data; struct ctl_lun *lun; uint64_t lba; uint32_t alloc_len; CTL_DEBUG_PRINT(("ctl_read_capacity_16\n")); cdb = (struct scsi_read_capacity_16 *)ctsio->cdb; alloc_len = scsi_4btoul(cdb->alloc_len); lba = scsi_8btou64(cdb->addr); if ((cdb->reladr & SRC16_PMI) && (lba != 0)) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctsio->kern_data_ptr = malloc(sizeof(*data), M_CTL, M_WAITOK | M_ZERO); data = (struct scsi_read_capacity_data_long *)ctsio->kern_data_ptr; if (sizeof(*data) < alloc_len) { ctsio->residual = alloc_len - sizeof(*data); ctsio->kern_data_len = sizeof(*data); ctsio->kern_total_len = sizeof(*data); } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; scsi_u64to8b(lun->be_lun->maxlba, data->addr); /* XXX KDM this may not be 512 bytes... */ scsi_ulto4b(lun->be_lun->blocksize, data->length); data->prot_lbppbe = lun->be_lun->pblockexp & SRC16_LBPPBE; scsi_ulto2b(lun->be_lun->pblockoff & SRC16_LALBA_A, data->lalba_lbp); if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) data->lalba_lbp[0] |= SRC16_LBPME | SRC16_LBPRZ; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_get_lba_status(struct ctl_scsiio *ctsio) { struct scsi_get_lba_status *cdb; struct scsi_get_lba_status_data *data; struct ctl_lun *lun; struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t alloc_len, total_len; int retval; CTL_DEBUG_PRINT(("ctl_get_lba_status\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_get_lba_status *)ctsio->cdb; lba = scsi_8btou64(cdb->addr); alloc_len = scsi_4btoul(cdb->alloc_len); if (lba > lun->be_lun->maxlba) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } total_len = sizeof(*data) + sizeof(data->descr[0]); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); data = (struct scsi_get_lba_status_data *)ctsio->kern_data_ptr; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* Fill dummy data in case backend can't tell anything. */ scsi_ulto4b(4 + sizeof(data->descr[0]), data->length); scsi_u64to8b(lba, data->descr[0].addr); scsi_ulto4b(MIN(UINT32_MAX, lun->be_lun->maxlba + 1 - lba), data->descr[0].length); data->descr[0].status = 0; /* Mapped or unknown. */ ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; lbalen = (struct ctl_lba_len_flags *)&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = total_len; lbalen->flags = 0; retval = lun->backend->config_read((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_read_defect(struct ctl_scsiio *ctsio) { struct scsi_read_defect_data_10 *ccb10; struct scsi_read_defect_data_12 *ccb12; struct scsi_read_defect_data_hdr_10 *data10; struct scsi_read_defect_data_hdr_12 *data12; uint32_t alloc_len, data_len; uint8_t format; CTL_DEBUG_PRINT(("ctl_read_defect\n")); if (ctsio->cdb[0] == READ_DEFECT_DATA_10) { ccb10 = (struct scsi_read_defect_data_10 *)&ctsio->cdb; format = ccb10->format; alloc_len = scsi_2btoul(ccb10->alloc_length); data_len = sizeof(*data10); } else { ccb12 = (struct scsi_read_defect_data_12 *)&ctsio->cdb; format = ccb12->format; alloc_len = scsi_4btoul(ccb12->alloc_length); data_len = sizeof(*data12); } if (alloc_len == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); if (data_len < alloc_len) { ctsio->residual = alloc_len - data_len; ctsio->kern_data_len = data_len; ctsio->kern_total_len = data_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; if (ctsio->cdb[0] == READ_DEFECT_DATA_10) { data10 = (struct scsi_read_defect_data_hdr_10 *) ctsio->kern_data_ptr; data10->format = format; scsi_ulto2b(0, data10->length); } else { data12 = (struct scsi_read_defect_data_hdr_12 *) ctsio->kern_data_ptr; data12->format = format; scsi_ulto2b(0, data12->generation); scsi_ulto4b(0, data12->length); } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_report_tagret_port_groups(struct ctl_scsiio *ctsio) { struct scsi_maintenance_in *cdb; int retval; int alloc_len, ext, total_len = 0, g, pc, pg, gs, os; int num_target_port_groups, num_target_ports; struct ctl_lun *lun; struct ctl_softc *softc; struct ctl_port *port; struct scsi_target_group_data *rtg_ptr; struct scsi_target_group_data_extended *rtg_ext_ptr; struct scsi_target_port_group_descriptor *tpg_desc; CTL_DEBUG_PRINT(("ctl_report_tagret_port_groups\n")); cdb = (struct scsi_maintenance_in *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; softc = lun->ctl_softc; retval = CTL_RETVAL_COMPLETE; switch (cdb->byte2 & STG_PDF_MASK) { case STG_PDF_LENGTH: ext = 0; break; case STG_PDF_EXTENDED: ext = 1; break; default: ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 5); ctl_done((union ctl_io *)ctsio); return(retval); } if (softc->is_single) num_target_port_groups = 1; else num_target_port_groups = NUM_TARGET_PORT_GROUPS; num_target_ports = 0; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (ctl_lun_map_to_port(port, lun->lun) >= CTL_MAX_LUNS) continue; num_target_ports++; } mtx_unlock(&softc->ctl_lock); if (ext) total_len = sizeof(struct scsi_target_group_data_extended); else total_len = sizeof(struct scsi_target_group_data); total_len += sizeof(struct scsi_target_port_group_descriptor) * num_target_port_groups + sizeof(struct scsi_target_port_descriptor) * num_target_ports; alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; if (ext) { rtg_ext_ptr = (struct scsi_target_group_data_extended *) ctsio->kern_data_ptr; scsi_ulto4b(total_len - 4, rtg_ext_ptr->length); rtg_ext_ptr->format_type = 0x10; rtg_ext_ptr->implicit_transition_time = 0; tpg_desc = &rtg_ext_ptr->groups[0]; } else { rtg_ptr = (struct scsi_target_group_data *) ctsio->kern_data_ptr; scsi_ulto4b(total_len - 4, rtg_ptr->length); tpg_desc = &rtg_ptr->groups[0]; } mtx_lock(&softc->ctl_lock); pg = softc->port_min / softc->port_cnt; if (softc->ha_link == CTL_HA_LINK_OFFLINE) gs = TPG_ASYMMETRIC_ACCESS_UNAVAILABLE; else if (softc->ha_link == CTL_HA_LINK_UNKNOWN) gs = TPG_ASYMMETRIC_ACCESS_TRANSITIONING; else if (softc->ha_mode == CTL_HA_MODE_ACT_STBY) gs = TPG_ASYMMETRIC_ACCESS_STANDBY; else gs = TPG_ASYMMETRIC_ACCESS_NONOPTIMIZED; if (lun->flags & CTL_LUN_PRIMARY_SC) { os = gs; gs = TPG_ASYMMETRIC_ACCESS_OPTIMIZED; } else os = TPG_ASYMMETRIC_ACCESS_OPTIMIZED; for (g = 0; g < num_target_port_groups; g++) { tpg_desc->pref_state = (g == pg) ? gs : os; tpg_desc->support = TPG_AO_SUP | TPG_AN_SUP | TPG_S_SUP | TPG_U_SUP | TPG_T_SUP; scsi_ulto2b(g + 1, tpg_desc->target_port_group); tpg_desc->status = TPG_IMPLICIT; pc = 0; STAILQ_FOREACH(port, &softc->port_list, links) { if (port->targ_port < g * softc->port_cnt || port->targ_port >= (g + 1) * softc->port_cnt) continue; if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (ctl_lun_map_to_port(port, lun->lun) >= CTL_MAX_LUNS) continue; scsi_ulto2b(port->targ_port, tpg_desc->descriptors[pc]. relative_target_port_identifier); pc++; } tpg_desc->target_port_count = pc; tpg_desc = (struct scsi_target_port_group_descriptor *) &tpg_desc->descriptors[pc]; } mtx_unlock(&softc->ctl_lock); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return(retval); } int ctl_report_supported_opcodes(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct scsi_report_supported_opcodes *cdb; const struct ctl_cmd_entry *entry, *sentry; struct scsi_report_supported_opcodes_all *all; struct scsi_report_supported_opcodes_descr *descr; struct scsi_report_supported_opcodes_one *one; int retval; int alloc_len, total_len; int opcode, service_action, i, j, num; CTL_DEBUG_PRINT(("ctl_report_supported_opcodes\n")); cdb = (struct scsi_report_supported_opcodes *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; retval = CTL_RETVAL_COMPLETE; opcode = cdb->requested_opcode; service_action = scsi_2btoul(cdb->requested_service_action); switch (cdb->options & RSO_OPTIONS_MASK) { case RSO_OPTIONS_ALL: num = 0; for (i = 0; i < 256; i++) { entry = &ctl_cmd_table[i]; if (entry->flags & CTL_CMD_FLAG_SA5) { for (j = 0; j < 32; j++) { sentry = &((const struct ctl_cmd_entry *) entry->execute)[j]; if (ctl_cmd_applicable( lun->be_lun->lun_type, sentry)) num++; } } else { if (ctl_cmd_applicable(lun->be_lun->lun_type, entry)) num++; } } total_len = sizeof(struct scsi_report_supported_opcodes_all) + num * sizeof(struct scsi_report_supported_opcodes_descr); break; case RSO_OPTIONS_OC: if (ctl_cmd_table[opcode].flags & CTL_CMD_FLAG_SA5) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 2); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } total_len = sizeof(struct scsi_report_supported_opcodes_one) + 32; break; case RSO_OPTIONS_OC_SA: if ((ctl_cmd_table[opcode].flags & CTL_CMD_FLAG_SA5) == 0 || service_action >= 32) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 2); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } total_len = sizeof(struct scsi_report_supported_opcodes_one) + 32; break; default: ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 2); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; switch (cdb->options & RSO_OPTIONS_MASK) { case RSO_OPTIONS_ALL: all = (struct scsi_report_supported_opcodes_all *) ctsio->kern_data_ptr; num = 0; for (i = 0; i < 256; i++) { entry = &ctl_cmd_table[i]; if (entry->flags & CTL_CMD_FLAG_SA5) { for (j = 0; j < 32; j++) { sentry = &((const struct ctl_cmd_entry *) entry->execute)[j]; if (!ctl_cmd_applicable( lun->be_lun->lun_type, sentry)) continue; descr = &all->descr[num++]; descr->opcode = i; scsi_ulto2b(j, descr->service_action); descr->flags = RSO_SERVACTV; scsi_ulto2b(sentry->length, descr->cdb_length); } } else { if (!ctl_cmd_applicable(lun->be_lun->lun_type, entry)) continue; descr = &all->descr[num++]; descr->opcode = i; scsi_ulto2b(0, descr->service_action); descr->flags = 0; scsi_ulto2b(entry->length, descr->cdb_length); } } scsi_ulto4b( num * sizeof(struct scsi_report_supported_opcodes_descr), all->length); break; case RSO_OPTIONS_OC: one = (struct scsi_report_supported_opcodes_one *) ctsio->kern_data_ptr; entry = &ctl_cmd_table[opcode]; goto fill_one; case RSO_OPTIONS_OC_SA: one = (struct scsi_report_supported_opcodes_one *) ctsio->kern_data_ptr; entry = &ctl_cmd_table[opcode]; entry = &((const struct ctl_cmd_entry *) entry->execute)[service_action]; fill_one: if (ctl_cmd_applicable(lun->be_lun->lun_type, entry)) { one->support = 3; scsi_ulto2b(entry->length, one->cdb_length); one->cdb_usage[0] = opcode; memcpy(&one->cdb_usage[1], entry->usage, entry->length - 1); } else one->support = 1; break; } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return(retval); } int ctl_report_supported_tmf(struct ctl_scsiio *ctsio) { struct scsi_report_supported_tmf *cdb; struct scsi_report_supported_tmf_data *data; int retval; int alloc_len, total_len; CTL_DEBUG_PRINT(("ctl_report_supported_tmf\n")); cdb = (struct scsi_report_supported_tmf *)ctsio->cdb; retval = CTL_RETVAL_COMPLETE; total_len = sizeof(struct scsi_report_supported_tmf_data); alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; data = (struct scsi_report_supported_tmf_data *)ctsio->kern_data_ptr; data->byte1 |= RST_ATS | RST_ATSS | RST_CTSS | RST_LURS | RST_TRS; data->byte2 |= RST_ITNRS; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_report_timestamp(struct ctl_scsiio *ctsio) { struct scsi_report_timestamp *cdb; struct scsi_report_timestamp_data *data; struct timeval tv; int64_t timestamp; int retval; int alloc_len, total_len; CTL_DEBUG_PRINT(("ctl_report_timestamp\n")); cdb = (struct scsi_report_timestamp *)ctsio->cdb; retval = CTL_RETVAL_COMPLETE; total_len = sizeof(struct scsi_report_timestamp_data); alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; data = (struct scsi_report_timestamp_data *)ctsio->kern_data_ptr; scsi_ulto2b(sizeof(*data) - 2, data->length); data->origin = RTS_ORIG_OUTSIDE; getmicrotime(&tv); timestamp = (int64_t)tv.tv_sec * 1000 + tv.tv_usec / 1000; scsi_ulto4b(timestamp >> 16, data->timestamp); scsi_ulto2b(timestamp & 0xffff, &data->timestamp[4]); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_persistent_reserve_in(struct ctl_scsiio *ctsio) { struct scsi_per_res_in *cdb; int alloc_len, total_len = 0; /* struct scsi_per_res_in_rsrv in_data; */ struct ctl_lun *lun; struct ctl_softc *softc; uint64_t key; CTL_DEBUG_PRINT(("ctl_persistent_reserve_in\n")); cdb = (struct scsi_per_res_in *)ctsio->cdb; alloc_len = scsi_2btoul(cdb->length); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; softc = lun->ctl_softc; retry: mtx_lock(&lun->lun_lock); switch (cdb->action) { case SPRI_RK: /* read keys */ total_len = sizeof(struct scsi_per_res_in_keys) + lun->pr_key_count * sizeof(struct scsi_per_res_key); break; case SPRI_RR: /* read reservation */ if (lun->flags & CTL_LUN_PR_RESERVED) total_len = sizeof(struct scsi_per_res_in_rsrv); else total_len = sizeof(struct scsi_per_res_in_header); break; case SPRI_RC: /* report capabilities */ total_len = sizeof(struct scsi_per_res_cap); break; case SPRI_RS: /* read full status */ total_len = sizeof(struct scsi_per_res_in_header) + (sizeof(struct scsi_per_res_in_full_desc) + 256) * lun->pr_key_count; break; default: panic("Invalid PR type %x", cdb->action); } mtx_unlock(&lun->lun_lock); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; mtx_lock(&lun->lun_lock); switch (cdb->action) { case SPRI_RK: { // read keys struct scsi_per_res_in_keys *res_keys; int i, key_count; res_keys = (struct scsi_per_res_in_keys*)ctsio->kern_data_ptr; /* * We had to drop the lock to allocate our buffer, which * leaves time for someone to come in with another * persistent reservation. (That is unlikely, though, * since this should be the only persistent reservation * command active right now.) */ if (total_len != (sizeof(struct scsi_per_res_in_keys) + (lun->pr_key_count * sizeof(struct scsi_per_res_key)))){ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); printf("%s: reservation length changed, retrying\n", __func__); goto retry; } scsi_ulto4b(lun->PRGeneration, res_keys->header.generation); scsi_ulto4b(sizeof(struct scsi_per_res_key) * lun->pr_key_count, res_keys->header.length); for (i = 0, key_count = 0; i < CTL_MAX_INITIATORS; i++) { if ((key = ctl_get_prkey(lun, i)) == 0) continue; /* * We used lun->pr_key_count to calculate the * size to allocate. If it turns out the number of * initiators with the registered flag set is * larger than that (i.e. they haven't been kept in * sync), we've got a problem. */ if (key_count >= lun->pr_key_count) { #ifdef NEEDTOPORT csevent_log(CSC_CTL | CSC_SHELF_SW | CTL_PR_ERROR, csevent_LogType_Fault, csevent_AlertLevel_Yellow, csevent_FRU_ShelfController, csevent_FRU_Firmware, csevent_FRU_Unknown, "registered keys %d >= key " "count %d", key_count, lun->pr_key_count); #endif key_count++; continue; } scsi_u64to8b(key, res_keys->keys[key_count].key); key_count++; } break; } case SPRI_RR: { // read reservation struct scsi_per_res_in_rsrv *res; int tmp_len, header_only; res = (struct scsi_per_res_in_rsrv *)ctsio->kern_data_ptr; scsi_ulto4b(lun->PRGeneration, res->header.generation); if (lun->flags & CTL_LUN_PR_RESERVED) { tmp_len = sizeof(struct scsi_per_res_in_rsrv); scsi_ulto4b(sizeof(struct scsi_per_res_in_rsrv_data), res->header.length); header_only = 0; } else { tmp_len = sizeof(struct scsi_per_res_in_header); scsi_ulto4b(0, res->header.length); header_only = 1; } /* * We had to drop the lock to allocate our buffer, which * leaves time for someone to come in with another * persistent reservation. (That is unlikely, though, * since this should be the only persistent reservation * command active right now.) */ if (tmp_len != total_len) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); printf("%s: reservation status changed, retrying\n", __func__); goto retry; } /* * No reservation held, so we're done. */ if (header_only != 0) break; /* * If the registration is an All Registrants type, the key * is 0, since it doesn't really matter. */ if (lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) { scsi_u64to8b(ctl_get_prkey(lun, lun->pr_res_idx), res->data.reservation); } res->data.scopetype = lun->res_type; break; } case SPRI_RC: //report capabilities { struct scsi_per_res_cap *res_cap; uint16_t type_mask; res_cap = (struct scsi_per_res_cap *)ctsio->kern_data_ptr; scsi_ulto2b(sizeof(*res_cap), res_cap->length); res_cap->flags2 |= SPRI_TMV | SPRI_ALLOW_5; type_mask = SPRI_TM_WR_EX_AR | SPRI_TM_EX_AC_RO | SPRI_TM_WR_EX_RO | SPRI_TM_EX_AC | SPRI_TM_WR_EX | SPRI_TM_EX_AC_AR; scsi_ulto2b(type_mask, res_cap->type_mask); break; } case SPRI_RS: { // read full status struct scsi_per_res_in_full *res_status; struct scsi_per_res_in_full_desc *res_desc; struct ctl_port *port; int i, len; res_status = (struct scsi_per_res_in_full*)ctsio->kern_data_ptr; /* * We had to drop the lock to allocate our buffer, which * leaves time for someone to come in with another * persistent reservation. (That is unlikely, though, * since this should be the only persistent reservation * command active right now.) */ if (total_len < (sizeof(struct scsi_per_res_in_header) + (sizeof(struct scsi_per_res_in_full_desc) + 256) * lun->pr_key_count)){ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); printf("%s: reservation length changed, retrying\n", __func__); goto retry; } scsi_ulto4b(lun->PRGeneration, res_status->header.generation); res_desc = &res_status->desc[0]; for (i = 0; i < CTL_MAX_INITIATORS; i++) { if ((key = ctl_get_prkey(lun, i)) == 0) continue; scsi_u64to8b(key, res_desc->res_key.key); if ((lun->flags & CTL_LUN_PR_RESERVED) && (lun->pr_res_idx == i || lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS)) { res_desc->flags = SPRI_FULL_R_HOLDER; res_desc->scopetype = lun->res_type; } scsi_ulto2b(i / CTL_MAX_INIT_PER_PORT, res_desc->rel_trgt_port_id); len = 0; port = softc->ctl_ports[i / CTL_MAX_INIT_PER_PORT]; if (port != NULL) len = ctl_create_iid(port, i % CTL_MAX_INIT_PER_PORT, res_desc->transport_id); scsi_ulto4b(len, res_desc->additional_length); res_desc = (struct scsi_per_res_in_full_desc *) &res_desc->transport_id[len]; } scsi_ulto4b((uint8_t *)res_desc - (uint8_t *)&res_status->desc[0], res_status->header.length); break; } default: /* * This is a bug, because we just checked for this above, * and should have returned an error. */ panic("Invalid PR type %x", cdb->action); break; /* NOTREACHED */ } mtx_unlock(&lun->lun_lock); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Returns 0 if ctl_persistent_reserve_out() should continue, non-zero if * it should return. */ static int ctl_pro_preempt(struct ctl_softc *softc, struct ctl_lun *lun, uint64_t res_key, uint64_t sa_res_key, uint8_t type, uint32_t residx, struct ctl_scsiio *ctsio, struct scsi_per_res_out *cdb, struct scsi_per_res_out_parms* param) { union ctl_ha_msg persis_io; int i; mtx_lock(&lun->lun_lock); if (sa_res_key == 0) { if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) { /* validate scope and type */ if ((cdb->scope_type & SPR_SCOPE_MASK) != SPR_LU_SCOPE) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (1); } if (type>8 || type==2 || type==4 || type==0) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (1); } /* * Unregister everybody else and build UA for * them */ for(i = 0; i < CTL_MAX_INITIATORS; i++) { if (i == residx || ctl_get_prkey(lun, i) == 0) continue; ctl_clr_prkey(lun, i); ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } lun->pr_key_count = 1; lun->res_type = type; if (lun->res_type != SPR_TYPE_WR_EX_AR && lun->res_type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = residx; lun->PRGeneration++; mtx_unlock(&lun->lun_lock); /* send msg to other side */ persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_PREEMPT; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } else { /* not all registrants */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 8, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (1); } } else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS || !(lun->flags & CTL_LUN_PR_RESERVED)) { int found = 0; if (res_key == sa_res_key) { /* special case */ /* * The spec implies this is not good but doesn't * say what to do. There are two choices either * generate a res conflict or check condition * with illegal field in parameter data. Since * that is what is done when the sa_res_key is * zero I'll take that approach since this has * to do with the sa_res_key. */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 8, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (1); } for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (ctl_get_prkey(lun, i) != sa_res_key) continue; found = 1; ctl_clr_prkey(lun, i); lun->pr_key_count--; ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } if (!found) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } lun->PRGeneration++; mtx_unlock(&lun->lun_lock); /* send msg to other side */ persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_PREEMPT; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } else { /* Reserved but not all registrants */ /* sa_res_key is res holder */ if (sa_res_key == ctl_get_prkey(lun, lun->pr_res_idx)) { /* validate scope and type */ if ((cdb->scope_type & SPR_SCOPE_MASK) != SPR_LU_SCOPE) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (1); } if (type>8 || type==2 || type==4 || type==0) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (1); } /* * Do the following: * if sa_res_key != res_key remove all * registrants w/sa_res_key and generate UA * for these registrants(Registrations * Preempted) if it wasn't an exclusive * reservation generate UA(Reservations * Preempted) for all other registered nexuses * if the type has changed. Establish the new * reservation and holder. If res_key and * sa_res_key are the same do the above * except don't unregister the res holder. */ for(i = 0; i < CTL_MAX_INITIATORS; i++) { if (i == residx || ctl_get_prkey(lun, i) == 0) continue; if (sa_res_key == ctl_get_prkey(lun, i)) { ctl_clr_prkey(lun, i); lun->pr_key_count--; ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } else if (type != lun->res_type && (lun->res_type == SPR_TYPE_WR_EX_RO || lun->res_type ==SPR_TYPE_EX_AC_RO)){ ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); } } lun->res_type = type; if (lun->res_type != SPR_TYPE_WR_EX_AR && lun->res_type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = residx; else lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS; lun->PRGeneration++; mtx_unlock(&lun->lun_lock); persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_PREEMPT; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } else { /* * sa_res_key is not the res holder just * remove registrants */ int found=0; for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (sa_res_key != ctl_get_prkey(lun, i)) continue; found = 1; ctl_clr_prkey(lun, i); lun->pr_key_count--; ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } if (!found) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (1); } lun->PRGeneration++; mtx_unlock(&lun->lun_lock); persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_PREEMPT; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } } return (0); } static void ctl_pro_preempt_other(struct ctl_lun *lun, union ctl_ha_msg *msg) { uint64_t sa_res_key; int i; sa_res_key = scsi_8btou64(msg->pr.pr_info.sa_res_key); if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS || lun->pr_res_idx == CTL_PR_NO_RESERVATION || sa_res_key != ctl_get_prkey(lun, lun->pr_res_idx)) { if (sa_res_key == 0) { /* * Unregister everybody else and build UA for * them */ for(i = 0; i < CTL_MAX_INITIATORS; i++) { if (i == msg->pr.pr_info.residx || ctl_get_prkey(lun, i) == 0) continue; ctl_clr_prkey(lun, i); ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } lun->pr_key_count = 1; lun->res_type = msg->pr.pr_info.res_type; if (lun->res_type != SPR_TYPE_WR_EX_AR && lun->res_type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = msg->pr.pr_info.residx; } else { for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (sa_res_key == ctl_get_prkey(lun, i)) continue; ctl_clr_prkey(lun, i); lun->pr_key_count--; ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } } } else { for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (i == msg->pr.pr_info.residx || ctl_get_prkey(lun, i) == 0) continue; if (sa_res_key == ctl_get_prkey(lun, i)) { ctl_clr_prkey(lun, i); lun->pr_key_count--; ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } else if (msg->pr.pr_info.res_type != lun->res_type && (lun->res_type == SPR_TYPE_WR_EX_RO || lun->res_type == SPR_TYPE_EX_AC_RO)) { ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); } } lun->res_type = msg->pr.pr_info.res_type; if (lun->res_type != SPR_TYPE_WR_EX_AR && lun->res_type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = msg->pr.pr_info.residx; else lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS; } lun->PRGeneration++; } int ctl_persistent_reserve_out(struct ctl_scsiio *ctsio) { int retval; u_int32_t param_len; struct scsi_per_res_out *cdb; struct ctl_lun *lun; struct scsi_per_res_out_parms* param; struct ctl_softc *softc; uint32_t residx; uint64_t res_key, sa_res_key, key; uint8_t type; union ctl_ha_msg persis_io; int i; CTL_DEBUG_PRINT(("ctl_persistent_reserve_out\n")); retval = CTL_RETVAL_COMPLETE; cdb = (struct scsi_per_res_out *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; softc = lun->ctl_softc; /* * We only support whole-LUN scope. The scope & type are ignored for * register, register and ignore existing key and clear. * We sometimes ignore scope and type on preempts too!! * Verify reservation type here as well. */ type = cdb->scope_type & SPR_TYPE_MASK; if ((cdb->action == SPRO_RESERVE) || (cdb->action == SPRO_RELEASE)) { if ((cdb->scope_type & SPR_SCOPE_MASK) != SPR_LU_SCOPE) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (type>8 || type==2 || type==4 || type==0) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } param_len = scsi_4btoul(cdb->length); if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(param_len, M_CTL, M_WAITOK); ctsio->kern_data_len = param_len; ctsio->kern_total_len = param_len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } param = (struct scsi_per_res_out_parms *)ctsio->kern_data_ptr; residx = ctl_get_initindex(&ctsio->io_hdr.nexus); res_key = scsi_8btou64(param->res_key.key); sa_res_key = scsi_8btou64(param->serv_act_res_key); /* * Validate the reservation key here except for SPRO_REG_IGNO * This must be done for all other service actions */ if ((cdb->action & SPRO_ACTION_MASK) != SPRO_REG_IGNO) { mtx_lock(&lun->lun_lock); if ((key = ctl_get_prkey(lun, residx)) != 0) { if (res_key != key) { /* * The current key passed in doesn't match * the one the initiator previously * registered. */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } else if ((cdb->action & SPRO_ACTION_MASK) != SPRO_REGISTER) { /* * We are not registered */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } else if (res_key != 0) { /* * We are not registered and trying to register but * the register key isn't zero. */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } mtx_unlock(&lun->lun_lock); } switch (cdb->action & SPRO_ACTION_MASK) { case SPRO_REGISTER: case SPRO_REG_IGNO: { #if 0 printf("Registration received\n"); #endif /* * We don't support any of these options, as we report in * the read capabilities request (see * ctl_persistent_reserve_in(), above). */ if ((param->flags & SPR_SPEC_I_PT) || (param->flags & SPR_ALL_TG_PT) || (param->flags & SPR_APTPL)) { int bit_ptr; if (param->flags & SPR_APTPL) bit_ptr = 0; else if (param->flags & SPR_ALL_TG_PT) bit_ptr = 2; else /* SPR_SPEC_I_PT */ bit_ptr = 3; free(ctsio->kern_data_ptr, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 20, /*bit_valid*/ 1, /*bit*/ bit_ptr); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } mtx_lock(&lun->lun_lock); /* * The initiator wants to clear the * key/unregister. */ if (sa_res_key == 0) { if ((res_key == 0 && (cdb->action & SPRO_ACTION_MASK) == SPRO_REGISTER) || ((cdb->action & SPRO_ACTION_MASK) == SPRO_REG_IGNO && ctl_get_prkey(lun, residx) == 0)) { mtx_unlock(&lun->lun_lock); goto done; } ctl_clr_prkey(lun, residx); lun->pr_key_count--; if (residx == lun->pr_res_idx) { lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_idx = CTL_PR_NO_RESERVATION; if ((lun->res_type == SPR_TYPE_WR_EX_RO || lun->res_type == SPR_TYPE_EX_AC_RO) && lun->pr_key_count) { /* * If the reservation is a registrants * only type we need to generate a UA * for other registered inits. The * sense code should be RESERVATIONS * RELEASED */ for (i = softc->init_min; i < softc->init_max; i++){ if (ctl_get_prkey(lun, i) == 0) continue; ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); } } lun->res_type = 0; } else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) { if (lun->pr_key_count==0) { lun->flags &= ~CTL_LUN_PR_RESERVED; lun->res_type = 0; lun->pr_res_idx = CTL_PR_NO_RESERVATION; } } lun->PRGeneration++; mtx_unlock(&lun->lun_lock); persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_UNREG_KEY; persis_io.pr.pr_info.residx = residx; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } else /* sa_res_key != 0 */ { /* * If we aren't registered currently then increment * the key count and set the registered flag. */ ctl_alloc_prkey(lun, residx); if (ctl_get_prkey(lun, residx) == 0) lun->pr_key_count++; ctl_set_prkey(lun, residx, sa_res_key); lun->PRGeneration++; mtx_unlock(&lun->lun_lock); persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_REG_KEY; persis_io.pr.pr_info.residx = residx; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } break; } case SPRO_RESERVE: #if 0 printf("Reserve executed type %d\n", type); #endif mtx_lock(&lun->lun_lock); if (lun->flags & CTL_LUN_PR_RESERVED) { /* * if this isn't the reservation holder and it's * not a "all registrants" type or if the type is * different then we have a conflict */ if ((lun->pr_res_idx != residx && lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) || lun->res_type != type) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } mtx_unlock(&lun->lun_lock); } else /* create a reservation */ { /* * If it's not an "all registrants" type record * reservation holder */ if (type != SPR_TYPE_WR_EX_AR && type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = residx; /* Res holder */ else lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS; lun->flags |= CTL_LUN_PR_RESERVED; lun->res_type = type; mtx_unlock(&lun->lun_lock); /* send msg to other side */ persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_RESERVE; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } break; case SPRO_RELEASE: mtx_lock(&lun->lun_lock); if ((lun->flags & CTL_LUN_PR_RESERVED) == 0) { /* No reservation exists return good status */ mtx_unlock(&lun->lun_lock); goto done; } /* * Is this nexus a reservation holder? */ if (lun->pr_res_idx != residx && lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) { /* * not a res holder return good status but * do nothing */ mtx_unlock(&lun->lun_lock); goto done; } if (lun->res_type != type) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_illegal_pr_release(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* okay to release */ lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_idx = CTL_PR_NO_RESERVATION; lun->res_type = 0; /* * if this isn't an exclusive access * res generate UA for all other * registrants. */ if (type != SPR_TYPE_EX_AC && type != SPR_TYPE_WR_EX) { for (i = softc->init_min; i < softc->init_max; i++) { if (i == residx || ctl_get_prkey(lun, i) == 0) continue; ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); } } mtx_unlock(&lun->lun_lock); /* Send msg to other side */ persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_RELEASE; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); break; case SPRO_CLEAR: /* send msg to other side */ mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_PR_RESERVED; lun->res_type = 0; lun->pr_key_count = 0; lun->pr_res_idx = CTL_PR_NO_RESERVATION; ctl_clr_prkey(lun, residx); for (i = 0; i < CTL_MAX_INITIATORS; i++) if (ctl_get_prkey(lun, i) != 0) { ctl_clr_prkey(lun, i); ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } lun->PRGeneration++; mtx_unlock(&lun->lun_lock); persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_CLEAR; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); break; case SPRO_PREEMPT: case SPRO_PRE_ABO: { int nretval; nretval = ctl_pro_preempt(softc, lun, res_key, sa_res_key, type, residx, ctsio, cdb, param); if (nretval != 0) return (CTL_RETVAL_COMPLETE); break; } default: panic("Invalid PR type %x", cdb->action); } done: free(ctsio->kern_data_ptr, M_CTL); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (retval); } /* * This routine is for handling a message from the other SC pertaining to * persistent reserve out. All the error checking will have been done * so only perorming the action need be done here to keep the two * in sync. */ static void ctl_hndl_per_res_out_on_other_sc(union ctl_ha_msg *msg) { struct ctl_lun *lun; struct ctl_softc *softc; int i; - uint32_t targ_lun; + uint32_t residx, targ_lun; softc = control_softc; - targ_lun = msg->hdr.nexus.targ_mapped_lun; - lun = softc->ctl_luns[targ_lun]; + mtx_lock(&softc->ctl_lock); + if ((targ_lun >= CTL_MAX_LUNS) || + ((lun = softc->ctl_luns[targ_lun]) == NULL)) { + mtx_unlock(&softc->ctl_lock); + return; + } mtx_lock(&lun->lun_lock); + mtx_unlock(&softc->ctl_lock); + if (lun->flags & CTL_LUN_DISABLED) { + mtx_unlock(&lun->lun_lock); + return; + } + residx = ctl_get_initindex(&msg->hdr.nexus); switch(msg->pr.pr_info.action) { case CTL_PR_REG_KEY: ctl_alloc_prkey(lun, msg->pr.pr_info.residx); if (ctl_get_prkey(lun, msg->pr.pr_info.residx) == 0) lun->pr_key_count++; ctl_set_prkey(lun, msg->pr.pr_info.residx, scsi_8btou64(msg->pr.pr_info.sa_res_key)); lun->PRGeneration++; break; case CTL_PR_UNREG_KEY: ctl_clr_prkey(lun, msg->pr.pr_info.residx); lun->pr_key_count--; /* XXX Need to see if the reservation has been released */ /* if so do we need to generate UA? */ if (msg->pr.pr_info.residx == lun->pr_res_idx) { lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_idx = CTL_PR_NO_RESERVATION; if ((lun->res_type == SPR_TYPE_WR_EX_RO || lun->res_type == SPR_TYPE_EX_AC_RO) && lun->pr_key_count) { /* * If the reservation is a registrants * only type we need to generate a UA * for other registered inits. The * sense code should be RESERVATIONS * RELEASED */ for (i = softc->init_min; i < softc->init_max; i++) { if (ctl_get_prkey(lun, i) == 0) continue; ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); } } lun->res_type = 0; } else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) { if (lun->pr_key_count==0) { lun->flags &= ~CTL_LUN_PR_RESERVED; lun->res_type = 0; lun->pr_res_idx = CTL_PR_NO_RESERVATION; } } lun->PRGeneration++; break; case CTL_PR_RESERVE: lun->flags |= CTL_LUN_PR_RESERVED; lun->res_type = msg->pr.pr_info.res_type; lun->pr_res_idx = msg->pr.pr_info.residx; break; case CTL_PR_RELEASE: /* * if this isn't an exclusive access res generate UA for all * other registrants. */ if (lun->res_type != SPR_TYPE_EX_AC && lun->res_type != SPR_TYPE_WR_EX) { for (i = softc->init_min; i < softc->init_max; i++) - if (ctl_get_prkey(lun, i) != 0) - ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); + if (i == residx || ctl_get_prkey(lun, i) == 0) + continue; + ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); } lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_idx = CTL_PR_NO_RESERVATION; lun->res_type = 0; break; case CTL_PR_PREEMPT: ctl_pro_preempt_other(lun, msg); break; case CTL_PR_CLEAR: lun->flags &= ~CTL_LUN_PR_RESERVED; lun->res_type = 0; lun->pr_key_count = 0; lun->pr_res_idx = CTL_PR_NO_RESERVATION; for (i=0; i < CTL_MAX_INITIATORS; i++) { if (ctl_get_prkey(lun, i) == 0) continue; ctl_clr_prkey(lun, i); ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } lun->PRGeneration++; break; } mtx_unlock(&lun->lun_lock); } int ctl_read_write(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t num_blocks; int flags, retval; int isread; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; CTL_DEBUG_PRINT(("ctl_read_write: command: %#x\n", ctsio->cdb[0])); flags = 0; retval = CTL_RETVAL_COMPLETE; isread = ctsio->cdb[0] == READ_6 || ctsio->cdb[0] == READ_10 || ctsio->cdb[0] == READ_12 || ctsio->cdb[0] == READ_16; switch (ctsio->cdb[0]) { case READ_6: case WRITE_6: { struct scsi_rw_6 *cdb; cdb = (struct scsi_rw_6 *)ctsio->cdb; lba = scsi_3btoul(cdb->addr); /* only 5 bits are valid in the most significant address byte */ lba &= 0x1fffff; num_blocks = cdb->length; /* * This is correct according to SBC-2. */ if (num_blocks == 0) num_blocks = 256; break; } case READ_10: case WRITE_10: { struct scsi_rw_10 *cdb; cdb = (struct scsi_rw_10 *)ctsio->cdb; if (cdb->byte2 & SRW10_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW10_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_2btoul(cdb->length); break; } case WRITE_VERIFY_10: { struct scsi_write_verify_10 *cdb; cdb = (struct scsi_write_verify_10 *)ctsio->cdb; flags |= CTL_LLF_FUA; if (cdb->byte2 & SWV_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_2btoul(cdb->length); break; } case READ_12: case WRITE_12: { struct scsi_rw_12 *cdb; cdb = (struct scsi_rw_12 *)ctsio->cdb; if (cdb->byte2 & SRW12_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW12_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } case WRITE_VERIFY_12: { struct scsi_write_verify_12 *cdb; cdb = (struct scsi_write_verify_12 *)ctsio->cdb; flags |= CTL_LLF_FUA; if (cdb->byte2 & SWV_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } case READ_16: case WRITE_16: { struct scsi_rw_16 *cdb; cdb = (struct scsi_rw_16 *)ctsio->cdb; if (cdb->byte2 & SRW12_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW12_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } case WRITE_ATOMIC_16: { struct scsi_rw_16 *cdb; if (lun->be_lun->atomicblock == 0) { ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } cdb = (struct scsi_rw_16 *)ctsio->cdb; if (cdb->byte2 & SRW12_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW12_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); if (num_blocks > lun->be_lun->atomicblock) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 12, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } break; } case WRITE_VERIFY_16: { struct scsi_write_verify_16 *cdb; cdb = (struct scsi_write_verify_16 *)ctsio->cdb; flags |= CTL_LLF_FUA; if (cdb->byte2 & SWV_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } default: /* * We got a command we don't support. This shouldn't * happen, commands should be filtered out above us. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* * The first check is to make sure we're in bounds, the second * check is to catch wrap-around problems. If the lba + num blocks * is less than the lba, then we've wrapped around and the block * range is invalid anyway. */ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * According to SBC-3, a transfer length of 0 is not an error. * Note that this cannot happen with WRITE(6) or READ(6), since 0 * translates to 256 blocks for those commands. */ if (num_blocks == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* Set FUA and/or DPO if caches are disabled. */ if (isread) { if ((lun->mode_pages.caching_page[CTL_PAGE_CURRENT].flags1 & SCP_RCD) != 0) flags |= CTL_LLF_FUA | CTL_LLF_DPO; } else { if ((lun->mode_pages.caching_page[CTL_PAGE_CURRENT].flags1 & SCP_WCE) == 0) flags |= CTL_LLF_FUA; } lbalen = (struct ctl_lba_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = num_blocks; lbalen->flags = (isread ? CTL_LLF_READ : CTL_LLF_WRITE) | flags; ctsio->kern_total_len = num_blocks * lun->be_lun->blocksize; ctsio->kern_rel_offset = 0; CTL_DEBUG_PRINT(("ctl_read_write: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } static int ctl_cnw_cont(union ctl_io *io) { struct ctl_scsiio *ctsio; struct ctl_lun *lun; struct ctl_lba_len_flags *lbalen; int retval; ctsio = &io->scsiio; ctsio->io_hdr.status = CTL_STATUS_NONE; ctsio->io_hdr.flags &= ~CTL_FLAG_IO_CONT; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; lbalen = (struct ctl_lba_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->flags &= ~CTL_LLF_COMPARE; lbalen->flags |= CTL_LLF_WRITE; CTL_DEBUG_PRINT(("ctl_cnw_cont: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } int ctl_cnw(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t num_blocks; int flags, retval; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; CTL_DEBUG_PRINT(("ctl_cnw: command: %#x\n", ctsio->cdb[0])); flags = 0; retval = CTL_RETVAL_COMPLETE; switch (ctsio->cdb[0]) { case COMPARE_AND_WRITE: { struct scsi_compare_and_write *cdb; cdb = (struct scsi_compare_and_write *)ctsio->cdb; if (cdb->byte2 & SRW10_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW10_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = cdb->length; break; } default: /* * We got a command we don't support. This shouldn't * happen, commands should be filtered out above us. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* * The first check is to make sure we're in bounds, the second * check is to catch wrap-around problems. If the lba + num blocks * is less than the lba, then we've wrapped around and the block * range is invalid anyway. */ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * According to SBC-3, a transfer length of 0 is not an error. */ if (num_blocks == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* Set FUA if write cache is disabled. */ if ((lun->mode_pages.caching_page[CTL_PAGE_CURRENT].flags1 & SCP_WCE) == 0) flags |= CTL_LLF_FUA; ctsio->kern_total_len = 2 * num_blocks * lun->be_lun->blocksize; ctsio->kern_rel_offset = 0; /* * Set the IO_CONT flag, so that if this I/O gets passed to * ctl_data_submit_done(), it'll get passed back to * ctl_ctl_cnw_cont() for further processing. */ ctsio->io_hdr.flags |= CTL_FLAG_IO_CONT; ctsio->io_cont = ctl_cnw_cont; lbalen = (struct ctl_lba_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = num_blocks; lbalen->flags = CTL_LLF_COMPARE | flags; CTL_DEBUG_PRINT(("ctl_cnw: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } int ctl_verify(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t num_blocks; int bytchk, flags; int retval; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; CTL_DEBUG_PRINT(("ctl_verify: command: %#x\n", ctsio->cdb[0])); bytchk = 0; flags = CTL_LLF_FUA; retval = CTL_RETVAL_COMPLETE; switch (ctsio->cdb[0]) { case VERIFY_10: { struct scsi_verify_10 *cdb; cdb = (struct scsi_verify_10 *)ctsio->cdb; if (cdb->byte2 & SVFY_BYTCHK) bytchk = 1; if (cdb->byte2 & SVFY_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_2btoul(cdb->length); break; } case VERIFY_12: { struct scsi_verify_12 *cdb; cdb = (struct scsi_verify_12 *)ctsio->cdb; if (cdb->byte2 & SVFY_BYTCHK) bytchk = 1; if (cdb->byte2 & SVFY_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } case VERIFY_16: { struct scsi_rw_16 *cdb; cdb = (struct scsi_rw_16 *)ctsio->cdb; if (cdb->byte2 & SVFY_BYTCHK) bytchk = 1; if (cdb->byte2 & SVFY_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } default: /* * We got a command we don't support. This shouldn't * happen, commands should be filtered out above us. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * The first check is to make sure we're in bounds, the second * check is to catch wrap-around problems. If the lba + num blocks * is less than the lba, then we've wrapped around and the block * range is invalid anyway. */ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * According to SBC-3, a transfer length of 0 is not an error. */ if (num_blocks == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } lbalen = (struct ctl_lba_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = num_blocks; if (bytchk) { lbalen->flags = CTL_LLF_COMPARE | flags; ctsio->kern_total_len = num_blocks * lun->be_lun->blocksize; } else { lbalen->flags = CTL_LLF_VERIFY | flags; ctsio->kern_total_len = 0; } ctsio->kern_rel_offset = 0; CTL_DEBUG_PRINT(("ctl_verify: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } int ctl_report_luns(struct ctl_scsiio *ctsio) { struct ctl_softc *softc = control_softc; struct scsi_report_luns *cdb; struct scsi_report_luns_data *lun_data; struct ctl_lun *lun, *request_lun; struct ctl_port *port; int num_luns, retval; uint32_t alloc_len, lun_datalen; int num_filled, well_known; uint32_t initidx, targ_lun_id, lun_id; retval = CTL_RETVAL_COMPLETE; well_known = 0; cdb = (struct scsi_report_luns *)ctsio->cdb; port = ctl_io_port(&ctsio->io_hdr); CTL_DEBUG_PRINT(("ctl_report_luns\n")); mtx_lock(&softc->ctl_lock); num_luns = 0; for (targ_lun_id = 0; targ_lun_id < CTL_MAX_LUNS; targ_lun_id++) { if (ctl_lun_map_from_port(port, targ_lun_id) < CTL_MAX_LUNS) num_luns++; } mtx_unlock(&softc->ctl_lock); switch (cdb->select_report) { case RPL_REPORT_DEFAULT: case RPL_REPORT_ALL: break; case RPL_REPORT_WELLKNOWN: well_known = 1; num_luns = 0; break; default: ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (retval); break; /* NOTREACHED */ } alloc_len = scsi_4btoul(cdb->length); /* * The initiator has to allocate at least 16 bytes for this request, * so he can at least get the header and the first LUN. Otherwise * we reject the request (per SPC-3 rev 14, section 6.21). */ if (alloc_len < (sizeof(struct scsi_report_luns_data) + sizeof(struct scsi_report_luns_lundata))) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 6, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (retval); } request_lun = (struct ctl_lun *) ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; lun_datalen = sizeof(*lun_data) + (num_luns * sizeof(struct scsi_report_luns_lundata)); ctsio->kern_data_ptr = malloc(lun_datalen, M_CTL, M_WAITOK | M_ZERO); lun_data = (struct scsi_report_luns_data *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); mtx_lock(&softc->ctl_lock); for (targ_lun_id = 0, num_filled = 0; targ_lun_id < CTL_MAX_LUNS && num_filled < num_luns; targ_lun_id++) { lun_id = ctl_lun_map_from_port(port, targ_lun_id); if (lun_id >= CTL_MAX_LUNS) continue; lun = softc->ctl_luns[lun_id]; if (lun == NULL) continue; if (targ_lun_id <= 0xff) { /* * Peripheral addressing method, bus number 0. */ lun_data->luns[num_filled].lundata[0] = RPL_LUNDATA_ATYP_PERIPH; lun_data->luns[num_filled].lundata[1] = targ_lun_id; num_filled++; } else if (targ_lun_id <= 0x3fff) { /* * Flat addressing method. */ lun_data->luns[num_filled].lundata[0] = RPL_LUNDATA_ATYP_FLAT | (targ_lun_id >> 8); lun_data->luns[num_filled].lundata[1] = (targ_lun_id & 0xff); num_filled++; } else if (targ_lun_id <= 0xffffff) { /* * Extended flat addressing method. */ lun_data->luns[num_filled].lundata[0] = RPL_LUNDATA_ATYP_EXTLUN | 0x12; scsi_ulto3b(targ_lun_id, &lun_data->luns[num_filled].lundata[1]); num_filled++; } else { printf("ctl_report_luns: bogus LUN number %jd, " "skipping\n", (intmax_t)targ_lun_id); } /* * According to SPC-3, rev 14 section 6.21: * * "The execution of a REPORT LUNS command to any valid and * installed logical unit shall clear the REPORTED LUNS DATA * HAS CHANGED unit attention condition for all logical * units of that target with respect to the requesting * initiator. A valid and installed logical unit is one * having a PERIPHERAL QUALIFIER of 000b in the standard * INQUIRY data (see 6.4.2)." * * If request_lun is NULL, the LUN this report luns command * was issued to is either disabled or doesn't exist. In that * case, we shouldn't clear any pending lun change unit * attention. */ if (request_lun != NULL) { mtx_lock(&lun->lun_lock); ctl_clr_ua(lun, initidx, CTL_UA_LUN_CHANGE); mtx_unlock(&lun->lun_lock); } } mtx_unlock(&softc->ctl_lock); /* * It's quite possible that we've returned fewer LUNs than we allocated * space for. Trim it. */ lun_datalen = sizeof(*lun_data) + (num_filled * sizeof(struct scsi_report_luns_lundata)); if (lun_datalen < alloc_len) { ctsio->residual = alloc_len - lun_datalen; ctsio->kern_data_len = lun_datalen; ctsio->kern_total_len = lun_datalen; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * We set this to the actual data length, regardless of how much * space we actually have to return results. If the user looks at * this value, he'll know whether or not he allocated enough space * and reissue the command if necessary. We don't support well * known logical units, so if the user asks for that, return none. */ scsi_ulto4b(lun_datalen - 8, lun_data->length); /* * We can only return SCSI_STATUS_CHECK_COND when we can't satisfy * this request. */ ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_request_sense(struct ctl_scsiio *ctsio) { struct scsi_request_sense *cdb; struct scsi_sense_data *sense_ptr; struct ctl_softc *ctl_softc; struct ctl_lun *lun; uint32_t initidx; int have_error; scsi_sense_data_type sense_format; ctl_ua_type ua_type; cdb = (struct scsi_request_sense *)ctsio->cdb; ctl_softc = control_softc; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; CTL_DEBUG_PRINT(("ctl_request_sense\n")); /* * Determine which sense format the user wants. */ if (cdb->byte2 & SRS_DESC) sense_format = SSD_TYPE_DESC; else sense_format = SSD_TYPE_FIXED; ctsio->kern_data_ptr = malloc(sizeof(*sense_ptr), M_CTL, M_WAITOK); sense_ptr = (struct scsi_sense_data *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; /* * struct scsi_sense_data, which is currently set to 256 bytes, is * larger than the largest allowed value for the length field in the * REQUEST SENSE CDB, which is 252 bytes as of SPC-4. */ ctsio->residual = 0; ctsio->kern_data_len = cdb->length; ctsio->kern_total_len = cdb->length; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * If we don't have a LUN, we don't have any pending sense. */ if (lun == NULL) goto no_sense; have_error = 0; initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); /* * Check for pending sense, and then for pending unit attentions. * Pending sense gets returned first, then pending unit attentions. */ mtx_lock(&lun->lun_lock); #ifdef CTL_WITH_CA if (ctl_is_set(lun->have_ca, initidx)) { scsi_sense_data_type stored_format; /* * Check to see which sense format was used for the stored * sense data. */ stored_format = scsi_sense_type(&lun->pending_sense[initidx]); /* * If the user requested a different sense format than the * one we stored, then we need to convert it to the other * format. If we're going from descriptor to fixed format * sense data, we may lose things in translation, depending * on what options were used. * * If the stored format is SSD_TYPE_NONE (i.e. invalid), * for some reason we'll just copy it out as-is. */ if ((stored_format == SSD_TYPE_FIXED) && (sense_format == SSD_TYPE_DESC)) ctl_sense_to_desc((struct scsi_sense_data_fixed *) &lun->pending_sense[initidx], (struct scsi_sense_data_desc *)sense_ptr); else if ((stored_format == SSD_TYPE_DESC) && (sense_format == SSD_TYPE_FIXED)) ctl_sense_to_fixed((struct scsi_sense_data_desc *) &lun->pending_sense[initidx], (struct scsi_sense_data_fixed *)sense_ptr); else memcpy(sense_ptr, &lun->pending_sense[initidx], MIN(sizeof(*sense_ptr), sizeof(lun->pending_sense[initidx]))); ctl_clear_mask(lun->have_ca, initidx); have_error = 1; } else #endif { ua_type = ctl_build_ua(lun, initidx, sense_ptr, sense_format); if (ua_type != CTL_UA_NONE) have_error = 1; if (ua_type == CTL_UA_LUN_CHANGE) { mtx_unlock(&lun->lun_lock); mtx_lock(&ctl_softc->ctl_lock); ctl_clr_ua_allluns(ctl_softc, initidx, ua_type); mtx_unlock(&ctl_softc->ctl_lock); mtx_lock(&lun->lun_lock); } } mtx_unlock(&lun->lun_lock); /* * We already have a pending error, return it. */ if (have_error != 0) { /* * We report the SCSI status as OK, since the status of the * request sense command itself is OK. * We report 0 for the sense length, because we aren't doing * autosense in this case. We're reporting sense as * parameter data. */ ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } no_sense: /* * No sense information to report, so we report that everything is * okay. */ ctl_set_sense_data(sense_ptr, lun, sense_format, /*current_error*/ 1, /*sense_key*/ SSD_KEY_NO_SENSE, /*asc*/ 0x00, /*ascq*/ 0x00, SSD_ELEM_NONE); /* * We report 0 for the sense length, because we aren't doing * autosense in this case. We're reporting sense as parameter data. */ ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_tur(struct ctl_scsiio *ctsio) { CTL_DEBUG_PRINT(("ctl_tur\n")); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * SCSI VPD page 0x00, the Supported VPD Pages page. */ static int ctl_inquiry_evpd_supported(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_supported_pages *pages; int sup_page_size; struct ctl_lun *lun; int p; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; sup_page_size = sizeof(struct scsi_vpd_supported_pages) * SCSI_EVPD_NUM_SUPPORTED_PAGES; ctsio->kern_data_ptr = malloc(sup_page_size, M_CTL, M_WAITOK | M_ZERO); pages = (struct scsi_vpd_supported_pages *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (sup_page_size < alloc_len) { ctsio->residual = alloc_len - sup_page_size; ctsio->kern_data_len = sup_page_size; ctsio->kern_total_len = sup_page_size; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) pages->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else pages->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; p = 0; /* Supported VPD pages */ pages->page_list[p++] = SVPD_SUPPORTED_PAGES; /* Serial Number */ pages->page_list[p++] = SVPD_UNIT_SERIAL_NUMBER; /* Device Identification */ pages->page_list[p++] = SVPD_DEVICE_ID; /* Extended INQUIRY Data */ pages->page_list[p++] = SVPD_EXTENDED_INQUIRY_DATA; /* Mode Page Policy */ pages->page_list[p++] = SVPD_MODE_PAGE_POLICY; /* SCSI Ports */ pages->page_list[p++] = SVPD_SCSI_PORTS; /* Third-party Copy */ pages->page_list[p++] = SVPD_SCSI_TPC; if (lun != NULL && lun->be_lun->lun_type == T_DIRECT) { /* Block limits */ pages->page_list[p++] = SVPD_BLOCK_LIMITS; /* Block Device Characteristics */ pages->page_list[p++] = SVPD_BDC; /* Logical Block Provisioning */ pages->page_list[p++] = SVPD_LBP; } pages->length = p; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * SCSI VPD page 0x80, the Unit Serial Number page. */ static int ctl_inquiry_evpd_serial(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_unit_serial_number *sn_ptr; struct ctl_lun *lun; int data_len; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; data_len = 4 + CTL_SN_LEN; ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); sn_ptr = (struct scsi_vpd_unit_serial_number *)ctsio->kern_data_ptr; if (data_len < alloc_len) { ctsio->residual = alloc_len - data_len; ctsio->kern_data_len = data_len; ctsio->kern_total_len = data_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) sn_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else sn_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; sn_ptr->page_code = SVPD_UNIT_SERIAL_NUMBER; sn_ptr->length = CTL_SN_LEN; /* * If we don't have a LUN, we just leave the serial number as * all spaces. */ if (lun != NULL) { strncpy((char *)sn_ptr->serial_num, (char *)lun->be_lun->serial_num, CTL_SN_LEN); } else memset(sn_ptr->serial_num, 0x20, CTL_SN_LEN); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * SCSI VPD page 0x86, the Extended INQUIRY Data page. */ static int ctl_inquiry_evpd_eid(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_extended_inquiry_data *eid_ptr; struct ctl_lun *lun; int data_len; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; data_len = sizeof(struct scsi_vpd_extended_inquiry_data); ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); eid_ptr = (struct scsi_vpd_extended_inquiry_data *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (data_len < alloc_len) { ctsio->residual = alloc_len - data_len; ctsio->kern_data_len = data_len; ctsio->kern_total_len = data_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. */ if (lun != NULL) eid_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else eid_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; eid_ptr->page_code = SVPD_EXTENDED_INQUIRY_DATA; scsi_ulto2b(data_len - 4, eid_ptr->page_length); /* * We support head of queue, ordered and simple tags. */ eid_ptr->flags2 = SVPD_EID_HEADSUP | SVPD_EID_ORDSUP | SVPD_EID_SIMPSUP; /* * Volatile cache supported. */ eid_ptr->flags3 = SVPD_EID_V_SUP; /* * This means that we clear the REPORTED LUNS DATA HAS CHANGED unit * attention for a particular IT nexus on all LUNs once we report * it to that nexus once. This bit is required as of SPC-4. */ eid_ptr->flags4 = SVPD_EID_LUICLT; /* * XXX KDM in order to correctly answer this, we would need * information from the SIM to determine how much sense data it * can send. So this would really be a path inquiry field, most * likely. This can be set to a maximum of 252 according to SPC-4, * but the hardware may or may not be able to support that much. * 0 just means that the maximum sense data length is not reported. */ eid_ptr->max_sense_length = 0; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_mpp(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_mode_page_policy *mpp_ptr; struct ctl_lun *lun; int data_len; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; data_len = sizeof(struct scsi_vpd_mode_page_policy) + sizeof(struct scsi_vpd_mode_page_policy_descr); ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); mpp_ptr = (struct scsi_vpd_mode_page_policy *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (data_len < alloc_len) { ctsio->residual = alloc_len - data_len; ctsio->kern_data_len = data_len; ctsio->kern_total_len = data_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. */ if (lun != NULL) mpp_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else mpp_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; mpp_ptr->page_code = SVPD_MODE_PAGE_POLICY; scsi_ulto2b(data_len - 4, mpp_ptr->page_length); mpp_ptr->descr[0].page_code = 0x3f; mpp_ptr->descr[0].subpage_code = 0xff; mpp_ptr->descr[0].policy = SVPD_MPP_SHARED; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * SCSI VPD page 0x83, the Device Identification page. */ static int ctl_inquiry_evpd_devid(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_device_id *devid_ptr; struct scsi_vpd_id_descriptor *desc; struct ctl_softc *softc; struct ctl_lun *lun; struct ctl_port *port; int data_len; uint8_t proto; softc = control_softc; port = ctl_io_port(&ctsio->io_hdr); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; data_len = sizeof(struct scsi_vpd_device_id) + sizeof(struct scsi_vpd_id_descriptor) + sizeof(struct scsi_vpd_id_rel_trgt_port_id) + sizeof(struct scsi_vpd_id_descriptor) + sizeof(struct scsi_vpd_id_trgt_port_grp_id); if (lun && lun->lun_devid) data_len += lun->lun_devid->len; if (port && port->port_devid) data_len += port->port_devid->len; if (port && port->target_devid) data_len += port->target_devid->len; ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); devid_ptr = (struct scsi_vpd_device_id *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (data_len < alloc_len) { ctsio->residual = alloc_len - data_len; ctsio->kern_data_len = data_len; ctsio->kern_total_len = data_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. */ if (lun != NULL) devid_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else devid_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; devid_ptr->page_code = SVPD_DEVICE_ID; scsi_ulto2b(data_len - 4, devid_ptr->length); if (port && port->port_type == CTL_PORT_FC) proto = SCSI_PROTO_FC << 4; else if (port && port->port_type == CTL_PORT_ISCSI) proto = SCSI_PROTO_ISCSI << 4; else proto = SCSI_PROTO_SPI << 4; desc = (struct scsi_vpd_id_descriptor *)devid_ptr->desc_list; /* * We're using a LUN association here. i.e., this device ID is a * per-LUN identifier. */ if (lun && lun->lun_devid) { memcpy(desc, lun->lun_devid->data, lun->lun_devid->len); desc = (struct scsi_vpd_id_descriptor *)((uint8_t *)desc + lun->lun_devid->len); } /* * This is for the WWPN which is a port association. */ if (port && port->port_devid) { memcpy(desc, port->port_devid->data, port->port_devid->len); desc = (struct scsi_vpd_id_descriptor *)((uint8_t *)desc + port->port_devid->len); } /* * This is for the Relative Target Port(type 4h) identifier */ desc->proto_codeset = proto | SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_PORT | SVPD_ID_TYPE_RELTARG; desc->length = 4; scsi_ulto2b(ctsio->io_hdr.nexus.targ_port, &desc->identifier[2]); desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + sizeof(struct scsi_vpd_id_rel_trgt_port_id)); /* * This is for the Target Port Group(type 5h) identifier */ desc->proto_codeset = proto | SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_PORT | SVPD_ID_TYPE_TPORTGRP; desc->length = 4; scsi_ulto2b(ctsio->io_hdr.nexus.targ_port / softc->port_cnt + 1, &desc->identifier[2]); desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + sizeof(struct scsi_vpd_id_trgt_port_grp_id)); /* * This is for the Target identifier */ if (port && port->target_devid) { memcpy(desc, port->target_devid->data, port->target_devid->len); } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_scsi_ports(struct ctl_scsiio *ctsio, int alloc_len) { struct ctl_softc *softc = control_softc; struct scsi_vpd_scsi_ports *sp; struct scsi_vpd_port_designation *pd; struct scsi_vpd_port_designation_cont *pdc; struct ctl_lun *lun; struct ctl_port *port; int data_len, num_target_ports, iid_len, id_len; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; num_target_ports = 0; iid_len = 0; id_len = 0; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (lun != NULL && ctl_lun_map_to_port(port, lun->lun) >= CTL_MAX_LUNS) continue; num_target_ports++; if (port->init_devid) iid_len += port->init_devid->len; if (port->port_devid) id_len += port->port_devid->len; } mtx_unlock(&softc->ctl_lock); data_len = sizeof(struct scsi_vpd_scsi_ports) + num_target_ports * (sizeof(struct scsi_vpd_port_designation) + sizeof(struct scsi_vpd_port_designation_cont)) + iid_len + id_len; ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); sp = (struct scsi_vpd_scsi_ports *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (data_len < alloc_len) { ctsio->residual = alloc_len - data_len; ctsio->kern_data_len = data_len; ctsio->kern_total_len = data_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) sp->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else sp->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; sp->page_code = SVPD_SCSI_PORTS; scsi_ulto2b(data_len - sizeof(struct scsi_vpd_scsi_ports), sp->page_length); pd = &sp->design[0]; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (lun != NULL && ctl_lun_map_to_port(port, lun->lun) >= CTL_MAX_LUNS) continue; scsi_ulto2b(port->targ_port, pd->relative_port_id); if (port->init_devid) { iid_len = port->init_devid->len; memcpy(pd->initiator_transportid, port->init_devid->data, port->init_devid->len); } else iid_len = 0; scsi_ulto2b(iid_len, pd->initiator_transportid_length); pdc = (struct scsi_vpd_port_designation_cont *) (&pd->initiator_transportid[iid_len]); if (port->port_devid) { id_len = port->port_devid->len; memcpy(pdc->target_port_descriptors, port->port_devid->data, port->port_devid->len); } else id_len = 0; scsi_ulto2b(id_len, pdc->target_port_descriptors_length); pd = (struct scsi_vpd_port_designation *) ((uint8_t *)pdc->target_port_descriptors + id_len); } mtx_unlock(&softc->ctl_lock); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_block_limits(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_block_limits *bl_ptr; struct ctl_lun *lun; int bs; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctsio->kern_data_ptr = malloc(sizeof(*bl_ptr), M_CTL, M_WAITOK | M_ZERO); bl_ptr = (struct scsi_vpd_block_limits *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (sizeof(*bl_ptr) < alloc_len) { ctsio->residual = alloc_len - sizeof(*bl_ptr); ctsio->kern_data_len = sizeof(*bl_ptr); ctsio->kern_total_len = sizeof(*bl_ptr); } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) bl_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else bl_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; bl_ptr->page_code = SVPD_BLOCK_LIMITS; scsi_ulto2b(sizeof(*bl_ptr) - 4, bl_ptr->page_length); bl_ptr->max_cmp_write_len = 0xff; scsi_ulto4b(0xffffffff, bl_ptr->max_txfer_len); if (lun != NULL) { bs = lun->be_lun->blocksize; scsi_ulto4b(lun->be_lun->opttxferlen, bl_ptr->opt_txfer_len); if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) { scsi_ulto4b(0xffffffff, bl_ptr->max_unmap_lba_cnt); scsi_ulto4b(0xffffffff, bl_ptr->max_unmap_blk_cnt); if (lun->be_lun->ublockexp != 0) { scsi_ulto4b((1 << lun->be_lun->ublockexp), bl_ptr->opt_unmap_grain); scsi_ulto4b(0x80000000 | lun->be_lun->ublockoff, bl_ptr->unmap_grain_align); } } scsi_ulto4b(lun->be_lun->atomicblock, bl_ptr->max_atomic_transfer_length); scsi_ulto4b(0, bl_ptr->atomic_alignment); scsi_ulto4b(0, bl_ptr->atomic_transfer_length_granularity); } scsi_u64to8b(UINT64_MAX, bl_ptr->max_write_same_length); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_bdc(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_block_device_characteristics *bdc_ptr; struct ctl_lun *lun; const char *value; u_int i; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctsio->kern_data_ptr = malloc(sizeof(*bdc_ptr), M_CTL, M_WAITOK | M_ZERO); bdc_ptr = (struct scsi_vpd_block_device_characteristics *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (sizeof(*bdc_ptr) < alloc_len) { ctsio->residual = alloc_len - sizeof(*bdc_ptr); ctsio->kern_data_len = sizeof(*bdc_ptr); ctsio->kern_total_len = sizeof(*bdc_ptr); } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) bdc_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else bdc_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; bdc_ptr->page_code = SVPD_BDC; scsi_ulto2b(sizeof(*bdc_ptr) - 4, bdc_ptr->page_length); if (lun != NULL && (value = ctl_get_opt(&lun->be_lun->options, "rpm")) != NULL) i = strtol(value, NULL, 0); else i = CTL_DEFAULT_ROTATION_RATE; scsi_ulto2b(i, bdc_ptr->medium_rotation_rate); if (lun != NULL && (value = ctl_get_opt(&lun->be_lun->options, "formfactor")) != NULL) i = strtol(value, NULL, 0); else i = 0; bdc_ptr->wab_wac_ff = (i & 0x0f); bdc_ptr->flags = SVPD_FUAB | SVPD_VBULS; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_lbp(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_logical_block_prov *lbp_ptr; struct ctl_lun *lun; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; ctsio->kern_data_ptr = malloc(sizeof(*lbp_ptr), M_CTL, M_WAITOK | M_ZERO); lbp_ptr = (struct scsi_vpd_logical_block_prov *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (sizeof(*lbp_ptr) < alloc_len) { ctsio->residual = alloc_len - sizeof(*lbp_ptr); ctsio->kern_data_len = sizeof(*lbp_ptr); ctsio->kern_total_len = sizeof(*lbp_ptr); } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) lbp_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else lbp_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; lbp_ptr->page_code = SVPD_LBP; scsi_ulto2b(sizeof(*lbp_ptr) - 4, lbp_ptr->page_length); lbp_ptr->threshold_exponent = CTL_LBP_EXPONENT; if (lun != NULL && lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) { lbp_ptr->flags = SVPD_LBP_UNMAP | SVPD_LBP_WS16 | SVPD_LBP_WS10 | SVPD_LBP_RZ | SVPD_LBP_ANC_SUP; lbp_ptr->prov_type = SVPD_LBP_THIN; } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * INQUIRY with the EVPD bit set. */ static int ctl_inquiry_evpd(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct scsi_inquiry *cdb; int alloc_len, retval; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_inquiry *)ctsio->cdb; alloc_len = scsi_2btoul(cdb->length); switch (cdb->page_code) { case SVPD_SUPPORTED_PAGES: retval = ctl_inquiry_evpd_supported(ctsio, alloc_len); break; case SVPD_UNIT_SERIAL_NUMBER: retval = ctl_inquiry_evpd_serial(ctsio, alloc_len); break; case SVPD_DEVICE_ID: retval = ctl_inquiry_evpd_devid(ctsio, alloc_len); break; case SVPD_EXTENDED_INQUIRY_DATA: retval = ctl_inquiry_evpd_eid(ctsio, alloc_len); break; case SVPD_MODE_PAGE_POLICY: retval = ctl_inquiry_evpd_mpp(ctsio, alloc_len); break; case SVPD_SCSI_PORTS: retval = ctl_inquiry_evpd_scsi_ports(ctsio, alloc_len); break; case SVPD_SCSI_TPC: retval = ctl_inquiry_evpd_tpc(ctsio, alloc_len); break; case SVPD_BLOCK_LIMITS: if (lun == NULL || lun->be_lun->lun_type != T_DIRECT) goto err; retval = ctl_inquiry_evpd_block_limits(ctsio, alloc_len); break; case SVPD_BDC: if (lun == NULL || lun->be_lun->lun_type != T_DIRECT) goto err; retval = ctl_inquiry_evpd_bdc(ctsio, alloc_len); break; case SVPD_LBP: if (lun == NULL || lun->be_lun->lun_type != T_DIRECT) goto err; retval = ctl_inquiry_evpd_lbp(ctsio, alloc_len); break; default: err: ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); retval = CTL_RETVAL_COMPLETE; break; } return (retval); } /* * Standard INQUIRY data. */ static int ctl_inquiry_std(struct ctl_scsiio *ctsio) { struct scsi_inquiry_data *inq_ptr; struct scsi_inquiry *cdb; struct ctl_softc *softc; struct ctl_port *port; struct ctl_lun *lun; char *val; uint32_t alloc_len, data_len; ctl_port_type port_type; softc = control_softc; /* * Figure out whether we're talking to a Fibre Channel port or not. * We treat the ioctl front end, and any SCSI adapters, as packetized * SCSI front ends. */ port = ctl_io_port(&ctsio->io_hdr); if (port != NULL) port_type = port->port_type; else port_type = CTL_PORT_SCSI; if (port_type == CTL_PORT_IOCTL || port_type == CTL_PORT_INTERNAL) port_type = CTL_PORT_SCSI; lun = ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_inquiry *)ctsio->cdb; alloc_len = scsi_2btoul(cdb->length); /* * We malloc the full inquiry data size here and fill it * in. If the user only asks for less, we'll give him * that much. */ data_len = offsetof(struct scsi_inquiry_data, vendor_specific1); ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); inq_ptr = (struct scsi_inquiry_data *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; if (data_len < alloc_len) { ctsio->residual = alloc_len - data_len; ctsio->kern_data_len = data_len; ctsio->kern_total_len = data_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } if (lun != NULL) { if ((lun->flags & CTL_LUN_PRIMARY_SC) || softc->ha_link >= CTL_HA_LINK_UNKNOWN) { inq_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; } else { inq_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | lun->be_lun->lun_type; } } else inq_ptr->device = (SID_QUAL_BAD_LU << 5) | T_NODEVICE; /* RMB in byte 2 is 0 */ inq_ptr->version = SCSI_REV_SPC4; /* * According to SAM-3, even if a device only supports a single * level of LUN addressing, it should still set the HISUP bit: * * 4.9.1 Logical unit numbers overview * * All logical unit number formats described in this standard are * hierarchical in structure even when only a single level in that * hierarchy is used. The HISUP bit shall be set to one in the * standard INQUIRY data (see SPC-2) when any logical unit number * format described in this standard is used. Non-hierarchical * formats are outside the scope of this standard. * * Therefore we set the HiSup bit here. * * The reponse format is 2, per SPC-3. */ inq_ptr->response_format = SID_HiSup | 2; inq_ptr->additional_length = data_len - (offsetof(struct scsi_inquiry_data, additional_length) + 1); CTL_DEBUG_PRINT(("additional_length = %d\n", inq_ptr->additional_length)); inq_ptr->spc3_flags = SPC3_SID_3PC | SPC3_SID_TPGS_IMPLICIT; /* 16 bit addressing */ if (port_type == CTL_PORT_SCSI) inq_ptr->spc2_flags = SPC2_SID_ADDR16; /* XXX set the SID_MultiP bit here if we're actually going to respond on multiple ports */ inq_ptr->spc2_flags |= SPC2_SID_MultiP; /* 16 bit data bus, synchronous transfers */ if (port_type == CTL_PORT_SCSI) inq_ptr->flags = SID_WBus16 | SID_Sync; /* * XXX KDM do we want to support tagged queueing on the control * device at all? */ if ((lun == NULL) || (lun->be_lun->lun_type != T_PROCESSOR)) inq_ptr->flags |= SID_CmdQue; /* * Per SPC-3, unused bytes in ASCII strings are filled with spaces. * We have 8 bytes for the vendor name, and 16 bytes for the device * name and 4 bytes for the revision. */ if (lun == NULL || (val = ctl_get_opt(&lun->be_lun->options, "vendor")) == NULL) { strncpy(inq_ptr->vendor, CTL_VENDOR, sizeof(inq_ptr->vendor)); } else { memset(inq_ptr->vendor, ' ', sizeof(inq_ptr->vendor)); strncpy(inq_ptr->vendor, val, min(sizeof(inq_ptr->vendor), strlen(val))); } if (lun == NULL) { strncpy(inq_ptr->product, CTL_DIRECT_PRODUCT, sizeof(inq_ptr->product)); } else if ((val = ctl_get_opt(&lun->be_lun->options, "product")) == NULL) { switch (lun->be_lun->lun_type) { case T_DIRECT: strncpy(inq_ptr->product, CTL_DIRECT_PRODUCT, sizeof(inq_ptr->product)); break; case T_PROCESSOR: strncpy(inq_ptr->product, CTL_PROCESSOR_PRODUCT, sizeof(inq_ptr->product)); break; default: strncpy(inq_ptr->product, CTL_UNKNOWN_PRODUCT, sizeof(inq_ptr->product)); break; } } else { memset(inq_ptr->product, ' ', sizeof(inq_ptr->product)); strncpy(inq_ptr->product, val, min(sizeof(inq_ptr->product), strlen(val))); } /* * XXX make this a macro somewhere so it automatically gets * incremented when we make changes. */ if (lun == NULL || (val = ctl_get_opt(&lun->be_lun->options, "revision")) == NULL) { strncpy(inq_ptr->revision, "0001", sizeof(inq_ptr->revision)); } else { memset(inq_ptr->revision, ' ', sizeof(inq_ptr->revision)); strncpy(inq_ptr->revision, val, min(sizeof(inq_ptr->revision), strlen(val))); } /* * For parallel SCSI, we support double transition and single * transition clocking. We also support QAS (Quick Arbitration * and Selection) and Information Unit transfers on both the * control and array devices. */ if (port_type == CTL_PORT_SCSI) inq_ptr->spi3data = SID_SPI_CLOCK_DT_ST | SID_SPI_QAS | SID_SPI_IUS; /* SAM-5 (no version claimed) */ scsi_ulto2b(0x00A0, inq_ptr->version1); /* SPC-4 (no version claimed) */ scsi_ulto2b(0x0460, inq_ptr->version2); if (port_type == CTL_PORT_FC) { /* FCP-2 ANSI INCITS.350:2003 */ scsi_ulto2b(0x0917, inq_ptr->version3); } else if (port_type == CTL_PORT_SCSI) { /* SPI-4 ANSI INCITS.362:200x */ scsi_ulto2b(0x0B56, inq_ptr->version3); } else if (port_type == CTL_PORT_ISCSI) { /* iSCSI (no version claimed) */ scsi_ulto2b(0x0960, inq_ptr->version3); } else if (port_type == CTL_PORT_SAS) { /* SAS (no version claimed) */ scsi_ulto2b(0x0BE0, inq_ptr->version3); } if (lun == NULL) { /* SBC-4 (no version claimed) */ scsi_ulto2b(0x0600, inq_ptr->version4); } else { switch (lun->be_lun->lun_type) { case T_DIRECT: /* SBC-4 (no version claimed) */ scsi_ulto2b(0x0600, inq_ptr->version4); break; case T_PROCESSOR: default: break; } } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_inquiry(struct ctl_scsiio *ctsio) { struct scsi_inquiry *cdb; int retval; CTL_DEBUG_PRINT(("ctl_inquiry\n")); cdb = (struct scsi_inquiry *)ctsio->cdb; if (cdb->byte2 & SI_EVPD) retval = ctl_inquiry_evpd(ctsio); else if (cdb->page_code == 0) retval = ctl_inquiry_std(ctsio); else { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } return (retval); } /* * For known CDB types, parse the LBA and length. */ static int ctl_get_lba_len(union ctl_io *io, uint64_t *lba, uint64_t *len) { if (io->io_hdr.io_type != CTL_IO_SCSI) return (1); switch (io->scsiio.cdb[0]) { case COMPARE_AND_WRITE: { struct scsi_compare_and_write *cdb; cdb = (struct scsi_compare_and_write *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = cdb->length; break; } case READ_6: case WRITE_6: { struct scsi_rw_6 *cdb; cdb = (struct scsi_rw_6 *)io->scsiio.cdb; *lba = scsi_3btoul(cdb->addr); /* only 5 bits are valid in the most significant address byte */ *lba &= 0x1fffff; *len = cdb->length; break; } case READ_10: case WRITE_10: { struct scsi_rw_10 *cdb; cdb = (struct scsi_rw_10 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case WRITE_VERIFY_10: { struct scsi_write_verify_10 *cdb; cdb = (struct scsi_write_verify_10 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case READ_12: case WRITE_12: { struct scsi_rw_12 *cdb; cdb = (struct scsi_rw_12 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case WRITE_VERIFY_12: { struct scsi_write_verify_12 *cdb; cdb = (struct scsi_write_verify_12 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case READ_16: case WRITE_16: case WRITE_ATOMIC_16: { struct scsi_rw_16 *cdb; cdb = (struct scsi_rw_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case WRITE_VERIFY_16: { struct scsi_write_verify_16 *cdb; cdb = (struct scsi_write_verify_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case WRITE_SAME_10: { struct scsi_write_same_10 *cdb; cdb = (struct scsi_write_same_10 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case WRITE_SAME_16: { struct scsi_write_same_16 *cdb; cdb = (struct scsi_write_same_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case VERIFY_10: { struct scsi_verify_10 *cdb; cdb = (struct scsi_verify_10 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case VERIFY_12: { struct scsi_verify_12 *cdb; cdb = (struct scsi_verify_12 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case VERIFY_16: { struct scsi_verify_16 *cdb; cdb = (struct scsi_verify_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case UNMAP: { *lba = 0; *len = UINT64_MAX; break; } case SERVICE_ACTION_IN: { /* GET LBA STATUS */ struct scsi_get_lba_status *cdb; cdb = (struct scsi_get_lba_status *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = UINT32_MAX; break; } default: return (1); break; /* NOTREACHED */ } return (0); } static ctl_action ctl_extent_check_lba(uint64_t lba1, uint64_t len1, uint64_t lba2, uint64_t len2, bool seq) { uint64_t endlba1, endlba2; endlba1 = lba1 + len1 - (seq ? 0 : 1); endlba2 = lba2 + len2 - 1; if ((endlba1 < lba2) || (endlba2 < lba1)) return (CTL_ACTION_PASS); else return (CTL_ACTION_BLOCK); } static int ctl_extent_check_unmap(union ctl_io *io, uint64_t lba2, uint64_t len2) { struct ctl_ptr_len_flags *ptrlen; struct scsi_unmap_desc *buf, *end, *range; uint64_t lba; uint32_t len; /* If not UNMAP -- go other way. */ if (io->io_hdr.io_type != CTL_IO_SCSI || io->scsiio.cdb[0] != UNMAP) return (CTL_ACTION_ERROR); /* If UNMAP without data -- block and wait for data. */ ptrlen = (struct ctl_ptr_len_flags *) &io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; if ((io->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0 || ptrlen->ptr == NULL) return (CTL_ACTION_BLOCK); /* UNMAP with data -- check for collision. */ buf = (struct scsi_unmap_desc *)ptrlen->ptr; end = buf + ptrlen->len / sizeof(*buf); for (range = buf; range < end; range++) { lba = scsi_8btou64(range->lba); len = scsi_4btoul(range->length); if ((lba < lba2 + len2) && (lba + len > lba2)) return (CTL_ACTION_BLOCK); } return (CTL_ACTION_PASS); } static ctl_action ctl_extent_check(union ctl_io *io1, union ctl_io *io2, bool seq) { uint64_t lba1, lba2; uint64_t len1, len2; int retval; if (ctl_get_lba_len(io2, &lba2, &len2) != 0) return (CTL_ACTION_ERROR); retval = ctl_extent_check_unmap(io1, lba2, len2); if (retval != CTL_ACTION_ERROR) return (retval); if (ctl_get_lba_len(io1, &lba1, &len1) != 0) return (CTL_ACTION_ERROR); return (ctl_extent_check_lba(lba1, len1, lba2, len2, seq)); } static ctl_action ctl_extent_check_seq(union ctl_io *io1, union ctl_io *io2) { uint64_t lba1, lba2; uint64_t len1, len2; if (ctl_get_lba_len(io1, &lba1, &len1) != 0) return (CTL_ACTION_ERROR); if (ctl_get_lba_len(io2, &lba2, &len2) != 0) return (CTL_ACTION_ERROR); if (lba1 + len1 == lba2) return (CTL_ACTION_BLOCK); return (CTL_ACTION_PASS); } static ctl_action ctl_check_for_blockage(struct ctl_lun *lun, union ctl_io *pending_io, union ctl_io *ooa_io) { const struct ctl_cmd_entry *pending_entry, *ooa_entry; ctl_serialize_action *serialize_row; /* * The initiator attempted multiple untagged commands at the same * time. Can't do that. */ if ((pending_io->scsiio.tag_type == CTL_TAG_UNTAGGED) && (ooa_io->scsiio.tag_type == CTL_TAG_UNTAGGED) && ((pending_io->io_hdr.nexus.targ_port == ooa_io->io_hdr.nexus.targ_port) && (pending_io->io_hdr.nexus.initid == ooa_io->io_hdr.nexus.initid)) && ((ooa_io->io_hdr.flags & (CTL_FLAG_ABORT | CTL_FLAG_STATUS_SENT)) == 0)) return (CTL_ACTION_OVERLAP); /* * The initiator attempted to send multiple tagged commands with * the same ID. (It's fine if different initiators have the same * tag ID.) * * Even if all of those conditions are true, we don't kill the I/O * if the command ahead of us has been aborted. We won't end up * sending it to the FETD, and it's perfectly legal to resend a * command with the same tag number as long as the previous * instance of this tag number has been aborted somehow. */ if ((pending_io->scsiio.tag_type != CTL_TAG_UNTAGGED) && (ooa_io->scsiio.tag_type != CTL_TAG_UNTAGGED) && (pending_io->scsiio.tag_num == ooa_io->scsiio.tag_num) && ((pending_io->io_hdr.nexus.targ_port == ooa_io->io_hdr.nexus.targ_port) && (pending_io->io_hdr.nexus.initid == ooa_io->io_hdr.nexus.initid)) && ((ooa_io->io_hdr.flags & (CTL_FLAG_ABORT | CTL_FLAG_STATUS_SENT)) == 0)) return (CTL_ACTION_OVERLAP_TAG); /* * If we get a head of queue tag, SAM-3 says that we should * immediately execute it. * * What happens if this command would normally block for some other * reason? e.g. a request sense with a head of queue tag * immediately after a write. Normally that would block, but this * will result in its getting executed immediately... * * We currently return "pass" instead of "skip", so we'll end up * going through the rest of the queue to check for overlapped tags. * * XXX KDM check for other types of blockage first?? */ if (pending_io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE) return (CTL_ACTION_PASS); /* * Ordered tags have to block until all items ahead of them * have completed. If we get called with an ordered tag, we always * block, if something else is ahead of us in the queue. */ if (pending_io->scsiio.tag_type == CTL_TAG_ORDERED) return (CTL_ACTION_BLOCK); /* * Simple tags get blocked until all head of queue and ordered tags * ahead of them have completed. I'm lumping untagged commands in * with simple tags here. XXX KDM is that the right thing to do? */ if (((pending_io->scsiio.tag_type == CTL_TAG_UNTAGGED) || (pending_io->scsiio.tag_type == CTL_TAG_SIMPLE)) && ((ooa_io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE) || (ooa_io->scsiio.tag_type == CTL_TAG_ORDERED))) return (CTL_ACTION_BLOCK); pending_entry = ctl_get_cmd_entry(&pending_io->scsiio, NULL); ooa_entry = ctl_get_cmd_entry(&ooa_io->scsiio, NULL); serialize_row = ctl_serialize_table[ooa_entry->seridx]; switch (serialize_row[pending_entry->seridx]) { case CTL_SER_BLOCK: return (CTL_ACTION_BLOCK); case CTL_SER_EXTENT: return (ctl_extent_check(ooa_io, pending_io, (lun->be_lun && lun->be_lun->serseq == CTL_LUN_SERSEQ_ON))); case CTL_SER_EXTENTOPT: if ((lun->mode_pages.control_page[CTL_PAGE_CURRENT].queue_flags & SCP_QUEUE_ALG_MASK) != SCP_QUEUE_ALG_UNRESTRICTED) return (ctl_extent_check(ooa_io, pending_io, (lun->be_lun && lun->be_lun->serseq == CTL_LUN_SERSEQ_ON))); return (CTL_ACTION_PASS); case CTL_SER_EXTENTSEQ: if (lun->be_lun && lun->be_lun->serseq != CTL_LUN_SERSEQ_OFF) return (ctl_extent_check_seq(ooa_io, pending_io)); return (CTL_ACTION_PASS); case CTL_SER_PASS: return (CTL_ACTION_PASS); case CTL_SER_BLOCKOPT: if ((lun->mode_pages.control_page[CTL_PAGE_CURRENT].queue_flags & SCP_QUEUE_ALG_MASK) != SCP_QUEUE_ALG_UNRESTRICTED) return (CTL_ACTION_BLOCK); return (CTL_ACTION_PASS); case CTL_SER_SKIP: return (CTL_ACTION_SKIP); default: panic("invalid serialization value %d", serialize_row[pending_entry->seridx]); } return (CTL_ACTION_ERROR); } /* * Check for blockage or overlaps against the OOA (Order Of Arrival) queue. * Assumptions: * - pending_io is generally either incoming, or on the blocked queue * - starting I/O is the I/O we want to start the check with. */ static ctl_action ctl_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io, union ctl_io *starting_io) { union ctl_io *ooa_io; ctl_action action; mtx_assert(&lun->lun_lock, MA_OWNED); /* * Run back along the OOA queue, starting with the current * blocked I/O and going through every I/O before it on the * queue. If starting_io is NULL, we'll just end up returning * CTL_ACTION_PASS. */ for (ooa_io = starting_io; ooa_io != NULL; ooa_io = (union ctl_io *)TAILQ_PREV(&ooa_io->io_hdr, ctl_ooaq, ooa_links)){ /* * This routine just checks to see whether * cur_blocked is blocked by ooa_io, which is ahead * of it in the queue. It doesn't queue/dequeue * cur_blocked. */ action = ctl_check_for_blockage(lun, pending_io, ooa_io); switch (action) { case CTL_ACTION_BLOCK: case CTL_ACTION_OVERLAP: case CTL_ACTION_OVERLAP_TAG: case CTL_ACTION_SKIP: case CTL_ACTION_ERROR: return (action); break; /* NOTREACHED */ case CTL_ACTION_PASS: break; default: panic("invalid action %d", action); break; /* NOTREACHED */ } } return (CTL_ACTION_PASS); } /* * Assumptions: * - An I/O has just completed, and has been removed from the per-LUN OOA * queue, so some items on the blocked queue may now be unblocked. */ static int ctl_check_blocked(struct ctl_lun *lun) { struct ctl_softc *softc = lun->ctl_softc; union ctl_io *cur_blocked, *next_blocked; mtx_assert(&lun->lun_lock, MA_OWNED); /* * Run forward from the head of the blocked queue, checking each * entry against the I/Os prior to it on the OOA queue to see if * there is still any blockage. * * We cannot use the TAILQ_FOREACH() macro, because it can't deal * with our removing a variable on it while it is traversing the * list. */ for (cur_blocked = (union ctl_io *)TAILQ_FIRST(&lun->blocked_queue); cur_blocked != NULL; cur_blocked = next_blocked) { union ctl_io *prev_ooa; ctl_action action; next_blocked = (union ctl_io *)TAILQ_NEXT(&cur_blocked->io_hdr, blocked_links); prev_ooa = (union ctl_io *)TAILQ_PREV(&cur_blocked->io_hdr, ctl_ooaq, ooa_links); /* * If cur_blocked happens to be the first item in the OOA * queue now, prev_ooa will be NULL, and the action * returned will just be CTL_ACTION_PASS. */ action = ctl_check_ooa(lun, cur_blocked, prev_ooa); switch (action) { case CTL_ACTION_BLOCK: /* Nothing to do here, still blocked */ break; case CTL_ACTION_OVERLAP: case CTL_ACTION_OVERLAP_TAG: /* * This shouldn't happen! In theory we've already * checked this command for overlap... */ break; case CTL_ACTION_PASS: case CTL_ACTION_SKIP: { const struct ctl_cmd_entry *entry; /* * The skip case shouldn't happen, this transaction * should have never made it onto the blocked queue. */ /* * This I/O is no longer blocked, we can remove it * from the blocked queue. Since this is a TAILQ * (doubly linked list), we can do O(1) removals * from any place on the list. */ TAILQ_REMOVE(&lun->blocked_queue, &cur_blocked->io_hdr, blocked_links); cur_blocked->io_hdr.flags &= ~CTL_FLAG_BLOCKED; if ((softc->ha_mode != CTL_HA_MODE_XFER) && (cur_blocked->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)){ /* * Need to send IO back to original side to * run */ union ctl_ha_msg msg_info; cur_blocked->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; msg_info.hdr.original_sc = cur_blocked->io_hdr.original_sc; msg_info.hdr.serializing_sc = cur_blocked; msg_info.hdr.msg_type = CTL_MSG_R2R; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.hdr), M_NOWAIT); break; } entry = ctl_get_cmd_entry(&cur_blocked->scsiio, NULL); /* * Check this I/O for LUN state changes that may * have happened while this command was blocked. * The LUN state may have been changed by a command * ahead of us in the queue, so we need to re-check * for any states that can be caused by SCSI * commands. */ if (ctl_scsiio_lun_check(lun, entry, &cur_blocked->scsiio) == 0) { cur_blocked->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; ctl_enqueue_rtr(cur_blocked); } else ctl_done(cur_blocked); break; } default: /* * This probably shouldn't happen -- we shouldn't * get CTL_ACTION_ERROR, or anything else. */ break; } } return (CTL_RETVAL_COMPLETE); } /* * This routine (with one exception) checks LUN flags that can be set by * commands ahead of us in the OOA queue. These flags have to be checked * when a command initially comes in, and when we pull a command off the * blocked queue and are preparing to execute it. The reason we have to * check these flags for commands on the blocked queue is that the LUN * state may have been changed by a command ahead of us while we're on the * blocked queue. * * Ordering is somewhat important with these checks, so please pay * careful attention to the placement of any new checks. */ static int ctl_scsiio_lun_check(struct ctl_lun *lun, const struct ctl_cmd_entry *entry, struct ctl_scsiio *ctsio) { struct ctl_softc *softc = lun->ctl_softc; int retval; uint32_t residx; retval = 0; mtx_assert(&lun->lun_lock, MA_OWNED); /* * If this shelf is a secondary shelf controller, we may have to * reject some commands disallowed by HA mode and link state. */ if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0) { if (softc->ha_link == CTL_HA_LINK_OFFLINE && (entry->flags & CTL_CMD_FLAG_OK_ON_UNAVAIL) == 0) { ctl_set_lun_unavail(ctsio); retval = 1; goto bailout; } if ((lun->flags & CTL_LUN_PEER_SC_PRIMARY) == 0 && (entry->flags & CTL_CMD_FLAG_OK_ON_UNAVAIL) == 0) { ctl_set_lun_transit(ctsio); retval = 1; goto bailout; } if (softc->ha_mode == CTL_HA_MODE_ACT_STBY && (entry->flags & CTL_CMD_FLAG_OK_ON_STANDBY) == 0) { ctl_set_lun_standby(ctsio); retval = 1; goto bailout; } /* The rest of checks are only done on executing side */ if (softc->ha_mode == CTL_HA_MODE_XFER) goto bailout; } if (entry->pattern & CTL_LUN_PAT_WRITE) { if (lun->be_lun && lun->be_lun->flags & CTL_LUN_FLAG_READONLY) { ctl_set_sense(ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_DATA_PROTECT, /*asc*/ 0x27, /*ascq*/ 0x01, SSD_ELEM_NONE); retval = 1; goto bailout; } if ((lun->mode_pages.control_page[CTL_PAGE_CURRENT] .eca_and_aen & SCP_SWP) != 0) { ctl_set_sense(ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_DATA_PROTECT, /*asc*/ 0x27, /*ascq*/ 0x02, SSD_ELEM_NONE); retval = 1; goto bailout; } } /* * Check for a reservation conflict. If this command isn't allowed * even on reserved LUNs, and if this initiator isn't the one who * reserved us, reject the command with a reservation conflict. */ residx = ctl_get_initindex(&ctsio->io_hdr.nexus); if ((lun->flags & CTL_LUN_RESERVED) && ((entry->flags & CTL_CMD_FLAG_ALLOW_ON_RESV) == 0)) { if (lun->res_idx != residx) { ctl_set_reservation_conflict(ctsio); retval = 1; goto bailout; } } if ((lun->flags & CTL_LUN_PR_RESERVED) == 0 || (entry->flags & CTL_CMD_FLAG_ALLOW_ON_PR_RESV)) { /* No reservation or command is allowed. */; } else if ((entry->flags & CTL_CMD_FLAG_ALLOW_ON_PR_WRESV) && (lun->res_type == SPR_TYPE_WR_EX || lun->res_type == SPR_TYPE_WR_EX_RO || lun->res_type == SPR_TYPE_WR_EX_AR)) { /* The command is allowed for Write Exclusive resv. */; } else { /* * if we aren't registered or it's a res holder type * reservation and this isn't the res holder then set a * conflict. */ if (ctl_get_prkey(lun, residx) == 0 || (residx != lun->pr_res_idx && lun->res_type < 4)) { ctl_set_reservation_conflict(ctsio); retval = 1; goto bailout; } } if ((lun->flags & CTL_LUN_OFFLINE) && ((entry->flags & CTL_CMD_FLAG_OK_ON_STANDBY) == 0)) { ctl_set_lun_not_ready(ctsio); retval = 1; goto bailout; } if ((lun->flags & CTL_LUN_STOPPED) && ((entry->flags & CTL_CMD_FLAG_OK_ON_STOPPED) == 0)) { /* "Logical unit not ready, initializing cmd. required" */ ctl_set_lun_stopped(ctsio); retval = 1; goto bailout; } if ((lun->flags & CTL_LUN_INOPERABLE) && ((entry->flags & CTL_CMD_FLAG_OK_ON_INOPERABLE) == 0)) { /* "Medium format corrupted" */ ctl_set_medium_format_corrupted(ctsio); retval = 1; goto bailout; } bailout: return (retval); } static void ctl_failover_io(union ctl_io *io, int have_lock) { ctl_set_busy(&io->scsiio); ctl_done(io); } static void ctl_failover_lun(struct ctl_lun *lun) { struct ctl_softc *softc = lun->ctl_softc; struct ctl_io_hdr *io, *next_io; CTL_DEBUG_PRINT(("FAILOVER for lun %ju\n", lun->lun)); if (softc->ha_mode == CTL_HA_MODE_XFER) { TAILQ_FOREACH_SAFE(io, &lun->ooa_queue, ooa_links, next_io) { /* We are master */ if (io->flags & CTL_FLAG_FROM_OTHER_SC) { if (io->flags & CTL_FLAG_IO_ACTIVE) { io->flags |= CTL_FLAG_ABORT; } else { /* This can be only due to DATAMOVE */ io->msg_type = CTL_MSG_DATAMOVE_DONE; io->flags |= CTL_FLAG_IO_ACTIVE; io->port_status = 31340; ctl_enqueue_isc((union ctl_io *)io); } } /* We are slave */ if (io->flags & CTL_FLAG_SENT_2OTHER_SC) { io->flags &= ~CTL_FLAG_SENT_2OTHER_SC; if (io->flags & CTL_FLAG_IO_ACTIVE) { io->flags |= CTL_FLAG_FAILOVER; } else { ctl_set_busy(&((union ctl_io *)io)-> scsiio); ctl_done((union ctl_io *)io); } } } } else { /* SERIALIZE modes */ TAILQ_FOREACH_SAFE(io, &lun->blocked_queue, blocked_links, next_io) { /* We are master */ if (io->flags & CTL_FLAG_FROM_OTHER_SC) { TAILQ_REMOVE(&lun->blocked_queue, io, blocked_links); io->flags &= ~CTL_FLAG_BLOCKED; TAILQ_REMOVE(&lun->ooa_queue, io, ooa_links); ctl_free_io((union ctl_io *)io); } } TAILQ_FOREACH_SAFE(io, &lun->ooa_queue, ooa_links, next_io) { /* We are master */ if (io->flags & CTL_FLAG_FROM_OTHER_SC) { TAILQ_REMOVE(&lun->ooa_queue, io, ooa_links); ctl_free_io((union ctl_io *)io); } /* We are slave */ if (io->flags & CTL_FLAG_SENT_2OTHER_SC) { io->flags &= ~CTL_FLAG_SENT_2OTHER_SC; if (!(io->flags & CTL_FLAG_IO_ACTIVE)) { ctl_set_busy(&((union ctl_io *)io)-> scsiio); ctl_done((union ctl_io *)io); } } } ctl_check_blocked(lun); } } static int ctl_scsiio_precheck(struct ctl_softc *softc, struct ctl_scsiio *ctsio) { struct ctl_lun *lun; const struct ctl_cmd_entry *entry; uint32_t initidx, targ_lun; int retval; retval = 0; lun = NULL; targ_lun = ctsio->io_hdr.nexus.targ_mapped_lun; if ((targ_lun < CTL_MAX_LUNS) && ((lun = softc->ctl_luns[targ_lun]) != NULL)) { /* * If the LUN is invalid, pretend that it doesn't exist. * It will go away as soon as all pending I/O has been * completed. */ mtx_lock(&lun->lun_lock); if (lun->flags & CTL_LUN_DISABLED) { mtx_unlock(&lun->lun_lock); lun = NULL; ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr = NULL; ctsio->io_hdr.ctl_private[CTL_PRIV_BACKEND_LUN].ptr = NULL; } else { ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr = lun; ctsio->io_hdr.ctl_private[CTL_PRIV_BACKEND_LUN].ptr = lun->be_lun; /* * Every I/O goes into the OOA queue for a * particular LUN, and stays there until completion. */ #ifdef CTL_TIME_IO if (TAILQ_EMPTY(&lun->ooa_queue)) { lun->idle_time += getsbinuptime() - lun->last_busy; } #endif TAILQ_INSERT_TAIL(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); } } else { ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr = NULL; ctsio->io_hdr.ctl_private[CTL_PRIV_BACKEND_LUN].ptr = NULL; } /* Get command entry and return error if it is unsuppotyed. */ entry = ctl_validate_command(ctsio); if (entry == NULL) { if (lun) mtx_unlock(&lun->lun_lock); return (retval); } ctsio->io_hdr.flags &= ~CTL_FLAG_DATA_MASK; ctsio->io_hdr.flags |= entry->flags & CTL_FLAG_DATA_MASK; /* * Check to see whether we can send this command to LUNs that don't * exist. This should pretty much only be the case for inquiry * and request sense. Further checks, below, really require having * a LUN, so we can't really check the command anymore. Just put * it on the rtr queue. */ if (lun == NULL) { if (entry->flags & CTL_CMD_FLAG_OK_ON_ALL_LUNS) { ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; ctl_enqueue_rtr((union ctl_io *)ctsio); return (retval); } ctl_set_unsupported_lun(ctsio); ctl_done((union ctl_io *)ctsio); CTL_DEBUG_PRINT(("ctl_scsiio_precheck: bailing out due to invalid LUN\n")); return (retval); } else { /* * Make sure we support this particular command on this LUN. * e.g., we don't support writes to the control LUN. */ if (!ctl_cmd_applicable(lun->be_lun->lun_type, entry)) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (retval); } } initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); #ifdef CTL_WITH_CA /* * If we've got a request sense, it'll clear the contingent * allegiance condition. Otherwise, if we have a CA condition for * this initiator, clear it, because it sent down a command other * than request sense. */ if ((ctsio->cdb[0] != REQUEST_SENSE) && (ctl_is_set(lun->have_ca, initidx))) ctl_clear_mask(lun->have_ca, initidx); #endif /* * If the command has this flag set, it handles its own unit * attention reporting, we shouldn't do anything. Otherwise we * check for any pending unit attentions, and send them back to the * initiator. We only do this when a command initially comes in, * not when we pull it off the blocked queue. * * According to SAM-3, section 5.3.2, the order that things get * presented back to the host is basically unit attentions caused * by some sort of reset event, busy status, reservation conflicts * or task set full, and finally any other status. * * One issue here is that some of the unit attentions we report * don't fall into the "reset" category (e.g. "reported luns data * has changed"). So reporting it here, before the reservation * check, may be technically wrong. I guess the only thing to do * would be to check for and report the reset events here, and then * check for the other unit attention types after we check for a * reservation conflict. * * XXX KDM need to fix this */ if ((entry->flags & CTL_CMD_FLAG_NO_SENSE) == 0) { ctl_ua_type ua_type; scsi_sense_data_type sense_format; if (lun->flags & CTL_LUN_SENSE_DESC) sense_format = SSD_TYPE_DESC; else sense_format = SSD_TYPE_FIXED; ua_type = ctl_build_ua(lun, initidx, &ctsio->sense_data, sense_format); if (ua_type != CTL_UA_NONE) { mtx_unlock(&lun->lun_lock); ctsio->scsi_status = SCSI_STATUS_CHECK_COND; ctsio->io_hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE; ctsio->sense_len = SSD_FULL_SIZE; ctl_done((union ctl_io *)ctsio); return (retval); } } if (ctl_scsiio_lun_check(lun, entry, ctsio) != 0) { mtx_unlock(&lun->lun_lock); ctl_done((union ctl_io *)ctsio); return (retval); } /* * XXX CHD this is where we want to send IO to other side if * this LUN is secondary on this SC. We will need to make a copy * of the IO and flag the IO on this side as SENT_2OTHER and the flag * the copy we send as FROM_OTHER. * We also need to stuff the address of the original IO so we can * find it easily. Something similar will need be done on the other * side so when we are done we can find the copy. */ if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0 && (lun->flags & CTL_LUN_PEER_SC_PRIMARY) != 0) { union ctl_ha_msg msg_info; int isc_retval; ctsio->io_hdr.flags |= CTL_FLAG_SENT_2OTHER_SC; ctsio->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; mtx_unlock(&lun->lun_lock); msg_info.hdr.msg_type = CTL_MSG_SERIALIZE; msg_info.hdr.original_sc = (union ctl_io *)ctsio; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.nexus = ctsio->io_hdr.nexus; msg_info.scsi.tag_num = ctsio->tag_num; msg_info.scsi.tag_type = ctsio->tag_type; msg_info.scsi.cdb_len = ctsio->cdb_len; memcpy(msg_info.scsi.cdb, ctsio->cdb, CTL_MAX_CDBLEN); if ((isc_retval = ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.scsi) - sizeof(msg_info.scsi.sense_data), M_WAITOK)) > CTL_HA_STATUS_SUCCESS) { ctl_set_busy(ctsio); ctl_done((union ctl_io *)ctsio); return (retval); } return (retval); } switch (ctl_check_ooa(lun, (union ctl_io *)ctsio, (union ctl_io *)TAILQ_PREV(&ctsio->io_hdr, ctl_ooaq, ooa_links))) { case CTL_ACTION_BLOCK: ctsio->io_hdr.flags |= CTL_FLAG_BLOCKED; TAILQ_INSERT_TAIL(&lun->blocked_queue, &ctsio->io_hdr, blocked_links); mtx_unlock(&lun->lun_lock); return (retval); case CTL_ACTION_PASS: case CTL_ACTION_SKIP: ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; mtx_unlock(&lun->lun_lock); ctl_enqueue_rtr((union ctl_io *)ctsio); break; case CTL_ACTION_OVERLAP: mtx_unlock(&lun->lun_lock); ctl_set_overlapped_cmd(ctsio); ctl_done((union ctl_io *)ctsio); break; case CTL_ACTION_OVERLAP_TAG: mtx_unlock(&lun->lun_lock); ctl_set_overlapped_tag(ctsio, ctsio->tag_num & 0xff); ctl_done((union ctl_io *)ctsio); break; case CTL_ACTION_ERROR: default: mtx_unlock(&lun->lun_lock); ctl_set_internal_failure(ctsio, /*sks_valid*/ 0, /*retry_count*/ 0); ctl_done((union ctl_io *)ctsio); break; } return (retval); } const struct ctl_cmd_entry * ctl_get_cmd_entry(struct ctl_scsiio *ctsio, int *sa) { const struct ctl_cmd_entry *entry; int service_action; entry = &ctl_cmd_table[ctsio->cdb[0]]; if (sa) *sa = ((entry->flags & CTL_CMD_FLAG_SA5) != 0); if (entry->flags & CTL_CMD_FLAG_SA5) { service_action = ctsio->cdb[1] & SERVICE_ACTION_MASK; entry = &((const struct ctl_cmd_entry *) entry->execute)[service_action]; } return (entry); } const struct ctl_cmd_entry * ctl_validate_command(struct ctl_scsiio *ctsio) { const struct ctl_cmd_entry *entry; int i, sa; uint8_t diff; entry = ctl_get_cmd_entry(ctsio, &sa); if (entry->execute == NULL) { if (sa) ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 4); else ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (NULL); } KASSERT(entry->length > 0, ("Not defined length for command 0x%02x/0x%02x", ctsio->cdb[0], ctsio->cdb[1])); for (i = 1; i < entry->length; i++) { diff = ctsio->cdb[i] & ~entry->usage[i - 1]; if (diff == 0) continue; ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ i, /*bit_valid*/ 1, /*bit*/ fls(diff) - 1); ctl_done((union ctl_io *)ctsio); return (NULL); } return (entry); } static int ctl_cmd_applicable(uint8_t lun_type, const struct ctl_cmd_entry *entry) { switch (lun_type) { case T_PROCESSOR: if (((entry->flags & CTL_CMD_FLAG_OK_ON_PROC) == 0) && ((entry->flags & CTL_CMD_FLAG_OK_ON_ALL_LUNS) == 0)) return (0); break; case T_DIRECT: if (((entry->flags & CTL_CMD_FLAG_OK_ON_SLUN) == 0) && ((entry->flags & CTL_CMD_FLAG_OK_ON_ALL_LUNS) == 0)) return (0); break; default: return (0); } return (1); } static int ctl_scsiio(struct ctl_scsiio *ctsio) { int retval; const struct ctl_cmd_entry *entry; retval = CTL_RETVAL_COMPLETE; CTL_DEBUG_PRINT(("ctl_scsiio cdb[0]=%02X\n", ctsio->cdb[0])); entry = ctl_get_cmd_entry(ctsio, NULL); /* * If this I/O has been aborted, just send it straight to * ctl_done() without executing it. */ if (ctsio->io_hdr.flags & CTL_FLAG_ABORT) { ctl_done((union ctl_io *)ctsio); goto bailout; } /* * All the checks should have been handled by ctl_scsiio_precheck(). * We should be clear now to just execute the I/O. */ retval = entry->execute(ctsio); bailout: return (retval); } /* * Since we only implement one target right now, a bus reset simply resets * our single target. */ static int ctl_bus_reset(struct ctl_softc *softc, union ctl_io *io) { return(ctl_target_reset(softc, io, CTL_UA_BUS_RESET)); } static int ctl_target_reset(struct ctl_softc *softc, union ctl_io *io, ctl_ua_type ua_type) { struct ctl_lun *lun; int retval; if (!(io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) { union ctl_ha_msg msg_info; msg_info.hdr.nexus = io->io_hdr.nexus; if (ua_type==CTL_UA_TARG_RESET) msg_info.task.task_action = CTL_TASK_TARGET_RESET; else msg_info.task.task_action = CTL_TASK_BUS_RESET; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.task), M_WAITOK); } retval = 0; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) retval += ctl_lun_reset(lun, io, ua_type); mtx_unlock(&softc->ctl_lock); return (retval); } /* * The LUN should always be set. The I/O is optional, and is used to * distinguish between I/Os sent by this initiator, and by other * initiators. We set unit attention for initiators other than this one. * SAM-3 is vague on this point. It does say that a unit attention should * be established for other initiators when a LUN is reset (see section * 5.7.3), but it doesn't specifically say that the unit attention should * be established for this particular initiator when a LUN is reset. Here * is the relevant text, from SAM-3 rev 8: * * 5.7.2 When a SCSI initiator port aborts its own tasks * * When a SCSI initiator port causes its own task(s) to be aborted, no * notification that the task(s) have been aborted shall be returned to * the SCSI initiator port other than the completion response for the * command or task management function action that caused the task(s) to * be aborted and notification(s) associated with related effects of the * action (e.g., a reset unit attention condition). * * XXX KDM for now, we're setting unit attention for all initiators. */ static int ctl_lun_reset(struct ctl_lun *lun, union ctl_io *io, ctl_ua_type ua_type) { union ctl_io *xio; #if 0 uint32_t initidx; #endif #ifdef CTL_WITH_CA int i; #endif mtx_lock(&lun->lun_lock); /* * Run through the OOA queue and abort each I/O. */ for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL; xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) { xio->io_hdr.flags |= CTL_FLAG_ABORT | CTL_FLAG_ABORT_STATUS; } /* * This version sets unit attention for every */ #if 0 initidx = ctl_get_initindex(&io->io_hdr.nexus); ctl_est_ua_all(lun, initidx, ua_type); #else ctl_est_ua_all(lun, -1, ua_type); #endif /* * A reset (any kind, really) clears reservations established with * RESERVE/RELEASE. It does not clear reservations established * with PERSISTENT RESERVE OUT, but we don't support that at the * moment anyway. See SPC-2, section 5.6. SPC-3 doesn't address * reservations made with the RESERVE/RELEASE commands, because * those commands are obsolete in SPC-3. */ lun->flags &= ~CTL_LUN_RESERVED; #ifdef CTL_WITH_CA for (i = 0; i < CTL_MAX_INITIATORS; i++) ctl_clear_mask(lun->have_ca, i); #endif mtx_unlock(&lun->lun_lock); return (0); } static void ctl_abort_tasks_lun(struct ctl_lun *lun, uint32_t targ_port, uint32_t init_id, int other_sc) { union ctl_io *xio; mtx_assert(&lun->lun_lock, MA_OWNED); /* * Run through the OOA queue and attempt to find the given I/O. * The target port, initiator ID, tag type and tag number have to * match the values that we got from the initiator. If we have an * untagged command to abort, simply abort the first untagged command * we come to. We only allow one untagged command at a time of course. */ for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL; xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) { if ((targ_port == UINT32_MAX || targ_port == xio->io_hdr.nexus.targ_port) && (init_id == UINT32_MAX || init_id == xio->io_hdr.nexus.initid)) { if (targ_port != xio->io_hdr.nexus.targ_port || init_id != xio->io_hdr.nexus.initid) xio->io_hdr.flags |= CTL_FLAG_ABORT_STATUS; xio->io_hdr.flags |= CTL_FLAG_ABORT; if (!other_sc && !(lun->flags & CTL_LUN_PRIMARY_SC)) { union ctl_ha_msg msg_info; msg_info.hdr.nexus = xio->io_hdr.nexus; msg_info.task.task_action = CTL_TASK_ABORT_TASK; msg_info.task.tag_num = xio->scsiio.tag_num; msg_info.task.tag_type = xio->scsiio.tag_type; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.task), M_NOWAIT); } } } } static int ctl_abort_task_set(union ctl_io *io) { struct ctl_softc *softc = control_softc; struct ctl_lun *lun; uint32_t targ_lun; /* * Look up the LUN. */ targ_lun = io->io_hdr.nexus.targ_mapped_lun; mtx_lock(&softc->ctl_lock); if ((targ_lun < CTL_MAX_LUNS) && (softc->ctl_luns[targ_lun] != NULL)) lun = softc->ctl_luns[targ_lun]; else { mtx_unlock(&softc->ctl_lock); return (1); } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if (io->taskio.task_action == CTL_TASK_ABORT_TASK_SET) { ctl_abort_tasks_lun(lun, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.initid, (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) != 0); } else { /* CTL_TASK_CLEAR_TASK_SET */ ctl_abort_tasks_lun(lun, UINT32_MAX, UINT32_MAX, (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) != 0); } mtx_unlock(&lun->lun_lock); return (0); } static int ctl_i_t_nexus_reset(union ctl_io *io) { struct ctl_softc *softc = control_softc; struct ctl_lun *lun; uint32_t initidx; + if (!(io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) { + union ctl_ha_msg msg_info; + + msg_info.hdr.nexus = io->io_hdr.nexus; + msg_info.task.task_action = CTL_TASK_I_T_NEXUS_RESET; + msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; + msg_info.hdr.original_sc = NULL; + msg_info.hdr.serializing_sc = NULL; + ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, + sizeof(msg_info.task), M_WAITOK); + } + initidx = ctl_get_initindex(&io->io_hdr.nexus); mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_lock(&lun->lun_lock); ctl_abort_tasks_lun(lun, io->io_hdr.nexus.targ_port, - io->io_hdr.nexus.initid, - (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) != 0); + io->io_hdr.nexus.initid, 1); #ifdef CTL_WITH_CA ctl_clear_mask(lun->have_ca, initidx); #endif if ((lun->flags & CTL_LUN_RESERVED) && (lun->res_idx == initidx)) lun->flags &= ~CTL_LUN_RESERVED; ctl_est_ua(lun, initidx, CTL_UA_I_T_NEXUS_LOSS); mtx_unlock(&lun->lun_lock); } mtx_unlock(&softc->ctl_lock); return (0); } static int ctl_abort_task(union ctl_io *io) { union ctl_io *xio; struct ctl_lun *lun; struct ctl_softc *softc; #if 0 struct sbuf sb; char printbuf[128]; #endif int found; uint32_t targ_lun; softc = control_softc; found = 0; /* * Look up the LUN. */ targ_lun = io->io_hdr.nexus.targ_mapped_lun; mtx_lock(&softc->ctl_lock); if ((targ_lun < CTL_MAX_LUNS) && (softc->ctl_luns[targ_lun] != NULL)) lun = softc->ctl_luns[targ_lun]; else { mtx_unlock(&softc->ctl_lock); return (1); } #if 0 printf("ctl_abort_task: called for lun %lld, tag %d type %d\n", lun->lun, io->taskio.tag_num, io->taskio.tag_type); #endif mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); /* * Run through the OOA queue and attempt to find the given I/O. * The target port, initiator ID, tag type and tag number have to * match the values that we got from the initiator. If we have an * untagged command to abort, simply abort the first untagged command * we come to. We only allow one untagged command at a time of course. */ for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL; xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) { #if 0 sbuf_new(&sb, printbuf, sizeof(printbuf), SBUF_FIXEDLEN); sbuf_printf(&sb, "LUN %lld tag %d type %d%s%s%s%s: ", lun->lun, xio->scsiio.tag_num, xio->scsiio.tag_type, (xio->io_hdr.blocked_links.tqe_prev == NULL) ? "" : " BLOCKED", (xio->io_hdr.flags & CTL_FLAG_DMA_INPROG) ? " DMA" : "", (xio->io_hdr.flags & CTL_FLAG_ABORT) ? " ABORT" : "", (xio->io_hdr.flags & CTL_FLAG_IS_WAS_ON_RTR ? " RTR" : "")); ctl_scsi_command_string(&xio->scsiio, NULL, &sb); sbuf_finish(&sb); printf("%s\n", sbuf_data(&sb)); #endif if ((xio->io_hdr.nexus.targ_port != io->io_hdr.nexus.targ_port) || (xio->io_hdr.nexus.initid != io->io_hdr.nexus.initid) || (xio->io_hdr.flags & CTL_FLAG_ABORT)) continue; /* * If the abort says that the task is untagged, the * task in the queue must be untagged. Otherwise, * we just check to see whether the tag numbers * match. This is because the QLogic firmware * doesn't pass back the tag type in an abort * request. */ #if 0 if (((xio->scsiio.tag_type == CTL_TAG_UNTAGGED) && (io->taskio.tag_type == CTL_TAG_UNTAGGED)) || (xio->scsiio.tag_num == io->taskio.tag_num)) #endif /* * XXX KDM we've got problems with FC, because it * doesn't send down a tag type with aborts. So we * can only really go by the tag number... * This may cause problems with parallel SCSI. * Need to figure that out!! */ if (xio->scsiio.tag_num == io->taskio.tag_num) { xio->io_hdr.flags |= CTL_FLAG_ABORT; found = 1; if ((io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) == 0 && !(lun->flags & CTL_LUN_PRIMARY_SC)) { union ctl_ha_msg msg_info; msg_info.hdr.nexus = io->io_hdr.nexus; msg_info.task.task_action = CTL_TASK_ABORT_TASK; msg_info.task.tag_num = io->taskio.tag_num; msg_info.task.tag_type = io->taskio.tag_type; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; #if 0 printf("Sent Abort to other side\n"); #endif ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.task), M_NOWAIT); } #if 0 printf("ctl_abort_task: found I/O to abort\n"); #endif } } mtx_unlock(&lun->lun_lock); if (found == 0) { /* * This isn't really an error. It's entirely possible for * the abort and command completion to cross on the wire. * This is more of an informative/diagnostic error. */ #if 0 printf("ctl_abort_task: ABORT sent for nonexistent I/O: " "%u:%u:%u tag %d type %d\n", io->io_hdr.nexus.initid, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.targ_lun, io->taskio.tag_num, io->taskio.tag_type); #endif } return (0); } static void ctl_run_task(union ctl_io *io) { struct ctl_softc *softc = control_softc; int retval = 1; const char *task_desc; CTL_DEBUG_PRINT(("ctl_run_task\n")); KASSERT(io->io_hdr.io_type == CTL_IO_TASK, ("ctl_run_task: Unextected io_type %d\n", io->io_hdr.io_type)); task_desc = ctl_scsi_task_string(&io->taskio); if (task_desc != NULL) { #ifdef NEEDTOPORT csevent_log(CSC_CTL | CSC_SHELF_SW | CTL_TASK_REPORT, csevent_LogType_Trace, csevent_Severity_Information, csevent_AlertLevel_Green, csevent_FRU_Firmware, csevent_FRU_Unknown, "CTL: received task: %s",task_desc); #endif } else { #ifdef NEEDTOPORT csevent_log(CSC_CTL | CSC_SHELF_SW | CTL_TASK_REPORT, csevent_LogType_Trace, csevent_Severity_Information, csevent_AlertLevel_Green, csevent_FRU_Firmware, csevent_FRU_Unknown, "CTL: received unknown task " "type: %d (%#x)", io->taskio.task_action, io->taskio.task_action); #endif } switch (io->taskio.task_action) { case CTL_TASK_ABORT_TASK: retval = ctl_abort_task(io); break; case CTL_TASK_ABORT_TASK_SET: case CTL_TASK_CLEAR_TASK_SET: retval = ctl_abort_task_set(io); break; case CTL_TASK_CLEAR_ACA: break; case CTL_TASK_I_T_NEXUS_RESET: retval = ctl_i_t_nexus_reset(io); break; case CTL_TASK_LUN_RESET: { struct ctl_lun *lun; uint32_t targ_lun; targ_lun = io->io_hdr.nexus.targ_mapped_lun; mtx_lock(&softc->ctl_lock); if ((targ_lun < CTL_MAX_LUNS) && (softc->ctl_luns[targ_lun] != NULL)) lun = softc->ctl_luns[targ_lun]; else { mtx_unlock(&softc->ctl_lock); retval = 1; break; } retval = ctl_lun_reset(lun, io, CTL_UA_LUN_RESET); mtx_unlock(&softc->ctl_lock); if ((io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) == 0) { union ctl_ha_msg msg_info; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.nexus = io->io_hdr.nexus; msg_info.task.task_action = CTL_TASK_LUN_RESET; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.task), M_WAITOK); } break; } case CTL_TASK_TARGET_RESET: retval = ctl_target_reset(softc, io, CTL_UA_TARG_RESET); break; case CTL_TASK_BUS_RESET: retval = ctl_bus_reset(softc, io); break; case CTL_TASK_PORT_LOGIN: break; case CTL_TASK_PORT_LOGOUT: break; default: printf("ctl_run_task: got unknown task management event %d\n", io->taskio.task_action); break; } if (retval == 0) io->io_hdr.status = CTL_SUCCESS; else io->io_hdr.status = CTL_ERROR; ctl_done(io); } /* * For HA operation. Handle commands that come in from the other * controller. */ static void ctl_handle_isc(union ctl_io *io) { int free_io; struct ctl_lun *lun; struct ctl_softc *softc; uint32_t targ_lun; softc = control_softc; targ_lun = io->io_hdr.nexus.targ_mapped_lun; lun = softc->ctl_luns[targ_lun]; switch (io->io_hdr.msg_type) { case CTL_MSG_SERIALIZE: free_io = ctl_serialize_other_sc_cmd(&io->scsiio); break; case CTL_MSG_R2R: { const struct ctl_cmd_entry *entry; /* * This is only used in SER_ONLY mode. */ free_io = 0; entry = ctl_get_cmd_entry(&io->scsiio, NULL); mtx_lock(&lun->lun_lock); if (ctl_scsiio_lun_check(lun, entry, (struct ctl_scsiio *)io) != 0) { mtx_unlock(&lun->lun_lock); ctl_done(io); break; } io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; mtx_unlock(&lun->lun_lock); ctl_enqueue_rtr(io); break; } case CTL_MSG_FINISH_IO: if (softc->ha_mode == CTL_HA_MODE_XFER) { free_io = 0; ctl_done(io); } else { free_io = 1; mtx_lock(&lun->lun_lock); TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links); ctl_check_blocked(lun); mtx_unlock(&lun->lun_lock); } break; case CTL_MSG_PERS_ACTION: ctl_hndl_per_res_out_on_other_sc( (union ctl_ha_msg *)&io->presio.pr_msg); free_io = 1; break; case CTL_MSG_BAD_JUJU: free_io = 0; ctl_done(io); break; case CTL_MSG_DATAMOVE: /* Only used in XFER mode */ free_io = 0; ctl_datamove_remote(io); break; case CTL_MSG_DATAMOVE_DONE: /* Only used in XFER mode */ free_io = 0; io->scsiio.be_move_done(io); break; case CTL_MSG_FAILOVER: mtx_lock(&lun->lun_lock); ctl_failover_lun(lun); mtx_unlock(&lun->lun_lock); free_io = 1; break; default: free_io = 1; printf("%s: Invalid message type %d\n", __func__, io->io_hdr.msg_type); break; } if (free_io) ctl_free_io(io); } /* * Returns the match type in the case of a match, or CTL_LUN_PAT_NONE if * there is no match. */ static ctl_lun_error_pattern ctl_cmd_pattern_match(struct ctl_scsiio *ctsio, struct ctl_error_desc *desc) { const struct ctl_cmd_entry *entry; ctl_lun_error_pattern filtered_pattern, pattern; pattern = desc->error_pattern; /* * XXX KDM we need more data passed into this function to match a * custom pattern, and we actually need to implement custom pattern * matching. */ if (pattern & CTL_LUN_PAT_CMD) return (CTL_LUN_PAT_CMD); if ((pattern & CTL_LUN_PAT_MASK) == CTL_LUN_PAT_ANY) return (CTL_LUN_PAT_ANY); entry = ctl_get_cmd_entry(ctsio, NULL); filtered_pattern = entry->pattern & pattern; /* * If the user requested specific flags in the pattern (e.g. * CTL_LUN_PAT_RANGE), make sure the command supports all of those * flags. * * If the user did not specify any flags, it doesn't matter whether * or not the command supports the flags. */ if ((filtered_pattern & ~CTL_LUN_PAT_MASK) != (pattern & ~CTL_LUN_PAT_MASK)) return (CTL_LUN_PAT_NONE); /* * If the user asked for a range check, see if the requested LBA * range overlaps with this command's LBA range. */ if (filtered_pattern & CTL_LUN_PAT_RANGE) { uint64_t lba1; uint64_t len1; ctl_action action; int retval; retval = ctl_get_lba_len((union ctl_io *)ctsio, &lba1, &len1); if (retval != 0) return (CTL_LUN_PAT_NONE); action = ctl_extent_check_lba(lba1, len1, desc->lba_range.lba, desc->lba_range.len, FALSE); /* * A "pass" means that the LBA ranges don't overlap, so * this doesn't match the user's range criteria. */ if (action == CTL_ACTION_PASS) return (CTL_LUN_PAT_NONE); } return (filtered_pattern); } static void ctl_inject_error(struct ctl_lun *lun, union ctl_io *io) { struct ctl_error_desc *desc, *desc2; mtx_assert(&lun->lun_lock, MA_OWNED); STAILQ_FOREACH_SAFE(desc, &lun->error_list, links, desc2) { ctl_lun_error_pattern pattern; /* * Check to see whether this particular command matches * the pattern in the descriptor. */ pattern = ctl_cmd_pattern_match(&io->scsiio, desc); if ((pattern & CTL_LUN_PAT_MASK) == CTL_LUN_PAT_NONE) continue; switch (desc->lun_error & CTL_LUN_INJ_TYPE) { case CTL_LUN_INJ_ABORTED: ctl_set_aborted(&io->scsiio); break; case CTL_LUN_INJ_MEDIUM_ERR: ctl_set_medium_error(&io->scsiio); break; case CTL_LUN_INJ_UA: /* 29h/00h POWER ON, RESET, OR BUS DEVICE RESET * OCCURRED */ ctl_set_ua(&io->scsiio, 0x29, 0x00); break; case CTL_LUN_INJ_CUSTOM: /* * We're assuming the user knows what he is doing. * Just copy the sense information without doing * checks. */ bcopy(&desc->custom_sense, &io->scsiio.sense_data, MIN(sizeof(desc->custom_sense), sizeof(io->scsiio.sense_data))); io->scsiio.scsi_status = SCSI_STATUS_CHECK_COND; io->scsiio.sense_len = SSD_FULL_SIZE; io->io_hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE; break; case CTL_LUN_INJ_NONE: default: /* * If this is an error injection type we don't know * about, clear the continuous flag (if it is set) * so it will get deleted below. */ desc->lun_error &= ~CTL_LUN_INJ_CONTINUOUS; break; } /* * By default, each error injection action is a one-shot */ if (desc->lun_error & CTL_LUN_INJ_CONTINUOUS) continue; STAILQ_REMOVE(&lun->error_list, desc, ctl_error_desc, links); free(desc, M_CTL); } } #ifdef CTL_IO_DELAY static void ctl_datamove_timer_wakeup(void *arg) { union ctl_io *io; io = (union ctl_io *)arg; ctl_datamove(io); } #endif /* CTL_IO_DELAY */ void ctl_datamove(union ctl_io *io) { void (*fe_datamove)(union ctl_io *io); mtx_assert(&control_softc->ctl_lock, MA_NOTOWNED); CTL_DEBUG_PRINT(("ctl_datamove\n")); #ifdef CTL_TIME_IO if ((time_uptime - io->io_hdr.start_time) > ctl_time_io_secs) { char str[256]; char path_str[64]; struct sbuf sb; ctl_scsi_path_string(io, path_str, sizeof(path_str)); sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN); sbuf_cat(&sb, path_str); switch (io->io_hdr.io_type) { case CTL_IO_SCSI: ctl_scsi_command_string(&io->scsiio, NULL, &sb); sbuf_printf(&sb, "\n"); sbuf_cat(&sb, path_str); sbuf_printf(&sb, "Tag: 0x%04x, type %d\n", io->scsiio.tag_num, io->scsiio.tag_type); break; case CTL_IO_TASK: sbuf_printf(&sb, "Task I/O type: %d, Tag: 0x%04x, " "Tag Type: %d\n", io->taskio.task_action, io->taskio.tag_num, io->taskio.tag_type); break; default: printf("Invalid CTL I/O type %d\n", io->io_hdr.io_type); panic("Invalid CTL I/O type %d\n", io->io_hdr.io_type); break; } sbuf_cat(&sb, path_str); sbuf_printf(&sb, "ctl_datamove: %jd seconds\n", (intmax_t)time_uptime - io->io_hdr.start_time); sbuf_finish(&sb); printf("%s", sbuf_data(&sb)); } #endif /* CTL_TIME_IO */ #ifdef CTL_IO_DELAY if (io->io_hdr.flags & CTL_FLAG_DELAY_DONE) { io->io_hdr.flags &= ~CTL_FLAG_DELAY_DONE; } else { struct ctl_lun *lun; lun =(struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; if ((lun != NULL) && (lun->delay_info.datamove_delay > 0)) { callout_init(&io->io_hdr.delay_callout, /*mpsafe*/ 1); io->io_hdr.flags |= CTL_FLAG_DELAY_DONE; callout_reset(&io->io_hdr.delay_callout, lun->delay_info.datamove_delay * hz, ctl_datamove_timer_wakeup, io); if (lun->delay_info.datamove_type == CTL_DELAY_TYPE_ONESHOT) lun->delay_info.datamove_delay = 0; return; } } #endif /* * This command has been aborted. Set the port status, so we fail * the data move. */ if (io->io_hdr.flags & CTL_FLAG_ABORT) { printf("ctl_datamove: tag 0x%04x on (%u:%u:%u) aborted\n", io->scsiio.tag_num, io->io_hdr.nexus.initid, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.targ_lun); io->io_hdr.port_status = 31337; /* * Note that the backend, in this case, will get the * callback in its context. In other cases it may get * called in the frontend's interrupt thread context. */ io->scsiio.be_move_done(io); return; } /* Don't confuse frontend with zero length data move. */ if (io->scsiio.kern_data_len == 0) { io->scsiio.be_move_done(io); return; } /* * If we're in XFER mode and this I/O is from the other shelf * controller, we need to send the DMA to the other side to * actually transfer the data to/from the host. In serialize only * mode the transfer happens below CTL and ctl_datamove() is only * called on the machine that originally received the I/O. */ if ((control_softc->ha_mode == CTL_HA_MODE_XFER) && (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) { union ctl_ha_msg msg; uint32_t sg_entries_sent; int do_sg_copy; int i; memset(&msg, 0, sizeof(msg)); msg.hdr.msg_type = CTL_MSG_DATAMOVE; msg.hdr.original_sc = io->io_hdr.original_sc; msg.hdr.serializing_sc = io; msg.hdr.nexus = io->io_hdr.nexus; msg.dt.flags = io->io_hdr.flags; /* * We convert everything into a S/G list here. We can't * pass by reference, only by value between controllers. * So we can't pass a pointer to the S/G list, only as many * S/G entries as we can fit in here. If it's possible for * us to get more than CTL_HA_MAX_SG_ENTRIES S/G entries, * then we need to break this up into multiple transfers. */ if (io->scsiio.kern_sg_entries == 0) { msg.dt.kern_sg_entries = 1; #if 0 /* * Convert to a physical address if this is a * virtual address. */ if (io->io_hdr.flags & CTL_FLAG_BUS_ADDR) { msg.dt.sg_list[0].addr = io->scsiio.kern_data_ptr; } else { /* * XXX KDM use busdma here! */ msg.dt.sg_list[0].addr = (void *) vtophys(io->scsiio.kern_data_ptr); } #else KASSERT((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0, ("HA does not support BUS_ADDR")); msg.dt.sg_list[0].addr = io->scsiio.kern_data_ptr; #endif msg.dt.sg_list[0].len = io->scsiio.kern_data_len; do_sg_copy = 0; } else { msg.dt.kern_sg_entries = io->scsiio.kern_sg_entries; do_sg_copy = 1; } msg.dt.kern_data_len = io->scsiio.kern_data_len; msg.dt.kern_total_len = io->scsiio.kern_total_len; msg.dt.kern_data_resid = io->scsiio.kern_data_resid; msg.dt.kern_rel_offset = io->scsiio.kern_rel_offset; msg.dt.sg_sequence = 0; /* * Loop until we've sent all of the S/G entries. On the * other end, we'll recompose these S/G entries into one * contiguous list before passing it to the */ for (sg_entries_sent = 0; sg_entries_sent < msg.dt.kern_sg_entries; msg.dt.sg_sequence++) { msg.dt.cur_sg_entries = MIN((sizeof(msg.dt.sg_list)/ sizeof(msg.dt.sg_list[0])), msg.dt.kern_sg_entries - sg_entries_sent); if (do_sg_copy != 0) { struct ctl_sg_entry *sgl; int j; sgl = (struct ctl_sg_entry *) io->scsiio.kern_data_ptr; /* * If this is in cached memory, flush the cache * before we send the DMA request to the other * controller. We want to do this in either * the * read or the write case. The read * case is straightforward. In the write * case, we want to make sure nothing is * in the local cache that could overwrite * the DMAed data. */ for (i = sg_entries_sent, j = 0; i < msg.dt.cur_sg_entries; i++, j++) { #if 0 if ((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0) { /* * XXX KDM use busdma. */ msg.dt.sg_list[j].addr =(void *) vtophys(sgl[i].addr); } else { msg.dt.sg_list[j].addr = sgl[i].addr; } #else KASSERT((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0, ("HA does not support BUS_ADDR")); msg.dt.sg_list[j].addr = sgl[i].addr; #endif msg.dt.sg_list[j].len = sgl[i].len; } } sg_entries_sent += msg.dt.cur_sg_entries; if (sg_entries_sent >= msg.dt.kern_sg_entries) msg.dt.sg_last = 1; else msg.dt.sg_last = 0; if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.dt) - sizeof(msg.dt.sg_list) + sizeof(struct ctl_sg_entry)*msg.dt.cur_sg_entries, M_WAITOK) > CTL_HA_STATUS_SUCCESS) { io->io_hdr.port_status = 31341; io->scsiio.be_move_done(io); return; } msg.dt.sent_sg_entries = sg_entries_sent; } io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; } else { /* * Lookup the fe_datamove() function for this particular * front end. */ fe_datamove = ctl_io_port(&io->io_hdr)->fe_datamove; fe_datamove(io); } } static void ctl_send_datamove_done(union ctl_io *io, int have_lock) { union ctl_ha_msg msg; memset(&msg, 0, sizeof(msg)); msg.hdr.msg_type = CTL_MSG_DATAMOVE_DONE; msg.hdr.original_sc = io; msg.hdr.serializing_sc = io->io_hdr.serializing_sc; msg.hdr.nexus = io->io_hdr.nexus; msg.hdr.status = io->io_hdr.status; msg.scsi.tag_num = io->scsiio.tag_num; msg.scsi.tag_type = io->scsiio.tag_type; msg.scsi.scsi_status = io->scsiio.scsi_status; memcpy(&msg.scsi.sense_data, &io->scsiio.sense_data, io->scsiio.sense_len); msg.scsi.sense_len = io->scsiio.sense_len; msg.scsi.sense_residual = io->scsiio.sense_residual; msg.scsi.fetd_status = io->io_hdr.port_status; msg.scsi.residual = io->scsiio.residual; io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; if (io->io_hdr.flags & CTL_FLAG_FAILOVER) { ctl_failover_io(io, /*have_lock*/ have_lock); return; } ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.scsi) - sizeof(msg.scsi.sense_data) + msg.scsi.sense_len, M_WAITOK); } /* * The DMA to the remote side is done, now we need to tell the other side * we're done so it can continue with its data movement. */ static void ctl_datamove_remote_write_cb(struct ctl_ha_dt_req *rq) { union ctl_io *io; int i; io = rq->context; if (rq->ret != CTL_HA_STATUS_SUCCESS) { printf("%s: ISC DMA write failed with error %d", __func__, rq->ret); ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ rq->ret); } ctl_dt_req_free(rq); for (i = 0; i < io->scsiio.kern_sg_entries; i++) free(io->io_hdr.local_sglist[i].addr, M_CTL); free(io->io_hdr.remote_sglist, M_CTL); io->io_hdr.remote_sglist = NULL; io->io_hdr.local_sglist = NULL; /* * The data is in local and remote memory, so now we need to send * status (good or back) back to the other side. */ ctl_send_datamove_done(io, /*have_lock*/ 0); } /* * We've moved the data from the host/controller into local memory. Now we * need to push it over to the remote controller's memory. */ static int ctl_datamove_remote_dm_write_cb(union ctl_io *io) { int retval; retval = 0; retval = ctl_datamove_remote_xfer(io, CTL_HA_DT_CMD_WRITE, ctl_datamove_remote_write_cb); return (retval); } static void ctl_datamove_remote_write(union ctl_io *io) { int retval; void (*fe_datamove)(union ctl_io *io); /* * - Get the data from the host/HBA into local memory. * - DMA memory from the local controller to the remote controller. * - Send status back to the remote controller. */ retval = ctl_datamove_remote_sgl_setup(io); if (retval != 0) return; /* Switch the pointer over so the FETD knows what to do */ io->scsiio.kern_data_ptr = (uint8_t *)io->io_hdr.local_sglist; /* * Use a custom move done callback, since we need to send completion * back to the other controller, not to the backend on this side. */ io->scsiio.be_move_done = ctl_datamove_remote_dm_write_cb; fe_datamove = ctl_io_port(&io->io_hdr)->fe_datamove; fe_datamove(io); return; } static int ctl_datamove_remote_dm_read_cb(union ctl_io *io) { #if 0 char str[256]; char path_str[64]; struct sbuf sb; #endif int i; for (i = 0; i < io->scsiio.kern_sg_entries; i++) free(io->io_hdr.local_sglist[i].addr, M_CTL); free(io->io_hdr.remote_sglist, M_CTL); io->io_hdr.remote_sglist = NULL; io->io_hdr.local_sglist = NULL; #if 0 scsi_path_string(io, path_str, sizeof(path_str)); sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN); sbuf_cat(&sb, path_str); scsi_command_string(&io->scsiio, NULL, &sb); sbuf_printf(&sb, "\n"); sbuf_cat(&sb, path_str); sbuf_printf(&sb, "Tag: 0x%04x, type %d\n", io->scsiio.tag_num, io->scsiio.tag_type); sbuf_cat(&sb, path_str); sbuf_printf(&sb, "%s: flags %#x, status %#x\n", __func__, io->io_hdr.flags, io->io_hdr.status); sbuf_finish(&sb); printk("%s", sbuf_data(&sb)); #endif /* * The read is done, now we need to send status (good or bad) back * to the other side. */ ctl_send_datamove_done(io, /*have_lock*/ 0); return (0); } static void ctl_datamove_remote_read_cb(struct ctl_ha_dt_req *rq) { union ctl_io *io; void (*fe_datamove)(union ctl_io *io); io = rq->context; if (rq->ret != CTL_HA_STATUS_SUCCESS) { printf("%s: ISC DMA read failed with error %d\n", __func__, rq->ret); ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ rq->ret); } ctl_dt_req_free(rq); /* Switch the pointer over so the FETD knows what to do */ io->scsiio.kern_data_ptr = (uint8_t *)io->io_hdr.local_sglist; /* * Use a custom move done callback, since we need to send completion * back to the other controller, not to the backend on this side. */ io->scsiio.be_move_done = ctl_datamove_remote_dm_read_cb; /* XXX KDM add checks like the ones in ctl_datamove? */ fe_datamove = ctl_io_port(&io->io_hdr)->fe_datamove; fe_datamove(io); } static int ctl_datamove_remote_sgl_setup(union ctl_io *io) { struct ctl_sg_entry *local_sglist, *remote_sglist; struct ctl_softc *softc; uint32_t len_to_go; int retval; int i; retval = 0; softc = control_softc; local_sglist = io->io_hdr.local_sglist; remote_sglist = io->io_hdr.remote_sglist; len_to_go = io->scsiio.kern_data_len; /* * The difficult thing here is that the size of the various * S/G segments may be different than the size from the * remote controller. That'll make it harder when DMAing * the data back to the other side. */ for (i = 0; len_to_go > 0; i++) { local_sglist[i].len = MIN(len_to_go, CTL_HA_DATAMOVE_SEGMENT); local_sglist[i].addr = malloc(local_sglist[i].len, M_CTL, M_WAITOK); len_to_go -= local_sglist[i].len; } /* * Reset the number of S/G entries accordingly. The original * number of S/G entries is available in rem_sg_entries. */ io->scsiio.kern_sg_entries = i; #if 0 printf("%s: kern_sg_entries = %d\n", __func__, io->scsiio.kern_sg_entries); for (i = 0; i < io->scsiio.kern_sg_entries; i++) printf("%s: sg[%d] = %p, %d\n", __func__, i, local_sglist[i].addr, local_sglist[i].len); #endif return (retval); } static int ctl_datamove_remote_xfer(union ctl_io *io, unsigned command, ctl_ha_dt_cb callback) { struct ctl_ha_dt_req *rq; struct ctl_sg_entry *remote_sglist, *local_sglist; uint32_t local_used, remote_used, total_used; int i, j, isc_ret; rq = ctl_dt_req_alloc(); /* * If we failed to allocate the request, and if the DMA didn't fail * anyway, set busy status. This is just a resource allocation * failure. */ if ((rq == NULL) && ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) ctl_set_busy(&io->scsiio); if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE) { if (rq != NULL) ctl_dt_req_free(rq); /* * The data move failed. We need to return status back * to the other controller. No point in trying to DMA * data to the remote controller. */ ctl_send_datamove_done(io, /*have_lock*/ 0); return (1); } local_sglist = io->io_hdr.local_sglist; remote_sglist = io->io_hdr.remote_sglist; local_used = 0; remote_used = 0; total_used = 0; /* * Pull/push the data over the wire from/to the other controller. * This takes into account the possibility that the local and * remote sglists may not be identical in terms of the size of * the elements and the number of elements. * * One fundamental assumption here is that the length allocated for * both the local and remote sglists is identical. Otherwise, we've * essentially got a coding error of some sort. */ isc_ret = CTL_HA_STATUS_SUCCESS; for (i = 0, j = 0; total_used < io->scsiio.kern_data_len; ) { uint32_t cur_len; uint8_t *tmp_ptr; rq->command = command; rq->context = io; /* * Both pointers should be aligned. But it is possible * that the allocation length is not. They should both * also have enough slack left over at the end, though, * to round up to the next 8 byte boundary. */ cur_len = MIN(local_sglist[i].len - local_used, remote_sglist[j].len - remote_used); rq->size = cur_len; tmp_ptr = (uint8_t *)local_sglist[i].addr; tmp_ptr += local_used; #if 0 /* Use physical addresses when talking to ISC hardware */ if ((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0) { /* XXX KDM use busdma */ rq->local = vtophys(tmp_ptr); } else rq->local = tmp_ptr; #else KASSERT((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0, ("HA does not support BUS_ADDR")); rq->local = tmp_ptr; #endif tmp_ptr = (uint8_t *)remote_sglist[j].addr; tmp_ptr += remote_used; rq->remote = tmp_ptr; rq->callback = NULL; local_used += cur_len; if (local_used >= local_sglist[i].len) { i++; local_used = 0; } remote_used += cur_len; if (remote_used >= remote_sglist[j].len) { j++; remote_used = 0; } total_used += cur_len; if (total_used >= io->scsiio.kern_data_len) rq->callback = callback; #if 0 printf("%s: %s: local %#x remote %#x size %d\n", __func__, (command == CTL_HA_DT_CMD_WRITE) ? "WRITE" : "READ", rq->local, rq->remote, rq->size); #endif isc_ret = ctl_dt_single(rq); if (isc_ret > CTL_HA_STATUS_SUCCESS) break; } if (isc_ret != CTL_HA_STATUS_WAIT) { rq->ret = isc_ret; callback(rq); } return (0); } static void ctl_datamove_remote_read(union ctl_io *io) { int retval; int i; /* * This will send an error to the other controller in the case of a * failure. */ retval = ctl_datamove_remote_sgl_setup(io); if (retval != 0) return; retval = ctl_datamove_remote_xfer(io, CTL_HA_DT_CMD_READ, ctl_datamove_remote_read_cb); if (retval != 0) { /* * Make sure we free memory if there was an error.. The * ctl_datamove_remote_xfer() function will send the * datamove done message, or call the callback with an * error if there is a problem. */ for (i = 0; i < io->scsiio.kern_sg_entries; i++) free(io->io_hdr.local_sglist[i].addr, M_CTL); free(io->io_hdr.remote_sglist, M_CTL); io->io_hdr.remote_sglist = NULL; io->io_hdr.local_sglist = NULL; } return; } /* * Process a datamove request from the other controller. This is used for * XFER mode only, not SER_ONLY mode. For writes, we DMA into local memory * first. Once that is complete, the data gets DMAed into the remote * controller's memory. For reads, we DMA from the remote controller's * memory into our memory first, and then move it out to the FETD. */ static void ctl_datamove_remote(union ctl_io *io) { mtx_assert(&control_softc->ctl_lock, MA_NOTOWNED); if (io->io_hdr.flags & CTL_FLAG_FAILOVER) { ctl_failover_io(io, /*have_lock*/ 0); return; } /* * Note that we look for an aborted I/O here, but don't do some of * the other checks that ctl_datamove() normally does. * We don't need to run the datamove delay code, since that should * have been done if need be on the other controller. */ if (io->io_hdr.flags & CTL_FLAG_ABORT) { printf("%s: tag 0x%04x on (%u:%u:%u) aborted\n", __func__, io->scsiio.tag_num, io->io_hdr.nexus.initid, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.targ_lun); io->io_hdr.port_status = 31338; ctl_send_datamove_done(io, /*have_lock*/ 0); return; } if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT) ctl_datamove_remote_write(io); else if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) ctl_datamove_remote_read(io); else { io->io_hdr.port_status = 31339; ctl_send_datamove_done(io, /*have_lock*/ 0); } } static int ctl_process_done(union ctl_io *io) { struct ctl_lun *lun; struct ctl_softc *softc = control_softc; void (*fe_done)(union ctl_io *io); union ctl_ha_msg msg; uint32_t targ_port = io->io_hdr.nexus.targ_port; CTL_DEBUG_PRINT(("ctl_process_done\n")); if ((io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) == 0) fe_done = softc->ctl_ports[targ_port]->fe_done; else fe_done = NULL; #ifdef CTL_TIME_IO if ((time_uptime - io->io_hdr.start_time) > ctl_time_io_secs) { char str[256]; char path_str[64]; struct sbuf sb; ctl_scsi_path_string(io, path_str, sizeof(path_str)); sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN); sbuf_cat(&sb, path_str); switch (io->io_hdr.io_type) { case CTL_IO_SCSI: ctl_scsi_command_string(&io->scsiio, NULL, &sb); sbuf_printf(&sb, "\n"); sbuf_cat(&sb, path_str); sbuf_printf(&sb, "Tag: 0x%04x, type %d\n", io->scsiio.tag_num, io->scsiio.tag_type); break; case CTL_IO_TASK: sbuf_printf(&sb, "Task I/O type: %d, Tag: 0x%04x, " "Tag Type: %d\n", io->taskio.task_action, io->taskio.tag_num, io->taskio.tag_type); break; default: printf("Invalid CTL I/O type %d\n", io->io_hdr.io_type); panic("Invalid CTL I/O type %d\n", io->io_hdr.io_type); break; } sbuf_cat(&sb, path_str); sbuf_printf(&sb, "ctl_process_done: %jd seconds\n", (intmax_t)time_uptime - io->io_hdr.start_time); sbuf_finish(&sb); printf("%s", sbuf_data(&sb)); } #endif /* CTL_TIME_IO */ switch (io->io_hdr.io_type) { case CTL_IO_SCSI: break; case CTL_IO_TASK: if (ctl_debug & CTL_DEBUG_INFO) ctl_io_error_print(io, NULL); if (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) ctl_free_io(io); else fe_done(io); return (CTL_RETVAL_COMPLETE); default: panic("ctl_process_done: invalid io type %d\n", io->io_hdr.io_type); break; /* NOTREACHED */ } lun = (struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; if (lun == NULL) { CTL_DEBUG_PRINT(("NULL LUN for lun %d\n", io->io_hdr.nexus.targ_mapped_lun)); goto bailout; } mtx_lock(&lun->lun_lock); /* * Check to see if we have any errors to inject here. We only * inject errors for commands that don't already have errors set. */ if ((STAILQ_FIRST(&lun->error_list) != NULL) && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS) && ((io->io_hdr.flags & CTL_FLAG_STATUS_SENT) == 0)) ctl_inject_error(lun, io); /* * XXX KDM how do we treat commands that aren't completed * successfully? * * XXX KDM should we also track I/O latency? */ if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS && io->io_hdr.io_type == CTL_IO_SCSI) { #ifdef CTL_TIME_IO struct bintime cur_bt; #endif int type; if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) type = CTL_STATS_READ; else if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT) type = CTL_STATS_WRITE; else type = CTL_STATS_NO_IO; lun->stats.ports[targ_port].bytes[type] += io->scsiio.kern_total_len; lun->stats.ports[targ_port].operations[type]++; #ifdef CTL_TIME_IO bintime_add(&lun->stats.ports[targ_port].dma_time[type], &io->io_hdr.dma_bt); lun->stats.ports[targ_port].num_dmas[type] += io->io_hdr.num_dmas; getbintime(&cur_bt); bintime_sub(&cur_bt, &io->io_hdr.start_bt); bintime_add(&lun->stats.ports[targ_port].time[type], &cur_bt); #endif } /* * Remove this from the OOA queue. */ TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links); #ifdef CTL_TIME_IO if (TAILQ_EMPTY(&lun->ooa_queue)) lun->last_busy = getsbinuptime(); #endif /* * Run through the blocked queue on this LUN and see if anything * has become unblocked, now that this transaction is done. */ ctl_check_blocked(lun); /* * If the LUN has been invalidated, free it if there is nothing * left on its OOA queue. */ if ((lun->flags & CTL_LUN_INVALID) && TAILQ_EMPTY(&lun->ooa_queue)) { mtx_unlock(&lun->lun_lock); mtx_lock(&softc->ctl_lock); ctl_free_lun(lun); mtx_unlock(&softc->ctl_lock); } else mtx_unlock(&lun->lun_lock); bailout: /* * If this command has been aborted, make sure we set the status * properly. The FETD is responsible for freeing the I/O and doing * whatever it needs to do to clean up its state. */ if (io->io_hdr.flags & CTL_FLAG_ABORT) ctl_set_task_aborted(&io->scsiio); /* * If enabled, print command error status. */ if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS && (ctl_debug & CTL_DEBUG_INFO) != 0) ctl_io_error_print(io, NULL); /* * Tell the FETD or the other shelf controller we're done with this * command. Note that only SCSI commands get to this point. Task * management commands are completed above. */ if ((softc->ha_mode != CTL_HA_MODE_XFER) && (io->io_hdr.flags & CTL_FLAG_SENT_2OTHER_SC)) { memset(&msg, 0, sizeof(msg)); msg.hdr.msg_type = CTL_MSG_FINISH_IO; msg.hdr.serializing_sc = io->io_hdr.serializing_sc; msg.hdr.nexus = io->io_hdr.nexus; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.scsi) - sizeof(msg.scsi.sense_data), M_WAITOK); } if ((softc->ha_mode == CTL_HA_MODE_XFER) && (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) { memset(&msg, 0, sizeof(msg)); msg.hdr.msg_type = CTL_MSG_FINISH_IO; msg.hdr.original_sc = io->io_hdr.original_sc; msg.hdr.nexus = io->io_hdr.nexus; msg.hdr.status = io->io_hdr.status; msg.scsi.scsi_status = io->scsiio.scsi_status; msg.scsi.tag_num = io->scsiio.tag_num; msg.scsi.tag_type = io->scsiio.tag_type; msg.scsi.sense_len = io->scsiio.sense_len; msg.scsi.sense_residual = io->scsiio.sense_residual; msg.scsi.residual = io->scsiio.residual; memcpy(&msg.scsi.sense_data, &io->scsiio.sense_data, io->scsiio.sense_len); /* * We copy this whether or not this is an I/O-related * command. Otherwise, we'd have to go and check to see * whether it's a read/write command, and it really isn't * worth it. */ memcpy(&msg.scsi.lbalen, &io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN].bytes, sizeof(msg.scsi.lbalen)); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.scsi) - sizeof(msg.scsi.sense_data) + msg.scsi.sense_len, M_WAITOK); ctl_free_io(io); } else fe_done(io); return (CTL_RETVAL_COMPLETE); } #ifdef CTL_WITH_CA /* * Front end should call this if it doesn't do autosense. When the request * sense comes back in from the initiator, we'll dequeue this and send it. */ int ctl_queue_sense(union ctl_io *io) { struct ctl_lun *lun; struct ctl_port *port; struct ctl_softc *softc; uint32_t initidx, targ_lun; softc = control_softc; CTL_DEBUG_PRINT(("ctl_queue_sense\n")); /* * LUN lookup will likely move to the ctl_work_thread() once we * have our new queueing infrastructure (that doesn't put things on * a per-LUN queue initially). That is so that we can handle * things like an INQUIRY to a LUN that we don't have enabled. We * can't deal with that right now. */ mtx_lock(&softc->ctl_lock); /* * If we don't have a LUN for this, just toss the sense * information. */ port = ctl_io_port(&ctsio->io_hdr); targ_lun = ctl_lun_map_from_port(port, io->io_hdr.nexus.targ_lun); if ((targ_lun < CTL_MAX_LUNS) && (softc->ctl_luns[targ_lun] != NULL)) lun = softc->ctl_luns[targ_lun]; else goto bailout; initidx = ctl_get_initindex(&io->io_hdr.nexus); mtx_lock(&lun->lun_lock); /* * Already have CA set for this LUN...toss the sense information. */ if (ctl_is_set(lun->have_ca, initidx)) { mtx_unlock(&lun->lun_lock); goto bailout; } memcpy(&lun->pending_sense[initidx], &io->scsiio.sense_data, MIN(sizeof(lun->pending_sense[initidx]), sizeof(io->scsiio.sense_data))); ctl_set_mask(lun->have_ca, initidx); mtx_unlock(&lun->lun_lock); bailout: mtx_unlock(&softc->ctl_lock); ctl_free_io(io); return (CTL_RETVAL_COMPLETE); } #endif /* * Primary command inlet from frontend ports. All SCSI and task I/O * requests must go through this function. */ int ctl_queue(union ctl_io *io) { struct ctl_port *port; CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0])); #ifdef CTL_TIME_IO io->io_hdr.start_time = time_uptime; getbintime(&io->io_hdr.start_bt); #endif /* CTL_TIME_IO */ /* Map FE-specific LUN ID into global one. */ port = ctl_io_port(&io->io_hdr); io->io_hdr.nexus.targ_mapped_lun = ctl_lun_map_from_port(port, io->io_hdr.nexus.targ_lun); switch (io->io_hdr.io_type) { case CTL_IO_SCSI: case CTL_IO_TASK: if (ctl_debug & CTL_DEBUG_CDB) ctl_io_print(io); ctl_enqueue_incoming(io); break; default: printf("ctl_queue: unknown I/O type %d\n", io->io_hdr.io_type); return (EINVAL); } return (CTL_RETVAL_COMPLETE); } #ifdef CTL_IO_DELAY static void ctl_done_timer_wakeup(void *arg) { union ctl_io *io; io = (union ctl_io *)arg; ctl_done(io); } #endif /* CTL_IO_DELAY */ void ctl_done(union ctl_io *io) { /* * Enable this to catch duplicate completion issues. */ #if 0 if (io->io_hdr.flags & CTL_FLAG_ALREADY_DONE) { printf("%s: type %d msg %d cdb %x iptl: " "%u:%u:%u tag 0x%04x " "flag %#x status %x\n", __func__, io->io_hdr.io_type, io->io_hdr.msg_type, io->scsiio.cdb[0], io->io_hdr.nexus.initid, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.targ_lun, (io->io_hdr.io_type == CTL_IO_TASK) ? io->taskio.tag_num : io->scsiio.tag_num, io->io_hdr.flags, io->io_hdr.status); } else io->io_hdr.flags |= CTL_FLAG_ALREADY_DONE; #endif /* * This is an internal copy of an I/O, and should not go through * the normal done processing logic. */ if (io->io_hdr.flags & CTL_FLAG_INT_COPY) return; #ifdef CTL_IO_DELAY if (io->io_hdr.flags & CTL_FLAG_DELAY_DONE) { struct ctl_lun *lun; lun =(struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; io->io_hdr.flags &= ~CTL_FLAG_DELAY_DONE; } else { struct ctl_lun *lun; lun =(struct ctl_lun *)io->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; if ((lun != NULL) && (lun->delay_info.done_delay > 0)) { callout_init(&io->io_hdr.delay_callout, /*mpsafe*/ 1); io->io_hdr.flags |= CTL_FLAG_DELAY_DONE; callout_reset(&io->io_hdr.delay_callout, lun->delay_info.done_delay * hz, ctl_done_timer_wakeup, io); if (lun->delay_info.done_type == CTL_DELAY_TYPE_ONESHOT) lun->delay_info.done_delay = 0; return; } } #endif /* CTL_IO_DELAY */ ctl_enqueue_done(io); } static void ctl_work_thread(void *arg) { struct ctl_thread *thr = (struct ctl_thread *)arg; struct ctl_softc *softc = thr->ctl_softc; union ctl_io *io; int retval; CTL_DEBUG_PRINT(("ctl_work_thread starting\n")); for (;;) { retval = 0; /* * We handle the queues in this order: * - ISC * - done queue (to free up resources, unblock other commands) * - RtR queue * - incoming queue * * If those queues are empty, we break out of the loop and * go to sleep. */ mtx_lock(&thr->queue_lock); io = (union ctl_io *)STAILQ_FIRST(&thr->isc_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->isc_queue, links); mtx_unlock(&thr->queue_lock); ctl_handle_isc(io); continue; } io = (union ctl_io *)STAILQ_FIRST(&thr->done_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->done_queue, links); /* clear any blocked commands, call fe_done */ mtx_unlock(&thr->queue_lock); retval = ctl_process_done(io); continue; } io = (union ctl_io *)STAILQ_FIRST(&thr->incoming_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->incoming_queue, links); mtx_unlock(&thr->queue_lock); if (io->io_hdr.io_type == CTL_IO_TASK) ctl_run_task(io); else ctl_scsiio_precheck(softc, &io->scsiio); continue; } io = (union ctl_io *)STAILQ_FIRST(&thr->rtr_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->rtr_queue, links); mtx_unlock(&thr->queue_lock); retval = ctl_scsiio(&io->scsiio); if (retval != CTL_RETVAL_COMPLETE) CTL_DEBUG_PRINT(("ctl_scsiio failed\n")); continue; } /* Sleep until we have something to do. */ mtx_sleep(thr, &thr->queue_lock, PDROP | PRIBIO, "-", 0); } } static void ctl_lun_thread(void *arg) { struct ctl_softc *softc = (struct ctl_softc *)arg; struct ctl_be_lun *be_lun; int retval; CTL_DEBUG_PRINT(("ctl_lun_thread starting\n")); for (;;) { retval = 0; mtx_lock(&softc->ctl_lock); be_lun = STAILQ_FIRST(&softc->pending_lun_queue); if (be_lun != NULL) { STAILQ_REMOVE_HEAD(&softc->pending_lun_queue, links); mtx_unlock(&softc->ctl_lock); ctl_create_lun(be_lun); continue; } /* Sleep until we have something to do. */ mtx_sleep(&softc->pending_lun_queue, &softc->ctl_lock, PDROP | PRIBIO, "-", 0); } } static void ctl_thresh_thread(void *arg) { struct ctl_softc *softc = (struct ctl_softc *)arg; struct ctl_lun *lun; struct ctl_be_lun *be_lun; struct scsi_da_rw_recovery_page *rwpage; struct ctl_logical_block_provisioning_page *page; const char *attr; union ctl_ha_msg msg; uint64_t thres, val; int i, e, set; CTL_DEBUG_PRINT(("ctl_thresh_thread starting\n")); for (;;) { mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) { be_lun = lun->be_lun; if ((lun->flags & CTL_LUN_DISABLED) || (lun->flags & CTL_LUN_OFFLINE) || lun->backend->lun_attr == NULL) continue; if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0 && softc->ha_mode == CTL_HA_MODE_XFER) continue; rwpage = &lun->mode_pages.rw_er_page[CTL_PAGE_CURRENT]; if ((rwpage->byte8 & SMS_RWER_LBPERE) == 0) continue; e = 0; page = &lun->mode_pages.lbp_page[CTL_PAGE_CURRENT]; for (i = 0; i < CTL_NUM_LBP_THRESH; i++) { if ((page->descr[i].flags & SLBPPD_ENABLED) == 0) continue; thres = scsi_4btoul(page->descr[i].count); thres <<= CTL_LBP_EXPONENT; switch (page->descr[i].resource) { case 0x01: attr = "blocksavail"; break; case 0x02: attr = "blocksused"; break; case 0xf1: attr = "poolblocksavail"; break; case 0xf2: attr = "poolblocksused"; break; default: continue; } mtx_unlock(&softc->ctl_lock); // XXX val = lun->backend->lun_attr( lun->be_lun->be_lun, attr); mtx_lock(&softc->ctl_lock); if (val == UINT64_MAX) continue; if ((page->descr[i].flags & SLBPPD_ARMING_MASK) == SLBPPD_ARMING_INC) e |= (val >= thres); else e |= (val <= thres); } mtx_lock(&lun->lun_lock); if (e) { if (lun->lasttpt == 0 || time_uptime - lun->lasttpt >= CTL_LBP_UA_PERIOD) { lun->lasttpt = time_uptime; ctl_est_ua_all(lun, -1, CTL_UA_THIN_PROV_THRES); set = 1; } else set = 0; } else { lun->lasttpt = 0; ctl_clr_ua_all(lun, -1, CTL_UA_THIN_PROV_THRES); set = -1; } mtx_unlock(&lun->lun_lock); if (set != 0 && lun->ctl_softc->ha_mode == CTL_HA_MODE_XFER) { /* Send msg to other side. */ bzero(&msg.ua, sizeof(msg.ua)); msg.hdr.msg_type = CTL_MSG_UA; msg.hdr.nexus.initid = -1; msg.hdr.nexus.targ_port = -1; msg.hdr.nexus.targ_lun = lun->lun; msg.hdr.nexus.targ_mapped_lun = lun->lun; msg.ua.ua_all = 1; msg.ua.ua_set = (set > 0); msg.ua.ua_type = CTL_UA_THIN_PROV_THRES; mtx_unlock(&softc->ctl_lock); // XXX ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.ua), M_WAITOK); mtx_lock(&softc->ctl_lock); } } mtx_unlock(&softc->ctl_lock); pause("-", CTL_LBP_PERIOD * hz); } } static void ctl_enqueue_incoming(union ctl_io *io) { struct ctl_softc *softc = control_softc; struct ctl_thread *thr; u_int idx; idx = (io->io_hdr.nexus.targ_port * 127 + io->io_hdr.nexus.initid) % worker_threads; thr = &softc->threads[idx]; mtx_lock(&thr->queue_lock); STAILQ_INSERT_TAIL(&thr->incoming_queue, &io->io_hdr, links); mtx_unlock(&thr->queue_lock); wakeup(thr); } static void ctl_enqueue_rtr(union ctl_io *io) { struct ctl_softc *softc = control_softc; struct ctl_thread *thr; thr = &softc->threads[io->io_hdr.nexus.targ_mapped_lun % worker_threads]; mtx_lock(&thr->queue_lock); STAILQ_INSERT_TAIL(&thr->rtr_queue, &io->io_hdr, links); mtx_unlock(&thr->queue_lock); wakeup(thr); } static void ctl_enqueue_done(union ctl_io *io) { struct ctl_softc *softc = control_softc; struct ctl_thread *thr; thr = &softc->threads[io->io_hdr.nexus.targ_mapped_lun % worker_threads]; mtx_lock(&thr->queue_lock); STAILQ_INSERT_TAIL(&thr->done_queue, &io->io_hdr, links); mtx_unlock(&thr->queue_lock); wakeup(thr); } static void ctl_enqueue_isc(union ctl_io *io) { struct ctl_softc *softc = control_softc; struct ctl_thread *thr; thr = &softc->threads[io->io_hdr.nexus.targ_mapped_lun % worker_threads]; mtx_lock(&thr->queue_lock); STAILQ_INSERT_TAIL(&thr->isc_queue, &io->io_hdr, links); mtx_unlock(&thr->queue_lock); wakeup(thr); } /* * vim: ts=8 */ Index: projects/iosched/sys/cam/ctl/ctl_cmd_table.c =================================================================== --- projects/iosched/sys/cam/ctl/ctl_cmd_table.c (revision 287721) +++ projects/iosched/sys/cam/ctl/ctl_cmd_table.c (revision 287722) @@ -1,1488 +1,1488 @@ /*- * Copyright (c) 2003, 2004, 2005, 2009 Silicon Graphics International Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_cmd_table.c#4 $ * $FreeBSD$ */ /* * CAM Target Layer command table. * * Author: Ken Merry , Kim Le */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Whenever support for a new command is added, it should be added to these * tables. */ /* 5E PERSISTENT RESERVE IN */ const struct ctl_cmd_entry ctl_cmd_table_5e[32] = { /* 00 READ KEYS */ {ctl_persistent_reserve_in, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 10, { 0x00, 0, 0, 0, 0, 0, 0xff, 0xff, 0x07}}, /* 01 READ RESERVATION */ {ctl_persistent_reserve_in, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 10, { 0x01, 0, 0, 0, 0, 0, 0xff, 0xff, 0x07}}, /* 02 REPORT CAPABILITIES */ {ctl_persistent_reserve_in, CTL_SERIDX_INQ, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 10, { 0x02, 0, 0, 0, 0, 0, 0xff, 0xff, 0x07}}, /* 03 READ FULL STATUS */ {ctl_persistent_reserve_in, CTL_SERIDX_INQ, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 10, { 0x03, 0, 0, 0, 0, 0, 0xff, 0xff, 0x07}}, /* 04-1f */ }; /* 5F PERSISTENT RESERVE OUT */ const struct ctl_cmd_entry ctl_cmd_table_5f[32] = { /* 00 REGISTER */ {ctl_persistent_reserve_out, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 10, { 0x00, 0xff, 0, 0, 0xff, 0xff, 0xff, 0xff, 0x07}}, /* 01 RESERVE */ {ctl_persistent_reserve_out, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 10, { 0x01, 0xff, 0, 0, 0xff, 0xff, 0xff, 0xff, 0x07}}, /* 02 RELEASE */ {ctl_persistent_reserve_out, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 10, { 0x02, 0xff, 0, 0, 0xff, 0xff, 0xff, 0xff, 0x07}}, /* 03 CLEAR */ {ctl_persistent_reserve_out, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 10, { 0x03, 0xff, 0, 0, 0xff, 0xff, 0xff, 0xff, 0x07}}, /* 04 PREEMPT */ {ctl_persistent_reserve_out, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 10, { 0x04, 0xff, 0, 0, 0xff, 0xff, 0xff, 0xff, 0x07}}, /* 05 PREEMPT AND ABORT */ {ctl_persistent_reserve_out, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 10, { 0x05, 0xff, 0, 0, 0xff, 0xff, 0xff, 0xff, 0x07}}, /* 06 REGISTER AND IGNORE EXISTING KEY */ {ctl_persistent_reserve_out, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 10, { 0x06, 0xff, 0, 0, 0xff, 0xff, 0xff, 0xff, 0x07}}, /* 07 REGISTER AND MOVE */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 08-1f */ }; /* 83 EXTENDED COPY */ const struct ctl_cmd_entry ctl_cmd_table_83[32] = { /* 00 EXTENDED COPY (LID1) */ {ctl_extended_copy_lid1, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_BOTH | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_NONE, 16, { 0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 01 EXTENDED COPY (LID4) */ {ctl_extended_copy_lid4, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_BOTH | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_NONE, 16, { 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 02 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 03 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 04 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 05 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 06 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 07 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 08 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 09 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0A */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0B */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0C */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0D */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0E */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0F */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 10 POPULATE TOKEN */ {ctl_populate_token, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_OUT | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_NONE, 16, { 0x10, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 11 WRITE USING TOKEN */ {ctl_write_using_token, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_NONE, 16, { 0x11, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 12 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 13 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 14 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 15 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 16 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 17 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 18 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 19 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 1A */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 1B */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 1C COPY OPERATION ABORT */ {ctl_copy_operation_abort, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_BOTH | CTL_FLAG_DATA_NONE, CTL_LUN_PAT_NONE, 16, { 0x1c, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x07}}, }; /* 84 RECEIVE COPY STATUS */ const struct ctl_cmd_entry ctl_cmd_table_84[32] = { /* 00 RECEIVE COPY STATUS (LID1) */ {ctl_receive_copy_status_lid1, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_BOTH | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 16, {0x00, 0xff, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 01 RECEIVE COPY DATA (LID1) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 02 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 03 RECEIVE COPY OPERATING PARAMETERS */ {ctl_receive_copy_operating_parameters, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 16, {0x03, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 04 RECEIVE COPY FAILURE DETAILS (LID1) */ {ctl_receive_copy_failure_details, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_BOTH | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 16, {0x04, 0xff, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 05 RECEIVE COPY STATUS (LID4) */ {ctl_receive_copy_status_lid4, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_BOTH | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 16, {0x05, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 06 RECEIVE COPY DATA (LID4)*/ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 07 RECEIVE ROD TOKEN INFORMATION */ {ctl_receive_rod_token_information, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_BOTH | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 16, {0x07, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 08 REPORT ALL ROD TOKENS */ {ctl_report_all_rod_tokens, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_BOTH | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 16, {0x08, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, }; /* 9E SERVICE ACTION IN(16) */ const struct ctl_cmd_entry ctl_cmd_table_9e[32] = { /* 00 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 01 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 02 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 03 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 04 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 05 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 06 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 07 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 08 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 09 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0A */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0B */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0C */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0D */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0E */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0F */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 10 READ CAPACITY(16) */ {ctl_read_capacity_16, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_SLUN | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_READCAP, 16, {0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 11 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 12 GET LBA STATUS */ {ctl_get_lba_status, CTL_SERIDX_READ, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_READ | CTL_LUN_PAT_RANGE, 16, {0x12, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 13-1f */ }; /* A3 MAINTENANCE IN */ const struct ctl_cmd_entry ctl_cmd_table_a3[32] = { /* 00 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 01 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 02 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 03 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 04 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 05 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 06 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 07 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 08 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 09 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0A REPORT TARGET PORT GROUPS */ {ctl_report_tagret_port_groups, CTL_SERIDX_INQ, CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_CMD_FLAG_OK_ON_UNAVAIL | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, - 12, {0x0a, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, + 12, {0xea, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 0B */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0C REPORT SUPPORTED_OPCODES */ {ctl_report_supported_opcodes, CTL_SERIDX_INQ, CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_CMD_FLAG_OK_ON_UNAVAIL | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 12, {0x0c, 0x87, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 0D REPORT SUPPORTED_TASK MANAGEMENT FUNCTIONS */ {ctl_report_supported_tmf, CTL_SERIDX_INQ, CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_CMD_FLAG_OK_ON_UNAVAIL | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 12, {0x0d, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 0E */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0F REPORT TIMESTAMP */ {ctl_report_timestamp, CTL_SERIDX_INQ, CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_CMD_FLAG_OK_ON_UNAVAIL | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 12, {0x0f, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 10-1f */ }; const struct ctl_cmd_entry ctl_cmd_table[256] = { /* 00 TEST UNIT READY */ {ctl_tur, CTL_SERIDX_TUR, CTL_CMD_FLAG_OK_ON_BOTH | CTL_FLAG_DATA_NONE | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_TUR, 6, {0, 0, 0, 0, 0x07}}, /* 01 REWIND */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 02 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 03 REQUEST SENSE */ {ctl_request_sense, CTL_SERIDX_RQ_SNS, CTL_FLAG_DATA_IN | CTL_CMD_FLAG_OK_ON_ALL_LUNS | CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_NO_SENSE | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_CMD_FLAG_OK_ON_UNAVAIL | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 6, {0x01, 0, 0, 0xff, 0x07}}, /* 04 FORMAT UNIT */ {ctl_format, CTL_SERIDX_FORMAT, CTL_CMD_FLAG_OK_ON_SLUN | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_NONE, 6, {0xff, 0, 0, 0, 0x07}}, /* 05 READ BLOCK LIMITS */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 06 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 07 REASSIGN BLOCKS */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 08 READ(6) */ {ctl_read_write, CTL_SERIDX_READ, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_READ | CTL_LUN_PAT_RANGE, 6, {0x1f, 0xff, 0xff, 0xff, 0x07}}, /* 09 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0A WRITE(6) */ {ctl_read_write, CTL_SERIDX_WRITE, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_WRITE | CTL_LUN_PAT_RANGE, 6, {0x1f, 0xff, 0xff, 0xff, 0x07}}, /* 0B SEEK(6) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0C */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0D */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0E */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 0F READ REVERSE(6) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 10 WRITE FILEMARKS(6) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 11 SPACE(6) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 12 INQUIRY */ {ctl_inquiry, CTL_SERIDX_INQ, CTL_CMD_FLAG_OK_ON_ALL_LUNS | CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_NO_SENSE | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_CMD_FLAG_OK_ON_UNAVAIL | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 6, {0xe1, 0xff, 0xff, 0xff, 0x07}}, /* 13 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 14 RECOVER BUFFERED DATA */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 15 MODE SELECT(6) */ {ctl_mode_select, CTL_SERIDX_MD_SEL, CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_NONE, 6, {0x11, 0, 0, 0xff, 0x07}}, /* 16 RESERVE(6) */ {ctl_scsi_reserve, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_NONE, 6, {0, 0, 0, 0, 0x07}}, /* 17 RELEASE(6) */ {ctl_scsi_release, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_NONE, CTL_LUN_PAT_NONE, 6, {0, 0, 0, 0, 0x07}}, /* 18 COPY */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 19 ERASE(6) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 1A MODE SENSE(6) */ {ctl_mode_sense, CTL_SERIDX_MD_SNS, CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_NONE, 6, {0x08, 0xff, 0xff, 0xff, 0x07}}, /* 1B START STOP UNIT */ {ctl_start_stop, CTL_SERIDX_START, CTL_CMD_FLAG_OK_ON_SLUN | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_FLAG_DATA_NONE | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 6, {0x01, 0, 0, 0x03, 0x07}}, /* 1C RECEIVE DIAGNOSTIC RESULTS */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 1D SEND DIAGNOSTIC */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 1E PREVENT ALLOW MEDIUM REMOVAL */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 1F */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 20 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 21 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 22 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 23 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 24 SET WINDOW */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 25 READ CAPACITY(10) */ {ctl_read_capacity, CTL_SERIDX_RD_CAP, CTL_CMD_FLAG_OK_ON_SLUN| CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_READCAP, 10, {0, 0, 0, 0, 0, 0, 0, 0, 0x07}}, /* 26 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 27 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 28 READ(10) */ {ctl_read_write, CTL_SERIDX_READ, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_READ | CTL_LUN_PAT_RANGE, 10, {0x1a, 0xff, 0xff, 0xff, 0xff, 0, 0xff, 0xff, 0x07}}, /* 29 READ GENERATION */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 2A WRITE(10) */ {ctl_read_write, CTL_SERIDX_WRITE, CTL_CMD_FLAG_OK_ON_SLUN| CTL_FLAG_DATA_OUT, CTL_LUN_PAT_WRITE | CTL_LUN_PAT_RANGE, 10, {0x1a, 0xff, 0xff, 0xff, 0xff, 0, 0xff, 0xff, 0x07}}, /* 2B SEEK(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 2C ERASE(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 2D READ UPDATED BLOCK */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 2E WRITE AND VERIFY(10) */ {ctl_read_write, CTL_SERIDX_WRITE, CTL_CMD_FLAG_OK_ON_SLUN| CTL_FLAG_DATA_OUT, CTL_LUN_PAT_WRITE | CTL_LUN_PAT_RANGE, 10, {0x12, 0xff, 0xff, 0xff, 0xff, 0, 0xff, 0xff, 0x07}}, /* 2F VERIFY(10) */ {ctl_verify, CTL_SERIDX_READ, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_OUT | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_READ | CTL_LUN_PAT_RANGE, 10, {0x16, 0xff, 0xff, 0xff, 0xff, 0, 0xff, 0xff, 0x07}}, /* 30 SEARCH DATA HIGH(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 31 SEARCH DATA EQUAL(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 32 SEARCH DATA LOW(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 33 SET LIMITS(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 34 PRE-FETCH(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 35 SYNCHRONIZE CACHE(10) */ {ctl_sync_cache, CTL_SERIDX_SYNC, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_NONE, CTL_LUN_PAT_NONE, 10, {0x02, 0xff, 0xff, 0xff, 0xff, 0, 0xff, 0xff, 0x07}}, /* 36 LOCK UNLOCK CACHE(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 37 READ DEFECT DATA(10) */ {ctl_read_defect, CTL_SERIDX_MD_SNS, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_NONE, 10, {0, 0x1f, 0, 0, 0, 0, 0xff, 0xff, 0x07}}, /* 38 MEDIUM SCAN */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 39 COMPARE */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 3A COPY AND VERIFY */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 3B WRITE BUFFER */ {ctl_write_buffer, CTL_SERIDX_MD_SEL, CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_NONE, 10, {0x1f, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x07}}, /* 3C READ BUFFER */ {ctl_read_buffer, CTL_SERIDX_MD_SNS, CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_NONE, 10, {0x1f, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x07}}, /* 3D UPDATE BLOCK */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 3E READ LONG */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 3F WRITE LONG */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 40 CHANGE DEFINITION */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 41 WRITE SAME(10) */ {ctl_write_same, CTL_SERIDX_WRITE, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_WRITE | CTL_LUN_PAT_RANGE, 10, {0x1a, 0xff, 0xff, 0xff, 0xff, 0, 0xff, 0xff, 0x07}}, /* 42 READ SUB-CHANNEL / UNMAP */ {ctl_unmap, CTL_SERIDX_UNMAP, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_WRITE, 10, {1, 0, 0, 0, 0, 0, 0xff, 0xff, 0x07}}, /* 43 READ TOC/PMA/ATIP */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 44 REPORT DENSITY SUPPORT */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 45 PLAY AUDIO(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 46 GET CONFIGURATION */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 47 PLAY AUDIO MSF */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 48 PLAY AUDIO TRACK INDEX */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 49 PLAY TRACK RELATIVE(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 4A GET EVENT STATUS NOTIFICATION */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 4B PAUSE/RESUME */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 4C LOG SELECT */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 4D LOG SENSE */ {ctl_log_sense, CTL_SERIDX_LOG_SNS, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 10, {0, 0xff, 0xff, 0, 0xff, 0xff, 0xff, 0xff, 0x07} }, /* 4E STOP PLAY/SCAN */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 4F */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 50 XDWRITE(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 51 XPWRITE(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 52 XDREAD(10) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 53 RESERVE TRACK */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 54 SEND OPC INFORMATION */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 55 MODE SELECT(10) */ {ctl_mode_select, CTL_SERIDX_MD_SEL, CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_NONE, 10, {0x11, 0, 0, 0, 0, 0, 0xff, 0xff, 0x07} }, /* 56 RESERVE(10) */ {ctl_scsi_reserve, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_NONE, 10, {0x02, 0, 0xff, 0, 0, 0, 0xff, 0xff, 0x07} }, /* 57 RELEASE(10) */ {ctl_scsi_release, CTL_SERIDX_RES, CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_NONE, 10, {0x02, 0, 0xff, 0, 0, 0, 0xff, 0xff, 0x07} }, /* 58 REPAIR TRACK */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 59 READ MASTER CUE */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 5A MODE SENSE(10) */ {ctl_mode_sense, CTL_SERIDX_MD_SNS, CTL_CMD_FLAG_OK_ON_BOTH | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_NONE, 10, {0x18, 0xff, 0xff, 0, 0, 0, 0xff, 0xff, 0x07} }, /* 5B CLOSE TRACK/SESSION */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 5C READ BUFFER CAPACITY */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 5D SEND CUE SHEET */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 5E PERSISTENT RESERVE IN */ {__DECONST(ctl_opfunc *, ctl_cmd_table_5e), CTL_SERIDX_INVLD, CTL_CMD_FLAG_SA5, CTL_LUN_PAT_NONE}, /* 5F PERSISTENT RESERVE OUT */ {__DECONST(ctl_opfunc *, ctl_cmd_table_5f), CTL_SERIDX_INVLD, CTL_CMD_FLAG_SA5, CTL_LUN_PAT_NONE}, /* 60 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 61 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 62 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 63 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 64 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 65 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 66 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 67 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 68 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 69 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 6A */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 6B */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 6C */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 6D */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 6E */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 6F */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 70 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 71 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 72 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 73 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 74 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 75 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 76 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 77 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 78 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 79 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 7A */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 7B */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 7C */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 7D */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 7E */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 7F */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 80 XDWRITE EXTENDED(16) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 81 REBUILD(16) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 82 REGENERATE(16) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 83 EXTENDED COPY */ {__DECONST(ctl_opfunc *, ctl_cmd_table_83), CTL_SERIDX_INVLD, CTL_CMD_FLAG_SA5, CTL_LUN_PAT_NONE}, /* 84 RECEIVE COPY RESULTS */ {__DECONST(ctl_opfunc *, ctl_cmd_table_84), CTL_SERIDX_INVLD, CTL_CMD_FLAG_SA5, CTL_LUN_PAT_NONE}, /* 85 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 86 ACCESS CONTROL IN */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 87 ACCESS CONTROL OUT */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 88 READ(16) */ {ctl_read_write, CTL_SERIDX_READ, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_READ | CTL_LUN_PAT_RANGE, 16, {0x1a, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 89 COMPARE AND WRITE */ {ctl_cnw, CTL_SERIDX_WRITE, CTL_CMD_FLAG_OK_ON_SLUN| CTL_FLAG_DATA_OUT, CTL_LUN_PAT_WRITE | CTL_LUN_PAT_RANGE, 16, {0x18, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0xff, 0, 0x07}}, /* 8A WRITE(16) */ {ctl_read_write, CTL_SERIDX_WRITE, CTL_CMD_FLAG_OK_ON_SLUN| CTL_FLAG_DATA_OUT, CTL_LUN_PAT_WRITE | CTL_LUN_PAT_RANGE, 16, {0x1a, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 8B */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 8C READ ATTRIBUTE */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 8D WRITE ATTRIBUTE */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 8E WRITE AND VERIFY(16) */ {ctl_read_write, CTL_SERIDX_WRITE, CTL_CMD_FLAG_OK_ON_SLUN| CTL_FLAG_DATA_OUT, CTL_LUN_PAT_WRITE | CTL_LUN_PAT_RANGE, 16, {0x12, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 8F VERIFY(16) */ {ctl_verify, CTL_SERIDX_READ, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_OUT | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_READ | CTL_LUN_PAT_RANGE, 16, {0x16, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 90 PRE-FETCH(16) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 91 SYNCHRONIZE CACHE(16) */ {ctl_sync_cache, CTL_SERIDX_SYNC, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_NONE, CTL_LUN_PAT_NONE, 16, {0x02, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 92 LOCK UNLOCK CACHE(16) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 93 WRITE SAME(16) */ {ctl_write_same, CTL_SERIDX_WRITE, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_OUT, CTL_LUN_PAT_WRITE | CTL_LUN_PAT_RANGE, 16, {0x1b, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* 94 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 95 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 96 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 97 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 98 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 99 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 9A */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 9B */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 9C WRITE ATOMIC (16) */ {ctl_read_write, CTL_SERIDX_WRITE, CTL_CMD_FLAG_OK_ON_SLUN| CTL_FLAG_DATA_OUT, CTL_LUN_PAT_WRITE | CTL_LUN_PAT_RANGE, 16, {0x18, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0xff, 0xff, 0, 0x07}}, /* 9D */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* 9E SERVICE ACTION IN(16) */ {__DECONST(ctl_opfunc *, ctl_cmd_table_9e), CTL_SERIDX_INVLD, CTL_CMD_FLAG_SA5, CTL_LUN_PAT_NONE}, /* 9F SERVICE ACTION OUT(16) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* A0 REPORT LUNS */ {ctl_report_luns, CTL_SERIDX_INQ, CTL_CMD_FLAG_OK_ON_ALL_LUNS | CTL_CMD_FLAG_ALLOW_ON_RESV | CTL_CMD_FLAG_NO_SENSE | CTL_CMD_FLAG_OK_ON_STOPPED | CTL_CMD_FLAG_OK_ON_INOPERABLE | CTL_CMD_FLAG_OK_ON_STANDBY | CTL_CMD_FLAG_OK_ON_UNAVAIL | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_RESV, CTL_LUN_PAT_NONE, 12, {0, 0xff, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* A1 BLANK */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* A2 SEND EVENT */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* A3 MAINTENANCE IN */ {__DECONST(ctl_opfunc *, ctl_cmd_table_a3), CTL_SERIDX_INVLD, CTL_CMD_FLAG_SA5, CTL_LUN_PAT_NONE}, /* A4 MAINTENANCE OUT */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* A5 MOVE MEDIUM */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* A6 EXCHANGE MEDIUM */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* A7 MOVE MEDIUM ATTACHED */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* A8 READ(12) */ {ctl_read_write, CTL_SERIDX_READ, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_READ | CTL_LUN_PAT_RANGE, 12, {0x1a, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* A9 PLAY TRACK RELATIVE(12) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* AA WRITE(12) */ {ctl_read_write, CTL_SERIDX_WRITE, CTL_CMD_FLAG_OK_ON_SLUN| CTL_FLAG_DATA_OUT, CTL_LUN_PAT_WRITE | CTL_LUN_PAT_RANGE, 12, {0x1a, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* AB SERVICE ACTION IN(12) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* AC ERASE(12) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* AD READ DVD STRUCTURE */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* AE WRITE AND VERIFY(12) */ {ctl_read_write, CTL_SERIDX_WRITE, CTL_CMD_FLAG_OK_ON_SLUN| CTL_FLAG_DATA_OUT, CTL_LUN_PAT_WRITE | CTL_LUN_PAT_RANGE, 12, {0x12, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* AF VERIFY(12) */ {ctl_verify, CTL_SERIDX_READ, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_OUT | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_READ | CTL_LUN_PAT_RANGE, 12, {0x16, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* B0 SEARCH DATA HIGH(12) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* B1 SEARCH DATA EQUAL(12) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* B2 SEARCH DATA LOW(12) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* B3 SET LIMITS(12) */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* B4 READ ELEMENT STATUS ATTACHED */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* B5 REQUEST VOLUME ELEMENT ADDRESS */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* B6 SEND VOLUME TAG */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* B7 READ DEFECT DATA(12) */ {ctl_read_defect, CTL_SERIDX_MD_SNS, CTL_CMD_FLAG_OK_ON_SLUN | CTL_FLAG_DATA_IN | CTL_CMD_FLAG_ALLOW_ON_PR_WRESV, CTL_LUN_PAT_NONE, 12, {0x1f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0x07}}, /* B8 READ ELEMENT STATUS */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* B9 READ CD MSF */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* BA REDUNDANCY GROUP IN */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* BB REDUNDANCY GROUP OUT */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* BC SPARE IN */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* BD SPARE OUT */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* BE VOLUME SET IN */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* BF VOLUME SET OUT */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* C0 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* C1 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* C2 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* C3 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* C4 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* C5 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* C6 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* C7 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* C8 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* C9 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* CA */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* CB */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* CC */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* CD */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* CE */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* CF */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* D0 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* D1 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* D2 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* D3 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* D4 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* D5 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* D6 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* D7 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* D8 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* D9 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* DA */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* DB */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* DC */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* DD */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* DE */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* DF */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* E0 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* E1 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* E2 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* E3 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* E4 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* E5 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* E6 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* E7 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* E8 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* E9 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* EA */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* EB */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* EC */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* ED */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* EE */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* EF */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* F0 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* F1 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* F2 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* F3 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* F4 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* F5 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* F6 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* F7 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* F8 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* F9 */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* FA */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* FB */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* FC */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* FD */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* FE */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE}, /* FF */ {NULL, CTL_SERIDX_INVLD, CTL_CMD_FLAG_NONE, CTL_LUN_PAT_NONE} }; Index: projects/iosched/sys/cam/ctl/ctl_tpc.c =================================================================== --- projects/iosched/sys/cam/ctl/ctl_tpc.c (revision 287721) +++ projects/iosched/sys/cam/ctl/ctl_tpc.c (revision 287722) @@ -1,2307 +1,2318 @@ /*- * Copyright (c) 2014 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TPC_MAX_CSCDS 64 #define TPC_MAX_SEGS 64 #define TPC_MAX_SEG 0 #define TPC_MAX_LIST 8192 #define TPC_MAX_INLINE 0 #define TPC_MAX_LISTS 255 #define TPC_MAX_IO_SIZE (1024 * 1024) #define TPC_MAX_IOCHUNK_SIZE (TPC_MAX_IO_SIZE * 16) #define TPC_MIN_TOKEN_TIMEOUT 1 #define TPC_DFL_TOKEN_TIMEOUT 60 #define TPC_MAX_TOKEN_TIMEOUT 600 MALLOC_DEFINE(M_CTL_TPC, "ctltpc", "CTL TPC"); typedef enum { TPC_ERR_RETRY = 0x000, TPC_ERR_FAIL = 0x001, TPC_ERR_MASK = 0x0ff, TPC_ERR_NO_DECREMENT = 0x100 } tpc_error_action; struct tpc_list; TAILQ_HEAD(runl, tpc_io); struct tpc_io { union ctl_io *io; uint64_t lun; struct tpc_list *list; struct runl run; TAILQ_ENTRY(tpc_io) rlinks; TAILQ_ENTRY(tpc_io) links; }; struct tpc_token { uint8_t token[512]; uint64_t lun; uint32_t blocksize; uint8_t *params; struct scsi_range_desc *range; int nrange; int active; time_t last_active; uint32_t timeout; TAILQ_ENTRY(tpc_token) links; }; struct tpc_list { uint8_t service_action; int init_port; uint32_t init_idx; uint32_t list_id; uint8_t flags; uint8_t *params; struct scsi_ec_cscd *cscd; struct scsi_ec_segment *seg[TPC_MAX_SEGS]; uint8_t *inl; int ncscd; int nseg; int leninl; struct tpc_token *token; struct scsi_range_desc *range; int nrange; off_t offset_into_rod; int curseg; off_t cursectors; off_t curbytes; int curops; int stage; uint8_t *buf; off_t segsectors; off_t segbytes; int tbdio; int error; int abort; int completed; time_t last_active; TAILQ_HEAD(, tpc_io) allio; struct scsi_sense_data sense_data; uint8_t sense_len; uint8_t scsi_status; struct ctl_scsiio *ctsio; struct ctl_lun *lun; int res_token_valid; uint8_t res_token[512]; TAILQ_ENTRY(tpc_list) links; }; static void tpc_timeout(void *arg) { struct ctl_softc *softc = arg; struct ctl_lun *lun; struct tpc_token *token, *ttoken; struct tpc_list *list, *tlist; /* Free completed lists with expired timeout. */ STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_lock(&lun->lun_lock); TAILQ_FOREACH_SAFE(list, &lun->tpc_lists, links, tlist) { if (!list->completed || time_uptime < list->last_active + TPC_DFL_TOKEN_TIMEOUT) continue; TAILQ_REMOVE(&lun->tpc_lists, list, links); free(list, M_CTL); } mtx_unlock(&lun->lun_lock); } /* Free inactive ROD tokens with expired timeout. */ mtx_lock(&softc->tpc_lock); TAILQ_FOREACH_SAFE(token, &softc->tpc_tokens, links, ttoken) { if (token->active || time_uptime < token->last_active + token->timeout + 1) continue; TAILQ_REMOVE(&softc->tpc_tokens, token, links); free(token->params, M_CTL); free(token, M_CTL); } mtx_unlock(&softc->tpc_lock); callout_schedule(&softc->tpc_timeout, hz); } void ctl_tpc_init(struct ctl_softc *softc) { mtx_init(&softc->tpc_lock, "CTL TPC mutex", NULL, MTX_DEF); TAILQ_INIT(&softc->tpc_tokens); callout_init_mtx(&softc->tpc_timeout, &softc->ctl_lock, 0); callout_reset(&softc->tpc_timeout, hz, tpc_timeout, softc); } void ctl_tpc_shutdown(struct ctl_softc *softc) { struct tpc_token *token; callout_drain(&softc->tpc_timeout); /* Free ROD tokens. */ mtx_lock(&softc->tpc_lock); while ((token = TAILQ_FIRST(&softc->tpc_tokens)) != NULL) { TAILQ_REMOVE(&softc->tpc_tokens, token, links); free(token->params, M_CTL); free(token, M_CTL); } mtx_unlock(&softc->tpc_lock); mtx_destroy(&softc->tpc_lock); } void ctl_tpc_lun_init(struct ctl_lun *lun) { TAILQ_INIT(&lun->tpc_lists); } void ctl_tpc_lun_shutdown(struct ctl_lun *lun) { struct ctl_softc *softc = lun->ctl_softc; struct tpc_list *list; struct tpc_token *token, *ttoken; /* Free lists for this LUN. */ while ((list = TAILQ_FIRST(&lun->tpc_lists)) != NULL) { TAILQ_REMOVE(&lun->tpc_lists, list, links); KASSERT(list->completed, ("Not completed TPC (%p) on shutdown", list)); free(list, M_CTL); } /* Free ROD tokens for this LUN. */ mtx_lock(&softc->tpc_lock); TAILQ_FOREACH_SAFE(token, &softc->tpc_tokens, links, ttoken) { if (token->lun != lun->lun || token->active) continue; TAILQ_REMOVE(&softc->tpc_tokens, token, links); free(token->params, M_CTL); free(token, M_CTL); } mtx_unlock(&softc->tpc_lock); } int ctl_inquiry_evpd_tpc(struct ctl_scsiio *ctsio, int alloc_len) { struct scsi_vpd_tpc *tpc_ptr; struct scsi_vpd_tpc_descriptor *d_ptr; struct scsi_vpd_tpc_descriptor_bdrl *bdrl_ptr; struct scsi_vpd_tpc_descriptor_sc *sc_ptr; struct scsi_vpd_tpc_descriptor_sc_descr *scd_ptr; struct scsi_vpd_tpc_descriptor_pd *pd_ptr; struct scsi_vpd_tpc_descriptor_sd *sd_ptr; struct scsi_vpd_tpc_descriptor_sdid *sdid_ptr; struct scsi_vpd_tpc_descriptor_rtf *rtf_ptr; struct scsi_vpd_tpc_descriptor_rtf_block *rtfb_ptr; struct scsi_vpd_tpc_descriptor_srt *srt_ptr; struct scsi_vpd_tpc_descriptor_srtd *srtd_ptr; struct scsi_vpd_tpc_descriptor_gco *gco_ptr; struct ctl_lun *lun; int data_len; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; data_len = sizeof(struct scsi_vpd_tpc) + sizeof(struct scsi_vpd_tpc_descriptor_bdrl) + roundup2(sizeof(struct scsi_vpd_tpc_descriptor_sc) + 2 * sizeof(struct scsi_vpd_tpc_descriptor_sc_descr) + 11, 4) + sizeof(struct scsi_vpd_tpc_descriptor_pd) + roundup2(sizeof(struct scsi_vpd_tpc_descriptor_sd) + 4, 4) + roundup2(sizeof(struct scsi_vpd_tpc_descriptor_sdid) + 2, 4) + sizeof(struct scsi_vpd_tpc_descriptor_rtf) + sizeof(struct scsi_vpd_tpc_descriptor_rtf_block) + sizeof(struct scsi_vpd_tpc_descriptor_srt) + 2*sizeof(struct scsi_vpd_tpc_descriptor_srtd) + sizeof(struct scsi_vpd_tpc_descriptor_gco); ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); tpc_ptr = (struct scsi_vpd_tpc *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; if (data_len < alloc_len) { ctsio->residual = alloc_len - data_len; ctsio->kern_data_len = data_len; ctsio->kern_total_len = data_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. */ if (lun != NULL) tpc_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else tpc_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; tpc_ptr->page_code = SVPD_SCSI_TPC; scsi_ulto2b(data_len - 4, tpc_ptr->page_length); /* Block Device ROD Limits */ d_ptr = (struct scsi_vpd_tpc_descriptor *)&tpc_ptr->descr[0]; bdrl_ptr = (struct scsi_vpd_tpc_descriptor_bdrl *)d_ptr; scsi_ulto2b(SVPD_TPC_BDRL, bdrl_ptr->desc_type); scsi_ulto2b(sizeof(*bdrl_ptr) - 4, bdrl_ptr->desc_length); scsi_ulto2b(TPC_MAX_SEGS, bdrl_ptr->maximum_ranges); scsi_ulto4b(TPC_MAX_TOKEN_TIMEOUT, bdrl_ptr->maximum_inactivity_timeout); scsi_ulto4b(TPC_DFL_TOKEN_TIMEOUT, bdrl_ptr->default_inactivity_timeout); scsi_u64to8b(0, bdrl_ptr->maximum_token_transfer_size); scsi_u64to8b(0, bdrl_ptr->optimal_transfer_count); /* Supported commands */ d_ptr = (struct scsi_vpd_tpc_descriptor *) (&d_ptr->parameters[0] + scsi_2btoul(d_ptr->desc_length)); sc_ptr = (struct scsi_vpd_tpc_descriptor_sc *)d_ptr; scsi_ulto2b(SVPD_TPC_SC, sc_ptr->desc_type); sc_ptr->list_length = 2 * sizeof(*scd_ptr) + 11; scsi_ulto2b(roundup2(1 + sc_ptr->list_length, 4), sc_ptr->desc_length); scd_ptr = &sc_ptr->descr[0]; scd_ptr->opcode = EXTENDED_COPY; scd_ptr->sa_length = 5; scd_ptr->supported_service_actions[0] = EC_EC_LID1; scd_ptr->supported_service_actions[1] = EC_EC_LID4; scd_ptr->supported_service_actions[2] = EC_PT; scd_ptr->supported_service_actions[3] = EC_WUT; scd_ptr->supported_service_actions[4] = EC_COA; scd_ptr = (struct scsi_vpd_tpc_descriptor_sc_descr *) &scd_ptr->supported_service_actions[scd_ptr->sa_length]; scd_ptr->opcode = RECEIVE_COPY_STATUS; scd_ptr->sa_length = 6; scd_ptr->supported_service_actions[0] = RCS_RCS_LID1; scd_ptr->supported_service_actions[1] = RCS_RCFD; scd_ptr->supported_service_actions[2] = RCS_RCS_LID4; scd_ptr->supported_service_actions[3] = RCS_RCOP; scd_ptr->supported_service_actions[4] = RCS_RRTI; scd_ptr->supported_service_actions[5] = RCS_RART; /* Parameter data. */ d_ptr = (struct scsi_vpd_tpc_descriptor *) (&d_ptr->parameters[0] + scsi_2btoul(d_ptr->desc_length)); pd_ptr = (struct scsi_vpd_tpc_descriptor_pd *)d_ptr; scsi_ulto2b(SVPD_TPC_PD, pd_ptr->desc_type); scsi_ulto2b(sizeof(*pd_ptr) - 4, pd_ptr->desc_length); scsi_ulto2b(TPC_MAX_CSCDS, pd_ptr->maximum_cscd_descriptor_count); scsi_ulto2b(TPC_MAX_SEGS, pd_ptr->maximum_segment_descriptor_count); scsi_ulto4b(TPC_MAX_LIST, pd_ptr->maximum_descriptor_list_length); scsi_ulto4b(TPC_MAX_INLINE, pd_ptr->maximum_inline_data_length); /* Supported Descriptors */ d_ptr = (struct scsi_vpd_tpc_descriptor *) (&d_ptr->parameters[0] + scsi_2btoul(d_ptr->desc_length)); sd_ptr = (struct scsi_vpd_tpc_descriptor_sd *)d_ptr; scsi_ulto2b(SVPD_TPC_SD, sd_ptr->desc_type); scsi_ulto2b(roundup2(sizeof(*sd_ptr) - 4 + 4, 4), sd_ptr->desc_length); sd_ptr->list_length = 4; sd_ptr->supported_descriptor_codes[0] = EC_SEG_B2B; sd_ptr->supported_descriptor_codes[1] = EC_SEG_VERIFY; sd_ptr->supported_descriptor_codes[2] = EC_SEG_REGISTER_KEY; sd_ptr->supported_descriptor_codes[3] = EC_CSCD_ID; /* Supported CSCD Descriptor IDs */ d_ptr = (struct scsi_vpd_tpc_descriptor *) (&d_ptr->parameters[0] + scsi_2btoul(d_ptr->desc_length)); sdid_ptr = (struct scsi_vpd_tpc_descriptor_sdid *)d_ptr; scsi_ulto2b(SVPD_TPC_SDID, sdid_ptr->desc_type); scsi_ulto2b(roundup2(sizeof(*sdid_ptr) - 4 + 2, 4), sdid_ptr->desc_length); scsi_ulto2b(2, sdid_ptr->list_length); scsi_ulto2b(0xffff, &sdid_ptr->supported_descriptor_ids[0]); /* ROD Token Features */ d_ptr = (struct scsi_vpd_tpc_descriptor *) (&d_ptr->parameters[0] + scsi_2btoul(d_ptr->desc_length)); rtf_ptr = (struct scsi_vpd_tpc_descriptor_rtf *)d_ptr; scsi_ulto2b(SVPD_TPC_RTF, rtf_ptr->desc_type); scsi_ulto2b(sizeof(*rtf_ptr) - 4 + sizeof(*rtfb_ptr), rtf_ptr->desc_length); rtf_ptr->remote_tokens = 0; scsi_ulto4b(TPC_MIN_TOKEN_TIMEOUT, rtf_ptr->minimum_token_lifetime); scsi_ulto4b(UINT32_MAX, rtf_ptr->maximum_token_lifetime); scsi_ulto4b(TPC_MAX_TOKEN_TIMEOUT, rtf_ptr->maximum_token_inactivity_timeout); scsi_ulto2b(sizeof(*rtfb_ptr), rtf_ptr->type_specific_features_length); rtfb_ptr = (struct scsi_vpd_tpc_descriptor_rtf_block *) &rtf_ptr->type_specific_features; rtfb_ptr->type_format = SVPD_TPC_RTF_BLOCK; scsi_ulto2b(sizeof(*rtfb_ptr) - 4, rtfb_ptr->desc_length); scsi_ulto2b(0, rtfb_ptr->optimal_length_granularity); scsi_u64to8b(0, rtfb_ptr->maximum_bytes); scsi_u64to8b(0, rtfb_ptr->optimal_bytes); + scsi_u64to8b(UINT64_MAX, rtfb_ptr->optimal_bytes_to_token_per_segment); scsi_u64to8b(TPC_MAX_IOCHUNK_SIZE, - rtfb_ptr->optimal_bytes_to_token_per_segment); - scsi_u64to8b(TPC_MAX_IOCHUNK_SIZE, rtfb_ptr->optimal_bytes_from_token_per_segment); /* Supported ROD Tokens */ d_ptr = (struct scsi_vpd_tpc_descriptor *) (&d_ptr->parameters[0] + scsi_2btoul(d_ptr->desc_length)); srt_ptr = (struct scsi_vpd_tpc_descriptor_srt *)d_ptr; scsi_ulto2b(SVPD_TPC_SRT, srt_ptr->desc_type); scsi_ulto2b(sizeof(*srt_ptr) - 4 + 2*sizeof(*srtd_ptr), srt_ptr->desc_length); scsi_ulto2b(2*sizeof(*srtd_ptr), srt_ptr->rod_type_descriptors_length); srtd_ptr = (struct scsi_vpd_tpc_descriptor_srtd *) &srt_ptr->rod_type_descriptors; scsi_ulto4b(ROD_TYPE_AUR, srtd_ptr->rod_type); srtd_ptr->flags = SVPD_TPC_SRTD_TIN | SVPD_TPC_SRTD_TOUT; scsi_ulto2b(0, srtd_ptr->preference_indicator); srtd_ptr++; scsi_ulto4b(ROD_TYPE_BLOCK_ZERO, srtd_ptr->rod_type); srtd_ptr->flags = SVPD_TPC_SRTD_TIN; scsi_ulto2b(0, srtd_ptr->preference_indicator); /* General Copy Operations */ d_ptr = (struct scsi_vpd_tpc_descriptor *) (&d_ptr->parameters[0] + scsi_2btoul(d_ptr->desc_length)); gco_ptr = (struct scsi_vpd_tpc_descriptor_gco *)d_ptr; scsi_ulto2b(SVPD_TPC_GCO, gco_ptr->desc_type); scsi_ulto2b(sizeof(*gco_ptr) - 4, gco_ptr->desc_length); scsi_ulto4b(TPC_MAX_LISTS, gco_ptr->total_concurrent_copies); scsi_ulto4b(TPC_MAX_LISTS, gco_ptr->maximum_identified_concurrent_copies); scsi_ulto4b(TPC_MAX_SEG, gco_ptr->maximum_segment_length); gco_ptr->data_segment_granularity = 0; gco_ptr->inline_data_granularity = 0; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_receive_copy_operating_parameters(struct ctl_scsiio *ctsio) { struct scsi_receive_copy_operating_parameters *cdb; struct scsi_receive_copy_operating_parameters_data *data; int retval; int alloc_len, total_len; CTL_DEBUG_PRINT(("ctl_report_supported_tmf\n")); cdb = (struct scsi_receive_copy_operating_parameters *)ctsio->cdb; retval = CTL_RETVAL_COMPLETE; total_len = sizeof(*data) + 4; alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; data = (struct scsi_receive_copy_operating_parameters_data *)ctsio->kern_data_ptr; scsi_ulto4b(sizeof(*data) - 4 + 4, data->length); data->snlid = RCOP_SNLID; scsi_ulto2b(TPC_MAX_CSCDS, data->maximum_cscd_descriptor_count); scsi_ulto2b(TPC_MAX_SEGS, data->maximum_segment_descriptor_count); scsi_ulto4b(TPC_MAX_LIST, data->maximum_descriptor_list_length); scsi_ulto4b(TPC_MAX_SEG, data->maximum_segment_length); scsi_ulto4b(TPC_MAX_INLINE, data->maximum_inline_data_length); scsi_ulto4b(0, data->held_data_limit); scsi_ulto4b(0, data->maximum_stream_device_transfer_size); scsi_ulto2b(TPC_MAX_LISTS, data->total_concurrent_copies); data->maximum_concurrent_copies = TPC_MAX_LISTS; data->data_segment_granularity = 0; data->inline_data_granularity = 0; data->held_data_granularity = 0; data->implemented_descriptor_list_length = 4; data->list_of_implemented_descriptor_type_codes[0] = EC_SEG_B2B; data->list_of_implemented_descriptor_type_codes[1] = EC_SEG_VERIFY; data->list_of_implemented_descriptor_type_codes[2] = EC_SEG_REGISTER_KEY; data->list_of_implemented_descriptor_type_codes[3] = EC_CSCD_ID; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } static struct tpc_list * tpc_find_list(struct ctl_lun *lun, uint32_t list_id, uint32_t init_idx) { struct tpc_list *list; mtx_assert(&lun->lun_lock, MA_OWNED); TAILQ_FOREACH(list, &lun->tpc_lists, links) { if ((list->flags & EC_LIST_ID_USAGE_MASK) != EC_LIST_ID_USAGE_NONE && list->list_id == list_id && list->init_idx == init_idx) break; } return (list); } int ctl_receive_copy_status_lid1(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct scsi_receive_copy_status_lid1 *cdb; struct scsi_receive_copy_status_lid1_data *data; struct tpc_list *list; struct tpc_list list_copy; int retval; int alloc_len, total_len; uint32_t list_id; CTL_DEBUG_PRINT(("ctl_receive_copy_status_lid1\n")); cdb = (struct scsi_receive_copy_status_lid1 *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; retval = CTL_RETVAL_COMPLETE; list_id = cdb->list_identifier; mtx_lock(&lun->lun_lock); list = tpc_find_list(lun, list_id, ctl_get_initindex(&ctsio->io_hdr.nexus)); if (list == NULL) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (retval); } list_copy = *list; if (list->completed) { TAILQ_REMOVE(&lun->tpc_lists, list, links); free(list, M_CTL); } mtx_unlock(&lun->lun_lock); total_len = sizeof(*data); alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; data = (struct scsi_receive_copy_status_lid1_data *)ctsio->kern_data_ptr; scsi_ulto4b(sizeof(*data) - 4, data->available_data); if (list_copy.completed) { if (list_copy.error || list_copy.abort) data->copy_command_status = RCS_CCS_ERROR; else data->copy_command_status = RCS_CCS_COMPLETED; } else data->copy_command_status = RCS_CCS_INPROG; scsi_ulto2b(list_copy.curseg, data->segments_processed); if (list_copy.curbytes <= UINT32_MAX) { data->transfer_count_units = RCS_TC_BYTES; scsi_ulto4b(list_copy.curbytes, data->transfer_count); } else { data->transfer_count_units = RCS_TC_MBYTES; scsi_ulto4b(list_copy.curbytes >> 20, data->transfer_count); } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_receive_copy_failure_details(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct scsi_receive_copy_failure_details *cdb; struct scsi_receive_copy_failure_details_data *data; struct tpc_list *list; struct tpc_list list_copy; int retval; int alloc_len, total_len; uint32_t list_id; CTL_DEBUG_PRINT(("ctl_receive_copy_failure_details\n")); cdb = (struct scsi_receive_copy_failure_details *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; retval = CTL_RETVAL_COMPLETE; list_id = cdb->list_identifier; mtx_lock(&lun->lun_lock); list = tpc_find_list(lun, list_id, ctl_get_initindex(&ctsio->io_hdr.nexus)); if (list == NULL || !list->completed) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (retval); } list_copy = *list; TAILQ_REMOVE(&lun->tpc_lists, list, links); free(list, M_CTL); mtx_unlock(&lun->lun_lock); total_len = sizeof(*data) + list_copy.sense_len; alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; data = (struct scsi_receive_copy_failure_details_data *)ctsio->kern_data_ptr; if (list_copy.completed && (list_copy.error || list_copy.abort)) { scsi_ulto4b(sizeof(*data) - 4 + list_copy.sense_len, data->available_data); data->copy_command_status = RCS_CCS_ERROR; } else scsi_ulto4b(0, data->available_data); scsi_ulto2b(list_copy.sense_len, data->sense_data_length); memcpy(data->sense_data, &list_copy.sense_data, list_copy.sense_len); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_receive_copy_status_lid4(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct scsi_receive_copy_status_lid4 *cdb; struct scsi_receive_copy_status_lid4_data *data; struct tpc_list *list; struct tpc_list list_copy; int retval; int alloc_len, total_len; uint32_t list_id; CTL_DEBUG_PRINT(("ctl_receive_copy_status_lid4\n")); cdb = (struct scsi_receive_copy_status_lid4 *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; retval = CTL_RETVAL_COMPLETE; list_id = scsi_4btoul(cdb->list_identifier); mtx_lock(&lun->lun_lock); list = tpc_find_list(lun, list_id, ctl_get_initindex(&ctsio->io_hdr.nexus)); if (list == NULL) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (retval); } list_copy = *list; if (list->completed) { TAILQ_REMOVE(&lun->tpc_lists, list, links); free(list, M_CTL); } mtx_unlock(&lun->lun_lock); total_len = sizeof(*data) + list_copy.sense_len; alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; data = (struct scsi_receive_copy_status_lid4_data *)ctsio->kern_data_ptr; scsi_ulto4b(sizeof(*data) - 4 + list_copy.sense_len, data->available_data); data->response_to_service_action = list_copy.service_action; if (list_copy.completed) { if (list_copy.error) data->copy_command_status = RCS_CCS_ERROR; else if (list_copy.abort) data->copy_command_status = RCS_CCS_ABORTED; else data->copy_command_status = RCS_CCS_COMPLETED; } else data->copy_command_status = RCS_CCS_INPROG_FG; scsi_ulto2b(list_copy.curops, data->operation_counter); scsi_ulto4b(UINT32_MAX, data->estimated_status_update_delay); data->transfer_count_units = RCS_TC_BYTES; scsi_u64to8b(list_copy.curbytes, data->transfer_count); scsi_ulto2b(list_copy.curseg, data->segments_processed); data->length_of_the_sense_data_field = list_copy.sense_len; data->sense_data_length = list_copy.sense_len; memcpy(data->sense_data, &list_copy.sense_data, list_copy.sense_len); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_copy_operation_abort(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct scsi_copy_operation_abort *cdb; struct tpc_list *list; int retval; uint32_t list_id; CTL_DEBUG_PRINT(("ctl_copy_operation_abort\n")); cdb = (struct scsi_copy_operation_abort *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; retval = CTL_RETVAL_COMPLETE; list_id = scsi_4btoul(cdb->list_identifier); mtx_lock(&lun->lun_lock); list = tpc_find_list(lun, list_id, ctl_get_initindex(&ctsio->io_hdr.nexus)); if (list == NULL) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (retval); } list->abort = 1; mtx_unlock(&lun->lun_lock); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (retval); } static uint64_t tpc_resolve(struct tpc_list *list, uint16_t idx, uint32_t *ss, uint32_t *pb, uint32_t *pbo) { if (idx == 0xffff) { if (ss && list->lun->be_lun) *ss = list->lun->be_lun->blocksize; if (pb && list->lun->be_lun) *pb = list->lun->be_lun->blocksize << list->lun->be_lun->pblockexp; if (pbo && list->lun->be_lun) *pbo = list->lun->be_lun->blocksize * list->lun->be_lun->pblockoff; return (list->lun->lun); } if (idx >= list->ncscd) return (UINT64_MAX); return (tpcl_resolve(list->lun->ctl_softc, list->init_port, &list->cscd[idx], ss, pb, pbo)); } static int tpc_process_b2b(struct tpc_list *list) { struct scsi_ec_segment_b2b *seg; struct scsi_ec_cscd_dtsp *sdstp, *ddstp; struct tpc_io *tior, *tiow; struct runl run; uint64_t sl, dl; off_t srclba, dstlba, numbytes, donebytes, roundbytes; int numlba; uint32_t srcblock, dstblock, pb, pbo, adj; if (list->stage == 1) { while ((tior = TAILQ_FIRST(&list->allio)) != NULL) { TAILQ_REMOVE(&list->allio, tior, links); ctl_free_io(tior->io); free(tior, M_CTL); } free(list->buf, M_CTL); if (list->abort) { ctl_set_task_aborted(list->ctsio); return (CTL_RETVAL_ERROR); } else if (list->error) { ctl_set_sense(list->ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_COPY_ABORTED, /*asc*/ 0x0d, /*ascq*/ 0x01, SSD_ELEM_NONE); return (CTL_RETVAL_ERROR); } list->cursectors += list->segsectors; list->curbytes += list->segbytes; return (CTL_RETVAL_COMPLETE); } TAILQ_INIT(&list->allio); seg = (struct scsi_ec_segment_b2b *)list->seg[list->curseg]; sl = tpc_resolve(list, scsi_2btoul(seg->src_cscd), &srcblock, NULL, NULL); dl = tpc_resolve(list, scsi_2btoul(seg->dst_cscd), &dstblock, &pb, &pbo); if (sl >= CTL_MAX_LUNS || dl >= CTL_MAX_LUNS) { ctl_set_sense(list->ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_COPY_ABORTED, /*asc*/ 0x08, /*ascq*/ 0x04, SSD_ELEM_NONE); return (CTL_RETVAL_ERROR); } if (pbo > 0) pbo = pb - pbo; sdstp = &list->cscd[scsi_2btoul(seg->src_cscd)].dtsp; if (scsi_3btoul(sdstp->block_length) != 0) srcblock = scsi_3btoul(sdstp->block_length); ddstp = &list->cscd[scsi_2btoul(seg->dst_cscd)].dtsp; if (scsi_3btoul(ddstp->block_length) != 0) dstblock = scsi_3btoul(ddstp->block_length); numlba = scsi_2btoul(seg->number_of_blocks); if (seg->flags & EC_SEG_DC) numbytes = (off_t)numlba * dstblock; else numbytes = (off_t)numlba * srcblock; srclba = scsi_8btou64(seg->src_lba); dstlba = scsi_8btou64(seg->dst_lba); // printf("Copy %ju bytes from %ju @ %ju to %ju @ %ju\n", // (uintmax_t)numbytes, sl, scsi_8btou64(seg->src_lba), // dl, scsi_8btou64(seg->dst_lba)); if (numbytes == 0) return (CTL_RETVAL_COMPLETE); if (numbytes % srcblock != 0 || numbytes % dstblock != 0) { ctl_set_sense(list->ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_COPY_ABORTED, /*asc*/ 0x26, /*ascq*/ 0x0A, SSD_ELEM_NONE); return (CTL_RETVAL_ERROR); } list->buf = malloc(numbytes, M_CTL, M_WAITOK); list->segbytes = numbytes; list->segsectors = numbytes / dstblock; donebytes = 0; TAILQ_INIT(&run); list->tbdio = 0; while (donebytes < numbytes) { roundbytes = numbytes - donebytes; if (roundbytes > TPC_MAX_IO_SIZE) { roundbytes = TPC_MAX_IO_SIZE; roundbytes -= roundbytes % dstblock; if (pb > dstblock) { adj = (dstlba * dstblock + roundbytes - pbo) % pb; if (roundbytes > adj) roundbytes -= adj; } } tior = malloc(sizeof(*tior), M_CTL, M_WAITOK | M_ZERO); TAILQ_INIT(&tior->run); tior->list = list; TAILQ_INSERT_TAIL(&list->allio, tior, links); tior->io = tpcl_alloc_io(); ctl_scsi_read_write(tior->io, /*data_ptr*/ &list->buf[donebytes], /*data_len*/ roundbytes, /*read_op*/ 1, /*byte2*/ 0, /*minimum_cdb_size*/ 0, /*lba*/ srclba, /*num_blocks*/ roundbytes / srcblock, /*tag_type*/ CTL_TAG_SIMPLE, /*control*/ 0); tior->io->io_hdr.retries = 3; tior->lun = sl; tior->io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = tior; tiow = malloc(sizeof(*tior), M_CTL, M_WAITOK | M_ZERO); TAILQ_INIT(&tiow->run); tiow->list = list; TAILQ_INSERT_TAIL(&list->allio, tiow, links); tiow->io = tpcl_alloc_io(); ctl_scsi_read_write(tiow->io, /*data_ptr*/ &list->buf[donebytes], /*data_len*/ roundbytes, /*read_op*/ 0, /*byte2*/ 0, /*minimum_cdb_size*/ 0, /*lba*/ dstlba, /*num_blocks*/ roundbytes / dstblock, /*tag_type*/ CTL_TAG_SIMPLE, /*control*/ 0); tiow->io->io_hdr.retries = 3; tiow->lun = dl; tiow->io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = tiow; TAILQ_INSERT_TAIL(&tior->run, tiow, rlinks); TAILQ_INSERT_TAIL(&run, tior, rlinks); list->tbdio++; donebytes += roundbytes; srclba += roundbytes / srcblock; dstlba += roundbytes / dstblock; } while ((tior = TAILQ_FIRST(&run)) != NULL) { TAILQ_REMOVE(&run, tior, rlinks); if (tpcl_queue(tior->io, tior->lun) != CTL_RETVAL_COMPLETE) panic("tpcl_queue() error"); } list->stage++; return (CTL_RETVAL_QUEUED); } static int tpc_process_verify(struct tpc_list *list) { struct scsi_ec_segment_verify *seg; struct tpc_io *tio; uint64_t sl; if (list->stage == 1) { while ((tio = TAILQ_FIRST(&list->allio)) != NULL) { TAILQ_REMOVE(&list->allio, tio, links); ctl_free_io(tio->io); free(tio, M_CTL); } if (list->abort) { ctl_set_task_aborted(list->ctsio); return (CTL_RETVAL_ERROR); } else if (list->error) { ctl_set_sense(list->ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_COPY_ABORTED, /*asc*/ 0x0d, /*ascq*/ 0x01, SSD_ELEM_NONE); return (CTL_RETVAL_ERROR); } else return (CTL_RETVAL_COMPLETE); } TAILQ_INIT(&list->allio); seg = (struct scsi_ec_segment_verify *)list->seg[list->curseg]; sl = tpc_resolve(list, scsi_2btoul(seg->src_cscd), NULL, NULL, NULL); if (sl >= CTL_MAX_LUNS) { ctl_set_sense(list->ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_COPY_ABORTED, /*asc*/ 0x08, /*ascq*/ 0x04, SSD_ELEM_NONE); return (CTL_RETVAL_ERROR); } // printf("Verify %ju\n", sl); if ((seg->tur & 0x01) == 0) return (CTL_RETVAL_COMPLETE); list->tbdio = 1; tio = malloc(sizeof(*tio), M_CTL, M_WAITOK | M_ZERO); TAILQ_INIT(&tio->run); tio->list = list; TAILQ_INSERT_TAIL(&list->allio, tio, links); tio->io = tpcl_alloc_io(); ctl_scsi_tur(tio->io, /*tag_type*/ CTL_TAG_SIMPLE, /*control*/ 0); tio->io->io_hdr.retries = 3; tio->lun = sl; tio->io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = tio; list->stage++; if (tpcl_queue(tio->io, tio->lun) != CTL_RETVAL_COMPLETE) panic("tpcl_queue() error"); return (CTL_RETVAL_QUEUED); } static int tpc_process_register_key(struct tpc_list *list) { struct scsi_ec_segment_register_key *seg; struct tpc_io *tio; uint64_t dl; int datalen; if (list->stage == 1) { while ((tio = TAILQ_FIRST(&list->allio)) != NULL) { TAILQ_REMOVE(&list->allio, tio, links); ctl_free_io(tio->io); free(tio, M_CTL); } free(list->buf, M_CTL); if (list->abort) { ctl_set_task_aborted(list->ctsio); return (CTL_RETVAL_ERROR); } else if (list->error) { ctl_set_sense(list->ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_COPY_ABORTED, /*asc*/ 0x0d, /*ascq*/ 0x01, SSD_ELEM_NONE); return (CTL_RETVAL_ERROR); } else return (CTL_RETVAL_COMPLETE); } TAILQ_INIT(&list->allio); seg = (struct scsi_ec_segment_register_key *)list->seg[list->curseg]; dl = tpc_resolve(list, scsi_2btoul(seg->dst_cscd), NULL, NULL, NULL); if (dl >= CTL_MAX_LUNS) { ctl_set_sense(list->ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_COPY_ABORTED, /*asc*/ 0x08, /*ascq*/ 0x04, SSD_ELEM_NONE); return (CTL_RETVAL_ERROR); } // printf("Register Key %ju\n", dl); list->tbdio = 1; tio = malloc(sizeof(*tio), M_CTL, M_WAITOK | M_ZERO); TAILQ_INIT(&tio->run); tio->list = list; TAILQ_INSERT_TAIL(&list->allio, tio, links); tio->io = tpcl_alloc_io(); datalen = sizeof(struct scsi_per_res_out_parms); list->buf = malloc(datalen, M_CTL, M_WAITOK); ctl_scsi_persistent_res_out(tio->io, list->buf, datalen, SPRO_REGISTER, -1, scsi_8btou64(seg->res_key), scsi_8btou64(seg->sa_res_key), /*tag_type*/ CTL_TAG_SIMPLE, /*control*/ 0); tio->io->io_hdr.retries = 3; tio->lun = dl; tio->io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = tio; list->stage++; if (tpcl_queue(tio->io, tio->lun) != CTL_RETVAL_COMPLETE) panic("tpcl_queue() error"); return (CTL_RETVAL_QUEUED); } static off_t tpc_ranges_length(struct scsi_range_desc *range, int nrange) { off_t length = 0; int r; for (r = 0; r < nrange; r++) length += scsi_4btoul(range[r].length); return (length); } static int tpc_skip_ranges(struct scsi_range_desc *range, int nrange, off_t skip, int *srange, off_t *soffset) { off_t off; int r; r = 0; off = 0; while (r < nrange) { if (skip - off < scsi_4btoul(range[r].length)) { *srange = r; *soffset = skip - off; return (0); } off += scsi_4btoul(range[r].length); r++; } return (-1); } static int tpc_process_wut(struct tpc_list *list) { struct tpc_io *tio, *tior, *tiow; struct runl run, *prun; int drange, srange; off_t doffset, soffset; off_t srclba, dstlba, numbytes, donebytes, roundbytes; uint32_t srcblock, dstblock, pb, pbo, adj; if (list->stage > 0) { /* Cleanup after previous rounds. */ while ((tio = TAILQ_FIRST(&list->allio)) != NULL) { TAILQ_REMOVE(&list->allio, tio, links); ctl_free_io(tio->io); free(tio, M_CTL); } free(list->buf, M_CTL); if (list->abort) { ctl_set_task_aborted(list->ctsio); return (CTL_RETVAL_ERROR); } else if (list->error) { ctl_set_sense(list->ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_COPY_ABORTED, /*asc*/ 0x0d, /*ascq*/ 0x01, SSD_ELEM_NONE); return (CTL_RETVAL_ERROR); } list->cursectors += list->segsectors; list->curbytes += list->segbytes; } /* Check where we are on destination ranges list. */ if (tpc_skip_ranges(list->range, list->nrange, list->cursectors, &drange, &doffset) != 0) return (CTL_RETVAL_COMPLETE); dstblock = list->lun->be_lun->blocksize; pb = dstblock << list->lun->be_lun->pblockexp; if (list->lun->be_lun->pblockoff > 0) pbo = pb - dstblock * list->lun->be_lun->pblockoff; else pbo = 0; /* Check where we are on source ranges list. */ srcblock = list->token->blocksize; if (tpc_skip_ranges(list->token->range, list->token->nrange, list->offset_into_rod + list->cursectors * dstblock / srcblock, &srange, &soffset) != 0) { ctl_set_sense(list->ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_COPY_ABORTED, /*asc*/ 0x0d, /*ascq*/ 0x04, SSD_ELEM_NONE); return (CTL_RETVAL_ERROR); } srclba = scsi_8btou64(list->token->range[srange].lba) + soffset; dstlba = scsi_8btou64(list->range[drange].lba) + doffset; numbytes = srcblock * (scsi_4btoul(list->token->range[srange].length) - soffset); numbytes = omin(numbytes, dstblock * (scsi_4btoul(list->range[drange].length) - doffset)); if (numbytes > TPC_MAX_IOCHUNK_SIZE) { numbytes = TPC_MAX_IOCHUNK_SIZE; numbytes -= numbytes % dstblock; if (pb > dstblock) { adj = (dstlba * dstblock + numbytes - pbo) % pb; if (numbytes > adj) numbytes -= adj; } } if (numbytes % srcblock != 0 || numbytes % dstblock != 0) { ctl_set_sense(list->ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_COPY_ABORTED, /*asc*/ 0x26, /*ascq*/ 0x0A, SSD_ELEM_NONE); return (CTL_RETVAL_ERROR); } list->buf = malloc(numbytes, M_CTL, M_WAITOK | (list->token == NULL ? M_ZERO : 0)); list->segbytes = numbytes; list->segsectors = numbytes / dstblock; //printf("Copy chunk of %ju sectors from %ju to %ju\n", list->segsectors, // srclba, dstlba); donebytes = 0; TAILQ_INIT(&run); prun = &run; list->tbdio = 1; TAILQ_INIT(&list->allio); while (donebytes < numbytes) { roundbytes = numbytes - donebytes; if (roundbytes > TPC_MAX_IO_SIZE) { roundbytes = TPC_MAX_IO_SIZE; roundbytes -= roundbytes % dstblock; if (pb > dstblock) { adj = (dstlba * dstblock + roundbytes - pbo) % pb; if (roundbytes > adj) roundbytes -= adj; } } tior = malloc(sizeof(*tior), M_CTL, M_WAITOK | M_ZERO); TAILQ_INIT(&tior->run); tior->list = list; TAILQ_INSERT_TAIL(&list->allio, tior, links); tior->io = tpcl_alloc_io(); ctl_scsi_read_write(tior->io, /*data_ptr*/ &list->buf[donebytes], /*data_len*/ roundbytes, /*read_op*/ 1, /*byte2*/ 0, /*minimum_cdb_size*/ 0, /*lba*/ srclba, /*num_blocks*/ roundbytes / srcblock, /*tag_type*/ CTL_TAG_SIMPLE, /*control*/ 0); tior->io->io_hdr.retries = 3; tior->lun = list->token->lun; tior->io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = tior; tiow = malloc(sizeof(*tiow), M_CTL, M_WAITOK | M_ZERO); TAILQ_INIT(&tiow->run); tiow->list = list; TAILQ_INSERT_TAIL(&list->allio, tiow, links); tiow->io = tpcl_alloc_io(); ctl_scsi_read_write(tiow->io, /*data_ptr*/ &list->buf[donebytes], /*data_len*/ roundbytes, /*read_op*/ 0, /*byte2*/ 0, /*minimum_cdb_size*/ 0, /*lba*/ dstlba, /*num_blocks*/ roundbytes / dstblock, /*tag_type*/ CTL_TAG_SIMPLE, /*control*/ 0); tiow->io->io_hdr.retries = 3; tiow->lun = list->lun->lun; tiow->io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = tiow; TAILQ_INSERT_TAIL(&tior->run, tiow, rlinks); TAILQ_INSERT_TAIL(prun, tior, rlinks); prun = &tior->run; donebytes += roundbytes; srclba += roundbytes / srcblock; dstlba += roundbytes / dstblock; } while ((tior = TAILQ_FIRST(&run)) != NULL) { TAILQ_REMOVE(&run, tior, rlinks); if (tpcl_queue(tior->io, tior->lun) != CTL_RETVAL_COMPLETE) panic("tpcl_queue() error"); } list->stage++; return (CTL_RETVAL_QUEUED); } static int tpc_process_zero_wut(struct tpc_list *list) { struct tpc_io *tio, *tiow; struct runl run, *prun; int r; uint32_t dstblock, len; if (list->stage > 0) { complete: /* Cleanup after previous rounds. */ while ((tio = TAILQ_FIRST(&list->allio)) != NULL) { TAILQ_REMOVE(&list->allio, tio, links); ctl_free_io(tio->io); free(tio, M_CTL); } free(list->buf, M_CTL); if (list->abort) { ctl_set_task_aborted(list->ctsio); return (CTL_RETVAL_ERROR); } else if (list->error) { ctl_set_sense(list->ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_COPY_ABORTED, /*asc*/ 0x0d, /*ascq*/ 0x01, SSD_ELEM_NONE); return (CTL_RETVAL_ERROR); } list->cursectors += list->segsectors; list->curbytes += list->segbytes; return (CTL_RETVAL_COMPLETE); } dstblock = list->lun->be_lun->blocksize; list->buf = malloc(dstblock, M_CTL, M_WAITOK | M_ZERO); TAILQ_INIT(&run); prun = &run; list->tbdio = 1; TAILQ_INIT(&list->allio); list->segsectors = 0; for (r = 0; r < list->nrange; r++) { len = scsi_4btoul(list->range[r].length); if (len == 0) continue; tiow = malloc(sizeof(*tiow), M_CTL, M_WAITOK | M_ZERO); TAILQ_INIT(&tiow->run); tiow->list = list; TAILQ_INSERT_TAIL(&list->allio, tiow, links); tiow->io = tpcl_alloc_io(); ctl_scsi_write_same(tiow->io, /*data_ptr*/ list->buf, /*data_len*/ dstblock, /*byte2*/ 0, /*lba*/ scsi_8btou64(list->range[r].lba), /*num_blocks*/ len, /*tag_type*/ CTL_TAG_SIMPLE, /*control*/ 0); tiow->io->io_hdr.retries = 3; tiow->lun = list->lun->lun; tiow->io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr = tiow; TAILQ_INSERT_TAIL(prun, tiow, rlinks); prun = &tiow->run; list->segsectors += len; } list->segbytes = list->segsectors * dstblock; if (TAILQ_EMPTY(&run)) goto complete; while ((tiow = TAILQ_FIRST(&run)) != NULL) { TAILQ_REMOVE(&run, tiow, rlinks); if (tpcl_queue(tiow->io, tiow->lun) != CTL_RETVAL_COMPLETE) panic("tpcl_queue() error"); } list->stage++; return (CTL_RETVAL_QUEUED); } static void tpc_process(struct tpc_list *list) { struct ctl_lun *lun = list->lun; struct ctl_softc *softc = lun->ctl_softc; struct scsi_ec_segment *seg; struct ctl_scsiio *ctsio = list->ctsio; int retval = CTL_RETVAL_COMPLETE; if (list->service_action == EC_WUT) { if (list->token != NULL) retval = tpc_process_wut(list); else retval = tpc_process_zero_wut(list); if (retval == CTL_RETVAL_QUEUED) return; if (retval == CTL_RETVAL_ERROR) { list->error = 1; goto done; } } else { //printf("ZZZ %d cscd, %d segs\n", list->ncscd, list->nseg); while (list->curseg < list->nseg) { seg = list->seg[list->curseg]; switch (seg->type_code) { case EC_SEG_B2B: retval = tpc_process_b2b(list); break; case EC_SEG_VERIFY: retval = tpc_process_verify(list); break; case EC_SEG_REGISTER_KEY: retval = tpc_process_register_key(list); break; default: ctl_set_sense(ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_COPY_ABORTED, /*asc*/ 0x26, /*ascq*/ 0x09, SSD_ELEM_NONE); goto done; } if (retval == CTL_RETVAL_QUEUED) return; if (retval == CTL_RETVAL_ERROR) { list->error = 1; goto done; } list->curseg++; list->stage = 0; } } ctl_set_success(ctsio); done: //printf("ZZZ done\n"); free(list->params, M_CTL); list->params = NULL; if (list->token) { mtx_lock(&softc->tpc_lock); if (--list->token->active == 0) list->token->last_active = time_uptime; mtx_unlock(&softc->tpc_lock); list->token = NULL; } mtx_lock(&lun->lun_lock); if ((list->flags & EC_LIST_ID_USAGE_MASK) == EC_LIST_ID_USAGE_NONE) { TAILQ_REMOVE(&lun->tpc_lists, list, links); free(list, M_CTL); } else { list->completed = 1; list->last_active = time_uptime; list->sense_data = ctsio->sense_data; list->sense_len = ctsio->sense_len; list->scsi_status = ctsio->scsi_status; } mtx_unlock(&lun->lun_lock); ctl_done((union ctl_io *)ctsio); } /* * For any sort of check condition, busy, etc., we just retry. We do not * decrement the retry count for unit attention type errors. These are * normal, and we want to save the retry count for "real" errors. Otherwise, * we could end up with situations where a command will succeed in some * situations and fail in others, depending on whether a unit attention is * pending. Also, some of our error recovery actions, most notably the * LUN reset action, will cause a unit attention. * * We can add more detail here later if necessary. */ static tpc_error_action tpc_checkcond_parse(union ctl_io *io) { tpc_error_action error_action; int error_code, sense_key, asc, ascq; /* * Default to retrying the command. */ error_action = TPC_ERR_RETRY; scsi_extract_sense_len(&io->scsiio.sense_data, io->scsiio.sense_len, &error_code, &sense_key, &asc, &ascq, /*show_errors*/ 1); switch (error_code) { case SSD_DEFERRED_ERROR: case SSD_DESC_DEFERRED_ERROR: error_action |= TPC_ERR_NO_DECREMENT; break; case SSD_CURRENT_ERROR: case SSD_DESC_CURRENT_ERROR: default: switch (sense_key) { case SSD_KEY_UNIT_ATTENTION: error_action |= TPC_ERR_NO_DECREMENT; break; case SSD_KEY_HARDWARE_ERROR: /* * This is our generic "something bad happened" * error code. It often isn't recoverable. */ if ((asc == 0x44) && (ascq == 0x00)) error_action = TPC_ERR_FAIL; break; case SSD_KEY_NOT_READY: /* * If the LUN is powered down, there likely isn't * much point in retrying right now. */ if ((asc == 0x04) && (ascq == 0x02)) error_action = TPC_ERR_FAIL; /* * If the LUN is offline, there probably isn't much * point in retrying, either. */ if ((asc == 0x04) && (ascq == 0x03)) error_action = TPC_ERR_FAIL; break; } } return (error_action); } static tpc_error_action tpc_error_parse(union ctl_io *io) { tpc_error_action error_action = TPC_ERR_RETRY; switch (io->io_hdr.io_type) { case CTL_IO_SCSI: switch (io->io_hdr.status & CTL_STATUS_MASK) { case CTL_SCSI_ERROR: switch (io->scsiio.scsi_status) { case SCSI_STATUS_CHECK_COND: error_action = tpc_checkcond_parse(io); break; default: break; } break; default: break; } break; case CTL_IO_TASK: break; default: panic("%s: invalid ctl_io type %d\n", __func__, io->io_hdr.io_type); break; } return (error_action); } void tpc_done(union ctl_io *io) { struct tpc_io *tio, *tior; /* * Very minimal retry logic. We basically retry if we got an error * back, and the retry count is greater than 0. If we ever want * more sophisticated initiator type behavior, the CAM error * recovery code in ../common might be helpful. */ tio = io->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr; if (((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) && (io->io_hdr.retries > 0)) { ctl_io_status old_status; tpc_error_action error_action; error_action = tpc_error_parse(io); switch (error_action & TPC_ERR_MASK) { case TPC_ERR_FAIL: break; case TPC_ERR_RETRY: default: if ((error_action & TPC_ERR_NO_DECREMENT) == 0) io->io_hdr.retries--; old_status = io->io_hdr.status; io->io_hdr.status = CTL_STATUS_NONE; io->io_hdr.flags &= ~CTL_FLAG_ABORT; io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC; if (tpcl_queue(io, tio->lun) != CTL_RETVAL_COMPLETE) { printf("%s: error returned from ctl_queue()!\n", __func__); io->io_hdr.status = old_status; } else return; } } if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) tio->list->error = 1; else atomic_add_int(&tio->list->curops, 1); if (!tio->list->error && !tio->list->abort) { while ((tior = TAILQ_FIRST(&tio->run)) != NULL) { TAILQ_REMOVE(&tio->run, tior, rlinks); atomic_add_int(&tio->list->tbdio, 1); if (tpcl_queue(tior->io, tior->lun) != CTL_RETVAL_COMPLETE) panic("tpcl_queue() error"); } } if (atomic_fetchadd_int(&tio->list->tbdio, -1) == 1) tpc_process(tio->list); } int ctl_extended_copy_lid1(struct ctl_scsiio *ctsio) { struct scsi_extended_copy *cdb; struct scsi_extended_copy_lid1_data *data; struct ctl_lun *lun; struct tpc_list *list, *tlist; uint8_t *ptr; char *value; int len, off, lencscd, lenseg, leninl, nseg; CTL_DEBUG_PRINT(("ctl_extended_copy_lid1\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_extended_copy *)ctsio->cdb; len = scsi_4btoul(cdb->length); + if (len == 0) { + ctl_set_success(ctsio); + goto done; + } if (len < sizeof(struct scsi_extended_copy_lid1_data) || len > sizeof(struct scsi_extended_copy_lid1_data) + TPC_MAX_LIST + TPC_MAX_INLINE) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 9, /*bit_valid*/ 0, /*bit*/ 0); goto done; } /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK); ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } data = (struct scsi_extended_copy_lid1_data *)ctsio->kern_data_ptr; lencscd = scsi_2btoul(data->cscd_list_length); lenseg = scsi_4btoul(data->segment_list_length); leninl = scsi_4btoul(data->inline_data_length); - if (len < sizeof(struct scsi_extended_copy_lid1_data) + - lencscd + lenseg + leninl || - leninl > TPC_MAX_INLINE) { - ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, - /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); - goto done; - } if (lencscd > TPC_MAX_CSCDS * sizeof(struct scsi_ec_cscd)) { ctl_set_sense(ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST, /*asc*/ 0x26, /*ascq*/ 0x06, SSD_ELEM_NONE); goto done; } - if (lencscd + lenseg > TPC_MAX_LIST) { + if (lenseg > TPC_MAX_SEGS * sizeof(struct scsi_ec_segment)) { + ctl_set_sense(ctsio, /*current_error*/ 1, + /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST, + /*asc*/ 0x26, /*ascq*/ 0x08, SSD_ELEM_NONE); + goto done; + } + if (lencscd + lenseg > TPC_MAX_LIST || + leninl > TPC_MAX_INLINE || + len < sizeof(struct scsi_extended_copy_lid1_data) + + lencscd + lenseg + leninl) { ctl_set_param_len_error(ctsio); goto done; } list = malloc(sizeof(struct tpc_list), M_CTL, M_WAITOK | M_ZERO); list->service_action = cdb->service_action; value = ctl_get_opt(&lun->be_lun->options, "insecure_tpc"); if (value != NULL && strcmp(value, "on") == 0) list->init_port = -1; else list->init_port = ctsio->io_hdr.nexus.targ_port; list->init_idx = ctl_get_initindex(&ctsio->io_hdr.nexus); list->list_id = data->list_identifier; list->flags = data->flags; list->params = ctsio->kern_data_ptr; list->cscd = (struct scsi_ec_cscd *)&data->data[0]; ptr = &data->data[lencscd]; for (nseg = 0, off = 0; off < lenseg; nseg++) { if (nseg >= TPC_MAX_SEGS) { free(list, M_CTL); ctl_set_sense(ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST, /*asc*/ 0x26, /*ascq*/ 0x08, SSD_ELEM_NONE); goto done; } list->seg[nseg] = (struct scsi_ec_segment *)(ptr + off); off += sizeof(struct scsi_ec_segment) + scsi_2btoul(list->seg[nseg]->descr_length); } list->inl = &data->data[lencscd + lenseg]; list->ncscd = lencscd / sizeof(struct scsi_ec_cscd); list->nseg = nseg; list->leninl = leninl; list->ctsio = ctsio; list->lun = lun; mtx_lock(&lun->lun_lock); if ((list->flags & EC_LIST_ID_USAGE_MASK) != EC_LIST_ID_USAGE_NONE) { tlist = tpc_find_list(lun, list->list_id, list->init_idx); if (tlist != NULL && !tlist->completed) { mtx_unlock(&lun->lun_lock); free(list, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); goto done; } if (tlist != NULL) { TAILQ_REMOVE(&lun->tpc_lists, tlist, links); free(tlist, M_CTL); } } TAILQ_INSERT_TAIL(&lun->tpc_lists, list, links); mtx_unlock(&lun->lun_lock); tpc_process(list); return (CTL_RETVAL_COMPLETE); done: if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_extended_copy_lid4(struct ctl_scsiio *ctsio) { struct scsi_extended_copy *cdb; struct scsi_extended_copy_lid4_data *data; struct ctl_lun *lun; struct tpc_list *list, *tlist; uint8_t *ptr; char *value; int len, off, lencscd, lenseg, leninl, nseg; CTL_DEBUG_PRINT(("ctl_extended_copy_lid4\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; cdb = (struct scsi_extended_copy *)ctsio->cdb; len = scsi_4btoul(cdb->length); + if (len == 0) { + ctl_set_success(ctsio); + goto done; + } if (len < sizeof(struct scsi_extended_copy_lid4_data) || len > sizeof(struct scsi_extended_copy_lid4_data) + TPC_MAX_LIST + TPC_MAX_INLINE) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 9, /*bit_valid*/ 0, /*bit*/ 0); goto done; } /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK); ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } data = (struct scsi_extended_copy_lid4_data *)ctsio->kern_data_ptr; lencscd = scsi_2btoul(data->cscd_list_length); lenseg = scsi_2btoul(data->segment_list_length); leninl = scsi_2btoul(data->inline_data_length); - if (len < sizeof(struct scsi_extended_copy_lid4_data) + - lencscd + lenseg + leninl || - leninl > TPC_MAX_INLINE) { - ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, - /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); - goto done; - } if (lencscd > TPC_MAX_CSCDS * sizeof(struct scsi_ec_cscd)) { ctl_set_sense(ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST, /*asc*/ 0x26, /*ascq*/ 0x06, SSD_ELEM_NONE); goto done; } - if (lencscd + lenseg > TPC_MAX_LIST) { + if (lenseg > TPC_MAX_SEGS * sizeof(struct scsi_ec_segment)) { + ctl_set_sense(ctsio, /*current_error*/ 1, + /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST, + /*asc*/ 0x26, /*ascq*/ 0x08, SSD_ELEM_NONE); + goto done; + } + if (lencscd + lenseg > TPC_MAX_LIST || + leninl > TPC_MAX_INLINE || + len < sizeof(struct scsi_extended_copy_lid1_data) + + lencscd + lenseg + leninl) { ctl_set_param_len_error(ctsio); goto done; } list = malloc(sizeof(struct tpc_list), M_CTL, M_WAITOK | M_ZERO); list->service_action = cdb->service_action; value = ctl_get_opt(&lun->be_lun->options, "insecure_tpc"); if (value != NULL && strcmp(value, "on") == 0) list->init_port = -1; else list->init_port = ctsio->io_hdr.nexus.targ_port; list->init_idx = ctl_get_initindex(&ctsio->io_hdr.nexus); list->list_id = scsi_4btoul(data->list_identifier); list->flags = data->flags; list->params = ctsio->kern_data_ptr; list->cscd = (struct scsi_ec_cscd *)&data->data[0]; ptr = &data->data[lencscd]; for (nseg = 0, off = 0; off < lenseg; nseg++) { if (nseg >= TPC_MAX_SEGS) { free(list, M_CTL); ctl_set_sense(ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST, /*asc*/ 0x26, /*ascq*/ 0x08, SSD_ELEM_NONE); goto done; } list->seg[nseg] = (struct scsi_ec_segment *)(ptr + off); off += sizeof(struct scsi_ec_segment) + scsi_2btoul(list->seg[nseg]->descr_length); } list->inl = &data->data[lencscd + lenseg]; list->ncscd = lencscd / sizeof(struct scsi_ec_cscd); list->nseg = nseg; list->leninl = leninl; list->ctsio = ctsio; list->lun = lun; mtx_lock(&lun->lun_lock); if ((list->flags & EC_LIST_ID_USAGE_MASK) != EC_LIST_ID_USAGE_NONE) { tlist = tpc_find_list(lun, list->list_id, list->init_idx); if (tlist != NULL && !tlist->completed) { mtx_unlock(&lun->lun_lock); free(list, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); goto done; } if (tlist != NULL) { TAILQ_REMOVE(&lun->tpc_lists, tlist, links); free(tlist, M_CTL); } } TAILQ_INSERT_TAIL(&lun->tpc_lists, list, links); mtx_unlock(&lun->lun_lock); tpc_process(list); return (CTL_RETVAL_COMPLETE); done: if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static void tpc_create_token(struct ctl_lun *lun, struct ctl_port *port, off_t len, struct scsi_token *token) { static int id = 0; struct scsi_vpd_id_descriptor *idd = NULL; struct scsi_ec_cscd_id *cscd; struct scsi_read_capacity_data_long *dtsd; int targid_len; scsi_ulto4b(ROD_TYPE_AUR, token->type); scsi_ulto2b(0x01f8, token->length); scsi_u64to8b(atomic_fetchadd_int(&id, 1), &token->body[0]); if (lun->lun_devid) idd = scsi_get_devid_desc((struct scsi_vpd_id_descriptor *) lun->lun_devid->data, lun->lun_devid->len, scsi_devid_is_lun_naa); if (idd == NULL && lun->lun_devid) idd = scsi_get_devid_desc((struct scsi_vpd_id_descriptor *) lun->lun_devid->data, lun->lun_devid->len, scsi_devid_is_lun_eui64); if (idd != NULL) { cscd = (struct scsi_ec_cscd_id *)&token->body[8]; cscd->type_code = EC_CSCD_ID; cscd->luidt_pdt = T_DIRECT; memcpy(&cscd->codeset, idd, 4 + idd->length); scsi_ulto3b(lun->be_lun->blocksize, cscd->dtsp.block_length); } scsi_u64to8b(0, &token->body[40]); /* XXX: Should be 128bit value. */ scsi_u64to8b(len, &token->body[48]); /* ROD token device type specific data (RC16 without first field) */ dtsd = (struct scsi_read_capacity_data_long *)&token->body[88 - 8]; scsi_ulto4b(lun->be_lun->blocksize, dtsd->length); dtsd->prot_lbppbe = lun->be_lun->pblockexp & SRC16_LBPPBE; scsi_ulto2b(lun->be_lun->pblockoff & SRC16_LALBA_A, dtsd->lalba_lbp); if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) dtsd->lalba_lbp[0] |= SRC16_LBPME | SRC16_LBPRZ; if (port->target_devid) { targid_len = port->target_devid->len; memcpy(&token->body[120], port->target_devid->data, targid_len); } else targid_len = 32; arc4rand(&token->body[120 + targid_len], 384 - targid_len, 0); }; int ctl_populate_token(struct ctl_scsiio *ctsio) { struct scsi_populate_token *cdb; struct scsi_populate_token_data *data; struct ctl_softc *softc; struct ctl_lun *lun; struct ctl_port *port; struct tpc_list *list, *tlist; struct tpc_token *token; int len, lendesc; CTL_DEBUG_PRINT(("ctl_populate_token\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; softc = lun->ctl_softc; port = softc->ctl_ports[ctsio->io_hdr.nexus.targ_port]; cdb = (struct scsi_populate_token *)ctsio->cdb; len = scsi_4btoul(cdb->length); if (len < sizeof(struct scsi_populate_token_data) || len > sizeof(struct scsi_populate_token_data) + TPC_MAX_SEGS * sizeof(struct scsi_range_desc)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 9, /*bit_valid*/ 0, /*bit*/ 0); goto done; } /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK); ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } data = (struct scsi_populate_token_data *)ctsio->kern_data_ptr; lendesc = scsi_2btoul(data->range_descriptor_length); if (len < sizeof(struct scsi_populate_token_data) + lendesc) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); goto done; } /* printf("PT(list=%u) flags=%x to=%d rt=%x len=%x\n", scsi_4btoul(cdb->list_identifier), data->flags, scsi_4btoul(data->inactivity_timeout), scsi_4btoul(data->rod_type), scsi_2btoul(data->range_descriptor_length)); */ if ((data->flags & EC_PT_RTV) && scsi_4btoul(data->rod_type) != ROD_TYPE_AUR) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 8, /*bit_valid*/ 0, /*bit*/ 0); goto done; } list = malloc(sizeof(struct tpc_list), M_CTL, M_WAITOK | M_ZERO); list->service_action = cdb->service_action; list->init_port = ctsio->io_hdr.nexus.targ_port; list->init_idx = ctl_get_initindex(&ctsio->io_hdr.nexus); list->list_id = scsi_4btoul(cdb->list_identifier); list->flags = data->flags; list->ctsio = ctsio; list->lun = lun; mtx_lock(&lun->lun_lock); tlist = tpc_find_list(lun, list->list_id, list->init_idx); if (tlist != NULL && !tlist->completed) { mtx_unlock(&lun->lun_lock); free(list, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); goto done; } if (tlist != NULL) { TAILQ_REMOVE(&lun->tpc_lists, tlist, links); free(tlist, M_CTL); } TAILQ_INSERT_TAIL(&lun->tpc_lists, list, links); mtx_unlock(&lun->lun_lock); token = malloc(sizeof(*token), M_CTL, M_WAITOK | M_ZERO); token->lun = lun->lun; token->blocksize = lun->be_lun->blocksize; token->params = ctsio->kern_data_ptr; token->range = &data->desc[0]; token->nrange = scsi_2btoul(data->range_descriptor_length) / sizeof(struct scsi_range_desc); list->cursectors = tpc_ranges_length(token->range, token->nrange); list->curbytes = (off_t)list->cursectors * lun->be_lun->blocksize; tpc_create_token(lun, port, list->curbytes, (struct scsi_token *)token->token); token->active = 0; token->last_active = time_uptime; token->timeout = scsi_4btoul(data->inactivity_timeout); if (token->timeout == 0) token->timeout = TPC_DFL_TOKEN_TIMEOUT; else if (token->timeout < TPC_MIN_TOKEN_TIMEOUT) token->timeout = TPC_MIN_TOKEN_TIMEOUT; else if (token->timeout > TPC_MAX_TOKEN_TIMEOUT) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 4, /*bit_valid*/ 0, /*bit*/ 0); } memcpy(list->res_token, token->token, sizeof(list->res_token)); list->res_token_valid = 1; list->curseg = 0; list->completed = 1; list->last_active = time_uptime; mtx_lock(&softc->tpc_lock); TAILQ_INSERT_TAIL(&softc->tpc_tokens, token, links); mtx_unlock(&softc->tpc_lock); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); done: if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_write_using_token(struct ctl_scsiio *ctsio) { struct scsi_write_using_token *cdb; struct scsi_write_using_token_data *data; struct ctl_softc *softc; struct ctl_lun *lun; struct tpc_list *list, *tlist; struct tpc_token *token; int len, lendesc; CTL_DEBUG_PRINT(("ctl_write_using_token\n")); lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; softc = lun->ctl_softc; cdb = (struct scsi_write_using_token *)ctsio->cdb; len = scsi_4btoul(cdb->length); if (len < sizeof(struct scsi_populate_token_data) || len > sizeof(struct scsi_populate_token_data) + TPC_MAX_SEGS * sizeof(struct scsi_range_desc)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 9, /*bit_valid*/ 0, /*bit*/ 0); goto done; } /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK); ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } data = (struct scsi_write_using_token_data *)ctsio->kern_data_ptr; lendesc = scsi_2btoul(data->range_descriptor_length); if (len < sizeof(struct scsi_populate_token_data) + lendesc) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); goto done; } /* printf("WUT(list=%u) flags=%x off=%ju len=%x\n", scsi_4btoul(cdb->list_identifier), data->flags, scsi_8btou64(data->offset_into_rod), scsi_2btoul(data->range_descriptor_length)); */ list = malloc(sizeof(struct tpc_list), M_CTL, M_WAITOK | M_ZERO); list->service_action = cdb->service_action; list->init_port = ctsio->io_hdr.nexus.targ_port; list->init_idx = ctl_get_initindex(&ctsio->io_hdr.nexus); list->list_id = scsi_4btoul(cdb->list_identifier); list->flags = data->flags; list->params = ctsio->kern_data_ptr; list->range = &data->desc[0]; list->nrange = scsi_2btoul(data->range_descriptor_length) / sizeof(struct scsi_range_desc); list->offset_into_rod = scsi_8btou64(data->offset_into_rod); list->ctsio = ctsio; list->lun = lun; mtx_lock(&lun->lun_lock); tlist = tpc_find_list(lun, list->list_id, list->init_idx); if (tlist != NULL && !tlist->completed) { mtx_unlock(&lun->lun_lock); free(list, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); goto done; } if (tlist != NULL) { TAILQ_REMOVE(&lun->tpc_lists, tlist, links); free(tlist, M_CTL); } TAILQ_INSERT_TAIL(&lun->tpc_lists, list, links); mtx_unlock(&lun->lun_lock); /* Block device zero ROD token -> no token. */ if (scsi_4btoul(data->rod_token) == ROD_TYPE_BLOCK_ZERO) { tpc_process(list); return (CTL_RETVAL_COMPLETE); } mtx_lock(&softc->tpc_lock); TAILQ_FOREACH(token, &softc->tpc_tokens, links) { if (memcmp(token->token, data->rod_token, sizeof(data->rod_token)) == 0) break; } if (token != NULL) { token->active++; list->token = token; if (data->flags & EC_WUT_DEL_TKN) token->timeout = 0; } mtx_unlock(&softc->tpc_lock); if (token == NULL) { mtx_lock(&lun->lun_lock); TAILQ_REMOVE(&lun->tpc_lists, list, links); mtx_unlock(&lun->lun_lock); free(list, M_CTL); ctl_set_sense(ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST, /*asc*/ 0x23, /*ascq*/ 0x04, SSD_ELEM_NONE); goto done; } tpc_process(list); return (CTL_RETVAL_COMPLETE); done: if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_receive_rod_token_information(struct ctl_scsiio *ctsio) { struct ctl_lun *lun; struct scsi_receive_rod_token_information *cdb; struct scsi_receive_copy_status_lid4_data *data; struct tpc_list *list; struct tpc_list list_copy; uint8_t *ptr; int retval; int alloc_len, total_len, token_len; uint32_t list_id; CTL_DEBUG_PRINT(("ctl_receive_rod_token_information\n")); cdb = (struct scsi_receive_rod_token_information *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; retval = CTL_RETVAL_COMPLETE; list_id = scsi_4btoul(cdb->list_identifier); mtx_lock(&lun->lun_lock); list = tpc_find_list(lun, list_id, ctl_get_initindex(&ctsio->io_hdr.nexus)); if (list == NULL) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (retval); } list_copy = *list; if (list->completed) { TAILQ_REMOVE(&lun->tpc_lists, list, links); free(list, M_CTL); } mtx_unlock(&lun->lun_lock); token_len = list_copy.res_token_valid ? 2 + sizeof(list_copy.res_token) : 0; total_len = sizeof(*data) + list_copy.sense_len + 4 + token_len; alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; data = (struct scsi_receive_copy_status_lid4_data *)ctsio->kern_data_ptr; scsi_ulto4b(sizeof(*data) - 4 + list_copy.sense_len + 4 + token_len, data->available_data); data->response_to_service_action = list_copy.service_action; if (list_copy.completed) { if (list_copy.error) data->copy_command_status = RCS_CCS_ERROR; else if (list_copy.abort) data->copy_command_status = RCS_CCS_ABORTED; else data->copy_command_status = RCS_CCS_COMPLETED; } else data->copy_command_status = RCS_CCS_INPROG_FG; scsi_ulto2b(list_copy.curops, data->operation_counter); scsi_ulto4b(UINT32_MAX, data->estimated_status_update_delay); data->transfer_count_units = RCS_TC_LBAS; scsi_u64to8b(list_copy.cursectors, data->transfer_count); scsi_ulto2b(list_copy.curseg, data->segments_processed); data->length_of_the_sense_data_field = list_copy.sense_len; data->sense_data_length = list_copy.sense_len; memcpy(data->sense_data, &list_copy.sense_data, list_copy.sense_len); ptr = &data->sense_data[data->length_of_the_sense_data_field]; scsi_ulto4b(token_len, &ptr[0]); if (list_copy.res_token_valid) { scsi_ulto2b(0, &ptr[4]); memcpy(&ptr[6], list_copy.res_token, sizeof(list_copy.res_token)); } /* printf("RRTI(list=%u) valid=%d\n", scsi_4btoul(cdb->list_identifier), list_copy.res_token_valid); */ ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_report_all_rod_tokens(struct ctl_scsiio *ctsio) { struct ctl_softc *softc; struct ctl_lun *lun; struct scsi_report_all_rod_tokens *cdb; struct scsi_report_all_rod_tokens_data *data; struct tpc_token *token; int retval; int alloc_len, total_len, tokens, i; CTL_DEBUG_PRINT(("ctl_receive_rod_token_information\n")); cdb = (struct scsi_report_all_rod_tokens *)ctsio->cdb; lun = (struct ctl_lun *)ctsio->io_hdr.ctl_private[CTL_PRIV_LUN].ptr; softc = lun->ctl_softc; retval = CTL_RETVAL_COMPLETE; tokens = 0; mtx_lock(&softc->tpc_lock); TAILQ_FOREACH(token, &softc->tpc_tokens, links) tokens++; mtx_unlock(&softc->tpc_lock); if (tokens > 512) tokens = 512; total_len = sizeof(*data) + tokens * 96; alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; if (total_len < alloc_len) { ctsio->residual = alloc_len - total_len; ctsio->kern_data_len = total_len; ctsio->kern_total_len = total_len; } else { ctsio->residual = 0; ctsio->kern_data_len = alloc_len; ctsio->kern_total_len = alloc_len; } ctsio->kern_data_resid = 0; ctsio->kern_rel_offset = 0; data = (struct scsi_report_all_rod_tokens_data *)ctsio->kern_data_ptr; i = 0; mtx_lock(&softc->tpc_lock); TAILQ_FOREACH(token, &softc->tpc_tokens, links) { if (i >= tokens) break; memcpy(&data->rod_management_token_list[i * 96], token->token, 96); i++; } mtx_unlock(&softc->tpc_lock); scsi_ulto4b(sizeof(*data) - 4 + i * 96, data->available_data); /* printf("RART tokens=%d\n", i); */ ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } Index: projects/iosched/sys/cam/ctl/ctl_tpc_local.c =================================================================== --- projects/iosched/sys/cam/ctl/ctl_tpc_local.c (revision 287721) +++ projects/iosched/sys/cam/ctl/ctl_tpc_local.c (revision 287722) @@ -1,337 +1,338 @@ /*- * Copyright (c) 2014 Alexander Motin * Copyright (c) 2004, 2005 Silicon Graphics International Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct tpcl_softc { struct ctl_port port; int cur_tag_num; }; static struct tpcl_softc tpcl_softc; static int tpcl_init(void); static void tpcl_shutdown(void); static void tpcl_datamove(union ctl_io *io); static void tpcl_done(union ctl_io *io); static struct ctl_frontend tpcl_frontend = { .name = "tpc", .init = tpcl_init, .shutdown = tpcl_shutdown, }; CTL_FRONTEND_DECLARE(ctltpc, tpcl_frontend); static int tpcl_init(void) { struct tpcl_softc *tsoftc = &tpcl_softc; struct ctl_port *port; struct scsi_transportid_spi *tid; int len; memset(tsoftc, 0, sizeof(*tsoftc)); port = &tsoftc->port; port->frontend = &tpcl_frontend; port->port_type = CTL_PORT_INTERNAL; port->num_requested_ctl_io = 100; port->port_name = "tpc"; port->fe_datamove = tpcl_datamove; port->fe_done = tpcl_done; port->max_targets = 1; port->max_target_id = 0; port->targ_port = -1; port->max_initiators = 1; if (ctl_port_register(port) != 0) { printf("%s: ctl_port_register() failed with error\n", __func__); return (0); } len = sizeof(struct scsi_transportid_spi); port->init_devid = malloc(sizeof(struct ctl_devid) + len, M_CTL, M_WAITOK | M_ZERO); port->init_devid->len = len; tid = (struct scsi_transportid_spi *)port->init_devid->data; tid->format_protocol = SCSI_TRN_SPI_FORMAT_DEFAULT | SCSI_PROTO_SPI; scsi_ulto2b(0, tid->scsi_addr); scsi_ulto2b(port->targ_port, tid->rel_trgt_port_id); ctl_port_online(port); return (0); } void tpcl_shutdown(void) { struct tpcl_softc *tsoftc = &tpcl_softc; struct ctl_port *port; port = &tsoftc->port; ctl_port_offline(port); if (ctl_port_deregister(&tsoftc->port) != 0) printf("%s: ctl_frontend_deregister() failed\n", __func__); } static void tpcl_datamove(union ctl_io *io) { struct ctl_sg_entry *ext_sglist, *kern_sglist; struct ctl_sg_entry ext_entry, kern_entry; int ext_sg_entries, kern_sg_entries; int ext_sg_start, ext_offset; int len_to_copy, len_copied; int kern_watermark, ext_watermark; struct ctl_scsiio *ctsio; int i, j; ext_sg_start = 0; ext_offset = 0; ext_sglist = NULL; CTL_DEBUG_PRINT(("%s\n", __func__)); ctsio = &io->scsiio; /* * If this is the case, we're probably doing a BBR read and don't * actually need to transfer the data. This will effectively * bit-bucket the data. */ if (ctsio->ext_data_ptr == NULL) goto bailout; /* * To simplify things here, if we have a single buffer, stick it in * a S/G entry and just make it a single entry S/G list. */ if (ctsio->io_hdr.flags & CTL_FLAG_EDPTR_SGLIST) { int len_seen; ext_sglist = (struct ctl_sg_entry *)ctsio->ext_data_ptr; ext_sg_entries = ctsio->ext_sg_entries; ext_sg_start = 0; ext_offset = 0; len_seen = 0; for (i = 0; i < ext_sg_entries; i++) { if ((len_seen + ext_sglist[i].len) >= ctsio->ext_data_filled) { ext_sg_start = i; ext_offset = ctsio->ext_data_filled - len_seen; break; } len_seen += ext_sglist[i].len; } } else { ext_sglist = &ext_entry; ext_sglist->addr = ctsio->ext_data_ptr; ext_sglist->len = ctsio->ext_data_len; ext_sg_entries = 1; ext_sg_start = 0; ext_offset = ctsio->ext_data_filled; } if (ctsio->kern_sg_entries > 0) { kern_sglist = (struct ctl_sg_entry *)ctsio->kern_data_ptr; kern_sg_entries = ctsio->kern_sg_entries; } else { kern_sglist = &kern_entry; kern_sglist->addr = ctsio->kern_data_ptr; kern_sglist->len = ctsio->kern_data_len; kern_sg_entries = 1; } kern_watermark = 0; ext_watermark = ext_offset; len_copied = 0; for (i = ext_sg_start, j = 0; i < ext_sg_entries && j < kern_sg_entries;) { uint8_t *ext_ptr, *kern_ptr; len_to_copy = min(ext_sglist[i].len - ext_watermark, kern_sglist[j].len - kern_watermark); ext_ptr = (uint8_t *)ext_sglist[i].addr; ext_ptr = ext_ptr + ext_watermark; if (io->io_hdr.flags & CTL_FLAG_BUS_ADDR) { /* * XXX KDM fix this! */ panic("need to implement bus address support"); #if 0 kern_ptr = bus_to_virt(kern_sglist[j].addr); #endif } else kern_ptr = (uint8_t *)kern_sglist[j].addr; kern_ptr = kern_ptr + kern_watermark; kern_watermark += len_to_copy; ext_watermark += len_to_copy; if ((ctsio->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) { CTL_DEBUG_PRINT(("%s: copying %d bytes to user\n", __func__, len_to_copy)); CTL_DEBUG_PRINT(("%s: from %p to %p\n", __func__, kern_ptr, ext_ptr)); memcpy(ext_ptr, kern_ptr, len_to_copy); } else { CTL_DEBUG_PRINT(("%s: copying %d bytes from user\n", __func__, len_to_copy)); CTL_DEBUG_PRINT(("%s: from %p to %p\n", __func__, ext_ptr, kern_ptr)); memcpy(kern_ptr, ext_ptr, len_to_copy); } len_copied += len_to_copy; if (ext_sglist[i].len == ext_watermark) { i++; ext_watermark = 0; } if (kern_sglist[j].len == kern_watermark) { j++; kern_watermark = 0; } } ctsio->ext_data_filled += len_copied; CTL_DEBUG_PRINT(("%s: ext_sg_entries: %d, kern_sg_entries: %d\n", __func__, ext_sg_entries, kern_sg_entries)); CTL_DEBUG_PRINT(("%s: ext_data_len = %d, kern_data_len = %d\n", __func__, ctsio->ext_data_len, ctsio->kern_data_len)); /* XXX KDM set residual?? */ bailout: io->scsiio.be_move_done(io); } static void tpcl_done(union ctl_io *io) { tpc_done(io); } uint64_t tpcl_resolve(struct ctl_softc *softc, int init_port, struct scsi_ec_cscd *cscd, uint32_t *ss, uint32_t *ps, uint32_t *pso) { struct scsi_ec_cscd_id *cscdid; struct ctl_port *port; struct ctl_lun *lun; uint64_t lunid = UINT64_MAX; - if (cscd->type_code != EC_CSCD_ID) + if (cscd->type_code != EC_CSCD_ID || + (cscd->luidt_pdt & EC_LUIDT_MASK) != EC_LUIDT_LUN) return (lunid); cscdid = (struct scsi_ec_cscd_id *)cscd; mtx_lock(&softc->ctl_lock); if (init_port >= 0) port = softc->ctl_ports[init_port]; else port = NULL; STAILQ_FOREACH(lun, &softc->lun_list, links) { if (port != NULL && ctl_lun_map_to_port(port, lun->lun) >= CTL_MAX_LUNS) continue; if (lun->lun_devid == NULL) continue; if (scsi_devid_match(lun->lun_devid->data, lun->lun_devid->len, &cscdid->codeset, cscdid->length + 4) == 0) { lunid = lun->lun; if (ss && lun->be_lun) *ss = lun->be_lun->blocksize; if (ps && lun->be_lun) *ps = lun->be_lun->blocksize << lun->be_lun->pblockexp; if (pso && lun->be_lun) *pso = lun->be_lun->blocksize * lun->be_lun->pblockoff; break; } } mtx_unlock(&softc->ctl_lock); return (lunid); }; union ctl_io * tpcl_alloc_io(void) { struct tpcl_softc *tsoftc = &tpcl_softc; return (ctl_alloc_io(tsoftc->port.ctl_pool_ref)); }; int tpcl_queue(union ctl_io *io, uint64_t lun) { struct tpcl_softc *tsoftc = &tpcl_softc; io->io_hdr.nexus.initid = 0; io->io_hdr.nexus.targ_port = tsoftc->port.targ_port; io->io_hdr.nexus.targ_lun = lun; io->scsiio.tag_num = atomic_fetchadd_int(&tsoftc->cur_tag_num, 1); io->scsiio.ext_data_filled = 0; return (ctl_queue(io)); } Index: projects/iosched/sys/cam/scsi/scsi_all.c =================================================================== --- projects/iosched/sys/cam/scsi/scsi_all.c (revision 287721) +++ projects/iosched/sys/cam/scsi/scsi_all.c (revision 287722) @@ -1,8641 +1,8642 @@ /*- * Implementation of Utility functions for all SCSI device types. * * Copyright (c) 1997, 1998, 1999 Justin T. Gibbs. * Copyright (c) 1997, 1998, 2003 Kenneth D. Merry. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #else #include #include #include #include #include #endif #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #else #include #include #ifndef FALSE #define FALSE 0 #endif /* FALSE */ #ifndef TRUE #define TRUE 1 #endif /* TRUE */ #define ERESTART -1 /* restart syscall */ #define EJUSTRETURN -2 /* don't modify regs, just return */ #endif /* !_KERNEL */ /* * This is the default number of milliseconds we wait for devices to settle * after a SCSI bus reset. */ #ifndef SCSI_DELAY #define SCSI_DELAY 2000 #endif /* * All devices need _some_ sort of bus settle delay, so we'll set it to * a minimum value of 100ms. Note that this is pertinent only for SPI- * not transport like Fibre Channel or iSCSI where 'delay' is completely * meaningless. */ #ifndef SCSI_MIN_DELAY #define SCSI_MIN_DELAY 100 #endif /* * Make sure the user isn't using seconds instead of milliseconds. */ #if (SCSI_DELAY < SCSI_MIN_DELAY && SCSI_DELAY != 0) #error "SCSI_DELAY is in milliseconds, not seconds! Please use a larger value" #endif int scsi_delay; static int ascentrycomp(const void *key, const void *member); static int senseentrycomp(const void *key, const void *member); static void fetchtableentries(int sense_key, int asc, int ascq, struct scsi_inquiry_data *, const struct sense_key_table_entry **, const struct asc_table_entry **); #ifdef _KERNEL static void init_scsi_delay(void); static int sysctl_scsi_delay(SYSCTL_HANDLER_ARGS); static int set_scsi_delay(int delay); #endif #if !defined(SCSI_NO_OP_STRINGS) #define D (1 << T_DIRECT) #define T (1 << T_SEQUENTIAL) #define L (1 << T_PRINTER) #define P (1 << T_PROCESSOR) #define W (1 << T_WORM) #define R (1 << T_CDROM) #define O (1 << T_OPTICAL) #define M (1 << T_CHANGER) #define A (1 << T_STORARRAY) #define E (1 << T_ENCLOSURE) #define B (1 << T_RBC) #define K (1 << T_OCRW) #define V (1 << T_ADC) #define F (1 << T_OSD) #define S (1 << T_SCANNER) #define C (1 << T_COMM) #define ALL (D | T | L | P | W | R | O | M | A | E | B | K | V | F | S | C) static struct op_table_entry plextor_cd_ops[] = { { 0xD8, R, "CD-DA READ" } }; static struct scsi_op_quirk_entry scsi_op_quirk_table[] = { { /* * I believe that 0xD8 is the Plextor proprietary command * to read CD-DA data. I'm not sure which Plextor CDROM * models support the command, though. I know for sure * that the 4X, 8X, and 12X models do, and presumably the * 12-20X does. I don't know about any earlier models, * though. If anyone has any more complete information, * feel free to change this quirk entry. */ {T_CDROM, SIP_MEDIA_REMOVABLE, "PLEXTOR", "CD-ROM PX*", "*"}, sizeof(plextor_cd_ops)/sizeof(struct op_table_entry), plextor_cd_ops } }; static struct op_table_entry scsi_op_codes[] = { /* * From: http://www.t10.org/lists/op-num.txt * Modifications by Kenneth Merry (ken@FreeBSD.ORG) * and Jung-uk Kim (jkim@FreeBSD.org) * * Note: order is important in this table, scsi_op_desc() currently * depends on the opcodes in the table being in order to save * search time. * Note: scanner and comm. devices are carried over from the previous * version because they were removed in the latest spec. */ /* File: OP-NUM.TXT * * SCSI Operation Codes * Numeric Sorted Listing * as of 3/11/08 * * D - DIRECT ACCESS DEVICE (SBC-2) device column key * .T - SEQUENTIAL ACCESS DEVICE (SSC-2) ----------------- * . L - PRINTER DEVICE (SSC) M = Mandatory * . P - PROCESSOR DEVICE (SPC) O = Optional * . .W - WRITE ONCE READ MULTIPLE DEVICE (SBC-2) V = Vendor spec. * . . R - CD/DVE DEVICE (MMC-3) Z = Obsolete * . . O - OPTICAL MEMORY DEVICE (SBC-2) * . . .M - MEDIA CHANGER DEVICE (SMC-2) * . . . A - STORAGE ARRAY DEVICE (SCC-2) * . . . .E - ENCLOSURE SERVICES DEVICE (SES) * . . . .B - SIMPLIFIED DIRECT-ACCESS DEVICE (RBC) * . . . . K - OPTICAL CARD READER/WRITER DEVICE (OCRW) * . . . . V - AUTOMATION/DRIVE INTERFACE (ADC) * . . . . .F - OBJECT-BASED STORAGE (OSD) * OP DTLPWROMAEBKVF Description * -- -------------- ---------------------------------------------- */ /* 00 MMMMMMMMMMMMMM TEST UNIT READY */ { 0x00, ALL, "TEST UNIT READY" }, /* 01 M REWIND */ { 0x01, T, "REWIND" }, /* 01 Z V ZZZZ REZERO UNIT */ { 0x01, D | W | R | O | M, "REZERO UNIT" }, /* 02 VVVVVV V */ /* 03 MMMMMMMMMMOMMM REQUEST SENSE */ { 0x03, ALL, "REQUEST SENSE" }, /* 04 M OO FORMAT UNIT */ { 0x04, D | R | O, "FORMAT UNIT" }, /* 04 O FORMAT MEDIUM */ { 0x04, T, "FORMAT MEDIUM" }, /* 04 O FORMAT */ { 0x04, L, "FORMAT" }, /* 05 VMVVVV V READ BLOCK LIMITS */ { 0x05, T, "READ BLOCK LIMITS" }, /* 06 VVVVVV V */ /* 07 OVV O OV REASSIGN BLOCKS */ { 0x07, D | W | O, "REASSIGN BLOCKS" }, /* 07 O INITIALIZE ELEMENT STATUS */ { 0x07, M, "INITIALIZE ELEMENT STATUS" }, /* 08 MOV O OV READ(6) */ { 0x08, D | T | W | O, "READ(6)" }, /* 08 O RECEIVE */ { 0x08, P, "RECEIVE" }, /* 08 GET MESSAGE(6) */ { 0x08, C, "GET MESSAGE(6)" }, /* 09 VVVVVV V */ /* 0A OO O OV WRITE(6) */ { 0x0A, D | T | W | O, "WRITE(6)" }, /* 0A M SEND(6) */ { 0x0A, P, "SEND(6)" }, /* 0A SEND MESSAGE(6) */ { 0x0A, C, "SEND MESSAGE(6)" }, /* 0A M PRINT */ { 0x0A, L, "PRINT" }, /* 0B Z ZOZV SEEK(6) */ { 0x0B, D | W | R | O, "SEEK(6)" }, /* 0B O SET CAPACITY */ { 0x0B, T, "SET CAPACITY" }, /* 0B O SLEW AND PRINT */ { 0x0B, L, "SLEW AND PRINT" }, /* 0C VVVVVV V */ /* 0D VVVVVV V */ /* 0E VVVVVV V */ /* 0F VOVVVV V READ REVERSE(6) */ { 0x0F, T, "READ REVERSE(6)" }, /* 10 VM VVV WRITE FILEMARKS(6) */ { 0x10, T, "WRITE FILEMARKS(6)" }, /* 10 O SYNCHRONIZE BUFFER */ { 0x10, L, "SYNCHRONIZE BUFFER" }, /* 11 VMVVVV SPACE(6) */ { 0x11, T, "SPACE(6)" }, /* 12 MMMMMMMMMMMMMM INQUIRY */ { 0x12, ALL, "INQUIRY" }, /* 13 V VVVV */ /* 13 O VERIFY(6) */ { 0x13, T, "VERIFY(6)" }, /* 14 VOOVVV RECOVER BUFFERED DATA */ { 0x14, T | L, "RECOVER BUFFERED DATA" }, /* 15 OMO O OOOO OO MODE SELECT(6) */ { 0x15, ALL & ~(P | R | B | F), "MODE SELECT(6)" }, /* 16 ZZMZO OOOZ O RESERVE(6) */ { 0x16, ALL & ~(R | B | V | F | C), "RESERVE(6)" }, /* 16 Z RESERVE ELEMENT(6) */ { 0x16, M, "RESERVE ELEMENT(6)" }, /* 17 ZZMZO OOOZ O RELEASE(6) */ { 0x17, ALL & ~(R | B | V | F | C), "RELEASE(6)" }, /* 17 Z RELEASE ELEMENT(6) */ { 0x17, M, "RELEASE ELEMENT(6)" }, /* 18 ZZZZOZO Z COPY */ { 0x18, D | T | L | P | W | R | O | K | S, "COPY" }, /* 19 VMVVVV ERASE(6) */ { 0x19, T, "ERASE(6)" }, /* 1A OMO O OOOO OO MODE SENSE(6) */ { 0x1A, ALL & ~(P | R | B | F), "MODE SENSE(6)" }, /* 1B O OOO O MO O START STOP UNIT */ { 0x1B, D | W | R | O | A | B | K | F, "START STOP UNIT" }, /* 1B O M LOAD UNLOAD */ { 0x1B, T | V, "LOAD UNLOAD" }, /* 1B SCAN */ { 0x1B, S, "SCAN" }, /* 1B O STOP PRINT */ { 0x1B, L, "STOP PRINT" }, /* 1B O OPEN/CLOSE IMPORT/EXPORT ELEMENT */ { 0x1B, M, "OPEN/CLOSE IMPORT/EXPORT ELEMENT" }, /* 1C OOOOO OOOM OOO RECEIVE DIAGNOSTIC RESULTS */ { 0x1C, ALL & ~(R | B), "RECEIVE DIAGNOSTIC RESULTS" }, /* 1D MMMMM MMOM MMM SEND DIAGNOSTIC */ { 0x1D, ALL & ~(R | B), "SEND DIAGNOSTIC" }, /* 1E OO OOOO O O PREVENT ALLOW MEDIUM REMOVAL */ { 0x1E, D | T | W | R | O | M | K | F, "PREVENT ALLOW MEDIUM REMOVAL" }, /* 1F */ /* 20 V VVV V */ /* 21 V VVV V */ /* 22 V VVV V */ /* 23 V V V V */ /* 23 O READ FORMAT CAPACITIES */ { 0x23, R, "READ FORMAT CAPACITIES" }, /* 24 V VV SET WINDOW */ { 0x24, S, "SET WINDOW" }, /* 25 M M M M READ CAPACITY(10) */ { 0x25, D | W | O | B, "READ CAPACITY(10)" }, /* 25 O READ CAPACITY */ { 0x25, R, "READ CAPACITY" }, /* 25 M READ CARD CAPACITY */ { 0x25, K, "READ CARD CAPACITY" }, /* 25 GET WINDOW */ { 0x25, S, "GET WINDOW" }, /* 26 V VV */ /* 27 V VV */ /* 28 M MOM MM READ(10) */ { 0x28, D | W | R | O | B | K | S, "READ(10)" }, /* 28 GET MESSAGE(10) */ { 0x28, C, "GET MESSAGE(10)" }, /* 29 V VVO READ GENERATION */ { 0x29, O, "READ GENERATION" }, /* 2A O MOM MO WRITE(10) */ { 0x2A, D | W | R | O | B | K, "WRITE(10)" }, /* 2A SEND(10) */ { 0x2A, S, "SEND(10)" }, /* 2A SEND MESSAGE(10) */ { 0x2A, C, "SEND MESSAGE(10)" }, /* 2B Z OOO O SEEK(10) */ { 0x2B, D | W | R | O | K, "SEEK(10)" }, /* 2B O LOCATE(10) */ { 0x2B, T, "LOCATE(10)" }, /* 2B O POSITION TO ELEMENT */ { 0x2B, M, "POSITION TO ELEMENT" }, /* 2C V OO ERASE(10) */ { 0x2C, R | O, "ERASE(10)" }, /* 2D O READ UPDATED BLOCK */ { 0x2D, O, "READ UPDATED BLOCK" }, /* 2D V */ /* 2E O OOO MO WRITE AND VERIFY(10) */ { 0x2E, D | W | R | O | B | K, "WRITE AND VERIFY(10)" }, /* 2F O OOO VERIFY(10) */ { 0x2F, D | W | R | O, "VERIFY(10)" }, /* 30 Z ZZZ SEARCH DATA HIGH(10) */ { 0x30, D | W | R | O, "SEARCH DATA HIGH(10)" }, /* 31 Z ZZZ SEARCH DATA EQUAL(10) */ { 0x31, D | W | R | O, "SEARCH DATA EQUAL(10)" }, /* 31 OBJECT POSITION */ { 0x31, S, "OBJECT POSITION" }, /* 32 Z ZZZ SEARCH DATA LOW(10) */ { 0x32, D | W | R | O, "SEARCH DATA LOW(10)" }, /* 33 Z OZO SET LIMITS(10) */ { 0x33, D | W | R | O, "SET LIMITS(10)" }, /* 34 O O O O PRE-FETCH(10) */ { 0x34, D | W | O | K, "PRE-FETCH(10)" }, /* 34 M READ POSITION */ { 0x34, T, "READ POSITION" }, /* 34 GET DATA BUFFER STATUS */ { 0x34, S, "GET DATA BUFFER STATUS" }, /* 35 O OOO MO SYNCHRONIZE CACHE(10) */ { 0x35, D | W | R | O | B | K, "SYNCHRONIZE CACHE(10)" }, /* 36 Z O O O LOCK UNLOCK CACHE(10) */ { 0x36, D | W | O | K, "LOCK UNLOCK CACHE(10)" }, /* 37 O O READ DEFECT DATA(10) */ { 0x37, D | O, "READ DEFECT DATA(10)" }, /* 37 O INITIALIZE ELEMENT STATUS WITH RANGE */ { 0x37, M, "INITIALIZE ELEMENT STATUS WITH RANGE" }, /* 38 O O O MEDIUM SCAN */ { 0x38, W | O | K, "MEDIUM SCAN" }, /* 39 ZZZZOZO Z COMPARE */ { 0x39, D | T | L | P | W | R | O | K | S, "COMPARE" }, /* 3A ZZZZOZO Z COPY AND VERIFY */ { 0x3A, D | T | L | P | W | R | O | K | S, "COPY AND VERIFY" }, /* 3B OOOOOOOOOOMOOO WRITE BUFFER */ { 0x3B, ALL, "WRITE BUFFER" }, /* 3C OOOOOOOOOO OOO READ BUFFER */ { 0x3C, ALL & ~(B), "READ BUFFER" }, /* 3D O UPDATE BLOCK */ { 0x3D, O, "UPDATE BLOCK" }, /* 3E O O O READ LONG(10) */ { 0x3E, D | W | O, "READ LONG(10)" }, /* 3F O O O WRITE LONG(10) */ { 0x3F, D | W | O, "WRITE LONG(10)" }, /* 40 ZZZZOZOZ CHANGE DEFINITION */ { 0x40, D | T | L | P | W | R | O | M | S | C, "CHANGE DEFINITION" }, /* 41 O WRITE SAME(10) */ { 0x41, D, "WRITE SAME(10)" }, /* 42 O UNMAP */ { 0x42, D, "UNMAP" }, /* 42 O READ SUB-CHANNEL */ { 0x42, R, "READ SUB-CHANNEL" }, /* 43 O READ TOC/PMA/ATIP */ { 0x43, R, "READ TOC/PMA/ATIP" }, /* 44 M M REPORT DENSITY SUPPORT */ { 0x44, T | V, "REPORT DENSITY SUPPORT" }, /* 44 READ HEADER */ /* 45 O PLAY AUDIO(10) */ { 0x45, R, "PLAY AUDIO(10)" }, /* 46 M GET CONFIGURATION */ { 0x46, R, "GET CONFIGURATION" }, /* 47 O PLAY AUDIO MSF */ { 0x47, R, "PLAY AUDIO MSF" }, /* 48 */ /* 49 */ /* 4A M GET EVENT STATUS NOTIFICATION */ { 0x4A, R, "GET EVENT STATUS NOTIFICATION" }, /* 4B O PAUSE/RESUME */ { 0x4B, R, "PAUSE/RESUME" }, /* 4C OOOOO OOOO OOO LOG SELECT */ { 0x4C, ALL & ~(R | B), "LOG SELECT" }, /* 4D OOOOO OOOO OMO LOG SENSE */ { 0x4D, ALL & ~(R | B), "LOG SENSE" }, /* 4E O STOP PLAY/SCAN */ { 0x4E, R, "STOP PLAY/SCAN" }, /* 4F */ /* 50 O XDWRITE(10) */ { 0x50, D, "XDWRITE(10)" }, /* 51 O XPWRITE(10) */ { 0x51, D, "XPWRITE(10)" }, /* 51 O READ DISC INFORMATION */ { 0x51, R, "READ DISC INFORMATION" }, /* 52 O XDREAD(10) */ { 0x52, D, "XDREAD(10)" }, /* 52 O READ TRACK INFORMATION */ { 0x52, R, "READ TRACK INFORMATION" }, /* 53 O RESERVE TRACK */ { 0x53, R, "RESERVE TRACK" }, /* 54 O SEND OPC INFORMATION */ { 0x54, R, "SEND OPC INFORMATION" }, /* 55 OOO OMOOOOMOMO MODE SELECT(10) */ { 0x55, ALL & ~(P), "MODE SELECT(10)" }, /* 56 ZZMZO OOOZ RESERVE(10) */ { 0x56, ALL & ~(R | B | K | V | F | C), "RESERVE(10)" }, /* 56 Z RESERVE ELEMENT(10) */ { 0x56, M, "RESERVE ELEMENT(10)" }, /* 57 ZZMZO OOOZ RELEASE(10) */ { 0x57, ALL & ~(R | B | K | V | F | C), "RELEASE(10)" }, /* 57 Z RELEASE ELEMENT(10) */ { 0x57, M, "RELEASE ELEMENT(10)" }, /* 58 O REPAIR TRACK */ { 0x58, R, "REPAIR TRACK" }, /* 59 */ /* 5A OOO OMOOOOMOMO MODE SENSE(10) */ { 0x5A, ALL & ~(P), "MODE SENSE(10)" }, /* 5B O CLOSE TRACK/SESSION */ { 0x5B, R, "CLOSE TRACK/SESSION" }, /* 5C O READ BUFFER CAPACITY */ { 0x5C, R, "READ BUFFER CAPACITY" }, /* 5D O SEND CUE SHEET */ { 0x5D, R, "SEND CUE SHEET" }, /* 5E OOOOO OOOO M PERSISTENT RESERVE IN */ { 0x5E, ALL & ~(R | B | K | V | C), "PERSISTENT RESERVE IN" }, /* 5F OOOOO OOOO M PERSISTENT RESERVE OUT */ { 0x5F, ALL & ~(R | B | K | V | C), "PERSISTENT RESERVE OUT" }, /* 7E OO O OOOO O extended CDB */ { 0x7E, D | T | R | M | A | E | B | V, "extended CDB" }, /* 7F O M variable length CDB (more than 16 bytes) */ { 0x7F, D | F, "variable length CDB (more than 16 bytes)" }, /* 80 Z XDWRITE EXTENDED(16) */ { 0x80, D, "XDWRITE EXTENDED(16)" }, /* 80 M WRITE FILEMARKS(16) */ { 0x80, T, "WRITE FILEMARKS(16)" }, /* 81 Z REBUILD(16) */ { 0x81, D, "REBUILD(16)" }, /* 81 O READ REVERSE(16) */ { 0x81, T, "READ REVERSE(16)" }, /* 82 Z REGENERATE(16) */ { 0x82, D, "REGENERATE(16)" }, /* 83 OOOOO O OO EXTENDED COPY */ { 0x83, D | T | L | P | W | O | K | V, "EXTENDED COPY" }, /* 84 OOOOO O OO RECEIVE COPY RESULTS */ { 0x84, D | T | L | P | W | O | K | V, "RECEIVE COPY RESULTS" }, /* 85 O O O ATA COMMAND PASS THROUGH(16) */ { 0x85, D | R | B, "ATA COMMAND PASS THROUGH(16)" }, /* 86 OO OO OOOOOOO ACCESS CONTROL IN */ { 0x86, ALL & ~(L | R | F), "ACCESS CONTROL IN" }, /* 87 OO OO OOOOOOO ACCESS CONTROL OUT */ { 0x87, ALL & ~(L | R | F), "ACCESS CONTROL OUT" }, /* * XXX READ(16)/WRITE(16) were not listed for CD/DVE in op-num.txt * but we had it since r1.40. Do we really want them? */ /* 88 MM O O O READ(16) */ { 0x88, D | T | W | O | B, "READ(16)" }, /* 89 O COMPARE AND WRITE*/ { 0x89, D, "COMPARE AND WRITE" }, /* 8A OM O O O WRITE(16) */ { 0x8A, D | T | W | O | B, "WRITE(16)" }, /* 8B O ORWRITE */ { 0x8B, D, "ORWRITE" }, /* 8C OO O OO O M READ ATTRIBUTE */ { 0x8C, D | T | W | O | M | B | V, "READ ATTRIBUTE" }, /* 8D OO O OO O O WRITE ATTRIBUTE */ { 0x8D, D | T | W | O | M | B | V, "WRITE ATTRIBUTE" }, /* 8E O O O O WRITE AND VERIFY(16) */ { 0x8E, D | W | O | B, "WRITE AND VERIFY(16)" }, /* 8F OO O O O VERIFY(16) */ { 0x8F, D | T | W | O | B, "VERIFY(16)" }, /* 90 O O O O PRE-FETCH(16) */ { 0x90, D | W | O | B, "PRE-FETCH(16)" }, /* 91 O O O O SYNCHRONIZE CACHE(16) */ { 0x91, D | W | O | B, "SYNCHRONIZE CACHE(16)" }, /* 91 O SPACE(16) */ { 0x91, T, "SPACE(16)" }, /* 92 Z O O LOCK UNLOCK CACHE(16) */ { 0x92, D | W | O, "LOCK UNLOCK CACHE(16)" }, /* 92 O LOCATE(16) */ { 0x92, T, "LOCATE(16)" }, /* 93 O WRITE SAME(16) */ { 0x93, D, "WRITE SAME(16)" }, /* 93 M ERASE(16) */ { 0x93, T, "ERASE(16)" }, /* 94 [usage proposed by SCSI Socket Services project] */ /* 95 [usage proposed by SCSI Socket Services project] */ /* 96 [usage proposed by SCSI Socket Services project] */ /* 97 [usage proposed by SCSI Socket Services project] */ /* 98 */ /* 99 */ /* 9A */ /* 9B */ - /* 9C */ + /* 9C O WRITE ATOMIC(16) */ + { 0x9C, D, "WRITE ATOMIC(16)" }, /* 9D */ /* XXX KDM ALL for this? op-num.txt defines it for none.. */ /* 9E SERVICE ACTION IN(16) */ { 0x9E, ALL, "SERVICE ACTION IN(16)" }, /* XXX KDM ALL for this? op-num.txt defines it for ADC.. */ /* 9F M SERVICE ACTION OUT(16) */ { 0x9F, ALL, "SERVICE ACTION OUT(16)" }, /* A0 MMOOO OMMM OMO REPORT LUNS */ { 0xA0, ALL & ~(R | B), "REPORT LUNS" }, /* A1 O BLANK */ { 0xA1, R, "BLANK" }, /* A1 O O ATA COMMAND PASS THROUGH(12) */ { 0xA1, D | B, "ATA COMMAND PASS THROUGH(12)" }, /* A2 OO O O SECURITY PROTOCOL IN */ { 0xA2, D | T | R | V, "SECURITY PROTOCOL IN" }, /* A3 OOO O OOMOOOM MAINTENANCE (IN) */ { 0xA3, ALL & ~(P | R | F), "MAINTENANCE (IN)" }, /* A3 O SEND KEY */ { 0xA3, R, "SEND KEY" }, /* A4 OOO O OOOOOOO MAINTENANCE (OUT) */ { 0xA4, ALL & ~(P | R | F), "MAINTENANCE (OUT)" }, /* A4 O REPORT KEY */ { 0xA4, R, "REPORT KEY" }, /* A5 O O OM MOVE MEDIUM */ { 0xA5, T | W | O | M, "MOVE MEDIUM" }, /* A5 O PLAY AUDIO(12) */ { 0xA5, R, "PLAY AUDIO(12)" }, /* A6 O EXCHANGE MEDIUM */ { 0xA6, M, "EXCHANGE MEDIUM" }, /* A6 O LOAD/UNLOAD C/DVD */ { 0xA6, R, "LOAD/UNLOAD C/DVD" }, /* A7 ZZ O O MOVE MEDIUM ATTACHED */ { 0xA7, D | T | W | O, "MOVE MEDIUM ATTACHED" }, /* A7 O SET READ AHEAD */ { 0xA7, R, "SET READ AHEAD" }, /* A8 O OOO READ(12) */ { 0xA8, D | W | R | O, "READ(12)" }, /* A8 GET MESSAGE(12) */ { 0xA8, C, "GET MESSAGE(12)" }, /* A9 O SERVICE ACTION OUT(12) */ { 0xA9, V, "SERVICE ACTION OUT(12)" }, /* AA O OOO WRITE(12) */ { 0xAA, D | W | R | O, "WRITE(12)" }, /* AA SEND MESSAGE(12) */ { 0xAA, C, "SEND MESSAGE(12)" }, /* AB O O SERVICE ACTION IN(12) */ { 0xAB, R | V, "SERVICE ACTION IN(12)" }, /* AC O ERASE(12) */ { 0xAC, O, "ERASE(12)" }, /* AC O GET PERFORMANCE */ { 0xAC, R, "GET PERFORMANCE" }, /* AD O READ DVD STRUCTURE */ { 0xAD, R, "READ DVD STRUCTURE" }, /* AE O O O WRITE AND VERIFY(12) */ { 0xAE, D | W | O, "WRITE AND VERIFY(12)" }, /* AF O OZO VERIFY(12) */ { 0xAF, D | W | R | O, "VERIFY(12)" }, /* B0 ZZZ SEARCH DATA HIGH(12) */ { 0xB0, W | R | O, "SEARCH DATA HIGH(12)" }, /* B1 ZZZ SEARCH DATA EQUAL(12) */ { 0xB1, W | R | O, "SEARCH DATA EQUAL(12)" }, /* B2 ZZZ SEARCH DATA LOW(12) */ { 0xB2, W | R | O, "SEARCH DATA LOW(12)" }, /* B3 Z OZO SET LIMITS(12) */ { 0xB3, D | W | R | O, "SET LIMITS(12)" }, /* B4 ZZ OZO READ ELEMENT STATUS ATTACHED */ { 0xB4, D | T | W | R | O, "READ ELEMENT STATUS ATTACHED" }, /* B5 OO O O SECURITY PROTOCOL OUT */ { 0xB5, D | T | R | V, "SECURITY PROTOCOL OUT" }, /* B5 O REQUEST VOLUME ELEMENT ADDRESS */ { 0xB5, M, "REQUEST VOLUME ELEMENT ADDRESS" }, /* B6 O SEND VOLUME TAG */ { 0xB6, M, "SEND VOLUME TAG" }, /* B6 O SET STREAMING */ { 0xB6, R, "SET STREAMING" }, /* B7 O O READ DEFECT DATA(12) */ { 0xB7, D | O, "READ DEFECT DATA(12)" }, /* B8 O OZOM READ ELEMENT STATUS */ { 0xB8, T | W | R | O | M, "READ ELEMENT STATUS" }, /* B9 O READ CD MSF */ { 0xB9, R, "READ CD MSF" }, /* BA O O OOMO REDUNDANCY GROUP (IN) */ { 0xBA, D | W | O | M | A | E, "REDUNDANCY GROUP (IN)" }, /* BA O SCAN */ { 0xBA, R, "SCAN" }, /* BB O O OOOO REDUNDANCY GROUP (OUT) */ { 0xBB, D | W | O | M | A | E, "REDUNDANCY GROUP (OUT)" }, /* BB O SET CD SPEED */ { 0xBB, R, "SET CD SPEED" }, /* BC O O OOMO SPARE (IN) */ { 0xBC, D | W | O | M | A | E, "SPARE (IN)" }, /* BD O O OOOO SPARE (OUT) */ { 0xBD, D | W | O | M | A | E, "SPARE (OUT)" }, /* BD O MECHANISM STATUS */ { 0xBD, R, "MECHANISM STATUS" }, /* BE O O OOMO VOLUME SET (IN) */ { 0xBE, D | W | O | M | A | E, "VOLUME SET (IN)" }, /* BE O READ CD */ { 0xBE, R, "READ CD" }, /* BF O O OOOO VOLUME SET (OUT) */ { 0xBF, D | W | O | M | A | E, "VOLUME SET (OUT)" }, /* BF O SEND DVD STRUCTURE */ { 0xBF, R, "SEND DVD STRUCTURE" } }; const char * scsi_op_desc(u_int16_t opcode, struct scsi_inquiry_data *inq_data) { caddr_t match; int i, j; u_int32_t opmask; u_int16_t pd_type; int num_ops[2]; struct op_table_entry *table[2]; int num_tables; /* * If we've got inquiry data, use it to determine what type of * device we're dealing with here. Otherwise, assume direct * access. */ if (inq_data == NULL) { pd_type = T_DIRECT; match = NULL; } else { pd_type = SID_TYPE(inq_data); match = cam_quirkmatch((caddr_t)inq_data, (caddr_t)scsi_op_quirk_table, sizeof(scsi_op_quirk_table)/ sizeof(*scsi_op_quirk_table), sizeof(*scsi_op_quirk_table), scsi_inquiry_match); } if (match != NULL) { table[0] = ((struct scsi_op_quirk_entry *)match)->op_table; num_ops[0] = ((struct scsi_op_quirk_entry *)match)->num_ops; table[1] = scsi_op_codes; num_ops[1] = sizeof(scsi_op_codes)/sizeof(scsi_op_codes[0]); num_tables = 2; } else { /* * If this is true, we have a vendor specific opcode that * wasn't covered in the quirk table. */ if ((opcode > 0xBF) || ((opcode > 0x5F) && (opcode < 0x80))) return("Vendor Specific Command"); table[0] = scsi_op_codes; num_ops[0] = sizeof(scsi_op_codes)/sizeof(scsi_op_codes[0]); num_tables = 1; } /* RBC is 'Simplified' Direct Access Device */ if (pd_type == T_RBC) pd_type = T_DIRECT; /* Map NODEVICE to Direct Access Device to handle REPORT LUNS, etc. */ if (pd_type == T_NODEVICE) pd_type = T_DIRECT; opmask = 1 << pd_type; for (j = 0; j < num_tables; j++) { for (i = 0;i < num_ops[j] && table[j][i].opcode <= opcode; i++){ if ((table[j][i].opcode == opcode) && ((table[j][i].opmask & opmask) != 0)) return(table[j][i].desc); } } /* * If we can't find a match for the command in the table, we just * assume it's a vendor specifc command. */ return("Vendor Specific Command"); } #else /* SCSI_NO_OP_STRINGS */ const char * scsi_op_desc(u_int16_t opcode, struct scsi_inquiry_data *inq_data) { return(""); } #endif #if !defined(SCSI_NO_SENSE_STRINGS) #define SST(asc, ascq, action, desc) \ asc, ascq, action, desc #else const char empty_string[] = ""; #define SST(asc, ascq, action, desc) \ asc, ascq, action, empty_string #endif const struct sense_key_table_entry sense_key_table[] = { { SSD_KEY_NO_SENSE, SS_NOP, "NO SENSE" }, { SSD_KEY_RECOVERED_ERROR, SS_NOP|SSQ_PRINT_SENSE, "RECOVERED ERROR" }, { SSD_KEY_NOT_READY, SS_RDEF, "NOT READY" }, { SSD_KEY_MEDIUM_ERROR, SS_RDEF, "MEDIUM ERROR" }, { SSD_KEY_HARDWARE_ERROR, SS_RDEF, "HARDWARE FAILURE" }, { SSD_KEY_ILLEGAL_REQUEST, SS_FATAL|EINVAL, "ILLEGAL REQUEST" }, { SSD_KEY_UNIT_ATTENTION, SS_FATAL|ENXIO, "UNIT ATTENTION" }, { SSD_KEY_DATA_PROTECT, SS_FATAL|EACCES, "DATA PROTECT" }, { SSD_KEY_BLANK_CHECK, SS_FATAL|ENOSPC, "BLANK CHECK" }, { SSD_KEY_Vendor_Specific, SS_FATAL|EIO, "Vendor Specific" }, { SSD_KEY_COPY_ABORTED, SS_FATAL|EIO, "COPY ABORTED" }, { SSD_KEY_ABORTED_COMMAND, SS_RDEF, "ABORTED COMMAND" }, { SSD_KEY_EQUAL, SS_NOP, "EQUAL" }, { SSD_KEY_VOLUME_OVERFLOW, SS_FATAL|EIO, "VOLUME OVERFLOW" }, { SSD_KEY_MISCOMPARE, SS_NOP, "MISCOMPARE" }, { SSD_KEY_COMPLETED, SS_NOP, "COMPLETED" } }; const int sense_key_table_size = sizeof(sense_key_table)/sizeof(sense_key_table[0]); static struct asc_table_entry quantum_fireball_entries[] = { { SST(0x04, 0x0b, SS_START | SSQ_DECREMENT_COUNT | ENXIO, "Logical unit not ready, initializing cmd. required") } }; static struct asc_table_entry sony_mo_entries[] = { { SST(0x04, 0x00, SS_START | SSQ_DECREMENT_COUNT | ENXIO, "Logical unit not ready, cause not reportable") } }; static struct asc_table_entry hgst_entries[] = { { SST(0x04, 0xF0, SS_RDEF, "Vendor Unique - Logical Unit Not Ready") }, { SST(0x0A, 0x01, SS_RDEF, "Unrecovered Super Certification Log Write Error") }, { SST(0x0A, 0x02, SS_RDEF, "Unrecovered Super Certification Log Read Error") }, { SST(0x15, 0x03, SS_RDEF, "Unrecovered Sector Error") }, { SST(0x3E, 0x04, SS_RDEF, "Unrecovered Self-Test Hard-Cache Test Fail") }, { SST(0x3E, 0x05, SS_RDEF, "Unrecovered Self-Test OTF-Cache Fail") }, { SST(0x40, 0x00, SS_RDEF, "Unrecovered SAT No Buffer Overflow Error") }, { SST(0x40, 0x01, SS_RDEF, "Unrecovered SAT Buffer Overflow Error") }, { SST(0x40, 0x02, SS_RDEF, "Unrecovered SAT No Buffer Overflow With ECS Fault") }, { SST(0x40, 0x03, SS_RDEF, "Unrecovered SAT Buffer Overflow With ECS Fault") }, { SST(0x40, 0x81, SS_RDEF, "DRAM Failure") }, { SST(0x44, 0x0B, SS_RDEF, "Vendor Unique - Internal Target Failure") }, { SST(0x44, 0xF2, SS_RDEF, "Vendor Unique - Internal Target Failure") }, { SST(0x44, 0xF6, SS_RDEF, "Vendor Unique - Internal Target Failure") }, { SST(0x44, 0xF9, SS_RDEF, "Vendor Unique - Internal Target Failure") }, { SST(0x44, 0xFA, SS_RDEF, "Vendor Unique - Internal Target Failure") }, { SST(0x5D, 0x22, SS_RDEF, "Extreme Over-Temperature Warning") }, { SST(0x5D, 0x50, SS_RDEF, "Load/Unload cycle Count Warning") }, { SST(0x81, 0x00, SS_RDEF, "Vendor Unique - Internal Logic Error") }, { SST(0x85, 0x00, SS_RDEF, "Vendor Unique - Internal Key Seed Error") }, }; static struct asc_table_entry seagate_entries[] = { { SST(0x04, 0xF0, SS_RDEF, "Logical Unit Not Ready, super certify in Progress") }, { SST(0x08, 0x86, SS_RDEF, "Write Fault Data Corruption") }, { SST(0x09, 0x0D, SS_RDEF, "Tracking Failure") }, { SST(0x09, 0x0E, SS_RDEF, "ETF Failure") }, { SST(0x0B, 0x5D, SS_RDEF, "Pre-SMART Warning") }, { SST(0x0B, 0x85, SS_RDEF, "5V Voltage Warning") }, { SST(0x0B, 0x8C, SS_RDEF, "12V Voltage Warning") }, { SST(0x0C, 0xFF, SS_RDEF, "Write Error - Too many error recovery revs") }, { SST(0x11, 0xFF, SS_RDEF, "Unrecovered Read Error - Too many error recovery revs") }, { SST(0x19, 0x0E, SS_RDEF, "Fewer than 1/2 defect list copies") }, { SST(0x20, 0xF3, SS_RDEF, "Illegal CDB linked to skip mask cmd") }, { SST(0x24, 0xF0, SS_RDEF, "Illegal byte in CDB, LBA not matching") }, { SST(0x24, 0xF1, SS_RDEF, "Illegal byte in CDB, LEN not matching") }, { SST(0x24, 0xF2, SS_RDEF, "Mask not matching transfer length") }, { SST(0x24, 0xF3, SS_RDEF, "Drive formatted without plist") }, { SST(0x26, 0x95, SS_RDEF, "Invalid Field Parameter - CAP File") }, { SST(0x26, 0x96, SS_RDEF, "Invalid Field Parameter - RAP File") }, { SST(0x26, 0x97, SS_RDEF, "Invalid Field Parameter - TMS Firmware Tag") }, { SST(0x26, 0x98, SS_RDEF, "Invalid Field Parameter - Check Sum") }, { SST(0x26, 0x99, SS_RDEF, "Invalid Field Parameter - Firmware Tag") }, { SST(0x29, 0x08, SS_RDEF, "Write Log Dump data") }, { SST(0x29, 0x09, SS_RDEF, "Write Log Dump data") }, { SST(0x29, 0x0A, SS_RDEF, "Reserved disk space") }, { SST(0x29, 0x0B, SS_RDEF, "SDBP") }, { SST(0x29, 0x0C, SS_RDEF, "SDBP") }, { SST(0x31, 0x91, SS_RDEF, "Format Corrupted World Wide Name (WWN) is Invalid") }, { SST(0x32, 0x03, SS_RDEF, "Defect List - Length exceeds Command Allocated Length") }, { SST(0x33, 0x00, SS_RDEF, "Flash not ready for access") }, { SST(0x3F, 0x70, SS_RDEF, "Invalid RAP block") }, { SST(0x3F, 0x71, SS_RDEF, "RAP/ETF mismatch") }, { SST(0x3F, 0x90, SS_RDEF, "Invalid CAP block") }, { SST(0x3F, 0x91, SS_RDEF, "World Wide Name (WWN) Mismatch") }, { SST(0x40, 0x01, SS_RDEF, "DRAM Parity Error") }, { SST(0x40, 0x02, SS_RDEF, "DRAM Parity Error") }, { SST(0x42, 0x0A, SS_RDEF, "Loopback Test") }, { SST(0x42, 0x0B, SS_RDEF, "Loopback Test") }, { SST(0x44, 0xF2, SS_RDEF, "Compare error during data integrity check") }, { SST(0x44, 0xF6, SS_RDEF, "Unrecoverable error during data integrity check") }, { SST(0x47, 0x80, SS_RDEF, "Fibre Channel Sequence Error") }, { SST(0x4E, 0x01, SS_RDEF, "Information Unit Too Short") }, { SST(0x80, 0x00, SS_RDEF, "General Firmware Error / Command Timeout") }, { SST(0x80, 0x01, SS_RDEF, "Command Timeout") }, { SST(0x80, 0x02, SS_RDEF, "Command Timeout") }, { SST(0x80, 0x80, SS_RDEF, "FC FIFO Error During Read Transfer") }, { SST(0x80, 0x81, SS_RDEF, "FC FIFO Error During Write Transfer") }, { SST(0x80, 0x82, SS_RDEF, "DISC FIFO Error During Read Transfer") }, { SST(0x80, 0x83, SS_RDEF, "DISC FIFO Error During Write Transfer") }, { SST(0x80, 0x84, SS_RDEF, "LBA Seeded LRC Error on Read") }, { SST(0x80, 0x85, SS_RDEF, "LBA Seeded LRC Error on Write") }, { SST(0x80, 0x86, SS_RDEF, "IOEDC Error on Read") }, { SST(0x80, 0x87, SS_RDEF, "IOEDC Error on Write") }, { SST(0x80, 0x88, SS_RDEF, "Host Parity Check Failed") }, { SST(0x80, 0x89, SS_RDEF, "IOEDC error on read detected by formatter") }, { SST(0x80, 0x8A, SS_RDEF, "Host Parity Errors / Host FIFO Initialization Failed") }, { SST(0x80, 0x8B, SS_RDEF, "Host Parity Errors") }, { SST(0x80, 0x8C, SS_RDEF, "Host Parity Errors") }, { SST(0x80, 0x8D, SS_RDEF, "Host Parity Errors") }, { SST(0x81, 0x00, SS_RDEF, "LA Check Failed") }, { SST(0x82, 0x00, SS_RDEF, "Internal client detected insufficient buffer") }, { SST(0x84, 0x00, SS_RDEF, "Scheduled Diagnostic And Repair") }, }; static struct scsi_sense_quirk_entry sense_quirk_table[] = { { /* * XXX The Quantum Fireball ST and SE like to return 0x04 0x0b * when they really should return 0x04 0x02. */ {T_DIRECT, SIP_MEDIA_FIXED, "QUANTUM", "FIREBALL S*", "*"}, /*num_sense_keys*/0, sizeof(quantum_fireball_entries)/sizeof(struct asc_table_entry), /*sense key entries*/NULL, quantum_fireball_entries }, { /* * This Sony MO drive likes to return 0x04, 0x00 when it * isn't spun up. */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "SONY", "SMO-*", "*"}, /*num_sense_keys*/0, sizeof(sony_mo_entries)/sizeof(struct asc_table_entry), /*sense key entries*/NULL, sony_mo_entries }, { /* * HGST vendor-specific error codes */ {T_DIRECT, SIP_MEDIA_FIXED, "HGST", "*", "*"}, /*num_sense_keys*/0, sizeof(hgst_entries)/sizeof(struct asc_table_entry), /*sense key entries*/NULL, hgst_entries }, { /* * SEAGATE vendor-specific error codes */ {T_DIRECT, SIP_MEDIA_FIXED, "SEAGATE", "*", "*"}, /*num_sense_keys*/0, sizeof(seagate_entries)/sizeof(struct asc_table_entry), /*sense key entries*/NULL, seagate_entries } }; const int sense_quirk_table_size = sizeof(sense_quirk_table)/sizeof(sense_quirk_table[0]); static struct asc_table_entry asc_table[] = { /* * From: http://www.t10.org/lists/asc-num.txt * Modifications by Jung-uk Kim (jkim@FreeBSD.org) */ /* * File: ASC-NUM.TXT * * SCSI ASC/ASCQ Assignments * Numeric Sorted Listing * as of 5/20/12 * * D - DIRECT ACCESS DEVICE (SBC-2) device column key * .T - SEQUENTIAL ACCESS DEVICE (SSC) ------------------- * . L - PRINTER DEVICE (SSC) blank = reserved * . P - PROCESSOR DEVICE (SPC) not blank = allowed * . .W - WRITE ONCE READ MULTIPLE DEVICE (SBC-2) * . . R - CD DEVICE (MMC) * . . O - OPTICAL MEMORY DEVICE (SBC-2) * . . .M - MEDIA CHANGER DEVICE (SMC) * . . . A - STORAGE ARRAY DEVICE (SCC) * . . . E - ENCLOSURE SERVICES DEVICE (SES) * . . . .B - SIMPLIFIED DIRECT-ACCESS DEVICE (RBC) * . . . . K - OPTICAL CARD READER/WRITER DEVICE (OCRW) * . . . . V - AUTOMATION/DRIVE INTERFACE (ADC) * . . . . .F - OBJECT-BASED STORAGE (OSD) * DTLPWROMAEBKVF * ASC ASCQ Action * Description */ /* DTLPWROMAEBKVF */ { SST(0x00, 0x00, SS_NOP, "No additional sense information") }, /* T */ { SST(0x00, 0x01, SS_RDEF, "Filemark detected") }, /* T */ { SST(0x00, 0x02, SS_RDEF, "End-of-partition/medium detected") }, /* T */ { SST(0x00, 0x03, SS_RDEF, "Setmark detected") }, /* T */ { SST(0x00, 0x04, SS_RDEF, "Beginning-of-partition/medium detected") }, /* TL */ { SST(0x00, 0x05, SS_RDEF, "End-of-data detected") }, /* DTLPWROMAEBKVF */ { SST(0x00, 0x06, SS_RDEF, "I/O process terminated") }, /* T */ { SST(0x00, 0x07, SS_RDEF, /* XXX TBD */ "Programmable early warning detected") }, /* R */ { SST(0x00, 0x11, SS_FATAL | EBUSY, "Audio play operation in progress") }, /* R */ { SST(0x00, 0x12, SS_NOP, "Audio play operation paused") }, /* R */ { SST(0x00, 0x13, SS_NOP, "Audio play operation successfully completed") }, /* R */ { SST(0x00, 0x14, SS_RDEF, "Audio play operation stopped due to error") }, /* R */ { SST(0x00, 0x15, SS_NOP, "No current audio status to return") }, /* DTLPWROMAEBKVF */ { SST(0x00, 0x16, SS_FATAL | EBUSY, "Operation in progress") }, /* DTL WROMAEBKVF */ { SST(0x00, 0x17, SS_RDEF, "Cleaning requested") }, /* T */ { SST(0x00, 0x18, SS_RDEF, /* XXX TBD */ "Erase operation in progress") }, /* T */ { SST(0x00, 0x19, SS_RDEF, /* XXX TBD */ "Locate operation in progress") }, /* T */ { SST(0x00, 0x1A, SS_RDEF, /* XXX TBD */ "Rewind operation in progress") }, /* T */ { SST(0x00, 0x1B, SS_RDEF, /* XXX TBD */ "Set capacity operation in progress") }, /* T */ { SST(0x00, 0x1C, SS_RDEF, /* XXX TBD */ "Verify operation in progress") }, /* DT B */ { SST(0x00, 0x1D, SS_RDEF, /* XXX TBD */ "ATA pass through information available") }, /* DT R MAEBKV */ { SST(0x00, 0x1E, SS_RDEF, /* XXX TBD */ "Conflicting SA creation request") }, /* DT B */ { SST(0x00, 0x1F, SS_RDEF, /* XXX TBD */ "Logical unit transitioning to another power condition") }, /* DT P B */ { SST(0x00, 0x20, SS_RDEF, /* XXX TBD */ "Extended copy information available") }, /* D W O BK */ { SST(0x01, 0x00, SS_RDEF, "No index/sector signal") }, /* D WRO BK */ { SST(0x02, 0x00, SS_RDEF, "No seek complete") }, /* DTL W O BK */ { SST(0x03, 0x00, SS_RDEF, "Peripheral device write fault") }, /* T */ { SST(0x03, 0x01, SS_RDEF, "No write current") }, /* T */ { SST(0x03, 0x02, SS_RDEF, "Excessive write errors") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x00, SS_RDEF, "Logical unit not ready, cause not reportable") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x01, SS_TUR | SSQ_MANY | SSQ_DECREMENT_COUNT | EBUSY, "Logical unit is in process of becoming ready") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x02, SS_START | SSQ_DECREMENT_COUNT | ENXIO, "Logical unit not ready, initializing command required") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x03, SS_FATAL | ENXIO, "Logical unit not ready, manual intervention required") }, /* DTL RO B */ { SST(0x04, 0x04, SS_FATAL | EBUSY, "Logical unit not ready, format in progress") }, /* DT W O A BK F */ { SST(0x04, 0x05, SS_FATAL | EBUSY, "Logical unit not ready, rebuild in progress") }, /* DT W O A BK */ { SST(0x04, 0x06, SS_FATAL | EBUSY, "Logical unit not ready, recalculation in progress") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x07, SS_FATAL | EBUSY, "Logical unit not ready, operation in progress") }, /* R */ { SST(0x04, 0x08, SS_FATAL | EBUSY, "Logical unit not ready, long write in progress") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x09, SS_RDEF, /* XXX TBD */ "Logical unit not ready, self-test in progress") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x0A, SS_TUR | SSQ_MANY | SSQ_DECREMENT_COUNT | ENXIO, "Logical unit not accessible, asymmetric access state transition")}, /* DTLPWROMAEBKVF */ { SST(0x04, 0x0B, SS_FATAL | ENXIO, "Logical unit not accessible, target port in standby state") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x0C, SS_FATAL | ENXIO, "Logical unit not accessible, target port in unavailable state") }, /* F */ { SST(0x04, 0x0D, SS_RDEF, /* XXX TBD */ "Logical unit not ready, structure check required") }, /* DT WROM B */ { SST(0x04, 0x10, SS_RDEF, /* XXX TBD */ "Logical unit not ready, auxiliary memory not accessible") }, /* DT WRO AEB VF */ { SST(0x04, 0x11, SS_TUR | SSQ_MANY | SSQ_DECREMENT_COUNT | EBUSY, "Logical unit not ready, notify (enable spinup) required") }, /* M V */ { SST(0x04, 0x12, SS_RDEF, /* XXX TBD */ "Logical unit not ready, offline") }, /* DT R MAEBKV */ { SST(0x04, 0x13, SS_RDEF, /* XXX TBD */ "Logical unit not ready, SA creation in progress") }, /* D B */ { SST(0x04, 0x14, SS_RDEF, /* XXX TBD */ "Logical unit not ready, space allocation in progress") }, /* M */ { SST(0x04, 0x15, SS_RDEF, /* XXX TBD */ "Logical unit not ready, robotics disabled") }, /* M */ { SST(0x04, 0x16, SS_RDEF, /* XXX TBD */ "Logical unit not ready, configuration required") }, /* M */ { SST(0x04, 0x17, SS_RDEF, /* XXX TBD */ "Logical unit not ready, calibration required") }, /* M */ { SST(0x04, 0x18, SS_RDEF, /* XXX TBD */ "Logical unit not ready, a door is open") }, /* M */ { SST(0x04, 0x19, SS_RDEF, /* XXX TBD */ "Logical unit not ready, operating in sequential mode") }, /* DT B */ { SST(0x04, 0x1A, SS_RDEF, /* XXX TBD */ "Logical unit not ready, START/STOP UNIT command in progress") }, /* D B */ { SST(0x04, 0x1B, SS_RDEF, /* XXX TBD */ "Logical unit not ready, sanitize in progress") }, /* DT MAEB */ { SST(0x04, 0x1C, SS_RDEF, /* XXX TBD */ "Logical unit not ready, additional power use not yet granted") }, /* DTL WROMAEBKVF */ { SST(0x05, 0x00, SS_RDEF, "Logical unit does not respond to selection") }, /* D WROM BK */ { SST(0x06, 0x00, SS_RDEF, "No reference position found") }, /* DTL WROM BK */ { SST(0x07, 0x00, SS_RDEF, "Multiple peripheral devices selected") }, /* DTL WROMAEBKVF */ { SST(0x08, 0x00, SS_RDEF, "Logical unit communication failure") }, /* DTL WROMAEBKVF */ { SST(0x08, 0x01, SS_RDEF, "Logical unit communication time-out") }, /* DTL WROMAEBKVF */ { SST(0x08, 0x02, SS_RDEF, "Logical unit communication parity error") }, /* DT ROM BK */ { SST(0x08, 0x03, SS_RDEF, "Logical unit communication CRC error (Ultra-DMA/32)") }, /* DTLPWRO K */ { SST(0x08, 0x04, SS_RDEF, /* XXX TBD */ "Unreachable copy target") }, /* DT WRO B */ { SST(0x09, 0x00, SS_RDEF, "Track following error") }, /* WRO K */ { SST(0x09, 0x01, SS_RDEF, "Tracking servo failure") }, /* WRO K */ { SST(0x09, 0x02, SS_RDEF, "Focus servo failure") }, /* WRO */ { SST(0x09, 0x03, SS_RDEF, "Spindle servo failure") }, /* DT WRO B */ { SST(0x09, 0x04, SS_RDEF, "Head select fault") }, /* DTLPWROMAEBKVF */ { SST(0x0A, 0x00, SS_FATAL | ENOSPC, "Error log overflow") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x00, SS_RDEF, "Warning") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x01, SS_RDEF, "Warning - specified temperature exceeded") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x02, SS_RDEF, "Warning - enclosure degraded") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x03, SS_RDEF, /* XXX TBD */ "Warning - background self-test failed") }, /* DTLPWRO AEBKVF */ { SST(0x0B, 0x04, SS_RDEF, /* XXX TBD */ "Warning - background pre-scan detected medium error") }, /* DTLPWRO AEBKVF */ { SST(0x0B, 0x05, SS_RDEF, /* XXX TBD */ "Warning - background medium scan detected medium error") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x06, SS_RDEF, /* XXX TBD */ "Warning - non-volatile cache now volatile") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x07, SS_RDEF, /* XXX TBD */ "Warning - degraded power to non-volatile cache") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x08, SS_RDEF, /* XXX TBD */ "Warning - power loss expected") }, /* D */ { SST(0x0B, 0x09, SS_RDEF, /* XXX TBD */ "Warning - device statistics notification available") }, /* T R */ { SST(0x0C, 0x00, SS_RDEF, "Write error") }, /* K */ { SST(0x0C, 0x01, SS_NOP | SSQ_PRINT_SENSE, "Write error - recovered with auto reallocation") }, /* D W O BK */ { SST(0x0C, 0x02, SS_RDEF, "Write error - auto reallocation failed") }, /* D W O BK */ { SST(0x0C, 0x03, SS_RDEF, "Write error - recommend reassignment") }, /* DT W O B */ { SST(0x0C, 0x04, SS_RDEF, "Compression check miscompare error") }, /* DT W O B */ { SST(0x0C, 0x05, SS_RDEF, "Data expansion occurred during compression") }, /* DT W O B */ { SST(0x0C, 0x06, SS_RDEF, "Block not compressible") }, /* R */ { SST(0x0C, 0x07, SS_RDEF, "Write error - recovery needed") }, /* R */ { SST(0x0C, 0x08, SS_RDEF, "Write error - recovery failed") }, /* R */ { SST(0x0C, 0x09, SS_RDEF, "Write error - loss of streaming") }, /* R */ { SST(0x0C, 0x0A, SS_RDEF, "Write error - padding blocks added") }, /* DT WROM B */ { SST(0x0C, 0x0B, SS_RDEF, /* XXX TBD */ "Auxiliary memory write error") }, /* DTLPWRO AEBKVF */ { SST(0x0C, 0x0C, SS_RDEF, /* XXX TBD */ "Write error - unexpected unsolicited data") }, /* DTLPWRO AEBKVF */ { SST(0x0C, 0x0D, SS_RDEF, /* XXX TBD */ "Write error - not enough unsolicited data") }, /* DT W O BK */ { SST(0x0C, 0x0E, SS_RDEF, /* XXX TBD */ "Multiple write errors") }, /* R */ { SST(0x0C, 0x0F, SS_RDEF, /* XXX TBD */ "Defects in error window") }, /* DTLPWRO A K */ { SST(0x0D, 0x00, SS_RDEF, /* XXX TBD */ "Error detected by third party temporary initiator") }, /* DTLPWRO A K */ { SST(0x0D, 0x01, SS_RDEF, /* XXX TBD */ "Third party device failure") }, /* DTLPWRO A K */ { SST(0x0D, 0x02, SS_RDEF, /* XXX TBD */ "Copy target device not reachable") }, /* DTLPWRO A K */ { SST(0x0D, 0x03, SS_RDEF, /* XXX TBD */ "Incorrect copy target device type") }, /* DTLPWRO A K */ { SST(0x0D, 0x04, SS_RDEF, /* XXX TBD */ "Copy target device data underrun") }, /* DTLPWRO A K */ { SST(0x0D, 0x05, SS_RDEF, /* XXX TBD */ "Copy target device data overrun") }, /* DT PWROMAEBK F */ { SST(0x0E, 0x00, SS_RDEF, /* XXX TBD */ "Invalid information unit") }, /* DT PWROMAEBK F */ { SST(0x0E, 0x01, SS_RDEF, /* XXX TBD */ "Information unit too short") }, /* DT PWROMAEBK F */ { SST(0x0E, 0x02, SS_RDEF, /* XXX TBD */ "Information unit too long") }, /* DT P R MAEBK F */ { SST(0x0E, 0x03, SS_RDEF, /* XXX TBD */ "Invalid field in command information unit") }, /* D W O BK */ { SST(0x10, 0x00, SS_RDEF, "ID CRC or ECC error") }, /* DT W O */ { SST(0x10, 0x01, SS_RDEF, /* XXX TBD */ "Logical block guard check failed") }, /* DT W O */ { SST(0x10, 0x02, SS_RDEF, /* XXX TBD */ "Logical block application tag check failed") }, /* DT W O */ { SST(0x10, 0x03, SS_RDEF, /* XXX TBD */ "Logical block reference tag check failed") }, /* T */ { SST(0x10, 0x04, SS_RDEF, /* XXX TBD */ "Logical block protection error on recovered buffer data") }, /* T */ { SST(0x10, 0x05, SS_RDEF, /* XXX TBD */ "Logical block protection method error") }, /* DT WRO BK */ { SST(0x11, 0x00, SS_FATAL|EIO, "Unrecovered read error") }, /* DT WRO BK */ { SST(0x11, 0x01, SS_FATAL|EIO, "Read retries exhausted") }, /* DT WRO BK */ { SST(0x11, 0x02, SS_FATAL|EIO, "Error too long to correct") }, /* DT W O BK */ { SST(0x11, 0x03, SS_FATAL|EIO, "Multiple read errors") }, /* D W O BK */ { SST(0x11, 0x04, SS_FATAL|EIO, "Unrecovered read error - auto reallocate failed") }, /* WRO B */ { SST(0x11, 0x05, SS_FATAL|EIO, "L-EC uncorrectable error") }, /* WRO B */ { SST(0x11, 0x06, SS_FATAL|EIO, "CIRC unrecovered error") }, /* W O B */ { SST(0x11, 0x07, SS_RDEF, "Data re-synchronization error") }, /* T */ { SST(0x11, 0x08, SS_RDEF, "Incomplete block read") }, /* T */ { SST(0x11, 0x09, SS_RDEF, "No gap found") }, /* DT O BK */ { SST(0x11, 0x0A, SS_RDEF, "Miscorrected error") }, /* D W O BK */ { SST(0x11, 0x0B, SS_FATAL|EIO, "Unrecovered read error - recommend reassignment") }, /* D W O BK */ { SST(0x11, 0x0C, SS_FATAL|EIO, "Unrecovered read error - recommend rewrite the data") }, /* DT WRO B */ { SST(0x11, 0x0D, SS_RDEF, "De-compression CRC error") }, /* DT WRO B */ { SST(0x11, 0x0E, SS_RDEF, "Cannot decompress using declared algorithm") }, /* R */ { SST(0x11, 0x0F, SS_RDEF, "Error reading UPC/EAN number") }, /* R */ { SST(0x11, 0x10, SS_RDEF, "Error reading ISRC number") }, /* R */ { SST(0x11, 0x11, SS_RDEF, "Read error - loss of streaming") }, /* DT WROM B */ { SST(0x11, 0x12, SS_RDEF, /* XXX TBD */ "Auxiliary memory read error") }, /* DTLPWRO AEBKVF */ { SST(0x11, 0x13, SS_RDEF, /* XXX TBD */ "Read error - failed retransmission request") }, /* D */ { SST(0x11, 0x14, SS_RDEF, /* XXX TBD */ "Read error - LBA marked bad by application client") }, /* D W O BK */ { SST(0x12, 0x00, SS_RDEF, "Address mark not found for ID field") }, /* D W O BK */ { SST(0x13, 0x00, SS_RDEF, "Address mark not found for data field") }, /* DTL WRO BK */ { SST(0x14, 0x00, SS_RDEF, "Recorded entity not found") }, /* DT WRO BK */ { SST(0x14, 0x01, SS_RDEF, "Record not found") }, /* T */ { SST(0x14, 0x02, SS_RDEF, "Filemark or setmark not found") }, /* T */ { SST(0x14, 0x03, SS_RDEF, "End-of-data not found") }, /* T */ { SST(0x14, 0x04, SS_RDEF, "Block sequence error") }, /* DT W O BK */ { SST(0x14, 0x05, SS_RDEF, "Record not found - recommend reassignment") }, /* DT W O BK */ { SST(0x14, 0x06, SS_RDEF, "Record not found - data auto-reallocated") }, /* T */ { SST(0x14, 0x07, SS_RDEF, /* XXX TBD */ "Locate operation failure") }, /* DTL WROM BK */ { SST(0x15, 0x00, SS_RDEF, "Random positioning error") }, /* DTL WROM BK */ { SST(0x15, 0x01, SS_RDEF, "Mechanical positioning error") }, /* DT WRO BK */ { SST(0x15, 0x02, SS_RDEF, "Positioning error detected by read of medium") }, /* D W O BK */ { SST(0x16, 0x00, SS_RDEF, "Data synchronization mark error") }, /* D W O BK */ { SST(0x16, 0x01, SS_RDEF, "Data sync error - data rewritten") }, /* D W O BK */ { SST(0x16, 0x02, SS_RDEF, "Data sync error - recommend rewrite") }, /* D W O BK */ { SST(0x16, 0x03, SS_NOP | SSQ_PRINT_SENSE, "Data sync error - data auto-reallocated") }, /* D W O BK */ { SST(0x16, 0x04, SS_RDEF, "Data sync error - recommend reassignment") }, /* DT WRO BK */ { SST(0x17, 0x00, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with no error correction applied") }, /* DT WRO BK */ { SST(0x17, 0x01, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with retries") }, /* DT WRO BK */ { SST(0x17, 0x02, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with positive head offset") }, /* DT WRO BK */ { SST(0x17, 0x03, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with negative head offset") }, /* WRO B */ { SST(0x17, 0x04, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with retries and/or CIRC applied") }, /* D WRO BK */ { SST(0x17, 0x05, SS_NOP | SSQ_PRINT_SENSE, "Recovered data using previous sector ID") }, /* D W O BK */ { SST(0x17, 0x06, SS_NOP | SSQ_PRINT_SENSE, "Recovered data without ECC - data auto-reallocated") }, /* D WRO BK */ { SST(0x17, 0x07, SS_NOP | SSQ_PRINT_SENSE, "Recovered data without ECC - recommend reassignment") }, /* D WRO BK */ { SST(0x17, 0x08, SS_NOP | SSQ_PRINT_SENSE, "Recovered data without ECC - recommend rewrite") }, /* D WRO BK */ { SST(0x17, 0x09, SS_NOP | SSQ_PRINT_SENSE, "Recovered data without ECC - data rewritten") }, /* DT WRO BK */ { SST(0x18, 0x00, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with error correction applied") }, /* D WRO BK */ { SST(0x18, 0x01, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with error corr. & retries applied") }, /* D WRO BK */ { SST(0x18, 0x02, SS_NOP | SSQ_PRINT_SENSE, "Recovered data - data auto-reallocated") }, /* R */ { SST(0x18, 0x03, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with CIRC") }, /* R */ { SST(0x18, 0x04, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with L-EC") }, /* D WRO BK */ { SST(0x18, 0x05, SS_NOP | SSQ_PRINT_SENSE, "Recovered data - recommend reassignment") }, /* D WRO BK */ { SST(0x18, 0x06, SS_NOP | SSQ_PRINT_SENSE, "Recovered data - recommend rewrite") }, /* D W O BK */ { SST(0x18, 0x07, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with ECC - data rewritten") }, /* R */ { SST(0x18, 0x08, SS_RDEF, /* XXX TBD */ "Recovered data with linking") }, /* D O K */ { SST(0x19, 0x00, SS_RDEF, "Defect list error") }, /* D O K */ { SST(0x19, 0x01, SS_RDEF, "Defect list not available") }, /* D O K */ { SST(0x19, 0x02, SS_RDEF, "Defect list error in primary list") }, /* D O K */ { SST(0x19, 0x03, SS_RDEF, "Defect list error in grown list") }, /* DTLPWROMAEBKVF */ { SST(0x1A, 0x00, SS_RDEF, "Parameter list length error") }, /* DTLPWROMAEBKVF */ { SST(0x1B, 0x00, SS_RDEF, "Synchronous data transfer error") }, /* D O BK */ { SST(0x1C, 0x00, SS_RDEF, "Defect list not found") }, /* D O BK */ { SST(0x1C, 0x01, SS_RDEF, "Primary defect list not found") }, /* D O BK */ { SST(0x1C, 0x02, SS_RDEF, "Grown defect list not found") }, /* DT WRO BK */ { SST(0x1D, 0x00, SS_FATAL, "Miscompare during verify operation") }, /* D B */ { SST(0x1D, 0x01, SS_RDEF, /* XXX TBD */ "Miscomparable verify of unmapped LBA") }, /* D W O BK */ { SST(0x1E, 0x00, SS_NOP | SSQ_PRINT_SENSE, "Recovered ID with ECC correction") }, /* D O K */ { SST(0x1F, 0x00, SS_RDEF, "Partial defect list transfer") }, /* DTLPWROMAEBKVF */ { SST(0x20, 0x00, SS_FATAL | EINVAL, "Invalid command operation code") }, /* DT PWROMAEBK */ { SST(0x20, 0x01, SS_RDEF, /* XXX TBD */ "Access denied - initiator pending-enrolled") }, /* DT PWROMAEBK */ { SST(0x20, 0x02, SS_RDEF, /* XXX TBD */ "Access denied - no access rights") }, /* DT PWROMAEBK */ { SST(0x20, 0x03, SS_RDEF, /* XXX TBD */ "Access denied - invalid mgmt ID key") }, /* T */ { SST(0x20, 0x04, SS_RDEF, /* XXX TBD */ "Illegal command while in write capable state") }, /* T */ { SST(0x20, 0x05, SS_RDEF, /* XXX TBD */ "Obsolete") }, /* T */ { SST(0x20, 0x06, SS_RDEF, /* XXX TBD */ "Illegal command while in explicit address mode") }, /* T */ { SST(0x20, 0x07, SS_RDEF, /* XXX TBD */ "Illegal command while in implicit address mode") }, /* DT PWROMAEBK */ { SST(0x20, 0x08, SS_RDEF, /* XXX TBD */ "Access denied - enrollment conflict") }, /* DT PWROMAEBK */ { SST(0x20, 0x09, SS_RDEF, /* XXX TBD */ "Access denied - invalid LU identifier") }, /* DT PWROMAEBK */ { SST(0x20, 0x0A, SS_RDEF, /* XXX TBD */ "Access denied - invalid proxy token") }, /* DT PWROMAEBK */ { SST(0x20, 0x0B, SS_RDEF, /* XXX TBD */ "Access denied - ACL LUN conflict") }, /* T */ { SST(0x20, 0x0C, SS_FATAL | EINVAL, "Illegal command when not in append-only mode") }, /* DT WRO BK */ { SST(0x21, 0x00, SS_FATAL | EINVAL, "Logical block address out of range") }, /* DT WROM BK */ { SST(0x21, 0x01, SS_FATAL | EINVAL, "Invalid element address") }, /* R */ { SST(0x21, 0x02, SS_RDEF, /* XXX TBD */ "Invalid address for write") }, /* R */ { SST(0x21, 0x03, SS_RDEF, /* XXX TBD */ "Invalid write crossing layer jump") }, /* D */ { SST(0x22, 0x00, SS_FATAL | EINVAL, "Illegal function (use 20 00, 24 00, or 26 00)") }, /* DT P B */ { SST(0x23, 0x00, SS_FATAL | EINVAL, "Invalid token operation, cause not reportable") }, /* DT P B */ { SST(0x23, 0x01, SS_FATAL | EINVAL, "Invalid token operation, unsupported token type") }, /* DT P B */ { SST(0x23, 0x02, SS_FATAL | EINVAL, "Invalid token operation, remote token usage not supported") }, /* DT P B */ { SST(0x23, 0x03, SS_FATAL | EINVAL, "Invalid token operation, remote ROD token creation not supported") }, /* DT P B */ { SST(0x23, 0x04, SS_FATAL | EINVAL, "Invalid token operation, token unknown") }, /* DT P B */ { SST(0x23, 0x05, SS_FATAL | EINVAL, "Invalid token operation, token corrupt") }, /* DT P B */ { SST(0x23, 0x06, SS_FATAL | EINVAL, "Invalid token operation, token revoked") }, /* DT P B */ { SST(0x23, 0x07, SS_FATAL | EINVAL, "Invalid token operation, token expired") }, /* DT P B */ { SST(0x23, 0x08, SS_FATAL | EINVAL, "Invalid token operation, token cancelled") }, /* DT P B */ { SST(0x23, 0x09, SS_FATAL | EINVAL, "Invalid token operation, token deleted") }, /* DT P B */ { SST(0x23, 0x0A, SS_FATAL | EINVAL, "Invalid token operation, invalid token length") }, /* DTLPWROMAEBKVF */ { SST(0x24, 0x00, SS_FATAL | EINVAL, "Invalid field in CDB") }, /* DTLPWRO AEBKVF */ { SST(0x24, 0x01, SS_RDEF, /* XXX TBD */ "CDB decryption error") }, /* T */ { SST(0x24, 0x02, SS_RDEF, /* XXX TBD */ "Obsolete") }, /* T */ { SST(0x24, 0x03, SS_RDEF, /* XXX TBD */ "Obsolete") }, /* F */ { SST(0x24, 0x04, SS_RDEF, /* XXX TBD */ "Security audit value frozen") }, /* F */ { SST(0x24, 0x05, SS_RDEF, /* XXX TBD */ "Security working key frozen") }, /* F */ { SST(0x24, 0x06, SS_RDEF, /* XXX TBD */ "NONCE not unique") }, /* F */ { SST(0x24, 0x07, SS_RDEF, /* XXX TBD */ "NONCE timestamp out of range") }, /* DT R MAEBKV */ { SST(0x24, 0x08, SS_RDEF, /* XXX TBD */ "Invalid XCDB") }, /* DTLPWROMAEBKVF */ { SST(0x25, 0x00, SS_FATAL | ENXIO | SSQ_LOST, "Logical unit not supported") }, /* DTLPWROMAEBKVF */ { SST(0x26, 0x00, SS_FATAL | EINVAL, "Invalid field in parameter list") }, /* DTLPWROMAEBKVF */ { SST(0x26, 0x01, SS_FATAL | EINVAL, "Parameter not supported") }, /* DTLPWROMAEBKVF */ { SST(0x26, 0x02, SS_FATAL | EINVAL, "Parameter value invalid") }, /* DTLPWROMAE K */ { SST(0x26, 0x03, SS_FATAL | EINVAL, "Threshold parameters not supported") }, /* DTLPWROMAEBKVF */ { SST(0x26, 0x04, SS_FATAL | EINVAL, "Invalid release of persistent reservation") }, /* DTLPWRO A BK */ { SST(0x26, 0x05, SS_RDEF, /* XXX TBD */ "Data decryption error") }, /* DTLPWRO K */ { SST(0x26, 0x06, SS_FATAL | EINVAL, "Too many target descriptors") }, /* DTLPWRO K */ { SST(0x26, 0x07, SS_FATAL | EINVAL, "Unsupported target descriptor type code") }, /* DTLPWRO K */ { SST(0x26, 0x08, SS_FATAL | EINVAL, "Too many segment descriptors") }, /* DTLPWRO K */ { SST(0x26, 0x09, SS_FATAL | EINVAL, "Unsupported segment descriptor type code") }, /* DTLPWRO K */ { SST(0x26, 0x0A, SS_FATAL | EINVAL, "Unexpected inexact segment") }, /* DTLPWRO K */ { SST(0x26, 0x0B, SS_FATAL | EINVAL, "Inline data length exceeded") }, /* DTLPWRO K */ { SST(0x26, 0x0C, SS_FATAL | EINVAL, "Invalid operation for copy source or destination") }, /* DTLPWRO K */ { SST(0x26, 0x0D, SS_FATAL | EINVAL, "Copy segment granularity violation") }, /* DT PWROMAEBK */ { SST(0x26, 0x0E, SS_RDEF, /* XXX TBD */ "Invalid parameter while port is enabled") }, /* F */ { SST(0x26, 0x0F, SS_RDEF, /* XXX TBD */ "Invalid data-out buffer integrity check value") }, /* T */ { SST(0x26, 0x10, SS_RDEF, /* XXX TBD */ "Data decryption key fail limit reached") }, /* T */ { SST(0x26, 0x11, SS_RDEF, /* XXX TBD */ "Incomplete key-associated data set") }, /* T */ { SST(0x26, 0x12, SS_RDEF, /* XXX TBD */ "Vendor specific key reference not found") }, /* DT WRO BK */ { SST(0x27, 0x00, SS_FATAL | EACCES, "Write protected") }, /* DT WRO BK */ { SST(0x27, 0x01, SS_FATAL | EACCES, "Hardware write protected") }, /* DT WRO BK */ { SST(0x27, 0x02, SS_FATAL | EACCES, "Logical unit software write protected") }, /* T R */ { SST(0x27, 0x03, SS_FATAL | EACCES, "Associated write protect") }, /* T R */ { SST(0x27, 0x04, SS_FATAL | EACCES, "Persistent write protect") }, /* T R */ { SST(0x27, 0x05, SS_FATAL | EACCES, "Permanent write protect") }, /* R F */ { SST(0x27, 0x06, SS_RDEF, /* XXX TBD */ "Conditional write protect") }, /* D B */ { SST(0x27, 0x07, SS_FATAL | ENOSPC, "Space allocation failed write protect") }, /* DTLPWROMAEBKVF */ { SST(0x28, 0x00, SS_FATAL | ENXIO, "Not ready to ready change, medium may have changed") }, /* DT WROM B */ { SST(0x28, 0x01, SS_FATAL | ENXIO, "Import or export element accessed") }, /* R */ { SST(0x28, 0x02, SS_RDEF, /* XXX TBD */ "Format-layer may have changed") }, /* M */ { SST(0x28, 0x03, SS_RDEF, /* XXX TBD */ "Import/export element accessed, medium changed") }, /* * XXX JGibbs - All of these should use the same errno, but I don't * think ENXIO is the correct choice. Should we borrow from * the networking errnos? ECONNRESET anyone? */ /* DTLPWROMAEBKVF */ { SST(0x29, 0x00, SS_FATAL | ENXIO, "Power on, reset, or bus device reset occurred") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x01, SS_RDEF, "Power on occurred") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x02, SS_RDEF, "SCSI bus reset occurred") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x03, SS_RDEF, "Bus device reset function occurred") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x04, SS_RDEF, "Device internal reset") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x05, SS_RDEF, "Transceiver mode changed to single-ended") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x06, SS_RDEF, "Transceiver mode changed to LVD") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x07, SS_RDEF, /* XXX TBD */ "I_T nexus loss occurred") }, /* DTL WROMAEBKVF */ { SST(0x2A, 0x00, SS_RDEF, "Parameters changed") }, /* DTL WROMAEBKVF */ { SST(0x2A, 0x01, SS_RDEF, "Mode parameters changed") }, /* DTL WROMAE K */ { SST(0x2A, 0x02, SS_RDEF, "Log parameters changed") }, /* DTLPWROMAE K */ { SST(0x2A, 0x03, SS_RDEF, "Reservations preempted") }, /* DTLPWROMAE */ { SST(0x2A, 0x04, SS_RDEF, /* XXX TBD */ "Reservations released") }, /* DTLPWROMAE */ { SST(0x2A, 0x05, SS_RDEF, /* XXX TBD */ "Registrations preempted") }, /* DTLPWROMAEBKVF */ { SST(0x2A, 0x06, SS_RDEF, /* XXX TBD */ "Asymmetric access state changed") }, /* DTLPWROMAEBKVF */ { SST(0x2A, 0x07, SS_RDEF, /* XXX TBD */ "Implicit asymmetric access state transition failed") }, /* DT WROMAEBKVF */ { SST(0x2A, 0x08, SS_RDEF, /* XXX TBD */ "Priority changed") }, /* D */ { SST(0x2A, 0x09, SS_RDEF, /* XXX TBD */ "Capacity data has changed") }, /* DT */ { SST(0x2A, 0x0A, SS_RDEF, /* XXX TBD */ "Error history I_T nexus cleared") }, /* DT */ { SST(0x2A, 0x0B, SS_RDEF, /* XXX TBD */ "Error history snapshot released") }, /* F */ { SST(0x2A, 0x0C, SS_RDEF, /* XXX TBD */ "Error recovery attributes have changed") }, /* T */ { SST(0x2A, 0x0D, SS_RDEF, /* XXX TBD */ "Data encryption capabilities changed") }, /* DT M E V */ { SST(0x2A, 0x10, SS_RDEF, /* XXX TBD */ "Timestamp changed") }, /* T */ { SST(0x2A, 0x11, SS_RDEF, /* XXX TBD */ "Data encryption parameters changed by another I_T nexus") }, /* T */ { SST(0x2A, 0x12, SS_RDEF, /* XXX TBD */ "Data encryption parameters changed by vendor specific event") }, /* T */ { SST(0x2A, 0x13, SS_RDEF, /* XXX TBD */ "Data encryption key instance counter has changed") }, /* DT R MAEBKV */ { SST(0x2A, 0x14, SS_RDEF, /* XXX TBD */ "SA creation capabilities data has changed") }, /* T M V */ { SST(0x2A, 0x15, SS_RDEF, /* XXX TBD */ "Medium removal prevention preempted") }, /* DTLPWRO K */ { SST(0x2B, 0x00, SS_RDEF, "Copy cannot execute since host cannot disconnect") }, /* DTLPWROMAEBKVF */ { SST(0x2C, 0x00, SS_RDEF, "Command sequence error") }, /* */ { SST(0x2C, 0x01, SS_RDEF, "Too many windows specified") }, /* */ { SST(0x2C, 0x02, SS_RDEF, "Invalid combination of windows specified") }, /* R */ { SST(0x2C, 0x03, SS_RDEF, "Current program area is not empty") }, /* R */ { SST(0x2C, 0x04, SS_RDEF, "Current program area is empty") }, /* B */ { SST(0x2C, 0x05, SS_RDEF, /* XXX TBD */ "Illegal power condition request") }, /* R */ { SST(0x2C, 0x06, SS_RDEF, /* XXX TBD */ "Persistent prevent conflict") }, /* DTLPWROMAEBKVF */ { SST(0x2C, 0x07, SS_RDEF, /* XXX TBD */ "Previous busy status") }, /* DTLPWROMAEBKVF */ { SST(0x2C, 0x08, SS_RDEF, /* XXX TBD */ "Previous task set full status") }, /* DTLPWROM EBKVF */ { SST(0x2C, 0x09, SS_RDEF, /* XXX TBD */ "Previous reservation conflict status") }, /* F */ { SST(0x2C, 0x0A, SS_RDEF, /* XXX TBD */ "Partition or collection contains user objects") }, /* T */ { SST(0x2C, 0x0B, SS_RDEF, /* XXX TBD */ "Not reserved") }, /* D */ { SST(0x2C, 0x0C, SS_RDEF, /* XXX TBD */ "ORWRITE generation does not match") }, /* T */ { SST(0x2D, 0x00, SS_RDEF, "Overwrite error on update in place") }, /* R */ { SST(0x2E, 0x00, SS_RDEF, /* XXX TBD */ "Insufficient time for operation") }, /* DTLPWROMAEBKVF */ { SST(0x2F, 0x00, SS_RDEF, "Commands cleared by another initiator") }, /* D */ { SST(0x2F, 0x01, SS_RDEF, /* XXX TBD */ "Commands cleared by power loss notification") }, /* DTLPWROMAEBKVF */ { SST(0x2F, 0x02, SS_RDEF, /* XXX TBD */ "Commands cleared by device server") }, /* DT WROM BK */ { SST(0x30, 0x00, SS_RDEF, "Incompatible medium installed") }, /* DT WRO BK */ { SST(0x30, 0x01, SS_RDEF, "Cannot read medium - unknown format") }, /* DT WRO BK */ { SST(0x30, 0x02, SS_RDEF, "Cannot read medium - incompatible format") }, /* DT R K */ { SST(0x30, 0x03, SS_RDEF, "Cleaning cartridge installed") }, /* DT WRO BK */ { SST(0x30, 0x04, SS_RDEF, "Cannot write medium - unknown format") }, /* DT WRO BK */ { SST(0x30, 0x05, SS_RDEF, "Cannot write medium - incompatible format") }, /* DT WRO B */ { SST(0x30, 0x06, SS_RDEF, "Cannot format medium - incompatible medium") }, /* DTL WROMAEBKVF */ { SST(0x30, 0x07, SS_RDEF, "Cleaning failure") }, /* R */ { SST(0x30, 0x08, SS_RDEF, "Cannot write - application code mismatch") }, /* R */ { SST(0x30, 0x09, SS_RDEF, "Current session not fixated for append") }, /* DT WRO AEBK */ { SST(0x30, 0x0A, SS_RDEF, /* XXX TBD */ "Cleaning request rejected") }, /* T */ { SST(0x30, 0x0C, SS_RDEF, /* XXX TBD */ "WORM medium - overwrite attempted") }, /* T */ { SST(0x30, 0x0D, SS_RDEF, /* XXX TBD */ "WORM medium - integrity check") }, /* R */ { SST(0x30, 0x10, SS_RDEF, /* XXX TBD */ "Medium not formatted") }, /* M */ { SST(0x30, 0x11, SS_RDEF, /* XXX TBD */ "Incompatible volume type") }, /* M */ { SST(0x30, 0x12, SS_RDEF, /* XXX TBD */ "Incompatible volume qualifier") }, /* M */ { SST(0x30, 0x13, SS_RDEF, /* XXX TBD */ "Cleaning volume expired") }, /* DT WRO BK */ { SST(0x31, 0x00, SS_RDEF, "Medium format corrupted") }, /* D L RO B */ { SST(0x31, 0x01, SS_RDEF, "Format command failed") }, /* R */ { SST(0x31, 0x02, SS_RDEF, /* XXX TBD */ "Zoned formatting failed due to spare linking") }, /* D B */ { SST(0x31, 0x03, SS_RDEF, /* XXX TBD */ "SANITIZE command failed") }, /* D W O BK */ { SST(0x32, 0x00, SS_RDEF, "No defect spare location available") }, /* D W O BK */ { SST(0x32, 0x01, SS_RDEF, "Defect list update failure") }, /* T */ { SST(0x33, 0x00, SS_RDEF, "Tape length error") }, /* DTLPWROMAEBKVF */ { SST(0x34, 0x00, SS_RDEF, "Enclosure failure") }, /* DTLPWROMAEBKVF */ { SST(0x35, 0x00, SS_RDEF, "Enclosure services failure") }, /* DTLPWROMAEBKVF */ { SST(0x35, 0x01, SS_RDEF, "Unsupported enclosure function") }, /* DTLPWROMAEBKVF */ { SST(0x35, 0x02, SS_RDEF, "Enclosure services unavailable") }, /* DTLPWROMAEBKVF */ { SST(0x35, 0x03, SS_RDEF, "Enclosure services transfer failure") }, /* DTLPWROMAEBKVF */ { SST(0x35, 0x04, SS_RDEF, "Enclosure services transfer refused") }, /* DTL WROMAEBKVF */ { SST(0x35, 0x05, SS_RDEF, /* XXX TBD */ "Enclosure services checksum error") }, /* L */ { SST(0x36, 0x00, SS_RDEF, "Ribbon, ink, or toner failure") }, /* DTL WROMAEBKVF */ { SST(0x37, 0x00, SS_RDEF, "Rounded parameter") }, /* B */ { SST(0x38, 0x00, SS_RDEF, /* XXX TBD */ "Event status notification") }, /* B */ { SST(0x38, 0x02, SS_RDEF, /* XXX TBD */ "ESN - power management class event") }, /* B */ { SST(0x38, 0x04, SS_RDEF, /* XXX TBD */ "ESN - media class event") }, /* B */ { SST(0x38, 0x06, SS_RDEF, /* XXX TBD */ "ESN - device busy class event") }, /* D */ { SST(0x38, 0x07, SS_RDEF, /* XXX TBD */ "Thin provisioning soft threshold reached") }, /* DTL WROMAE K */ { SST(0x39, 0x00, SS_RDEF, "Saving parameters not supported") }, /* DTL WROM BK */ { SST(0x3A, 0x00, SS_FATAL | ENXIO, "Medium not present") }, /* DT WROM BK */ { SST(0x3A, 0x01, SS_FATAL | ENXIO, "Medium not present - tray closed") }, /* DT WROM BK */ { SST(0x3A, 0x02, SS_FATAL | ENXIO, "Medium not present - tray open") }, /* DT WROM B */ { SST(0x3A, 0x03, SS_RDEF, /* XXX TBD */ "Medium not present - loadable") }, /* DT WRO B */ { SST(0x3A, 0x04, SS_RDEF, /* XXX TBD */ "Medium not present - medium auxiliary memory accessible") }, /* TL */ { SST(0x3B, 0x00, SS_RDEF, "Sequential positioning error") }, /* T */ { SST(0x3B, 0x01, SS_RDEF, "Tape position error at beginning-of-medium") }, /* T */ { SST(0x3B, 0x02, SS_RDEF, "Tape position error at end-of-medium") }, /* L */ { SST(0x3B, 0x03, SS_RDEF, "Tape or electronic vertical forms unit not ready") }, /* L */ { SST(0x3B, 0x04, SS_RDEF, "Slew failure") }, /* L */ { SST(0x3B, 0x05, SS_RDEF, "Paper jam") }, /* L */ { SST(0x3B, 0x06, SS_RDEF, "Failed to sense top-of-form") }, /* L */ { SST(0x3B, 0x07, SS_RDEF, "Failed to sense bottom-of-form") }, /* T */ { SST(0x3B, 0x08, SS_RDEF, "Reposition error") }, /* */ { SST(0x3B, 0x09, SS_RDEF, "Read past end of medium") }, /* */ { SST(0x3B, 0x0A, SS_RDEF, "Read past beginning of medium") }, /* */ { SST(0x3B, 0x0B, SS_RDEF, "Position past end of medium") }, /* T */ { SST(0x3B, 0x0C, SS_RDEF, "Position past beginning of medium") }, /* DT WROM BK */ { SST(0x3B, 0x0D, SS_FATAL | ENOSPC, "Medium destination element full") }, /* DT WROM BK */ { SST(0x3B, 0x0E, SS_RDEF, "Medium source element empty") }, /* R */ { SST(0x3B, 0x0F, SS_RDEF, "End of medium reached") }, /* DT WROM BK */ { SST(0x3B, 0x11, SS_RDEF, "Medium magazine not accessible") }, /* DT WROM BK */ { SST(0x3B, 0x12, SS_RDEF, "Medium magazine removed") }, /* DT WROM BK */ { SST(0x3B, 0x13, SS_RDEF, "Medium magazine inserted") }, /* DT WROM BK */ { SST(0x3B, 0x14, SS_RDEF, "Medium magazine locked") }, /* DT WROM BK */ { SST(0x3B, 0x15, SS_RDEF, "Medium magazine unlocked") }, /* R */ { SST(0x3B, 0x16, SS_RDEF, /* XXX TBD */ "Mechanical positioning or changer error") }, /* F */ { SST(0x3B, 0x17, SS_RDEF, /* XXX TBD */ "Read past end of user object") }, /* M */ { SST(0x3B, 0x18, SS_RDEF, /* XXX TBD */ "Element disabled") }, /* M */ { SST(0x3B, 0x19, SS_RDEF, /* XXX TBD */ "Element enabled") }, /* M */ { SST(0x3B, 0x1A, SS_RDEF, /* XXX TBD */ "Data transfer device removed") }, /* M */ { SST(0x3B, 0x1B, SS_RDEF, /* XXX TBD */ "Data transfer device inserted") }, /* T */ { SST(0x3B, 0x1C, SS_RDEF, /* XXX TBD */ "Too many logical objects on partition to support operation") }, /* DTLPWROMAE K */ { SST(0x3D, 0x00, SS_RDEF, "Invalid bits in IDENTIFY message") }, /* DTLPWROMAEBKVF */ { SST(0x3E, 0x00, SS_RDEF, "Logical unit has not self-configured yet") }, /* DTLPWROMAEBKVF */ { SST(0x3E, 0x01, SS_RDEF, "Logical unit failure") }, /* DTLPWROMAEBKVF */ { SST(0x3E, 0x02, SS_RDEF, "Timeout on logical unit") }, /* DTLPWROMAEBKVF */ { SST(0x3E, 0x03, SS_RDEF, /* XXX TBD */ "Logical unit failed self-test") }, /* DTLPWROMAEBKVF */ { SST(0x3E, 0x04, SS_RDEF, /* XXX TBD */ "Logical unit unable to update self-test log") }, /* DTLPWROMAEBKVF */ { SST(0x3F, 0x00, SS_RDEF, "Target operating conditions have changed") }, /* DTLPWROMAEBKVF */ { SST(0x3F, 0x01, SS_RDEF, "Microcode has been changed") }, /* DTLPWROM BK */ { SST(0x3F, 0x02, SS_RDEF, "Changed operating definition") }, /* DTLPWROMAEBKVF */ { SST(0x3F, 0x03, SS_RDEF, "INQUIRY data has changed") }, /* DT WROMAEBK */ { SST(0x3F, 0x04, SS_RDEF, "Component device attached") }, /* DT WROMAEBK */ { SST(0x3F, 0x05, SS_RDEF, "Device identifier changed") }, /* DT WROMAEB */ { SST(0x3F, 0x06, SS_RDEF, "Redundancy group created or modified") }, /* DT WROMAEB */ { SST(0x3F, 0x07, SS_RDEF, "Redundancy group deleted") }, /* DT WROMAEB */ { SST(0x3F, 0x08, SS_RDEF, "Spare created or modified") }, /* DT WROMAEB */ { SST(0x3F, 0x09, SS_RDEF, "Spare deleted") }, /* DT WROMAEBK */ { SST(0x3F, 0x0A, SS_RDEF, "Volume set created or modified") }, /* DT WROMAEBK */ { SST(0x3F, 0x0B, SS_RDEF, "Volume set deleted") }, /* DT WROMAEBK */ { SST(0x3F, 0x0C, SS_RDEF, "Volume set deassigned") }, /* DT WROMAEBK */ { SST(0x3F, 0x0D, SS_RDEF, "Volume set reassigned") }, /* DTLPWROMAE */ { SST(0x3F, 0x0E, SS_RDEF | SSQ_RESCAN , "Reported LUNs data has changed") }, /* DTLPWROMAEBKVF */ { SST(0x3F, 0x0F, SS_RDEF, /* XXX TBD */ "Echo buffer overwritten") }, /* DT WROM B */ { SST(0x3F, 0x10, SS_RDEF, /* XXX TBD */ "Medium loadable") }, /* DT WROM B */ { SST(0x3F, 0x11, SS_RDEF, /* XXX TBD */ "Medium auxiliary memory accessible") }, /* DTLPWR MAEBK F */ { SST(0x3F, 0x12, SS_RDEF, /* XXX TBD */ "iSCSI IP address added") }, /* DTLPWR MAEBK F */ { SST(0x3F, 0x13, SS_RDEF, /* XXX TBD */ "iSCSI IP address removed") }, /* DTLPWR MAEBK F */ { SST(0x3F, 0x14, SS_RDEF, /* XXX TBD */ "iSCSI IP address changed") }, /* D */ { SST(0x40, 0x00, SS_RDEF, "RAM failure") }, /* deprecated - use 40 NN instead */ /* DTLPWROMAEBKVF */ { SST(0x40, 0x80, SS_RDEF, "Diagnostic failure: ASCQ = Component ID") }, /* DTLPWROMAEBKVF */ { SST(0x40, 0xFF, SS_RDEF | SSQ_RANGE, NULL) }, /* Range 0x80->0xFF */ /* D */ { SST(0x41, 0x00, SS_RDEF, "Data path failure") }, /* deprecated - use 40 NN instead */ /* D */ { SST(0x42, 0x00, SS_RDEF, "Power-on or self-test failure") }, /* deprecated - use 40 NN instead */ /* DTLPWROMAEBKVF */ { SST(0x43, 0x00, SS_RDEF, "Message error") }, /* DTLPWROMAEBKVF */ { SST(0x44, 0x00, SS_RDEF, "Internal target failure") }, /* DT P MAEBKVF */ { SST(0x44, 0x01, SS_RDEF, /* XXX TBD */ "Persistent reservation information lost") }, /* DT B */ { SST(0x44, 0x71, SS_RDEF, /* XXX TBD */ "ATA device failed set features") }, /* DTLPWROMAEBKVF */ { SST(0x45, 0x00, SS_RDEF, "Select or reselect failure") }, /* DTLPWROM BK */ { SST(0x46, 0x00, SS_RDEF, "Unsuccessful soft reset") }, /* DTLPWROMAEBKVF */ { SST(0x47, 0x00, SS_RDEF, "SCSI parity error") }, /* DTLPWROMAEBKVF */ { SST(0x47, 0x01, SS_RDEF, /* XXX TBD */ "Data phase CRC error detected") }, /* DTLPWROMAEBKVF */ { SST(0x47, 0x02, SS_RDEF, /* XXX TBD */ "SCSI parity error detected during ST data phase") }, /* DTLPWROMAEBKVF */ { SST(0x47, 0x03, SS_RDEF, /* XXX TBD */ "Information unit iuCRC error detected") }, /* DTLPWROMAEBKVF */ { SST(0x47, 0x04, SS_RDEF, /* XXX TBD */ "Asynchronous information protection error detected") }, /* DTLPWROMAEBKVF */ { SST(0x47, 0x05, SS_RDEF, /* XXX TBD */ "Protocol service CRC error") }, /* DT MAEBKVF */ { SST(0x47, 0x06, SS_RDEF, /* XXX TBD */ "PHY test function in progress") }, /* DT PWROMAEBK */ { SST(0x47, 0x7F, SS_RDEF, /* XXX TBD */ "Some commands cleared by iSCSI protocol event") }, /* DTLPWROMAEBKVF */ { SST(0x48, 0x00, SS_RDEF, "Initiator detected error message received") }, /* DTLPWROMAEBKVF */ { SST(0x49, 0x00, SS_RDEF, "Invalid message error") }, /* DTLPWROMAEBKVF */ { SST(0x4A, 0x00, SS_RDEF, "Command phase error") }, /* DTLPWROMAEBKVF */ { SST(0x4B, 0x00, SS_RDEF, "Data phase error") }, /* DT PWROMAEBK */ { SST(0x4B, 0x01, SS_RDEF, /* XXX TBD */ "Invalid target port transfer tag received") }, /* DT PWROMAEBK */ { SST(0x4B, 0x02, SS_RDEF, /* XXX TBD */ "Too much write data") }, /* DT PWROMAEBK */ { SST(0x4B, 0x03, SS_RDEF, /* XXX TBD */ "ACK/NAK timeout") }, /* DT PWROMAEBK */ { SST(0x4B, 0x04, SS_RDEF, /* XXX TBD */ "NAK received") }, /* DT PWROMAEBK */ { SST(0x4B, 0x05, SS_RDEF, /* XXX TBD */ "Data offset error") }, /* DT PWROMAEBK */ { SST(0x4B, 0x06, SS_RDEF, /* XXX TBD */ "Initiator response timeout") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x07, SS_RDEF, /* XXX TBD */ "Connection lost") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x08, SS_RDEF, /* XXX TBD */ "Data-in buffer overflow - data buffer size") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x09, SS_RDEF, /* XXX TBD */ "Data-in buffer overflow - data buffer descriptor area") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x0A, SS_RDEF, /* XXX TBD */ "Data-in buffer error") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x0B, SS_RDEF, /* XXX TBD */ "Data-out buffer overflow - data buffer size") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x0C, SS_RDEF, /* XXX TBD */ "Data-out buffer overflow - data buffer descriptor area") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x0D, SS_RDEF, /* XXX TBD */ "Data-out buffer error") }, /* DTLPWROMAEBKVF */ { SST(0x4C, 0x00, SS_RDEF, "Logical unit failed self-configuration") }, /* DTLPWROMAEBKVF */ { SST(0x4D, 0x00, SS_RDEF, "Tagged overlapped commands: ASCQ = Queue tag ID") }, /* DTLPWROMAEBKVF */ { SST(0x4D, 0xFF, SS_RDEF | SSQ_RANGE, NULL) }, /* Range 0x00->0xFF */ /* DTLPWROMAEBKVF */ { SST(0x4E, 0x00, SS_RDEF, "Overlapped commands attempted") }, /* T */ { SST(0x50, 0x00, SS_RDEF, "Write append error") }, /* T */ { SST(0x50, 0x01, SS_RDEF, "Write append position error") }, /* T */ { SST(0x50, 0x02, SS_RDEF, "Position error related to timing") }, /* T RO */ { SST(0x51, 0x00, SS_RDEF, "Erase failure") }, /* R */ { SST(0x51, 0x01, SS_RDEF, /* XXX TBD */ "Erase failure - incomplete erase operation detected") }, /* T */ { SST(0x52, 0x00, SS_RDEF, "Cartridge fault") }, /* DTL WROM BK */ { SST(0x53, 0x00, SS_RDEF, "Media load or eject failed") }, /* T */ { SST(0x53, 0x01, SS_RDEF, "Unload tape failure") }, /* DT WROM BK */ { SST(0x53, 0x02, SS_RDEF, "Medium removal prevented") }, /* M */ { SST(0x53, 0x03, SS_RDEF, /* XXX TBD */ "Medium removal prevented by data transfer element") }, /* T */ { SST(0x53, 0x04, SS_RDEF, /* XXX TBD */ "Medium thread or unthread failure") }, /* M */ { SST(0x53, 0x05, SS_RDEF, /* XXX TBD */ "Volume identifier invalid") }, /* T */ { SST(0x53, 0x06, SS_RDEF, /* XXX TBD */ "Volume identifier missing") }, /* M */ { SST(0x53, 0x07, SS_RDEF, /* XXX TBD */ "Duplicate volume identifier") }, /* M */ { SST(0x53, 0x08, SS_RDEF, /* XXX TBD */ "Element status unknown") }, /* P */ { SST(0x54, 0x00, SS_RDEF, "SCSI to host system interface failure") }, /* P */ { SST(0x55, 0x00, SS_RDEF, "System resource failure") }, /* D O BK */ { SST(0x55, 0x01, SS_FATAL | ENOSPC, "System buffer full") }, /* DTLPWROMAE K */ { SST(0x55, 0x02, SS_RDEF, /* XXX TBD */ "Insufficient reservation resources") }, /* DTLPWROMAE K */ { SST(0x55, 0x03, SS_RDEF, /* XXX TBD */ "Insufficient resources") }, /* DTLPWROMAE K */ { SST(0x55, 0x04, SS_RDEF, /* XXX TBD */ "Insufficient registration resources") }, /* DT PWROMAEBK */ { SST(0x55, 0x05, SS_RDEF, /* XXX TBD */ "Insufficient access control resources") }, /* DT WROM B */ { SST(0x55, 0x06, SS_RDEF, /* XXX TBD */ "Auxiliary memory out of space") }, /* F */ { SST(0x55, 0x07, SS_RDEF, /* XXX TBD */ "Quota error") }, /* T */ { SST(0x55, 0x08, SS_RDEF, /* XXX TBD */ "Maximum number of supplemental decryption keys exceeded") }, /* M */ { SST(0x55, 0x09, SS_RDEF, /* XXX TBD */ "Medium auxiliary memory not accessible") }, /* M */ { SST(0x55, 0x0A, SS_RDEF, /* XXX TBD */ "Data currently unavailable") }, /* DTLPWROMAEBKVF */ { SST(0x55, 0x0B, SS_RDEF, /* XXX TBD */ "Insufficient power for operation") }, /* DT P B */ { SST(0x55, 0x0C, SS_RDEF, /* XXX TBD */ "Insufficient resources to create ROD") }, /* DT P B */ { SST(0x55, 0x0D, SS_RDEF, /* XXX TBD */ "Insufficient resources to create ROD token") }, /* R */ { SST(0x57, 0x00, SS_RDEF, "Unable to recover table-of-contents") }, /* O */ { SST(0x58, 0x00, SS_RDEF, "Generation does not exist") }, /* O */ { SST(0x59, 0x00, SS_RDEF, "Updated block read") }, /* DTLPWRO BK */ { SST(0x5A, 0x00, SS_RDEF, "Operator request or state change input") }, /* DT WROM BK */ { SST(0x5A, 0x01, SS_RDEF, "Operator medium removal request") }, /* DT WRO A BK */ { SST(0x5A, 0x02, SS_RDEF, "Operator selected write protect") }, /* DT WRO A BK */ { SST(0x5A, 0x03, SS_RDEF, "Operator selected write permit") }, /* DTLPWROM K */ { SST(0x5B, 0x00, SS_RDEF, "Log exception") }, /* DTLPWROM K */ { SST(0x5B, 0x01, SS_RDEF, "Threshold condition met") }, /* DTLPWROM K */ { SST(0x5B, 0x02, SS_RDEF, "Log counter at maximum") }, /* DTLPWROM K */ { SST(0x5B, 0x03, SS_RDEF, "Log list codes exhausted") }, /* D O */ { SST(0x5C, 0x00, SS_RDEF, "RPL status change") }, /* D O */ { SST(0x5C, 0x01, SS_NOP | SSQ_PRINT_SENSE, "Spindles synchronized") }, /* D O */ { SST(0x5C, 0x02, SS_RDEF, "Spindles not synchronized") }, /* DTLPWROMAEBKVF */ { SST(0x5D, 0x00, SS_RDEF, "Failure prediction threshold exceeded") }, /* R B */ { SST(0x5D, 0x01, SS_RDEF, /* XXX TBD */ "Media failure prediction threshold exceeded") }, /* R */ { SST(0x5D, 0x02, SS_RDEF, /* XXX TBD */ "Logical unit failure prediction threshold exceeded") }, /* R */ { SST(0x5D, 0x03, SS_RDEF, /* XXX TBD */ "Spare area exhaustion prediction threshold exceeded") }, /* D B */ { SST(0x5D, 0x10, SS_RDEF, /* XXX TBD */ "Hardware impending failure general hard drive failure") }, /* D B */ { SST(0x5D, 0x11, SS_RDEF, /* XXX TBD */ "Hardware impending failure drive error rate too high") }, /* D B */ { SST(0x5D, 0x12, SS_RDEF, /* XXX TBD */ "Hardware impending failure data error rate too high") }, /* D B */ { SST(0x5D, 0x13, SS_RDEF, /* XXX TBD */ "Hardware impending failure seek error rate too high") }, /* D B */ { SST(0x5D, 0x14, SS_RDEF, /* XXX TBD */ "Hardware impending failure too many block reassigns") }, /* D B */ { SST(0x5D, 0x15, SS_RDEF, /* XXX TBD */ "Hardware impending failure access times too high") }, /* D B */ { SST(0x5D, 0x16, SS_RDEF, /* XXX TBD */ "Hardware impending failure start unit times too high") }, /* D B */ { SST(0x5D, 0x17, SS_RDEF, /* XXX TBD */ "Hardware impending failure channel parametrics") }, /* D B */ { SST(0x5D, 0x18, SS_RDEF, /* XXX TBD */ "Hardware impending failure controller detected") }, /* D B */ { SST(0x5D, 0x19, SS_RDEF, /* XXX TBD */ "Hardware impending failure throughput performance") }, /* D B */ { SST(0x5D, 0x1A, SS_RDEF, /* XXX TBD */ "Hardware impending failure seek time performance") }, /* D B */ { SST(0x5D, 0x1B, SS_RDEF, /* XXX TBD */ "Hardware impending failure spin-up retry count") }, /* D B */ { SST(0x5D, 0x1C, SS_RDEF, /* XXX TBD */ "Hardware impending failure drive calibration retry count") }, /* D B */ { SST(0x5D, 0x20, SS_RDEF, /* XXX TBD */ "Controller impending failure general hard drive failure") }, /* D B */ { SST(0x5D, 0x21, SS_RDEF, /* XXX TBD */ "Controller impending failure drive error rate too high") }, /* D B */ { SST(0x5D, 0x22, SS_RDEF, /* XXX TBD */ "Controller impending failure data error rate too high") }, /* D B */ { SST(0x5D, 0x23, SS_RDEF, /* XXX TBD */ "Controller impending failure seek error rate too high") }, /* D B */ { SST(0x5D, 0x24, SS_RDEF, /* XXX TBD */ "Controller impending failure too many block reassigns") }, /* D B */ { SST(0x5D, 0x25, SS_RDEF, /* XXX TBD */ "Controller impending failure access times too high") }, /* D B */ { SST(0x5D, 0x26, SS_RDEF, /* XXX TBD */ "Controller impending failure start unit times too high") }, /* D B */ { SST(0x5D, 0x27, SS_RDEF, /* XXX TBD */ "Controller impending failure channel parametrics") }, /* D B */ { SST(0x5D, 0x28, SS_RDEF, /* XXX TBD */ "Controller impending failure controller detected") }, /* D B */ { SST(0x5D, 0x29, SS_RDEF, /* XXX TBD */ "Controller impending failure throughput performance") }, /* D B */ { SST(0x5D, 0x2A, SS_RDEF, /* XXX TBD */ "Controller impending failure seek time performance") }, /* D B */ { SST(0x5D, 0x2B, SS_RDEF, /* XXX TBD */ "Controller impending failure spin-up retry count") }, /* D B */ { SST(0x5D, 0x2C, SS_RDEF, /* XXX TBD */ "Controller impending failure drive calibration retry count") }, /* D B */ { SST(0x5D, 0x30, SS_RDEF, /* XXX TBD */ "Data channel impending failure general hard drive failure") }, /* D B */ { SST(0x5D, 0x31, SS_RDEF, /* XXX TBD */ "Data channel impending failure drive error rate too high") }, /* D B */ { SST(0x5D, 0x32, SS_RDEF, /* XXX TBD */ "Data channel impending failure data error rate too high") }, /* D B */ { SST(0x5D, 0x33, SS_RDEF, /* XXX TBD */ "Data channel impending failure seek error rate too high") }, /* D B */ { SST(0x5D, 0x34, SS_RDEF, /* XXX TBD */ "Data channel impending failure too many block reassigns") }, /* D B */ { SST(0x5D, 0x35, SS_RDEF, /* XXX TBD */ "Data channel impending failure access times too high") }, /* D B */ { SST(0x5D, 0x36, SS_RDEF, /* XXX TBD */ "Data channel impending failure start unit times too high") }, /* D B */ { SST(0x5D, 0x37, SS_RDEF, /* XXX TBD */ "Data channel impending failure channel parametrics") }, /* D B */ { SST(0x5D, 0x38, SS_RDEF, /* XXX TBD */ "Data channel impending failure controller detected") }, /* D B */ { SST(0x5D, 0x39, SS_RDEF, /* XXX TBD */ "Data channel impending failure throughput performance") }, /* D B */ { SST(0x5D, 0x3A, SS_RDEF, /* XXX TBD */ "Data channel impending failure seek time performance") }, /* D B */ { SST(0x5D, 0x3B, SS_RDEF, /* XXX TBD */ "Data channel impending failure spin-up retry count") }, /* D B */ { SST(0x5D, 0x3C, SS_RDEF, /* XXX TBD */ "Data channel impending failure drive calibration retry count") }, /* D B */ { SST(0x5D, 0x40, SS_RDEF, /* XXX TBD */ "Servo impending failure general hard drive failure") }, /* D B */ { SST(0x5D, 0x41, SS_RDEF, /* XXX TBD */ "Servo impending failure drive error rate too high") }, /* D B */ { SST(0x5D, 0x42, SS_RDEF, /* XXX TBD */ "Servo impending failure data error rate too high") }, /* D B */ { SST(0x5D, 0x43, SS_RDEF, /* XXX TBD */ "Servo impending failure seek error rate too high") }, /* D B */ { SST(0x5D, 0x44, SS_RDEF, /* XXX TBD */ "Servo impending failure too many block reassigns") }, /* D B */ { SST(0x5D, 0x45, SS_RDEF, /* XXX TBD */ "Servo impending failure access times too high") }, /* D B */ { SST(0x5D, 0x46, SS_RDEF, /* XXX TBD */ "Servo impending failure start unit times too high") }, /* D B */ { SST(0x5D, 0x47, SS_RDEF, /* XXX TBD */ "Servo impending failure channel parametrics") }, /* D B */ { SST(0x5D, 0x48, SS_RDEF, /* XXX TBD */ "Servo impending failure controller detected") }, /* D B */ { SST(0x5D, 0x49, SS_RDEF, /* XXX TBD */ "Servo impending failure throughput performance") }, /* D B */ { SST(0x5D, 0x4A, SS_RDEF, /* XXX TBD */ "Servo impending failure seek time performance") }, /* D B */ { SST(0x5D, 0x4B, SS_RDEF, /* XXX TBD */ "Servo impending failure spin-up retry count") }, /* D B */ { SST(0x5D, 0x4C, SS_RDEF, /* XXX TBD */ "Servo impending failure drive calibration retry count") }, /* D B */ { SST(0x5D, 0x50, SS_RDEF, /* XXX TBD */ "Spindle impending failure general hard drive failure") }, /* D B */ { SST(0x5D, 0x51, SS_RDEF, /* XXX TBD */ "Spindle impending failure drive error rate too high") }, /* D B */ { SST(0x5D, 0x52, SS_RDEF, /* XXX TBD */ "Spindle impending failure data error rate too high") }, /* D B */ { SST(0x5D, 0x53, SS_RDEF, /* XXX TBD */ "Spindle impending failure seek error rate too high") }, /* D B */ { SST(0x5D, 0x54, SS_RDEF, /* XXX TBD */ "Spindle impending failure too many block reassigns") }, /* D B */ { SST(0x5D, 0x55, SS_RDEF, /* XXX TBD */ "Spindle impending failure access times too high") }, /* D B */ { SST(0x5D, 0x56, SS_RDEF, /* XXX TBD */ "Spindle impending failure start unit times too high") }, /* D B */ { SST(0x5D, 0x57, SS_RDEF, /* XXX TBD */ "Spindle impending failure channel parametrics") }, /* D B */ { SST(0x5D, 0x58, SS_RDEF, /* XXX TBD */ "Spindle impending failure controller detected") }, /* D B */ { SST(0x5D, 0x59, SS_RDEF, /* XXX TBD */ "Spindle impending failure throughput performance") }, /* D B */ { SST(0x5D, 0x5A, SS_RDEF, /* XXX TBD */ "Spindle impending failure seek time performance") }, /* D B */ { SST(0x5D, 0x5B, SS_RDEF, /* XXX TBD */ "Spindle impending failure spin-up retry count") }, /* D B */ { SST(0x5D, 0x5C, SS_RDEF, /* XXX TBD */ "Spindle impending failure drive calibration retry count") }, /* D B */ { SST(0x5D, 0x60, SS_RDEF, /* XXX TBD */ "Firmware impending failure general hard drive failure") }, /* D B */ { SST(0x5D, 0x61, SS_RDEF, /* XXX TBD */ "Firmware impending failure drive error rate too high") }, /* D B */ { SST(0x5D, 0x62, SS_RDEF, /* XXX TBD */ "Firmware impending failure data error rate too high") }, /* D B */ { SST(0x5D, 0x63, SS_RDEF, /* XXX TBD */ "Firmware impending failure seek error rate too high") }, /* D B */ { SST(0x5D, 0x64, SS_RDEF, /* XXX TBD */ "Firmware impending failure too many block reassigns") }, /* D B */ { SST(0x5D, 0x65, SS_RDEF, /* XXX TBD */ "Firmware impending failure access times too high") }, /* D B */ { SST(0x5D, 0x66, SS_RDEF, /* XXX TBD */ "Firmware impending failure start unit times too high") }, /* D B */ { SST(0x5D, 0x67, SS_RDEF, /* XXX TBD */ "Firmware impending failure channel parametrics") }, /* D B */ { SST(0x5D, 0x68, SS_RDEF, /* XXX TBD */ "Firmware impending failure controller detected") }, /* D B */ { SST(0x5D, 0x69, SS_RDEF, /* XXX TBD */ "Firmware impending failure throughput performance") }, /* D B */ { SST(0x5D, 0x6A, SS_RDEF, /* XXX TBD */ "Firmware impending failure seek time performance") }, /* D B */ { SST(0x5D, 0x6B, SS_RDEF, /* XXX TBD */ "Firmware impending failure spin-up retry count") }, /* D B */ { SST(0x5D, 0x6C, SS_RDEF, /* XXX TBD */ "Firmware impending failure drive calibration retry count") }, /* DTLPWROMAEBKVF */ { SST(0x5D, 0xFF, SS_RDEF, "Failure prediction threshold exceeded (false)") }, /* DTLPWRO A K */ { SST(0x5E, 0x00, SS_RDEF, "Low power condition on") }, /* DTLPWRO A K */ { SST(0x5E, 0x01, SS_RDEF, "Idle condition activated by timer") }, /* DTLPWRO A K */ { SST(0x5E, 0x02, SS_RDEF, "Standby condition activated by timer") }, /* DTLPWRO A K */ { SST(0x5E, 0x03, SS_RDEF, "Idle condition activated by command") }, /* DTLPWRO A K */ { SST(0x5E, 0x04, SS_RDEF, "Standby condition activated by command") }, /* DTLPWRO A K */ { SST(0x5E, 0x05, SS_RDEF, "Idle-B condition activated by timer") }, /* DTLPWRO A K */ { SST(0x5E, 0x06, SS_RDEF, "Idle-B condition activated by command") }, /* DTLPWRO A K */ { SST(0x5E, 0x07, SS_RDEF, "Idle-C condition activated by timer") }, /* DTLPWRO A K */ { SST(0x5E, 0x08, SS_RDEF, "Idle-C condition activated by command") }, /* DTLPWRO A K */ { SST(0x5E, 0x09, SS_RDEF, "Standby-Y condition activated by timer") }, /* DTLPWRO A K */ { SST(0x5E, 0x0A, SS_RDEF, "Standby-Y condition activated by command") }, /* B */ { SST(0x5E, 0x41, SS_RDEF, /* XXX TBD */ "Power state change to active") }, /* B */ { SST(0x5E, 0x42, SS_RDEF, /* XXX TBD */ "Power state change to idle") }, /* B */ { SST(0x5E, 0x43, SS_RDEF, /* XXX TBD */ "Power state change to standby") }, /* B */ { SST(0x5E, 0x45, SS_RDEF, /* XXX TBD */ "Power state change to sleep") }, /* BK */ { SST(0x5E, 0x47, SS_RDEF, /* XXX TBD */ "Power state change to device control") }, /* */ { SST(0x60, 0x00, SS_RDEF, "Lamp failure") }, /* */ { SST(0x61, 0x00, SS_RDEF, "Video acquisition error") }, /* */ { SST(0x61, 0x01, SS_RDEF, "Unable to acquire video") }, /* */ { SST(0x61, 0x02, SS_RDEF, "Out of focus") }, /* */ { SST(0x62, 0x00, SS_RDEF, "Scan head positioning error") }, /* R */ { SST(0x63, 0x00, SS_RDEF, "End of user area encountered on this track") }, /* R */ { SST(0x63, 0x01, SS_FATAL | ENOSPC, "Packet does not fit in available space") }, /* R */ { SST(0x64, 0x00, SS_FATAL | ENXIO, "Illegal mode for this track") }, /* R */ { SST(0x64, 0x01, SS_RDEF, "Invalid packet size") }, /* DTLPWROMAEBKVF */ { SST(0x65, 0x00, SS_RDEF, "Voltage fault") }, /* */ { SST(0x66, 0x00, SS_RDEF, "Automatic document feeder cover up") }, /* */ { SST(0x66, 0x01, SS_RDEF, "Automatic document feeder lift up") }, /* */ { SST(0x66, 0x02, SS_RDEF, "Document jam in automatic document feeder") }, /* */ { SST(0x66, 0x03, SS_RDEF, "Document miss feed automatic in document feeder") }, /* A */ { SST(0x67, 0x00, SS_RDEF, "Configuration failure") }, /* A */ { SST(0x67, 0x01, SS_RDEF, "Configuration of incapable logical units failed") }, /* A */ { SST(0x67, 0x02, SS_RDEF, "Add logical unit failed") }, /* A */ { SST(0x67, 0x03, SS_RDEF, "Modification of logical unit failed") }, /* A */ { SST(0x67, 0x04, SS_RDEF, "Exchange of logical unit failed") }, /* A */ { SST(0x67, 0x05, SS_RDEF, "Remove of logical unit failed") }, /* A */ { SST(0x67, 0x06, SS_RDEF, "Attachment of logical unit failed") }, /* A */ { SST(0x67, 0x07, SS_RDEF, "Creation of logical unit failed") }, /* A */ { SST(0x67, 0x08, SS_RDEF, /* XXX TBD */ "Assign failure occurred") }, /* A */ { SST(0x67, 0x09, SS_RDEF, /* XXX TBD */ "Multiply assigned logical unit") }, /* DTLPWROMAEBKVF */ { SST(0x67, 0x0A, SS_RDEF, /* XXX TBD */ "Set target port groups command failed") }, /* DT B */ { SST(0x67, 0x0B, SS_RDEF, /* XXX TBD */ "ATA device feature not enabled") }, /* A */ { SST(0x68, 0x00, SS_RDEF, "Logical unit not configured") }, /* A */ { SST(0x69, 0x00, SS_RDEF, "Data loss on logical unit") }, /* A */ { SST(0x69, 0x01, SS_RDEF, "Multiple logical unit failures") }, /* A */ { SST(0x69, 0x02, SS_RDEF, "Parity/data mismatch") }, /* A */ { SST(0x6A, 0x00, SS_RDEF, "Informational, refer to log") }, /* A */ { SST(0x6B, 0x00, SS_RDEF, "State change has occurred") }, /* A */ { SST(0x6B, 0x01, SS_RDEF, "Redundancy level got better") }, /* A */ { SST(0x6B, 0x02, SS_RDEF, "Redundancy level got worse") }, /* A */ { SST(0x6C, 0x00, SS_RDEF, "Rebuild failure occurred") }, /* A */ { SST(0x6D, 0x00, SS_RDEF, "Recalculate failure occurred") }, /* A */ { SST(0x6E, 0x00, SS_RDEF, "Command to logical unit failed") }, /* R */ { SST(0x6F, 0x00, SS_RDEF, /* XXX TBD */ "Copy protection key exchange failure - authentication failure") }, /* R */ { SST(0x6F, 0x01, SS_RDEF, /* XXX TBD */ "Copy protection key exchange failure - key not present") }, /* R */ { SST(0x6F, 0x02, SS_RDEF, /* XXX TBD */ "Copy protection key exchange failure - key not established") }, /* R */ { SST(0x6F, 0x03, SS_RDEF, /* XXX TBD */ "Read of scrambled sector without authentication") }, /* R */ { SST(0x6F, 0x04, SS_RDEF, /* XXX TBD */ "Media region code is mismatched to logical unit region") }, /* R */ { SST(0x6F, 0x05, SS_RDEF, /* XXX TBD */ "Drive region must be permanent/region reset count error") }, /* R */ { SST(0x6F, 0x06, SS_RDEF, /* XXX TBD */ "Insufficient block count for binding NONCE recording") }, /* R */ { SST(0x6F, 0x07, SS_RDEF, /* XXX TBD */ "Conflict in binding NONCE recording") }, /* T */ { SST(0x70, 0x00, SS_RDEF, "Decompression exception short: ASCQ = Algorithm ID") }, /* T */ { SST(0x70, 0xFF, SS_RDEF | SSQ_RANGE, NULL) }, /* Range 0x00 -> 0xFF */ /* T */ { SST(0x71, 0x00, SS_RDEF, "Decompression exception long: ASCQ = Algorithm ID") }, /* T */ { SST(0x71, 0xFF, SS_RDEF | SSQ_RANGE, NULL) }, /* Range 0x00 -> 0xFF */ /* R */ { SST(0x72, 0x00, SS_RDEF, "Session fixation error") }, /* R */ { SST(0x72, 0x01, SS_RDEF, "Session fixation error writing lead-in") }, /* R */ { SST(0x72, 0x02, SS_RDEF, "Session fixation error writing lead-out") }, /* R */ { SST(0x72, 0x03, SS_RDEF, "Session fixation error - incomplete track in session") }, /* R */ { SST(0x72, 0x04, SS_RDEF, "Empty or partially written reserved track") }, /* R */ { SST(0x72, 0x05, SS_RDEF, /* XXX TBD */ "No more track reservations allowed") }, /* R */ { SST(0x72, 0x06, SS_RDEF, /* XXX TBD */ "RMZ extension is not allowed") }, /* R */ { SST(0x72, 0x07, SS_RDEF, /* XXX TBD */ "No more test zone extensions are allowed") }, /* R */ { SST(0x73, 0x00, SS_RDEF, "CD control error") }, /* R */ { SST(0x73, 0x01, SS_RDEF, "Power calibration area almost full") }, /* R */ { SST(0x73, 0x02, SS_FATAL | ENOSPC, "Power calibration area is full") }, /* R */ { SST(0x73, 0x03, SS_RDEF, "Power calibration area error") }, /* R */ { SST(0x73, 0x04, SS_RDEF, "Program memory area update failure") }, /* R */ { SST(0x73, 0x05, SS_RDEF, "Program memory area is full") }, /* R */ { SST(0x73, 0x06, SS_RDEF, /* XXX TBD */ "RMA/PMA is almost full") }, /* R */ { SST(0x73, 0x10, SS_RDEF, /* XXX TBD */ "Current power calibration area almost full") }, /* R */ { SST(0x73, 0x11, SS_RDEF, /* XXX TBD */ "Current power calibration area is full") }, /* R */ { SST(0x73, 0x17, SS_RDEF, /* XXX TBD */ "RDZ is full") }, /* T */ { SST(0x74, 0x00, SS_RDEF, /* XXX TBD */ "Security error") }, /* T */ { SST(0x74, 0x01, SS_RDEF, /* XXX TBD */ "Unable to decrypt data") }, /* T */ { SST(0x74, 0x02, SS_RDEF, /* XXX TBD */ "Unencrypted data encountered while decrypting") }, /* T */ { SST(0x74, 0x03, SS_RDEF, /* XXX TBD */ "Incorrect data encryption key") }, /* T */ { SST(0x74, 0x04, SS_RDEF, /* XXX TBD */ "Cryptographic integrity validation failed") }, /* T */ { SST(0x74, 0x05, SS_RDEF, /* XXX TBD */ "Error decrypting data") }, /* T */ { SST(0x74, 0x06, SS_RDEF, /* XXX TBD */ "Unknown signature verification key") }, /* T */ { SST(0x74, 0x07, SS_RDEF, /* XXX TBD */ "Encryption parameters not useable") }, /* DT R M E VF */ { SST(0x74, 0x08, SS_RDEF, /* XXX TBD */ "Digital signature validation failure") }, /* T */ { SST(0x74, 0x09, SS_RDEF, /* XXX TBD */ "Encryption mode mismatch on read") }, /* T */ { SST(0x74, 0x0A, SS_RDEF, /* XXX TBD */ "Encrypted block not raw read enabled") }, /* T */ { SST(0x74, 0x0B, SS_RDEF, /* XXX TBD */ "Incorrect encryption parameters") }, /* DT R MAEBKV */ { SST(0x74, 0x0C, SS_RDEF, /* XXX TBD */ "Unable to decrypt parameter list") }, /* T */ { SST(0x74, 0x0D, SS_RDEF, /* XXX TBD */ "Encryption algorithm disabled") }, /* DT R MAEBKV */ { SST(0x74, 0x10, SS_RDEF, /* XXX TBD */ "SA creation parameter value invalid") }, /* DT R MAEBKV */ { SST(0x74, 0x11, SS_RDEF, /* XXX TBD */ "SA creation parameter value rejected") }, /* DT R MAEBKV */ { SST(0x74, 0x12, SS_RDEF, /* XXX TBD */ "Invalid SA usage") }, /* T */ { SST(0x74, 0x21, SS_RDEF, /* XXX TBD */ "Data encryption configuration prevented") }, /* DT R MAEBKV */ { SST(0x74, 0x30, SS_RDEF, /* XXX TBD */ "SA creation parameter not supported") }, /* DT R MAEBKV */ { SST(0x74, 0x40, SS_RDEF, /* XXX TBD */ "Authentication failed") }, /* V */ { SST(0x74, 0x61, SS_RDEF, /* XXX TBD */ "External data encryption key manager access error") }, /* V */ { SST(0x74, 0x62, SS_RDEF, /* XXX TBD */ "External data encryption key manager error") }, /* V */ { SST(0x74, 0x63, SS_RDEF, /* XXX TBD */ "External data encryption key not found") }, /* V */ { SST(0x74, 0x64, SS_RDEF, /* XXX TBD */ "External data encryption request not authorized") }, /* T */ { SST(0x74, 0x6E, SS_RDEF, /* XXX TBD */ "External data encryption control timeout") }, /* T */ { SST(0x74, 0x6F, SS_RDEF, /* XXX TBD */ "External data encryption control error") }, /* DT R M E V */ { SST(0x74, 0x71, SS_RDEF, /* XXX TBD */ "Logical unit access not authorized") }, /* D */ { SST(0x74, 0x79, SS_RDEF, /* XXX TBD */ "Security conflict in translated device") } }; const int asc_table_size = sizeof(asc_table)/sizeof(asc_table[0]); struct asc_key { int asc; int ascq; }; static int ascentrycomp(const void *key, const void *member) { int asc; int ascq; const struct asc_table_entry *table_entry; asc = ((const struct asc_key *)key)->asc; ascq = ((const struct asc_key *)key)->ascq; table_entry = (const struct asc_table_entry *)member; if (asc >= table_entry->asc) { if (asc > table_entry->asc) return (1); if (ascq <= table_entry->ascq) { /* Check for ranges */ if (ascq == table_entry->ascq || ((table_entry->action & SSQ_RANGE) != 0 && ascq >= (table_entry - 1)->ascq)) return (0); return (-1); } return (1); } return (-1); } static int senseentrycomp(const void *key, const void *member) { int sense_key; const struct sense_key_table_entry *table_entry; sense_key = *((const int *)key); table_entry = (const struct sense_key_table_entry *)member; if (sense_key >= table_entry->sense_key) { if (sense_key == table_entry->sense_key) return (0); return (1); } return (-1); } static void fetchtableentries(int sense_key, int asc, int ascq, struct scsi_inquiry_data *inq_data, const struct sense_key_table_entry **sense_entry, const struct asc_table_entry **asc_entry) { caddr_t match; const struct asc_table_entry *asc_tables[2]; const struct sense_key_table_entry *sense_tables[2]; struct asc_key asc_ascq; size_t asc_tables_size[2]; size_t sense_tables_size[2]; int num_asc_tables; int num_sense_tables; int i; /* Default to failure */ *sense_entry = NULL; *asc_entry = NULL; match = NULL; if (inq_data != NULL) match = cam_quirkmatch((caddr_t)inq_data, (caddr_t)sense_quirk_table, sense_quirk_table_size, sizeof(*sense_quirk_table), scsi_inquiry_match); if (match != NULL) { struct scsi_sense_quirk_entry *quirk; quirk = (struct scsi_sense_quirk_entry *)match; asc_tables[0] = quirk->asc_info; asc_tables_size[0] = quirk->num_ascs; asc_tables[1] = asc_table; asc_tables_size[1] = asc_table_size; num_asc_tables = 2; sense_tables[0] = quirk->sense_key_info; sense_tables_size[0] = quirk->num_sense_keys; sense_tables[1] = sense_key_table; sense_tables_size[1] = sense_key_table_size; num_sense_tables = 2; } else { asc_tables[0] = asc_table; asc_tables_size[0] = asc_table_size; num_asc_tables = 1; sense_tables[0] = sense_key_table; sense_tables_size[0] = sense_key_table_size; num_sense_tables = 1; } asc_ascq.asc = asc; asc_ascq.ascq = ascq; for (i = 0; i < num_asc_tables; i++) { void *found_entry; found_entry = bsearch(&asc_ascq, asc_tables[i], asc_tables_size[i], sizeof(**asc_tables), ascentrycomp); if (found_entry) { *asc_entry = (struct asc_table_entry *)found_entry; break; } } for (i = 0; i < num_sense_tables; i++) { void *found_entry; found_entry = bsearch(&sense_key, sense_tables[i], sense_tables_size[i], sizeof(**sense_tables), senseentrycomp); if (found_entry) { *sense_entry = (struct sense_key_table_entry *)found_entry; break; } } } void scsi_sense_desc(int sense_key, int asc, int ascq, struct scsi_inquiry_data *inq_data, const char **sense_key_desc, const char **asc_desc) { const struct asc_table_entry *asc_entry; const struct sense_key_table_entry *sense_entry; fetchtableentries(sense_key, asc, ascq, inq_data, &sense_entry, &asc_entry); if (sense_entry != NULL) *sense_key_desc = sense_entry->desc; else *sense_key_desc = "Invalid Sense Key"; if (asc_entry != NULL) *asc_desc = asc_entry->desc; else if (asc >= 0x80 && asc <= 0xff) *asc_desc = "Vendor Specific ASC"; else if (ascq >= 0x80 && ascq <= 0xff) *asc_desc = "Vendor Specific ASCQ"; else *asc_desc = "Reserved ASC/ASCQ pair"; } /* * Given sense and device type information, return the appropriate action. * If we do not understand the specific error as identified by the ASC/ASCQ * pair, fall back on the more generic actions derived from the sense key. */ scsi_sense_action scsi_error_action(struct ccb_scsiio *csio, struct scsi_inquiry_data *inq_data, u_int32_t sense_flags) { const struct asc_table_entry *asc_entry; const struct sense_key_table_entry *sense_entry; int error_code, sense_key, asc, ascq; scsi_sense_action action; if (!scsi_extract_sense_ccb((union ccb *)csio, &error_code, &sense_key, &asc, &ascq)) { action = SS_RETRY | SSQ_DECREMENT_COUNT | SSQ_PRINT_SENSE | EIO; } else if ((error_code == SSD_DEFERRED_ERROR) || (error_code == SSD_DESC_DEFERRED_ERROR)) { /* * XXX dufault@FreeBSD.org * This error doesn't relate to the command associated * with this request sense. A deferred error is an error * for a command that has already returned GOOD status * (see SCSI2 8.2.14.2). * * By my reading of that section, it looks like the current * command has been cancelled, we should now clean things up * (hopefully recovering any lost data) and then retry the * current command. There are two easy choices, both wrong: * * 1. Drop through (like we had been doing), thus treating * this as if the error were for the current command and * return and stop the current command. * * 2. Issue a retry (like I made it do) thus hopefully * recovering the current transfer, and ignoring the * fact that we've dropped a command. * * These should probably be handled in a device specific * sense handler or punted back up to a user mode daemon */ action = SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE; } else { fetchtableentries(sense_key, asc, ascq, inq_data, &sense_entry, &asc_entry); /* * Override the 'No additional Sense' entry (0,0) * with the error action of the sense key. */ if (asc_entry != NULL && (asc != 0 || ascq != 0)) action = asc_entry->action; else if (sense_entry != NULL) action = sense_entry->action; else action = SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE; if (sense_key == SSD_KEY_RECOVERED_ERROR) { /* * The action succeeded but the device wants * the user to know that some recovery action * was required. */ action &= ~(SS_MASK|SSQ_MASK|SS_ERRMASK); action |= SS_NOP|SSQ_PRINT_SENSE; } else if (sense_key == SSD_KEY_ILLEGAL_REQUEST) { if ((sense_flags & SF_QUIET_IR) != 0) action &= ~SSQ_PRINT_SENSE; } else if (sense_key == SSD_KEY_UNIT_ATTENTION) { if ((sense_flags & SF_RETRY_UA) != 0 && (action & SS_MASK) == SS_FAIL) { action &= ~(SS_MASK|SSQ_MASK); action |= SS_RETRY|SSQ_DECREMENT_COUNT| SSQ_PRINT_SENSE; } action |= SSQ_UA; } } if ((action & SS_MASK) >= SS_START && (sense_flags & SF_NO_RECOVERY)) { action &= ~SS_MASK; action |= SS_FAIL; } else if ((action & SS_MASK) == SS_RETRY && (sense_flags & SF_NO_RETRY)) { action &= ~SS_MASK; action |= SS_FAIL; } if ((sense_flags & SF_PRINT_ALWAYS) != 0) action |= SSQ_PRINT_SENSE; else if ((sense_flags & SF_NO_PRINT) != 0) action &= ~SSQ_PRINT_SENSE; return (action); } char * scsi_cdb_string(u_int8_t *cdb_ptr, char *cdb_string, size_t len) { u_int8_t cdb_len; int i; if (cdb_ptr == NULL) return(""); /* Silence warnings */ cdb_len = 0; /* * This is taken from the SCSI-3 draft spec. * (T10/1157D revision 0.3) * The top 3 bits of an opcode are the group code. The next 5 bits * are the command code. * Group 0: six byte commands * Group 1: ten byte commands * Group 2: ten byte commands * Group 3: reserved * Group 4: sixteen byte commands * Group 5: twelve byte commands * Group 6: vendor specific * Group 7: vendor specific */ switch((*cdb_ptr >> 5) & 0x7) { case 0: cdb_len = 6; break; case 1: case 2: cdb_len = 10; break; case 3: case 6: case 7: /* in this case, just print out the opcode */ cdb_len = 1; break; case 4: cdb_len = 16; break; case 5: cdb_len = 12; break; } *cdb_string = '\0'; for (i = 0; i < cdb_len; i++) snprintf(cdb_string + strlen(cdb_string), len - strlen(cdb_string), "%02hhx ", cdb_ptr[i]); return(cdb_string); } const char * scsi_status_string(struct ccb_scsiio *csio) { switch(csio->scsi_status) { case SCSI_STATUS_OK: return("OK"); case SCSI_STATUS_CHECK_COND: return("Check Condition"); case SCSI_STATUS_BUSY: return("Busy"); case SCSI_STATUS_INTERMED: return("Intermediate"); case SCSI_STATUS_INTERMED_COND_MET: return("Intermediate-Condition Met"); case SCSI_STATUS_RESERV_CONFLICT: return("Reservation Conflict"); case SCSI_STATUS_CMD_TERMINATED: return("Command Terminated"); case SCSI_STATUS_QUEUE_FULL: return("Queue Full"); case SCSI_STATUS_ACA_ACTIVE: return("ACA Active"); case SCSI_STATUS_TASK_ABORTED: return("Task Aborted"); default: { static char unkstr[64]; snprintf(unkstr, sizeof(unkstr), "Unknown %#x", csio->scsi_status); return(unkstr); } } } /* * scsi_command_string() returns 0 for success and -1 for failure. */ #ifdef _KERNEL int scsi_command_string(struct ccb_scsiio *csio, struct sbuf *sb) #else /* !_KERNEL */ int scsi_command_string(struct cam_device *device, struct ccb_scsiio *csio, struct sbuf *sb) #endif /* _KERNEL/!_KERNEL */ { struct scsi_inquiry_data *inq_data; char cdb_str[(SCSI_MAX_CDBLEN * 3) + 1]; #ifdef _KERNEL struct ccb_getdev *cgd; #endif /* _KERNEL */ #ifdef _KERNEL if ((cgd = (struct ccb_getdev*)xpt_alloc_ccb_nowait()) == NULL) return(-1); /* * Get the device information. */ xpt_setup_ccb(&cgd->ccb_h, csio->ccb_h.path, CAM_PRIORITY_NORMAL); cgd->ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)cgd); /* * If the device is unconfigured, just pretend that it is a hard * drive. scsi_op_desc() needs this. */ if (cgd->ccb_h.status == CAM_DEV_NOT_THERE) cgd->inq_data.device = T_DIRECT; inq_data = &cgd->inq_data; #else /* !_KERNEL */ inq_data = &device->inq_data; #endif /* _KERNEL/!_KERNEL */ if ((csio->ccb_h.flags & CAM_CDB_POINTER) != 0) { sbuf_printf(sb, "%s. CDB: %s", scsi_op_desc(csio->cdb_io.cdb_ptr[0], inq_data), scsi_cdb_string(csio->cdb_io.cdb_ptr, cdb_str, sizeof(cdb_str))); } else { sbuf_printf(sb, "%s. CDB: %s", scsi_op_desc(csio->cdb_io.cdb_bytes[0], inq_data), scsi_cdb_string(csio->cdb_io.cdb_bytes, cdb_str, sizeof(cdb_str))); } #ifdef _KERNEL xpt_free_ccb((union ccb *)cgd); #endif return(0); } /* * Iterate over sense descriptors. Each descriptor is passed into iter_func(). * If iter_func() returns 0, list traversal continues. If iter_func() * returns non-zero, list traversal is stopped. */ void scsi_desc_iterate(struct scsi_sense_data_desc *sense, u_int sense_len, int (*iter_func)(struct scsi_sense_data_desc *sense, u_int, struct scsi_sense_desc_header *, void *), void *arg) { int cur_pos; int desc_len; /* * First make sure the extra length field is present. */ if (SSD_DESC_IS_PRESENT(sense, sense_len, extra_len) == 0) return; /* * The length of data actually returned may be different than the * extra_len recorded in the sturcture. */ desc_len = sense_len -offsetof(struct scsi_sense_data_desc, sense_desc); /* * Limit this further by the extra length reported, and the maximum * allowed extra length. */ desc_len = MIN(desc_len, MIN(sense->extra_len, SSD_EXTRA_MAX)); /* * Subtract the size of the header from the descriptor length. * This is to ensure that we have at least the header left, so we * don't have to check that inside the loop. This can wind up * being a negative value. */ desc_len -= sizeof(struct scsi_sense_desc_header); for (cur_pos = 0; cur_pos < desc_len;) { struct scsi_sense_desc_header *header; header = (struct scsi_sense_desc_header *) &sense->sense_desc[cur_pos]; /* * Check to make sure we have the entire descriptor. We * don't call iter_func() unless we do. * * Note that although cur_pos is at the beginning of the * descriptor, desc_len already has the header length * subtracted. So the comparison of the length in the * header (which does not include the header itself) to * desc_len - cur_pos is correct. */ if (header->length > (desc_len - cur_pos)) break; if (iter_func(sense, sense_len, header, arg) != 0) break; cur_pos += sizeof(*header) + header->length; } } struct scsi_find_desc_info { uint8_t desc_type; struct scsi_sense_desc_header *header; }; static int scsi_find_desc_func(struct scsi_sense_data_desc *sense, u_int sense_len, struct scsi_sense_desc_header *header, void *arg) { struct scsi_find_desc_info *desc_info; desc_info = (struct scsi_find_desc_info *)arg; if (header->desc_type == desc_info->desc_type) { desc_info->header = header; /* We found the descriptor, tell the iterator to stop. */ return (1); } else return (0); } /* * Given a descriptor type, return a pointer to it if it is in the sense * data and not truncated. Avoiding truncating sense data will simplify * things significantly for the caller. */ uint8_t * scsi_find_desc(struct scsi_sense_data_desc *sense, u_int sense_len, uint8_t desc_type) { struct scsi_find_desc_info desc_info; desc_info.desc_type = desc_type; desc_info.header = NULL; scsi_desc_iterate(sense, sense_len, scsi_find_desc_func, &desc_info); return ((uint8_t *)desc_info.header); } /* * Fill in SCSI sense data with the specified parameters. This routine can * fill in either fixed or descriptor type sense data. */ void scsi_set_sense_data_va(struct scsi_sense_data *sense_data, scsi_sense_data_type sense_format, int current_error, int sense_key, int asc, int ascq, va_list ap) { int descriptor_sense; scsi_sense_elem_type elem_type; /* * Determine whether to return fixed or descriptor format sense * data. If the user specifies SSD_TYPE_NONE for some reason, * they'll just get fixed sense data. */ if (sense_format == SSD_TYPE_DESC) descriptor_sense = 1; else descriptor_sense = 0; /* * Zero the sense data, so that we don't pass back any garbage data * to the user. */ memset(sense_data, 0, sizeof(*sense_data)); if (descriptor_sense != 0) { struct scsi_sense_data_desc *sense; sense = (struct scsi_sense_data_desc *)sense_data; /* * The descriptor sense format eliminates the use of the * valid bit. */ if (current_error != 0) sense->error_code = SSD_DESC_CURRENT_ERROR; else sense->error_code = SSD_DESC_DEFERRED_ERROR; sense->sense_key = sense_key; sense->add_sense_code = asc; sense->add_sense_code_qual = ascq; /* * Start off with no extra length, since the above data * fits in the standard descriptor sense information. */ sense->extra_len = 0; while ((elem_type = (scsi_sense_elem_type)va_arg(ap, scsi_sense_elem_type)) != SSD_ELEM_NONE) { int sense_len, len_to_copy; uint8_t *data; if (elem_type >= SSD_ELEM_MAX) { printf("%s: invalid sense type %d\n", __func__, elem_type); break; } sense_len = (int)va_arg(ap, int); len_to_copy = MIN(sense_len, SSD_EXTRA_MAX - sense->extra_len); data = (uint8_t *)va_arg(ap, uint8_t *); /* * We've already consumed the arguments for this one. */ if (elem_type == SSD_ELEM_SKIP) continue; switch (elem_type) { case SSD_ELEM_DESC: { /* * This is a straight descriptor. All we * need to do is copy the data in. */ bcopy(data, &sense->sense_desc[ sense->extra_len], len_to_copy); sense->extra_len += len_to_copy; break; } case SSD_ELEM_SKS: { struct scsi_sense_sks sks; bzero(&sks, sizeof(sks)); /* * This is already-formatted sense key * specific data. We just need to fill out * the header and copy everything in. */ bcopy(data, &sks.sense_key_spec, MIN(len_to_copy, sizeof(sks.sense_key_spec))); sks.desc_type = SSD_DESC_SKS; sks.length = sizeof(sks) - offsetof(struct scsi_sense_sks, reserved1); bcopy(&sks,&sense->sense_desc[sense->extra_len], sizeof(sks)); sense->extra_len += sizeof(sks); break; } case SSD_ELEM_INFO: case SSD_ELEM_COMMAND: { struct scsi_sense_command cmd; struct scsi_sense_info info; uint8_t *data_dest; uint8_t *descriptor; int descriptor_size, i, copy_len; bzero(&cmd, sizeof(cmd)); bzero(&info, sizeof(info)); /* * Command or information data. The * operate in pretty much the same way. */ if (elem_type == SSD_ELEM_COMMAND) { len_to_copy = MIN(len_to_copy, sizeof(cmd.command_info)); descriptor = (uint8_t *)&cmd; descriptor_size = sizeof(cmd); data_dest =(uint8_t *)&cmd.command_info; cmd.desc_type = SSD_DESC_COMMAND; cmd.length = sizeof(cmd) - offsetof(struct scsi_sense_command, reserved); } else { len_to_copy = MIN(len_to_copy, sizeof(info.info)); descriptor = (uint8_t *)&info; descriptor_size = sizeof(cmd); data_dest = (uint8_t *)&info.info; info.desc_type = SSD_DESC_INFO; info.byte2 = SSD_INFO_VALID; info.length = sizeof(info) - offsetof(struct scsi_sense_info, byte2); } /* * Copy this in reverse because the spec * (SPC-4) says that when 4 byte quantities * are stored in this 8 byte field, the * first four bytes shall be 0. * * So we fill the bytes in from the end, and * if we have less than 8 bytes to copy, * the initial, most significant bytes will * be 0. */ for (i = sense_len - 1; i >= 0 && len_to_copy > 0; i--, len_to_copy--) data_dest[len_to_copy - 1] = data[i]; /* * This calculation looks much like the * initial len_to_copy calculation, but * we have to do it again here, because * we're looking at a larger amount that * may or may not fit. It's not only the * data the user passed in, but also the * rest of the descriptor. */ copy_len = MIN(descriptor_size, SSD_EXTRA_MAX - sense->extra_len); bcopy(descriptor, &sense->sense_desc[ sense->extra_len], copy_len); sense->extra_len += copy_len; break; } case SSD_ELEM_FRU: { struct scsi_sense_fru fru; int copy_len; bzero(&fru, sizeof(fru)); fru.desc_type = SSD_DESC_FRU; fru.length = sizeof(fru) - offsetof(struct scsi_sense_fru, reserved); fru.fru = *data; copy_len = MIN(sizeof(fru), SSD_EXTRA_MAX - sense->extra_len); bcopy(&fru, &sense->sense_desc[ sense->extra_len], copy_len); sense->extra_len += copy_len; break; } case SSD_ELEM_STREAM: { struct scsi_sense_stream stream_sense; int copy_len; bzero(&stream_sense, sizeof(stream_sense)); stream_sense.desc_type = SSD_DESC_STREAM; stream_sense.length = sizeof(stream_sense) - offsetof(struct scsi_sense_stream, reserved); stream_sense.byte3 = *data; copy_len = MIN(sizeof(stream_sense), SSD_EXTRA_MAX - sense->extra_len); bcopy(&stream_sense, &sense->sense_desc[ sense->extra_len], copy_len); sense->extra_len += copy_len; break; } default: /* * We shouldn't get here, but if we do, do * nothing. We've already consumed the * arguments above. */ break; } } } else { struct scsi_sense_data_fixed *sense; sense = (struct scsi_sense_data_fixed *)sense_data; if (current_error != 0) sense->error_code = SSD_CURRENT_ERROR; else sense->error_code = SSD_DEFERRED_ERROR; sense->flags = sense_key; sense->add_sense_code = asc; sense->add_sense_code_qual = ascq; /* * We've set the ASC and ASCQ, so we have 6 more bytes of * valid data. If we wind up setting any of the other * fields, we'll bump this to 10 extra bytes. */ sense->extra_len = 6; while ((elem_type = (scsi_sense_elem_type)va_arg(ap, scsi_sense_elem_type)) != SSD_ELEM_NONE) { int sense_len, len_to_copy; uint8_t *data; if (elem_type >= SSD_ELEM_MAX) { printf("%s: invalid sense type %d\n", __func__, elem_type); break; } /* * If we get in here, just bump the extra length to * 10 bytes. That will encompass anything we're * going to set here. */ sense->extra_len = 10; sense_len = (int)va_arg(ap, int); len_to_copy = MIN(sense_len, SSD_EXTRA_MAX - sense->extra_len); data = (uint8_t *)va_arg(ap, uint8_t *); switch (elem_type) { case SSD_ELEM_SKS: /* * The user passed in pre-formatted sense * key specific data. */ bcopy(data, &sense->sense_key_spec[0], MIN(sizeof(sense->sense_key_spec), sense_len)); break; case SSD_ELEM_INFO: case SSD_ELEM_COMMAND: { uint8_t *data_dest; int i; if (elem_type == SSD_ELEM_COMMAND) data_dest = &sense->cmd_spec_info[0]; else { data_dest = &sense->info[0]; /* * We're setting the info field, so * set the valid bit. */ sense->error_code |= SSD_ERRCODE_VALID; } /* * Copy this in reverse so that if we have * less than 4 bytes to fill, the least * significant bytes will be at the end. * If we have more than 4 bytes, only the * least significant bytes will be included. */ for (i = sense_len - 1; i >= 0 && len_to_copy > 0; i--, len_to_copy--) data_dest[len_to_copy - 1] = data[i]; break; } case SSD_ELEM_FRU: sense->fru = *data; break; case SSD_ELEM_STREAM: sense->flags |= *data; break; case SSD_ELEM_DESC: default: /* * If the user passes in descriptor sense, * we can't handle that in fixed format. * So just skip it, and any unknown argument * types. */ break; } } } } void scsi_set_sense_data(struct scsi_sense_data *sense_data, scsi_sense_data_type sense_format, int current_error, int sense_key, int asc, int ascq, ...) { va_list ap; va_start(ap, ascq); scsi_set_sense_data_va(sense_data, sense_format, current_error, sense_key, asc, ascq, ap); va_end(ap); } /* * Get sense information for three similar sense data types. */ int scsi_get_sense_info(struct scsi_sense_data *sense_data, u_int sense_len, uint8_t info_type, uint64_t *info, int64_t *signed_info) { scsi_sense_data_type sense_type; if (sense_len == 0) goto bailout; sense_type = scsi_sense_type(sense_data); switch (sense_type) { case SSD_TYPE_DESC: { struct scsi_sense_data_desc *sense; uint8_t *desc; sense = (struct scsi_sense_data_desc *)sense_data; desc = scsi_find_desc(sense, sense_len, info_type); if (desc == NULL) goto bailout; switch (info_type) { case SSD_DESC_INFO: { struct scsi_sense_info *info_desc; info_desc = (struct scsi_sense_info *)desc; *info = scsi_8btou64(info_desc->info); if (signed_info != NULL) *signed_info = *info; break; } case SSD_DESC_COMMAND: { struct scsi_sense_command *cmd_desc; cmd_desc = (struct scsi_sense_command *)desc; *info = scsi_8btou64(cmd_desc->command_info); if (signed_info != NULL) *signed_info = *info; break; } case SSD_DESC_FRU: { struct scsi_sense_fru *fru_desc; fru_desc = (struct scsi_sense_fru *)desc; *info = fru_desc->fru; if (signed_info != NULL) *signed_info = (int8_t)fru_desc->fru; break; } default: goto bailout; break; } break; } case SSD_TYPE_FIXED: { struct scsi_sense_data_fixed *sense; sense = (struct scsi_sense_data_fixed *)sense_data; switch (info_type) { case SSD_DESC_INFO: { uint32_t info_val; if ((sense->error_code & SSD_ERRCODE_VALID) == 0) goto bailout; if (SSD_FIXED_IS_PRESENT(sense, sense_len, info) == 0) goto bailout; info_val = scsi_4btoul(sense->info); *info = info_val; if (signed_info != NULL) *signed_info = (int32_t)info_val; break; } case SSD_DESC_COMMAND: { uint32_t cmd_val; if ((SSD_FIXED_IS_PRESENT(sense, sense_len, cmd_spec_info) == 0) || (SSD_FIXED_IS_FILLED(sense, cmd_spec_info) == 0)) goto bailout; cmd_val = scsi_4btoul(sense->cmd_spec_info); if (cmd_val == 0) goto bailout; *info = cmd_val; if (signed_info != NULL) *signed_info = (int32_t)cmd_val; break; } case SSD_DESC_FRU: if ((SSD_FIXED_IS_PRESENT(sense, sense_len, fru) == 0) || (SSD_FIXED_IS_FILLED(sense, fru) == 0)) goto bailout; if (sense->fru == 0) goto bailout; *info = sense->fru; if (signed_info != NULL) *signed_info = (int8_t)sense->fru; break; default: goto bailout; break; } break; } default: goto bailout; break; } return (0); bailout: return (1); } int scsi_get_sks(struct scsi_sense_data *sense_data, u_int sense_len, uint8_t *sks) { scsi_sense_data_type sense_type; if (sense_len == 0) goto bailout; sense_type = scsi_sense_type(sense_data); switch (sense_type) { case SSD_TYPE_DESC: { struct scsi_sense_data_desc *sense; struct scsi_sense_sks *desc; sense = (struct scsi_sense_data_desc *)sense_data; desc = (struct scsi_sense_sks *)scsi_find_desc(sense, sense_len, SSD_DESC_SKS); if (desc == NULL) goto bailout; /* * No need to check the SKS valid bit for descriptor sense. * If the descriptor is present, it is valid. */ bcopy(desc->sense_key_spec, sks, sizeof(desc->sense_key_spec)); break; } case SSD_TYPE_FIXED: { struct scsi_sense_data_fixed *sense; sense = (struct scsi_sense_data_fixed *)sense_data; if ((SSD_FIXED_IS_PRESENT(sense, sense_len, sense_key_spec)== 0) || (SSD_FIXED_IS_FILLED(sense, sense_key_spec) == 0)) goto bailout; if ((sense->sense_key_spec[0] & SSD_SCS_VALID) == 0) goto bailout; bcopy(sense->sense_key_spec, sks,sizeof(sense->sense_key_spec)); break; } default: goto bailout; break; } return (0); bailout: return (1); } /* * Provide a common interface for fixed and descriptor sense to detect * whether we have block-specific sense information. It is clear by the * presence of the block descriptor in descriptor mode, but we have to * infer from the inquiry data and ILI bit in fixed mode. */ int scsi_get_block_info(struct scsi_sense_data *sense_data, u_int sense_len, struct scsi_inquiry_data *inq_data, uint8_t *block_bits) { scsi_sense_data_type sense_type; if (inq_data != NULL) { switch (SID_TYPE(inq_data)) { case T_DIRECT: case T_RBC: break; default: goto bailout; break; } } sense_type = scsi_sense_type(sense_data); switch (sense_type) { case SSD_TYPE_DESC: { struct scsi_sense_data_desc *sense; struct scsi_sense_block *block; sense = (struct scsi_sense_data_desc *)sense_data; block = (struct scsi_sense_block *)scsi_find_desc(sense, sense_len, SSD_DESC_BLOCK); if (block == NULL) goto bailout; *block_bits = block->byte3; break; } case SSD_TYPE_FIXED: { struct scsi_sense_data_fixed *sense; sense = (struct scsi_sense_data_fixed *)sense_data; if (SSD_FIXED_IS_PRESENT(sense, sense_len, flags) == 0) goto bailout; if ((sense->flags & SSD_ILI) == 0) goto bailout; *block_bits = sense->flags & SSD_ILI; break; } default: goto bailout; break; } return (0); bailout: return (1); } int scsi_get_stream_info(struct scsi_sense_data *sense_data, u_int sense_len, struct scsi_inquiry_data *inq_data, uint8_t *stream_bits) { scsi_sense_data_type sense_type; if (inq_data != NULL) { switch (SID_TYPE(inq_data)) { case T_SEQUENTIAL: break; default: goto bailout; break; } } sense_type = scsi_sense_type(sense_data); switch (sense_type) { case SSD_TYPE_DESC: { struct scsi_sense_data_desc *sense; struct scsi_sense_stream *stream; sense = (struct scsi_sense_data_desc *)sense_data; stream = (struct scsi_sense_stream *)scsi_find_desc(sense, sense_len, SSD_DESC_STREAM); if (stream == NULL) goto bailout; *stream_bits = stream->byte3; break; } case SSD_TYPE_FIXED: { struct scsi_sense_data_fixed *sense; sense = (struct scsi_sense_data_fixed *)sense_data; if (SSD_FIXED_IS_PRESENT(sense, sense_len, flags) == 0) goto bailout; if ((sense->flags & (SSD_ILI|SSD_EOM|SSD_FILEMARK)) == 0) goto bailout; *stream_bits = sense->flags & (SSD_ILI|SSD_EOM|SSD_FILEMARK); break; } default: goto bailout; break; } return (0); bailout: return (1); } void scsi_info_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, uint64_t info) { sbuf_printf(sb, "Info: %#jx", info); } void scsi_command_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, uint64_t csi) { sbuf_printf(sb, "Command Specific Info: %#jx", csi); } void scsi_progress_sbuf(struct sbuf *sb, uint16_t progress) { sbuf_printf(sb, "Progress: %d%% (%d/%d) complete", (progress * 100) / SSD_SKS_PROGRESS_DENOM, progress, SSD_SKS_PROGRESS_DENOM); } /* * Returns 1 for failure (i.e. SKS isn't valid) and 0 for success. */ int scsi_sks_sbuf(struct sbuf *sb, int sense_key, uint8_t *sks) { if ((sks[0] & SSD_SKS_VALID) == 0) return (1); switch (sense_key) { case SSD_KEY_ILLEGAL_REQUEST: { struct scsi_sense_sks_field *field; int bad_command; char tmpstr[40]; /*Field Pointer*/ field = (struct scsi_sense_sks_field *)sks; if (field->byte0 & SSD_SKS_FIELD_CMD) bad_command = 1; else bad_command = 0; tmpstr[0] = '\0'; /* Bit pointer is valid */ if (field->byte0 & SSD_SKS_BPV) snprintf(tmpstr, sizeof(tmpstr), "bit %d ", field->byte0 & SSD_SKS_BIT_VALUE); sbuf_printf(sb, "%s byte %d %sis invalid", bad_command ? "Command" : "Data", scsi_2btoul(field->field), tmpstr); break; } case SSD_KEY_UNIT_ATTENTION: { struct scsi_sense_sks_overflow *overflow; overflow = (struct scsi_sense_sks_overflow *)sks; /*UA Condition Queue Overflow*/ sbuf_printf(sb, "Unit Attention Condition Queue %s", (overflow->byte0 & SSD_SKS_OVERFLOW_SET) ? "Overflowed" : "Did Not Overflow??"); break; } case SSD_KEY_RECOVERED_ERROR: case SSD_KEY_HARDWARE_ERROR: case SSD_KEY_MEDIUM_ERROR: { struct scsi_sense_sks_retry *retry; /*Actual Retry Count*/ retry = (struct scsi_sense_sks_retry *)sks; sbuf_printf(sb, "Actual Retry Count: %d", scsi_2btoul(retry->actual_retry_count)); break; } case SSD_KEY_NO_SENSE: case SSD_KEY_NOT_READY: { struct scsi_sense_sks_progress *progress; int progress_val; /*Progress Indication*/ progress = (struct scsi_sense_sks_progress *)sks; progress_val = scsi_2btoul(progress->progress); scsi_progress_sbuf(sb, progress_val); break; } case SSD_KEY_COPY_ABORTED: { struct scsi_sense_sks_segment *segment; char tmpstr[40]; /*Segment Pointer*/ segment = (struct scsi_sense_sks_segment *)sks; tmpstr[0] = '\0'; if (segment->byte0 & SSD_SKS_SEGMENT_BPV) snprintf(tmpstr, sizeof(tmpstr), "bit %d ", segment->byte0 & SSD_SKS_SEGMENT_BITPTR); sbuf_printf(sb, "%s byte %d %sis invalid", (segment->byte0 & SSD_SKS_SEGMENT_SD) ? "Segment" : "Data", scsi_2btoul(segment->field), tmpstr); break; } default: sbuf_printf(sb, "Sense Key Specific: %#x,%#x", sks[0], scsi_2btoul(&sks[1])); break; } return (0); } void scsi_fru_sbuf(struct sbuf *sb, uint64_t fru) { sbuf_printf(sb, "Field Replaceable Unit: %d", (int)fru); } void scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits, uint64_t info) { int need_comma; need_comma = 0; /* * XXX KDM this needs more descriptive decoding. */ if (stream_bits & SSD_DESC_STREAM_FM) { sbuf_printf(sb, "Filemark"); need_comma = 1; } if (stream_bits & SSD_DESC_STREAM_EOM) { sbuf_printf(sb, "%sEOM", (need_comma) ? "," : ""); need_comma = 1; } if (stream_bits & SSD_DESC_STREAM_ILI) sbuf_printf(sb, "%sILI", (need_comma) ? "," : ""); sbuf_printf(sb, ": Info: %#jx", (uintmax_t) info); } void scsi_block_sbuf(struct sbuf *sb, uint8_t block_bits, uint64_t info) { if (block_bits & SSD_DESC_BLOCK_ILI) sbuf_printf(sb, "ILI: residue %#jx", (uintmax_t) info); } void scsi_sense_info_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_info *info; info = (struct scsi_sense_info *)header; scsi_info_sbuf(sb, cdb, cdb_len, inq_data, scsi_8btou64(info->info)); } void scsi_sense_command_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_command *command; command = (struct scsi_sense_command *)header; scsi_command_sbuf(sb, cdb, cdb_len, inq_data, scsi_8btou64(command->command_info)); } void scsi_sense_sks_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_sks *sks; int error_code, sense_key, asc, ascq; sks = (struct scsi_sense_sks *)header; scsi_extract_sense_len(sense, sense_len, &error_code, &sense_key, &asc, &ascq, /*show_errors*/ 1); scsi_sks_sbuf(sb, sense_key, sks->sense_key_spec); } void scsi_sense_fru_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_fru *fru; fru = (struct scsi_sense_fru *)header; scsi_fru_sbuf(sb, (uint64_t)fru->fru); } void scsi_sense_stream_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_stream *stream; uint64_t info; stream = (struct scsi_sense_stream *)header; info = 0; scsi_get_sense_info(sense, sense_len, SSD_DESC_INFO, &info, NULL); scsi_stream_sbuf(sb, stream->byte3, info); } void scsi_sense_block_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_block *block; uint64_t info; block = (struct scsi_sense_block *)header; info = 0; scsi_get_sense_info(sense, sense_len, SSD_DESC_INFO, &info, NULL); scsi_block_sbuf(sb, block->byte3, info); } void scsi_sense_progress_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_progress *progress; const char *sense_key_desc; const char *asc_desc; int progress_val; progress = (struct scsi_sense_progress *)header; /* * Get descriptions for the sense key, ASC, and ASCQ in the * progress descriptor. These could be different than the values * in the overall sense data. */ scsi_sense_desc(progress->sense_key, progress->add_sense_code, progress->add_sense_code_qual, inq_data, &sense_key_desc, &asc_desc); progress_val = scsi_2btoul(progress->progress); /* * The progress indicator is for the operation described by the * sense key, ASC, and ASCQ in the descriptor. */ sbuf_cat(sb, sense_key_desc); sbuf_printf(sb, " asc:%x,%x (%s): ", progress->add_sense_code, progress->add_sense_code_qual, asc_desc); scsi_progress_sbuf(sb, progress_val); } /* * Generic sense descriptor printing routine. This is used when we have * not yet implemented a specific printing routine for this descriptor. */ void scsi_sense_generic_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { int i; uint8_t *buf_ptr; sbuf_printf(sb, "Descriptor %#x:", header->desc_type); buf_ptr = (uint8_t *)&header[1]; for (i = 0; i < header->length; i++, buf_ptr++) sbuf_printf(sb, " %02x", *buf_ptr); } /* * Keep this list in numeric order. This speeds the array traversal. */ struct scsi_sense_desc_printer { uint8_t desc_type; /* * The function arguments here are the superset of what is needed * to print out various different descriptors. Command and * information descriptors need inquiry data and command type. * Sense key specific descriptors need the sense key. * * The sense, cdb, and inquiry data arguments may be NULL, but the * information printed may not be fully decoded as a result. */ void (*print_func)(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); } scsi_sense_printers[] = { {SSD_DESC_INFO, scsi_sense_info_sbuf}, {SSD_DESC_COMMAND, scsi_sense_command_sbuf}, {SSD_DESC_SKS, scsi_sense_sks_sbuf}, {SSD_DESC_FRU, scsi_sense_fru_sbuf}, {SSD_DESC_STREAM, scsi_sense_stream_sbuf}, {SSD_DESC_BLOCK, scsi_sense_block_sbuf}, {SSD_DESC_PROGRESS, scsi_sense_progress_sbuf} }; void scsi_sense_desc_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { int i; for (i = 0; i < (sizeof(scsi_sense_printers) / sizeof(scsi_sense_printers[0])); i++) { struct scsi_sense_desc_printer *printer; printer = &scsi_sense_printers[i]; /* * The list is sorted, so quit if we've passed our * descriptor number. */ if (printer->desc_type > header->desc_type) break; if (printer->desc_type != header->desc_type) continue; printer->print_func(sb, sense, sense_len, cdb, cdb_len, inq_data, header); return; } /* * No specific printing routine, so use the generic routine. */ scsi_sense_generic_sbuf(sb, sense, sense_len, cdb, cdb_len, inq_data, header); } scsi_sense_data_type scsi_sense_type(struct scsi_sense_data *sense_data) { switch (sense_data->error_code & SSD_ERRCODE) { case SSD_DESC_CURRENT_ERROR: case SSD_DESC_DEFERRED_ERROR: return (SSD_TYPE_DESC); break; case SSD_CURRENT_ERROR: case SSD_DEFERRED_ERROR: return (SSD_TYPE_FIXED); break; default: break; } return (SSD_TYPE_NONE); } struct scsi_print_sense_info { struct sbuf *sb; char *path_str; uint8_t *cdb; int cdb_len; struct scsi_inquiry_data *inq_data; }; static int scsi_print_desc_func(struct scsi_sense_data_desc *sense, u_int sense_len, struct scsi_sense_desc_header *header, void *arg) { struct scsi_print_sense_info *print_info; print_info = (struct scsi_print_sense_info *)arg; switch (header->desc_type) { case SSD_DESC_INFO: case SSD_DESC_FRU: case SSD_DESC_COMMAND: case SSD_DESC_SKS: case SSD_DESC_BLOCK: case SSD_DESC_STREAM: /* * We have already printed these descriptors, if they are * present. */ break; default: { sbuf_printf(print_info->sb, "%s", print_info->path_str); scsi_sense_desc_sbuf(print_info->sb, (struct scsi_sense_data *)sense, sense_len, print_info->cdb, print_info->cdb_len, print_info->inq_data, header); sbuf_printf(print_info->sb, "\n"); break; } } /* * Tell the iterator that we want to see more descriptors if they * are present. */ return (0); } void scsi_sense_only_sbuf(struct scsi_sense_data *sense, u_int sense_len, struct sbuf *sb, char *path_str, struct scsi_inquiry_data *inq_data, uint8_t *cdb, int cdb_len) { int error_code, sense_key, asc, ascq; sbuf_cat(sb, path_str); scsi_extract_sense_len(sense, sense_len, &error_code, &sense_key, &asc, &ascq, /*show_errors*/ 1); sbuf_printf(sb, "SCSI sense: "); switch (error_code) { case SSD_DEFERRED_ERROR: case SSD_DESC_DEFERRED_ERROR: sbuf_printf(sb, "Deferred error: "); /* FALLTHROUGH */ case SSD_CURRENT_ERROR: case SSD_DESC_CURRENT_ERROR: { struct scsi_sense_data_desc *desc_sense; struct scsi_print_sense_info print_info; const char *sense_key_desc; const char *asc_desc; uint8_t sks[3]; uint64_t val; int info_valid; /* * Get descriptions for the sense key, ASC, and ASCQ. If * these aren't present in the sense data (i.e. the sense * data isn't long enough), the -1 values that * scsi_extract_sense_len() returns will yield default * or error descriptions. */ scsi_sense_desc(sense_key, asc, ascq, inq_data, &sense_key_desc, &asc_desc); /* * We first print the sense key and ASC/ASCQ. */ sbuf_cat(sb, sense_key_desc); sbuf_printf(sb, " asc:%x,%x (%s)\n", asc, ascq, asc_desc); /* * Get the info field if it is valid. */ if (scsi_get_sense_info(sense, sense_len, SSD_DESC_INFO, &val, NULL) == 0) info_valid = 1; else info_valid = 0; if (info_valid != 0) { uint8_t bits; /* * Determine whether we have any block or stream * device-specific information. */ if (scsi_get_block_info(sense, sense_len, inq_data, &bits) == 0) { sbuf_cat(sb, path_str); scsi_block_sbuf(sb, bits, val); sbuf_printf(sb, "\n"); } else if (scsi_get_stream_info(sense, sense_len, inq_data, &bits) == 0) { sbuf_cat(sb, path_str); scsi_stream_sbuf(sb, bits, val); sbuf_printf(sb, "\n"); } else if (val != 0) { /* * The information field can be valid but 0. * If the block or stream bits aren't set, * and this is 0, it isn't terribly useful * to print it out. */ sbuf_cat(sb, path_str); scsi_info_sbuf(sb, cdb, cdb_len, inq_data, val); sbuf_printf(sb, "\n"); } } /* * Print the FRU. */ if (scsi_get_sense_info(sense, sense_len, SSD_DESC_FRU, &val, NULL) == 0) { sbuf_cat(sb, path_str); scsi_fru_sbuf(sb, val); sbuf_printf(sb, "\n"); } /* * Print any command-specific information. */ if (scsi_get_sense_info(sense, sense_len, SSD_DESC_COMMAND, &val, NULL) == 0) { sbuf_cat(sb, path_str); scsi_command_sbuf(sb, cdb, cdb_len, inq_data, val); sbuf_printf(sb, "\n"); } /* * Print out any sense-key-specific information. */ if (scsi_get_sks(sense, sense_len, sks) == 0) { sbuf_cat(sb, path_str); scsi_sks_sbuf(sb, sense_key, sks); sbuf_printf(sb, "\n"); } /* * If this is fixed sense, we're done. If we have * descriptor sense, we might have more information * available. */ if (scsi_sense_type(sense) != SSD_TYPE_DESC) break; desc_sense = (struct scsi_sense_data_desc *)sense; print_info.sb = sb; print_info.path_str = path_str; print_info.cdb = cdb; print_info.cdb_len = cdb_len; print_info.inq_data = inq_data; /* * Print any sense descriptors that we have not already printed. */ scsi_desc_iterate(desc_sense, sense_len, scsi_print_desc_func, &print_info); break; } case -1: /* * scsi_extract_sense_len() sets values to -1 if the * show_errors flag is set and they aren't present in the * sense data. This means that sense_len is 0. */ sbuf_printf(sb, "No sense data present\n"); break; default: { sbuf_printf(sb, "Error code 0x%x", error_code); if (sense->error_code & SSD_ERRCODE_VALID) { struct scsi_sense_data_fixed *fixed_sense; fixed_sense = (struct scsi_sense_data_fixed *)sense; if (SSD_FIXED_IS_PRESENT(fixed_sense, sense_len, info)){ uint32_t info; info = scsi_4btoul(fixed_sense->info); sbuf_printf(sb, " at block no. %d (decimal)", info); } } sbuf_printf(sb, "\n"); break; } } } /* * scsi_sense_sbuf() returns 0 for success and -1 for failure. */ #ifdef _KERNEL int scsi_sense_sbuf(struct ccb_scsiio *csio, struct sbuf *sb, scsi_sense_string_flags flags) #else /* !_KERNEL */ int scsi_sense_sbuf(struct cam_device *device, struct ccb_scsiio *csio, struct sbuf *sb, scsi_sense_string_flags flags) #endif /* _KERNEL/!_KERNEL */ { struct scsi_sense_data *sense; struct scsi_inquiry_data *inq_data; #ifdef _KERNEL struct ccb_getdev *cgd; #endif /* _KERNEL */ char path_str[64]; uint8_t *cdb; #ifndef _KERNEL if (device == NULL) return(-1); #endif /* !_KERNEL */ if ((csio == NULL) || (sb == NULL)) return(-1); /* * If the CDB is a physical address, we can't deal with it.. */ if ((csio->ccb_h.flags & CAM_CDB_PHYS) != 0) flags &= ~SSS_FLAG_PRINT_COMMAND; #ifdef _KERNEL xpt_path_string(csio->ccb_h.path, path_str, sizeof(path_str)); #else /* !_KERNEL */ cam_path_string(device, path_str, sizeof(path_str)); #endif /* _KERNEL/!_KERNEL */ #ifdef _KERNEL if ((cgd = (struct ccb_getdev*)xpt_alloc_ccb_nowait()) == NULL) return(-1); /* * Get the device information. */ xpt_setup_ccb(&cgd->ccb_h, csio->ccb_h.path, CAM_PRIORITY_NORMAL); cgd->ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)cgd); /* * If the device is unconfigured, just pretend that it is a hard * drive. scsi_op_desc() needs this. */ if (cgd->ccb_h.status == CAM_DEV_NOT_THERE) cgd->inq_data.device = T_DIRECT; inq_data = &cgd->inq_data; #else /* !_KERNEL */ inq_data = &device->inq_data; #endif /* _KERNEL/!_KERNEL */ sense = NULL; if (flags & SSS_FLAG_PRINT_COMMAND) { sbuf_cat(sb, path_str); #ifdef _KERNEL scsi_command_string(csio, sb); #else /* !_KERNEL */ scsi_command_string(device, csio, sb); #endif /* _KERNEL/!_KERNEL */ sbuf_printf(sb, "\n"); } /* * If the sense data is a physical pointer, forget it. */ if (csio->ccb_h.flags & CAM_SENSE_PTR) { if (csio->ccb_h.flags & CAM_SENSE_PHYS) { #ifdef _KERNEL xpt_free_ccb((union ccb*)cgd); #endif /* _KERNEL/!_KERNEL */ return(-1); } else { /* * bcopy the pointer to avoid unaligned access * errors on finicky architectures. We don't * ensure that the sense data is pointer aligned. */ bcopy(&csio->sense_data, &sense, sizeof(struct scsi_sense_data *)); } } else { /* * If the physical sense flag is set, but the sense pointer * is not also set, we assume that the user is an idiot and * return. (Well, okay, it could be that somehow, the * entire csio is physical, but we would have probably core * dumped on one of the bogus pointer deferences above * already.) */ if (csio->ccb_h.flags & CAM_SENSE_PHYS) { #ifdef _KERNEL xpt_free_ccb((union ccb*)cgd); #endif /* _KERNEL/!_KERNEL */ return(-1); } else sense = &csio->sense_data; } if (csio->ccb_h.flags & CAM_CDB_POINTER) cdb = csio->cdb_io.cdb_ptr; else cdb = csio->cdb_io.cdb_bytes; scsi_sense_only_sbuf(sense, csio->sense_len - csio->sense_resid, sb, path_str, inq_data, cdb, csio->cdb_len); #ifdef _KERNEL xpt_free_ccb((union ccb*)cgd); #endif /* _KERNEL/!_KERNEL */ return(0); } #ifdef _KERNEL char * scsi_sense_string(struct ccb_scsiio *csio, char *str, int str_len) #else /* !_KERNEL */ char * scsi_sense_string(struct cam_device *device, struct ccb_scsiio *csio, char *str, int str_len) #endif /* _KERNEL/!_KERNEL */ { struct sbuf sb; sbuf_new(&sb, str, str_len, 0); #ifdef _KERNEL scsi_sense_sbuf(csio, &sb, SSS_FLAG_PRINT_COMMAND); #else /* !_KERNEL */ scsi_sense_sbuf(device, csio, &sb, SSS_FLAG_PRINT_COMMAND); #endif /* _KERNEL/!_KERNEL */ sbuf_finish(&sb); return(sbuf_data(&sb)); } #ifdef _KERNEL void scsi_sense_print(struct ccb_scsiio *csio) { struct sbuf sb; char str[512]; sbuf_new(&sb, str, sizeof(str), 0); scsi_sense_sbuf(csio, &sb, SSS_FLAG_PRINT_COMMAND); sbuf_finish(&sb); printf("%s", sbuf_data(&sb)); } #else /* !_KERNEL */ void scsi_sense_print(struct cam_device *device, struct ccb_scsiio *csio, FILE *ofile) { struct sbuf sb; char str[512]; if ((device == NULL) || (csio == NULL) || (ofile == NULL)) return; sbuf_new(&sb, str, sizeof(str), 0); scsi_sense_sbuf(device, csio, &sb, SSS_FLAG_PRINT_COMMAND); sbuf_finish(&sb); fprintf(ofile, "%s", sbuf_data(&sb)); } #endif /* _KERNEL/!_KERNEL */ /* * Extract basic sense information. This is backward-compatible with the * previous implementation. For new implementations, * scsi_extract_sense_len() is recommended. */ void scsi_extract_sense(struct scsi_sense_data *sense_data, int *error_code, int *sense_key, int *asc, int *ascq) { scsi_extract_sense_len(sense_data, sizeof(*sense_data), error_code, sense_key, asc, ascq, /*show_errors*/ 0); } /* * Extract basic sense information from SCSI I/O CCB structure. */ int scsi_extract_sense_ccb(union ccb *ccb, int *error_code, int *sense_key, int *asc, int *ascq) { struct scsi_sense_data *sense_data; /* Make sure there are some sense data we can access. */ if (ccb->ccb_h.func_code != XPT_SCSI_IO || (ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_SCSI_STATUS_ERROR || (ccb->csio.scsi_status != SCSI_STATUS_CHECK_COND) || (ccb->ccb_h.status & CAM_AUTOSNS_VALID) == 0 || (ccb->ccb_h.flags & CAM_SENSE_PHYS)) return (0); if (ccb->ccb_h.flags & CAM_SENSE_PTR) bcopy(&ccb->csio.sense_data, &sense_data, sizeof(struct scsi_sense_data *)); else sense_data = &ccb->csio.sense_data; scsi_extract_sense_len(sense_data, ccb->csio.sense_len - ccb->csio.sense_resid, error_code, sense_key, asc, ascq, 1); if (*error_code == -1) return (0); return (1); } /* * Extract basic sense information. If show_errors is set, sense values * will be set to -1 if they are not present. */ void scsi_extract_sense_len(struct scsi_sense_data *sense_data, u_int sense_len, int *error_code, int *sense_key, int *asc, int *ascq, int show_errors) { /* * If we have no length, we have no sense. */ if (sense_len == 0) { if (show_errors == 0) { *error_code = 0; *sense_key = 0; *asc = 0; *ascq = 0; } else { *error_code = -1; *sense_key = -1; *asc = -1; *ascq = -1; } return; } *error_code = sense_data->error_code & SSD_ERRCODE; switch (*error_code) { case SSD_DESC_CURRENT_ERROR: case SSD_DESC_DEFERRED_ERROR: { struct scsi_sense_data_desc *sense; sense = (struct scsi_sense_data_desc *)sense_data; if (SSD_DESC_IS_PRESENT(sense, sense_len, sense_key)) *sense_key = sense->sense_key & SSD_KEY; else *sense_key = (show_errors) ? -1 : 0; if (SSD_DESC_IS_PRESENT(sense, sense_len, add_sense_code)) *asc = sense->add_sense_code; else *asc = (show_errors) ? -1 : 0; if (SSD_DESC_IS_PRESENT(sense, sense_len, add_sense_code_qual)) *ascq = sense->add_sense_code_qual; else *ascq = (show_errors) ? -1 : 0; break; } case SSD_CURRENT_ERROR: case SSD_DEFERRED_ERROR: default: { struct scsi_sense_data_fixed *sense; sense = (struct scsi_sense_data_fixed *)sense_data; if (SSD_FIXED_IS_PRESENT(sense, sense_len, flags)) *sense_key = sense->flags & SSD_KEY; else *sense_key = (show_errors) ? -1 : 0; if ((SSD_FIXED_IS_PRESENT(sense, sense_len, add_sense_code)) && (SSD_FIXED_IS_FILLED(sense, add_sense_code))) *asc = sense->add_sense_code; else *asc = (show_errors) ? -1 : 0; if ((SSD_FIXED_IS_PRESENT(sense, sense_len,add_sense_code_qual)) && (SSD_FIXED_IS_FILLED(sense, add_sense_code_qual))) *ascq = sense->add_sense_code_qual; else *ascq = (show_errors) ? -1 : 0; break; } } } int scsi_get_sense_key(struct scsi_sense_data *sense_data, u_int sense_len, int show_errors) { int error_code, sense_key, asc, ascq; scsi_extract_sense_len(sense_data, sense_len, &error_code, &sense_key, &asc, &ascq, show_errors); return (sense_key); } int scsi_get_asc(struct scsi_sense_data *sense_data, u_int sense_len, int show_errors) { int error_code, sense_key, asc, ascq; scsi_extract_sense_len(sense_data, sense_len, &error_code, &sense_key, &asc, &ascq, show_errors); return (asc); } int scsi_get_ascq(struct scsi_sense_data *sense_data, u_int sense_len, int show_errors) { int error_code, sense_key, asc, ascq; scsi_extract_sense_len(sense_data, sense_len, &error_code, &sense_key, &asc, &ascq, show_errors); return (ascq); } /* * This function currently requires at least 36 bytes, or * SHORT_INQUIRY_LENGTH, worth of data to function properly. If this * function needs more or less data in the future, another length should be * defined in scsi_all.h to indicate the minimum amount of data necessary * for this routine to function properly. */ void scsi_print_inquiry(struct scsi_inquiry_data *inq_data) { u_int8_t type; char *dtype, *qtype; char vendor[16], product[48], revision[16], rstr[12]; type = SID_TYPE(inq_data); /* * Figure out basic device type and qualifier. */ if (SID_QUAL_IS_VENDOR_UNIQUE(inq_data)) { qtype = " (vendor-unique qualifier)"; } else { switch (SID_QUAL(inq_data)) { case SID_QUAL_LU_CONNECTED: qtype = ""; break; case SID_QUAL_LU_OFFLINE: qtype = " (offline)"; break; case SID_QUAL_RSVD: qtype = " (reserved qualifier)"; break; default: case SID_QUAL_BAD_LU: qtype = " (LUN not supported)"; break; } } switch (type) { case T_DIRECT: dtype = "Direct Access"; break; case T_SEQUENTIAL: dtype = "Sequential Access"; break; case T_PRINTER: dtype = "Printer"; break; case T_PROCESSOR: dtype = "Processor"; break; case T_WORM: dtype = "WORM"; break; case T_CDROM: dtype = "CD-ROM"; break; case T_SCANNER: dtype = "Scanner"; break; case T_OPTICAL: dtype = "Optical"; break; case T_CHANGER: dtype = "Changer"; break; case T_COMM: dtype = "Communication"; break; case T_STORARRAY: dtype = "Storage Array"; break; case T_ENCLOSURE: dtype = "Enclosure Services"; break; case T_RBC: dtype = "Simplified Direct Access"; break; case T_OCRW: dtype = "Optical Card Read/Write"; break; case T_OSD: dtype = "Object-Based Storage"; break; case T_ADC: dtype = "Automation/Drive Interface"; break; case T_NODEVICE: dtype = "Uninstalled"; break; default: dtype = "unknown"; break; } cam_strvis(vendor, inq_data->vendor, sizeof(inq_data->vendor), sizeof(vendor)); cam_strvis(product, inq_data->product, sizeof(inq_data->product), sizeof(product)); cam_strvis(revision, inq_data->revision, sizeof(inq_data->revision), sizeof(revision)); if (SID_ANSI_REV(inq_data) == SCSI_REV_0) snprintf(rstr, sizeof(rstr), "SCSI"); else if (SID_ANSI_REV(inq_data) <= SCSI_REV_SPC) { snprintf(rstr, sizeof(rstr), "SCSI-%d", SID_ANSI_REV(inq_data)); } else { snprintf(rstr, sizeof(rstr), "SPC-%d SCSI", SID_ANSI_REV(inq_data) - 2); } printf("<%s %s %s> %s %s %s device%s\n", vendor, product, revision, SID_IS_REMOVABLE(inq_data) ? "Removable" : "Fixed", dtype, rstr, qtype); } void scsi_print_inquiry_short(struct scsi_inquiry_data *inq_data) { char vendor[16], product[48], revision[16]; cam_strvis(vendor, inq_data->vendor, sizeof(inq_data->vendor), sizeof(vendor)); cam_strvis(product, inq_data->product, sizeof(inq_data->product), sizeof(product)); cam_strvis(revision, inq_data->revision, sizeof(inq_data->revision), sizeof(revision)); printf("<%s %s %s>", vendor, product, revision); } /* * Table of syncrates that don't follow the "divisible by 4" * rule. This table will be expanded in future SCSI specs. */ static struct { u_int period_factor; u_int period; /* in 100ths of ns */ } scsi_syncrates[] = { { 0x08, 625 }, /* FAST-160 */ { 0x09, 1250 }, /* FAST-80 */ { 0x0a, 2500 }, /* FAST-40 40MHz */ { 0x0b, 3030 }, /* FAST-40 33MHz */ { 0x0c, 5000 } /* FAST-20 */ }; /* * Return the frequency in kHz corresponding to the given * sync period factor. */ u_int scsi_calc_syncsrate(u_int period_factor) { int i; int num_syncrates; /* * It's a bug if period is zero, but if it is anyway, don't * die with a divide fault- instead return something which * 'approximates' async */ if (period_factor == 0) { return (3300); } num_syncrates = sizeof(scsi_syncrates) / sizeof(scsi_syncrates[0]); /* See if the period is in the "exception" table */ for (i = 0; i < num_syncrates; i++) { if (period_factor == scsi_syncrates[i].period_factor) { /* Period in kHz */ return (100000000 / scsi_syncrates[i].period); } } /* * Wasn't in the table, so use the standard * 4 times conversion. */ return (10000000 / (period_factor * 4 * 10)); } /* * Return the SCSI sync parameter that corresponsd to * the passed in period in 10ths of ns. */ u_int scsi_calc_syncparam(u_int period) { int i; int num_syncrates; if (period == 0) return (~0); /* Async */ /* Adjust for exception table being in 100ths. */ period *= 10; num_syncrates = sizeof(scsi_syncrates) / sizeof(scsi_syncrates[0]); /* See if the period is in the "exception" table */ for (i = 0; i < num_syncrates; i++) { if (period <= scsi_syncrates[i].period) { /* Period in 100ths of ns */ return (scsi_syncrates[i].period_factor); } } /* * Wasn't in the table, so use the standard * 1/4 period in ns conversion. */ return (period/400); } int scsi_devid_is_naa_ieee_reg(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; struct scsi_vpd_id_naa_basic *naa; descr = (struct scsi_vpd_id_descriptor *)bufp; naa = (struct scsi_vpd_id_naa_basic *)descr->identifier; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_NAA) return 0; if (descr->length < sizeof(struct scsi_vpd_id_naa_ieee_reg)) return 0; if ((naa->naa >> SVPD_ID_NAA_NAA_SHIFT) != SVPD_ID_NAA_IEEE_REG) return 0; return 1; } int scsi_devid_is_sas_target(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if (!scsi_devid_is_naa_ieee_reg(bufp)) return 0; if ((descr->id_type & SVPD_ID_PIV) == 0) /* proto field reserved */ return 0; if ((descr->proto_codeset >> SVPD_ID_PROTO_SHIFT) != SCSI_PROTO_SAS) return 0; return 1; } int scsi_devid_is_lun_eui64(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if ((descr->id_type & SVPD_ID_ASSOC_MASK) != SVPD_ID_ASSOC_LUN) return 0; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_EUI64) return 0; return 1; } int scsi_devid_is_lun_naa(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if ((descr->id_type & SVPD_ID_ASSOC_MASK) != SVPD_ID_ASSOC_LUN) return 0; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_NAA) return 0; return 1; } int scsi_devid_is_lun_t10(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if ((descr->id_type & SVPD_ID_ASSOC_MASK) != SVPD_ID_ASSOC_LUN) return 0; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_T10) return 0; return 1; } int scsi_devid_is_lun_name(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if ((descr->id_type & SVPD_ID_ASSOC_MASK) != SVPD_ID_ASSOC_LUN) return 0; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_SCSI_NAME) return 0; return 1; } struct scsi_vpd_id_descriptor * scsi_get_devid_desc(struct scsi_vpd_id_descriptor *desc, uint32_t len, scsi_devid_checkfn_t ck_fn) { uint8_t *desc_buf_end; desc_buf_end = (uint8_t *)desc + len; for (; desc->identifier <= desc_buf_end && desc->identifier + desc->length <= desc_buf_end; desc = (struct scsi_vpd_id_descriptor *)(desc->identifier + desc->length)) { if (ck_fn == NULL || ck_fn((uint8_t *)desc) != 0) return (desc); } return (NULL); } struct scsi_vpd_id_descriptor * scsi_get_devid(struct scsi_vpd_device_id *id, uint32_t page_len, scsi_devid_checkfn_t ck_fn) { uint32_t len; if (page_len < sizeof(*id)) return (NULL); len = MIN(scsi_2btoul(id->length), page_len - sizeof(*id)); return (scsi_get_devid_desc((struct scsi_vpd_id_descriptor *) id->desc_list, len, ck_fn)); } int scsi_transportid_sbuf(struct sbuf *sb, struct scsi_transportid_header *hdr, uint32_t valid_len) { switch (hdr->format_protocol & SCSI_TRN_PROTO_MASK) { case SCSI_PROTO_FC: { struct scsi_transportid_fcp *fcp; uint64_t n_port_name; fcp = (struct scsi_transportid_fcp *)hdr; n_port_name = scsi_8btou64(fcp->n_port_name); sbuf_printf(sb, "FCP address: 0x%.16jx",(uintmax_t)n_port_name); break; } case SCSI_PROTO_SPI: { struct scsi_transportid_spi *spi; spi = (struct scsi_transportid_spi *)hdr; sbuf_printf(sb, "SPI address: %u,%u", scsi_2btoul(spi->scsi_addr), scsi_2btoul(spi->rel_trgt_port_id)); break; } case SCSI_PROTO_SSA: /* * XXX KDM there is no transport ID defined in SPC-4 for * SSA. */ break; case SCSI_PROTO_1394: { struct scsi_transportid_1394 *sbp; uint64_t eui64; sbp = (struct scsi_transportid_1394 *)hdr; eui64 = scsi_8btou64(sbp->eui64); sbuf_printf(sb, "SBP address: 0x%.16jx", (uintmax_t)eui64); break; } case SCSI_PROTO_RDMA: { struct scsi_transportid_rdma *rdma; unsigned int i; rdma = (struct scsi_transportid_rdma *)hdr; sbuf_printf(sb, "RDMA address: 0x"); for (i = 0; i < sizeof(rdma->initiator_port_id); i++) sbuf_printf(sb, "%02x", rdma->initiator_port_id[i]); break; } case SCSI_PROTO_ISCSI: { uint32_t add_len, i; uint8_t *iscsi_name = NULL; int nul_found = 0; sbuf_printf(sb, "iSCSI address: "); if ((hdr->format_protocol & SCSI_TRN_FORMAT_MASK) == SCSI_TRN_ISCSI_FORMAT_DEVICE) { struct scsi_transportid_iscsi_device *dev; dev = (struct scsi_transportid_iscsi_device *)hdr; /* * Verify how much additional data we really have. */ add_len = scsi_2btoul(dev->additional_length); add_len = MIN(add_len, valid_len - __offsetof(struct scsi_transportid_iscsi_device, iscsi_name)); iscsi_name = &dev->iscsi_name[0]; } else if ((hdr->format_protocol & SCSI_TRN_FORMAT_MASK) == SCSI_TRN_ISCSI_FORMAT_PORT) { struct scsi_transportid_iscsi_port *port; port = (struct scsi_transportid_iscsi_port *)hdr; add_len = scsi_2btoul(port->additional_length); add_len = MIN(add_len, valid_len - __offsetof(struct scsi_transportid_iscsi_port, iscsi_name)); iscsi_name = &port->iscsi_name[0]; } else { sbuf_printf(sb, "unknown format %x", (hdr->format_protocol & SCSI_TRN_FORMAT_MASK) >> SCSI_TRN_FORMAT_SHIFT); break; } if (add_len == 0) { sbuf_printf(sb, "not enough data"); break; } /* * This is supposed to be a NUL-terminated ASCII * string, but you never know. So we're going to * check. We need to do this because there is no * sbuf equivalent of strncat(). */ for (i = 0; i < add_len; i++) { if (iscsi_name[i] == '\0') { nul_found = 1; break; } } /* * If there is a NUL in the name, we can just use * sbuf_cat(). Otherwise we need to use sbuf_bcat(). */ if (nul_found != 0) sbuf_cat(sb, iscsi_name); else sbuf_bcat(sb, iscsi_name, add_len); break; } case SCSI_PROTO_SAS: { struct scsi_transportid_sas *sas; uint64_t sas_addr; sas = (struct scsi_transportid_sas *)hdr; sas_addr = scsi_8btou64(sas->sas_address); sbuf_printf(sb, "SAS address: 0x%.16jx", (uintmax_t)sas_addr); break; } case SCSI_PROTO_ADITP: case SCSI_PROTO_ATA: case SCSI_PROTO_UAS: /* * No Transport ID format for ADI, ATA or USB is defined in * SPC-4. */ sbuf_printf(sb, "No known Transport ID format for protocol " "%#x", hdr->format_protocol & SCSI_TRN_PROTO_MASK); break; case SCSI_PROTO_SOP: { struct scsi_transportid_sop *sop; struct scsi_sop_routing_id_norm *rid; sop = (struct scsi_transportid_sop *)hdr; rid = (struct scsi_sop_routing_id_norm *)sop->routing_id; /* * Note that there is no alternate format specified in SPC-4 * for the PCIe routing ID, so we don't really have a way * to know whether the second byte of the routing ID is * a device and function or just a function. So we just * assume bus,device,function. */ sbuf_printf(sb, "SOP Routing ID: %u,%u,%u", rid->bus, rid->devfunc >> SCSI_TRN_SOP_DEV_SHIFT, rid->devfunc & SCSI_TRN_SOP_FUNC_NORM_MAX); break; } case SCSI_PROTO_NONE: default: sbuf_printf(sb, "Unknown protocol %#x", hdr->format_protocol & SCSI_TRN_PROTO_MASK); break; } return (0); } struct scsi_nv scsi_proto_map[] = { { "fcp", SCSI_PROTO_FC }, { "spi", SCSI_PROTO_SPI }, { "ssa", SCSI_PROTO_SSA }, { "sbp", SCSI_PROTO_1394 }, { "1394", SCSI_PROTO_1394 }, { "srp", SCSI_PROTO_RDMA }, { "rdma", SCSI_PROTO_RDMA }, { "iscsi", SCSI_PROTO_ISCSI }, { "iqn", SCSI_PROTO_ISCSI }, { "sas", SCSI_PROTO_SAS }, { "aditp", SCSI_PROTO_ADITP }, { "ata", SCSI_PROTO_ATA }, { "uas", SCSI_PROTO_UAS }, { "usb", SCSI_PROTO_UAS }, { "sop", SCSI_PROTO_SOP } }; const char * scsi_nv_to_str(struct scsi_nv *table, int num_table_entries, uint64_t value) { int i; for (i = 0; i < num_table_entries; i++) { if (table[i].value == value) return (table[i].name); } return (NULL); } /* * Given a name/value table, find a value matching the given name. * Return values: * SCSI_NV_FOUND - match found * SCSI_NV_AMBIGUOUS - more than one match, none of them exact * SCSI_NV_NOT_FOUND - no match found */ scsi_nv_status scsi_get_nv(struct scsi_nv *table, int num_table_entries, char *name, int *table_entry, scsi_nv_flags flags) { int i, num_matches = 0; for (i = 0; i < num_table_entries; i++) { size_t table_len, name_len; table_len = strlen(table[i].name); name_len = strlen(name); if ((((flags & SCSI_NV_FLAG_IG_CASE) != 0) && (strncasecmp(table[i].name, name, name_len) == 0)) || (((flags & SCSI_NV_FLAG_IG_CASE) == 0) && (strncmp(table[i].name, name, name_len) == 0))) { *table_entry = i; /* * Check for an exact match. If we have the same * number of characters in the table as the argument, * and we already know they're the same, we have * an exact match. */ if (table_len == name_len) return (SCSI_NV_FOUND); /* * Otherwise, bump up the number of matches. We'll * see later how many we have. */ num_matches++; } } if (num_matches > 1) return (SCSI_NV_AMBIGUOUS); else if (num_matches == 1) return (SCSI_NV_FOUND); else return (SCSI_NV_NOT_FOUND); } /* * Parse transport IDs for Fibre Channel, 1394 and SAS. Since these are * all 64-bit numbers, the code is similar. */ int scsi_parse_transportid_64bit(int proto_id, char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len) { uint64_t value; char *endptr; int retval; size_t alloc_size; retval = 0; value = strtouq(id_str, &endptr, 0); if (*endptr != '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: error " "parsing ID %s, 64-bit number required", __func__, id_str); } retval = 1; goto bailout; } switch (proto_id) { case SCSI_PROTO_FC: alloc_size = sizeof(struct scsi_transportid_fcp); break; case SCSI_PROTO_1394: alloc_size = sizeof(struct scsi_transportid_1394); break; case SCSI_PROTO_SAS: alloc_size = sizeof(struct scsi_transportid_sas); break; default: if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: unsupoprted " "protocol %d", __func__, proto_id); } retval = 1; goto bailout; break; /* NOTREACHED */ } #ifdef _KERNEL *hdr = malloc(alloc_size, type, flags); #else /* _KERNEL */ *hdr = malloc(alloc_size); #endif /*_KERNEL */ if (*hdr == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: unable to " "allocate %zu bytes", __func__, alloc_size); } retval = 1; goto bailout; } *alloc_len = alloc_size; bzero(*hdr, alloc_size); switch (proto_id) { case SCSI_PROTO_FC: { struct scsi_transportid_fcp *fcp; fcp = (struct scsi_transportid_fcp *)(*hdr); fcp->format_protocol = SCSI_PROTO_FC | SCSI_TRN_FCP_FORMAT_DEFAULT; scsi_u64to8b(value, fcp->n_port_name); break; } case SCSI_PROTO_1394: { struct scsi_transportid_1394 *sbp; sbp = (struct scsi_transportid_1394 *)(*hdr); sbp->format_protocol = SCSI_PROTO_1394 | SCSI_TRN_1394_FORMAT_DEFAULT; scsi_u64to8b(value, sbp->eui64); break; } case SCSI_PROTO_SAS: { struct scsi_transportid_sas *sas; sas = (struct scsi_transportid_sas *)(*hdr); sas->format_protocol = SCSI_PROTO_SAS | SCSI_TRN_SAS_FORMAT_DEFAULT; scsi_u64to8b(value, sas->sas_address); break; } default: break; } bailout: return (retval); } /* * Parse a SPI (Parallel SCSI) address of the form: id,rel_tgt_port */ int scsi_parse_transportid_spi(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len) { unsigned long scsi_addr, target_port; struct scsi_transportid_spi *spi; char *tmpstr, *endptr; int retval; retval = 0; tmpstr = strsep(&id_str, ","); if (tmpstr == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: no ID found", __func__); } retval = 1; goto bailout; } scsi_addr = strtoul(tmpstr, &endptr, 0); if (*endptr != '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: error " "parsing SCSI ID %s, number required", __func__, tmpstr); } retval = 1; goto bailout; } if (id_str == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: no relative " "target port found", __func__); } retval = 1; goto bailout; } target_port = strtoul(id_str, &endptr, 0); if (*endptr != '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: error " "parsing relative target port %s, number " "required", __func__, id_str); } retval = 1; goto bailout; } #ifdef _KERNEL spi = malloc(sizeof(*spi), type, flags); #else spi = malloc(sizeof(*spi)); #endif if (spi == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: unable to " "allocate %zu bytes", __func__, sizeof(*spi)); } retval = 1; goto bailout; } *alloc_len = sizeof(*spi); bzero(spi, sizeof(*spi)); spi->format_protocol = SCSI_PROTO_SPI | SCSI_TRN_SPI_FORMAT_DEFAULT; scsi_ulto2b(scsi_addr, spi->scsi_addr); scsi_ulto2b(target_port, spi->rel_trgt_port_id); *hdr = (struct scsi_transportid_header *)spi; bailout: return (retval); } /* * Parse an RDMA/SRP Initiator Port ID string. This is 32 hexadecimal digits, * optionally prefixed by "0x" or "0X". */ int scsi_parse_transportid_rdma(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len) { struct scsi_transportid_rdma *rdma; int retval; size_t id_len, rdma_id_size; uint8_t rdma_id[SCSI_TRN_RDMA_PORT_LEN]; char *tmpstr; unsigned int i, j; retval = 0; id_len = strlen(id_str); rdma_id_size = SCSI_TRN_RDMA_PORT_LEN; /* * Check the size. It needs to be either 32 or 34 characters long. */ if ((id_len != (rdma_id_size * 2)) && (id_len != ((rdma_id_size * 2) + 2))) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: RDMA ID " "must be 32 hex digits (0x prefix " "optional), only %zu seen", __func__, id_len); } retval = 1; goto bailout; } tmpstr = id_str; /* * If the user gave us 34 characters, the string needs to start * with '0x'. */ if (id_len == ((rdma_id_size * 2) + 2)) { if ((tmpstr[0] == '0') && ((tmpstr[1] == 'x') || (tmpstr[1] == 'X'))) { tmpstr += 2; } else { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: RDMA " "ID prefix, if used, must be \"0x\", " "got %s", __func__, tmpstr); } retval = 1; goto bailout; } } bzero(rdma_id, sizeof(rdma_id)); /* * Convert ASCII hex into binary bytes. There is no standard * 128-bit integer type, and so no strtou128t() routine to convert * from hex into a large integer. In the end, we're not going to * an integer, but rather to a byte array, so that and the fact * that we require the user to give us 32 hex digits simplifies the * logic. */ for (i = 0; i < (rdma_id_size * 2); i++) { int cur_shift; unsigned char c; /* Increment the byte array one for every 2 hex digits */ j = i >> 1; /* * The first digit in every pair is the most significant * 4 bits. The second is the least significant 4 bits. */ if ((i % 2) == 0) cur_shift = 4; else cur_shift = 0; c = tmpstr[i]; /* Convert the ASCII hex character into a number */ if (isdigit(c)) c -= '0'; else if (isalpha(c)) c -= isupper(c) ? 'A' - 10 : 'a' - 10; else { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: " "RDMA ID must be hex digits, got " "invalid character %c", __func__, tmpstr[i]); } retval = 1; goto bailout; } /* * The converted number can't be less than 0; the type is * unsigned, and the subtraction logic will not give us * a negative number. So we only need to make sure that * the value is not greater than 0xf. (i.e. make sure the * user didn't give us a value like "0x12jklmno"). */ if (c > 0xf) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: " "RDMA ID must be hex digits, got " "invalid character %c", __func__, tmpstr[i]); } retval = 1; goto bailout; } rdma_id[j] |= c << cur_shift; } #ifdef _KERNEL rdma = malloc(sizeof(*rdma), type, flags); #else rdma = malloc(sizeof(*rdma)); #endif if (rdma == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: unable to " "allocate %zu bytes", __func__, sizeof(*rdma)); } retval = 1; goto bailout; } *alloc_len = sizeof(*rdma); bzero(rdma, *alloc_len); rdma->format_protocol = SCSI_PROTO_RDMA | SCSI_TRN_RDMA_FORMAT_DEFAULT; bcopy(rdma_id, rdma->initiator_port_id, SCSI_TRN_RDMA_PORT_LEN); *hdr = (struct scsi_transportid_header *)rdma; bailout: return (retval); } /* * Parse an iSCSI name. The format is either just the name: * * iqn.2012-06.com.example:target0 * or the name, separator and initiator session ID: * * iqn.2012-06.com.example:target0,i,0x123 * * The separator format is exact. */ int scsi_parse_transportid_iscsi(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len) { size_t id_len, sep_len, id_size, name_len; int retval; unsigned int i, sep_pos, sep_found; const char *sep_template = ",i,0x"; const char *iqn_prefix = "iqn."; struct scsi_transportid_iscsi_device *iscsi; retval = 0; sep_found = 0; id_len = strlen(id_str); sep_len = strlen(sep_template); /* * The separator is defined as exactly ',i,0x'. Any other commas, * or any other form, is an error. So look for a comma, and once * we find that, the next few characters must match the separator * exactly. Once we get through the separator, there should be at * least one character. */ for (i = 0, sep_pos = 0; i < id_len; i++) { if (sep_pos == 0) { if (id_str[i] == sep_template[sep_pos]) sep_pos++; continue; } if (sep_pos < sep_len) { if (id_str[i] == sep_template[sep_pos]) { sep_pos++; continue; } if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: " "invalid separator in iSCSI name " "\"%s\"", __func__, id_str); } retval = 1; goto bailout; } else { sep_found = 1; break; } } /* * Check to see whether we have a separator but no digits after it. */ if ((sep_pos != 0) && (sep_found == 0)) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: no digits " "found after separator in iSCSI name \"%s\"", __func__, id_str); } retval = 1; goto bailout; } /* * The incoming ID string has the "iqn." prefix stripped off. We * need enough space for the base structure (the structures are the * same for the two iSCSI forms), the prefix, the ID string and a * terminating NUL. */ id_size = sizeof(*iscsi) + strlen(iqn_prefix) + id_len + 1; #ifdef _KERNEL iscsi = malloc(id_size, type, flags); #else iscsi = malloc(id_size); #endif if (iscsi == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: unable to " "allocate %zu bytes", __func__, id_size); } retval = 1; goto bailout; } *alloc_len = id_size; bzero(iscsi, id_size); iscsi->format_protocol = SCSI_PROTO_ISCSI; if (sep_found == 0) iscsi->format_protocol |= SCSI_TRN_ISCSI_FORMAT_DEVICE; else iscsi->format_protocol |= SCSI_TRN_ISCSI_FORMAT_PORT; name_len = id_size - sizeof(*iscsi); scsi_ulto2b(name_len, iscsi->additional_length); snprintf(iscsi->iscsi_name, name_len, "%s%s", iqn_prefix, id_str); *hdr = (struct scsi_transportid_header *)iscsi; bailout: return (retval); } /* * Parse a SCSI over PCIe (SOP) identifier. The Routing ID can either be * of the form 'bus,device,function' or 'bus,function'. */ int scsi_parse_transportid_sop(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len) { struct scsi_transportid_sop *sop; unsigned long bus, device, function; char *tmpstr, *endptr; int retval, device_spec; retval = 0; device_spec = 0; device = 0; tmpstr = strsep(&id_str, ","); if ((tmpstr == NULL) || (*tmpstr == '\0')) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: no ID found", __func__); } retval = 1; goto bailout; } bus = strtoul(tmpstr, &endptr, 0); if (*endptr != '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: error " "parsing PCIe bus %s, number required", __func__, tmpstr); } retval = 1; goto bailout; } if ((id_str == NULL) || (*id_str == '\0')) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: no PCIe " "device or function found", __func__); } retval = 1; goto bailout; } tmpstr = strsep(&id_str, ","); function = strtoul(tmpstr, &endptr, 0); if (*endptr != '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: error " "parsing PCIe device/function %s, number " "required", __func__, tmpstr); } retval = 1; goto bailout; } /* * Check to see whether the user specified a third value. If so, * the second is the device. */ if (id_str != NULL) { if (*id_str == '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: " "no PCIe function found", __func__); } retval = 1; goto bailout; } device = function; device_spec = 1; function = strtoul(id_str, &endptr, 0); if (*endptr != '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: " "error parsing PCIe function %s, " "number required", __func__, id_str); } retval = 1; goto bailout; } } if (bus > SCSI_TRN_SOP_BUS_MAX) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: bus value " "%lu greater than maximum %u", __func__, bus, SCSI_TRN_SOP_BUS_MAX); } retval = 1; goto bailout; } if ((device_spec != 0) && (device > SCSI_TRN_SOP_DEV_MASK)) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: device value " "%lu greater than maximum %u", __func__, device, SCSI_TRN_SOP_DEV_MAX); } retval = 1; goto bailout; } if (((device_spec != 0) && (function > SCSI_TRN_SOP_FUNC_NORM_MAX)) || ((device_spec == 0) && (function > SCSI_TRN_SOP_FUNC_ALT_MAX))) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: function value " "%lu greater than maximum %u", __func__, function, (device_spec == 0) ? SCSI_TRN_SOP_FUNC_ALT_MAX : SCSI_TRN_SOP_FUNC_NORM_MAX); } retval = 1; goto bailout; } #ifdef _KERNEL sop = malloc(sizeof(*sop), type, flags); #else sop = malloc(sizeof(*sop)); #endif if (sop == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: unable to " "allocate %zu bytes", __func__, sizeof(*sop)); } retval = 1; goto bailout; } *alloc_len = sizeof(*sop); bzero(sop, sizeof(*sop)); sop->format_protocol = SCSI_PROTO_SOP | SCSI_TRN_SOP_FORMAT_DEFAULT; if (device_spec != 0) { struct scsi_sop_routing_id_norm rid; rid.bus = bus; rid.devfunc = (device << SCSI_TRN_SOP_DEV_SHIFT) | function; bcopy(&rid, sop->routing_id, MIN(sizeof(rid), sizeof(sop->routing_id))); } else { struct scsi_sop_routing_id_alt rid; rid.bus = bus; rid.function = function; bcopy(&rid, sop->routing_id, MIN(sizeof(rid), sizeof(sop->routing_id))); } *hdr = (struct scsi_transportid_header *)sop; bailout: return (retval); } /* * transportid_str: NUL-terminated string with format: protcol,id * The ID is protocol specific. * hdr: Storage will be allocated for the transport ID. * alloc_len: The amount of memory allocated is returned here. * type: Malloc bucket (kernel only). * flags: Malloc flags (kernel only). * error_str: If non-NULL, it will contain error information (without * a terminating newline) if an error is returned. * error_str_len: Allocated length of the error string. * * Returns 0 for success, non-zero for failure. */ int scsi_parse_transportid(char *transportid_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len) { char *tmpstr; scsi_nv_status status; int retval, num_proto_entries, table_entry; retval = 0; table_entry = 0; /* * We do allow a period as well as a comma to separate the protocol * from the ID string. This is to accommodate iSCSI names, which * start with "iqn.". */ tmpstr = strsep(&transportid_str, ",."); if (tmpstr == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: transportid_str is NULL", __func__); } retval = 1; goto bailout; } num_proto_entries = sizeof(scsi_proto_map) / sizeof(scsi_proto_map[0]); status = scsi_get_nv(scsi_proto_map, num_proto_entries, tmpstr, &table_entry, SCSI_NV_FLAG_IG_CASE); if (status != SCSI_NV_FOUND) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: %s protocol " "name %s", __func__, (status == SCSI_NV_AMBIGUOUS) ? "ambiguous" : "invalid", tmpstr); } retval = 1; goto bailout; } switch (scsi_proto_map[table_entry].value) { case SCSI_PROTO_FC: case SCSI_PROTO_1394: case SCSI_PROTO_SAS: retval = scsi_parse_transportid_64bit( scsi_proto_map[table_entry].value, transportid_str, hdr, alloc_len, #ifdef _KERNEL type, flags, #endif error_str, error_str_len); break; case SCSI_PROTO_SPI: retval = scsi_parse_transportid_spi(transportid_str, hdr, alloc_len, #ifdef _KERNEL type, flags, #endif error_str, error_str_len); break; case SCSI_PROTO_RDMA: retval = scsi_parse_transportid_rdma(transportid_str, hdr, alloc_len, #ifdef _KERNEL type, flags, #endif error_str, error_str_len); break; case SCSI_PROTO_ISCSI: retval = scsi_parse_transportid_iscsi(transportid_str, hdr, alloc_len, #ifdef _KERNEL type, flags, #endif error_str, error_str_len); break; case SCSI_PROTO_SOP: retval = scsi_parse_transportid_sop(transportid_str, hdr, alloc_len, #ifdef _KERNEL type, flags, #endif error_str, error_str_len); break; case SCSI_PROTO_SSA: case SCSI_PROTO_ADITP: case SCSI_PROTO_ATA: case SCSI_PROTO_UAS: case SCSI_PROTO_NONE: default: /* * There is no format defined for a Transport ID for these * protocols. So even if the user gives us something, we * have no way to turn it into a standard SCSI Transport ID. */ retval = 1; if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: no Transport " "ID format exists for protocol %s", __func__, tmpstr); } goto bailout; break; /* NOTREACHED */ } bailout: return (retval); } struct scsi_attrib_table_entry scsi_mam_attr_table[] = { { SMA_ATTR_REM_CAP_PARTITION, SCSI_ATTR_FLAG_NONE, "Remaining Capacity in Partition", /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf,/*parse_str*/ NULL }, { SMA_ATTR_MAX_CAP_PARTITION, SCSI_ATTR_FLAG_NONE, "Maximum Capacity in Partition", /*suffix*/"MB", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_TAPEALERT_FLAGS, SCSI_ATTR_FLAG_HEX, "TapeAlert Flags", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_LOAD_COUNT, SCSI_ATTR_FLAG_NONE, "Load Count", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MAM_SPACE_REMAINING, SCSI_ATTR_FLAG_NONE, "MAM Space Remaining", /*suffix*/"bytes", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_DEV_ASSIGNING_ORG, SCSI_ATTR_FLAG_NONE, "Assigning Organization", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_FORMAT_DENSITY_CODE, SCSI_ATTR_FLAG_HEX, "Format Density Code", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_INITIALIZATION_COUNT, SCSI_ATTR_FLAG_NONE, "Initialization Count", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_VOLUME_ID, SCSI_ATTR_FLAG_NONE, "Volume Identifier", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_VOLUME_CHANGE_REF, SCSI_ATTR_FLAG_HEX, "Volume Change Reference", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_DEV_SERIAL_LAST_LOAD, SCSI_ATTR_FLAG_NONE, "Device Vendor/Serial at Last Load", /*suffix*/NULL, /*to_str*/ scsi_attrib_vendser_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_DEV_SERIAL_LAST_LOAD_1, SCSI_ATTR_FLAG_NONE, "Device Vendor/Serial at Last Load - 1", /*suffix*/NULL, /*to_str*/ scsi_attrib_vendser_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_DEV_SERIAL_LAST_LOAD_2, SCSI_ATTR_FLAG_NONE, "Device Vendor/Serial at Last Load - 2", /*suffix*/NULL, /*to_str*/ scsi_attrib_vendser_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_DEV_SERIAL_LAST_LOAD_3, SCSI_ATTR_FLAG_NONE, "Device Vendor/Serial at Last Load - 3", /*suffix*/NULL, /*to_str*/ scsi_attrib_vendser_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_TOTAL_MB_WRITTEN_LT, SCSI_ATTR_FLAG_NONE, "Total MB Written in Medium Life", /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_TOTAL_MB_READ_LT, SCSI_ATTR_FLAG_NONE, "Total MB Read in Medium Life", /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_TOTAL_MB_WRITTEN_CUR, SCSI_ATTR_FLAG_NONE, "Total MB Written in Current/Last Load", /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_TOTAL_MB_READ_CUR, SCSI_ATTR_FLAG_NONE, "Total MB Read in Current/Last Load", /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_FIRST_ENC_BLOCK, SCSI_ATTR_FLAG_NONE, "Logical Position of First Encrypted Block", /*suffix*/ NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_NEXT_UNENC_BLOCK, SCSI_ATTR_FLAG_NONE, "Logical Position of First Unencrypted Block after First " "Encrypted Block", /*suffix*/ NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MEDIUM_USAGE_HIST, SCSI_ATTR_FLAG_NONE, "Medium Usage History", /*suffix*/ NULL, /*to_str*/ NULL, /*parse_str*/ NULL }, { SMA_ATTR_PART_USAGE_HIST, SCSI_ATTR_FLAG_NONE, "Partition Usage History", /*suffix*/ NULL, /*to_str*/ NULL, /*parse_str*/ NULL }, { SMA_ATTR_MED_MANUF, SCSI_ATTR_FLAG_NONE, "Medium Manufacturer", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_SERIAL, SCSI_ATTR_FLAG_NONE, "Medium Serial Number", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_LENGTH, SCSI_ATTR_FLAG_NONE, "Medium Length", /*suffix*/"m", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_WIDTH, SCSI_ATTR_FLAG_FP | SCSI_ATTR_FLAG_DIV_10 | SCSI_ATTR_FLAG_FP_1DIGIT, "Medium Width", /*suffix*/"mm", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_ASSIGNING_ORG, SCSI_ATTR_FLAG_NONE, "Assigning Organization", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_DENSITY_CODE, SCSI_ATTR_FLAG_HEX, "Medium Density Code", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_MANUF_DATE, SCSI_ATTR_FLAG_NONE, "Medium Manufacture Date", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MAM_CAPACITY, SCSI_ATTR_FLAG_NONE, "MAM Capacity", /*suffix*/"bytes", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_TYPE, SCSI_ATTR_FLAG_HEX, "Medium Type", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_TYPE_INFO, SCSI_ATTR_FLAG_HEX, "Medium Type Information", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_SERIAL_NUM, SCSI_ATTR_FLAG_NONE, "Medium Serial Number", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_APP_VENDOR, SCSI_ATTR_FLAG_NONE, "Application Vendor", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_APP_NAME, SCSI_ATTR_FLAG_NONE, "Application Name", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_APP_VERSION, SCSI_ATTR_FLAG_NONE, "Application Version", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_USER_MED_TEXT_LABEL, SCSI_ATTR_FLAG_NONE, "User Medium Text Label", /*suffix*/NULL, /*to_str*/ scsi_attrib_text_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_LAST_WRITTEN_TIME, SCSI_ATTR_FLAG_NONE, "Date and Time Last Written", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_TEXT_LOCAL_ID, SCSI_ATTR_FLAG_HEX, "Text Localization Identifier", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_BARCODE, SCSI_ATTR_FLAG_NONE, "Barcode", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_HOST_OWNER_NAME, SCSI_ATTR_FLAG_NONE, "Owning Host Textual Name", /*suffix*/NULL, /*to_str*/ scsi_attrib_text_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MEDIA_POOL, SCSI_ATTR_FLAG_NONE, "Media Pool", /*suffix*/NULL, /*to_str*/ scsi_attrib_text_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_PART_USER_LABEL, SCSI_ATTR_FLAG_NONE, "Partition User Text Label", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_LOAD_UNLOAD_AT_PART, SCSI_ATTR_FLAG_NONE, "Load/Unload at Partition", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_APP_FORMAT_VERSION, SCSI_ATTR_FLAG_NONE, "Application Format Version", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_VOL_COHERENCY_INFO, SCSI_ATTR_FLAG_NONE, "Volume Coherency Information", /*suffix*/NULL, /*to_str*/ scsi_attrib_volcoh_sbuf, /*parse_str*/ NULL }, { 0x0ff1, SCSI_ATTR_FLAG_NONE, "Spectra MLM Creation", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x0ff2, SCSI_ATTR_FLAG_NONE, "Spectra MLM C3", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x0ff3, SCSI_ATTR_FLAG_NONE, "Spectra MLM RW", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x0ff4, SCSI_ATTR_FLAG_NONE, "Spectra MLM SDC List", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x0ff7, SCSI_ATTR_FLAG_NONE, "Spectra MLM Post Scan", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x0ffe, SCSI_ATTR_FLAG_NONE, "Spectra MLM Checksum", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x17f1, SCSI_ATTR_FLAG_NONE, "Spectra MLM Creation", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x17f2, SCSI_ATTR_FLAG_NONE, "Spectra MLM C3", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x17f3, SCSI_ATTR_FLAG_NONE, "Spectra MLM RW", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x17f4, SCSI_ATTR_FLAG_NONE, "Spectra MLM SDC List", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x17f7, SCSI_ATTR_FLAG_NONE, "Spectra MLM Post Scan", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x17ff, SCSI_ATTR_FLAG_NONE, "Spectra MLM Checksum", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, }; /* * Print out Volume Coherency Information (Attribute 0x080c). * This field has two variable length members, including one at the * beginning, so it isn't practical to have a fixed structure definition. * This is current as of SSC4r03 (see section 4.2.21.3), dated March 25, * 2013. */ int scsi_attrib_volcoh_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len) { size_t avail_len; uint32_t field_size; uint64_t tmp_val; uint8_t *cur_ptr; int retval; int vcr_len, as_len; retval = 0; tmp_val = 0; field_size = scsi_2btoul(hdr->length); avail_len = valid_len - sizeof(*hdr); if (field_size > avail_len) { if (error_str != NULL) { snprintf(error_str, error_str_len, "Available " "length of attribute ID 0x%.4x %zu < field " "length %u", scsi_2btoul(hdr->id), avail_len, field_size); } retval = 1; goto bailout; } else if (field_size == 0) { /* * It isn't clear from the spec whether a field length of * 0 is invalid here. It probably is, but be lenient here * to avoid inconveniencing the user. */ goto bailout; } cur_ptr = hdr->attribute; vcr_len = *cur_ptr; cur_ptr++; sbuf_printf(sb, "\n\tVolume Change Reference Value:"); switch (vcr_len) { case 0: if (error_str != NULL) { snprintf(error_str, error_str_len, "Volume Change " "Reference value has length of 0"); } retval = 1; goto bailout; break; /*NOTREACHED*/ case 1: tmp_val = *cur_ptr; break; case 2: tmp_val = scsi_2btoul(cur_ptr); break; case 3: tmp_val = scsi_3btoul(cur_ptr); break; case 4: tmp_val = scsi_4btoul(cur_ptr); break; case 8: tmp_val = scsi_8btou64(cur_ptr); break; default: sbuf_printf(sb, "\n"); sbuf_hexdump(sb, cur_ptr, vcr_len, NULL, 0); break; } if (vcr_len <= 8) sbuf_printf(sb, " 0x%jx\n", (uintmax_t)tmp_val); cur_ptr += vcr_len; tmp_val = scsi_8btou64(cur_ptr); sbuf_printf(sb, "\tVolume Coherency Count: %ju\n", (uintmax_t)tmp_val); cur_ptr += sizeof(tmp_val); tmp_val = scsi_8btou64(cur_ptr); sbuf_printf(sb, "\tVolume Coherency Set Identifier: 0x%jx\n", (uintmax_t)tmp_val); /* * Figure out how long the Application Client Specific Information * is and produce a hexdump. */ cur_ptr += sizeof(tmp_val); as_len = scsi_2btoul(cur_ptr); cur_ptr += sizeof(uint16_t); sbuf_printf(sb, "\tApplication Client Specific Information: "); if (((as_len == SCSI_LTFS_VER0_LEN) || (as_len == SCSI_LTFS_VER1_LEN)) && (strncmp(cur_ptr, SCSI_LTFS_STR_NAME, SCSI_LTFS_STR_LEN) == 0)) { sbuf_printf(sb, "LTFS\n"); cur_ptr += SCSI_LTFS_STR_LEN + 1; if (cur_ptr[SCSI_LTFS_UUID_LEN] != '\0') cur_ptr[SCSI_LTFS_UUID_LEN] = '\0'; sbuf_printf(sb, "\tLTFS UUID: %s\n", cur_ptr); cur_ptr += SCSI_LTFS_UUID_LEN + 1; /* XXX KDM check the length */ sbuf_printf(sb, "\tLTFS Version: %d\n", *cur_ptr); } else { sbuf_printf(sb, "Unknown\n"); sbuf_hexdump(sb, cur_ptr, as_len, NULL, 0); } bailout: return (retval); } int scsi_attrib_vendser_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len) { size_t avail_len; uint32_t field_size; struct scsi_attrib_vendser *vendser; cam_strvis_flags strvis_flags; int retval = 0; field_size = scsi_2btoul(hdr->length); avail_len = valid_len - sizeof(*hdr); if (field_size > avail_len) { if (error_str != NULL) { snprintf(error_str, error_str_len, "Available " "length of attribute ID 0x%.4x %zu < field " "length %u", scsi_2btoul(hdr->id), avail_len, field_size); } retval = 1; goto bailout; } else if (field_size == 0) { /* * A field size of 0 doesn't make sense here. The device * can at least give you the vendor ID, even if it can't * give you the serial number. */ if (error_str != NULL) { snprintf(error_str, error_str_len, "The length of " "attribute ID 0x%.4x is 0", scsi_2btoul(hdr->id)); } retval = 1; goto bailout; } vendser = (struct scsi_attrib_vendser *)hdr->attribute; switch (output_flags & SCSI_ATTR_OUTPUT_NONASCII_MASK) { case SCSI_ATTR_OUTPUT_NONASCII_TRIM: strvis_flags = CAM_STRVIS_FLAG_NONASCII_TRIM; break; case SCSI_ATTR_OUTPUT_NONASCII_RAW: strvis_flags = CAM_STRVIS_FLAG_NONASCII_RAW; break; case SCSI_ATTR_OUTPUT_NONASCII_ESC: default: strvis_flags = CAM_STRVIS_FLAG_NONASCII_ESC; break;; } cam_strvis_sbuf(sb, vendser->vendor, sizeof(vendser->vendor), strvis_flags); sbuf_putc(sb, ' '); cam_strvis_sbuf(sb, vendser->serial_num, sizeof(vendser->serial_num), strvis_flags); bailout: return (retval); } int scsi_attrib_hexdump_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len) { uint32_t field_size; ssize_t avail_len; uint32_t print_len; uint8_t *num_ptr; int retval = 0; field_size = scsi_2btoul(hdr->length); avail_len = valid_len - sizeof(*hdr); print_len = MIN(avail_len, field_size); num_ptr = hdr->attribute; if (print_len > 0) { sbuf_printf(sb, "\n"); sbuf_hexdump(sb, num_ptr, print_len, NULL, 0); } return (retval); } int scsi_attrib_int_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len) { uint64_t print_number; size_t avail_len; uint32_t number_size; int retval = 0; number_size = scsi_2btoul(hdr->length); avail_len = valid_len - sizeof(*hdr); if (avail_len < number_size) { if (error_str != NULL) { snprintf(error_str, error_str_len, "Available " "length of attribute ID 0x%.4x %zu < field " "length %u", scsi_2btoul(hdr->id), avail_len, number_size); } retval = 1; goto bailout; } switch (number_size) { case 0: /* * We don't treat this as an error, since there may be * scenarios where a device reports a field but then gives * a length of 0. See the note in scsi_attrib_ascii_sbuf(). */ goto bailout; break; /*NOTREACHED*/ case 1: print_number = hdr->attribute[0]; break; case 2: print_number = scsi_2btoul(hdr->attribute); break; case 3: print_number = scsi_3btoul(hdr->attribute); break; case 4: print_number = scsi_4btoul(hdr->attribute); break; case 8: print_number = scsi_8btou64(hdr->attribute); break; default: /* * If we wind up here, the number is too big to print * normally, so just do a hexdump. */ retval = scsi_attrib_hexdump_sbuf(sb, hdr, valid_len, flags, output_flags, error_str, error_str_len); goto bailout; break; } if (flags & SCSI_ATTR_FLAG_FP) { #ifndef _KERNEL long double num_float; num_float = (long double)print_number; if (flags & SCSI_ATTR_FLAG_DIV_10) num_float /= 10; sbuf_printf(sb, "%.*Lf", (flags & SCSI_ATTR_FLAG_FP_1DIGIT) ? 1 : 0, num_float); #else /* _KERNEL */ sbuf_printf(sb, "%ju", (flags & SCSI_ATTR_FLAG_DIV_10) ? (print_number / 10) : print_number); #endif /* _KERNEL */ } else if (flags & SCSI_ATTR_FLAG_HEX) { sbuf_printf(sb, "0x%jx", (uintmax_t)print_number); } else sbuf_printf(sb, "%ju", (uintmax_t)print_number); bailout: return (retval); } int scsi_attrib_ascii_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len) { size_t avail_len; uint32_t field_size, print_size; int retval = 0; avail_len = valid_len - sizeof(*hdr); field_size = scsi_2btoul(hdr->length); print_size = MIN(avail_len, field_size); if (print_size > 0) { cam_strvis_flags strvis_flags; switch (output_flags & SCSI_ATTR_OUTPUT_NONASCII_MASK) { case SCSI_ATTR_OUTPUT_NONASCII_TRIM: strvis_flags = CAM_STRVIS_FLAG_NONASCII_TRIM; break; case SCSI_ATTR_OUTPUT_NONASCII_RAW: strvis_flags = CAM_STRVIS_FLAG_NONASCII_RAW; break; case SCSI_ATTR_OUTPUT_NONASCII_ESC: default: strvis_flags = CAM_STRVIS_FLAG_NONASCII_ESC; break; } cam_strvis_sbuf(sb, hdr->attribute, print_size, strvis_flags); } else if (avail_len < field_size) { /* * We only report an error if the user didn't allocate * enough space to hold the full value of this field. If * the field length is 0, that is allowed by the spec. * e.g. in SPC-4r37, section 7.4.2.2.5, VOLUME IDENTIFIER * "This attribute indicates the current volume identifier * (see SMC-3) of the medium. If the device server supports * this attribute but does not have access to the volume * identifier, the device server shall report this attribute * with an attribute length value of zero." */ if (error_str != NULL) { snprintf(error_str, error_str_len, "Available " "length of attribute ID 0x%.4x %zu < field " "length %u", scsi_2btoul(hdr->id), avail_len, field_size); } retval = 1; } return (retval); } int scsi_attrib_text_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len) { size_t avail_len; uint32_t field_size, print_size; int retval = 0; int esc_text = 1; avail_len = valid_len - sizeof(*hdr); field_size = scsi_2btoul(hdr->length); print_size = MIN(avail_len, field_size); if ((output_flags & SCSI_ATTR_OUTPUT_TEXT_MASK) == SCSI_ATTR_OUTPUT_TEXT_RAW) esc_text = 0; if (print_size > 0) { uint32_t i; for (i = 0; i < print_size; i++) { if (hdr->attribute[i] == '\0') continue; else if (((unsigned char)hdr->attribute[i] < 0x80) || (esc_text == 0)) sbuf_putc(sb, hdr->attribute[i]); else sbuf_printf(sb, "%%%02x", (unsigned char)hdr->attribute[i]); } } else if (avail_len < field_size) { /* * We only report an error if the user didn't allocate * enough space to hold the full value of this field. */ if (error_str != NULL) { snprintf(error_str, error_str_len, "Available " "length of attribute ID 0x%.4x %zu < field " "length %u", scsi_2btoul(hdr->id), avail_len, field_size); } retval = 1; } return (retval); } struct scsi_attrib_table_entry * scsi_find_attrib_entry(struct scsi_attrib_table_entry *table, size_t num_table_entries, uint32_t id) { uint32_t i; for (i = 0; i < num_table_entries; i++) { if (table[i].id == id) return (&table[i]); } return (NULL); } struct scsi_attrib_table_entry * scsi_get_attrib_entry(uint32_t id) { return (scsi_find_attrib_entry(scsi_mam_attr_table, sizeof(scsi_mam_attr_table) / sizeof(scsi_mam_attr_table[0]), id)); } int scsi_attrib_value_sbuf(struct sbuf *sb, uint32_t valid_len, struct scsi_mam_attribute_header *hdr, uint32_t output_flags, char *error_str, size_t error_str_len) { int retval; switch (hdr->byte2 & SMA_FORMAT_MASK) { case SMA_FORMAT_ASCII: retval = scsi_attrib_ascii_sbuf(sb, hdr, valid_len, SCSI_ATTR_FLAG_NONE, output_flags, error_str,error_str_len); break; case SMA_FORMAT_BINARY: if (scsi_2btoul(hdr->length) <= 8) retval = scsi_attrib_int_sbuf(sb, hdr, valid_len, SCSI_ATTR_FLAG_NONE, output_flags, error_str, error_str_len); else retval = scsi_attrib_hexdump_sbuf(sb, hdr, valid_len, SCSI_ATTR_FLAG_NONE, output_flags, error_str, error_str_len); break; case SMA_FORMAT_TEXT: retval = scsi_attrib_text_sbuf(sb, hdr, valid_len, SCSI_ATTR_FLAG_NONE, output_flags, error_str, error_str_len); break; default: if (error_str != NULL) { snprintf(error_str, error_str_len, "Unknown attribute " "format 0x%x", hdr->byte2 & SMA_FORMAT_MASK); } retval = 1; goto bailout; break; /*NOTREACHED*/ } sbuf_trim(sb); bailout: return (retval); } void scsi_attrib_prefix_sbuf(struct sbuf *sb, uint32_t output_flags, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, const char *desc) { int need_space = 0; uint32_t len; uint32_t id; /* * We can't do anything if we don't have enough valid data for the * header. */ if (valid_len < sizeof(*hdr)) return; id = scsi_2btoul(hdr->id); /* * Note that we print out the value of the attribute listed in the * header, regardless of whether we actually got that many bytes * back from the device through the controller. A truncated result * could be the result of a failure to ask for enough data; the * header indicates how many bytes are allocated for this attribute * in the MAM. */ len = scsi_2btoul(hdr->length); if ((output_flags & SCSI_ATTR_OUTPUT_FIELD_MASK) == SCSI_ATTR_OUTPUT_FIELD_NONE) return; if ((output_flags & SCSI_ATTR_OUTPUT_FIELD_DESC) && (desc != NULL)) { sbuf_printf(sb, "%s", desc); need_space = 1; } if (output_flags & SCSI_ATTR_OUTPUT_FIELD_NUM) { sbuf_printf(sb, "%s(0x%.4x)", (need_space) ? " " : "", id); need_space = 0; } if (output_flags & SCSI_ATTR_OUTPUT_FIELD_SIZE) { sbuf_printf(sb, "%s[%d]", (need_space) ? " " : "", len); need_space = 0; } if (output_flags & SCSI_ATTR_OUTPUT_FIELD_RW) { sbuf_printf(sb, "%s(%s)", (need_space) ? " " : "", (hdr->byte2 & SMA_READ_ONLY) ? "RO" : "RW"); } sbuf_printf(sb, ": "); } int scsi_attrib_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, struct scsi_attrib_table_entry *user_table, size_t num_user_entries, int prefer_user_table, uint32_t output_flags, char *error_str, int error_str_len) { int retval; struct scsi_attrib_table_entry *table1 = NULL, *table2 = NULL; struct scsi_attrib_table_entry *entry = NULL; size_t table1_size = 0, table2_size = 0; uint32_t id; retval = 0; if (valid_len < sizeof(*hdr)) { retval = 1; goto bailout; } id = scsi_2btoul(hdr->id); if (user_table != NULL) { if (prefer_user_table != 0) { table1 = user_table; table1_size = num_user_entries; table2 = scsi_mam_attr_table; table2_size = sizeof(scsi_mam_attr_table) / sizeof(scsi_mam_attr_table[0]); } else { table1 = scsi_mam_attr_table; table1_size = sizeof(scsi_mam_attr_table) / sizeof(scsi_mam_attr_table[0]); table2 = user_table; table2_size = num_user_entries; } } else { table1 = scsi_mam_attr_table; table1_size = sizeof(scsi_mam_attr_table) / sizeof(scsi_mam_attr_table[0]); } entry = scsi_find_attrib_entry(table1, table1_size, id); if (entry != NULL) { scsi_attrib_prefix_sbuf(sb, output_flags, hdr, valid_len, entry->desc); if (entry->to_str == NULL) goto print_default; retval = entry->to_str(sb, hdr, valid_len, entry->flags, output_flags, error_str, error_str_len); goto bailout; } if (table2 != NULL) { entry = scsi_find_attrib_entry(table2, table2_size, id); if (entry != NULL) { if (entry->to_str == NULL) goto print_default; scsi_attrib_prefix_sbuf(sb, output_flags, hdr, valid_len, entry->desc); retval = entry->to_str(sb, hdr, valid_len, entry->flags, output_flags, error_str, error_str_len); goto bailout; } } scsi_attrib_prefix_sbuf(sb, output_flags, hdr, valid_len, NULL); print_default: retval = scsi_attrib_value_sbuf(sb, valid_len, hdr, output_flags, error_str, error_str_len); bailout: if (retval == 0) { if ((entry != NULL) && (entry->suffix != NULL)) sbuf_printf(sb, " %s", entry->suffix); sbuf_trim(sb); sbuf_printf(sb, "\n"); } return (retval); } void scsi_test_unit_ready(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t sense_len, u_int32_t timeout) { struct scsi_test_unit_ready *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_NONE, tag_action, /*data_ptr*/NULL, /*dxfer_len*/0, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_test_unit_ready *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = TEST_UNIT_READY; } void scsi_request_sense(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), void *data_ptr, u_int8_t dxfer_len, u_int8_t tag_action, u_int8_t sense_len, u_int32_t timeout) { struct scsi_request_sense *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_IN, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_request_sense *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = REQUEST_SENSE; scsi_cmd->length = dxfer_len; } void scsi_inquiry(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t *inq_buf, u_int32_t inq_len, int evpd, u_int8_t page_code, u_int8_t sense_len, u_int32_t timeout) { struct scsi_inquiry *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/inq_buf, /*dxfer_len*/inq_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_inquiry *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = INQUIRY; if (evpd) { scsi_cmd->byte2 |= SI_EVPD; scsi_cmd->page_code = page_code; } scsi_ulto2b(inq_len, scsi_cmd->length); } void scsi_mode_sense(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int dbd, u_int8_t page_code, u_int8_t page, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout) { scsi_mode_sense_len(csio, retries, cbfcnp, tag_action, dbd, page_code, page, param_buf, param_len, 0, sense_len, timeout); } void scsi_mode_sense_len(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int dbd, u_int8_t page_code, u_int8_t page, u_int8_t *param_buf, u_int32_t param_len, int minimum_cmd_size, u_int8_t sense_len, u_int32_t timeout) { u_int8_t cdb_len; /* * Use the smallest possible command to perform the operation. */ if ((param_len < 256) && (minimum_cmd_size < 10)) { /* * We can fit in a 6 byte cdb. */ struct scsi_mode_sense_6 *scsi_cmd; scsi_cmd = (struct scsi_mode_sense_6 *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MODE_SENSE_6; if (dbd != 0) scsi_cmd->byte2 |= SMS_DBD; scsi_cmd->page = page_code | page; scsi_cmd->length = param_len; cdb_len = sizeof(*scsi_cmd); } else { /* * Need a 10 byte cdb. */ struct scsi_mode_sense_10 *scsi_cmd; scsi_cmd = (struct scsi_mode_sense_10 *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MODE_SENSE_10; if (dbd != 0) scsi_cmd->byte2 |= SMS_DBD; scsi_cmd->page = page_code | page; scsi_ulto2b(param_len, scsi_cmd->length); cdb_len = sizeof(*scsi_cmd); } cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_IN, tag_action, param_buf, param_len, sense_len, cdb_len, timeout); } void scsi_mode_select(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int scsi_page_fmt, int save_pages, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout) { scsi_mode_select_len(csio, retries, cbfcnp, tag_action, scsi_page_fmt, save_pages, param_buf, param_len, 0, sense_len, timeout); } void scsi_mode_select_len(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int scsi_page_fmt, int save_pages, u_int8_t *param_buf, u_int32_t param_len, int minimum_cmd_size, u_int8_t sense_len, u_int32_t timeout) { u_int8_t cdb_len; /* * Use the smallest possible command to perform the operation. */ if ((param_len < 256) && (minimum_cmd_size < 10)) { /* * We can fit in a 6 byte cdb. */ struct scsi_mode_select_6 *scsi_cmd; scsi_cmd = (struct scsi_mode_select_6 *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MODE_SELECT_6; if (scsi_page_fmt != 0) scsi_cmd->byte2 |= SMS_PF; if (save_pages != 0) scsi_cmd->byte2 |= SMS_SP; scsi_cmd->length = param_len; cdb_len = sizeof(*scsi_cmd); } else { /* * Need a 10 byte cdb. */ struct scsi_mode_select_10 *scsi_cmd; scsi_cmd = (struct scsi_mode_select_10 *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MODE_SELECT_10; if (scsi_page_fmt != 0) scsi_cmd->byte2 |= SMS_PF; if (save_pages != 0) scsi_cmd->byte2 |= SMS_SP; scsi_ulto2b(param_len, scsi_cmd->length); cdb_len = sizeof(*scsi_cmd); } cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_OUT, tag_action, param_buf, param_len, sense_len, cdb_len, timeout); } void scsi_log_sense(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t page_code, u_int8_t page, int save_pages, int ppc, u_int32_t paramptr, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_log_sense *scsi_cmd; u_int8_t cdb_len; scsi_cmd = (struct scsi_log_sense *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = LOG_SENSE; scsi_cmd->page = page_code | page; if (save_pages != 0) scsi_cmd->byte2 |= SLS_SP; if (ppc != 0) scsi_cmd->byte2 |= SLS_PPC; scsi_ulto2b(paramptr, scsi_cmd->paramptr); scsi_ulto2b(param_len, scsi_cmd->length); cdb_len = sizeof(*scsi_cmd); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/param_buf, /*dxfer_len*/param_len, sense_len, cdb_len, timeout); } void scsi_log_select(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t page_code, int save_pages, int pc_reset, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_log_select *scsi_cmd; u_int8_t cdb_len; scsi_cmd = (struct scsi_log_select *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = LOG_SELECT; scsi_cmd->page = page_code & SLS_PAGE_CODE; if (save_pages != 0) scsi_cmd->byte2 |= SLS_SP; if (pc_reset != 0) scsi_cmd->byte2 |= SLS_PCR; scsi_ulto2b(param_len, scsi_cmd->length); cdb_len = sizeof(*scsi_cmd); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, /*data_ptr*/param_buf, /*dxfer_len*/param_len, sense_len, cdb_len, timeout); } /* * Prevent or allow the user to remove the media */ void scsi_prevent(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t action, u_int8_t sense_len, u_int32_t timeout) { struct scsi_prevent *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_NONE, tag_action, /*data_ptr*/NULL, /*dxfer_len*/0, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_prevent *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = PREVENT_ALLOW; scsi_cmd->how = action; } /* XXX allow specification of address and PMI bit and LBA */ void scsi_read_capacity(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, struct scsi_read_capacity_data *rcap_buf, u_int8_t sense_len, u_int32_t timeout) { struct scsi_read_capacity *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/(u_int8_t *)rcap_buf, /*dxfer_len*/sizeof(*rcap_buf), sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_read_capacity *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = READ_CAPACITY; } void scsi_read_capacity_16(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint64_t lba, int reladr, int pmi, uint8_t *rcap_buf, int rcap_buf_len, uint8_t sense_len, uint32_t timeout) { struct scsi_read_capacity_16 *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/(u_int8_t *)rcap_buf, /*dxfer_len*/rcap_buf_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_read_capacity_16 *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = SERVICE_ACTION_IN; scsi_cmd->service_action = SRC16_SERVICE_ACTION; scsi_u64to8b(lba, scsi_cmd->addr); scsi_ulto4b(rcap_buf_len, scsi_cmd->alloc_len); if (pmi) reladr |= SRC16_PMI; if (reladr) reladr |= SRC16_RELADR; } void scsi_report_luns(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t select_report, struct scsi_report_luns_data *rpl_buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_report_luns *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/(u_int8_t *)rpl_buf, /*dxfer_len*/alloc_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_report_luns *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = REPORT_LUNS; scsi_cmd->select_report = select_report; scsi_ulto4b(alloc_len, scsi_cmd->length); } void scsi_report_target_group(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t pdf, void *buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_target_group *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/(u_int8_t *)buf, /*dxfer_len*/alloc_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_target_group *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MAINTENANCE_IN; scsi_cmd->service_action = REPORT_TARGET_PORT_GROUPS | pdf; scsi_ulto4b(alloc_len, scsi_cmd->length); } void scsi_set_target_group(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, void *buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_target_group *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, /*data_ptr*/(u_int8_t *)buf, /*dxfer_len*/alloc_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_target_group *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MAINTENANCE_OUT; scsi_cmd->service_action = SET_TARGET_PORT_GROUPS; scsi_ulto4b(alloc_len, scsi_cmd->length); } /* * Syncronize the media to the contents of the cache for * the given lba/count pair. Specifying 0/0 means sync * the whole cache. */ void scsi_synchronize_cache(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int32_t begin_lba, u_int16_t lb_count, u_int8_t sense_len, u_int32_t timeout) { struct scsi_sync_cache *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_NONE, tag_action, /*data_ptr*/NULL, /*dxfer_len*/0, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_sync_cache *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = SYNCHRONIZE_CACHE; scsi_ulto4b(begin_lba, scsi_cmd->begin_lba); scsi_ulto2b(lb_count, scsi_cmd->lb_count); } void scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int readop, u_int8_t byte2, int minimum_cmd_size, u_int64_t lba, u_int32_t block_count, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { int read; u_int8_t cdb_len; read = (readop & SCSI_RW_DIRMASK) == SCSI_RW_READ; /* * Use the smallest possible command to perform the operation * as some legacy hardware does not support the 10 byte commands. * If any of the bits in byte2 is set, we have to go with a larger * command. */ if ((minimum_cmd_size < 10) && ((lba & 0x1fffff) == lba) && ((block_count & 0xff) == block_count) && (byte2 == 0)) { /* * We can fit in a 6 byte cdb. */ struct scsi_rw_6 *scsi_cmd; scsi_cmd = (struct scsi_rw_6 *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = read ? READ_6 : WRITE_6; scsi_ulto3b(lba, scsi_cmd->addr); scsi_cmd->length = block_count & 0xff; scsi_cmd->control = 0; cdb_len = sizeof(*scsi_cmd); CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE, ("6byte: %x%x%x:%d:%d\n", scsi_cmd->addr[0], scsi_cmd->addr[1], scsi_cmd->addr[2], scsi_cmd->length, dxfer_len)); } else if ((minimum_cmd_size < 12) && ((block_count & 0xffff) == block_count) && ((lba & 0xffffffff) == lba)) { /* * Need a 10 byte cdb. */ struct scsi_rw_10 *scsi_cmd; scsi_cmd = (struct scsi_rw_10 *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = read ? READ_10 : WRITE_10; scsi_cmd->byte2 = byte2; scsi_ulto4b(lba, scsi_cmd->addr); scsi_cmd->reserved = 0; scsi_ulto2b(block_count, scsi_cmd->length); scsi_cmd->control = 0; cdb_len = sizeof(*scsi_cmd); CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE, ("10byte: %x%x%x%x:%x%x: %d\n", scsi_cmd->addr[0], scsi_cmd->addr[1], scsi_cmd->addr[2], scsi_cmd->addr[3], scsi_cmd->length[0], scsi_cmd->length[1], dxfer_len)); } else if ((minimum_cmd_size < 16) && ((block_count & 0xffffffff) == block_count) && ((lba & 0xffffffff) == lba)) { /* * The block count is too big for a 10 byte CDB, use a 12 * byte CDB. */ struct scsi_rw_12 *scsi_cmd; scsi_cmd = (struct scsi_rw_12 *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = read ? READ_12 : WRITE_12; scsi_cmd->byte2 = byte2; scsi_ulto4b(lba, scsi_cmd->addr); scsi_cmd->reserved = 0; scsi_ulto4b(block_count, scsi_cmd->length); scsi_cmd->control = 0; cdb_len = sizeof(*scsi_cmd); CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE, ("12byte: %x%x%x%x:%x%x%x%x: %d\n", scsi_cmd->addr[0], scsi_cmd->addr[1], scsi_cmd->addr[2], scsi_cmd->addr[3], scsi_cmd->length[0], scsi_cmd->length[1], scsi_cmd->length[2], scsi_cmd->length[3], dxfer_len)); } else { /* * 16 byte CDB. We'll only get here if the LBA is larger * than 2^32, or if the user asks for a 16 byte command. */ struct scsi_rw_16 *scsi_cmd; scsi_cmd = (struct scsi_rw_16 *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = read ? READ_16 : WRITE_16; scsi_cmd->byte2 = byte2; scsi_u64to8b(lba, scsi_cmd->addr); scsi_cmd->reserved = 0; scsi_ulto4b(block_count, scsi_cmd->length); scsi_cmd->control = 0; cdb_len = sizeof(*scsi_cmd); } cam_fill_csio(csio, retries, cbfcnp, (read ? CAM_DIR_IN : CAM_DIR_OUT) | ((readop & SCSI_RW_BIO) != 0 ? CAM_DATA_BIO : 0), tag_action, data_ptr, dxfer_len, sense_len, cdb_len, timeout); } void scsi_write_same(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, int minimum_cmd_size, u_int64_t lba, u_int32_t block_count, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { u_int8_t cdb_len; if ((minimum_cmd_size < 16) && ((block_count & 0xffff) == block_count) && ((lba & 0xffffffff) == lba)) { /* * Need a 10 byte cdb. */ struct scsi_write_same_10 *scsi_cmd; scsi_cmd = (struct scsi_write_same_10 *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = WRITE_SAME_10; scsi_cmd->byte2 = byte2; scsi_ulto4b(lba, scsi_cmd->addr); scsi_cmd->group = 0; scsi_ulto2b(block_count, scsi_cmd->length); scsi_cmd->control = 0; cdb_len = sizeof(*scsi_cmd); CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE, ("10byte: %x%x%x%x:%x%x: %d\n", scsi_cmd->addr[0], scsi_cmd->addr[1], scsi_cmd->addr[2], scsi_cmd->addr[3], scsi_cmd->length[0], scsi_cmd->length[1], dxfer_len)); } else { /* * 16 byte CDB. We'll only get here if the LBA is larger * than 2^32, or if the user asks for a 16 byte command. */ struct scsi_write_same_16 *scsi_cmd; scsi_cmd = (struct scsi_write_same_16 *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = WRITE_SAME_16; scsi_cmd->byte2 = byte2; scsi_u64to8b(lba, scsi_cmd->addr); scsi_ulto4b(block_count, scsi_cmd->length); scsi_cmd->group = 0; scsi_cmd->control = 0; cdb_len = sizeof(*scsi_cmd); CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE, ("16byte: %x%x%x%x%x%x%x%x:%x%x%x%x: %d\n", scsi_cmd->addr[0], scsi_cmd->addr[1], scsi_cmd->addr[2], scsi_cmd->addr[3], scsi_cmd->addr[4], scsi_cmd->addr[5], scsi_cmd->addr[6], scsi_cmd->addr[7], scsi_cmd->length[0], scsi_cmd->length[1], scsi_cmd->length[2], scsi_cmd->length[3], dxfer_len)); } cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, data_ptr, dxfer_len, sense_len, cdb_len, timeout); } void scsi_ata_identify(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { scsi_ata_pass_16(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*protocol*/AP_PROTO_PIO_IN, /*ata_flags*/AP_FLAG_TDIR_FROM_DEV| AP_FLAG_BYT_BLOK_BYTES|AP_FLAG_TLEN_SECT_CNT, /*features*/0, /*sector_count*/dxfer_len, /*lba*/0, /*command*/ATA_ATA_IDENTIFY, /*control*/0, data_ptr, dxfer_len, sense_len, timeout); } void scsi_ata_trim(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int16_t block_count, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { scsi_ata_pass_16(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, /*protocol*/AP_EXTEND|AP_PROTO_DMA, /*ata_flags*/AP_FLAG_TLEN_SECT_CNT|AP_FLAG_BYT_BLOK_BLOCKS, /*features*/ATA_DSM_TRIM, /*sector_count*/block_count, /*lba*/0, /*command*/ATA_DATA_SET_MANAGEMENT, /*control*/0, data_ptr, dxfer_len, sense_len, timeout); } void scsi_ata_pass_16(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int32_t flags, u_int8_t tag_action, u_int8_t protocol, u_int8_t ata_flags, u_int16_t features, u_int16_t sector_count, uint64_t lba, u_int8_t command, u_int8_t control, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { struct ata_pass_16 *ata_cmd; ata_cmd = (struct ata_pass_16 *)&csio->cdb_io.cdb_bytes; ata_cmd->opcode = ATA_PASS_16; ata_cmd->protocol = protocol; ata_cmd->flags = ata_flags; ata_cmd->features_ext = features >> 8; ata_cmd->features = features; ata_cmd->sector_count_ext = sector_count >> 8; ata_cmd->sector_count = sector_count; ata_cmd->lba_low = lba; ata_cmd->lba_mid = lba >> 8; ata_cmd->lba_high = lba >> 16; ata_cmd->device = ATA_DEV_LBA; if (protocol & AP_EXTEND) { ata_cmd->lba_low_ext = lba >> 24; ata_cmd->lba_mid_ext = lba >> 32; ata_cmd->lba_high_ext = lba >> 40; } else ata_cmd->device |= (lba >> 24) & 0x0f; ata_cmd->command = command; ata_cmd->control = control; cam_fill_csio(csio, retries, cbfcnp, flags, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*ata_cmd), timeout); } void scsi_unmap(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_unmap *scsi_cmd; scsi_cmd = (struct scsi_unmap *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = UNMAP; scsi_cmd->byte2 = byte2; scsi_ulto4b(0, scsi_cmd->reserved); scsi_cmd->group = 0; scsi_ulto2b(dxfer_len, scsi_cmd->length); scsi_cmd->control = 0; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_receive_diagnostic_results(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb*), uint8_t tag_action, int pcv, uint8_t page_code, uint8_t *data_ptr, uint16_t allocation_length, uint8_t sense_len, uint32_t timeout) { struct scsi_receive_diag *scsi_cmd; scsi_cmd = (struct scsi_receive_diag *)&csio->cdb_io.cdb_bytes; memset(scsi_cmd, 0, sizeof(*scsi_cmd)); scsi_cmd->opcode = RECEIVE_DIAGNOSTIC; if (pcv) { scsi_cmd->byte2 |= SRD_PCV; scsi_cmd->page_code = page_code; } scsi_ulto2b(allocation_length, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, data_ptr, allocation_length, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_send_diagnostic(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int unit_offline, int device_offline, int self_test, int page_format, int self_test_code, uint8_t *data_ptr, uint16_t param_list_length, uint8_t sense_len, uint32_t timeout) { struct scsi_send_diag *scsi_cmd; scsi_cmd = (struct scsi_send_diag *)&csio->cdb_io.cdb_bytes; memset(scsi_cmd, 0, sizeof(*scsi_cmd)); scsi_cmd->opcode = SEND_DIAGNOSTIC; /* * The default self-test mode control and specific test * control are mutually exclusive. */ if (self_test) self_test_code = SSD_SELF_TEST_CODE_NONE; scsi_cmd->byte2 = ((self_test_code << SSD_SELF_TEST_CODE_SHIFT) & SSD_SELF_TEST_CODE_MASK) | (unit_offline ? SSD_UNITOFFL : 0) | (device_offline ? SSD_DEVOFFL : 0) | (self_test ? SSD_SELFTEST : 0) | (page_format ? SSD_PF : 0); scsi_ulto2b(param_list_length, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/param_list_length ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, param_list_length, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_read_buffer(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb*), uint8_t tag_action, int mode, uint8_t buffer_id, u_int32_t offset, uint8_t *data_ptr, uint32_t allocation_length, uint8_t sense_len, uint32_t timeout) { struct scsi_read_buffer *scsi_cmd; scsi_cmd = (struct scsi_read_buffer *)&csio->cdb_io.cdb_bytes; memset(scsi_cmd, 0, sizeof(*scsi_cmd)); scsi_cmd->opcode = READ_BUFFER; scsi_cmd->byte2 = mode; scsi_cmd->buffer_id = buffer_id; scsi_ulto3b(offset, scsi_cmd->offset); scsi_ulto3b(allocation_length, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, data_ptr, allocation_length, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_write_buffer(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int mode, uint8_t buffer_id, u_int32_t offset, uint8_t *data_ptr, uint32_t param_list_length, uint8_t sense_len, uint32_t timeout) { struct scsi_write_buffer *scsi_cmd; scsi_cmd = (struct scsi_write_buffer *)&csio->cdb_io.cdb_bytes; memset(scsi_cmd, 0, sizeof(*scsi_cmd)); scsi_cmd->opcode = WRITE_BUFFER; scsi_cmd->byte2 = mode; scsi_cmd->buffer_id = buffer_id; scsi_ulto3b(offset, scsi_cmd->offset); scsi_ulto3b(param_list_length, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/param_list_length ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, param_list_length, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_start_stop(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int start, int load_eject, int immediate, u_int8_t sense_len, u_int32_t timeout) { struct scsi_start_stop_unit *scsi_cmd; int extra_flags = 0; scsi_cmd = (struct scsi_start_stop_unit *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = START_STOP_UNIT; if (start != 0) { scsi_cmd->how |= SSS_START; /* it takes a lot of power to start a drive */ extra_flags |= CAM_HIGH_POWER; } if (load_eject != 0) scsi_cmd->how |= SSS_LOEJ; if (immediate != 0) scsi_cmd->byte2 |= SSS_IMMED; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_NONE | extra_flags, tag_action, /*data_ptr*/NULL, /*dxfer_len*/0, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_read_attribute(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t service_action, uint32_t element, u_int8_t elem_type, int logical_volume, int partition, u_int32_t first_attribute, int cache, u_int8_t *data_ptr, u_int32_t length, int sense_len, u_int32_t timeout) { struct scsi_read_attribute *scsi_cmd; scsi_cmd = (struct scsi_read_attribute *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = READ_ATTRIBUTE; scsi_cmd->service_action = service_action, scsi_ulto2b(element, scsi_cmd->element); scsi_cmd->elem_type = elem_type; scsi_cmd->logical_volume = logical_volume; scsi_cmd->partition = partition; scsi_ulto2b(first_attribute, scsi_cmd->first_attribute); scsi_ulto4b(length, scsi_cmd->length); if (cache != 0) scsi_cmd->cache |= SRA_CACHE; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/length, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_write_attribute(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, uint32_t element, int logical_volume, int partition, int wtc, u_int8_t *data_ptr, u_int32_t length, int sense_len, u_int32_t timeout) { struct scsi_write_attribute *scsi_cmd; scsi_cmd = (struct scsi_write_attribute *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = WRITE_ATTRIBUTE; if (wtc != 0) scsi_cmd->byte2 = SWA_WTC; scsi_ulto3b(element, scsi_cmd->element); scsi_cmd->logical_volume = logical_volume; scsi_cmd->partition = partition; scsi_ulto4b(length, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/length, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_persistent_reserve_in(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int service_action, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout) { struct scsi_per_res_in *scsi_cmd; scsi_cmd = (struct scsi_per_res_in *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = PERSISTENT_RES_IN; scsi_cmd->action = service_action; scsi_ulto2b(dxfer_len, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_persistent_reserve_out(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int service_action, int scope, int res_type, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout) { struct scsi_per_res_out *scsi_cmd; scsi_cmd = (struct scsi_per_res_out *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = PERSISTENT_RES_OUT; scsi_cmd->action = service_action; scsi_cmd->scope_type = scope | res_type; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_security_protocol_in(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint32_t security_protocol, uint32_t security_protocol_specific, int byte4, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout) { struct scsi_security_protocol_in *scsi_cmd; scsi_cmd = (struct scsi_security_protocol_in *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = SECURITY_PROTOCOL_IN; scsi_cmd->security_protocol = security_protocol; scsi_ulto2b(security_protocol_specific, scsi_cmd->security_protocol_specific); scsi_cmd->byte4 = byte4; scsi_ulto4b(dxfer_len, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_security_protocol_out(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint32_t security_protocol, uint32_t security_protocol_specific, int byte4, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout) { struct scsi_security_protocol_out *scsi_cmd; scsi_cmd = (struct scsi_security_protocol_out *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = SECURITY_PROTOCOL_OUT; scsi_cmd->security_protocol = security_protocol; scsi_ulto2b(security_protocol_specific, scsi_cmd->security_protocol_specific); scsi_cmd->byte4 = byte4; scsi_ulto4b(dxfer_len, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_report_supported_opcodes(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int options, int req_opcode, int req_service_action, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout) { struct scsi_report_supported_opcodes *scsi_cmd; scsi_cmd = (struct scsi_report_supported_opcodes *) &csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MAINTENANCE_IN; scsi_cmd->service_action = REPORT_SUPPORTED_OPERATION_CODES; scsi_cmd->options = options; scsi_cmd->requested_opcode = req_opcode; scsi_ulto2b(req_service_action, scsi_cmd->requested_service_action); scsi_ulto4b(dxfer_len, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } /* * Try make as good a match as possible with * available sub drivers */ int scsi_inquiry_match(caddr_t inqbuffer, caddr_t table_entry) { struct scsi_inquiry_pattern *entry; struct scsi_inquiry_data *inq; entry = (struct scsi_inquiry_pattern *)table_entry; inq = (struct scsi_inquiry_data *)inqbuffer; if (((SID_TYPE(inq) == entry->type) || (entry->type == T_ANY)) && (SID_IS_REMOVABLE(inq) ? entry->media_type & SIP_MEDIA_REMOVABLE : entry->media_type & SIP_MEDIA_FIXED) && (cam_strmatch(inq->vendor, entry->vendor, sizeof(inq->vendor)) == 0) && (cam_strmatch(inq->product, entry->product, sizeof(inq->product)) == 0) && (cam_strmatch(inq->revision, entry->revision, sizeof(inq->revision)) == 0)) { return (0); } return (-1); } /* * Try make as good a match as possible with * available sub drivers */ int scsi_static_inquiry_match(caddr_t inqbuffer, caddr_t table_entry) { struct scsi_static_inquiry_pattern *entry; struct scsi_inquiry_data *inq; entry = (struct scsi_static_inquiry_pattern *)table_entry; inq = (struct scsi_inquiry_data *)inqbuffer; if (((SID_TYPE(inq) == entry->type) || (entry->type == T_ANY)) && (SID_IS_REMOVABLE(inq) ? entry->media_type & SIP_MEDIA_REMOVABLE : entry->media_type & SIP_MEDIA_FIXED) && (cam_strmatch(inq->vendor, entry->vendor, sizeof(inq->vendor)) == 0) && (cam_strmatch(inq->product, entry->product, sizeof(inq->product)) == 0) && (cam_strmatch(inq->revision, entry->revision, sizeof(inq->revision)) == 0)) { return (0); } return (-1); } /** * Compare two buffers of vpd device descriptors for a match. * * \param lhs Pointer to first buffer of descriptors to compare. * \param lhs_len The length of the first buffer. * \param rhs Pointer to second buffer of descriptors to compare. * \param rhs_len The length of the second buffer. * * \return 0 on a match, -1 otherwise. * * Treat rhs and lhs as arrays of vpd device id descriptors. Walk lhs matching * agains each element in rhs until all data are exhausted or we have found * a match. */ int scsi_devid_match(uint8_t *lhs, size_t lhs_len, uint8_t *rhs, size_t rhs_len) { struct scsi_vpd_id_descriptor *lhs_id; struct scsi_vpd_id_descriptor *lhs_last; struct scsi_vpd_id_descriptor *rhs_last; uint8_t *lhs_end; uint8_t *rhs_end; lhs_end = lhs + lhs_len; rhs_end = rhs + rhs_len; /* * rhs_last and lhs_last are the last posible position of a valid * descriptor assuming it had a zero length identifier. We use * these variables to insure we can safely dereference the length * field in our loop termination tests. */ lhs_last = (struct scsi_vpd_id_descriptor *) (lhs_end - __offsetof(struct scsi_vpd_id_descriptor, identifier)); rhs_last = (struct scsi_vpd_id_descriptor *) (rhs_end - __offsetof(struct scsi_vpd_id_descriptor, identifier)); lhs_id = (struct scsi_vpd_id_descriptor *)lhs; while (lhs_id <= lhs_last && (lhs_id->identifier + lhs_id->length) <= lhs_end) { struct scsi_vpd_id_descriptor *rhs_id; rhs_id = (struct scsi_vpd_id_descriptor *)rhs; while (rhs_id <= rhs_last && (rhs_id->identifier + rhs_id->length) <= rhs_end) { if ((rhs_id->id_type & (SVPD_ID_ASSOC_MASK | SVPD_ID_TYPE_MASK)) == (lhs_id->id_type & (SVPD_ID_ASSOC_MASK | SVPD_ID_TYPE_MASK)) && rhs_id->length == lhs_id->length && memcmp(rhs_id->identifier, lhs_id->identifier, rhs_id->length) == 0) return (0); rhs_id = (struct scsi_vpd_id_descriptor *) (rhs_id->identifier + rhs_id->length); } lhs_id = (struct scsi_vpd_id_descriptor *) (lhs_id->identifier + lhs_id->length); } return (-1); } #ifdef _KERNEL int scsi_vpd_supported_page(struct cam_periph *periph, uint8_t page_id) { struct cam_ed *device; struct scsi_vpd_supported_pages *vpds; int i, num_pages; device = periph->path->device; vpds = (struct scsi_vpd_supported_pages *)device->supported_vpds; if (vpds != NULL) { num_pages = device->supported_vpds_len - SVPD_SUPPORTED_PAGES_HDR_LEN; for (i = 0; i < num_pages; i++) { if (vpds->page_list[i] == page_id) return (1); } } return (0); } static void init_scsi_delay(void) { int delay; delay = SCSI_DELAY; TUNABLE_INT_FETCH("kern.cam.scsi_delay", &delay); if (set_scsi_delay(delay) != 0) { printf("cam: invalid value for tunable kern.cam.scsi_delay\n"); set_scsi_delay(SCSI_DELAY); } } SYSINIT(scsi_delay, SI_SUB_TUNABLES, SI_ORDER_ANY, init_scsi_delay, NULL); static int sysctl_scsi_delay(SYSCTL_HANDLER_ARGS) { int error, delay; delay = scsi_delay; error = sysctl_handle_int(oidp, &delay, 0, req); if (error != 0 || req->newptr == NULL) return (error); return (set_scsi_delay(delay)); } SYSCTL_PROC(_kern_cam, OID_AUTO, scsi_delay, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_scsi_delay, "I", "Delay to allow devices to settle after a SCSI bus reset (ms)"); static int set_scsi_delay(int delay) { /* * If someone sets this to 0, we assume that they want the * minimum allowable bus settle delay. */ if (delay == 0) { printf("cam: using minimum scsi_delay (%dms)\n", SCSI_MIN_DELAY); delay = SCSI_MIN_DELAY; } if (delay < SCSI_MIN_DELAY) return (EINVAL); scsi_delay = delay; return (0); } #endif /* _KERNEL */ Index: projects/iosched/sys/cddl/contrib/opensolaris/common/avl/avl.c =================================================================== --- projects/iosched/sys/cddl/contrib/opensolaris/common/avl/avl.c (revision 287721) +++ projects/iosched/sys/cddl/contrib/opensolaris/common/avl/avl.c (revision 287722) @@ -1,1059 +1,1063 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ /* * AVL - generic AVL tree implementation for kernel use * * A complete description of AVL trees can be found in many CS textbooks. * * Here is a very brief overview. An AVL tree is a binary search tree that is * almost perfectly balanced. By "almost" perfectly balanced, we mean that at * any given node, the left and right subtrees are allowed to differ in height * by at most 1 level. * * This relaxation from a perfectly balanced binary tree allows doing * insertion and deletion relatively efficiently. Searching the tree is * still a fast operation, roughly O(log(N)). * * The key to insertion and deletion is a set of tree manipulations called * rotations, which bring unbalanced subtrees back into the semi-balanced state. * * This implementation of AVL trees has the following peculiarities: * * - The AVL specific data structures are physically embedded as fields * in the "using" data structures. To maintain generality the code * must constantly translate between "avl_node_t *" and containing * data structure "void *"s by adding/subtracting the avl_offset. * * - Since the AVL data is always embedded in other structures, there is * no locking or memory allocation in the AVL routines. This must be * provided for by the enclosing data structure's semantics. Typically, * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of * exclusive write lock. Other operations require a read lock. * * - The implementation uses iteration instead of explicit recursion, * since it is intended to run on limited size kernel stacks. Since * there is no recursion stack present to move "up" in the tree, * there is an explicit "parent" link in the avl_node_t. * * - The left/right children pointers of a node are in an array. * In the code, variables (instead of constants) are used to represent * left and right indices. The implementation is written as if it only * dealt with left handed manipulations. By changing the value assigned * to "left", the code also works for right handed trees. The * following variables/terms are frequently used: * * int left; // 0 when dealing with left children, * // 1 for dealing with right children * * int left_heavy; // -1 when left subtree is taller at some node, * // +1 when right subtree is taller * * int right; // will be the opposite of left (0 or 1) * int right_heavy;// will be the opposite of left_heavy (-1 or 1) * * int direction; // 0 for "<" (ie. left child); 1 for ">" (right) * * Though it is a little more confusing to read the code, the approach * allows using half as much code (and hence cache footprint) for tree * manipulations and eliminates many conditional branches. * * - The avl_index_t is an opaque "cookie" used to find nodes at or * adjacent to where a new value would be inserted in the tree. The value * is a modified "avl_node_t *". The bottom bit (normally 0 for a * pointer) is set to indicate if that the new node has a value greater * than the value of the indicated "avl_node_t *". * * Note - in addition to userland (e.g. libavl and libutil) and the kernel * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module, * which each have their own compilation environments and subsequent * requirements. Each of these environments must be considered when adding * dependencies from avl.c. */ #include #include #include #include #include /* * Small arrays to translate between balance (or diff) values and child indices. * * Code that deals with binary tree data structures will randomly use * left and right children when examining a tree. C "if()" statements * which evaluate randomly suffer from very poor hardware branch prediction. * In this code we avoid some of the branch mispredictions by using the * following translation arrays. They replace random branches with an * additional memory reference. Since the translation arrays are both very * small the data should remain efficiently in cache. */ static const int avl_child2balance[2] = {-1, 1}; static const int avl_balance2child[] = {0, 0, 1}; /* * Walk from one node to the previous valued node (ie. an infix walk * towards the left). At any given node we do one of 2 things: * * - If there is a left child, go to it, then to it's rightmost descendant. * * - otherwise we return through parent nodes until we've come from a right * child. * * Return Value: * NULL - if at the end of the nodes * otherwise next node */ void * avl_walk(avl_tree_t *tree, void *oldnode, int left) { size_t off = tree->avl_offset; avl_node_t *node = AVL_DATA2NODE(oldnode, off); int right = 1 - left; int was_child; /* * nowhere to walk to if tree is empty */ if (node == NULL) return (NULL); /* * Visit the previous valued node. There are two possibilities: * * If this node has a left child, go down one left, then all * the way right. */ if (node->avl_child[left] != NULL) { for (node = node->avl_child[left]; node->avl_child[right] != NULL; node = node->avl_child[right]) ; /* * Otherwise, return thru left children as far as we can. */ } else { for (;;) { was_child = AVL_XCHILD(node); node = AVL_XPARENT(node); if (node == NULL) return (NULL); if (was_child == right) break; } } return (AVL_NODE2DATA(node, off)); } /* * Return the lowest valued node in a tree or NULL. * (leftmost child from root of tree) */ void * avl_first(avl_tree_t *tree) { avl_node_t *node; avl_node_t *prev = NULL; size_t off = tree->avl_offset; for (node = tree->avl_root; node != NULL; node = node->avl_child[0]) prev = node; if (prev != NULL) return (AVL_NODE2DATA(prev, off)); return (NULL); } /* * Return the highest valued node in a tree or NULL. * (rightmost child from root of tree) */ void * avl_last(avl_tree_t *tree) { avl_node_t *node; avl_node_t *prev = NULL; size_t off = tree->avl_offset; for (node = tree->avl_root; node != NULL; node = node->avl_child[1]) prev = node; if (prev != NULL) return (AVL_NODE2DATA(prev, off)); return (NULL); } /* * Access the node immediately before or after an insertion point. * * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child * * Return value: * NULL: no node in the given direction * "void *" of the found tree node */ void * avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) { int child = AVL_INDEX2CHILD(where); avl_node_t *node = AVL_INDEX2NODE(where); void *data; size_t off = tree->avl_offset; if (node == NULL) { ASSERT(tree->avl_root == NULL); return (NULL); } data = AVL_NODE2DATA(node, off); if (child != direction) return (data); return (avl_walk(tree, data, direction)); } /* * Search for the node which contains "value". The algorithm is a * simple binary tree search. * * return value: * NULL: the value is not in the AVL tree * *where (if not NULL) is set to indicate the insertion point * "void *" of the found tree node */ void * avl_find(avl_tree_t *tree, const void *value, avl_index_t *where) { avl_node_t *node; avl_node_t *prev = NULL; int child = 0; int diff; size_t off = tree->avl_offset; for (node = tree->avl_root; node != NULL; node = node->avl_child[child]) { prev = node; diff = tree->avl_compar(value, AVL_NODE2DATA(node, off)); ASSERT(-1 <= diff && diff <= 1); if (diff == 0) { #ifdef DEBUG if (where != NULL) *where = 0; #endif return (AVL_NODE2DATA(node, off)); } child = avl_balance2child[1 + diff]; } if (where != NULL) *where = AVL_MKINDEX(prev, child); return (NULL); } /* * Perform a rotation to restore balance at the subtree given by depth. * * This routine is used by both insertion and deletion. The return value * indicates: * 0 : subtree did not change height * !0 : subtree was reduced in height * * The code is written as if handling left rotations, right rotations are * symmetric and handled by swapping values of variables right/left[_heavy] * * On input balance is the "new" balance at "node". This value is either * -2 or +2. */ static int avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance) { int left = !(balance < 0); /* when balance = -2, left will be 0 */ int right = 1 - left; int left_heavy = balance >> 1; int right_heavy = -left_heavy; avl_node_t *parent = AVL_XPARENT(node); avl_node_t *child = node->avl_child[left]; avl_node_t *cright; avl_node_t *gchild; avl_node_t *gright; avl_node_t *gleft; int which_child = AVL_XCHILD(node); int child_bal = AVL_XBALANCE(child); /* BEGIN CSTYLED */ /* * case 1 : node is overly left heavy, the left child is balanced or * also left heavy. This requires the following rotation. * * (node bal:-2) * / \ * / \ * (child bal:0 or -1) * / \ * / \ * cright * * becomes: * * (child bal:1 or 0) * / \ * / \ * (node bal:-1 or 0) * / \ * / \ * cright * * we detect this situation by noting that child's balance is not * right_heavy. */ /* END CSTYLED */ if (child_bal != right_heavy) { /* * compute new balance of nodes * * If child used to be left heavy (now balanced) we reduced * the height of this sub-tree -- used in "return...;" below */ child_bal += right_heavy; /* adjust towards right */ /* * move "cright" to be node's left child */ cright = child->avl_child[right]; node->avl_child[left] = cright; if (cright != NULL) { AVL_SETPARENT(cright, node); AVL_SETCHILD(cright, left); } /* * move node to be child's right child */ child->avl_child[right] = node; AVL_SETBALANCE(node, -child_bal); AVL_SETCHILD(node, right); AVL_SETPARENT(node, child); /* * update the pointer into this subtree */ AVL_SETBALANCE(child, child_bal); AVL_SETCHILD(child, which_child); AVL_SETPARENT(child, parent); if (parent != NULL) parent->avl_child[which_child] = child; else tree->avl_root = child; return (child_bal == 0); } /* BEGIN CSTYLED */ /* * case 2 : When node is left heavy, but child is right heavy we use * a different rotation. * * (node b:-2) * / \ * / \ * / \ * (child b:+1) * / \ * / \ * (gchild b: != 0) * / \ * / \ * gleft gright * * becomes: * * (gchild b:0) * / \ * / \ * / \ * (child b:?) (node b:?) * / \ / \ * / \ / \ * gleft gright * * computing the new balances is more complicated. As an example: * if gchild was right_heavy, then child is now left heavy * else it is balanced */ /* END CSTYLED */ gchild = child->avl_child[right]; gleft = gchild->avl_child[left]; gright = gchild->avl_child[right]; /* * move gright to left child of node and * * move gleft to right child of node */ node->avl_child[left] = gright; if (gright != NULL) { AVL_SETPARENT(gright, node); AVL_SETCHILD(gright, left); } child->avl_child[right] = gleft; if (gleft != NULL) { AVL_SETPARENT(gleft, child); AVL_SETCHILD(gleft, right); } /* * move child to left child of gchild and * * move node to right child of gchild and * * fixup parent of all this to point to gchild */ balance = AVL_XBALANCE(gchild); gchild->avl_child[left] = child; AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0)); AVL_SETPARENT(child, gchild); AVL_SETCHILD(child, left); gchild->avl_child[right] = node; AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0)); AVL_SETPARENT(node, gchild); AVL_SETCHILD(node, right); AVL_SETBALANCE(gchild, 0); AVL_SETPARENT(gchild, parent); AVL_SETCHILD(gchild, which_child); if (parent != NULL) parent->avl_child[which_child] = gchild; else tree->avl_root = gchild; return (1); /* the new tree is always shorter */ } /* * Insert a new node into an AVL tree at the specified (from avl_find()) place. * * Newly inserted nodes are always leaf nodes in the tree, since avl_find() * searches out to the leaf positions. The avl_index_t indicates the node * which will be the parent of the new node. * * After the node is inserted, a single rotation further up the tree may * be necessary to maintain an acceptable AVL balance. */ void avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) { avl_node_t *node; avl_node_t *parent = AVL_INDEX2NODE(where); int old_balance; int new_balance; int which_child = AVL_INDEX2CHILD(where); size_t off = tree->avl_offset; ASSERT(tree); #ifdef _LP64 ASSERT(((uintptr_t)new_data & 0x7) == 0); #endif node = AVL_DATA2NODE(new_data, off); /* * First, add the node to the tree at the indicated position. */ ++tree->avl_numnodes; node->avl_child[0] = NULL; node->avl_child[1] = NULL; AVL_SETCHILD(node, which_child); AVL_SETBALANCE(node, 0); AVL_SETPARENT(node, parent); if (parent != NULL) { ASSERT(parent->avl_child[which_child] == NULL); parent->avl_child[which_child] = node; } else { ASSERT(tree->avl_root == NULL); tree->avl_root = node; } /* * Now, back up the tree modifying the balance of all nodes above the * insertion point. If we get to a highly unbalanced ancestor, we * need to do a rotation. If we back out of the tree we are done. * If we brought any subtree into perfect balance (0), we are also done. */ for (;;) { node = parent; if (node == NULL) return; /* * Compute the new balance */ old_balance = AVL_XBALANCE(node); new_balance = old_balance + avl_child2balance[which_child]; /* * If we introduced equal balance, then we are done immediately */ if (new_balance == 0) { AVL_SETBALANCE(node, 0); return; } /* * If both old and new are not zero we went * from -1 to -2 balance, do a rotation. */ if (old_balance != 0) break; AVL_SETBALANCE(node, new_balance); parent = AVL_XPARENT(node); which_child = AVL_XCHILD(node); } /* * perform a rotation to fix the tree and return */ (void) avl_rotation(tree, node, new_balance); } /* * Insert "new_data" in "tree" in the given "direction" either after or * before (AVL_AFTER, AVL_BEFORE) the data "here". * * Insertions can only be done at empty leaf points in the tree, therefore * if the given child of the node is already present we move to either * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since * every other node in the tree is a leaf, this always works. * * To help developers using this interface, we assert that the new node * is correctly ordered at every step of the way in DEBUG kernels. */ void avl_insert_here( avl_tree_t *tree, void *new_data, void *here, int direction) { avl_node_t *node; int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */ #ifdef DEBUG int diff; #endif ASSERT(tree != NULL); ASSERT(new_data != NULL); ASSERT(here != NULL); ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER); /* * If corresponding child of node is not NULL, go to the neighboring * node and reverse the insertion direction. */ node = AVL_DATA2NODE(here, tree->avl_offset); #ifdef DEBUG diff = tree->avl_compar(new_data, here); ASSERT(-1 <= diff && diff <= 1); ASSERT(diff != 0); ASSERT(diff > 0 ? child == 1 : child == 0); #endif if (node->avl_child[child] != NULL) { node = node->avl_child[child]; child = 1 - child; while (node->avl_child[child] != NULL) { #ifdef DEBUG diff = tree->avl_compar(new_data, AVL_NODE2DATA(node, tree->avl_offset)); ASSERT(-1 <= diff && diff <= 1); ASSERT(diff != 0); ASSERT(diff > 0 ? child == 1 : child == 0); #endif node = node->avl_child[child]; } #ifdef DEBUG diff = tree->avl_compar(new_data, AVL_NODE2DATA(node, tree->avl_offset)); ASSERT(-1 <= diff && diff <= 1); ASSERT(diff != 0); ASSERT(diff > 0 ? child == 1 : child == 0); #endif } ASSERT(node->avl_child[child] == NULL); avl_insert(tree, new_data, AVL_MKINDEX(node, child)); } /* * Add a new node to an AVL tree. */ void avl_add(avl_tree_t *tree, void *new_node) { avl_index_t where; /* * This is unfortunate. We want to call panic() here, even for * non-DEBUG kernels. In userland, however, we can't depend on anything - * in libc or else the rtld build process gets confused. So, all we can - * do in userland is resort to a normal ASSERT(). + * in libc or else the rtld build process gets confused. + * Thankfully, rtld provides us with its own assfail() so we can use + * that here. We use assfail() directly to get a nice error message + * in the core - much like what panic() does for crashdumps. */ if (avl_find(tree, new_node, &where) != NULL) #ifdef _KERNEL panic("avl_find() succeeded inside avl_add()"); #else - ASSERT(0); + (void) assfail("avl_find() succeeded inside avl_add()", + __FILE__, __LINE__); #endif avl_insert(tree, new_node, where); } /* * Delete a node from the AVL tree. Deletion is similar to insertion, but * with 2 complications. * * First, we may be deleting an interior node. Consider the following subtree: * * d c c * / \ / \ / \ * b e b e b e * / \ / \ / * a c a a * * When we are deleting node (d), we find and bring up an adjacent valued leaf * node, say (c), to take the interior node's place. In the code this is * handled by temporarily swapping (d) and (c) in the tree and then using * common code to delete (d) from the leaf position. * * Secondly, an interior deletion from a deep tree may require more than one * rotation to fix the balance. This is handled by moving up the tree through * parents and applying rotations as needed. The return value from * avl_rotation() is used to detect when a subtree did not change overall * height due to a rotation. */ void avl_remove(avl_tree_t *tree, void *data) { avl_node_t *delete; avl_node_t *parent; avl_node_t *node; avl_node_t tmp; int old_balance; int new_balance; int left; int right; int which_child; size_t off = tree->avl_offset; ASSERT(tree); delete = AVL_DATA2NODE(data, off); /* * Deletion is easiest with a node that has at most 1 child. * We swap a node with 2 children with a sequentially valued * neighbor node. That node will have at most 1 child. Note this * has no effect on the ordering of the remaining nodes. * * As an optimization, we choose the greater neighbor if the tree * is right heavy, otherwise the left neighbor. This reduces the * number of rotations needed. */ if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) { /* * choose node to swap from whichever side is taller */ old_balance = AVL_XBALANCE(delete); left = avl_balance2child[old_balance + 1]; right = 1 - left; /* * get to the previous value'd node * (down 1 left, as far as possible right) */ for (node = delete->avl_child[left]; node->avl_child[right] != NULL; node = node->avl_child[right]) ; /* * create a temp placeholder for 'node' * move 'node' to delete's spot in the tree */ tmp = *node; *node = *delete; if (node->avl_child[left] == node) node->avl_child[left] = &tmp; parent = AVL_XPARENT(node); if (parent != NULL) parent->avl_child[AVL_XCHILD(node)] = node; else tree->avl_root = node; AVL_SETPARENT(node->avl_child[left], node); AVL_SETPARENT(node->avl_child[right], node); /* * Put tmp where node used to be (just temporary). * It always has a parent and at most 1 child. */ delete = &tmp; parent = AVL_XPARENT(delete); parent->avl_child[AVL_XCHILD(delete)] = delete; which_child = (delete->avl_child[1] != 0); if (delete->avl_child[which_child] != NULL) AVL_SETPARENT(delete->avl_child[which_child], delete); } /* * Here we know "delete" is at least partially a leaf node. It can * be easily removed from the tree. */ ASSERT(tree->avl_numnodes > 0); --tree->avl_numnodes; parent = AVL_XPARENT(delete); which_child = AVL_XCHILD(delete); if (delete->avl_child[0] != NULL) node = delete->avl_child[0]; else node = delete->avl_child[1]; /* * Connect parent directly to node (leaving out delete). */ if (node != NULL) { AVL_SETPARENT(node, parent); AVL_SETCHILD(node, which_child); } if (parent == NULL) { tree->avl_root = node; return; } parent->avl_child[which_child] = node; /* * Since the subtree is now shorter, begin adjusting parent balances * and performing any needed rotations. */ do { /* * Move up the tree and adjust the balance * * Capture the parent and which_child values for the next * iteration before any rotations occur. */ node = parent; old_balance = AVL_XBALANCE(node); new_balance = old_balance - avl_child2balance[which_child]; parent = AVL_XPARENT(node); which_child = AVL_XCHILD(node); /* * If a node was in perfect balance but isn't anymore then * we can stop, since the height didn't change above this point * due to a deletion. */ if (old_balance == 0) { AVL_SETBALANCE(node, new_balance); break; } /* * If the new balance is zero, we don't need to rotate * else * need a rotation to fix the balance. * If the rotation doesn't change the height * of the sub-tree we have finished adjusting. */ if (new_balance == 0) AVL_SETBALANCE(node, new_balance); else if (!avl_rotation(tree, node, new_balance)) break; } while (parent != NULL); } #define AVL_REINSERT(tree, obj) \ avl_remove((tree), (obj)); \ avl_add((tree), (obj)) boolean_t avl_update_lt(avl_tree_t *t, void *obj) { void *neighbor; ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) || (t->avl_compar(obj, neighbor) <= 0)); neighbor = AVL_PREV(t, obj); if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { AVL_REINSERT(t, obj); return (B_TRUE); } return (B_FALSE); } boolean_t avl_update_gt(avl_tree_t *t, void *obj) { void *neighbor; ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) || (t->avl_compar(obj, neighbor) >= 0)); neighbor = AVL_NEXT(t, obj); if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { AVL_REINSERT(t, obj); return (B_TRUE); } return (B_FALSE); } boolean_t avl_update(avl_tree_t *t, void *obj) { void *neighbor; neighbor = AVL_PREV(t, obj); if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { AVL_REINSERT(t, obj); return (B_TRUE); } neighbor = AVL_NEXT(t, obj); if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { AVL_REINSERT(t, obj); return (B_TRUE); } return (B_FALSE); } void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) { avl_node_t *temp_node; ulong_t temp_numnodes; ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar); ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset); ASSERT3U(tree1->avl_size, ==, tree2->avl_size); temp_node = tree1->avl_root; temp_numnodes = tree1->avl_numnodes; tree1->avl_root = tree2->avl_root; tree1->avl_numnodes = tree2->avl_numnodes; tree2->avl_root = temp_node; tree2->avl_numnodes = temp_numnodes; } /* * initialize a new AVL tree */ void avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), size_t size, size_t offset) { ASSERT(tree); ASSERT(compar); ASSERT(size > 0); ASSERT(size >= offset + sizeof (avl_node_t)); #ifdef _LP64 ASSERT((offset & 0x7) == 0); #endif tree->avl_compar = compar; tree->avl_root = NULL; tree->avl_numnodes = 0; tree->avl_size = size; tree->avl_offset = offset; } /* * Delete a tree. */ /* ARGSUSED */ void avl_destroy(avl_tree_t *tree) { ASSERT(tree); ASSERT(tree->avl_numnodes == 0); ASSERT(tree->avl_root == NULL); } /* * Return the number of nodes in an AVL tree. */ ulong_t avl_numnodes(avl_tree_t *tree) { ASSERT(tree); return (tree->avl_numnodes); } boolean_t avl_is_empty(avl_tree_t *tree) { ASSERT(tree); return (tree->avl_numnodes == 0); } #define CHILDBIT (1L) /* * Post-order tree walk used to visit all tree nodes and destroy the tree * in post order. This is used for destroying a tree without paying any cost * for rebalancing it. * * example: * * void *cookie = NULL; * my_data_t *node; * * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) * free(node); * avl_destroy(tree); * * The cookie is really an avl_node_t to the current node's parent and * an indication of which child you looked at last. * * On input, a cookie value of CHILDBIT indicates the tree is done. */ void * avl_destroy_nodes(avl_tree_t *tree, void **cookie) { avl_node_t *node; avl_node_t *parent; int child; void *first; size_t off = tree->avl_offset; /* * Initial calls go to the first node or it's right descendant. */ if (*cookie == NULL) { first = avl_first(tree); /* * deal with an empty tree */ if (first == NULL) { *cookie = (void *)CHILDBIT; return (NULL); } node = AVL_DATA2NODE(first, off); parent = AVL_XPARENT(node); goto check_right_side; } /* * If there is no parent to return to we are done. */ parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT); if (parent == NULL) { if (tree->avl_root != NULL) { ASSERT(tree->avl_numnodes == 1); tree->avl_root = NULL; tree->avl_numnodes = 0; } return (NULL); } /* * Remove the child pointer we just visited from the parent and tree. */ child = (uintptr_t)(*cookie) & CHILDBIT; parent->avl_child[child] = NULL; ASSERT(tree->avl_numnodes > 1); --tree->avl_numnodes; /* * If we just did a right child or there isn't one, go up to parent. */ if (child == 1 || parent->avl_child[1] == NULL) { node = parent; parent = AVL_XPARENT(parent); goto done; } /* * Do parent's right child, then leftmost descendent. */ node = parent->avl_child[1]; while (node->avl_child[0] != NULL) { parent = node; node = node->avl_child[0]; } /* * If here, we moved to a left child. It may have one * child on the right (when balance == +1). */ check_right_side: if (node->avl_child[1] != NULL) { ASSERT(AVL_XBALANCE(node) == 1); parent = node; node = node->avl_child[1]; ASSERT(node->avl_child[0] == NULL && node->avl_child[1] == NULL); } else { ASSERT(AVL_XBALANCE(node) <= 0); } done: if (parent == NULL) { *cookie = (void *)CHILDBIT; ASSERT(node == tree->avl_root); } else { *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node)); } return (AVL_NODE2DATA(node, off)); } Index: projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c =================================================================== --- projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 287721) +++ projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 287722) @@ -1,6933 +1,6988 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (rangeing from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal arc algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each arc state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an arc list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * Arc buffers may have an associated eviction callback function. * This function will be invoked prior to removing the buffer (e.g. * in arc_do_user_evicts()). Note however that the data associated * with the buffer may be evicted prior to the callback. The callback * must be made with *no locks held* (to prevent deadlock). Additionally, * the users of callbacks must ensure that their private data is * protected from simultaneous callbacks from arc_clear_callback() * and arc_do_user_evicts(). * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif #include #include #include #include #include #include #include #ifdef illumos #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; int arc_procfd; #endif #endif /* illumos */ static kmutex_t arc_reclaim_lock; static kcondvar_t arc_reclaim_thread_cv; static boolean_t arc_reclaim_thread_exit; static kcondvar_t arc_reclaim_waiters_cv; static kmutex_t arc_user_evicts_lock; static kcondvar_t arc_user_evicts_cv; static boolean_t arc_user_evicts_thread_exit; uint_t arc_reduce_dnlc_percent = 3; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ int zfs_arc_evict_batch_limit = 10; /* * The number of sublists used for each of the arc state lists. If this * is not set to a suitable value by the user, it will be configured to * the number of CPUs on the system in arc_init(). */ int zfs_arc_num_sublists_per_state = 0; /* number of seconds before growing cache again */ static int arc_grow_retry = 60; /* shift of arc_c for calculating overflow limit in arc_get_data_buf */ int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ static int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ static int arc_shrink_shift = 7; /* * log2(fraction of ARC which must be free to allow growing). * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ int arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static int arc_min_prefetch_lifespan; /* * If this percent of memory is free, don't throttle. */ int arc_lotsfree_percent = 10; static int arc_dead; -extern int zfs_prefetch_disable; +extern boolean_t zfs_prefetch_disable; /* * The arc has filled available memory and has now warmed up. */ static boolean_t arc_warm; /* * These tunables are for performance analysis. */ uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; uint64_t zfs_arc_meta_min = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; int zfs_disable_dup_eviction = 0; uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ u_int zfs_arc_free_target = 0; static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); #ifdef _KERNEL static void arc_free_target_init(void *unused __unused) { zfs_arc_free_target = vm_pageout_wakeup_thresh; } SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, arc_free_target_init, NULL); TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); SYSCTL_DECL(_vfs_zfs); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, "Maximum ARC size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, "Minimum ARC size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, &zfs_arc_average_blocksize, 0, "ARC average blocksize"); SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, &arc_shrink_shift, 0, "log2(fraction of arc to reclaim)"); /* * We don't have a tunable for arc_free_target due to the dependency on * pagedaemon initialisation. */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), sysctl_vfs_zfs_arc_free_target, "IU", "Desired number of free pages below which ARC triggers reclaim"); static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) { u_int val; int err; val = zfs_arc_free_target; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < minfree) return (EINVAL); if (val > vm_cnt.v_page_count) return (EINVAL); zfs_arc_free_target = val; return (0); } /* * Must be declared here, before the definition of corresponding kstat * macro which uses the same names will confuse the compiler. */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_meta_limit, "QU", "ARC metadata limit"); #endif /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recentely used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache * ARC_l2c_only - exists in L2ARC but not other states * When there are no active references to the buffer, they are * are linked onto a list in one of these arc states. These are * the only buffers that can be evicted or deleted. Within each * state there are multiple lists, one for meta-data and one for * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, * etc.) is tracked separately so that it can be managed more * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies * before they are written to stable storage. By definition, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will aquire a DVA * as they are written and migrate onto the arc_mru list. * * The ARC_l2c_only state is for buffers that are in the second * level ARC but no longer in any of the ARC_m* lists. The second * level ARC itself may also contain buffers that are in any of * the ARC_m* states - meaning that a buffer can exist in two * places. The reason for the ARC_l2c_only state is to keep the * buffer header in the hash table, so that reads that hit the * second level ARC benefit from these fast lookups. */ typedef struct arc_state { /* * list of evictable buffers */ multilist_t arcs_list[ARC_BUFC_NUMTYPES]; /* * total amount of evictable data in this state */ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* * total amount of data in this state; this includes: evictable, * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. */ refcount_t arcs_size; } arc_state_t; /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; static arc_state_t ARC_mru_ghost; static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; kstat_named_t arcstat_misses; kstat_named_t arcstat_demand_data_hits; kstat_named_t arcstat_demand_data_misses; kstat_named_t arcstat_demand_metadata_hits; kstat_named_t arcstat_demand_metadata_misses; kstat_named_t arcstat_prefetch_data_hits; kstat_named_t arcstat_prefetch_data_misses; kstat_named_t arcstat_prefetch_metadata_hits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_allocated; kstat_named_t arcstat_deleted; /* * Number of buffers that could not be evicted because the hash lock * was held by another thread. The lock may not necessarily be held * by something using the same buffer, since hash locks are shared * by multiple buffers. */ kstat_named_t arcstat_mutex_miss; /* * Number of buffers skipped because they have I/O in progress, are * indrect prefetch buffers that have not lived long enough, or are * not from the spa we're trying to evict from. */ kstat_named_t arcstat_evict_skip; /* * Number of times arc_evict_state() was unable to evict enough * buffers to reach it's target amount. */ kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; kstat_named_t arcstat_p; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually * backed by ARC buffers. This includes arc_buf_hdr_t structures * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only * caches), and arc_buf_t structures (allocated via arc_buf_t * cache). */ kstat_named_t arcstat_hdr_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_DATA. This is generally consumed by buffers backing * on disk user data (e.g. plain file contents). */ kstat_named_t arcstat_data_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_METADATA. This is generally consumed by buffers * backing on disk data that is used for internal ZFS * structures (e.g. ZAP, dnode, indirect blocks, etc). */ kstat_named_t arcstat_metadata_size; /* * Number of bytes consumed by various buffers and structures * not actually backed with ARC buffers. This includes bonus * buffers (allocated directly via zio_buf_* functions), * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t * cache), and dnode_t structures (allocated via dnode_t cache). */ kstat_named_t arcstat_other_size; /* * Total number of bytes consumed by ARC buffers residing in the * arc_anon state. This includes *all* buffers in the arc_anon * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_anon_size; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mru state. This includes *all* buffers in the arc_mru * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mru_size; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mru_ghost state. The key thing to note * here, is the fact that this size doesn't actually indicate * RAM consumption. The ghost lists only consist of headers and * don't actually have ARC buffers linked off of these headers. * Thus, *if* the headers had associated ARC buffers, these * buffers *would have* consumed this number of bytes. */ kstat_named_t arcstat_mru_ghost_size; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mfu state. This includes *all* buffers in the arc_mfu * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mfu_size; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu * state. */ kstat_named_t arcstat_mfu_evictable_data; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_METADATA, and reside in the * arc_mfu state. */ kstat_named_t arcstat_mfu_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mfu_ghost state. See the comment above * arcstat_mru_ghost_size for more details. */ kstat_named_t arcstat_mfu_ghost_size; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_metadata; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_read_bytes; kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_l2_compress_successes; kstat_named_t arcstat_l2_compress_zeros; kstat_named_t arcstat_l2_compress_failures; kstat_named_t arcstat_l2_write_trylock_fail; kstat_named_t arcstat_l2_write_passed_headroom; kstat_named_t arcstat_l2_write_spa_mismatch; kstat_named_t arcstat_l2_write_in_l2; kstat_named_t arcstat_l2_write_hdr_io_in_progress; kstat_named_t arcstat_l2_write_not_cacheable; kstat_named_t arcstat_l2_write_full; kstat_named_t arcstat_l2_write_buffer_iter; kstat_named_t arcstat_l2_write_pios; kstat_named_t arcstat_l2_write_buffer_bytes_scanned; kstat_named_t arcstat_l2_write_buffer_list_iter; kstat_named_t arcstat_l2_write_buffer_list_null_iter; kstat_named_t arcstat_memory_throttle_count; kstat_named_t arcstat_duplicate_buffers; kstat_named_t arcstat_duplicate_buffers_size; kstat_named_t arcstat_duplicate_reads; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; + kstat_named_t arcstat_sync_wait_for_async; + kstat_named_t arcstat_demand_hit_predictive_prefetch; } arc_stats_t; static arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "allocated", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "p", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "other_size", KSTAT_DATA_UINT64 }, { "anon_size", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_compress_successes", KSTAT_DATA_UINT64 }, { "l2_compress_zeros", KSTAT_DATA_UINT64 }, { "l2_compress_failures", KSTAT_DATA_UINT64 }, { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, { "l2_write_in_l2", KSTAT_DATA_UINT64 }, { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, { "l2_write_full", KSTAT_DATA_UINT64 }, { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, { "l2_write_pios", KSTAT_DATA_UINT64 }, { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "duplicate_buffers", KSTAT_DATA_UINT64 }, { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, { "duplicate_reads", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, - { "arc_meta_min", KSTAT_DATA_UINT64 } + { "arc_meta_min", KSTAT_DATA_UINT64 }, + { "sync_wait_for_async", KSTAT_DATA_UINT64 }, + { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ atomic_add_64(&arc_stats.stat.value.ui64, (val)) #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } #define ARCSTAT_MAXSTAT(stat) \ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; static arc_state_t *arc_mfu_ghost; static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ #define L2ARC_IS_VALID_COMPRESS(_c_) \ ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_done_func_t *acb_done; arc_buf_t *acb_buf; zio_t *acb_zio_dummy; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { void *awcb_private; arc_done_func_t *awcb_ready; arc_done_func_t *awcb_physdone; arc_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; /* * ARC buffers are separated into multiple structs as a memory saving measure: * - Common fields struct, always defined, and embedded within it: * - L2-only fields, always allocated but undefined when not in L2ARC * - L1-only fields, only allocated when in L1ARC * * Buffer in L1 Buffer only in L2 * +------------------------+ +------------------------+ * | arc_buf_hdr_t | | arc_buf_hdr_t | * | | | | * | | | | * | | | | * +------------------------+ +------------------------+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | * | (undefined if L1-only) | | | * +------------------------+ +------------------------+ * | l1arc_buf_hdr_t | * | | * | | * | | * | | * +------------------------+ * * Because it's possible for the L2ARC to become extremely large, we can wind * up eating a lot of memory in L2ARC buffer headers, so the size of a header * is minimized by only allocating the fields necessary for an L1-cached buffer * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple * words in pointers. arc_hdr_realloc() is used to switch a header between * these two allocation states. */ typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; #ifdef ZFS_DEBUG /* * used for debugging wtih kmem_flags - by allocating and freeing * b_thawed when the buffer is thawed, we get a record of the stack * trace that thawed it. */ void *b_thawed; #endif arc_buf_t *b_buf; uint32_t b_datacnt; /* for waiting on writes to complete */ kcondvar_t b_cv; /* protected by arc state mutex */ arc_state_t *b_state; multilist_node_t b_arc_node; /* updated atomically */ clock_t b_arc_access; /* self protecting */ refcount_t b_refcnt; arc_callback_t *b_acb; /* temporary buffer holder for in-flight compressed data */ void *b_tmp_cdata; } l1arc_buf_hdr_t; typedef struct l2arc_dev l2arc_dev_t; typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ /* real alloc'd buffer size depending on b_compress applied */ int32_t b_asize; + uint8_t b_compress; list_node_t b_l2node; } l2arc_buf_hdr_t; struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; /* * Even though this checksum is only set/verified when a buffer is in * the L1 cache, it needs to be in the set of common fields because it * must be preserved from the time before a buffer is written out to * L2ARC until after it is read back in. */ zio_cksum_t *b_freeze_cksum; arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; /* immutable */ int32_t b_size; uint64_t b_spa; /* L2ARC fields. Undefined when not in L2ARC. */ l2arc_buf_hdr_t b_l2hdr; /* L1ARC fields. Undefined when in l2arc_only state */ l1arc_buf_hdr_t b_l1hdr; }; #ifdef _KERNEL static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = arc_meta_limit; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val <= 0 || val > arc_c_max) return (EINVAL); arc_meta_limit = val; return (0); } #endif static arc_buf_t *arc_eviction_list; static arc_buf_hdr_t arc_eviction_hdr; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) -/* For storing compression mode in b_flags */ -#define HDR_COMPRESS_OFFSET 24 -#define HDR_COMPRESS_NBITS 7 - -#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \ - HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS)) -#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \ - HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp)) - /* * Other sizes */ #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ #define HT_LOCK_PAD CACHE_LINE_SIZE struct ht_lock { kmutex_t ht_lock; #ifdef _KERNEL unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; #endif }; #define BUF_LOCKS 256 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ /* * Used to distinguish headers that are being process by * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk * address. This can happen when the header is added to the l2arc's list * of buffers to write in the first stage of l2arc_write_buffers(), but * has not yet been written out which happens in the second stage of * l2arc_write_buffers(). */ #define L2ARC_ADDR_UNSET ((uint64_t)(-1)) #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) /* L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, &l2arc_write_max, 0, "max write size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, &l2arc_write_boost, 0, "extra write during warmup"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, &l2arc_headroom, 0, "number of dev writes"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, &l2arc_feed_secs, 0, "interval seconds"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, &l2arc_feed_min_ms, 0, "min interval milliseconds"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, &l2arc_noprefetch, 0, "don't cache prefetch bufs"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, &l2arc_feed_again, 0, "turbo warmup"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, &l2arc_norw, 0, "no reads during writes"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); /* * L2ARC Internals */ struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ kmutex_t l2ad_mtx; /* lock for buffer list */ list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ refcount_t l2ad_alloc; /* allocated bytes */ }; static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_t *l2rcb_buf; /* read buffer */ spa_t *l2rcb_spa; /* spa */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ enum zio_compress l2rcb_compress; /* applied compress */ } l2arc_read_callback_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; size_t l2df_size; void (*l2df_func)(void *, size_t); list_node_t l2df_list_node; } l2arc_data_free_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static void arc_get_data_buf(arc_buf_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); static void l2arc_release_cdata_buf(arc_buf_hdr_t *); static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (i = 0; i < sizeof (dva_t); i++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; crc ^= (spa>>8) ^ birth; return (crc); } #define BUF_EMPTY(buf) \ ((buf)->b_dva.dva_word[0] == 0 && \ (buf)->b_dva.dva_word[1] == 0) #define BUF_EQUAL(spa, dva, birth, buf) \ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((buf)->b_birth == birth) && ((buf)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (BUF_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } ARCSTAT_BUMP(arcstat_hash_elements); ARCSTAT_MAXSTAT(arcstat_hash_elements); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT(fhdr != NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { int i; kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ /* ARGSUSED */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_FULL_SIZE); cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); multilist_link_init(&hdr->b_l1hdr.b_arc_node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } /* ARGSUSED */ static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } /* ARGSUSED */ static int buf_cons(void *vbuf, void *unused, int kmflag) { arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ /* ARGSUSED */ static void hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(BUF_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } /* ARGSUSED */ static void hdr_l2only_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(BUF_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ static void buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } /* * Reclaim callback -- invoked when memory is low. */ /* ARGSUSED */ static void hdr_recl(void *unused) { dprintf("hdr_recl called\n"); /* * umem calls the reclaim func when we destroy the buf cache, * which is after we do arc_fini(). */ if (!arc_dead) cv_signal(&arc_reclaim_thread_cv); } static void buf_init(void) { uint64_t *ct; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) { mutex_init(&buf_hash_table.ht_locks[i].ht_lock, NULL, MUTEX_DEFAULT, NULL); } } /* * Transition between the two allocation states for the arc_buf_hdr struct. * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. */ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { ASSERT(HDR_HAS_L2HDR(hdr)); arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); if (new == hdr_full_cache) { nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL); } else { ASSERT(hdr->b_l1hdr.b_buf == NULL); ASSERT0(hdr->b_l1hdr.b_datacnt); /* * If we've reached here, We must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) refcount_remove_many(&dev->l2ad_alloc, hdr->b_l2hdr.b_asize, hdr); (void) refcount_add_many(&dev->l2ad_alloc, nhdr->b_l2hdr.b_asize, nhdr); buf_discard_identity(hdr); hdr->b_freeze_cksum = NULL; kmem_cache_free(old, hdr); return (nhdr); } #define ARC_MINTIME (hz>>4) /* 62 ms */ static void arc_cksum_verify(arc_buf_t *buf) { zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); } static int arc_cksum_equal(arc_buf_t *buf) { zio_cksum_t zc; int equal; mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return (equal); } static void arc_cksum_compute(arc_buf_t *buf, boolean_t force) { if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_watch(buf); #endif } #ifdef illumos #ifndef _KERNEL typedef struct procctl { long cmd; prwatch_t prwatch; } procctl_t; #endif /* ARGSUSED */ static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { int result; procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; ctl.prwatch.pr_size = 0; ctl.prwatch.pr_wflags = 0; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); } #endif } /* ARGSUSED */ static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { int result; procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; ctl.prwatch.pr_size = buf->b_hdr->b_size; ctl.prwatch.pr_wflags = WA_WRITE; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); } #endif } #endif /* illumos */ static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { if (HDR_ISTYPE_METADATA(hdr)) { return (ARC_BUFC_METADATA); } else { return (ARC_BUFC_DATA); } } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); if (HDR_IO_IN_PROGRESS(buf->b_hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } #ifdef ZFS_DEBUG if (zfs_flags & ZFS_DEBUG_MODIFY) { if (buf->b_hdr->b_l1hdr.b_thawed != NULL) kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); } #endif mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_unwatch(buf); #endif } void arc_buf_freeze(arc_buf_t *buf) { kmutex_t *hash_lock; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); mutex_exit(hash_lock); } static void add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(MUTEX_HELD(hash_lock)); arc_state_t *state = hdr->b_l1hdr.b_state; if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && (state != arc_anon)) { /* We don't use the L2-only state list. */ if (state != arc_l2c_only) { arc_buf_contents_t type = arc_buf_type(hdr); uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; multilist_t *list = &state->arcs_list[type]; uint64_t *size = &state->arcs_lsize[type]; multilist_remove(list, hdr); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_datacnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); delta = hdr->b_size; } ASSERT(delta > 0); ASSERT3U(*size, >=, delta); atomic_add_64(size, -delta); } /* remove the prefetch flag if we get a reference */ hdr->b_flags &= ~ARC_FLAG_PREFETCH; } } static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); /* * arc_l2c_only counts as a ghost state so we don't need to explicitly * check to prevent usage of the arc_l2c_only list. */ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { arc_buf_contents_t type = arc_buf_type(hdr); multilist_t *list = &state->arcs_list[type]; uint64_t *size = &state->arcs_lsize[type]; multilist_insert(list, hdr); ASSERT(hdr->b_l1hdr.b_datacnt > 0); atomic_add_64(size, hdr->b_size * hdr->b_l1hdr.b_datacnt); } return (cnt); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *old_state; int64_t refcnt; uint32_t datacnt; uint64_t from_delta, to_delta; arc_buf_contents_t buftype = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); datacnt = hdr->b_l1hdr.b_datacnt; } else { old_state = arc_l2c_only; refcnt = 0; datacnt = 0; } ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); ASSERT(refcnt == 0 || datacnt > 0); ASSERT(!GHOST_STATE(new_state) || datacnt == 0); ASSERT(old_state != arc_anon || datacnt <= 1); from_delta = to_delta = datacnt * hdr->b_size; /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { uint64_t *size = &old_state->arcs_lsize[buftype]; ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); /* * If prefetching out of the ghost cache, * we will have a non-zero datacnt. */ if (GHOST_STATE(old_state) && datacnt == 0) { /* ghost elements have a ghost size */ ASSERT(hdr->b_l1hdr.b_buf == NULL); from_delta = hdr->b_size; } ASSERT3U(*size, >=, from_delta); atomic_add_64(size, -from_delta); } if (new_state != arc_anon && new_state != arc_l2c_only) { uint64_t *size = &new_state->arcs_lsize[buftype]; /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { ASSERT0(datacnt); ASSERT(hdr->b_l1hdr.b_buf == NULL); to_delta = hdr->b_size; } atomic_add_64(size, to_delta); } } ASSERT(!BUF_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (to_delta && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { ASSERT0(datacnt); /* * We moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a * datacnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. */ (void) refcount_add_many(&new_state->arcs_size, hdr->b_size, hdr); } else { ASSERT3U(datacnt, !=, 0); /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { (void) refcount_add_many(&new_state->arcs_size, hdr->b_size, buf); } } } if (from_delta && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { /* * When moving a header off of a ghost state, * there's the possibility for datacnt to be * non-zero. This is because we first add the * arc buffer to the header prior to changing * the header's state. Since we used the header * for the reference when putting the header on * the ghost state, we must balance that and use * the header when removing off the ghost state * (even though datacnt is non zero). */ IMPLY(datacnt == 0, new_state == arc_anon || new_state == arc_l2c_only); (void) refcount_remove_many(&old_state->arcs_size, hdr->b_size, hdr); } else { ASSERT3P(datacnt, !=, 0); /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { (void) refcount_remove_many( &old_state->arcs_size, hdr->b_size, buf); } } } if (HDR_HAS_L1HDR(hdr)) hdr->b_l1hdr.b_state = new_state; /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. */ ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, space); break; case ARC_SPACE_OTHER: ARCSTAT_INCR(arcstat_other_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: ARCSTAT_INCR(arcstat_l2_hdr_size, space); break; } if (type != ARC_SPACE_DATA) ARCSTAT_INCR(arcstat_meta_used, space); atomic_add_64(&arc_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, -space); break; case ARC_SPACE_OTHER: ARCSTAT_INCR(arcstat_other_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: ARCSTAT_INCR(arcstat_l2_hdr_size, -space); break; } if (type != ARC_SPACE_DATA) { ASSERT(arc_meta_used >= space); if (arc_meta_max < arc_meta_used) arc_meta_max = arc_meta_used; ARCSTAT_INCR(arcstat_meta_used, -space); } ASSERT(arc_size >= space); atomic_add_64(&arc_size, -space); } arc_buf_t * arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; arc_buf_t *buf; ASSERT3U(size, >, 0); hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); ASSERT3P(hdr->b_freeze_cksum, ==, NULL); hdr->b_size = size; hdr->b_spa = spa_load_guid(spa); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; hdr->b_flags = arc_bufc_to_flags(type); hdr->b_flags |= ARC_FLAG_HAS_L1HDR; hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_datacnt = 1; hdr->b_l1hdr.b_tmp_cdata = NULL; arc_get_data_buf(buf); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); return (buf); } static char *arc_onloan_tag = "onloan"; /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, int size) { arc_buf_t *buf; buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); atomic_add_64(&arc_loaned_bytes, size); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -hdr->b_size); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); buf->b_efunc = NULL; buf->b_private = NULL; atomic_add_64(&arc_loaned_bytes, hdr->b_size); } static arc_buf_t * arc_buf_clone(arc_buf_t *from) { arc_buf_t *buf; arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_state != arc_anon); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = hdr->b_l1hdr.b_buf; hdr->b_l1hdr.b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); /* * This buffer already exists in the arc so create a duplicate * copy for the caller. If the buffer is associated with user data * then track the size and number of duplicates. These stats will be * updated as duplicate buffers are created and destroyed. */ if (HDR_ISTYPE_DATA(hdr)) { ARCSTAT_BUMP(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); } hdr->b_l1hdr.b_datacnt += 1; return (buf); } void arc_buf_add_ref(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; /* * Check to see if this buffer is evicted. Callers * must verify b_data != NULL to know if the add_ref * was successful. */ mutex_enter(&buf->b_evict_lock); if (buf->b_data == NULL) { mutex_exit(&buf->b_evict_lock); return; } hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); mutex_exit(&buf->b_evict_lock); ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); add_reference(hdr, hash_lock, tag); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } static void arc_buf_free_on_write(void *data, size_t size, void (*free_func)(void *, size_t)) { l2arc_data_free_t *df; df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_data = data; df->l2df_size = size; df->l2df_func = free_func; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } /* * Free the arc data buffer. If it is an l2arc write in progress, * the buffer is placed on l2arc_free_on_write to be freed later. */ static void arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) { arc_buf_hdr_t *hdr = buf->b_hdr; if (HDR_L2_WRITING(hdr)) { arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { free_func(buf->b_data, hdr->b_size); } } static void arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L2HDR(hdr)); ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); /* * The b_tmp_cdata field is linked off of the b_l1hdr, so if * that doesn't exist, the header is in the arc_l2c_only state, * and there isn't anything to free (it's already been freed). */ if (!HDR_HAS_L1HDR(hdr)) return; /* * The header isn't being written to the l2arc device, thus it * shouldn't have a b_tmp_cdata to free. */ if (!HDR_L2_WRITING(hdr)) { ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); return; } /* * The header does not have compression enabled. This can be due * to the buffer not being compressible, or because we're * freeing the buffer before the second phase of * l2arc_write_buffer() has started (which does the compression * step). In either case, b_tmp_cdata does not point to a * separately compressed buffer, so there's nothing to free (it * points to the same buffer as the arc_buf_t's b_data field). */ - if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) { hdr->b_l1hdr.b_tmp_cdata = NULL; return; } /* * There's nothing to free since the buffer was all zero's and * compressed to a zero length buffer. */ - if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) { + if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) { ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); return; } - ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr))); + ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress)); arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size, zio_data_buf_free); ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); hdr->b_l1hdr.b_tmp_cdata = NULL; } /* * Free up buf->b_data and if 'remove' is set, then pull the * arc_buf_t off of the the arc_buf_hdr_t's list and free it. */ static void arc_buf_destroy(arc_buf_t *buf, boolean_t remove) { arc_buf_t **bufp; /* free up data associated with the buf */ if (buf->b_data != NULL) { arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; uint64_t size = buf->b_hdr->b_size; arc_buf_contents_t type = arc_buf_type(buf->b_hdr); arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); #endif if (type == ARC_BUFC_METADATA) { arc_buf_data_free(buf, zio_buf_free); arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_buf_data_free(buf, zio_data_buf_free); arc_space_return(size, ARC_SPACE_DATA); } /* protected by hash lock, if in the hash table */ if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { uint64_t *cnt = &state->arcs_lsize[type]; ASSERT(refcount_is_zero( &buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); ASSERT3U(*cnt, >=, size); atomic_add_64(cnt, -size); } (void) refcount_remove_many(&state->arcs_size, size, buf); buf->b_data = NULL; /* * If we're destroying a duplicate buffer make sure * that the appropriate statistics are updated. */ if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(buf->b_hdr)) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); } ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); buf->b_hdr->b_l1hdr.b_datacnt -= 1; } /* only remove the buf if requested */ if (!remove) return; /* remove the buf from the hdr list */ for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; bufp = &(*bufp)->b_next) continue; *bufp = buf->b_next; buf->b_next = NULL; ASSERT(buf->b_efunc == NULL); /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); /* * We don't want to leak the b_tmp_cdata buffer that was * allocated in l2arc_write_buffers() */ arc_buf_l2_cdata_free(hdr); /* * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then * this header is being processed by l2arc_write_buffers() (i.e. * it's in the first stage of l2arc_write_buffers()). * Re-affirming that truth here, just to serve as a reminder. If * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or * may not have its HDR_L2_WRITING flag set. (the write may have * completed, in which case HDR_L2_WRITING will be false and the * b_daddr field will point to the address of the buffer on disk). */ IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); /* * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with * l2arc_write_buffers(). Since we've just removed this header * from the l2arc buffer list, this header will never reach the * second stage of l2arc_write_buffers(), which increments the * accounting stats for this header. Thus, we must be careful * not to decrement them for this header either. */ if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); vdev_space_update(dev->l2ad_vdev, -l2hdr->b_asize, 0, 0); (void) refcount_remove_many(&dev->l2ad_alloc, l2hdr->b_asize, hdr); } hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || hdr->b_l1hdr.b_datacnt > 0); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) { if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); arc_hdr_l2hdr_destroy(hdr); } if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } if (!BUF_EMPTY(hdr)) buf_discard_identity(hdr); if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } if (HDR_HAS_L1HDR(hdr)) { while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (buf->b_efunc != NULL) { mutex_enter(&arc_user_evicts_lock); mutex_enter(&buf->b_evict_lock); ASSERT(buf->b_hdr != NULL); arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); hdr->b_l1hdr.b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; mutex_exit(&buf->b_evict_lock); cv_signal(&arc_user_evicts_cv); mutex_exit(&arc_user_evicts_lock); } else { arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); } } #ifdef ZFS_DEBUG if (hdr->b_l1hdr.b_thawed != NULL) { kmem_free(hdr->b_l1hdr.b_thawed, 1); hdr->b_l1hdr.b_thawed = NULL; } #endif } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_free(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; int hashed = hdr->b_l1hdr.b_state != arc_anon; ASSERT(buf->b_efunc == NULL); ASSERT(buf->b_data != NULL); if (hashed) { kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_l1hdr.b_datacnt > 1) { arc_buf_destroy(buf, TRUE); } else { ASSERT(buf == hdr->b_l1hdr.b_buf); ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } mutex_exit(hash_lock); } else if (HDR_IO_IN_PROGRESS(hdr)) { int destroy_hdr; /* * We are in the middle of an async write. Don't destroy * this buffer unless the write completes before we finish * decrementing the reference count. */ mutex_enter(&arc_user_evicts_lock); (void) remove_reference(hdr, NULL, tag); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); mutex_exit(&arc_user_evicts_lock); if (destroy_hdr) arc_hdr_destroy(hdr); } else { if (remove_reference(hdr, NULL, tag) > 0) arc_buf_destroy(buf, TRUE); else arc_hdr_destroy(hdr); } } boolean_t arc_buf_remove_ref(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); boolean_t no_callback = (buf->b_efunc == NULL); if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT(hdr->b_l1hdr.b_datacnt == 1); arc_buf_free(buf, tag); return (no_callback); } mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT(hdr->b_l1hdr.b_datacnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT(hdr->b_l1hdr.b_state != arc_anon); ASSERT(buf->b_data != NULL); (void) remove_reference(hdr, hash_lock, tag); if (hdr->b_l1hdr.b_datacnt > 1) { if (no_callback) arc_buf_destroy(buf, TRUE); } else if (no_callback) { ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); mutex_exit(hash_lock); return (no_callback); } int32_t arc_buf_size(arc_buf_t *buf) { return (buf->b_hdr->b_size); } /* * Called from the DMU to determine if the current buffer should be * evicted. In order to ensure proper locking, the eviction must be initiated * from the DMU. Return true if the buffer is associated with user data and * duplicate buffers still exist. */ boolean_t arc_buf_eviction_needed(arc_buf_t *buf) { arc_buf_hdr_t *hdr; boolean_t evict_needed = B_FALSE; if (zfs_disable_dup_eviction) return (B_FALSE); mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(); let that function * perform the eviction. */ ASSERT(buf->b_data == NULL); mutex_exit(&buf->b_evict_lock); return (B_FALSE); } else if (buf->b_data == NULL) { /* * We have already been added to the arc eviction list; * recommend eviction. */ ASSERT3P(hdr, ==, &arc_eviction_hdr); mutex_exit(&buf->b_evict_lock); return (B_TRUE); } if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) evict_needed = B_TRUE; mutex_exit(&buf->b_evict_lock); return (evict_needed); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on it's state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(hdr->b_l1hdr.b_buf == NULL); /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. it's b_tmp_cdata field) during it's write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing it's L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += hdr->b_size; DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr, hash_lock); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < arc_min_prefetch_lifespan)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { ARCSTAT_BUMP(arcstat_mutex_miss); break; } if (buf->b_data != NULL) bytes_evicted += hdr->b_size; if (buf->b_efunc != NULL) { mutex_enter(&arc_user_evicts_lock); arc_buf_destroy(buf, FALSE); hdr->b_l1hdr.b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; arc_eviction_list = buf; cv_signal(&arc_user_evicts_cv); mutex_exit(&arc_user_evicts_lock); mutex_exit(&buf->b_evict_lock); } else { mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, TRUE); } } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); else ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); } if (hdr->b_l1hdr.b_datacnt == 0) { arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } return (bytes_evicted); } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, int64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; int evict_count = 0; ASSERT3P(marker, !=, NULL); IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); mls = multilist_sublist_lock(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; hdr = multilist_sublist_prev(mls, marker)) { if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || (evict_count >= zfs_arc_evict_batch_limit)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). */ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. * dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. */ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t evicted = arc_evict_hdr(hdr, hash_lock); mutex_exit(hash_lock); bytes_evicted += evicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count++; /* * If arc_size isn't overflowing, signal any * threads that might happen to be waiting. * * For each header evicted, we wake up a single * thread. If we used cv_broadcast, we could * wake up "too many" threads causing arc_size * to significantly overflow arc_c; since * arc_get_data_buf() doesn't check for overflow * when it's woken up (it doesn't because it's * possible for the ARC to be overflowing while * full of un-evictable buffers, and the * function should proceed in this case). * * If threads are left sleeping, due to not * using cv_broadcast, they will be woken up * just before arc_reclaim_thread() sleeps. */ mutex_enter(&arc_reclaim_lock); if (!arc_is_overflowing()) cv_signal(&arc_reclaim_waiters_cv); mutex_exit(&arc_reclaim_lock); } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); return (bytes_evicted); } /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); num_sublists = multilist_get_num_sublists(ml); /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. */ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); for (int i = 0; i < num_sublists; i++) { markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. This fact is used in arc_adjust_type() and * arc_evict_state_impl(). */ markers[i]->b_spa = 0; multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; for (int i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; if (bytes == ARC_EVICT_ALL) bytes_remaining = ARC_EVICT_ALL; else if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; /* we've reached the end, wrap to the beginning */ if (++sublist_idx >= num_sublists) sublist_idx = 0; } /* * If we didn't evict anything during this scan, we have * no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); kmem_cache_free(hdr_full_cache, markers[i]); } kmem_free(markers, sizeof (*markers) * num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. */ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (state->arcs_lsize[type] != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) break; } return (evicted); } /* * Evict the specified number of bytes from the state specified, * restricting eviction to the spa and type given. This function * prevents us from trying to evict more from a state's list than * is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { int64_t delta; if (bytes > 0 && state->arcs_lsize[type] > 0) { delta = MIN(state->arcs_lsize[type], bytes); return (arc_evict_state(state, spa, delta, type)); } return (0); } /* * Evict metadata buffers from the cache, such that arc_meta_used is * capped by the arc_meta_limit tunable. */ static uint64_t arc_adjust_meta(void) { uint64_t total_evicted = 0; int64_t target; /* * If we're over the meta limit, we want to evict enough * metadata to get back under the meta limit. We don't want to * evict so much that we drop the MRU below arc_p, though. If * we're over the meta limit more than we're over arc_p, we * evict some from the MRU here, and some from the MFU below. */ target = MIN((int64_t)(arc_meta_used - arc_meta_limit), (int64_t)(refcount_count(&arc_anon->arcs_size) + refcount_count(&arc_mru->arcs_size) - arc_p)); total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); /* * Similar to the above, we want to evict enough bytes to get us * below the meta limit, but not so much as to drop us below the * space alloted to the MFU (which is defined as arc_c - arc_p). */ target = MIN((int64_t)(arc_meta_used - arc_meta_limit), (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); return (total_evicted); } /* * Return the type of the oldest buffer in the given arc state * * This function will select a random sublist of type ARC_BUFC_DATA and * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist * is compared, and the type which contains the "older" buffer will be * returned. */ static arc_buf_contents_t arc_adjust_type(arc_state_t *state) { multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; int data_idx = multilist_get_random_index(data_ml); int meta_idx = multilist_get_random_index(meta_ml); multilist_sublist_t *data_mls; multilist_sublist_t *meta_mls; arc_buf_contents_t type; arc_buf_hdr_t *data_hdr; arc_buf_hdr_t *meta_hdr; /* * We keep the sublist lock until we're finished, to prevent * the headers from being destroyed via arc_evict_state(). */ data_mls = multilist_sublist_lock(data_ml, data_idx); meta_mls = multilist_sublist_lock(meta_ml, meta_idx); /* * These two loops are to ensure we skip any markers that * might be at the tail of the lists due to arc_evict_state(). */ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { if (data_hdr->b_spa != 0) break; } for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { if (meta_hdr->b_spa != 0) break; } if (data_hdr == NULL && meta_hdr == NULL) { type = ARC_BUFC_DATA; } else if (data_hdr == NULL) { ASSERT3P(meta_hdr, !=, NULL); type = ARC_BUFC_METADATA; } else if (meta_hdr == NULL) { ASSERT3P(data_hdr, !=, NULL); type = ARC_BUFC_DATA; } else { ASSERT3P(data_hdr, !=, NULL); ASSERT3P(meta_hdr, !=, NULL); /* The headers can't be on the sublist without an L1 header */ ASSERT(HDR_HAS_L1HDR(data_hdr)); ASSERT(HDR_HAS_L1HDR(meta_hdr)); if (data_hdr->b_l1hdr.b_arc_access < meta_hdr->b_l1hdr.b_arc_access) { type = ARC_BUFC_DATA; } else { type = ARC_BUFC_METADATA; } } multilist_sublist_unlock(meta_mls); multilist_sublist_unlock(data_mls); return (type); } /* * Evict buffers from the cache, such that arc_size is capped by arc_c. */ static uint64_t arc_adjust(void) { uint64_t total_evicted = 0; uint64_t bytes; int64_t target; /* * If we're over arc_meta_limit, we want to correct that before * potentially evicting data buffers below. */ total_evicted += arc_adjust_meta(); /* * Adjust MRU size * * If we're over the target cache size, we want to evict enough * from the list to get back to our target size. We don't want * to evict too much from the MRU, such that it drops below * arc_p. So, if we're over our target cache size more than * the MRU is over arc_p, we'll evict enough to get back to * arc_p here, and then evict more from the MFU below. */ target = MIN((int64_t)(arc_size - arc_c), (int64_t)(refcount_count(&arc_anon->arcs_size) + refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p)); /* * If we're below arc_meta_min, always prefer to evict data. * Otherwise, try to satisfy the requested number of bytes to * evict from the type which contains older buffers; in an * effort to keep newer buffers in the cache regardless of their * type. If we cannot satisfy the number of bytes from this * type, spill over into the next type. */ if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && arc_meta_used > arc_meta_min) { bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); } else { bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from metadata. */ target -= bytes; total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); } /* * Adjust MFU size * * Now that we've tried to evict enough from the MRU to get its * size back to arc_p, if we're still above the target cache * size, we evict the rest from the MFU. */ target = arc_size - arc_c; if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && arc_meta_used > arc_meta_min) { bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); } else { bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from data. */ target -= bytes; total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); } /* * Adjust ghost lists * * In addition to the above, the ARC also defines target values * for the ghost lists. The sum of the mru list and mru ghost * list should never exceed the target size of the cache, and * the sum of the mru list, mfu list, mru ghost list, and mfu * ghost list should never exceed twice the target size of the * cache. The following logic enforces these limits on the ghost * caches, and evicts from them as needed. */ target = refcount_count(&arc_mru->arcs_size) + refcount_count(&arc_mru_ghost->arcs_size) - arc_c; bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); /* * We assume the sum of the mru list and mfu list is less than * or equal to arc_c (we enforced this above), which means we * can use the simpler of the two equations below: * * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c * mru ghost + mfu ghost <= arc_c */ target = refcount_count(&arc_mru_ghost->arcs_size) + refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); return (total_evicted); } static void arc_do_user_evicts(void) { mutex_enter(&arc_user_evicts_lock); while (arc_eviction_list != NULL) { arc_buf_t *buf = arc_eviction_list; arc_eviction_list = buf->b_next; mutex_enter(&buf->b_evict_lock); buf->b_hdr = NULL; mutex_exit(&buf->b_evict_lock); mutex_exit(&arc_user_evicts_lock); if (buf->b_efunc != NULL) VERIFY0(buf->b_efunc(buf->b_private)); buf->b_efunc = NULL; buf->b_private = NULL; kmem_cache_free(buf_cache, buf); mutex_enter(&arc_user_evicts_lock); } mutex_exit(&arc_user_evicts_lock); } void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* * If retry is TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ ASSERT(!retry || spa == 0); if (spa != NULL) guid = spa_load_guid(spa); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); arc_do_user_evicts(); ASSERT(spa || arc_eviction_list == NULL); } void arc_shrink(int64_t to_free) { if (arc_c > arc_c_min) { DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, arc_c_min, uint64_t, arc_p, uint64_t, to_free); if (arc_c > arc_c_min + to_free) atomic_add_64(&arc_c, -to_free); else arc_c = arc_c_min; atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); if (arc_c > arc_size) arc_c = MAX(arc_size, arc_c_min); if (arc_p > arc_c) arc_p = (arc_c >> 1); DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, arc_p); ASSERT(arc_c >= arc_c_min); ASSERT((int64_t)arc_p >= 0); } if (arc_size > arc_c) { DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, uint64_t, arc_c); (void) arc_adjust(); } } static long needfree = 0; typedef enum free_memory_reason_t { FMR_UNKNOWN, FMR_NEEDFREE, FMR_LOTSFREE, FMR_SWAPFS_MINFREE, FMR_PAGES_PP_MAXIMUM, FMR_HEAP_ARENA, FMR_ZIO_ARENA, FMR_ZIO_FRAG, } free_memory_reason_t; int64_t last_free_memory; free_memory_reason_t last_free_reason; /* * Additional reserve of pages for pp_reserve. */ int64_t arc_pages_pp_reserve = 64; /* * Additional reserve of pages for swapfs. */ int64_t arc_swapfs_reserve = 64; /* * Return the amount of memory that can be consumed before reclaim will be * needed. Positive if there is sufficient free memory, negative indicates * the amount of memory that needs to be freed up. */ static int64_t arc_available_memory(void) { int64_t lowest = INT64_MAX; int64_t n; free_memory_reason_t r = FMR_UNKNOWN; #ifdef _KERNEL if (needfree > 0) { n = PAGESIZE * (-needfree); if (n < lowest) { lowest = n; r = FMR_NEEDFREE; } } /* * Cooperate with pagedaemon when it's time for it to scan * and reclaim some pages. */ n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); if (n < lowest) { lowest = n; r = FMR_LOTSFREE; } #ifdef illumos /* * check that we're out of range of the pageout scanner. It starts to * schedule paging if freemem is less than lotsfree and needfree. * lotsfree is the high-water mark for pageout, and needfree is the * number of needed free pages. We add extra pages here to make sure * the scanner doesn't start up while we're freeing memory. */ n = PAGESIZE * (freemem - lotsfree - needfree - desfree); if (n < lowest) { lowest = n; r = FMR_LOTSFREE; } /* * check to make sure that swapfs has enough space so that anon * reservations can still succeed. anon_resvmem() checks that the * availrmem is greater than swapfs_minfree, and the number of reserved * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. */ n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - desfree - arc_swapfs_reserve); if (n < lowest) { lowest = n; r = FMR_SWAPFS_MINFREE; } /* * Check that we have enough availrmem that memory locking (e.g., via * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum * stores the number of pages that cannot be locked; when availrmem * drops below pages_pp_maximum, page locking mechanisms such as * page_pp_lock() will fail.) */ n = PAGESIZE * (availrmem - pages_pp_maximum - arc_pages_pp_reserve); if (n < lowest) { lowest = n; r = FMR_PAGES_PP_MAXIMUM; } #endif /* illumos */ #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) /* * If we're on an i386 platform, it's possible that we'll exhaust the * kernel heap space before we ever run out of available physical * memory. Most checks of the size of the heap_area compare against * tune.t_minarmem, which is the minimum available real memory that we * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); if (n < lowest) { lowest = n; r = FMR_HEAP_ARENA; } #define zio_arena NULL #else #define zio_arena heap_arena #endif /* * If zio data pages are being allocated out of a separate heap segment, * then enforce that the size of available vmem for this arena remains * above about 1/16th free. * * Note: The 1/16th arena free requirement was put in place * to aggressively evict memory from the arc in order to avoid * memory fragmentation issues. */ if (zio_arena != NULL) { n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - (vmem_size(zio_arena, VMEM_ALLOC) >> 4); if (n < lowest) { lowest = n; r = FMR_ZIO_ARENA; } } /* * Above limits know nothing about real level of KVA fragmentation. * Start aggressive reclamation if too little sequential KVA left. */ if (lowest > 0) { n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ? -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : INT64_MAX; if (n < lowest) { lowest = n; r = FMR_ZIO_FRAG; } } #else /* _KERNEL */ /* Every 100 calls, free a small amount */ if (spa_get_random(100) == 0) lowest = -1024; #endif /* _KERNEL */ last_free_memory = lowest; last_free_reason = r; DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); return (lowest); } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ static boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; extern kmem_cache_t *range_seg_cache; static __noinline void arc_kmem_reap_now(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; DTRACE_PROBE(arc__kmem_reap_start); #ifdef _KERNEL if (arc_meta_used >= arc_meta_limit) { /* * We are exceeding our meta-data cache limit. * Purge some DNLC entries to release holds on meta-data. */ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); } #if defined(__i386) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(range_seg_cache); #ifdef illumos if (zio_arena != NULL) { /* * Ask the vmem arena to reclaim unused memory from its * quantum caches. */ vmem_qcache_reap(zio_arena); } #endif DTRACE_PROBE(arc__kmem_reap_end); } /* * Threads can block in arc_get_data_buf() waiting for this thread to evict * enough data and signal them to proceed. When this happens, the threads in * arc_get_data_buf() are sleeping while holding the hash lock for their * particular arc header. Thus, we must be careful to never sleep on a * hash lock in this thread. This is to prevent the following deadlock: * * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", * waiting for the reclaim thread to signal it. * * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, * fails, and goes to sleep forever. * * This possible deadlock is avoided by always acquiring a hash lock * using mutex_tryenter() from arc_reclaim_thread(). */ static void arc_reclaim_thread(void *dummy __unused) { clock_t growtime = 0; callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); mutex_enter(&arc_reclaim_lock); while (!arc_reclaim_thread_exit) { int64_t free_memory = arc_available_memory(); uint64_t evicted = 0; mutex_exit(&arc_reclaim_lock); if (free_memory < 0) { arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 60) seconds * before considering growing. */ growtime = ddi_get_lbolt() + (arc_grow_retry * hz); arc_kmem_reap_now(); /* * If we are still low on memory, shrink the ARC * so that we have arc_shrink_min free space. */ free_memory = arc_available_memory(); int64_t to_free = (arc_c >> arc_shrink_shift) - free_memory; if (to_free > 0) { #ifdef _KERNEL to_free = MAX(to_free, ptob(needfree)); #endif arc_shrink(to_free); } } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (ddi_get_lbolt() >= growtime) { arc_no_grow = B_FALSE; } evicted = arc_adjust(); mutex_enter(&arc_reclaim_lock); /* * If evicted is zero, we couldn't evict anything via * arc_adjust(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. */ if (arc_size <= arc_c || evicted == 0) { #ifdef _KERNEL needfree = 0; #endif /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * up any threads before we go to sleep. */ cv_broadcast(&arc_reclaim_waiters_cv); /* * Block until signaled, or after one second (we * might need to perform arc_kmem_reap_now() * even if we aren't being signalled) */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&arc_reclaim_thread_cv, &arc_reclaim_lock, hz); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); } } arc_reclaim_thread_exit = FALSE; cv_broadcast(&arc_reclaim_thread_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ thread_exit(); } static void arc_user_evicts_thread(void *dummy __unused) { callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); mutex_enter(&arc_user_evicts_lock); while (!arc_user_evicts_thread_exit) { mutex_exit(&arc_user_evicts_lock); arc_do_user_evicts(); /* * This is necessary in order for the mdb ::arc dcmd to * show up to date information. Since the ::arc command * does not call the kstat's update function, without * this call, the command may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this change, the data might be up to 1 second * out of date; but that should suffice. The arc_state_t * structures can be queried directly if more accurate * information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); mutex_enter(&arc_user_evicts_lock); /* * Block until signaled, or after one second (we need to * call the arc's kstat update function regularly). */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&arc_user_evicts_cv, &arc_user_evicts_lock, hz); CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); } arc_user_evicts_thread_exit = FALSE; cv_broadcast(&arc_user_evicts_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ thread_exit(); } /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are comming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(int bytes, arc_state_t *state) { int mult; uint64_t arc_p_min = (arc_c >> arc_p_min_shift); int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); if (state == arc_l2c_only) return; ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: * - if we just hit in the MRU ghost list, then increase * the target size of the MRU list. * - if we just hit in the MFU ghost list, then increase * the target size of the MFU list by decreasing the * target size of the MRU list. */ if (state == arc_mru_ghost) { mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); if (arc_reclaim_needed()) { cv_signal(&arc_reclaim_thread_cv); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { DTRACE_PROBE1(arc__inc_adapt, int, bytes); atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; else if (state == arc_anon) atomic_add_64(&arc_p, (int64_t)bytes); if (arc_p > arc_c) arc_p = arc_c; } ASSERT((int64_t)arc_p >= 0); } /* * Check if arc_size has grown past our upper threshold, determined by * zfs_arc_overflow_shift. */ static boolean_t arc_is_overflowing(void) { /* Always allow at least one block of overflow */ uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift); return (arc_size >= arc_c + overflow); } /* * The buffer, supplied as the first argument, needs a data block. If we * are hitting the hard limit for the cache size, we must sleep, waiting * for the eviction thread to catch up. If we're past the target size * but below the hard limit, we'll only signal the reclaim thread and * continue on. */ static void arc_get_data_buf(arc_buf_t *buf) { arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; uint64_t size = buf->b_hdr->b_size; arc_buf_contents_t type = arc_buf_type(buf->b_hdr); arc_adapt(size, state); /* * If arc_size is currently overflowing, and has grown past our * upper limit, we must be adding data faster than the evict * thread can evict. Thus, to ensure we don't compound the * problem by adding more data and forcing arc_size to grow even * further past it's target size, we halt and wait for the * eviction thread to catch up. * * It's also possible that the reclaim thread is unable to evict * enough buffers to get arc_size below the overflow limit (e.g. * due to buffers being un-evictable, or hash lock collisions). * In this case, we want to proceed regardless if we're * overflowing; thus we don't use a while loop here. */ if (arc_is_overflowing()) { mutex_enter(&arc_reclaim_lock); /* * Now that we've acquired the lock, we may no longer be * over the overflow limit, lets check. * * We're ignoring the case of spurious wake ups. If that * were to happen, it'd let this thread consume an ARC * buffer before it should have (i.e. before we're under * the overflow limit and were signalled by the reclaim * thread). As long as that is a rare occurrence, it * shouldn't cause any harm. */ if (arc_is_overflowing()) { cv_signal(&arc_reclaim_thread_cv); cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); } mutex_exit(&arc_reclaim_lock); } if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_state_t *state = hdr->b_l1hdr.b_state; (void) refcount_add_many(&state->arcs_size, size, buf); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], size); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && (refcount_count(&arc_anon->arcs_size) + refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } ARCSTAT_BUMP(arcstat_allocated); } /* * This routine is called whenever a buffer is accessed. * NOTE: the hash lock is dropped in this function. */ static void arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr, hash_lock); } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read * (any subsequent access will bump this into the MFU state). * or * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ if (HDR_PREFETCH(hdr)) { if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { /* link protected by hash lock */ ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { hdr->b_flags &= ~ARC_FLAG_PREFETCH; ARCSTAT_BUMP(arcstat_mru_hits); } hdr->b_l1hdr.b_arc_access = now; return; } /* * This buffer has been "accessed" only once so far, * but it is still in the cache. Move it to the MFU * state. */ if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but * was evicted from the cache. Move it to the * MFU state. */ if (HDR_PREFETCH(hdr)) { new_state = arc_mru; if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) hdr->b_flags &= ~ARC_FLAG_PREFETCH; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, hdr, hash_lock); ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. * * NOTE: an add_reference() that occurred when we did * the arc_read() will have kicked this off the list. * If it was a prefetch, we will explicitly move it to * the head of the list now. */ if ((HDR_PREFETCH(hdr)) != 0) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); /* link protected by hash_lock */ ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); } ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has * been evicted from the cache. Move it back to the * MFU state. */ if (HDR_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); new_state = arc_mru; } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(new_state, hdr, hash_lock); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } else { ASSERT(!"invalid arc state"); } } /* a generic arc_done_func_t which you can use */ /* ARGSUSED */ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) bcopy(buf->b_data, arg, buf->b_hdr->b_size); VERIFY(arc_buf_remove_ref(buf, arg)); } /* a generic arc_done_func_t */ void arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { VERIFY(arc_buf_remove_ref(buf, arg)); *bufp = NULL; } else { *bufp = buf; ASSERT(buf->b_data); } } static void arc_read_done(zio_t *zio) { arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_t *abuf; /* buffer we're assigning to callback */ kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; int freeable = FALSE; buf = zio->io_private; hdr = buf->b_hdr; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ if (HDR_IN_HASH_TABLE(hdr)) { ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); } hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; if (l2arc_noprefetch && HDR_PREFETCH(hdr)) hdr->b_flags &= ~ARC_FLAG_L2CACHE; /* byteswap if necessary */ callback_list = hdr->b_l1hdr.b_acb; ASSERT(callback_list != NULL); if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? byteswap_uint64_array : dmu_ot_byteswap[bswap].ob_func; func(buf->b_data, hdr->b_size); } arc_cksum_compute(buf, B_FALSE); #ifdef illumos arc_buf_watch(buf); #endif if (hash_lock && zio->io_error == 0 && hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already * called arc_access (to prevent any simultaneous readers from * getting confused). */ arc_access(hdr, hash_lock); } /* create copies of the data buffer for the callers */ abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done) { if (abuf == NULL) { ARCSTAT_BUMP(arcstat_duplicate_reads); abuf = arc_buf_clone(buf); } acb->acb_buf = abuf; abuf = NULL; } } hdr->b_l1hdr.b_acb = NULL; hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); if (abuf == buf) { ASSERT(buf->b_efunc == NULL); ASSERT(hdr->b_l1hdr.b_datacnt == 1); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); if (zio->io_error != 0) { hdr->b_flags |= ARC_FLAG_IO_ERROR; if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* * Broadcast before we drop the hash_lock to avoid the possibility * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ cv_broadcast(&hdr->b_l1hdr.b_cv); if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* * This block was freed while we waited for the read to * complete. It has been removed from the hash table and * moved to the anonymous state (so that it won't show up * in the cache). */ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done) acb->acb_done(zio, acb->acb_buf, acb->acb_private); if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_next; kmem_free(acb, sizeof (arc_callback_t)); } if (freeable) arc_hdr_destroy(hdr); } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; arc_buf_t *buf = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp) || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); top: if (!BP_IS_EMBEDDED(bp)) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { + if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && + priority == ZIO_PRIORITY_SYNC_READ) { + /* + * This sync read must wait for an + * in-progress async read (e.g. a predictive + * prefetch). Async reads are queued + * separately at the vdev_queue layer, so + * this is a form of priority inversion. + * Ideally, we would "inherit" the demand + * i/o's priority by moving the i/o from + * the async queue to the synchronous queue, + * but there is currently no mechanism to do + * so. Track this so that we can evaluate + * the magnitude of this potential performance + * problem. + * + * Note that if the prefetch i/o is already + * active (has been issued to the device), + * the prefetch improved performance, because + * we issued it sooner than we would have + * without the prefetch. + */ + DTRACE_PROBE1(arc__sync__wait__for__async, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_sync_wait_for_async); + } + if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { + hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + } + if (*arc_flags & ARC_FLAG_WAIT) { cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { - arc_callback_t *acb = NULL; + arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); return (0); } mutex_exit(hash_lock); return (0); } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); if (done) { + if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { + /* + * This is a demand read which does not have to + * wait for i/o because we did a predictive + * prefetch i/o for it, which has completed. + */ + DTRACE_PROBE1( + arc__demand__hit__predictive__prefetch, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP( + arcstat_demand_hit_predictive_prefetch); + hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; + } add_reference(hdr, hash_lock, private); /* * If this block is already in use, create a new * copy of the data so that we will be guaranteed * that arc_release() will always succeed. */ buf = hdr->b_l1hdr.b_buf; ASSERT(buf); ASSERT(buf->b_data); if (HDR_BUF_AVAILABLE(hdr)) { ASSERT(buf->b_efunc == NULL); hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; } else { buf = arc_buf_clone(buf); } } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { hdr->b_flags |= ARC_FLAG_PREFETCH; } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_L2CACHE) hdr->b_flags |= ARC_FLAG_L2CACHE; if (*arc_flags & ARC_FLAG_L2COMPRESS) hdr->b_flags |= ARC_FLAG_L2COMPRESS; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) done(NULL, buf, private); } else { uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; enum zio_compress b_compress = ZIO_COMPRESS_OFF; int32_t b_asize = 0; if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); buf = arc_buf_alloc(spa, size, private, type); hdr = buf->b_hdr; if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); (void) arc_buf_remove_ref(buf, private); goto top; /* restart the IO request */ } - /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_FLAG_PREFETCH) { + /* + * If there is a callback, we pass our reference to + * it; otherwise we remove our reference. + */ + if (done == NULL) { (void) remove_reference(hdr, hash_lock, private); - hdr->b_flags |= ARC_FLAG_PREFETCH; } + if (*arc_flags & ARC_FLAG_PREFETCH) + hdr->b_flags |= ARC_FLAG_PREFETCH; if (*arc_flags & ARC_FLAG_L2CACHE) hdr->b_flags |= ARC_FLAG_L2CACHE; if (*arc_flags & ARC_FLAG_L2COMPRESS) hdr->b_flags |= ARC_FLAG_L2COMPRESS; if (BP_GET_LEVEL(bp) > 0) hdr->b_flags |= ARC_FLAG_INDIRECT; } else { /* * This block is in the ghost cache. If it was L2-only * (and thus didn't have an L1 hdr), we realloc the * header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - /* if this is a prefetch, we don't have a reference */ + /* + * If there is a callback, we pass a reference to it. + */ + if (done != NULL) + add_reference(hdr, hash_lock, private); if (*arc_flags & ARC_FLAG_PREFETCH) hdr->b_flags |= ARC_FLAG_PREFETCH; - else - add_reference(hdr, hash_lock, private); if (*arc_flags & ARC_FLAG_L2CACHE) hdr->b_flags |= ARC_FLAG_L2CACHE; if (*arc_flags & ARC_FLAG_L2COMPRESS) hdr->b_flags |= ARC_FLAG_L2COMPRESS; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; hdr->b_l1hdr.b_buf = buf; ASSERT0(hdr->b_l1hdr.b_datacnt); hdr->b_l1hdr.b_datacnt = 1; arc_get_data_buf(buf); arc_access(hdr, hash_lock); } + if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) + hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH; ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; ASSERT(hdr->b_l1hdr.b_acb == NULL); hdr->b_l1hdr.b_acb = acb; hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; - b_compress = HDR_GET_COMPRESS(hdr); + b_compress = hdr->b_l2hdr.b_compress; b_asize = hdr->b_l2hdr.b_asize; /* * Lock out device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } if (hash_lock != NULL) mutex_exit(hash_lock); /* * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. */ ASSERT3U(hdr->b_size, ==, size); DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); #ifdef _KERNEL curthread->td_ru.ru_inblock++; #endif + if (priority == ZIO_PRIORITY_ASYNC_READ) + hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ; + else + hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ; + if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_buf = buf; cb->l2rcb_spa = spa; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; cb->l2rcb_compress = b_compress; ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + size < vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ if (b_compress == ZIO_COMPRESS_EMPTY) { rzio = zio_null(pio, spa, vd, l2arc_read_done, cb, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); } else { rzio = zio_read_phys(pio, vd, addr, b_asize, buf->b_data, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); } DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); return (0); } ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) return (0); /* l2arc read error; goto zio_read() */ } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); if (l2arc_ndev != 0) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } return (0); } void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { ASSERT(buf->b_hdr != NULL); ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || func == NULL); ASSERT(buf->b_efunc == NULL); ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); buf->b_efunc = func; buf->b_private = private; } /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; if (HDR_BUF_AVAILABLE(hdr)) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; add_reference(hdr, hash_lock, FTAG); hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; mutex_exit(hash_lock); arc_release(buf, FTAG); (void) arc_buf_remove_ref(buf, FTAG); } else { mutex_exit(hash_lock); } } /* * Clear the user eviction callback set by arc_set_callback(), first calling * it if it exists. Because the presence of a callback keeps an arc_buf cached * clearing the callback may result in the arc_buf being destroyed. However, * it will not result in the *last* arc_buf being destroyed, hence the data * will remain cached in the ARC. We make a copy of the arc buffer here so * that we can process the callback without holding any locks. * * It's possible that the callback is already in the process of being cleared * by another thread. In this case we can not clear the callback. * * Returns B_TRUE if the callback was successfully called and cleared. */ boolean_t arc_clear_callback(arc_buf_t *buf) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; arc_evict_func_t *efunc = buf->b_efunc; void *private = buf->b_private; mutex_enter(&buf->b_evict_lock); hdr = buf->b_hdr; if (hdr == NULL) { /* * We are in arc_do_user_evicts(). */ ASSERT(buf->b_data == NULL); mutex_exit(&buf->b_evict_lock); return (B_FALSE); } else if (buf->b_data == NULL) { /* * We are on the eviction list; process this buffer now * but let arc_do_user_evicts() do the reaping. */ buf->b_efunc = NULL; mutex_exit(&buf->b_evict_lock); VERIFY0(efunc(private)); return (B_TRUE); } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, hdr->b_l1hdr.b_datacnt); ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); buf->b_efunc = NULL; buf->b_private = NULL; if (hdr->b_l1hdr.b_datacnt > 1) { mutex_exit(&buf->b_evict_lock); arc_buf_destroy(buf, TRUE); } else { ASSERT(buf == hdr->b_l1hdr.b_buf); hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; mutex_exit(&buf->b_evict_lock); } mutex_exit(hash_lock); VERIFY0(efunc(private)); return (B_TRUE); } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if it's DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ mutex_enter(&buf->b_evict_lock); ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { mutex_exit(&buf->b_evict_lock); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT(BUF_EMPTY(hdr)); ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(buf->b_efunc, ==, NULL); ASSERT3P(buf->b_private, ==, NULL); hdr->b_l1hdr.b_arc_access = 0; arc_buf_thaw(buf); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* * We have to recheck this conditional again now that * we're holding the l2ad_mtx to prevent a race with * another thread which might be concurrently calling * l2arc_evict(). In that case, l2arc_evict() might have * destroyed the header's L2 portion as we were waiting * to acquire the l2ad_mtx. */ if (HDR_HAS_L2HDR(hdr)) { if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); arc_hdr_l2hdr_destroy(hdr); } mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } /* * Do we have more than one buf? */ if (hdr->b_l1hdr.b_datacnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; arc_buf_contents_t type = arc_buf_type(hdr); uint32_t flags = hdr->b_flags; ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. */ (void) remove_reference(hdr, hash_lock, tag); bufp = &hdr->b_l1hdr.b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; *bufp = buf->b_next; buf->b_next = NULL; ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many( &state->arcs_size, hdr->b_size, buf); if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); uint64_t *size = &state->arcs_lsize[type]; ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } /* * We're releasing a duplicate user data buffer, update * our statistics accordingly. */ if (HDR_ISTYPE_DATA(hdr)) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -hdr->b_size); } hdr->b_l1hdr.b_datacnt -= 1; arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); #endif mutex_exit(hash_lock); nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); nhdr->b_size = blksz; nhdr->b_spa = spa; nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; nhdr->b_flags |= arc_bufc_to_flags(type); nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; nhdr->b_l1hdr.b_buf = buf; nhdr->b_l1hdr.b_datacnt = 1; nhdr->b_l1hdr.b_state = arc_anon; nhdr->b_l1hdr.b_arc_access = 0; nhdr->b_l1hdr.b_tmp_cdata = NULL; nhdr->b_freeze_cksum = NULL; (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_change_state(arc_anon, hdr, hash_lock); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } buf->b_efunc = NULL; buf->b_private = NULL; } int arc_released(arc_buf_t *buf) { int released; mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { int referenced; mutex_enter(&buf->b_evict_lock); referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT(hdr->b_l1hdr.b_datacnt > 0); callback->awcb_ready(zio, buf, callback->awcb_private); /* * If the IO is already in progress, then this is a re-write * attempt, so we need to thaw and re-compute the cksum. * It is the responsibility of the callback to handle the * accounting for any re-write attempt. */ if (HDR_IO_IN_PROGRESS(hdr)) { mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } arc_cksum_compute(buf, B_FALSE); hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; } /* * The SPA calls this callback for each physical write that happens on behalf * of a logical write. See the comment in dbuf_write_physdone() for details. */ static void arc_write_physdone(zio_t *zio) { arc_write_callback_t *cb = zio->io_private; if (cb->awcb_physdone != NULL) cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(hdr->b_l1hdr.b_acb == NULL); if (zio->io_error == 0) { if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(BUF_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!BUF_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT(zio->io_error == 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(exists); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_l1hdr.b_datacnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; } ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; ASSERT(ready != NULL); ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(hdr->b_l1hdr.b_acb == NULL); ASSERT(hdr->b_l1hdr.b_datacnt > 0); if (l2arc) hdr->b_flags |= ARC_FLAG_L2CACHE; if (l2arc_compress) hdr->b_flags |= ARC_FLAG_L2COMPRESS; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, arc_write_ready, arc_write_physdone, arc_write_done, callback, priority, zio_flags, zb); return (zio); } static int arc_memory_throttle(uint64_t reserve, uint64_t txg) { #ifdef _KERNEL uint64_t available_memory = ptob(freemem); static uint64_t page_load = 0; static uint64_t last_txg = 0; #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) available_memory = MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); #endif if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) return (0); if (txg > last_txg) { last_txg = txg; page_load = 0; } /* * If we are in pageout, we know that memory is already tight, * the arc is already going to be evicting, so we just want to * continue to let page writes occur as quickly as possible. */ if (curproc == pageproc) { if (page_load > MAX(ptob(minfree), available_memory) / 4) return (SET_ERROR(ERESTART)); /* Note: reserve is inflated, so we deflate */ page_load += reserve / 8; return (0); } else if (page_load > 0 && arc_reclaim_needed()) { /* memory is low, delay before restarting */ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); return (SET_ERROR(EAGAIN)); } page_load = 0; #endif return (0); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (reserve > arc_c/4 && !arc_no_grow) { arc_c = MIN(arc_c_max, reserve * 4); DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); } if (reserve > arc_c) return (SET_ERROR(ENOMEM)); /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve>>10, arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, reserve>>10, arc_c>>10); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = refcount_count(&state->arcs_size); evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) { return (EACCES); } else { arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); } return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!BUF_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout it's lifetime * (i.e. it's b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. */ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } #ifdef _KERNEL static eventhandler_tag arc_event_lowmem = NULL; static void arc_lowmem(void *arg __unused, int howto __unused) { mutex_enter(&arc_reclaim_lock); /* XXX: Memory deficit should be passed as argument. */ needfree = btoc(arc_c >> arc_shrink_shift); DTRACE_PROBE(arc__needfree); cv_signal(&arc_reclaim_thread_cv); /* * It is unsafe to block here in arbitrary threads, because we can come * here from ARC itself and may hold ARC locks and thus risk a deadlock * with ARC reclaim thread. */ if (curproc == pageproc) (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); mutex_exit(&arc_reclaim_lock); } #endif void arc_init(void) { int i, prefetch_tunable_set = 0; mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; /* Start out with 1/8 of all memory */ arc_c = kmem_size() / 8; #ifdef illumos #ifdef _KERNEL /* * On architectures where the physical memory can be larger * than the addressable space (intel in 32-bit mode), we may * need to limit the cache to 1/8 of VM size. */ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); #endif #endif /* illumos */ /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ arc_c_min = MAX(arc_c / 4, 16 << 20); /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ if (arc_c * 8 >= 1 << 30) arc_c_max = (arc_c * 8) - (1 << 30); else arc_c_max = arc_c_min; arc_c_max = MAX(arc_c * 5, arc_c_max); #ifdef _KERNEL /* * Allow the tunables to override our calculations if they are * reasonable (ie. over 16MB) */ if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size()) arc_c_max = zfs_arc_max; if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max) arc_c_min = zfs_arc_min; #endif arc_c = arc_c_max; arc_p = (arc_c >> 1); /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; /* Allow the tunable to override if it is reasonable */ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; if (zfs_arc_meta_min > 0) { arc_meta_min = zfs_arc_meta_min; } else { arc_meta_min = arc_c_min / 2; } if (zfs_arc_grow_retry > 0) arc_grow_retry = zfs_arc_grow_retry; if (zfs_arc_shrink_shift > 0) arc_shrink_shift = zfs_arc_shrink_shift; /* * Ensure that arc_no_grow_shift is less than arc_shrink_shift. */ if (arc_no_grow_shift >= arc_shrink_shift) arc_no_grow_shift = arc_shrink_shift - 1; if (zfs_arc_p_min_shift > 0) arc_p_min_shift = zfs_arc_p_min_shift; if (zfs_arc_num_sublists_per_state < 1) zfs_arc_num_sublists_per_state = MAX(max_ncpus, 1); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; zfs_arc_min = arc_c_min; zfs_arc_max = arc_c_max; arc_anon = &ARC_anon; arc_mru = &ARC_mru; arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; arc_l2c_only = &ARC_l2c_only; arc_size = 0; multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); refcount_create(&arc_anon->arcs_size); refcount_create(&arc_mru->arcs_size); refcount_create(&arc_mru_ghost->arcs_size); refcount_create(&arc_mfu->arcs_size); refcount_create(&arc_mfu_ghost->arcs_size); refcount_create(&arc_l2c_only->arcs_size); buf_init(); arc_reclaim_thread_exit = FALSE; arc_user_evicts_thread_exit = FALSE; arc_eviction_list = NULL; bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, TS_RUN, minclsyspri); #ifdef _KERNEL arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, EVENTHANDLER_PRI_FIRST); #endif (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, TS_RUN, minclsyspri); arc_dead = FALSE; arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by /etc/system, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4GB). */ if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = ptob(physmem) * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } #ifdef _KERNEL if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) prefetch_tunable_set = 1; #ifdef __i386__ if (prefetch_tunable_set == 0) { printf("ZFS NOTICE: Prefetch is disabled by default on i386 " "-- to enable,\n"); printf(" add \"vfs.zfs.prefetch_disable=0\" " "to /boot/loader.conf.\n"); zfs_prefetch_disable = 1; } #else if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && prefetch_tunable_set == 0) { printf("ZFS NOTICE: Prefetch is disabled by default if less " "than 4GB of RAM is present;\n" " to enable, add \"vfs.zfs.prefetch_disable=0\" " "to /boot/loader.conf.\n"); zfs_prefetch_disable = 1; } #endif /* Warn about ZFS memory and address space requirements. */ if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " "expect unstable behavior.\n"); } if (kmem_size() < 512 * (1 << 20)) { printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " "expect unstable behavior.\n"); printf(" Consider tuning vm.kmem_size and " "vm.kmem_size_max\n"); printf(" in /boot/loader.conf.\n"); } #endif } void arc_fini(void) { mutex_enter(&arc_reclaim_lock); arc_reclaim_thread_exit = TRUE; /* * The reclaim thread will set arc_reclaim_thread_exit back to * FALSE when it is finished exiting; we're waiting for that. */ while (arc_reclaim_thread_exit) { cv_signal(&arc_reclaim_thread_cv); cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); } mutex_exit(&arc_reclaim_lock); mutex_enter(&arc_user_evicts_lock); arc_user_evicts_thread_exit = TRUE; /* * The user evicts thread will set arc_user_evicts_thread_exit * to FALSE when it is finished exiting; we're waiting for that. */ while (arc_user_evicts_thread_exit) { cv_signal(&arc_user_evicts_cv); cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); } mutex_exit(&arc_user_evicts_lock); /* Use TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, TRUE); arc_dead = TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } mutex_destroy(&arc_reclaim_lock); cv_destroy(&arc_reclaim_thread_cv); cv_destroy(&arc_reclaim_waiters_cv); mutex_destroy(&arc_user_evicts_lock); cv_destroy(&arc_user_evicts_cv); refcount_destroy(&arc_anon->arcs_size); refcount_destroy(&arc_mru->arcs_size); refcount_destroy(&arc_mru_ghost->arcs_size); refcount_destroy(&arc_mfu->arcs_size); refcount_destroy(&arc_mfu_ghost->arcs_size); refcount_destroy(&arc_l2c_only->arcs_size); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); buf_fini(); ASSERT0(arc_loaned_bytes); #ifdef _KERNEL if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); #endif } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. * * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid) { ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); return (B_FALSE); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_BUMP(arcstat_l2_write_in_l2); return (B_FALSE); } if (HDR_IO_IN_PROGRESS(hdr)) { ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); return (B_FALSE); } if (!HDR_L2CACHE(hdr)) { ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); return (B_FALSE); } return (B_TRUE); } static uint64_t l2arc_write_size(void) { uint64_t size; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " "be greater than zero, resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; } while (vdev_is_dead(next->l2ad_vdev)); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev)) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write() { list_t *buflist; l2arc_data_free_t *df, *df_prev; mutex_enter(&l2arc_free_on_write_mtx); buflist = l2arc_free_on_write; for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); ASSERT(df->l2df_data != NULL); ASSERT(df->l2df_func != NULL); df->l2df_func(df->l2df_data, df->l2df_size); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. */ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_dev_t *dev; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT(cb != NULL); dev = cb->l2wcb_dev; ASSERT(dev != NULL); head = cb->l2wcb_head; ASSERT(head != NULL); buflist = &dev->l2ad_buflist; ASSERT(buflist != NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); /* * All writes completed, or an error was hit. */ top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. We must retry so we * don't leave the ARC_FLAG_L2_WRITING bit set. */ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); /* * We don't want to rescan the headers we've * already marked as having been written out, so * we reinsert the head node so we can pick up * where we left off. */ list_remove(buflist, head); list_insert_after(buflist, hdr, head); mutex_exit(&dev->l2ad_mtx); /* * We wait for the hash lock to become available * to try and prevent busy waiting, and increase * the chance we'll be able to acquire the lock * the next time around. */ mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * We could not have been moved into the arc_l2c_only * state while in-flight due to our ARC_FLAG_L2_WRITING * bit being set. Let's just ensure that's being enforced. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * We may have allocated a buffer for L2ARC compression, * we must release it to avoid leaking this data. */ l2arc_release_cdata_buf(hdr); if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); bytes_dropped += hdr->b_l2hdr.b_asize; (void) refcount_remove_many(&dev->l2ad_alloc, hdr->b_l2hdr.b_asize, hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ hdr->b_flags &= ~ARC_FLAG_L2_WRITING; mutex_exit(hash_lock); } atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; arc_buf_t *buf; kmutex_t *hash_lock; int equal; ASSERT(zio->io_vd != NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; ASSERT(cb != NULL); buf = cb->l2rcb_buf; ASSERT(buf != NULL); hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the buffer was compressed, decompress it first. */ if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); ASSERT(zio->io_data != NULL); + ASSERT3U(zio->io_size, ==, hdr->b_size); + ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size); /* * Check this survived the L2ARC journey. */ equal = arc_cksum_equal(buf); if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = buf; zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ arc_read_done(zio); } else { mutex_exit(hash_lock); /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!equal) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, - buf->b_data, zio->io_size, arc_read_done, buf, + buf->b_data, hdr->b_size, arc_read_done, buf, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num <= 3); switch (list_num) { case 0: ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock(ml, idx)); } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; buflist = &dev->l2ad_buflist; if (!all && dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. */ return; } if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { /* * When nearing the end of the device, evict to the end * before the device write hand jumps to the start. */ taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } if (HDR_L2_WRITE_HEAD(hdr)) { /* * We hit a write head node. Leave it for * l2arc_write_done(). */ list_remove(buflist, hdr); mutex_exit(hash_lock); continue; } if (!all && HDR_HAS_L2HDR(hdr) && (hdr->b_l2hdr.b_daddr > taddr || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } ASSERT(HDR_HAS_L2HDR(hdr)); if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_size. */ arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); hdr->b_flags |= ARC_FLAG_L2_EVICTED; } /* Ensure this header has finished being written */ ASSERT(!HDR_L2_WRITING(hdr)); ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, boolean_t *headroom_boost) { arc_buf_hdr_t *hdr, *hdr_prev, *head; uint64_t write_asize, write_sz, headroom, buf_compress_minsz; void *buf_data; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); const boolean_t do_headroom_boost = *headroom_boost; int try; ASSERT(dev->l2ad_vdev != NULL); /* Lower the flag now, we might want to raise it again later. */ *headroom_boost = B_FALSE; pio = NULL; write_sz = write_asize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; head->b_flags |= ARC_FLAG_HAS_L2HDR; ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); /* * We will want to try to compress buffers that are at least 2x the * device sector size. */ buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; /* * Copy buffers for L2ARC writing. */ for (try = 0; try <= 3; try++) { multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); if (hdr == NULL) ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); headroom = target_sz * l2arc_headroom; if (do_headroom_boost) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; uint64_t buf_sz; uint64_t buf_a_sz; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); /* * Skip this buffer rather than waiting. */ continue; } passed_sz += hdr->b_size; if (passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } /* * Assume that the buffer is not going to be compressed * and could take more space on disk because of a larger * disk block size. */ buf_sz = hdr->b_size; buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); if ((write_asize + buf_a_sz) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_l2_write_full); break; } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); ARCSTAT_BUMP(arcstat_l2_write_pios); } /* * Create and add a new L2ARC header. */ hdr->b_l2hdr.b_dev = dev; hdr->b_flags |= ARC_FLAG_L2_WRITING; /* * Temporarily stash the data buffer in b_tmp_cdata. * The subsequent write step will pick it up from * there. This is because can't access b_l1hdr.b_buf * without holding the hash_lock, which we in turn * can't access without holding the ARC list locks * (which we want to avoid during compression/writing). */ - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF; hdr->b_l2hdr.b_asize = hdr->b_size; hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; /* * Explicitly set the b_daddr field to a known * value which means "invalid address". This * enables us to differentiate which stage of * l2arc_write_buffers() the particular header * is in (e.g. this loop, or the one below). * ARC_FLAG_L2_WRITING is not enough to make * this distinction, and we need to know in * order to do proper l2arc vdev accounting in * arc_release() and arc_hdr_destroy(). * * Note, we can't use a new flag to distinguish * the two stages because we don't hold the * header's hash_lock below, in the second stage * of this function. Thus, we can't simply * change the b_flags field to denote that the * IO has been sent. We can change the b_daddr * field of the L2 portion, though, since we'll * be holding the l2ad_mtx; which is why we're * using it to denote the header's state change. */ hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; hdr->b_flags |= ARC_FLAG_HAS_L2HDR; mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Compute and store the buffer cksum before * writing. On debug the cksum is verified first. */ arc_cksum_verify(hdr->b_l1hdr.b_buf); arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); mutex_exit(hash_lock); write_sz += buf_sz; write_asize += buf_a_sz; } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_sz); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); return (0); } mutex_enter(&dev->l2ad_mtx); /* * Note that elsewhere in this file arcstat_l2_asize * and the used space on l2ad_vdev are updated using b_asize, * which is not necessarily rounded up to the device block size. * Too keep accounting consistent we do the same here as well: * stats_size accumulates the sum of b_asize of the written buffers, * while write_asize accumulates the sum of b_asize rounded up * to the device block size. * The latter sum is used only to validate the corectness of the code. */ uint64_t stats_size = 0; write_asize = 0; /* * Now start writing the buffers. We're starting at the write head * and work backwards, retracing the course of the buffer selector * loop above. */ for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; hdr = list_prev(&dev->l2ad_buflist, hdr)) { uint64_t buf_sz; /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * We shouldn't need to lock the buffer here, since we flagged * it as ARC_FLAG_L2_WRITING in the previous step, but we must * take care to only access its L2 cache parameters. In * particular, hdr->l1hdr.b_buf may be invalid by now due to * ARC eviction. */ hdr->b_l2hdr.b_daddr = dev->l2ad_hand; if ((HDR_L2COMPRESS(hdr)) && hdr->b_l2hdr.b_asize >= buf_compress_minsz) { if (l2arc_compress_buf(hdr)) { /* * If compression succeeded, enable headroom * boost on the next scan cycle. */ *headroom_boost = B_TRUE; } } /* * Pick up the buffer data we had previously stashed away * (and now potentially also compressed). */ buf_data = hdr->b_l1hdr.b_tmp_cdata; buf_sz = hdr->b_l2hdr.b_asize; /* * We need to do this regardless if buf_sz is zero or * not, otherwise, when this l2hdr is evicted we'll * remove a reference that was never added. */ (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { uint64_t buf_a_sz; wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); (void) zio_nowait(wzio); stats_size += buf_sz; /* * Keep the clock hand suitably device-aligned. */ buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); write_asize += buf_a_sz; dev->l2ad_hand += buf_a_sz; } } mutex_exit(&dev->l2ad_mtx); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); ARCSTAT_INCR(arcstat_l2_asize, stats_size); vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { dev->l2ad_hand = dev->l2ad_start; dev->l2ad_first = B_FALSE; } dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; return (write_asize); } /* * Compresses an L2ARC buffer. * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its * size in l2hdr->b_asize. This routine tries to compress the data and * depending on the compression result there are three possible outcomes: * *) The buffer was incompressible. The original l2hdr contents were left * untouched and are ready for writing to an L2 device. * *) The buffer was all-zeros, so there is no need to write it to an L2 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. * *) Compression succeeded and b_tmp_cdata was replaced with a temporary * data buffer which holds the compressed data to be written, and b_asize * tells us how much data there is. b_compress is set to the appropriate * compression algorithm. Once writing is done, invoke * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. * * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the * buffer was incompressible). */ static boolean_t l2arc_compress_buf(arc_buf_hdr_t *hdr) { void *cdata; size_t csize, len, rounded; ASSERT(HDR_HAS_L2HDR(hdr)); l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF); + ASSERT3S(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF); ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); len = l2hdr->b_asize; cdata = zio_data_buf_alloc(len); ASSERT3P(cdata, !=, NULL); csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, cdata, l2hdr->b_asize); if (csize == 0) { /* zero block, indicate that there's nothing to write */ zio_data_buf_free(cdata, len); - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY); + l2hdr->b_compress = ZIO_COMPRESS_EMPTY; l2hdr->b_asize = 0; hdr->b_l1hdr.b_tmp_cdata = NULL; ARCSTAT_BUMP(arcstat_l2_compress_zeros); return (B_TRUE); } rounded = P2ROUNDUP(csize, (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift); if (rounded < len) { /* * Compression succeeded, we'll keep the cdata around for * writing and release it afterwards. */ if (rounded > csize) { bzero((char *)cdata + csize, rounded - csize); csize = rounded; } - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4); + l2hdr->b_compress = ZIO_COMPRESS_LZ4; l2hdr->b_asize = csize; hdr->b_l1hdr.b_tmp_cdata = cdata; ARCSTAT_BUMP(arcstat_l2_compress_successes); return (B_TRUE); } else { /* * Compression failed, release the compressed buffer. * l2hdr will be left unmodified. */ zio_data_buf_free(cdata, len); ARCSTAT_BUMP(arcstat_l2_compress_failures); return (B_FALSE); } } /* * Decompresses a zio read back from an l2arc device. On success, the * underlying zio's io_data buffer is overwritten by the uncompressed * version. On decompression error (corrupt compressed stream), the * zio->io_error value is set to signal an I/O error. * * Please note that the compressed data stream is not checksummed, so * if the underlying device is experiencing data corruption, we may feed * corrupt data to the decompressor, so the decompressor needs to be * able to handle this situation (LZ4 does). */ static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) { ASSERT(L2ARC_IS_VALID_COMPRESS(c)); if (zio->io_error != 0) { /* * An io error has occured, just restore the original io * size in preparation for a main pool read. */ zio->io_orig_size = zio->io_size = hdr->b_size; return; } if (c == ZIO_COMPRESS_EMPTY) { /* * An empty buffer results in a null zio, which means we * need to fill its io_data after we're done restoring the * buffer's contents. */ ASSERT(hdr->b_l1hdr.b_buf != NULL); bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; } else { ASSERT(zio->io_data != NULL); /* * We copy the compressed data from the start of the arc buffer * (the zio_read will have pulled in only what we need, the * rest is garbage which we will overwrite at decompression) * and then decompress back to the ARC data buffer. This way we * can minimize copying by simply decompressing back over the * original compressed data (rather than decompressing to an * aux buffer and then copying back the uncompressed buffer, * which is likely to be much larger). */ uint64_t csize; void *cdata; csize = zio->io_size; cdata = zio_data_buf_alloc(csize); bcopy(zio->io_data, cdata, csize); if (zio_decompress_data(c, cdata, zio->io_data, csize, hdr->b_size) != 0) zio->io_error = EIO; zio_data_buf_free(cdata, csize); } /* Restore the expected uncompressed IO size. */ zio->io_orig_size = zio->io_size = hdr->b_size; } /* * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. * This buffer serves as a temporary holder of compressed data while * the buffer entry is being written to an l2arc device. Once that is * done, we can dispose of it. */ static void l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) { - enum zio_compress comp = HDR_GET_COMPRESS(hdr); + ASSERT(HDR_HAS_L2HDR(hdr)); + enum zio_compress comp = hdr->b_l2hdr.b_compress; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); if (comp == ZIO_COMPRESS_OFF) { /* * In this case, b_tmp_cdata points to the same buffer * as the arc_buf_t's b_data field. We don't want to * free it, since the arc_buf_t will handle that. */ hdr->b_l1hdr.b_tmp_cdata = NULL; } else if (comp == ZIO_COMPRESS_EMPTY) { /* * In this case, b_tmp_cdata was compressed to an empty * buffer, thus there's nothing to free and b_tmp_cdata * should have been set to NULL in l2arc_write_buffers(). */ ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); } else { /* * If the data was compressed, then we've allocated a * temporary buffer for it, so now we need to release it. */ ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size); hdr->b_l1hdr.b_tmp_cdata = NULL; } } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ static void l2arc_feed_thread(void *dummy __unused) { callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); boolean_t headroom_boost = B_FALSE; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next - ddi_get_lbolt()); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT(spa != NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (arc_reclaim_needed()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev != NULL); } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; ASSERT(!l2arc_vdev_present(vd)); vdev_ashift_optimize(vd); /* * Create a new l2arc device entry. */ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; adddev->l2ad_start = VDEV_LABEL_START_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); refcount_create(&adddev->l2ad_alloc); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *dev, *nextdev, *remdev = NULL; /* * Find the device by vdev */ mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { nextdev = list_next(l2arc_dev_list, dev); if (vd == dev->l2ad_vdev) { remdev = dev; break; } } ASSERT(remdev != NULL); /* * Remove device from global list */ list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); mutex_destroy(&remdev->l2ad_mtx); refcount_destroy(&remdev->l2ad_alloc); kmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; l2arc_writes_sent = 0; l2arc_writes_done = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { /* * This is called from dmu_fini(), which is called from spa_fini(); * Because of this, we can assume that all l2arc devices have * already been removed when the pools themselves were removed. */ l2arc_do_free_on_write(); mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & FWRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, minclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & FWRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } Index: projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c =================================================================== --- projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 287721) +++ projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 287722) @@ -1,3200 +1,3197 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Number of times that zfs_free_range() took the slow path while doing * a zfs receive. A nonzero value indicates a potential performance problem. */ uint64_t zfs_free_range_recv_miss; static void dbuf_destroy(dmu_buf_impl_t *db); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); #ifndef __lint extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp); #endif /* ! __lint */ /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_cache; static taskq_t *dbu_evict_taskq; /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) { dmu_buf_impl_t *db = vdb; bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); refcount_create(&db->db_holds); return (0); } /* ARGSUSED */ static void dbuf_dest(void *vdb, void *unused) { dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); refcount_destroy(&db->db_holds); } /* * dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; static uint64_t dbuf_hash_count; static uint64_t dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) { uintptr_t osv = (uintptr_t)os; uint64_t crc = -1ULL; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); return (crc); } #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ (dbuf)->db_level == (level) && \ (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv = DBUF_HASH(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *db; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (NULL); } static dmu_buf_impl_t * dbuf_find_bonus(objset_t *os, uint64_t object) { dnode_t *dn; dmu_buf_impl_t *db = NULL; if (dnode_hold(os, object, FTAG, &dn) == 0) { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus != NULL) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } return (db); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. */ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid = db->db_blkid; uint64_t hv = DBUF_HASH(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); } } mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_inc_64(&dbuf_hash_count); return (NULL); } /* * Remove an entry from the hash table. It must be in the EVICTING state. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf, **dbp; /* * We musn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; ASSERT(dbf != NULL); } *dbp = db->db_hash_next; db->db_hash_next = NULL; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_hash_count); } static arc_evict_func_t dbuf_do_evict; typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING } dbvu_verify_type_t; static void dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) { #ifdef ZFS_DEBUG int64_t holds; if (db->db_user == NULL) return; /* Only data blocks support the attachment of user data. */ ASSERT(db->db_level == 0); /* Clients must resolve a dbuf before attaching user data. */ ASSERT(db->db.db_data != NULL); ASSERT3U(db->db_state, ==, DB_CACHED); holds = refcount_count(&db->db_holds); if (verify_type == DBVU_EVICTING) { /* * Immediate eviction occurs when holds == dirtycnt. * For normal eviction buffers, holds is zero on * eviction, except when dbuf_fix_old_data() calls * dbuf_clear_data(). However, the hold count can grow * during eviction even though db_mtx is held (see * dmu_bonus_hold() for an example), so we can only * test the generic invariant that holds >= dirtycnt. */ ASSERT3U(holds, >=, db->db_dirtycnt); } else { if (db->db_immediate_evict == TRUE) ASSERT3U(holds, >=, db->db_dirtycnt); else ASSERT3U(holds, >, 0); } #endif } static void dbuf_evict_user(dmu_buf_impl_t *db) { dmu_buf_user_t *dbu = db->db_user; ASSERT(MUTEX_HELD(&db->db_mtx)); if (dbu == NULL) return; dbuf_verify_user(db, DBVU_EVICTING); db->db_user = NULL; #ifdef ZFS_DEBUG if (dbu->dbu_clear_on_evict_dbufp != NULL) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif /* * Invoke the callback from a taskq to avoid lock order reversals * and limit stack depth. */ taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, &dbu->dbu_tqent); } boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { if (db->db_level > 0) { return (B_TRUE); } else { boolean_t is_metadata; DB_DNODE_ENTER(db); is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); } } void dbuf_evict(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_buf == NULL); ASSERT(db->db_data_pending == NULL); dbuf_clear(db); dbuf_destroy(db); } void dbuf_init(void) { uint64_t hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; int i; /* * The hash table is big enough to fill all of physical memory * with an average 4K block size. The table will take up * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). */ while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) hsize <<= 1; retry: h->hash_table_mask = hsize - 1; h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); if (h->hash_table == NULL) { /* XXX - we should really return an error instead of assert */ ASSERT(hsize > (1ULL << 10)); hsize >>= 1; goto retry; } dbuf_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. */ dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); } void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; int i; for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); kmem_cache_destroy(dbuf_cache); taskq_destroy(dbu_evict_taskq); } /* * Other stuff. */ #ifdef ZFS_DEBUG static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) return; ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. */ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ if (db->db_blkptr) { if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't * have the struct_rwlock. XXX indblksz no longer * grows. safe to do this now? */ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); } } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. */ if (db->db_dirtycnt == 0) { uint64_t *buf = db->db.db_data; int i; for (i = 0; i < db->db.db_size >> 3; i++) { ASSERT(buf[i] == 0); } } } DB_DNODE_EXIT(db); } #endif static void dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); db->db_buf = NULL; db->db.db_data = NULL; if (db->db_state != DB_NOFILL) db->db_state = DB_UNCACHED; } static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(buf != NULL); db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; if (!arc_released(buf)) arc_set_callback(buf, dbuf_do_evict, db); } /* * Loan out an arc_buf for read. Return the loaned arc_buf. */ arc_buf_t * dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); } /* * Calculate which level n block references the data at the level 0 offset * provided. */ uint64_t dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) { if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { /* * The level n blkid is equal to the level 0 blkid divided by * the number of level 0s in a level n block. * * The level 0 blkid is offset >> datablkshift = * offset / 2^datablkshift. * * The number of level 0s in a level n is the number of block * pointers in an indirect block, raised to the power of level. * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). * * Thus, the level n blkid is: offset / * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) * = offset / 2^(datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) * = offset >> (datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) */ return (offset >> (dn->dn_datablkshift + level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); } } static void dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); /* * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); if (db->db_level == 0 && db->db_freed_in_flight) { /* we were freed in flight; disregard any error */ arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else if (zio == NULL || zio->io_error == 0) { dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); VERIFY(arc_buf_remove_ref(buf, db)); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL); } static void -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { dnode_t *dn; zbookmark_phys_t zb; arc_flags_t aflags = ARC_FLAG_NOWAIT; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. */ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); if (db->db_blkid == DMU_BONUS_BLKID) { int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } /* * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); DB_DNODE_EXIT(db); dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; - *flags |= DB_RF_CACHED; mutex_exit(&db->db_mtx); return; } DB_DNODE_EXIT(db); db->db_state = DB_READ; mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; if (DBUF_IS_L2COMPRESSIBLE(db)) aflags |= ARC_FLAG_L2COMPRESS; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); dbuf_add_ref(db, NULL); (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, - (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, + (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); - if (aflags & ARC_FLAG_CACHED) - *flags |= DB_RF_CACHED; } int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; boolean_t havepzio = (zio != NULL); boolean_t prefetch; dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it * can't be freed while we have a hold on the buffer. */ ASSERT(!refcount_is_zero(&db->db_holds)); if (db->db_state == DB_NOFILL) return (SET_ERROR(EIO)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { mutex_exit(&db->db_mtx); if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, - db->db.db_size, TRUE); + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; if (zio == NULL) zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - dbuf_read_impl(db, zio, &flags); + dbuf_read_impl(db, zio, flags); /* dbuf_read_impl has dropped db_mtx for us */ if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, - db->db.db_size, flags & DB_RF_CACHED); + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); if (!havepzio) err = zio_wait(zio); } else { /* * Another reader came in while the dbuf was in flight * between UNCACHED and CACHED. Either a writer will finish * writing the buffer (sending the dbuf to CACHED) or the * first reader's request will reach the read_done callback * and send the dbuf to CACHED. Otherwise, a failure * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, - db->db.db_size, TRUE); + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); /* Skip the wait per the caller's request. */ mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, zio_t *, zio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); } mutex_exit(&db->db_mtx); } ASSERT(err || havepzio || db->db_state == DB_CACHED); return (err); } static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } mutex_exit(&db->db_mtx); } /* * This is our just-in-time copy function. It makes a copy of * buffers, that have been modified in a previous transaction * group, before we modify them in the current active group. * * This function is used in two places: when we are dirtying a * buffer for the first time in a txg, and when we are freeing * a range in a dnode that includes this buffer. * * Note that when we are called from dbuf_free_range() we do * not put a hold on the buffer, we just traverse the active * dbuf list for the dnode. */ static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); ASSERT(db->db_level == 0); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); if (dr == NULL || (dr->dt.dl.dr_data != ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) return; /* * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: * reset the reference to point to a new copy, * or (if there a no active holders) * just null out the current db_data pointer. */ ASSERT(dr->dr_txg >= txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { dbuf_clear_data(db); } } void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; ASSERT(db->db_data_pending != dr); /* free this block */ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do * another (redundant) arc_release(). Therefore, leave * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ arc_release(dr->dt.dl.dr_data, db); } /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find * empty blocks. * * This is a no-op if the dataset is in the middle of an incremental * receive; see comment below for details. */ void dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { dmu_buf_impl_t db_search; dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; avl_index_t where; if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID)) end_blkid = dn->dn_maxblkid; dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); db_search.db_level = 0; db_search.db_blkid = start_blkid; db_search.db_state = DB_SEARCH; mutex_enter(&dn->dn_dbufs_mtx); if (start_blkid >= dn->dn_unlisted_l0_blkid) { /* There can't be any dbufs in this range; no need to search. */ #ifdef DEBUG db = avl_find(&dn->dn_dbufs, &db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); ASSERT(db == NULL || db->db_level > 0); #endif mutex_exit(&dn->dn_dbufs_mtx); return; } else if (dmu_objset_is_receiving(dn->dn_objset)) { /* * If we are receiving, we expect there to be no dbufs in * the range to be freed, because receive modifies each * block at most once, and in offset order. If this is * not the case, it can lead to performance problems, * so note that we unexpectedly took the slow path. */ atomic_inc_64(&zfs_free_range_recv_miss); } db = avl_find(&dn->dn_dbufs, &db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = db_next) { db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level != 0 || db->db_blkid > end_blkid) { break; } ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); if (dbuf_undirty(db, tx)) { /* mutex has been dropped and dbuf destroyed */ continue; } if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { ASSERT(db->db.db_data == NULL); mutex_exit(&db->db_mtx); continue; } if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); dbuf_clear(db); continue; } /* The dbuf is referenced */ if (db->db_last_dirty != NULL) { dbuf_dirty_record_t *dr = db->db_last_dirty; if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file * size to reflect that this buffer may * contain new data when we sync. */ if (db->db_blkid != DMU_SPILL_BLKID && db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { /* * This dbuf is not dirty in the open context. * Either uncache it (if its not referenced in * the open context) or reset its contents to * empty. */ dbuf_fix_old_data(db, txg); } } /* clear the contents if its cached */ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); bzero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); } mutex_exit(&db->db_mtx); } mutex_exit(&dn->dn_dbufs_mtx); } static int dbuf_block_freeable(dmu_buf_impl_t *db) { dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; uint64_t birth_txg = 0; /* * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr. * * This logic ensures that only block births for * filled blocks are considered. */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_last_dirty && (db->db_blkptr == NULL || !BP_IS_HOLE(db->db_blkptr))) { birth_txg = db->db_last_dirty->dr_txg; } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { birth_txg = db->db_blkptr->blk_birth; } /* * If this block don't exist or is in a snapshot, it can't be freed. * Don't pass the bp to dsl_dataset_block_freeable() since we * are holding the db_mtx lock and might deadlock if we are * prefetching a dedup-ed block. */ if (birth_txg != 0) return (ds == NULL || dsl_dataset_block_freeable(ds, NULL, birth_txg)); else return (B_FALSE); } void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { arc_buf_t *buf, *obuf; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* XXX does *this* func really need the lock? */ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held * is OK, because there can be no other references to the db * when we are changing its size, so no concurrent DB_FILL can * be happening. */ /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) bzero((uint8_t *)buf->b_data + osize, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); VERIFY(arc_buf_remove_ref(obuf, db)); db->db.db_size = size; if (db->db_level == 0) { ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); db->db_last_dirty->dt.dl.dr_data = buf; } mutex_exit(&db->db_mtx); dnode_willuse_space(dn, size-osize, tx); DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { objset_t *os = db->db_objset; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); (void) arc_release(db->db_buf, db); } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. */ ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_objset->os_dsl_dataset == NULL); /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty * in syncing context. */ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* * XXX make this true for indirects too? The problem is that * transactions created with dmu_tx_create_assigned() from * syncing context don't bother holding ahead. */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); /* * Don't set dirtyctx to SYNC if we're just modifying this as we * initialize the objset. */ if (dn->dn_dirtyctx == DN_UNDIRTIED && !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); ASSERT(dn->dn_dirtyctx_firstset == NULL); dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); } mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; /* * If this buffer is already dirty, we're done. */ drp = &db->db_last_dirty; ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) drp = &dr->dr_next; if (dr && dr->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && db->db_state != DB_NOFILL) arc_buf_thaw(db->db_buf); } mutex_exit(&db->db_mtx); return (dr); } /* * Only valid if not already dirty. */ ASSERT(dn->dn_object == 0 || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || dn->dn_next_nlevels[txgoff] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); /* * We should only be dirtying in syncing context if it's the * mos or we're initializing the os or it's a special object. * However, we are allowed to dirty in syncing context provided * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ os = dn->dn_objset; ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID) { /* * Update the accounting. * Note: we delay "free accounting" until after we drop * the db_mtx. This keeps us from grabbing other locks * (and possibly deadlocking) in bp_get_dsize() while * also holding the db_mtx. */ dnode_willuse_space(dn, db->db.db_size, tx); do_free_accounting = dbuf_block_freeable(db); } /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); if (db->db_level == 0) { void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so * that we can modify it without impacting * possible other users of this cached data * block. Note that indirect blocks and * private objects are not released until the * syncing state (since they are only modified * then). */ arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db_buf; } ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&dr->dt.di.dr_children, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; dr->dr_next = *drp; *drp = dr; /* * We could have been freed_in_flight between the dbuf_noread * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } mutex_exit(&dn->dn_mtx); db->db_freed_in_flight = FALSE; } /* * This buffer is now part of this txg */ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); db->db_dirtycnt += 1; ASSERT3U(db->db_dirtycnt, <=, 3); mutex_exit(&db->db_mtx); if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } else if (do_free_accounting) { blkptr_t *bp = db->db_blkptr; int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? bp_get_dsize(os->os_spa, bp) : db->db.db_size; /* * This is only a guess -- if the dbuf is dirty * in a previous txg, we don't know how much * space it will use on disk yet. We should * really have the struct_rwlock to access * db_blkptr, but since this is just a guess, * it's OK if we get an odd answer. */ ddt_prefetch(os->os_spa, bp); dnode_willuse_space(dn, -willfree, tx); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } if (db->db_level == 0) { dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); ASSERT(dn->dn_maxblkid >= db->db_blkid); } if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; int parent_held = FALSE; if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; parent = dbuf_hold_level(dn, db->db_level+1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level+1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); /* * Since we've dropped the mutex, it's possible that * dbuf_undirty() might have changed this out from under us. */ if (db->db_last_dirty == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); } else { ASSERT(db->db_level+1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } /* * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); /* * Due to our use of dn_nlevels below, this can only be called * in open context, unless we are operating on the MOS. * From syncing context, dn_nlevels may be different from the * dn_nlevels used when dbuf was dirtied. */ ASSERT(db->db_objset == dmu_objset_pool(db->db_objset)->dp_meta_objset || txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); /* * If this buffer is not dirty, we're done. */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; if (dr == NULL || dr->dr_txg < txg) return (B_FALSE); ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), dr->dr_accounted, txg); *drp = dr->dr_next; /* * Note that there are three places in dbuf_dirty() * where this dirty record may be put on a list. * Make sure to do a list_remove corresponding to * every one of those list_insert calls. */ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_blkid == DMU_SPILL_BLKID || db->db_level + 1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } DB_DNODE_EXIT(db); if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { arc_buf_t *buf = db->db_buf; ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); dbuf_clear_data(db); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); return (B_TRUE); } return (B_FALSE); } void dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); (void) dbuf_dirty(db, tx); } void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_state = DB_NOFILL; dmu_buf_will_fill(db_fake, tx); } void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); dbuf_noread(db); (void) dbuf_dirty(db, tx); } #pragma weak dmu_buf_fill_done = dbuf_fill_done /* ARGSUSED */ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) { mutex_enter(&db->db_mtx); DBUF_VERIFY(db); if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; } db->db_state = DB_CACHED; cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); } void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; struct dirty_leaf *dl; dmu_object_type_t type; if (etype == BP_EMBEDDED_TYPE_DATA) { ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), SPA_FEATURE_EMBEDDED_DATA)); } DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); dmu_buf_will_not_fill(dbuf, tx); ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); dl = &db->db_last_dirty->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); BP_SET_TYPE(&dl->dr_overridden_by, type); BP_SET_LEVEL(&dl->dr_overridden_by, 0); BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; } /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. */ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); ASSERT(buf != NULL); ASSERT(arc_buf_size(buf) == db->db.db_size); ASSERT(tx->tx_txg != 0); arc_return_buf(buf, db); ASSERT(arc_released(buf)); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); if (db->db_state == DB_CACHED && refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); VERIFY(arc_buf_remove_ref(buf, db)); xuio_stat_wbuf_copied(); return; } xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { ASSERT(dr->dt.dl.dr_data == db->db_buf); if (!arc_released(db->db_buf)) { ASSERT(dr->dt.dl.dr_override_state == DR_OVERRIDDEN); arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; VERIFY(arc_buf_remove_ref(db->db_buf, db)); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); VERIFY(arc_buf_remove_ref(db->db_buf, db)); } db->db_buf = NULL; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); db->db_state = DB_FILL; mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); dmu_buf_fill_done(&db->db, tx); } /* * "Clear" the contents of this dbuf. This will mark the dbuf * EVICTING and clear *most* of its references. Unfortunately, * when we are not holding the dn_dbufs_mtx, we can't clear the * entry in the dn_dbufs list. We have to wait until dbuf_destroy() * in this case. For callers from the DMU we will usually see: * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() * For the arc callback, we will usually see: * dbuf_do_evict()->dbuf_clear();dbuf_destroy() * Sometimes, though, we will get a mix of these two: * DMU: dbuf_clear()->arc_clear_callback() * ARC: dbuf_do_evict()->dbuf_destroy() * * This routine will dissociate the dbuf from the arc, by calling * arc_clear_callback(), but will not evict the data from the ARC. */ void dbuf_clear(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; boolean_t dbuf_gone = B_FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); dbuf_evict_user(db); if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); if (db->db_blkid == DMU_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; db->db_state = DB_UNCACHED; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), * so the dnode cannot be moved until after we release the hold. * The membar_producer() ensures visibility of the decremented * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. */ dnode_rele(dn, db); db->db_dnode_handle = NULL; } else { DB_DNODE_EXIT(db); } if (db->db_buf) dbuf_gone = arc_clear_callback(db->db_buf); if (!dbuf_gone) mutex_exit(&db->db_mtx); /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) dbuf_rele(parent, db); } /* * Note: While bpp will always be updated if the function returns success, * parentp will not be updated if the dnode does not have dn_dbuf filled in; * this happens when the dnode is the meta-dnode, or a userused or groupused * object. */ static int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp) { int nlevels, epbs; *parentp = NULL; *bpp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); if (blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) *bpp = &dn->dn_phys->dn_spill; else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; mutex_exit(&dn->dn_mtx); return (0); } if (dn->dn_phys->dn_nlevels == 0) nlevels = 1; else nlevels = dn->dn_phys->dn_nlevels; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(level * epbs, <, 64); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (level >= nlevels || (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { /* the buffer has no parent yet */ return (SET_ERROR(ENOENT)); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err = dbuf_hold_impl(dn, level+1, blkid >> epbs, fail_sparse, FALSE, NULL, parentp); if (err) return (err); err = dbuf_read(*parentp, NULL, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err) { dbuf_rele(*parentp, NULL); *parentp = NULL; return (err); } *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); return (0); } else { /* the block is referenced from the dnode */ ASSERT3U(level, ==, nlevels-1); ASSERT(dn->dn_phys->dn_nblkptr == 0 || blkid < dn->dn_phys->dn_nblkptr); if (dn->dn_dbuf) { dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; } *bpp = &dn->dn_phys->dn_blkptr[blkid]; return (0); } } static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; db->db_last_dirty = NULL; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; db->db_user = NULL; db->db_immediate_evict = 0; db->db_freed_in_flight = 0; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /* * Hold the dn_dbufs_mtx while we get the new dbuf * in the hash table *and* added to the dbufs list. * This prevents a possible deadlock with someone * trying to look up this dbuf before its added to the * dn_dbufs list. */ mutex_enter(&dn->dn_dbufs_mtx); db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ kmem_cache_free(dbuf_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } avl_add(&dn->dn_dbufs, db); if (db->db_level == 0 && db->db_blkid >= dn->dn_unlisted_l0_blkid) dn->dn_unlisted_l0_blkid = db->db_blkid + 1; db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || refcount_count(&dn->dn_holds) > 0); (void) refcount_add(&dn->dn_holds, db); atomic_inc_32(&dn->dn_dbufs_count); dprintf_dbuf(db, "db=%p\n", db); return (db); } static int dbuf_do_evict(void *private) { dmu_buf_impl_t *db = private; if (!MUTEX_HELD(&db->db_mtx)) mutex_enter(&db->db_mtx); ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_state != DB_EVICTING) { ASSERT(db->db_state == DB_CACHED); DBUF_VERIFY(db); db->db_buf = NULL; dbuf_evict(db); } else { mutex_exit(&db->db_mtx); dbuf_destroy(db); } return (0); } static void dbuf_destroy(dmu_buf_impl_t *db) { ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_blkid != DMU_BONUS_BLKID) { /* * If this dbuf is still on the dn_dbufs list, * remove it from that list. */ if (db->db_dnode_handle != NULL) { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); mutex_enter(&dn->dn_dbufs_mtx); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); mutex_exit(&dn->dn_dbufs_mtx); DB_DNODE_EXIT(db); /* * Decrementing the dbuf count means that the hold * corresponding to the removed dbuf is no longer * discounted in dnode_move(), so the dnode cannot be * moved until after we release the hold. */ dnode_rele(dn, db); db->db_dnode_handle = NULL; } dbuf_hash_remove(db); } db->db_parent = NULL; db->db_buf = NULL; ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); } typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ } dbuf_prefetch_arg_t; /* * Actually issue the prefetch read for the block given. */ static void dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return; arc_flags_t aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &dpa->dpa_zb); } /* * Called when an indirect block above our prefetch target is read in. This * will either read in the next indirect block down the tree or issue the actual * prefetch if the next block down is our target. */ static void dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) { dbuf_prefetch_arg_t *dpa = private; ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); } dpa->dpa_curlevel--; uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { kmem_free(dpa, sizeof (*dpa)); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } (void) arc_buf_remove_ref(abuf, private); } /* * Issue prefetch reads for the given block on the given level. If the indirect * blocks above that block are not in memory, we will read them in * asynchronously. As a result, this call never blocks waiting for a read to * complete. */ void dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags) { blkptr_t bp; int epbs, nlevels, curlevel; uint64_t curblkid; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + + if (blkid > dn->dn_maxblkid) + return; if (dnode_block_freed(dn, blkid)) return; /* * This dnode hasn't been written to disk yet, so there's nothing to * prefetch. */ nlevels = dn->dn_phys->dn_nlevels; if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) return; epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) return; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); if (db != NULL) { mutex_exit(&db->db_mtx); /* * This dbuf already exists. It is either CACHED, or * (we assume) about to be read or filled. */ return; } /* * Find the closest ancestor (indirect block) of the target block * that is present in the cache. In this indirect block, we will * find the bp that is at curlevel, curblkid. */ curlevel = level; curblkid = blkid; while (curlevel < nlevels - 1) { int parent_level = curlevel + 1; uint64_t parent_blkid = curblkid >> epbs; dmu_buf_impl_t *db; if (dbuf_hold_impl(dn, parent_level, parent_blkid, FALSE, TRUE, FTAG, &db) == 0) { blkptr_t *bpp = db->db_buf->b_data; bp = bpp[P2PHASE(curblkid, 1 << epbs)]; dbuf_rele(db, FTAG); break; } curlevel = parent_level; curblkid = parent_blkid; } if (curlevel == nlevels - 1) { /* No cached indirect blocks found. */ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); bp = dn->dn_phys->dn_blkptr[curblkid]; } if (BP_IS_HOLE(&bp)) return; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, level, blkid); dpa->dpa_curlevel = curlevel; dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; /* * If we have the indirect just above us, no need to do the asynchronous * prefetch chain; we'll just run the last step ourselves. If we're at * a higher level, though, we want to issue the prefetches for all the * indirect blocks asynchronously, so we can go on with whatever we were * doing. */ if (curlevel == level) { ASSERT3U(curblkid, ==, blkid); dbuf_issue_final_prefetch(dpa, &bp); kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, curlevel, curblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, &bp, dbuf_prefetch_indirect_done, dpa, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } /* * We use pio here instead of dpa_zio since it's possible that * dpa may have already been freed. */ zio_nowait(pio); } /* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp) { dmu_buf_impl_t *db, *parent = NULL; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); *dbp = NULL; top: /* dbuf_find() returns with db_mtx held */ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); if (db == NULL) { blkptr_t *bp = NULL; int err; if (fail_uncached) return (SET_ERROR(ENOENT)); ASSERT3P(parent, ==, NULL); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); if (fail_sparse) { if (err == 0 && bp && BP_IS_HOLE(bp)) err = SET_ERROR(ENOENT); if (err) { if (parent) dbuf_rele(parent, NULL); return (err); } } if (err && err != ENOENT) return (err); db = dbuf_create(dn, level, blkid, parent, bp); } if (fail_uncached && db->db_state != DB_CACHED) { mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } if (db->db_buf && refcount_is_zero(&db->db_holds)) { arc_buf_add_ref(db->db_buf, db); if (db->db_buf->b_data == NULL) { dbuf_clear(db); if (parent) { dbuf_rele(parent, NULL); parent = NULL; } goto top; } ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); } ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); /* * If this buffer is currently syncing out, and we are are * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; if (dr->dt.dl.dr_data == db->db_buf) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); } } (void) refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ if (parent) dbuf_rele(parent, NULL); ASSERT3P(DB_DNODE(db), ==, dn); ASSERT3U(db->db_blkid, ==, blkid); ASSERT3U(db->db_level, ==, level); *dbp = db; return (0); } dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) { return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); return (err ? NULL : db); } void dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); } int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); if (blksz == 0) blksz = SPA_MINBLOCKSIZE; ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); DB_DNODE_ENTER(db); dn = DB_DNODE(db); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); return (0); } void dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) { dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { int64_t holds = refcount_add(&db->db_holds, tag); ASSERT(holds > 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref boolean_t dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, void *tag) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *found_db; boolean_t result = B_FALSE; if (db->db_blkid == DMU_BONUS_BLKID) found_db = dbuf_find_bonus(os, obj); else found_db = dbuf_find(os, obj, 0, blkid); if (found_db != NULL) { if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { (void) refcount_add(&db->db_holds, tag); result = B_TRUE; } mutex_exit(&db->db_mtx); } return (result); } /* * If you call dbuf_rele() you had better not be referencing the dnode handle * unless you have some other direct or indirect hold on the dnode. (An indirect * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag); } void dmu_buf_rele(dmu_buf_t *db, void *tag) { dbuf_rele((dmu_buf_impl_t *)db, tag); } /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. */ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) { int64_t holds; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus * buffer has a corresponding dnode hold. */ holds = refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); /* * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) arc_buf_freeze(db->db_buf); if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_immediate_evict) dbuf_evict_user(db); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn; /* * If the dnode moves here, we cannot cross this * barrier until the move completes. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); atomic_dec_32(&dn->dn_dbufs_count); /* * Decrementing the dbuf count means that the bonus * buffer's dnode hold is no longer discounted in * dnode_move(). The dnode cannot move until after * the dnode_rele_and_unlock() below. */ DB_DNODE_EXIT(db); /* * Do not reference db after its lock is dropped. * Another thread may evict it. */ mutex_exit(&db->db_mtx); /* * If the dnode has been freed, evict the bonus * buffer immediately. The data in the bonus * buffer is no longer relevant and this prevents * a stale bonus buffer from being associated * with this dnode_t should the dnode_t be reused * prior to being destroyed. */ mutex_enter(&dn->dn_mtx); if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) { /* * Drop dn_mtx. It is a leaf lock and * cannot be held when dnode_evict_bonus() * acquires other locks in order to * perform the eviction. * * Freed dnodes cannot be reused until the * last hold is released. Since this bonus * buffer has a hold, the dnode will remain * in the free state, even without dn_mtx * held, until the dnode_rele_and_unlock() * below. */ mutex_exit(&dn->dn_mtx); dnode_evict_bonus(dn); mutex_enter(&dn->dn_mtx); } dnode_rele_and_unlock(dn, db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); dbuf_evict(db); } else if (arc_released(db->db_buf)) { arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. */ dbuf_clear_data(db); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); } else { VERIFY(!arc_buf_remove_ref(db->db_buf, db)); /* * A dbuf will be eligible for eviction if either the * 'primarycache' property is set or a duplicate * copy of this buffer is already cached in the arc. * * In the case of the 'primarycache' a buffer * is considered for eviction if it matches the * criteria set in the property. * * To decide if our buffer is considered a * duplicate, we must call into the arc to determine * if multiple buffers are referencing the same * block on-disk. If so, then we simply evict * ourselves. */ if (!DBUF_IS_CACHEABLE(db)) { if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr) && !BP_IS_EMBEDDED(db->db_blkptr)) { spa_t *spa = dmu_objset_spa(db->db_objset); blkptr_t bp = *db->db_blkptr; dbuf_clear(db); arc_freed(spa, &bp); } else { dbuf_clear(db); } } else if (db->db_objset->os_evicting || arc_buf_eviction_needed(db->db_buf)) { dbuf_clear(db); } else { mutex_exit(&db->db_mtx); } } } else { mutex_exit(&db->db_mtx); } } #pragma weak dmu_buf_refcount = dbuf_refcount uint64_t dbuf_refcount(dmu_buf_impl_t *db) { return (refcount_count(&db->db_holds)); } void * dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); dbuf_verify_user(db, DBVU_NOT_EVICTING); if (db->db_user == old_user) db->db_user = new_user; else old_user = db->db_user; dbuf_verify_user(db, DBVU_NOT_EVICTING); mutex_exit(&db->db_mtx); return (old_user); } void * dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, NULL, user)); } void * dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_immediate_evict = TRUE; return (dmu_buf_set_user(db_fake, user)); } void * dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_verify_user(db, DBVU_NOT_EVICTING); return (db->db_user); } void dmu_buf_user_evict_wait() { taskq_wait(dbu_evict_taskq); } boolean_t dmu_buf_freeable(dmu_buf_t *dbuf) { boolean_t res = B_FALSE; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; if (db->db_blkptr) res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, db->db_blkptr, db->db_blkptr->blk_birth); return (res); } blkptr_t * dmu_buf_get_blkptr(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_blkptr); } static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { /* ASSERT(dmu_tx_is_syncing(tx) */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_blkptr != NULL) return; if (db->db_blkid == DMU_SPILL_BLKID) { db->db_blkptr = &dn->dn_phys->dn_spill; BP_ZERO(db->db_blkptr); return; } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was * no available blkptrs from the dnode, or it was * inappropriate to hook it in (i.e., nlevels mis-match). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ASSERT(db->db_parent == NULL); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); } else { dmu_buf_impl_t *parent = db->db_parent; int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT(dn->dn_phys->dn_nlevels > 1); if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; } db->db_blkptr = (blkptr_t *)parent->db.db_data + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } } static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; zio_t *zio; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); ASSERT(db->db_level > 0); DBUF_VERIFY(db); /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); DB_DNODE_EXIT(db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); } static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } DBUF_VERIFY(db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_dirty_record_t **drp; ASSERT(*datap != NULL); ASSERT0(db->db_level); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); DB_DNODE_EXIT(db); if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; ASSERT(dr->dr_next == NULL); ASSERT(dr->dr_dbuf == db); *drp = dr->dr_next; if (dr->dr_dbuf->db_level != 0) { list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); return; } os = dn->dn_objset; /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we * don't check the dr_override_state until we have returned from * dbuf_check_blkptr. */ dbuf_check_blkptr(dn, db); /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && refcount_count(&db->db_holds) > 1 && dr->dt.dl.dr_override_state != DR_OVERRIDDEN && *datap == db->db_buf) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), * then make a copy before we start the write so that * any modifications from the open txg will not leak * into this write. * * NOTE: this copy does not need to be made for * objects only modified in the syncing context (e.g. * DNONE_DNODE blocks). */ int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); *datap = arc_buf_alloc(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); DB_DNODE_EXIT(db); } else { /* * Although zio_nowait() does not "wait for an IO", it does * initiate the IO. If this is an empty write it seems plausible * that the IO could actually be completed before the nowait * returns. We need to DB_DNODE_EXIT() first in case * zio_nowait() invalidates the dbuf. */ DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); } } void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; while (dr = list_head(list)) { if (dr->dr_zio != NULL) { /* * If we find an already initialized zio then we * are processing the meta-dnode, and we have finished. * The dbufs for all dnodes are put back on the list * during processing, so that we can zio_wait() * these IOs after initiating all child IOs. */ ASSERT3U(dr->dr_dbuf->db.db_object, ==, DMU_META_DNODE_OBJECT); break; } if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { VERIFY3U(dr->dr_dbuf->db_level, ==, level); } list_remove(list, dr); if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); else dbuf_sync_leaf(dr, tx); } } /* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; int i; ASSERT3P(db->db_blkptr, ==, bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (bp->blk_birth != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_bonustype) || BP_IS_EMBEDDED(bp)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); } mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); } #endif if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); if (db->db_blkid > dn->dn_phys->dn_maxblkid && db->db_blkid != DMU_SPILL_BLKID) dn->dn_phys->dn_maxblkid = db->db_blkid; mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { dnode_phys_t *dnp = db->db.db_data; for (i = db->db.db_size >> DNODE_SHIFT; i > 0; i--, dnp++) { if (dnp->dn_type != DMU_OT_NONE) fill++; } } else { if (BP_IS_HOLE(bp)) { fill = 0; } else { fill = 1; } } } else { blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; fill += BP_GET_FILL(ibp); } } DB_DNODE_EXIT(db); if (!BP_IS_EMBEDDED(bp)) bp->blk_fill = fill; mutex_exit(&db->db_mtx); } /* * The SPA will call this callback several times for each zio - once * for every physical child i/o (zio->io_phys_children times). This * allows the DMU to monitor the progress of each logical i/o. For example, * there may be 2 copies of an indirect block, or many fragments of a RAID-Z * block. There may be a long delay before all copies/fragments are completed, * so this callback allows us to retire dirty space gradually, as the physical * i/os complete. */ /* ARGSUSED */ static void dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) { dmu_buf_impl_t *db = arg; objset_t *os = db->db_objset; dsl_pool_t *dp = dmu_objset_pool(os); dbuf_dirty_record_t *dr; int delta = 0; dr = db->db_data_pending; ASSERT3U(dr->dr_txg, ==, zio->io_txg); /* * The callback will be called io_phys_children times. Retire one * portion of our dirty space each time we are called. Any rounding * error will be cleaned up by dsl_pool_sync()'s call to * dsl_pool_undirty_space(). */ delta = dr->dr_accounted / zio->io_phys_children; dsl_pool_undirty_space(dp, delta, zio->io_txg); } /* ARGSUSED */ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; blkptr_t *bp_orig = &zio->io_bp_orig; blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; dbuf_dirty_record_t **drp, *dr; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); /* * For nopwrites and rewrites we ensure that the bp matches our * original and bypass all the accounting. */ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } mutex_enter(&db->db_mtx); DBUF_VERIFY(db); drp = &db->db_last_dirty; while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); DB_DNODE_EXIT(db); } #endif if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); else if (!arc_released(db->db_buf)) arc_set_callback(db->db_buf, dbuf_do_evict, db); } } else { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(db->db_blkid, <=, dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); if (!arc_released(db->db_buf)) arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); } static void dbuf_write_nofill_ready(zio_t *zio) { dbuf_write_ready(zio, NULL, zio->io_private); } static void dbuf_write_nofill_done(zio_t *zio) { dbuf_write_done(zio, NULL, zio->io_private); } static void dbuf_write_override_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; dbuf_write_ready(zio, NULL, db); } static void dbuf_write_override_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *obp = &dr->dt.dl.dr_overridden_by; mutex_enter(&db->db_mtx); if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); } /* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; int wp_flag = 0; DB_DNODE_ENTER(db); dn = DB_DNODE(db); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* * Private object buffers are released here rather * than in dbuf_dirty() since they are only modified * in the syncing context and we don't want the * overhead of making multiple copies of the data. */ if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { dbuf_release_bp(db); } } } if (parent != dn->dn_dbuf) { /* Our parent is an indirect block. */ /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); /* * We're about to modify our parent's db_data by modifying * our block pointer, so the parent must be released. */ ASSERT(arc_released(parent->db_buf)); zio = parent->db_data_pending->dr_zio; } else { /* Our parent is the dnode itself. */ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); zio = dn->dn_zio; } ASSERT(db->db_level == 0 || data == db->db_buf); ASSERT3U(db->db_blkptr->blk_birth, <=, txg); ASSERT(zio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); DB_DNODE_EXIT(db); if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ void *contents = (data != NULL) ? data->b_data : NULL; dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, contents, db->db.db_size, &zp, dbuf_write_override_ready, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, NULL, db->db.db_size, &zp, dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); dr->dr_zio = arc_write(zio, os->os_spa, txg, db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } Index: projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c =================================================================== --- projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 287721) +++ projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 287722) @@ -1,2112 +1,2120 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ /* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif /* * Enable/disable nopwrite feature. */ int zfs_nopwrite_enabled = 1; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, "object directory" }, { DMU_BSWAP_UINT64, TRUE, "object array" }, { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, { DMU_BSWAP_UINT64, TRUE, "bpobj" }, { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, { DMU_BSWAP_ZAP, TRUE, "DSL props" }, { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, { DMU_BSWAP_UINT8, FALSE, "zvol object" }, { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, { DMU_BSWAP_UINT8, TRUE, "SPA history" }, { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, { DMU_BSWAP_UINT8, TRUE, "FUID table" }, { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, { DMU_BSWAP_UINT8, TRUE, "System attributes" }, { DMU_BSWAP_ZAP, TRUE, "SA master node" }, { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, { DMU_BSWAP_ZAP, TRUE, "scan translations" }, { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else if (newsize < 0 || newsize > db_fake->db_size) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = SET_ERROR(EINVAL); } else if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; dmu_buf_impl_t *db; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); *dbp = &db->db; return (0); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. */ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); ASSERT(db != NULL); err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else dbuf_rele(db, tag); return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) + boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio; ASSERT(length <= DMU_MAX_ACCESS); - dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; - if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) - dbuf_flags |= DB_RF_NOPREFETCH; + /* + * Note: We directly notify the prefetch code of this read, so that + * we can tell it about the multi-block read. dbuf_read() only knows + * about the one block it is accessing. + */ + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | + DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; - nblks = (P2ROUNDUP(offset+length, 1ULL<> blkshift; + nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - + P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); + dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); zio_nowait(zio); return (SET_ERROR(EIO)); } + /* initiate async i/o */ if (read) (void) dbuf_read(db, zio, dbuf_flags); #ifdef _KERNEL else curthread->td_ru.ru_oublock++; #endif dbp[i] = &db->db; } + + if ((flags & DMU_READ_NO_PREFETCH) == 0 && read && + length < zfetch_array_rd_sz) { + dmu_zfetch(&dn->dn_zfetch, blkid, nblks); + } rw_exit(&dn->dn_struct_rwlock); /* wait for async i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) + uint64_t length, boolean_t read, void *tag, int *numbufsp, + dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch i/os for the given blocks. If level is greater than 0, the * indirect blocks prefeteched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. * * Note that if the indirect blocks above the blocks being prefetched are not in * cache, they will be asychronously read in. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; int nblks, err; - - if (zfs_prefetch_disable) - return; if (len == 0) { /* they're interested in the bonus buffer */ dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, level, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the * last block we want to prefetch, and dbuf_whichblock(dn, level, * offset) is the first. Then the number we need to prefetch is the * last - first + 1. */ if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; } else { nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { blkid = dbuf_whichblock(dn, level, offset); for (int i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed. */ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) { uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT3U(minimum, <=, *start); if (*start - minimum <= iblkrange * maxblks) { *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* * dnode_next_offset(BACKWARDS) will find an allocated L1 * indirect block at or before the input offset. We must * decrement *start so that it is at the end of the region * to search. */ (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { *start = minimum; break; } else if (err != 0) { return (err); } /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); } if (*start < minimum) *start = minimum; return (0); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; int err; if (offset >= object_size) return (0); if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin; chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_end - chunk_begin); /* * Mark this transaction as typically resulting in a net * reduction in space used. */ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx); dmu_tx_commit(tx); length -= chunk_end - chunk_begin; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. */ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == -1ULL || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; dmu_buf_t **dbp; int numbufs, err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); bcopy((char *)db->db_data + bufoff, buf, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } dnode_rele(dn, FTAG); return (err); } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); bcopy(buf, (char *)db->db_data + bufoff, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db)); dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, uncompressed_size, compressed_size, byteorder, tx); dmu_buf_rele(db, FTAG); } /* * DMU support for xuio */ kstat_t *xuio_ksp = NULL; int dmu_xuio_init(xuio_t *xuio, int nblk) { dmu_xuio_t *priv; uio_t *uio = &xuio->xu_uio; uio->uio_iovcnt = nblk; uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); priv->cnt = nblk; priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); priv->iovp = uio->uio_iov; XUIO_XUZC_PRIV(xuio) = priv; if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); return (0); } void dmu_xuio_fini(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int nblk = priv->cnt; kmem_free(priv->iovp, nblk * sizeof (iovec_t)); kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); kmem_free(priv, sizeof (dmu_xuio_t)); if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); } /* * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } * and increase priv->next by 1. */ int dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) { struct iovec *iov; uio_t *uio = &xuio->xu_uio; dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int i = priv->next++; ASSERT(i < priv->cnt); ASSERT(off + n <= arc_buf_size(abuf)); iov = uio->uio_iov + i; iov->iov_base = (char *)abuf->b_data + off; iov->iov_len = n; priv->bufs[i] = abuf; return (0); } int dmu_xuio_cnt(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); return (priv->cnt); } arc_buf_t * dmu_xuio_arcbuf(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); return (priv->bufs[i]); } void dmu_xuio_clear(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); priv->bufs[i] = NULL; } static void xuio_stat_init(void) { xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (xuio_ksp != NULL) { xuio_ksp->ks_data = &xuio_stats; kstat_install(xuio_ksp); } } static void xuio_stat_fini(void) { if (xuio_ksp != NULL) { kstat_delete(xuio_ksp); xuio_ksp = NULL; } } void xuio_stat_wbuf_copied() { XUIOSTAT_BUMP(xuiostat_wbuf_copied); } void xuio_stat_wbuf_nocopy() { XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); } #ifdef _KERNEL static int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; xuio_t *xuio = NULL; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); #ifdef UIO_XUIO if (uio->uio_extflg == UIO_XUIO) xuio = (xuio_t *)uio; #endif for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); if (xuio) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; arc_buf_t *dbuf_abuf = dbi->db_buf; arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); if (!err) { uio->uio_resid -= tocpy; uio->uio_loffset += tocpy; } if (abuf == dbuf_abuf) XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else { err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); } if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_read_uio_dnode(dn, uio, size); DB_DNODE_EXIT(db); return (err); } /* * Read 'size' bytes into the uio buffer. * From the specified object * Starting at offset uio->uio_loffset. */ int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } static int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); /* * XXX uiomove could block forever (eg. nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that uiomove won't * block. */ err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. * To the specified object. * Starting at offset uio->uio_loffset. */ int dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } #ifdef illumos int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, page_t *pp, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; int err; if (size == 0) return (0); err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy, copied, thiscpy; int bufoff; dmu_buf_t *db = dbp[i]; caddr_t va; ASSERT(size > 0); ASSERT3U(db->db_size, >=, PAGESIZE); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); va = zfs_map_page(pp, S_READ); bcopy(va, (char *)db->db_data + bufoff, thiscpy); zfs_unmap_page(pp, va); pp = pp->p_next; bufoff += PAGESIZE; } if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } #else /* !illumos */ int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, vm_page_t *ma, dmu_tx_t *tx) { dmu_buf_t **dbp; struct sf_buf *sf; int numbufs, i; int err; if (size == 0) return (0); err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy, copied, thiscpy; int bufoff; dmu_buf_t *db = dbp[i]; caddr_t va; ASSERT(size > 0); ASSERT3U(db->db_size, >=, PAGESIZE); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); va = zfs_map_page(*ma, &sf); bcopy(va, (char *)db->db_data + bufoff, thiscpy); zfs_unmap_page(sf); ma += 1; bufoff += PAGESIZE; } if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } #endif /* illumos */ #endif /* _KERNEL */ /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; return (arc_loan_buf(db->db_objset->os_spa, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); VERIFY(arc_buf_remove_ref(buf, FTAG)); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_size(buf); uint64_t blkid; DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(dbuf); /* * We can only assign if the offset is aligned, the arc buf is the * same size as the dbuf, and the dbuf is not metadata. It * can't be metadata because the loaned arc buf comes from the * user-data kmem arena. */ if (offset == db->db.db_offset && blksz == db->db.db_size && DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) { dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { objset_t *os; uint64_t object; DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); os = dn->dn_objset; object = dn->dn_object; DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } } typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; zgd_t *dsa_zgd; dmu_tx_t *dsa_tx; } dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { if (BP_IS_HOLE(bp)) { /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint8_t chksum = BP_GET_CHECKSUM(bp_orig); ASSERT(BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); ASSERT(zio_checksum_table[chksum].ci_dedup); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; /* * Old style holes are filled with all zeros, whereas * new-style holes maintain their lsize, type, level, * and birth time (see zio_write_compress). While we * need to reset the BP_SET_LSIZE() call that happened * in dmu_sync_ready for old style holes, we do *not* * want to wipe out the information contained in new * style holes. Thus, only zero out the block pointer if * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && dr->dt.dl.dr_overridden_by.blk_birth == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; blkptr_t *bp_orig = &zio->io_bp_orig; if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { /* * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) * then there is nothing to do here. Otherwise, free the * newly allocated block in this txg. */ if (zio->io_flags & ZIO_FLAG_NOPWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); } dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it. * * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. * The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). */ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { blkptr_t *bp = zgd->zgd_bp; dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = db->db_last_dirty; while (dr && dr->dr_txg != txg) dr = dr->dr_next; if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); /* * Assume the on-disk data is X, the current syncing data (in * txg - 1) is Y, and the current in-memory data is Z (currently * in dmu_sync). * * We usually want to perform a nopwrite if X and Z are the * same. However, if Y is different (i.e. the BP is going to * change before this write takes effect), then a nopwrite will * be incorrect - we would override with X, which could have * been freed when Y was written. * * (Note that this is not a concern when we are nop-writing from * syncing context, because X and Y must be identical, because * all previous txgs have been synced.) * * Therefore, we disable nopwrite if the current BP could change * before this TXG. There are two ways it could change: by * being dirty (dr_next is non-NULL), or by being freed * (dnode_block_freed()). This behavior is verified by * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EALREADY)); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's checksum function. This * check ensures that the receiving system can understand the * checksum function transmitted. */ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's compression function. This * check ensures that the receiving system can understand the * compression function transmitted. */ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } int zfs_mdcomp_disable = 0; SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN, &zfs_mdcomp_disable, 0, "Disable metadata compression"); /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. */ int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; int copies = os->os_copies; /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { if (zfs_mdcomp_disable) { compress = ZIO_COMPRESS_EMPTY; } else { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. */ compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); } /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ if (zio_checksum_table[checksum].ci_correctable < 1 || zio_checksum_table[checksum].ci_eck) checksum = ZIO_CHECKSUM_FLETCHER_4; if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_MOST && (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) copies++; } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_NOPARITY; } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checkum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!zio_checksum_table[checksum].ci_dedup) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have a cryptographically secure * checksum that has no known collisions (i.e. SHA-256) * and compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually exclusive. */ nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } zp->zp_checksum = checksum; zp->zp_compress = compress; zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; } int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int err; /* * Sync any current changes before * we go trundling through the block pointers. */ err = dmu_object_wait_synced(os, object); if (err) { return (err); } err = dnode_hold(os, object, FTAG, &dn); if (err) { return (err); } err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); dnode_rele(dn, FTAG); return (err); } /* * Given the ZFS object, if it contains any dirty nodes * this function flushes all dirty blocks to disk. This * ensures the DMU object info is updated. A more efficient * future version might just find the TXG with the maximum * ID and wait for that to be synced. */ int dmu_object_wait_synced(objset_t *os, uint64_t object) { dnode_t *dn; int error, i; error = dnode_hold(os, object, FTAG, &dn); if (error) { return (error); } for (i = 0; i < TXG_SIZE; i++) { if (list_link_active(&dn->dn_dirty_link[i])) { break; } } dnode_rele(dn, FTAG); if (i != TXG_SIZE) { txg_wait_synced(dmu_objset_pool(os), 0); } return (0); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp; rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. * This is specifically optimized for zfs_getattr(). */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add 1 for dnode space */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + 1; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } /* ARGSUSED */ void byteswap_uint8_array(void *vbuf, size_t size) { } void dmu_init(void) { zfs_dbgmsg_init(); sa_cache_init(); xuio_stat_init(); dmu_objset_init(); dnode_init(); dbuf_init(); zfetch_init(); zio_compress_init(); l2arc_init(); arc_init(); } void dmu_fini(void) { arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); zfetch_fini(); zio_compress_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); } Index: projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c =================================================================== --- projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c (revision 287721) +++ projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c (revision 287722) @@ -1,758 +1,296 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include /* - * I'm against tune-ables, but these should probably exist as tweakable globals - * until we can get this working the way we want it to. + * This tunable disables predictive prefetch. Note that it leaves "prescient" + * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, + * prescient prefetch never issues i/os that end up not being needed, + * so it can't hurt performance. */ +boolean_t zfs_prefetch_disable = B_FALSE; -int zfs_prefetch_disable = 0; - /* max # of streams per zfetch */ uint32_t zfetch_max_streams = 8; /* min time before stream reclaim */ uint32_t zfetch_min_sec_reap = 2; -/* max number of blocks to fetch at a time */ -uint32_t zfetch_block_cap = 256; -/* number of bytes in a array_read at which we stop prefetching (1Mb) */ +/* max bytes to prefetch per stream (default 8MB) */ +uint32_t zfetch_max_distance = 8 * 1024 * 1024; +/* number of bytes in a array_read at which we stop prefetching (1MB) */ uint64_t zfetch_array_rd_sz = 1024 * 1024; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW, &zfs_prefetch_disable, 0, "Disable prefetch"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN, &zfetch_max_streams, 0, "Max # of streams per zfetch"); SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN, &zfetch_min_sec_reap, 0, "Min time before stream reclaim"); -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RWTUN, - &zfetch_block_cap, 0, "Max number of blocks to fetch at a time"); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, + &zfetch_max_distance, 0, "Max bytes to prefetch per stream"); SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN, &zfetch_array_rd_sz, 0, "Number of bytes in a array_read at which we stop prefetching"); -/* forward decls for static routines */ -static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *); -static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); -static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); -static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); -static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int); -static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); -static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); -static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); -static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *); - typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; kstat_named_t zfetchstat_misses; - kstat_named_t zfetchstat_colinear_hits; - kstat_named_t zfetchstat_colinear_misses; - kstat_named_t zfetchstat_stride_hits; - kstat_named_t zfetchstat_stride_misses; - kstat_named_t zfetchstat_reclaim_successes; - kstat_named_t zfetchstat_reclaim_failures; - kstat_named_t zfetchstat_stream_resets; - kstat_named_t zfetchstat_stream_noresets; - kstat_named_t zfetchstat_bogus_streams; + kstat_named_t zfetchstat_max_streams; } zfetch_stats_t; static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, - { "colinear_hits", KSTAT_DATA_UINT64 }, - { "colinear_misses", KSTAT_DATA_UINT64 }, - { "stride_hits", KSTAT_DATA_UINT64 }, - { "stride_misses", KSTAT_DATA_UINT64 }, - { "reclaim_successes", KSTAT_DATA_UINT64 }, - { "reclaim_failures", KSTAT_DATA_UINT64 }, - { "streams_resets", KSTAT_DATA_UINT64 }, - { "streams_noresets", KSTAT_DATA_UINT64 }, - { "bogus_streams", KSTAT_DATA_UINT64 }, + { "max_streams", KSTAT_DATA_UINT64 }, }; -#define ZFETCHSTAT_INCR(stat, val) \ - atomic_add_64(&zfetch_stats.stat.value.ui64, (val)); +#define ZFETCHSTAT_BUMP(stat) \ + atomic_inc_64(&zfetch_stats.stat.value.ui64); -#define ZFETCHSTAT_BUMP(stat) ZFETCHSTAT_INCR(stat, 1); - kstat_t *zfetch_ksp; -/* - * Given a zfetch structure and a zstream structure, determine whether the - * blocks to be read are part of a co-linear pair of existing prefetch - * streams. If a set is found, coalesce the streams, removing one, and - * configure the prefetch so it looks for a strided access pattern. - * - * In other words: if we find two sequential access streams that are - * the same length and distance N appart, and this read is N from the - * last stream, then we are probably in a strided access pattern. So - * combine the two sequential streams into a single strided stream. - * - * Returns whether co-linear streams were found. - */ -static boolean_t -dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) -{ - zstream_t *z_walk; - zstream_t *z_comp; - - if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) - return (0); - - if (zh == NULL) { - rw_exit(&zf->zf_rwlock); - return (0); - } - - for (z_walk = list_head(&zf->zf_stream); z_walk; - z_walk = list_next(&zf->zf_stream, z_walk)) { - for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp; - z_comp = list_next(&zf->zf_stream, z_comp)) { - int64_t diff; - - if (z_walk->zst_len != z_walk->zst_stride || - z_comp->zst_len != z_comp->zst_stride) { - continue; - } - - diff = z_comp->zst_offset - z_walk->zst_offset; - if (z_comp->zst_offset + diff == zh->zst_offset) { - z_walk->zst_offset = zh->zst_offset; - z_walk->zst_direction = diff < 0 ? -1 : 1; - z_walk->zst_stride = - diff * z_walk->zst_direction; - z_walk->zst_ph_offset = - zh->zst_offset + z_walk->zst_stride; - dmu_zfetch_stream_remove(zf, z_comp); - mutex_destroy(&z_comp->zst_lock); - kmem_free(z_comp, sizeof (zstream_t)); - - dmu_zfetch_dofetch(zf, z_walk); - - rw_exit(&zf->zf_rwlock); - return (1); - } - - diff = z_walk->zst_offset - z_comp->zst_offset; - if (z_walk->zst_offset + diff == zh->zst_offset) { - z_walk->zst_offset = zh->zst_offset; - z_walk->zst_direction = diff < 0 ? -1 : 1; - z_walk->zst_stride = - diff * z_walk->zst_direction; - z_walk->zst_ph_offset = - zh->zst_offset + z_walk->zst_stride; - dmu_zfetch_stream_remove(zf, z_comp); - mutex_destroy(&z_comp->zst_lock); - kmem_free(z_comp, sizeof (zstream_t)); - - dmu_zfetch_dofetch(zf, z_walk); - - rw_exit(&zf->zf_rwlock); - return (1); - } - } - } - - rw_exit(&zf->zf_rwlock); - return (0); -} - -/* - * Given a zstream_t, determine the bounds of the prefetch. Then call the - * routine that actually prefetches the individual blocks. - */ -static void -dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) -{ - uint64_t prefetch_tail; - uint64_t prefetch_limit; - uint64_t prefetch_ofst; - uint64_t prefetch_len; - uint64_t blocks_fetched; - - zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len); - zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap); - - prefetch_tail = MAX((int64_t)zs->zst_ph_offset, - (int64_t)(zs->zst_offset + zs->zst_stride)); - /* - * XXX: use a faster division method? - */ - prefetch_limit = zs->zst_offset + zs->zst_len + - (zs->zst_cap * zs->zst_stride) / zs->zst_len; - - while (prefetch_tail < prefetch_limit) { - prefetch_ofst = zs->zst_offset + zs->zst_direction * - (prefetch_tail - zs->zst_offset); - - prefetch_len = zs->zst_len; - - /* - * Don't prefetch beyond the end of the file, if working - * backwards. - */ - if ((zs->zst_direction == ZFETCH_BACKWARD) && - (prefetch_ofst > prefetch_tail)) { - prefetch_len += prefetch_ofst; - prefetch_ofst = 0; - } - - /* don't prefetch more than we're supposed to */ - if (prefetch_len > zs->zst_len) - break; - - blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode, - prefetch_ofst, zs->zst_len); - - prefetch_tail += zs->zst_stride; - /* stop if we've run out of stuff to prefetch */ - if (blocks_fetched < zs->zst_len) - break; - } - zs->zst_ph_offset = prefetch_tail; - zs->zst_last = ddi_get_lbolt(); -} - void zfetch_init(void) { - zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zfetch_ksp != NULL) { zfetch_ksp->ks_data = &zfetch_stats; kstat_install(zfetch_ksp); } } void zfetch_fini(void) { if (zfetch_ksp != NULL) { kstat_delete(zfetch_ksp); zfetch_ksp = NULL; } } /* * This takes a pointer to a zfetch structure and a dnode. It performs the * necessary setup for the zfetch structure, grokking data from the * associated dnode. */ void dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) { - if (zf == NULL) { + if (zf == NULL) return; - } zf->zf_dnode = dno; - zf->zf_stream_cnt = 0; - zf->zf_alloc_fail = 0; list_create(&zf->zf_stream, sizeof (zstream_t), - offsetof(zstream_t, zst_node)); + offsetof(zstream_t, zs_node)); rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); } -/* - * This function computes the actual size, in blocks, that can be prefetched, - * and fetches it. - */ -static uint64_t -dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) +static void +dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) { - uint64_t fetchsz; - uint64_t i; - - fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); - - for (i = 0; i < fetchsz; i++) { - dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ, - ARC_FLAG_PREFETCH); - } - - return (fetchsz); + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + list_remove(&zf->zf_stream, zs); + mutex_destroy(&zs->zs_lock); + kmem_free(zs, sizeof (*zs)); } /* - * this function returns the number of blocks that would be prefetched, based - * upon the supplied dnode, blockid, and nblks. This is used so that we can - * update streams in place, and then prefetch with their old value after the - * fact. This way, we can delay the prefetch, but subsequent accesses to the - * stream won't result in the same data being prefetched multiple times. + * Clean-up state associated with a zfetch structure (e.g. destroy the + * streams). This doesn't free the zfetch_t itself, that's left to the caller. */ -static uint64_t -dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) -{ - uint64_t fetchsz; - - if (blkid > dn->dn_maxblkid) { - return (0); - } - - /* compute fetch size */ - if (blkid + nblks + 1 > dn->dn_maxblkid) { - fetchsz = (dn->dn_maxblkid - blkid) + 1; - ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid); - } else { - fetchsz = nblks; - } - - - return (fetchsz); -} - -/* - * given a zfetch and a zstream structure, see if there is an associated zstream - * for this block read. If so, it starts a prefetch for the stream it - * located and returns true, otherwise it returns false - */ -static boolean_t -dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) -{ - zstream_t *zs; - int64_t diff; - int reset = !prefetched; - int rc = 0; - - if (zh == NULL) - return (0); - - /* - * XXX: This locking strategy is a bit coarse; however, it's impact has - * yet to be tested. If this turns out to be an issue, it can be - * modified in a number of different ways. - */ - - rw_enter(&zf->zf_rwlock, RW_READER); -top: - - for (zs = list_head(&zf->zf_stream); zs; - zs = list_next(&zf->zf_stream, zs)) { - - /* - * XXX - should this be an assert? - */ - if (zs->zst_len == 0) { - /* bogus stream */ - ZFETCHSTAT_BUMP(zfetchstat_bogus_streams); - continue; - } - - /* - * We hit this case when we are in a strided prefetch stream: - * we will read "len" blocks before "striding". - */ - if (zh->zst_offset >= zs->zst_offset && - zh->zst_offset < zs->zst_offset + zs->zst_len) { - if (prefetched) { - /* already fetched */ - ZFETCHSTAT_BUMP(zfetchstat_stride_hits); - rc = 1; - goto out; - } else { - ZFETCHSTAT_BUMP(zfetchstat_stride_misses); - } - } - - /* - * This is the forward sequential read case: we increment - * len by one each time we hit here, so we will enter this - * case on every read. - */ - if (zh->zst_offset == zs->zst_offset + zs->zst_len) { - - reset = !prefetched && zs->zst_len > 1; - - if (mutex_tryenter(&zs->zst_lock) == 0) { - rc = 1; - goto out; - } - - if (zh->zst_offset != zs->zst_offset + zs->zst_len) { - mutex_exit(&zs->zst_lock); - goto top; - } - zs->zst_len += zh->zst_len; - diff = zs->zst_len - zfetch_block_cap; - if (diff > 0) { - zs->zst_offset += diff; - zs->zst_len = zs->zst_len > diff ? - zs->zst_len - diff : 0; - } - zs->zst_direction = ZFETCH_FORWARD; - - break; - - /* - * Same as above, but reading backwards through the file. - */ - } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) { - /* backwards sequential access */ - - reset = !prefetched && zs->zst_len > 1; - - if (mutex_tryenter(&zs->zst_lock) == 0) { - rc = 1; - goto out; - } - - if (zh->zst_offset != zs->zst_offset - zh->zst_len) { - mutex_exit(&zs->zst_lock); - goto top; - } - - zs->zst_offset = zs->zst_offset > zh->zst_len ? - zs->zst_offset - zh->zst_len : 0; - zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ? - zs->zst_ph_offset - zh->zst_len : 0; - zs->zst_len += zh->zst_len; - - diff = zs->zst_len - zfetch_block_cap; - if (diff > 0) { - zs->zst_ph_offset = zs->zst_ph_offset > diff ? - zs->zst_ph_offset - diff : 0; - zs->zst_len = zs->zst_len > diff ? - zs->zst_len - diff : zs->zst_len; - } - zs->zst_direction = ZFETCH_BACKWARD; - - break; - - } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride < - zs->zst_len) && (zs->zst_len != zs->zst_stride)) { - /* strided forward access */ - - if (mutex_tryenter(&zs->zst_lock) == 0) { - rc = 1; - goto out; - } - - if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >= - zs->zst_len) || (zs->zst_len == zs->zst_stride)) { - mutex_exit(&zs->zst_lock); - goto top; - } - - zs->zst_offset += zs->zst_stride; - zs->zst_direction = ZFETCH_FORWARD; - - break; - - } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride < - zs->zst_len) && (zs->zst_len != zs->zst_stride)) { - /* strided reverse access */ - - if (mutex_tryenter(&zs->zst_lock) == 0) { - rc = 1; - goto out; - } - - if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >= - zs->zst_len) || (zs->zst_len == zs->zst_stride)) { - mutex_exit(&zs->zst_lock); - goto top; - } - - zs->zst_offset = zs->zst_offset > zs->zst_stride ? - zs->zst_offset - zs->zst_stride : 0; - zs->zst_ph_offset = (zs->zst_ph_offset > - (2 * zs->zst_stride)) ? - (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0; - zs->zst_direction = ZFETCH_BACKWARD; - - break; - } - } - - if (zs) { - if (reset) { - zstream_t *remove = zs; - - ZFETCHSTAT_BUMP(zfetchstat_stream_resets); - rc = 0; - mutex_exit(&zs->zst_lock); - rw_exit(&zf->zf_rwlock); - rw_enter(&zf->zf_rwlock, RW_WRITER); - /* - * Relocate the stream, in case someone removes - * it while we were acquiring the WRITER lock. - */ - for (zs = list_head(&zf->zf_stream); zs; - zs = list_next(&zf->zf_stream, zs)) { - if (zs == remove) { - dmu_zfetch_stream_remove(zf, zs); - mutex_destroy(&zs->zst_lock); - kmem_free(zs, sizeof (zstream_t)); - break; - } - } - } else { - ZFETCHSTAT_BUMP(zfetchstat_stream_noresets); - rc = 1; - dmu_zfetch_dofetch(zf, zs); - mutex_exit(&zs->zst_lock); - } - } -out: - rw_exit(&zf->zf_rwlock); - return (rc); -} - -/* - * Clean-up state associated with a zfetch structure. This frees allocated - * structure members, empties the zf_stream tree, and generally makes things - * nice. This doesn't free the zfetch_t itself, that's left to the caller. - */ void -dmu_zfetch_rele(zfetch_t *zf) +dmu_zfetch_fini(zfetch_t *zf) { - zstream_t *zs; - zstream_t *zs_next; + zstream_t *zs; ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); - for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { - zs_next = list_next(&zf->zf_stream, zs); - - list_remove(&zf->zf_stream, zs); - mutex_destroy(&zs->zst_lock); - kmem_free(zs, sizeof (zstream_t)); - } + rw_enter(&zf->zf_rwlock, RW_WRITER); + while ((zs = list_head(&zf->zf_stream)) != NULL) + dmu_zfetch_stream_remove(zf, zs); + rw_exit(&zf->zf_rwlock); list_destroy(&zf->zf_stream); rw_destroy(&zf->zf_rwlock); zf->zf_dnode = NULL; } /* - * Given a zfetch and zstream structure, insert the zstream structure into the - * AVL tree contained within the zfetch structure. Peform the appropriate - * book-keeping. It is possible that another thread has inserted a stream which - * matches one that we are about to insert, so we must be sure to check for this - * case. If one is found, return failure, and let the caller cleanup the - * duplicates. + * If there aren't too many streams already, create a new stream. + * The "blkid" argument is the next block that we expect this stream to access. + * While we're here, clean up old streams (which haven't been + * accessed for at least zfetch_min_sec_reap seconds). */ -static int -dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs) +static void +dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { - zstream_t *zs_walk; - zstream_t *zs_next; + zstream_t *zs_next; + int numstreams = 0; ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); - for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) { - zs_next = list_next(&zf->zf_stream, zs_walk); - - if (dmu_zfetch_streams_equal(zs_walk, zs)) { - return (0); - } + /* + * Clean up old streams. + */ + for (zstream_t *zs = list_head(&zf->zf_stream); + zs != NULL; zs = zs_next) { + zs_next = list_next(&zf->zf_stream, zs); + if (((gethrtime() - zs->zs_atime) / NANOSEC) > + zfetch_min_sec_reap) + dmu_zfetch_stream_remove(zf, zs); + else + numstreams++; } - list_insert_head(&zf->zf_stream, zs); - zf->zf_stream_cnt++; - return (1); -} - - -/* - * Walk the list of zstreams in the given zfetch, find an old one (by time), and - * reclaim it for use by the caller. - */ -static zstream_t * -dmu_zfetch_stream_reclaim(zfetch_t *zf) -{ - zstream_t *zs; - clock_t ticks; - - ticks = zfetch_min_sec_reap * hz; - if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) - return (0); - - for (zs = list_head(&zf->zf_stream); zs; - zs = list_next(&zf->zf_stream, zs)) { - - if (ddi_get_lbolt() - zs->zst_last > ticks) - break; + /* + * The maximum number of streams is normally zfetch_max_streams, + * but for small files we lower it such that it's at least possible + * for all the streams to be non-overlapping. + * + * If we are already at the maximum number of streams for this file, + * even after removing old streams, then don't create this stream. + */ + uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, + zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / + zfetch_max_distance)); + if (numstreams >= max_streams) { + ZFETCHSTAT_BUMP(zfetchstat_max_streams); + return; } - if (zs) { - dmu_zfetch_stream_remove(zf, zs); - mutex_destroy(&zs->zst_lock); - bzero(zs, sizeof (zstream_t)); - } else { - zf->zf_alloc_fail++; - } - rw_exit(&zf->zf_rwlock); + zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); + zs->zs_blkid = blkid; + zs->zs_pf_blkid = blkid; + zs->zs_atime = gethrtime(); + mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL); - return (zs); + list_insert_head(&zf->zf_stream, zs); } /* - * Given a zfetch and zstream structure, remove the zstream structure from its - * container in the zfetch structure. Perform the appropriate book-keeping. - */ -static void -dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) -{ - ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); - - list_remove(&zf->zf_stream, zs); - zf->zf_stream_cnt--; -} - -static int -dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2) -{ - if (zs1->zst_offset != zs2->zst_offset) - return (0); - - if (zs1->zst_len != zs2->zst_len) - return (0); - - if (zs1->zst_stride != zs2->zst_stride) - return (0); - - if (zs1->zst_ph_offset != zs2->zst_ph_offset) - return (0); - - if (zs1->zst_cap != zs2->zst_cap) - return (0); - - if (zs1->zst_direction != zs2->zst_direction) - return (0); - - return (1); -} - -/* * This is the prefetch entry point. It calls all of the other dmu_zfetch * routines to create, delete, find, or operate upon prefetch streams. */ void -dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) +dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks) { - zstream_t zst; - zstream_t *newstream; - boolean_t fetched; - int inserted; - unsigned int blkshft; - uint64_t blksz; + zstream_t *zs; if (zfs_prefetch_disable) return; - /* files that aren't ln2 blocksz are only one block -- nothing to do */ - if (!zf->zf_dnode->dn_datablkshift) + /* + * As a fast path for small (single-block) files, ignore access + * to the first block. + */ + if (blkid == 0) return; - /* convert offset and size, into blockid and nblocks */ - blkshft = zf->zf_dnode->dn_datablkshift; - blksz = (1 << blkshft); + rw_enter(&zf->zf_rwlock, RW_READER); - bzero(&zst, sizeof (zstream_t)); - zst.zst_offset = offset >> blkshft; - zst.zst_len = (P2ROUNDUP(offset + size, blksz) - - P2ALIGN(offset, blksz)) >> blkshft; - - fetched = dmu_zfetch_find(zf, &zst, prefetched); - if (fetched) { - ZFETCHSTAT_BUMP(zfetchstat_hits); - } else { - ZFETCHSTAT_BUMP(zfetchstat_misses); - fetched = dmu_zfetch_colinear(zf, &zst); - if (fetched) { - ZFETCHSTAT_BUMP(zfetchstat_colinear_hits); - } else { - ZFETCHSTAT_BUMP(zfetchstat_colinear_misses); + for (zs = list_head(&zf->zf_stream); zs != NULL; + zs = list_next(&zf->zf_stream, zs)) { + if (blkid == zs->zs_blkid) { + mutex_enter(&zs->zs_lock); + /* + * zs_blkid could have changed before we + * acquired zs_lock; re-check them here. + */ + if (blkid != zs->zs_blkid) { + mutex_exit(&zs->zs_lock); + continue; + } + break; } } - if (!fetched) { - newstream = dmu_zfetch_stream_reclaim(zf); - + if (zs == NULL) { /* - * we still couldn't find a stream, drop the lock, and allocate - * one if possible. Otherwise, give up and go home. + * This access is not part of any existing stream. Create + * a new stream for it. */ - if (newstream) { - ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes); - } else { - uint64_t maxblocks; - uint32_t max_streams; - uint32_t cur_streams; + ZFETCHSTAT_BUMP(zfetchstat_misses); + if (rw_tryupgrade(&zf->zf_rwlock)) + dmu_zfetch_stream_create(zf, blkid + nblks); + rw_exit(&zf->zf_rwlock); + return; + } - ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures); - cur_streams = zf->zf_stream_cnt; - maxblocks = zf->zf_dnode->dn_maxblkid; + /* + * This access was to a block that we issued a prefetch for on + * behalf of this stream. Issue further prefetches for this stream. + * + * Normally, we start prefetching where we stopped + * prefetching last (zs_pf_blkid). But when we get our first + * hit on this stream, zs_pf_blkid == zs_blkid, we don't + * want to prefetch to block we just accessed. In this case, + * start just after the block we just accessed. + */ + int64_t pf_start = MAX(zs->zs_pf_blkid, blkid + nblks); - max_streams = MIN(zfetch_max_streams, - (maxblocks / zfetch_block_cap)); - if (max_streams == 0) { - max_streams++; - } + /* + * Double our amount of prefetched data, but don't let the + * prefetch get further ahead than zfetch_max_distance. + */ + int pf_nblks = + MIN((int64_t)zs->zs_pf_blkid - zs->zs_blkid + nblks, + zs->zs_blkid + nblks + + (zfetch_max_distance >> zf->zf_dnode->dn_datablkshift) - pf_start); - if (cur_streams >= max_streams) { - return; - } - newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP); - } + zs->zs_pf_blkid = pf_start + pf_nblks; + zs->zs_atime = gethrtime(); + zs->zs_blkid = blkid + nblks; - newstream->zst_offset = zst.zst_offset; - newstream->zst_len = zst.zst_len; - newstream->zst_stride = zst.zst_len; - newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; - newstream->zst_cap = zst.zst_len; - newstream->zst_direction = ZFETCH_FORWARD; - newstream->zst_last = ddi_get_lbolt(); - - mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); - - rw_enter(&zf->zf_rwlock, RW_WRITER); - inserted = dmu_zfetch_stream_insert(zf, newstream); - rw_exit(&zf->zf_rwlock); - - if (!inserted) { - mutex_destroy(&newstream->zst_lock); - kmem_free(newstream, sizeof (zstream_t)); - } + /* + * dbuf_prefetch() issues the prefetch i/o + * asynchronously, but it may need to wait for an + * indirect block to be read from disk. Therefore + * we do not want to hold any locks while we call it. + */ + mutex_exit(&zs->zs_lock); + rw_exit(&zf->zf_rwlock); + for (int i = 0; i < pf_nblks; i++) { + dbuf_prefetch(zf->zf_dnode, 0, pf_start + i, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); } + ZFETCHSTAT_BUMP(zfetchstat_hits); } Index: projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c =================================================================== --- projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c (revision 287721) +++ projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c (revision 287722) @@ -1,2045 +1,2043 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include static kmem_cache_t *dnode_cache; /* * Define DNODE_STATS to turn on statistic gathering. By default, it is only * turned on when DEBUG is also defined. */ #ifdef DEBUG #define DNODE_STATS #endif /* DEBUG */ #ifdef DNODE_STATS #define DNODE_STAT_ADD(stat) ((stat)++) #else #define DNODE_STAT_ADD(stat) /* nothing */ #endif /* DNODE_STATS */ static dnode_phys_t dnode_phys_zero; int zfs_default_bs = SPA_MINBLOCKSHIFT; int zfs_default_ibs = DN_MAX_INDBLKSHIFT; #ifdef illumos static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); #endif static int dbuf_compare(const void *x1, const void *x2) { const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; if (d1->db_level < d2->db_level) { return (-1); } if (d1->db_level > d2->db_level) { return (1); } if (d1->db_blkid < d2->db_blkid) { return (-1); } if (d1->db_blkid > d2->db_blkid) { return (1); } if (d1->db_state == DB_SEARCH) { ASSERT3S(d2->db_state, !=, DB_SEARCH); return (-1); } else if (d2->db_state == DB_SEARCH) { ASSERT3S(d1->db_state, !=, DB_SEARCH); return (1); } if ((uintptr_t)d1 < (uintptr_t)d2) { return (-1); } if ((uintptr_t)d1 > (uintptr_t)d2) { return (1); } return (0); } /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) { dnode_t *dn = arg; int i; rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); /* * Every dbuf has a reference, and dropping a tracked reference is * O(number of references), so don't track dn_holds. */ refcount_create_untracked(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); list_link_init(&dn->dn_link); bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { list_link_init(&dn->dn_dirty_link[i]); dn->dn_free_ranges[i] = NULL; list_create(&dn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; dn->dn_dirtyctx = 0; dn->dn_dirtyctx_firstset = NULL; dn->dn_bonus = NULL; dn->dn_have_spill = B_FALSE; dn->dn_zio = NULL; dn->dn_oldused = 0; dn->dn_oldflags = 0; dn->dn_olduid = 0; dn->dn_oldgid = 0; dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_id_flags = 0; dn->dn_dbufs_count = 0; dn->dn_unlisted_l0_blkid = 0; avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); dn->dn_moved = 0; POINTER_INVALIDATE(&dn->dn_objset); return (0); } /* ARGSUSED */ static void dnode_dest(void *arg, void *unused) { int i; dnode_t *dn = arg; rw_destroy(&dn->dn_struct_rwlock); mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); cv_destroy(&dn->dn_notxholds); refcount_destroy(&dn->dn_holds); refcount_destroy(&dn->dn_tx_holds); ASSERT(!list_link_active(&dn->dn_link)); for (i = 0; i < TXG_SIZE; i++) { ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); list_destroy(&dn->dn_dirty_records[i]); ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); ASSERT0(dn->dn_next_indblkshift[i]); ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_blksz[i]); } ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); ASSERT0(dn->dn_dirtyctx); ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); ASSERT3P(dn->dn_bonus, ==, NULL); ASSERT(!dn->dn_have_spill); ASSERT3P(dn->dn_zio, ==, NULL); ASSERT0(dn->dn_oldused); ASSERT0(dn->dn_oldflags); ASSERT0(dn->dn_olduid); ASSERT0(dn->dn_oldgid); ASSERT0(dn->dn_newuid); ASSERT0(dn->dn_newgid); ASSERT0(dn->dn_id_flags); ASSERT0(dn->dn_dbufs_count); ASSERT0(dn->dn_unlisted_l0_blkid); avl_destroy(&dn->dn_dbufs); } void dnode_init(void) { ASSERT(dnode_cache == NULL); dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); kmem_cache_set_move(dnode_cache, dnode_move); } void dnode_fini(void) { kmem_cache_destroy(dnode_cache); dnode_cache = NULL; } #ifdef ZFS_DEBUG void dnode_verify(dnode_t *dn) { int drop_struct_lock = FALSE; ASSERT(dn->dn_phys); ASSERT(dn->dn_objset); ASSERT(dn->dn_handle->dnh_dnode == dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) return; if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { int i; ASSERT3U(dn->dn_indblkshift, >=, 0); ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); if (dn->dn_datablkshift) { ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); ASSERT3U(1<dn_datablkshift, ==, dn->dn_datablksz); } ASSERT3U(dn->dn_nlevels, <=, 30); ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); ASSERT3U(dn->dn_datablksz, ==, dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); for (i = 0; i < TXG_SIZE; i++) { ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); } } if (dn->dn_phys->dn_type != DMU_OT_NONE) ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, (dnode_phys_t *)dn->dn_dbuf->db.db_data + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } #endif void dnode_byteswap(dnode_phys_t *dnp) { uint64_t *buf64 = (void*)&dnp->dn_blkptr; int i; if (dnp->dn_type == DMU_OT_NONE) { bzero(dnp, sizeof (dnode_phys_t)); return; } dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); dnp->dn_used = BSWAP_64(dnp->dn_used); /* * dn_nblkptr is only one byte, so it's OK to read it in either * byte order. We can't read dn_bouslen. */ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) buf64[i] = BSWAP_64(buf64[i]); /* * OK to check dn_bonuslen for zero, because it won't matter if * we have the wrong byte order. This is necessary because the * dnode dnode is smaller than a regular dnode. */ if (dnp->dn_bonuslen != 0) { /* * Note that the bonus length calculated here may be * longer than the actual bonus buffer. This is because * we always put the bonus buffer after the last block * pointer (instead of packing it against the end of the * dnode buffer). */ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); size_t len = DN_MAX_BONUSLEN - off; ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype); dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); } /* Swap SPILL block if we have one */ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t)); } void dnode_buf_byteswap(void *vbuf, size_t size) { dnode_phys_t *buf = vbuf; int i; ASSERT3U(sizeof (dnode_phys_t), ==, (1<>= DNODE_SHIFT; for (i = 0; i < size; i++) { dnode_byteswap(buf); buf++; } } void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) { ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ASSERT3U(newsize, <=, DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t)); dn->dn_bonuslen = newsize; if (newsize == 0) dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; else dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; rw_exit(&dn->dn_struct_rwlock); } void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) { ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dn->dn_bonustype = newtype; dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; rw_exit(&dn->dn_struct_rwlock); } void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) { ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); dnode_setdirty(dn, tx); dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK; dn->dn_have_spill = B_FALSE; } static void dnode_setdblksz(dnode_t *dn, int size) { ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE)); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(size, >=, SPA_MINBLOCKSIZE); ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); dn->dn_datablksz = size; dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0; } static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { dnode_t *dn; dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); ASSERT(!POINTER_IS_VALID(dn->dn_objset)); dn->dn_moved = 0; /* * Defer setting dn_objset until the dnode is ready to be a candidate * for the dnode_move() callback. */ dn->dn_object = object; dn->dn_dbuf = db; dn->dn_handle = dnh; dn->dn_phys = dnp; if (dnp->dn_datablkszsec) { dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } else { dn->dn_datablksz = 0; dn->dn_datablkszsec = 0; dn->dn_datablkshift = 0; } dn->dn_indblkshift = dnp->dn_indblkshift; dn->dn_nlevels = dnp->dn_nlevels; dn->dn_type = dnp->dn_type; dn->dn_nblkptr = dnp->dn_nblkptr; dn->dn_checksum = dnp->dn_checksum; dn->dn_compress = dnp->dn_compress; dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; dn->dn_maxblkid = dnp->dn_maxblkid; dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); dn->dn_id_flags = 0; dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); if (dnh->dnh_dnode != NULL) { /* Lost the allocation race. */ mutex_exit(&os->os_lock); kmem_cache_free(dnode_cache, dn); return (dnh->dnh_dnode); } /* * Exclude special dnodes from os_dnodes so an empty os_dnodes * signifies that the special dnodes have no references from * their children (the entries in os_dnodes). This allows * dnode_destroy() to easily determine if the last child has * been removed and then complete eviction of the objset. */ if (!DMU_OBJECT_IS_SPECIAL(object)) list_insert_head(&os->os_dnodes, dn); membar_producer(); /* * Everything else must be valid before assigning dn_objset * makes the dnode eligible for dnode_move(). */ dn->dn_objset = os; dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); return (dn); } /* * Caller must be holding the dnode handle, which is released upon return. */ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; boolean_t complete_os_eviction = B_FALSE; ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); POINTER_INVALIDATE(&dn->dn_objset); if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { list_remove(&os->os_dnodes, dn); complete_os_eviction = list_is_empty(&os->os_dnodes) && list_link_active(&os->os_evicting_node); } mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ zrl_remove(&dn->dn_handle->dnh_zrlock); dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; dn->dn_dirtyctx = 0; if (dn->dn_dirtyctx_firstset != NULL) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; } if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_evict(dn->dn_bonus); dn->dn_bonus = NULL; } dn->dn_zio = NULL; dn->dn_have_spill = B_FALSE; dn->dn_oldused = 0; dn->dn_oldflags = 0; dn->dn_olduid = 0; dn->dn_oldgid = 0; dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_id_flags = 0; dn->dn_unlisted_l0_blkid = 0; - dmu_zfetch_rele(&dn->dn_zfetch); + dmu_zfetch_fini(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); if (complete_os_eviction) dmu_objset_evict_done(os); } void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { int i; ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (blocksize == 0) blocksize = 1 << zfs_default_bs; else blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); if (ibs == 0) ibs = zfs_default_ibs; ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_assigned_txg); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); ASSERT(avl_is_empty(&dn->dn_dbufs)); for (i = 0; i < TXG_SIZE; i++) { ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); ASSERT0(dn->dn_next_indblkshift[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_blksz[i]); ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); } dn->dn_type = ot; dnode_setdblksz(dn, blocksize); dn->dn_indblkshift = ibs; dn->dn_nlevels = 1; if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ dn->dn_nblkptr = 1; else dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; if (dn->dn_dirtyctx_firstset) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; } dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { int nblkptr; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); ASSERT0(blocksize % SPA_MINBLOCKSIZE); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ASSERT(tx->tx_txg != 0); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0) || (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); dn->dn_id_flags = 0; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_setdirty(dn, tx); if (dn->dn_datablksz != blocksize) { /* change blocksize */ ASSERT(dn->dn_maxblkid == 0 && (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || dnode_block_freed(dn, 0))); dnode_setdblksz(dn, blocksize); dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; } if (dn->dn_bonuslen != bonuslen) dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ nblkptr = 1; else nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); if (dn->dn_bonustype != bonustype) dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { dbuf_rm_spill(dn, tx); dnode_rm_spill(dn, tx); } rw_exit(&dn->dn_struct_rwlock); /* change type */ dn->dn_type = ot; /* change bonus size and type */ mutex_enter(&dn->dn_mtx); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_nblkptr = nblkptr; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); /* fix up the bonus db_size */ if (dn->dn_bonus) { dn->dn_bonus->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); } dn->dn_allocated_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); } #ifdef DNODE_STATS static struct { uint64_t dms_dnode_invalid; uint64_t dms_dnode_recheck1; uint64_t dms_dnode_recheck2; uint64_t dms_dnode_special; uint64_t dms_dnode_handle; uint64_t dms_dnode_rwlock; uint64_t dms_dnode_active; } dnode_move_stats; #endif /* DNODE_STATS */ static void dnode_move_impl(dnode_t *odn, dnode_t *ndn) { int i; ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); /* Copy fields. */ ndn->dn_objset = odn->dn_objset; ndn->dn_object = odn->dn_object; ndn->dn_dbuf = odn->dn_dbuf; ndn->dn_handle = odn->dn_handle; ndn->dn_phys = odn->dn_phys; ndn->dn_type = odn->dn_type; ndn->dn_bonuslen = odn->dn_bonuslen; ndn->dn_bonustype = odn->dn_bonustype; ndn->dn_nblkptr = odn->dn_nblkptr; ndn->dn_checksum = odn->dn_checksum; ndn->dn_compress = odn->dn_compress; ndn->dn_nlevels = odn->dn_nlevels; ndn->dn_indblkshift = odn->dn_indblkshift; ndn->dn_datablkshift = odn->dn_datablkshift; ndn->dn_datablkszsec = odn->dn_datablkszsec; ndn->dn_datablksz = odn->dn_datablksz; ndn->dn_maxblkid = odn->dn_maxblkid; bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], sizeof (odn->dn_next_nblkptr)); bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], sizeof (odn->dn_next_nlevels)); bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], sizeof (odn->dn_next_indblkshift)); bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], sizeof (odn->dn_next_bonustype)); bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], sizeof (odn->dn_rm_spillblk)); bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], sizeof (odn->dn_next_bonuslen)); bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], sizeof (odn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { list_move_tail(&ndn->dn_dirty_records[i], &odn->dn_dirty_records[i]); } bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0], sizeof (odn->dn_free_ranges)); ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; ndn->dn_dirtyctx = odn->dn_dirtyctx; ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ASSERT(refcount_count(&odn->dn_tx_holds) == 0); refcount_transfer(&ndn->dn_holds, &odn->dn_holds); ASSERT(avl_is_empty(&ndn->dn_dbufs)); avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); ndn->dn_dbufs_count = odn->dn_dbufs_count; ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid; ndn->dn_bonus = odn->dn_bonus; ndn->dn_have_spill = odn->dn_have_spill; ndn->dn_zio = odn->dn_zio; ndn->dn_oldused = odn->dn_oldused; ndn->dn_oldflags = odn->dn_oldflags; ndn->dn_olduid = odn->dn_olduid; ndn->dn_oldgid = odn->dn_oldgid; ndn->dn_newuid = odn->dn_newuid; ndn->dn_newgid = odn->dn_newgid; ndn->dn_id_flags = odn->dn_id_flags; dmu_zfetch_init(&ndn->dn_zfetch, NULL); list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; - ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt; - ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail; /* * Update back pointers. Updating the handle fixes the back pointer of * every descendant dbuf as well as the bonus dbuf. */ ASSERT(ndn->dn_handle->dnh_dnode == odn); ndn->dn_handle->dnh_dnode = ndn; if (ndn->dn_zfetch.zf_dnode == odn) { ndn->dn_zfetch.zf_dnode = ndn; } /* * Invalidate the original dnode by clearing all of its back pointers. */ odn->dn_dbuf = NULL; odn->dn_handle = NULL; avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); odn->dn_dbufs_count = 0; odn->dn_unlisted_l0_blkid = 0; odn->dn_bonus = NULL; odn->dn_zfetch.zf_dnode = NULL; /* * Set the low bit of the objset pointer to ensure that dnode_move() * recognizes the dnode as invalid in any subsequent callback. */ POINTER_INVALIDATE(&odn->dn_objset); /* * Satisfy the destructor. */ for (i = 0; i < TXG_SIZE; i++) { list_create(&odn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); odn->dn_free_ranges[i] = NULL; odn->dn_next_nlevels[i] = 0; odn->dn_next_indblkshift[i] = 0; odn->dn_next_bonustype[i] = 0; odn->dn_rm_spillblk[i] = 0; odn->dn_next_bonuslen[i] = 0; odn->dn_next_blksz[i] = 0; } odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; odn->dn_dirtyctx = 0; odn->dn_dirtyctx_firstset = NULL; odn->dn_have_spill = B_FALSE; odn->dn_zio = NULL; odn->dn_oldused = 0; odn->dn_oldflags = 0; odn->dn_olduid = 0; odn->dn_oldgid = 0; odn->dn_newuid = 0; odn->dn_newgid = 0; odn->dn_id_flags = 0; /* * Mark the dnode. */ ndn->dn_moved = 1; odn->dn_moved = (uint8_t)-1; } #ifdef illumos #ifdef _KERNEL /*ARGSUSED*/ static kmem_cbrc_t dnode_move(void *buf, void *newbuf, size_t size, void *arg) { dnode_t *odn = buf, *ndn = newbuf; objset_t *os; int64_t refcount; uint32_t dbufs; /* * The dnode is on the objset's list of known dnodes if the objset * pointer is valid. We set the low bit of the objset pointer when * freeing the dnode to invalidate it, and the memory patterns written * by kmem (baddcafe and deadbeef) set at least one of the two low bits. * A newly created dnode sets the objset pointer last of all to indicate * that the dnode is known and in a valid state to be moved by this * function. */ os = odn->dn_objset; if (!POINTER_IS_VALID(os)) { DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); return (KMEM_CBRC_DONT_KNOW); } /* * Ensure that the objset does not go away during the move. */ rw_enter(&os_lock, RW_WRITER); if (os != odn->dn_objset) { rw_exit(&os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); return (KMEM_CBRC_DONT_KNOW); } /* * If the dnode is still valid, then so is the objset. We know that no * valid objset can be freed while we hold os_lock, so we can safely * ensure that the objset remains in use. */ mutex_enter(&os->os_lock); /* * Recheck the objset pointer in case the dnode was removed just before * acquiring the lock. */ if (os != odn->dn_objset) { mutex_exit(&os->os_lock); rw_exit(&os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); return (KMEM_CBRC_DONT_KNOW); } /* * At this point we know that as long as we hold os->os_lock, the dnode * cannot be freed and fields within the dnode can be safely accessed. * The objset listing this dnode cannot go away as long as this dnode is * on its list. */ rw_exit(&os_lock); if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { mutex_exit(&os->os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); return (KMEM_CBRC_NO); } ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ /* * Lock the dnode handle to prevent the dnode from obtaining any new * holds. This also prevents the descendant dbufs and the bonus dbuf * from accessing the dnode, so that we can discount their holds. The * handle is safe to access because we know that while the dnode cannot * go away, neither can its handle. Once we hold dnh_zrlock, we can * safely move any dnode referenced only by dbufs. */ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { mutex_exit(&os->os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); return (KMEM_CBRC_LATER); } /* * Ensure a consistent view of the dnode's holds and the dnode's dbufs. * We need to guarantee that there is a hold for every dbuf in order to * determine whether the dnode is actively referenced. Falsely matching * a dbuf to an active hold would lead to an unsafe move. It's possible * that a thread already having an active dnode hold is about to add a * dbuf, and we can't compare hold and dbuf counts while the add is in * progress. */ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); return (KMEM_CBRC_LATER); } /* * A dbuf may be removed (evicted) without an active dnode hold. In that * case, the dbuf count is decremented under the handle lock before the * dbuf's hold is released. This order ensures that if we count the hold * after the dbuf is removed but before its hold is released, we will * treat the unmatched hold as active and exit safely. If we count the * hold before the dbuf is removed, the hold is discounted, and the * removal is blocked until the move completes. */ refcount = refcount_count(&odn->dn_holds); ASSERT(refcount >= 0); dbufs = odn->dn_dbufs_count; /* We can't have more dbufs than dnode holds. */ ASSERT3U(dbufs, <=, refcount); DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, uint32_t, dbufs); if (refcount > dbufs) { rw_exit(&odn->dn_struct_rwlock); zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); return (KMEM_CBRC_LATER); } rw_exit(&odn->dn_struct_rwlock); /* * At this point we know that anyone with a hold on the dnode is not * actively referencing it. The dnode is known and in a valid state to * move. We're holding the locks needed to execute the critical section. */ dnode_move_impl(odn, ndn); list_link_replace(&odn->dn_link, &ndn->dn_link); /* If the dnode was safe to move, the refcount cannot have changed. */ ASSERT(refcount == refcount_count(&ndn->dn_holds)); ASSERT(dbufs == ndn->dn_dbufs_count); zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ mutex_exit(&os->os_lock); return (KMEM_CBRC_YES); } #endif /* _KERNEL */ #endif /* illumos */ void dnode_special_close(dnode_handle_t *dnh) { dnode_t *dn = dnh->dnh_dnode; /* * Wait for final references to the dnode to clear. This can * only happen if the arc is asyncronously evicting state that * has a hold on this dnode while we are trying to evict this * dnode. */ while (refcount_count(&dn->dn_holds) > 0) delay(1); ASSERT(dn->dn_dbuf == NULL || dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } void dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh) { dnode_t *dn; dn = dnode_create(os, dnp, NULL, object, dnh); zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); } static void dnode_buf_pageout(void *dbu) { dnode_children_t *children_dnodes = dbu; int i; for (i = 0; i < children_dnodes->dnc_count; i++) { dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; dnode_t *dn; /* * The dnode handle lock guards against the dnode moving to * another valid address, so there is no need here to guard * against changes to or from NULL. */ if (dnh->dnh_dnode == NULL) { zrl_destroy(&dnh->dnh_zrlock); continue; } zrl_add(&dnh->dnh_zrlock); dn = dnh->dnh_dnode; /* * If there are holds on this dnode, then there should * be holds on the dnode's containing dbuf as well; thus * it wouldn't be eligible for eviction and this function * would not have been called. */ ASSERT(refcount_is_zero(&dn->dn_holds)); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } kmem_free(children_dnodes, sizeof (dnode_children_t) + children_dnodes->dnc_count * sizeof (dnode_handle_t)); } /* * errors: * EINVAL - invalid object number. * EIO - i/o error. * succeeds even for free dnodes. */ int dnode_hold_impl(objset_t *os, uint64_t object, int flag, void *tag, dnode_t **dnp) { int epb, idx, err; int drop_struct_lock = FALSE; int type; uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; dnode_children_t *children_dnodes; dnode_handle_t *dnh; /* * If you are holding the spa config lock as writer, you shouldn't * be asking the DMU to do *anything* unless it's the root pool * which may require us to read from the root filesystem while * holding some (not all) of the locks as writer. */ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || (spa_is_root(os->os_spa) && spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { dn = (object == DMU_USERUSED_OBJECT) ? DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); if (dn == NULL) return (SET_ERROR(ENOENT)); type = dn->dn_type; if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) return (SET_ERROR(ENOENT)); if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) return (SET_ERROR(EEXIST)); DNODE_VERIFY(dn); (void) refcount_add(&dn->dn_holds, tag); *dnp = dn; return (0); } if (object == 0 || object >= DN_MAX_OBJECT) return (SET_ERROR(EINVAL)); mdn = DMU_META_DNODE(os); ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); DNODE_VERIFY(mdn); if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { rw_enter(&mdn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); err = dbuf_read(db, NULL, DB_RF_CANFAIL); if (err) { dbuf_rele(db, FTAG); return (err); } ASSERT3U(db->db.db_size, >=, 1<db.db_size >> DNODE_SHIFT; idx = object & (epb-1); ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); children_dnodes = dmu_buf_get_user(&db->db); if (children_dnodes == NULL) { int i; dnode_children_t *winner; children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); children_dnodes->dnc_count = epb; dnh = &children_dnodes->dnc_children[0]; for (i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); } dmu_buf_init_user(&children_dnodes->dnc_dbu, dnode_buf_pageout, NULL); winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); if (winner != NULL) { for (i = 0; i < epb; i++) { zrl_destroy(&dnh[i].dnh_zrlock); } kmem_free(children_dnodes, sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t)); children_dnodes = winner; } } ASSERT(children_dnodes->dnc_count == epb); dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); dn = dnh->dnh_dnode; if (dn == NULL) { dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; dn = dnode_create(os, phys, db, object, dnh); } mutex_enter(&dn->dn_mtx); type = dn->dn_type; if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || ((flag & DNODE_MUST_BE_FREE) && (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { mutex_exit(&dn->dn_mtx); zrl_remove(&dnh->dnh_zrlock); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); mutex_exit(&dn->dn_mtx); /* Now we can rely on the hold to prevent the dnode from moving. */ zrl_remove(&dnh->dnh_zrlock); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); ASSERT3U(dn->dn_object, ==, object); dbuf_rele(db, FTAG); *dnp = dn; return (0); } /* * Return held dnode if the object is allocated, NULL if not. */ int dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); } /* * Can only add a reference if there is already at least one * reference on the dnode. Returns FALSE if unable to add a * new reference. */ boolean_t dnode_add_ref(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); if (refcount_is_zero(&dn->dn_holds)) { mutex_exit(&dn->dn_mtx); return (FALSE); } VERIFY(1 < refcount_add(&dn->dn_holds, tag)); mutex_exit(&dn->dn_mtx); return (TRUE); } void dnode_rele(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, tag); } void dnode_rele_and_unlock(dnode_t *dn, void *tag) { uint64_t refs; /* Get while the hold prevents the dnode from moving. */ dmu_buf_impl_t *db = dn->dn_dbuf; dnode_handle_t *dnh = dn->dn_handle; refs = refcount_remove(&dn->dn_holds, tag); mutex_exit(&dn->dn_mtx); /* * It's unsafe to release the last hold on a dnode by dnode_rele() or * indirectly by dbuf_rele() while relying on the dnode handle to * prevent the dnode from moving, since releasing the last hold could * result in the dnode's parent dbuf evicting its dnode handles. For * that reason anyone calling dnode_rele() or dbuf_rele() without some * other direct or indirect hold on the dnode must first drop the dnode * handle. */ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ if (refs == 0 && db != NULL) { /* * Another thread could add a hold to the dnode handle in * dnode_hold_impl() while holding the parent dbuf. Since the * hold on the parent dbuf prevents the handle from being * destroyed, the hold on the handle is OK. We can't yet assert * that the handle has zero references, but that will be * asserted anyway when the handle gets destroyed. */ dbuf_rele(db, dnh); } } void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { dsl_dataset_dirty(os->os_dsl_dataset, tx); return; } DNODE_VERIFY(dn); #ifdef ZFS_DEBUG mutex_enter(&dn->dn_mtx); ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); mutex_exit(&dn->dn_mtx); #endif /* * Determine old uid/gid when necessary */ dmu_objset_userquota_get_ids(dn, B_TRUE, tx); mutex_enter(&os->os_lock); /* * If we are already marked dirty, we're done. */ if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { mutex_exit(&os->os_lock); return; } ASSERT(!refcount_is_zero(&dn->dn_holds) || !avl_is_empty(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]); ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]); ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", dn->dn_object, txg); if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) { list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn); } else { list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn); } mutex_exit(&os->os_lock); /* * The dnode maintains a hold on its containing dbuf as * long as there are holds on it. Each instantiated child * dbuf maintains a hold on the dnode. When the last child * drops its hold, the dnode will drop its hold on the * containing dbuf. We add a "dirty hold" here so that the * dnode will hang around after we finish processing its * children. */ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); (void) dbuf_dirty(dn->dn_dbuf, tx); dsl_dataset_dirty(os->os_dsl_dataset, tx); } void dnode_free(dnode_t *dn, dmu_tx_t *tx) { int txgoff = tx->tx_txg & TXG_MASK; dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg); /* we should be the only holder... hopefully */ /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */ mutex_enter(&dn->dn_mtx); if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { mutex_exit(&dn->dn_mtx); return; } dn->dn_free_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); /* * If the dnode is already dirty, it needs to be moved from * the dirty list to the free list. */ mutex_enter(&dn->dn_objset->os_lock); if (list_link_active(&dn->dn_dirty_link[txgoff])) { list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn); list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn); mutex_exit(&dn->dn_objset->os_lock); } else { mutex_exit(&dn->dn_objset->os_lock); dnode_setdirty(dn, tx); } } /* * Try to change the block size for the indicated dnode. This can only * succeed if there are no blocks allocated or dirty beyond first block */ int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { dmu_buf_impl_t *db; int err; ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (size == 0) size = SPA_MINBLOCKSIZE; else size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); if (ibs == dn->dn_indblkshift) ibs = 0; if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) return (0); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* Check for any allocated blocks beyond the first */ if (dn->dn_maxblkid != 0) goto fail; mutex_enter(&dn->dn_dbufs_mtx); for (db = avl_first(&dn->dn_dbufs); db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } } mutex_exit(&dn->dn_dbufs_mtx); if (ibs && dn->dn_nlevels != 1) goto fail; /* resize the old block */ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); if (err == 0) dbuf_new_size(db, size, tx); else if (err != ENOENT) goto fail; dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; if (ibs) { dn->dn_indblkshift = ibs; dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; } /* rele after we have fixed the blocksize in the dnode */ if (db) dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); return (0); fail: rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(ENOTSUP)); } /* read-holding callers must not rely on the lock being continuously held */ void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) { uint64_t txgoff = tx->tx_txg & TXG_MASK; int epbs, new_nlevels; uint64_t sz; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(have_read ? RW_READ_HELD(&dn->dn_struct_rwlock) : RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * if we have a read-lock, check to see if we need to do any work * before upgrading to a write-lock. */ if (have_read) { if (blkid <= dn->dn_maxblkid) return; if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); } } if (blkid <= dn->dn_maxblkid) goto out; dn->dn_maxblkid = blkid; /* * Compute the number of levels necessary to support the new maxblkid. */ new_nlevels = 1; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (sz = dn->dn_nblkptr; sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) new_nlevels++; if (new_nlevels > dn->dn_nlevels) { int old_nlevels = dn->dn_nlevels; dmu_buf_impl_t *db; list_t *list; dbuf_dirty_record_t *new, *dr, *dr_next; dn->dn_nlevels = new_nlevels; ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); dn->dn_next_nlevels[txgoff] = new_nlevels; /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); ASSERT(db != NULL); new = dbuf_dirty(db, tx); dbuf_rele(db, FTAG); /* transfer the dirty records to the new indirect */ mutex_enter(&dn->dn_mtx); mutex_enter(&new->dt.di.dr_mtx); list = &dn->dn_dirty_records[txgoff]; for (dr = list_head(list); dr; dr = dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); if (dr->dr_dbuf->db_level != new_nlevels-1 && dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); dr->dr_parent = new; } } mutex_exit(&new->dt.di.dr_mtx); mutex_exit(&dn->dn_mtx); } out: if (have_read) rw_downgrade(&dn->dn_struct_rwlock); } static void dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); if (db != NULL) { dmu_buf_will_dirty(&db->db, tx); dbuf_rele(db, FTAG); } } void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { dmu_buf_impl_t *db; uint64_t blkoff, blkid, nblks; int blksz, blkshift, head, tail; int trunc = FALSE; int epbs; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); blksz = dn->dn_datablksz; blkshift = dn->dn_datablkshift; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; if (len == DMU_OBJECT_END) { len = UINT64_MAX - off; trunc = TRUE; } /* * First, block align the region to free: */ if (ISP2(blksz)) { head = P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); if ((off >> blkshift) > dn->dn_maxblkid) goto out; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { /* * Freeing the whole block; fast-track this request. * Note that we won't dirty any indirect blocks, * which is fine because we will be freeing the entire * file and thus all indirect blocks will be freed * by free_children(). */ blkid = 0; nblks = 1; goto done; } else if (off >= blksz) { /* Freeing past end-of-data */ goto out; } else { /* Freeing part of the block. */ head = blksz - off; ASSERT3U(head, >, 0); } blkoff = off; } /* zero out any partial block data at the start of the range */ if (head) { ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), TRUE, FALSE, FTAG, &db) == 0) { caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); data = db->db.db_data; bzero(data + blkoff, head); } dbuf_rele(db, FTAG); } off += head; len -= head; } /* If the range was less than one block, we're done */ if (len == 0) goto out; /* If the remaining range is past end of file, we're done */ if ((off >> blkshift) > dn->dn_maxblkid) goto out; ASSERT(ISP2(blksz)); if (trunc) tail = 0; else tail = P2PHASE(len, blksz); ASSERT0(P2PHASE(off, blksz)); /* zero out any partial block data at the end of the range */ if (tail) { if (len < tail) tail = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), TRUE, FALSE, FTAG, &db) == 0) { /* don't dirty if not on disk and not dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, tail); } dbuf_rele(db, FTAG); } len -= tail; } /* If the range did not include a full block, we are done */ if (len == 0) goto out; ASSERT(IS_P2ALIGNED(off, blksz)); ASSERT(trunc || IS_P2ALIGNED(len, blksz)); blkid = off >> blkshift; nblks = len >> blkshift; if (trunc) nblks += 1; /* * Dirty all the indirect blocks in this range. Note that only * the first and last indirect blocks can actually be written * (if they were partially freed) -- they must be dirtied, even if * they do not exist on disk yet. The interior blocks will * be freed by free_children(), so they will not actually be written. * Even though these interior blocks will not be written, we * dirty them for two reasons: * * - It ensures that the indirect blocks remain in memory until * syncing context. (They have already been prefetched by * dmu_tx_hold_free(), so we don't have to worry about reading * them serially here.) * * - The dirty space accounting will put pressure on the txg sync * mechanism to begin syncing, and to delay transactions if there * is a large amount of freeing. Even though these indirect * blocks will not be written, we could need to write the same * amount of space if we copy the freed BPs into deadlists. */ if (dn->dn_nlevels > 1) { uint64_t first, last; first = blkid >> epbs; dnode_dirty_l1(dn, first, tx); if (trunc) last = dn->dn_maxblkid >> epbs; else last = (blkid + nblks - 1) >> epbs; if (last != first) dnode_dirty_l1(dn, last, tx); int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = first + 1; i < last; i++) { /* * Set i to the blockid of the next non-hole * level-1 indirect block at or after i. Note * that dnode_next_offset() operates in terms of * level-0-equivalent bytes. */ uint64_t ibyte = i << shift; int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); i = ibyte >> shift; if (i >= last) break; /* * Normally we should not see an error, either * from dnode_next_offset() or dbuf_hold_level() * (except for ESRCH from dnode_next_offset). * If there is an i/o error, then when we read * this block in syncing context, it will use * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according * to the "failmode" property. dnode_next_offset() * doesn't have a flag to indicate MUSTSUCCEED. */ if (err != 0) break; dnode_dirty_l1(dn, i, tx); } } done: /* * Add this range to the dnode range list. * We will finish up this free operation in the syncing phase. */ mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL, &dn->dn_mtx); } range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", blkid, nblks, tx->tx_txg); mutex_exit(&dn->dn_mtx); dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); out: rw_exit(&dn->dn_struct_rwlock); } static boolean_t dnode_spill_freed(dnode_t *dn) { int i; mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) break; } mutex_exit(&dn->dn_mtx); return (i < TXG_SIZE); } /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid) { void *dp = spa_get_dsl(dn->dn_objset->os_spa); int i; if (blkid == DMU_BONUS_BLKID) return (FALSE); /* * If we're in the process of opening the pool, dp will not be * set yet, but there shouldn't be anything dirty. */ if (dp == NULL) return (FALSE); if (dn->dn_free_txg) return (TRUE); if (blkid == DMU_SPILL_BLKID) return (dnode_spill_freed(dn)); mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { if (dn->dn_free_ranges[i] != NULL && range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) break; } mutex_exit(&dn->dn_mtx); return (i < TXG_SIZE); } /* call from syncing context when we actually write/free space for this dnode */ void dnode_diduse_space(dnode_t *dn, int64_t delta) { uint64_t space; dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", dn, dn->dn_phys, (u_longlong_t)dn->dn_phys->dn_used, (longlong_t)delta); mutex_enter(&dn->dn_mtx); space = DN_USED_BYTES(dn->dn_phys); if (delta > 0) { ASSERT3U(space + delta, >=, space); /* no overflow */ } else { ASSERT3U(space, >=, -delta); /* no underflow */ } space += delta; if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); ASSERT0(P2PHASE(space, 1<dn_phys->dn_used = space >> DEV_BSHIFT; } else { dn->dn_phys->dn_used = space; dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; } mutex_exit(&dn->dn_mtx); } /* * Call when we think we're going to write/free space in open context to track * the amount of memory in use by the currently open txg. */ void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; dsl_dataset_t *ds = os->os_dsl_dataset; int64_t aspace = spa_get_asize(os->os_spa, space); if (ds != NULL) { dsl_dir_willuse_space(ds->ds_dir, aspace, tx); dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } dmu_tx_willuse_space(tx, aspace); } /* * Scans a block at the indicated "level" looking for a hole or data, * depending on 'flags'. * * If level > 0, then we are scanning an indirect block looking at its * pointers. If level == 0, then we are looking at a block of dnodes. * * If we don't find what we are looking for in the block, we return ESRCH. * Otherwise, return with *offset pointing to the beginning (if searching * forwards) or end (if searching backwards) of the range covered by the * block pointer we matched on (or dnode). * * The basic search algorithm used below by dnode_next_offset() is to * use this function to search up the block tree (widen the search) until * we find something (i.e., we don't return ESRCH) and then search back * down the tree (narrow the search) until we reach our original search * level. */ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; void *data = NULL; uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t epb = 1ULL << epbs; uint64_t minfill, maxfill; boolean_t hole; int i, inc, error, span; dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; ASSERT(txg == 0 || !hole); if (lvl == dn->dn_phys->dn_nlevels) { error = 0; epb = dn->dn_phys->dn_nblkptr; data = dn->dn_phys->dn_blkptr; } else { uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); if (error) { if (error != ENOENT) return (error); if (hole) return (0); /* * This can only happen when we are searching up * the block tree for data. We don't really need to * adjust the offset, as we will just end up looking * at the pointer to this block in its parent, and its * going to be unallocated, so we will skip over it. */ return (SET_ERROR(ESRCH)); } error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); if (error) { dbuf_rele(db, FTAG); return (error); } data = db->db.db_data; } if (db != NULL && txg != 0 && (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg || BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree * and these conditions mean that we need to keep climbing. */ error = SET_ERROR(ESRCH); } else if (lvl == 0) { dnode_phys_t *dnp = data; span = DNODE_SHIFT; ASSERT(dn->dn_type == DMU_OT_DNODE); for (i = (*offset >> span) & (blkfill - 1); i >= 0 && i < blkfill; i += inc) { if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; *offset += (1ULL << span) * inc; } if (i < 0 || i == blkfill) error = SET_ERROR(ESRCH); } else { blkptr_t *bp = data; uint64_t start = *offset; span = (lvl - 1) * epbs + dn->dn_datablkshift; minfill = 0; maxfill = blkfill << ((lvl - 1) * epbs); if (hole) maxfill--; else minfill++; *offset = *offset >> span; for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { if (BP_GET_FILL(&bp[i]) >= minfill && BP_GET_FILL(&bp[i]) <= maxfill && (hole || bp[i].blk_birth > txg)) break; if (inc > 0 || *offset > 0) *offset += inc; } *offset = *offset << span; if (inc < 0) { /* traversing backwards; position offset at the end */ ASSERT3U(*offset, <=, start); *offset = MIN(*offset + (1ULL << span) - 1, start); } else if (*offset < start) { *offset = start; } if (i < 0 || i >= epb) error = SET_ERROR(ESRCH); } if (db) dbuf_rele(db, FTAG); return (error); } /* * Find the next hole, data, or sparse region at or after *offset. * The value 'blkfill' tells us how many items we expect to find * in an L0 data block; this value is 1 for normal objects, * DNODES_PER_BLOCK for the meta dnode, and some fraction of * DNODES_PER_BLOCK when searching for sparse regions thereof. * * Examples: * * dnode_next_offset(dn, flags, offset, 1, 1, 0); * Finds the next/previous hole/data in a file. * Used in dmu_offset_next(). * * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); * Finds the next free/allocated dnode an objset's meta-dnode. * Only finds objects that have new contents since txg (ie. * bonus buffer changes and content removal are ignored). * Used in dmu_object_next(). * * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); * Finds the next L2 meta-dnode bp that's at most 1/4 full. * Used in dmu_object_alloc(). */ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { uint64_t initial_offset = *offset; int lvl, maxlvl; int error = 0; if (!(flags & DNODE_FIND_HAVELOCK)) rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_phys->dn_nlevels == 0) { error = SET_ERROR(ESRCH); goto out; } if (dn->dn_datablkshift == 0) { if (*offset < dn->dn_datablksz) { if (flags & DNODE_FIND_HOLE) *offset = dn->dn_datablksz; } else { error = SET_ERROR(ESRCH); } goto out; } maxlvl = dn->dn_phys->dn_nlevels; for (lvl = minlvl; lvl <= maxlvl; lvl++) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); if (error != ESRCH) break; } while (error == 0 && --lvl >= minlvl) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); } /* * There's always a "virtual hole" at the end of the object, even * if all BP's which physically exist are non-holes. */ if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 && minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) { error = 0; } if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? initial_offset < *offset : initial_offset > *offset)) error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); return (error); } Index: projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h =================================================================== --- projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h (revision 287721) +++ projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h (revision 287722) @@ -1,193 +1,182 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ #ifndef _SYS_ARC_H #define _SYS_ARC_H #include #ifdef __cplusplus extern "C" { #endif #include #include #include /* * Used by arc_flush() to inform arc_evict_state() that it should evict * all available buffers from the arc state being passed in. */ #define ARC_EVICT_ALL -1ULL typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); typedef int arc_evict_func_t(void *priv); /* generic arc_done_func_t's which you can use */ arc_done_func_t arc_bcopy_func; arc_done_func_t arc_getbuf_func; typedef enum arc_flags { /* * Public flags that can be passed into the ARC by external consumers. */ ARC_FLAG_NONE = 1 << 0, /* No flags set */ ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */ ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */ ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */ ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */ ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */ ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */ + ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 7, /* I/O from zfetch */ /* * Private ARC flags. These flags are private ARC only flags that * will show up in b_flags in the arc_hdr_buf_t. These flags should * only be set by ARC code. */ - ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ - ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ - ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ - ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */ - ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */ - ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */ - ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */ + ARC_FLAG_IN_HASH_TABLE = 1 << 8, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 9, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 10, /* I/O failed for buf */ + ARC_FLAG_FREED_IN_READ = 1 << 11, /* freed during read */ + ARC_FLAG_BUF_AVAILABLE = 1 << 12, /* block not in use */ + ARC_FLAG_INDIRECT = 1 << 13, /* indirect block */ + /* Indicates that block was read with ASYNC priority. */ + ARC_FLAG_PRIO_ASYNC_READ = 1 << 14, + ARC_FLAG_L2_WRITING = 1 << 15, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 16, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 17, /* head of write list */ /* indicates that the buffer contains metadata (otherwise, data) */ - ARC_FLAG_BUFC_METADATA = 1 << 16, + ARC_FLAG_BUFC_METADATA = 1 << 18, /* Flags specifying whether optional hdr struct fields are defined */ - ARC_FLAG_HAS_L1HDR = 1 << 17, - ARC_FLAG_HAS_L2HDR = 1 << 18, - - /* - * The arc buffer's compression mode is stored in the top 7 bits of the - * flags field, so these dummy flags are included so that MDB can - * interpret the enum properly. - */ - ARC_FLAG_COMPRESS_0 = 1 << 24, - ARC_FLAG_COMPRESS_1 = 1 << 25, - ARC_FLAG_COMPRESS_2 = 1 << 26, - ARC_FLAG_COMPRESS_3 = 1 << 27, - ARC_FLAG_COMPRESS_4 = 1 << 28, - ARC_FLAG_COMPRESS_5 = 1 << 29, - ARC_FLAG_COMPRESS_6 = 1 << 30 - + ARC_FLAG_HAS_L1HDR = 1 << 19, + ARC_FLAG_HAS_L2HDR = 1 << 20, } arc_flags_t; struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; kmutex_t b_evict_lock; void *b_data; arc_evict_func_t *b_efunc; void *b_private; }; typedef enum arc_buf_contents { ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES } arc_buf_contents_t; /* * The following breakdows of arc_size exist for kstat only. */ typedef enum arc_space_type { ARC_SPACE_DATA, ARC_SPACE_META, ARC_SPACE_HDRS, ARC_SPACE_L2HDRS, ARC_SPACE_OTHER, ARC_SPACE_NUMTYPES } arc_space_type_t; void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type); arc_buf_t *arc_loan_buf(spa_t *spa, int size); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag); boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *priv, zio_priority_t priority, int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, arc_done_func_t *done, void *priv, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb); void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv); boolean_t arc_clear_callback(arc_buf_t *buf); void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); void arc_init(void); void arc_fini(void); /* * Level 2 ARC */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); void l2arc_init(void); void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); #ifdef illumos #ifndef _KERNEL extern boolean_t arc_watch; extern int arc_procfd; #endif #endif /* illumos */ #ifdef __cplusplus } #endif #endif /* _SYS_ARC_H */ Index: projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h =================================================================== --- projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h (revision 287721) +++ projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h (revision 287722) @@ -1,949 +1,950 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2013 DEY Storage Systems, Inc. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_H #define _SYS_DMU_H /* * This file describes the interface that the DMU provides for its * consumers. * * The DMU also interacts with the SPA. That interface is described in * dmu_spa.h. */ #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct uio; struct xuio; struct page; struct vnode; struct spa; struct zilog; struct zio; struct blkptr; struct zap_cursor; struct dsl_dataset; struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; struct zbookmark_phys; struct spa; struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; struct file; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; typedef enum dmu_object_byteswap { DMU_BSWAP_UINT8, DMU_BSWAP_UINT16, DMU_BSWAP_UINT32, DMU_BSWAP_UINT64, DMU_BSWAP_ZAP, DMU_BSWAP_DNODE, DMU_BSWAP_OBJSET, DMU_BSWAP_ZNODE, DMU_BSWAP_OLDACL, DMU_BSWAP_ACL, /* * Allocating a new byteswap type number makes the on-disk format * incompatible with any other format that uses the same number. * * Data can usually be structured to work with one of the * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. */ DMU_BSWAP_NUMFUNCS } dmu_object_byteswap_t; #define DMU_OT_NEWTYPE 0x80 #define DMU_OT_METADATA 0x40 #define DMU_OT_BYTESWAP_MASK 0x3f /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data * (dmu_object_byteswap_t). */ #define DMU_OT(byteswap, metadata) \ (DMU_OT_NEWTYPE | \ ((metadata) ? DMU_OT_METADATA : 0) | \ ((byteswap) & DMU_OT_BYTESWAP_MASK)) #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ (ot) < DMU_OT_NUMTYPES) #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_METADATA) : \ dmu_ot[(ot)].ot_metadata) /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill * is repurposed for embedded BPs. */ #define DMU_OT_HAS_FILL(ot) \ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) : \ dmu_ot[(ot)].ot_byteswap) typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ DMU_OT_OBJECT_DIRECTORY, /* ZAP */ DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ DMU_OT_BPOBJ, /* UINT64 */ DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ /* zil: */ DMU_OT_INTENT_LOG, /* UINT64 */ /* dmu: */ DMU_OT_DNODE, /* DNODE */ DMU_OT_OBJSET, /* OBJSET */ /* dsl: */ DMU_OT_DSL_DIR, /* UINT64 */ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ DMU_OT_DSL_PROPS, /* ZAP */ DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ DMU_OT_UNLINKED_SET, /* ZAP */ /* zvol: */ DMU_OT_ZVOL, /* UINT8 */ DMU_OT_ZVOL_PROP, /* ZAP */ /* other; for testing only! */ DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ /* new object types: */ DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ DMU_OT_DSL_PERMS, /* ZAP */ DMU_OT_ACL, /* ACL */ DMU_OT_SYSACL, /* SYSACL */ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ DMU_OT_DDT_ZAP, /* ZAP */ DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_SA, /* System attr */ DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_SCAN_XLATE, /* ZAP */ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_DEADLIST, /* ZAP */ DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ /* * Do not allocate new object types here. Doing so makes the on-disk * format incompatible with any other format that uses the same object * type number. * * When creating an object which does not have one of the above types * use the DMU_OTN_* type with the correct byteswap and metadata * values. * * The DMU_OTN_* types do not have entries in the dmu_ot table, * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead * of indexing into dmu_ot directly (this works for both DMU_OT_* types * and DMU_OTN_* types). */ DMU_OT_NUMTYPES, /* * Names for valid types declared with DMU_OT(). */ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE), DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE), DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE), DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE), DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE), DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE), DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE), DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE), DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE), DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), } dmu_object_type_t; typedef enum txg_how { TXG_WAIT = 1, TXG_NOWAIT, TXG_WAITED, } txg_how_t; void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) #define DS_FIND_SERIALIZE (1<<2) /* * The maximum number of bytes that can be accessed as part of one * operation, including metadata. */ #define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) /* * artificial blkids for bonus buffer and spill blocks */ #define DMU_BONUS_BLKID (-1ULL) #define DMU_SPILL_BLKID (-2ULL) /* * Public routines to create, destroy, open, and close objsets. */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname, struct nvlist *snaps); int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); int dmu_objset_snapshot_tmp(const char *, const char *, int); int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ void *db_data; /* data in buffer */ } dmu_buf_t; /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" #define DMU_POOL_FEATURES_FOR_READ "features_for_read" #define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" /* * Allocate an object from this objset. The range of object numbers * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. * * The transaction must be assigned to a txg. The newly allocated * object will be "held" in the transaction (ie. you can modify the * newly allocated object in this transaction). * * dmu_object_alloc() chooses an object and returns it in *objectp. * * dmu_object_claim() allocates a specific object number. If that * number is already allocated, it fails and returns EEXIST. * * Return 0 on success, or ENOSPC or EEXIST as specified above. */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); /* * Free an object from this objset. * * The object's data will be freed as well (ie. you don't need to call * dmu_free(object, 0, -1, tx)). * * The object need not be held in the transaction. * * If there are any holds on this object's buffers (via dmu_buf_hold()), * or tx holds on the object (via dmu_tx_hold_object()), you can not * free it; it fails and returns EBUSY. * * If the object is not allocated, it fails and returns ENOENT. * * Return 0 on success, or EBUSY or ENOENT as specified above. */ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Find the next allocated or free object. * * The objectp parameter is in-out. It will be updated to be the next * object which is allocated. Ignore objects which have not been * modified since txg. * * XXX Can only be called on a objset with no dirty data. * * Returns 0 on success, or ENOENT if there are no more objects. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg); /* * Set the data blocksize for an object. * * The object cannot have any blocks allcated beyond the first. If * the first block is allocated already, the new size must be greater * than the current block size. If these conditions are not met, * ENOTSUP will be returned. * * Returns 0 on success, or EBUSY if there are any holds on the object * contents, or ENOTSUP as described above. */ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx); /* * Set the checksum property on a dnode. The new checksum algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx); /* * Set the compress property on a dnode. The new compression algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. */ #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus * data. As with any normal buffer, you must call dmu_buf_read() to * read db_data, dmu_buf_will_dirty() before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release your hold with dmu_buf_rele(). * * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* * Special spill buffer support used by "SA" framework */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so * that it will remain in memory. You must release the hold with * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. * * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill * on the returned buffer before reading or writing the buffer's * db_data. The comments for those routines describe what particular * operations are valid after calling them. * * The object number must be a valid, allocated object number. */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **, int flags); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. */ void dmu_buf_add_ref(dmu_buf_t *db, void* tag); /* * Attempt to add a reference to a dmu buffer that is in an unknown state, * using a pointer that may have been invalidated by eviction processing. * The request will succeed if the passed in dbuf still represents the * same os/object/blkid, is ineligible for eviction, and has at least * one hold by a user other than the syncer. */ boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, uint64_t blkid, void *tag); void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); /* * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a * range of an object. A pointer to an array of dmu_buf_t*'s is * returned (in *dbpp). * * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and * frees the array. The hold on the array of buffers MUST be released * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); + uint64_t length, boolean_t read, void *tag, + int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); /* * A DMU buffer user object may be associated with a dbuf for the * duration of its lifetime. This allows the user of a dbuf (client) * to attach private data to a dbuf (e.g. in-core only data such as a * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified * when that dbuf has been evicted. Clients typically respond to the * eviction notification by freeing their private data, thus ensuring * the same lifetime for both dbuf and private data. * * The mapping from a dmu_buf_user_t to any client private data is the * client's responsibility. All current consumers of the API with private * data embed a dmu_buf_user_t as the first member of the structure for * their private data. This allows conversions between the two types * with a simple cast. Since the DMU buf user API never needs access * to the private data, other strategies can be employed if necessary * or convenient for the client (e.g. using container_of() to do the * conversion for private data that cannot have the dmu_buf_user_t as * its first member). * * Eviction callbacks are executed without the dbuf mutex held or any * other type of mechanism to guarantee that the dbuf is still available. * For this reason, users must assume the dbuf has already been freed * and not reference the dbuf from the callback context. * * Users requesting "immediate eviction" are notified as soon as the dbuf * is only referenced by dirty records (dirties == holds). Otherwise the * notification occurs after eviction processing for the dbuf begins. */ typedef struct dmu_buf_user { /* * Asynchronous user eviction callback state. */ taskq_ent_t dbu_tqent; /* This instance's eviction function pointer. */ dmu_buf_evict_func_t *dbu_evict_func; #ifdef ZFS_DEBUG /* * Pointer to user's dbuf pointer. NULL for clients that do * not associate a dbuf with their user data. * * The dbuf pointer is cleared upon eviction so as to catch * use-after-evict bugs in clients. */ dmu_buf_t **dbu_clear_on_evict_dbufp; #endif } dmu_buf_user_t; /* * Initialize the given dmu_buf_user_t instance with the eviction function * evict_func, to be called when the user is evicted. * * NOTE: This function should only be called once on a given dmu_buf_user_t. * To allow enforcement of this, dbu must already be zeroed on entry. */ #ifdef __lint /* Very ugly, but it beats issuing suppression directives in many Makefiles. */ extern void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp); #else /* __lint */ inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp) { ASSERT(dbu->dbu_evict_func == NULL); ASSERT(evict_func != NULL); dbu->dbu_evict_func = evict_func; #ifdef ZFS_DEBUG dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; #endif } #endif /* __lint */ /* * Attach user data to a dbuf and mark it for normal (when the dbuf's * data is cleared or its reference count goes to zero) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Attach user data to a dbuf and mark it for immediate (its dirty and * reference counts are equal) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); /* * Replace the current user of a dbuf. * * If given the current user of a dbuf, replaces the dbuf's user with * "new_user" and returns the user data pointer that was replaced. * Otherwise returns the current, and unmodified, dbuf user pointer. */ void *dmu_buf_replace_user(dmu_buf_t *db, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); /* * Remove the specified user data for a DMU buffer. * * Returns the user that was removed on success, or the current user if * another user currently owns the buffer. */ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Returns the user data (dmu_buf_user_t *) associated with this dbuf. */ void *dmu_buf_get_user(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); /* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. you've called * dmu_tx_assign()). The buffer's object must be held in the tx * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); /* * Tells if the given dbuf is freeable. */ boolean_t dmu_buf_freeable(dmu_buf_t *); /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has * been assigned, you can modify buffers which belong to held objects as * part of this transaction. You can't modify buffers before the * transaction has been assigned; you can't modify buffers which don't * belong to objects which this transaction holds; you can't hold * objects once the transaction has been assigned. You may hold an * object which you are going to free (with dmu_object_free()), but you * don't have to. * * You can abort the transaction before it has been assigned. * * Note that you may hold buffers (with dmu_buf_hold) at any time, * regardless of transaction state. */ #define DMU_NEW_OBJECT (-1ULL) #define DMU_OBJECT_END (-1ULL) dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); /* * To register a commit callback, dmu_tx_callback_register() must be called. * * dcb_data is a pointer to caller private data that is passed on as a * callback parameter. The caller is responsible for properly allocating and * freeing it. * * When registering a callback, the transaction must be already created, but * it cannot be committed or aborted. It can be assigned to a txg or not. * * The callback will be called after the transaction has been safely written * to stable storage and will also be called if the dmu_tx is aborted. * If there is any error which prevents the transaction from being committed to * disk, the callback will be called with a value of error != 0. */ typedef void dmu_tx_callback_func_t(void *dcb_data, int error); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); /* * Free up the data blocks for a defined range of a file. If size is * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); /* * Convenience functions. * * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL #ifdef illumos int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); #else int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct vm_page **ppa, dmu_tx_t *tx); #endif #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_xuio_init(struct xuio *uio, int niov); void dmu_xuio_fini(struct xuio *uio); int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, size_t n); int dmu_xuio_cnt(struct xuio *uio); struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); void dmu_xuio_clear(struct xuio *uio, int i); void xuio_stat_wbuf_copied(); void xuio_stat_wbuf_nocopy(); -extern int zfs_prefetch_disable; +extern boolean_t zfs_prefetch_disable; extern int zfs_max_recordsize; /* * Asynchronously try to read in the data. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_nblkptr; uint8_t doi_pad[4]; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void arc_byteswap_func_t(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; char *ot_name; } dmu_object_type_info_t; typedef struct dmu_object_byteswap_info { arc_byteswap_func_t *ob_func; char *ob_name; } dmu_object_byteswap_info_t; extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. * * Return 0 on success or ENOENT if object is not allocated. * * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* * Like dmu_object_info_from_db, but faster still when you only care about * the size. This is specifically optimized for zfs_getattr(). */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; uint64_t dds_guid; dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; char dds_origin[MAXNAMELEN]; } dmu_objset_stats_t; /* * Get stats on a dataset. */ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); /* * Add entries to the nvlist for all the objset's properties. See * zfs_prop_table[] and zfs(1m) for details on the properties. */ void dmu_objset_stats(objset_t *os, struct nvlist *nv); /* * Get the space usage statistics for statvfs(). * * refdbytes is the amount of space "referenced" by this objset. * availbytes is the amount of space available to this objset, taking * into account quotas & reservations, assuming that no other objsets * use the space first. These values correspond to the 'referenced' and * 'available' properties, described in the zfs(1m) manpage. * * usedobjs and availobjs are the number of objects currently allocated, * and available. */ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); /* * The fsid_guid is a 56-bit ID that can change to avoid collisions. * (Contrast with the ds_guid which is a 64-bit ID that will never * change, so there is a small probability that it will collide.) */ uint64_t dmu_objset_fsid_guid(objset_t *os); /* * Get the [cm]time for an objset's snapshot dir */ timestruc_t dmu_objset_snap_cmtime(objset_t *os); int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); extern struct zilog *dmu_objset_zil(objset_t *os); extern struct dsl_pool *dmu_objset_pool(objset_t *os); extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); typedef int objset_used_cb_t(dmu_object_type_t bonustype, void *bonus, uint64_t *userp, uint64_t *groupp); extern void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); /* * Return the txg number for the given assigned transaction. */ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* * Synchronous write. * If a parent zio is provided this function initiates a write on the * provided buffer as a child of the parent zio. * In the absence of a parent zio, the write is completed synchronously. * At write completion, blk is filled with the bp of the written block. * Note that while the data covered by this function will be on stable * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ /* * {zfs,zvol,ztest}_get_done() args */ typedef struct zgd { struct zilog *zgd_zilog; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct rl *zgd_rl; void *zgd_private; } zgd_t; typedef void dmu_sync_cb_t(zgd_t *arg, int error); int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off * Return found offset in *off. Return ESRCH for end of file. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); /* * Check if a DMU object has any dirty blocks. If so, sync out * all pending transaction groups. Otherwise, this function * does not alter DMU state. This could be improved to only sync * out the necessary transaction groups for this particular * object. */ int dmu_object_wait_synced(objset_t *os, uint64_t object); /* * Initial setup and final teardown. */ extern void dmu_init(void); extern void dmu_fini(void); typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, uint64_t object, uint64_t offset, int len); void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_diff(const char *tosnap_name, const char *fromsnap_name, struct file *fp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; extern int zfs_mdcomp_disable; #ifdef __cplusplus } #endif #endif /* _SYS_DMU_H */ Index: projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h =================================================================== --- projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h (revision 287721) +++ projects/iosched/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h (revision 287722) @@ -1,76 +1,69 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#ifndef _DFETCH_H -#define _DFETCH_H +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ +#ifndef _DMU_ZFETCH_H +#define _DMU_ZFETCH_H + #include #ifdef __cplusplus extern "C" { #endif extern uint64_t zfetch_array_rd_sz; struct dnode; /* so we can reference dnode */ -typedef enum zfetch_dirn { - ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */ - ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */ -} zfetch_dirn_t; - typedef struct zstream { - uint64_t zst_offset; /* offset of starting block in range */ - uint64_t zst_len; /* length of range, in blocks */ - zfetch_dirn_t zst_direction; /* direction of prefetch */ - uint64_t zst_stride; /* length of stride, in blocks */ - uint64_t zst_ph_offset; /* prefetch offset, in blocks */ - uint64_t zst_cap; /* prefetch limit (cap), in blocks */ - kmutex_t zst_lock; /* protects stream */ - clock_t zst_last; /* lbolt of last prefetch */ - avl_node_t zst_node; /* embed avl node here */ + uint64_t zs_blkid; /* expect next access at this blkid */ + uint64_t zs_pf_blkid; /* next block to prefetch */ + kmutex_t zs_lock; /* protects stream */ + hrtime_t zs_atime; /* time last prefetch issued */ + list_node_t zs_node; /* link for zf_stream */ } zstream_t; typedef struct zfetch { krwlock_t zf_rwlock; /* protects zfetch structure */ - list_t zf_stream; /* AVL tree of zstream_t's */ + list_t zf_stream; /* list of zstream_t's */ struct dnode *zf_dnode; /* dnode that owns this zfetch */ - uint32_t zf_stream_cnt; /* # of active streams */ - uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */ } zfetch_t; void zfetch_init(void); void zfetch_fini(void); void dmu_zfetch_init(zfetch_t *, struct dnode *); -void dmu_zfetch_rele(zfetch_t *); -void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int); +void dmu_zfetch_fini(zfetch_t *); +void dmu_zfetch(zfetch_t *, uint64_t, uint64_t); #ifdef __cplusplus } #endif -#endif /* _DFETCH_H */ +#endif /* _DMU_ZFETCH_H */ Index: projects/iosched/sys/cddl/contrib/opensolaris =================================================================== --- projects/iosched/sys/cddl/contrib/opensolaris (revision 287721) +++ projects/iosched/sys/cddl/contrib/opensolaris (revision 287722) Property changes on: projects/iosched/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,2 ## Merged /head/sys/cddl/contrib/opensolaris:r287701-287721 Merged /vendor-sys/illumos/dist:r287624,287684,287699 Index: projects/iosched/sys/dev/dwc/if_dwc.c =================================================================== --- projects/iosched/sys/dev/dwc/if_dwc.c (revision 287721) +++ projects/iosched/sys/dev/dwc/if_dwc.c (revision 287722) @@ -1,1309 +1,1307 @@ /*- * Copyright (c) 2014 Ruslan Bukin * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Ethernet media access controller (EMAC) * Chapter 17, Altera Cyclone V Device Handbook (CV-5V2 2014.07.22) * * EMAC is an instance of the Synopsys DesignWare 3504-0 * Universal 10/100/1000 Ethernet MAC (DWC_gmac). */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "miibus_if.h" #define READ4(_sc, _reg) \ bus_read_4((_sc)->res[0], _reg) #define WRITE4(_sc, _reg, _val) \ bus_write_4((_sc)->res[0], _reg, _val) #define MAC_RESET_TIMEOUT 100 #define WATCHDOG_TIMEOUT_SECS 5 #define STATS_HARVEST_INTERVAL 2 #define MII_CLK_VAL 2 #include #define DWC_LOCK(sc) mtx_lock(&(sc)->mtx) #define DWC_UNLOCK(sc) mtx_unlock(&(sc)->mtx) #define DWC_ASSERT_LOCKED(sc) mtx_assert(&(sc)->mtx, MA_OWNED) #define DWC_ASSERT_UNLOCKED(sc) mtx_assert(&(sc)->mtx, MA_NOTOWNED) #define DDESC_TDES0_OWN (1U << 31) #define DDESC_TDES0_TXINT (1U << 30) #define DDESC_TDES0_TXLAST (1U << 29) #define DDESC_TDES0_TXFIRST (1U << 28) #define DDESC_TDES0_TXCRCDIS (1U << 27) #define DDESC_TDES0_TXRINGEND (1U << 21) #define DDESC_TDES0_TXCHAIN (1U << 20) #define DDESC_RDES0_OWN (1U << 31) #define DDESC_RDES0_FL_MASK 0x3fff #define DDESC_RDES0_FL_SHIFT 16 /* Frame Length */ #define DDESC_RDES1_CHAINED (1U << 14) struct dwc_bufmap { bus_dmamap_t map; struct mbuf *mbuf; }; /* * A hardware buffer descriptor. Rx and Tx buffers have the same descriptor * layout, but the bits in the flags field have different meanings. */ struct dwc_hwdesc { uint32_t tdes0; uint32_t tdes1; uint32_t addr; /* pointer to buffer data */ uint32_t addr_next; /* link to next descriptor */ }; /* * Driver data and defines. */ #define RX_DESC_COUNT 1024 #define RX_DESC_SIZE (sizeof(struct dwc_hwdesc) * RX_DESC_COUNT) #define TX_DESC_COUNT 1024 #define TX_DESC_SIZE (sizeof(struct dwc_hwdesc) * TX_DESC_COUNT) /* * The hardware imposes alignment restrictions on various objects involved in * DMA transfers. These values are expressed in bytes (not bits). */ #define DWC_DESC_RING_ALIGN 2048 struct dwc_softc { struct resource *res[2]; bus_space_tag_t bst; bus_space_handle_t bsh; device_t dev; int mii_clk; device_t miibus; struct mii_data * mii_softc; struct ifnet *ifp; int if_flags; struct mtx mtx; void * intr_cookie; struct callout dwc_callout; boolean_t link_is_up; boolean_t is_attached; boolean_t is_detaching; int tx_watchdog_count; int stats_harvest_count; /* RX */ bus_dma_tag_t rxdesc_tag; bus_dmamap_t rxdesc_map; struct dwc_hwdesc *rxdesc_ring; bus_addr_t rxdesc_ring_paddr; bus_dma_tag_t rxbuf_tag; struct dwc_bufmap rxbuf_map[RX_DESC_COUNT]; uint32_t rx_idx; /* TX */ bus_dma_tag_t txdesc_tag; bus_dmamap_t txdesc_map; struct dwc_hwdesc *txdesc_ring; bus_addr_t txdesc_ring_paddr; bus_dma_tag_t txbuf_tag; struct dwc_bufmap txbuf_map[RX_DESC_COUNT]; uint32_t tx_idx_head; uint32_t tx_idx_tail; int txcount; }; static struct resource_spec dwc_spec[] = { { SYS_RES_MEMORY, 0, RF_ACTIVE }, { SYS_RES_IRQ, 0, RF_ACTIVE }, { -1, 0 } }; static void dwc_txfinish_locked(struct dwc_softc *sc); static void dwc_rxfinish_locked(struct dwc_softc *sc); static void dwc_stop_locked(struct dwc_softc *sc); static void dwc_setup_rxfilter(struct dwc_softc *sc); static inline uint32_t next_rxidx(struct dwc_softc *sc, uint32_t curidx) { return ((curidx + 1) % RX_DESC_COUNT); } static inline uint32_t next_txidx(struct dwc_softc *sc, uint32_t curidx) { return ((curidx + 1) % TX_DESC_COUNT); } static void dwc_get1paddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { if (error != 0) return; *(bus_addr_t *)arg = segs[0].ds_addr; } inline static uint32_t dwc_setup_txdesc(struct dwc_softc *sc, int idx, bus_addr_t paddr, uint32_t len) { uint32_t flags; uint32_t nidx; nidx = next_txidx(sc, idx); /* Addr/len 0 means we're clearing the descriptor after xmit done. */ if (paddr == 0 || len == 0) { flags = 0; --sc->txcount; } else { flags = DDESC_TDES0_TXCHAIN | DDESC_TDES0_TXFIRST | DDESC_TDES0_TXLAST | DDESC_TDES0_TXINT; ++sc->txcount; } sc->txdesc_ring[idx].addr = (uint32_t)(paddr); sc->txdesc_ring[idx].tdes0 = flags; sc->txdesc_ring[idx].tdes1 = len; if (paddr && len) { wmb(); sc->txdesc_ring[idx].tdes0 |= DDESC_TDES0_OWN; wmb(); } return (nidx); } static int dwc_setup_txbuf(struct dwc_softc *sc, int idx, struct mbuf **mp) { struct bus_dma_segment seg; int error, nsegs; struct mbuf * m; if ((m = m_defrag(*mp, M_NOWAIT)) == NULL) return (ENOMEM); *mp = m; error = bus_dmamap_load_mbuf_sg(sc->txbuf_tag, sc->txbuf_map[idx].map, m, &seg, &nsegs, 0); if (error != 0) { return (ENOMEM); } KASSERT(nsegs == 1, ("%s: %d segments returned!", __func__, nsegs)); bus_dmamap_sync(sc->txbuf_tag, sc->txbuf_map[idx].map, BUS_DMASYNC_PREWRITE); sc->txbuf_map[idx].mbuf = m; dwc_setup_txdesc(sc, idx, seg.ds_addr, seg.ds_len); return (0); } static void dwc_txstart_locked(struct dwc_softc *sc) { struct ifnet *ifp; struct mbuf *m; int enqueued; DWC_ASSERT_LOCKED(sc); if (!sc->link_is_up) return; ifp = sc->ifp; if (ifp->if_drv_flags & IFF_DRV_OACTIVE) { return; } enqueued = 0; for (;;) { if (sc->txcount == (TX_DESC_COUNT-1)) { ifp->if_drv_flags |= IFF_DRV_OACTIVE; break; } IFQ_DRV_DEQUEUE(&ifp->if_snd, m); if (m == NULL) break; if (dwc_setup_txbuf(sc, sc->tx_idx_head, &m) != 0) { IFQ_DRV_PREPEND(&ifp->if_snd, m); break; } BPF_MTAP(ifp, m); sc->tx_idx_head = next_txidx(sc, sc->tx_idx_head); ++enqueued; } if (enqueued != 0) { WRITE4(sc, TRANSMIT_POLL_DEMAND, 0x1); sc->tx_watchdog_count = WATCHDOG_TIMEOUT_SECS; } } static void dwc_txstart(struct ifnet *ifp) { struct dwc_softc *sc = ifp->if_softc; DWC_LOCK(sc); dwc_txstart_locked(sc); DWC_UNLOCK(sc); } static void dwc_stop_locked(struct dwc_softc *sc) { struct ifnet *ifp; uint32_t reg; DWC_ASSERT_LOCKED(sc); ifp = sc->ifp; ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); sc->tx_watchdog_count = 0; sc->stats_harvest_count = 0; callout_stop(&sc->dwc_callout); /* Stop DMA TX */ reg = READ4(sc, OPERATION_MODE); reg &= ~(MODE_ST); WRITE4(sc, OPERATION_MODE, reg); /* Flush TX */ reg = READ4(sc, OPERATION_MODE); reg |= (MODE_FTF); WRITE4(sc, OPERATION_MODE, reg); /* Stop transmitters */ reg = READ4(sc, MAC_CONFIGURATION); reg &= ~(CONF_TE | CONF_RE); WRITE4(sc, MAC_CONFIGURATION, reg); /* Stop DMA RX */ reg = READ4(sc, OPERATION_MODE); reg &= ~(MODE_SR); WRITE4(sc, OPERATION_MODE, reg); } static void dwc_clear_stats(struct dwc_softc *sc) { uint32_t reg; reg = READ4(sc, MMC_CONTROL); reg |= (MMC_CONTROL_CNTRST); WRITE4(sc, MMC_CONTROL, reg); } static void dwc_harvest_stats(struct dwc_softc *sc) { struct ifnet *ifp; /* We don't need to harvest too often. */ if (++sc->stats_harvest_count < STATS_HARVEST_INTERVAL) return; sc->stats_harvest_count = 0; ifp = sc->ifp; if_inc_counter(ifp, IFCOUNTER_IPACKETS, READ4(sc, RXFRAMECOUNT_GB)); if_inc_counter(ifp, IFCOUNTER_IMCASTS, READ4(sc, RXMULTICASTFRAMES_G)); if_inc_counter(ifp, IFCOUNTER_IERRORS, READ4(sc, RXOVERSIZE_G) + READ4(sc, RXUNDERSIZE_G) + READ4(sc, RXCRCERROR) + READ4(sc, RXALIGNMENTERROR) + READ4(sc, RXRUNTERROR) + READ4(sc, RXJABBERERROR) + READ4(sc, RXLENGTHERROR)); if_inc_counter(ifp, IFCOUNTER_OPACKETS, READ4(sc, TXFRAMECOUNT_G)); if_inc_counter(ifp, IFCOUNTER_OMCASTS, READ4(sc, TXMULTICASTFRAMES_G)); if_inc_counter(ifp, IFCOUNTER_OERRORS, READ4(sc, TXOVERSIZE_G) + READ4(sc, TXEXCESSDEF) + READ4(sc, TXCARRIERERR) + READ4(sc, TXUNDERFLOWERROR)); if_inc_counter(ifp, IFCOUNTER_COLLISIONS, READ4(sc, TXEXESSCOL) + READ4(sc, TXLATECOL)); dwc_clear_stats(sc); } static void dwc_tick(void *arg) { struct dwc_softc *sc; struct ifnet *ifp; int link_was_up; sc = arg; DWC_ASSERT_LOCKED(sc); ifp = sc->ifp; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) return; /* * Typical tx watchdog. If this fires it indicates that we enqueued * packets for output and never got a txdone interrupt for them. Maybe * it's a missed interrupt somehow, just pretend we got one. */ if (sc->tx_watchdog_count > 0) { if (--sc->tx_watchdog_count == 0) { dwc_txfinish_locked(sc); } } /* Gather stats from hardware counters. */ dwc_harvest_stats(sc); /* Check the media status. */ link_was_up = sc->link_is_up; mii_tick(sc->mii_softc); if (sc->link_is_up && !link_was_up) dwc_txstart_locked(sc); /* Schedule another check one second from now. */ callout_reset(&sc->dwc_callout, hz, dwc_tick, sc); } static void dwc_init_locked(struct dwc_softc *sc) { struct ifnet *ifp = sc->ifp; uint32_t reg; DWC_ASSERT_LOCKED(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; ifp->if_drv_flags |= IFF_DRV_RUNNING; dwc_setup_rxfilter(sc); /* Initializa DMA and enable transmitters */ reg = READ4(sc, OPERATION_MODE); reg |= (MODE_TSF | MODE_OSF | MODE_FUF); reg &= ~(MODE_RSF); reg |= (MODE_RTC_LEV32 << MODE_RTC_SHIFT); WRITE4(sc, OPERATION_MODE, reg); WRITE4(sc, INTERRUPT_ENABLE, INT_EN_DEFAULT); /* Start DMA */ reg = READ4(sc, OPERATION_MODE); reg |= (MODE_ST | MODE_SR); WRITE4(sc, OPERATION_MODE, reg); /* Enable transmitters */ reg = READ4(sc, MAC_CONFIGURATION); reg |= (CONF_JD | CONF_ACS | CONF_BE); reg |= (CONF_TE | CONF_RE); WRITE4(sc, MAC_CONFIGURATION, reg); /* * Call mii_mediachg() which will call back into dwc_miibus_statchg() * to set up the remaining config registers based on current media. */ mii_mediachg(sc->mii_softc); callout_reset(&sc->dwc_callout, hz, dwc_tick, sc); } static void dwc_init(void *if_softc) { struct dwc_softc *sc = if_softc; DWC_LOCK(sc); dwc_init_locked(sc); DWC_UNLOCK(sc); } inline static uint32_t dwc_setup_rxdesc(struct dwc_softc *sc, int idx, bus_addr_t paddr) { uint32_t nidx; sc->rxdesc_ring[idx].addr = (uint32_t)paddr; nidx = next_rxidx(sc, idx); sc->rxdesc_ring[idx].addr_next = sc->rxdesc_ring_paddr + \ (nidx * sizeof(struct dwc_hwdesc)); sc->rxdesc_ring[idx].tdes1 = DDESC_RDES1_CHAINED | MCLBYTES; wmb(); sc->rxdesc_ring[idx].tdes0 = DDESC_RDES0_OWN; wmb(); return (nidx); } static int dwc_setup_rxbuf(struct dwc_softc *sc, int idx, struct mbuf *m) { struct bus_dma_segment seg; int error, nsegs; m_adj(m, ETHER_ALIGN); error = bus_dmamap_load_mbuf_sg(sc->rxbuf_tag, sc->rxbuf_map[idx].map, m, &seg, &nsegs, 0); if (error != 0) { return (error); } KASSERT(nsegs == 1, ("%s: %d segments returned!", __func__, nsegs)); bus_dmamap_sync(sc->rxbuf_tag, sc->rxbuf_map[idx].map, BUS_DMASYNC_PREREAD); sc->rxbuf_map[idx].mbuf = m; dwc_setup_rxdesc(sc, idx, seg.ds_addr); return (0); } static struct mbuf * dwc_alloc_mbufcl(struct dwc_softc *sc) { struct mbuf *m; m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m != NULL) m->m_pkthdr.len = m->m_len = m->m_ext.ext_size; return (m); } static void dwc_media_status(struct ifnet * ifp, struct ifmediareq *ifmr) { struct dwc_softc *sc; struct mii_data *mii; sc = ifp->if_softc; mii = sc->mii_softc; DWC_LOCK(sc); mii_pollstat(mii); ifmr->ifm_active = mii->mii_media_active; ifmr->ifm_status = mii->mii_media_status; DWC_UNLOCK(sc); } static int dwc_media_change_locked(struct dwc_softc *sc) { return (mii_mediachg(sc->mii_softc)); } static int dwc_media_change(struct ifnet * ifp) { struct dwc_softc *sc; int error; sc = ifp->if_softc; DWC_LOCK(sc); error = dwc_media_change_locked(sc); DWC_UNLOCK(sc); return (error); } static const uint8_t nibbletab[] = { /* 0x0 0000 -> 0000 */ 0x0, /* 0x1 0001 -> 1000 */ 0x8, /* 0x2 0010 -> 0100 */ 0x4, /* 0x3 0011 -> 1100 */ 0xc, /* 0x4 0100 -> 0010 */ 0x2, /* 0x5 0101 -> 1010 */ 0xa, /* 0x6 0110 -> 0110 */ 0x6, /* 0x7 0111 -> 1110 */ 0xe, /* 0x8 1000 -> 0001 */ 0x1, /* 0x9 1001 -> 1001 */ 0x9, /* 0xa 1010 -> 0101 */ 0x5, /* 0xb 1011 -> 1101 */ 0xd, /* 0xc 1100 -> 0011 */ 0x3, /* 0xd 1101 -> 1011 */ 0xb, /* 0xe 1110 -> 0111 */ 0x7, /* 0xf 1111 -> 1111 */ 0xf, }; static uint8_t bitreverse(uint8_t x) { return (nibbletab[x & 0xf] << 4) | nibbletab[x >> 4]; } static void dwc_setup_rxfilter(struct dwc_softc *sc) { struct ifmultiaddr *ifma; struct ifnet *ifp; uint8_t *eaddr, val; uint32_t crc, ffval, hashbit, hashreg, hi, lo, reg; DWC_ASSERT_LOCKED(sc); ifp = sc->ifp; /* * Set the multicast (group) filter hash. */ if ((ifp->if_flags & IFF_ALLMULTI)) ffval = (FRAME_FILTER_PM); else { ffval = (FRAME_FILTER_HMC); if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &sc->ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; crc = ether_crc32_le(LLADDR((struct sockaddr_dl *) ifma->ifma_addr), ETHER_ADDR_LEN); /* Take lower 8 bits and reverse it */ val = bitreverse(~crc & 0xff); hashreg = (val >> 5); hashbit = (val & 31); reg = READ4(sc, HASH_TABLE_REG(hashreg)); reg |= (1 << hashbit); WRITE4(sc, HASH_TABLE_REG(hashreg), reg); } if_maddr_runlock(ifp); } /* * Set the individual address filter hash. */ if (ifp->if_flags & IFF_PROMISC) ffval |= (FRAME_FILTER_PR); /* * Set the primary address. */ eaddr = IF_LLADDR(ifp); lo = eaddr[0] | (eaddr[1] << 8) | (eaddr[2] << 16) | (eaddr[3] << 24); hi = eaddr[4] | (eaddr[5] << 8); WRITE4(sc, MAC_ADDRESS_LOW(0), lo); WRITE4(sc, MAC_ADDRESS_HIGH(0), hi); WRITE4(sc, MAC_FRAME_FILTER, ffval); } static int dwc_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct dwc_softc *sc; struct mii_data *mii; struct ifreq *ifr; int mask, error; sc = ifp->if_softc; ifr = (struct ifreq *)data; error = 0; switch (cmd) { case SIOCSIFFLAGS: DWC_LOCK(sc); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if ((ifp->if_flags ^ sc->if_flags) & (IFF_PROMISC | IFF_ALLMULTI)) dwc_setup_rxfilter(sc); } else { if (!sc->is_detaching) dwc_init_locked(sc); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) dwc_stop_locked(sc); } sc->if_flags = ifp->if_flags; DWC_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifp->if_drv_flags & IFF_DRV_RUNNING) { DWC_LOCK(sc); dwc_setup_rxfilter(sc); DWC_UNLOCK(sc); } break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: mii = sc->mii_softc; error = ifmedia_ioctl(ifp, ifr, &mii->mii_media, cmd); break; case SIOCSIFCAP: mask = ifp->if_capenable ^ ifr->ifr_reqcap; if (mask & IFCAP_VLAN_MTU) { /* No work to do except acknowledge the change took */ ifp->if_capenable ^= IFCAP_VLAN_MTU; } break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void dwc_txfinish_locked(struct dwc_softc *sc) { struct dwc_bufmap *bmap; struct dwc_hwdesc *desc; struct ifnet *ifp; DWC_ASSERT_LOCKED(sc); ifp = sc->ifp; while (sc->tx_idx_tail != sc->tx_idx_head) { desc = &sc->txdesc_ring[sc->tx_idx_tail]; if ((desc->tdes0 & DDESC_TDES0_OWN) != 0) break; bmap = &sc->txbuf_map[sc->tx_idx_tail]; bus_dmamap_sync(sc->txbuf_tag, bmap->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sc->txbuf_tag, bmap->map); m_freem(bmap->mbuf); bmap->mbuf = NULL; dwc_setup_txdesc(sc, sc->tx_idx_tail, 0, 0); sc->tx_idx_tail = next_txidx(sc, sc->tx_idx_tail); ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); } /* If there are no buffers outstanding, muzzle the watchdog. */ if (sc->tx_idx_tail == sc->tx_idx_head) { sc->tx_watchdog_count = 0; } } static void dwc_rxfinish_locked(struct dwc_softc *sc) { struct ifnet *ifp; struct mbuf *m0; struct mbuf *m; int error, idx, len; uint32_t rdes0; ifp = sc->ifp; for (;;) { idx = sc->rx_idx; rdes0 = sc->rxdesc_ring[idx].tdes0; if ((rdes0 & DDESC_RDES0_OWN) != 0) break; bus_dmamap_sync(sc->rxbuf_tag, sc->rxbuf_map[idx].map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->rxbuf_tag, sc->rxbuf_map[idx].map); len = (rdes0 >> DDESC_RDES0_FL_SHIFT) & DDESC_RDES0_FL_MASK; if (len != 0) { m = sc->rxbuf_map[idx].mbuf; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = len; m->m_len = len; if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); DWC_UNLOCK(sc); (*ifp->if_input)(ifp, m); DWC_LOCK(sc); } else { /* XXX Zero-length packet ? */ } if ((m0 = dwc_alloc_mbufcl(sc)) != NULL) { if ((error = dwc_setup_rxbuf(sc, idx, m0)) != 0) { /* * XXX Now what? * We've got a hole in the rx ring. */ } } else if_inc_counter(sc->ifp, IFCOUNTER_IQDROPS, 1); sc->rx_idx = next_rxidx(sc, sc->rx_idx); } } static void dwc_intr(void *arg) { struct dwc_softc *sc; uint32_t reg; sc = arg; DWC_LOCK(sc); reg = READ4(sc, INTERRUPT_STATUS); - if (reg) { - mii_mediachg(sc->mii_softc); + if (reg) READ4(sc, SGMII_RGMII_SMII_CTRL_STATUS); - } reg = READ4(sc, DMA_STATUS); if (reg & DMA_STATUS_NIS) { if (reg & DMA_STATUS_RI) dwc_rxfinish_locked(sc); if (reg & DMA_STATUS_TI) { dwc_txfinish_locked(sc); dwc_txstart_locked(sc); } } if (reg & DMA_STATUS_AIS) { if (reg & DMA_STATUS_FBI) { /* Fatal bus error */ device_printf(sc->dev, "Ethernet DMA error, restarting controller.\n"); dwc_stop_locked(sc); dwc_init_locked(sc); } } WRITE4(sc, DMA_STATUS, reg & DMA_STATUS_INTR_MASK); DWC_UNLOCK(sc); } static int setup_dma(struct dwc_softc *sc) { struct mbuf *m; int error; int nidx; int idx; /* * Set up TX descriptor ring, descriptors, and dma maps. */ error = bus_dma_tag_create( bus_get_dma_tag(sc->dev), /* Parent tag. */ DWC_DESC_RING_ALIGN, 0, /* alignment, boundary */ BUS_SPACE_MAXADDR_32BIT, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ TX_DESC_SIZE, 1, /* maxsize, nsegments */ TX_DESC_SIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->txdesc_tag); if (error != 0) { device_printf(sc->dev, "could not create TX ring DMA tag.\n"); goto out; } error = bus_dmamem_alloc(sc->txdesc_tag, (void**)&sc->txdesc_ring, BUS_DMA_COHERENT | BUS_DMA_WAITOK | BUS_DMA_ZERO, &sc->txdesc_map); if (error != 0) { device_printf(sc->dev, "could not allocate TX descriptor ring.\n"); goto out; } error = bus_dmamap_load(sc->txdesc_tag, sc->txdesc_map, sc->txdesc_ring, TX_DESC_SIZE, dwc_get1paddr, &sc->txdesc_ring_paddr, 0); if (error != 0) { device_printf(sc->dev, "could not load TX descriptor ring map.\n"); goto out; } for (idx = 0; idx < TX_DESC_COUNT; idx++) { nidx = next_txidx(sc, idx); sc->txdesc_ring[idx].addr_next = sc->txdesc_ring_paddr + (nidx * sizeof(struct dwc_hwdesc)); } error = bus_dma_tag_create( bus_get_dma_tag(sc->dev), /* Parent tag. */ 1, 0, /* alignment, boundary */ BUS_SPACE_MAXADDR_32BIT, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MCLBYTES, 1, /* maxsize, nsegments */ MCLBYTES, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->txbuf_tag); if (error != 0) { device_printf(sc->dev, "could not create TX ring DMA tag.\n"); goto out; } for (idx = 0; idx < TX_DESC_COUNT; idx++) { error = bus_dmamap_create(sc->txbuf_tag, BUS_DMA_COHERENT, &sc->txbuf_map[idx].map); if (error != 0) { device_printf(sc->dev, "could not create TX buffer DMA map.\n"); goto out; } dwc_setup_txdesc(sc, idx, 0, 0); } /* * Set up RX descriptor ring, descriptors, dma maps, and mbufs. */ error = bus_dma_tag_create( bus_get_dma_tag(sc->dev), /* Parent tag. */ DWC_DESC_RING_ALIGN, 0, /* alignment, boundary */ BUS_SPACE_MAXADDR_32BIT, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ RX_DESC_SIZE, 1, /* maxsize, nsegments */ RX_DESC_SIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->rxdesc_tag); if (error != 0) { device_printf(sc->dev, "could not create RX ring DMA tag.\n"); goto out; } error = bus_dmamem_alloc(sc->rxdesc_tag, (void **)&sc->rxdesc_ring, BUS_DMA_COHERENT | BUS_DMA_WAITOK | BUS_DMA_ZERO, &sc->rxdesc_map); if (error != 0) { device_printf(sc->dev, "could not allocate RX descriptor ring.\n"); goto out; } error = bus_dmamap_load(sc->rxdesc_tag, sc->rxdesc_map, sc->rxdesc_ring, RX_DESC_SIZE, dwc_get1paddr, &sc->rxdesc_ring_paddr, 0); if (error != 0) { device_printf(sc->dev, "could not load RX descriptor ring map.\n"); goto out; } error = bus_dma_tag_create( bus_get_dma_tag(sc->dev), /* Parent tag. */ 1, 0, /* alignment, boundary */ BUS_SPACE_MAXADDR_32BIT, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MCLBYTES, 1, /* maxsize, nsegments */ MCLBYTES, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->rxbuf_tag); if (error != 0) { device_printf(sc->dev, "could not create RX buf DMA tag.\n"); goto out; } for (idx = 0; idx < RX_DESC_COUNT; idx++) { error = bus_dmamap_create(sc->rxbuf_tag, BUS_DMA_COHERENT, &sc->rxbuf_map[idx].map); if (error != 0) { device_printf(sc->dev, "could not create RX buffer DMA map.\n"); goto out; } if ((m = dwc_alloc_mbufcl(sc)) == NULL) { device_printf(sc->dev, "Could not alloc mbuf\n"); error = ENOMEM; goto out; } if ((error = dwc_setup_rxbuf(sc, idx, m)) != 0) { device_printf(sc->dev, "could not create new RX buffer.\n"); goto out; } } out: if (error != 0) return (ENXIO); return (0); } static int dwc_get_hwaddr(struct dwc_softc *sc, uint8_t *hwaddr) { uint32_t hi, lo, rnd; /* * Try to recover a MAC address from the running hardware. If there's * something non-zero there, assume the bootloader did the right thing * and just use it. * * Otherwise, set the address to a convenient locally assigned address, * 'bsd' + random 24 low-order bits. 'b' is 0x62, which has the locally * assigned bit set, and the broadcast/multicast bit clear. */ lo = READ4(sc, MAC_ADDRESS_LOW(0)); hi = READ4(sc, MAC_ADDRESS_HIGH(0)) & 0xffff; if ((lo != 0xffffffff) || (hi != 0xffff)) { hwaddr[0] = (lo >> 0) & 0xff; hwaddr[1] = (lo >> 8) & 0xff; hwaddr[2] = (lo >> 16) & 0xff; hwaddr[3] = (lo >> 24) & 0xff; hwaddr[4] = (hi >> 0) & 0xff; hwaddr[5] = (hi >> 8) & 0xff; } else { rnd = arc4random() & 0x00ffffff; hwaddr[0] = 'b'; hwaddr[1] = 's'; hwaddr[2] = 'd'; hwaddr[3] = rnd >> 16; hwaddr[4] = rnd >> 8; hwaddr[5] = rnd >> 0; } return (0); } static int dwc_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); if (!ofw_bus_is_compatible(dev, "snps,dwmac")) return (ENXIO); device_set_desc(dev, "Gigabit Ethernet Controller"); return (BUS_PROBE_DEFAULT); } static int dwc_attach(device_t dev) { uint8_t macaddr[ETHER_ADDR_LEN]; struct dwc_softc *sc; struct ifnet *ifp; int error, i; uint32_t reg; sc = device_get_softc(dev); sc->dev = dev; sc->mii_clk = MII_CLK_VAL; sc->rx_idx = 0; sc->txcount = TX_DESC_COUNT; if (bus_alloc_resources(dev, dwc_spec, sc->res)) { device_printf(dev, "could not allocate resources\n"); return (ENXIO); } /* Memory interface */ sc->bst = rman_get_bustag(sc->res[0]); sc->bsh = rman_get_bushandle(sc->res[0]); /* Read MAC before reset */ if (dwc_get_hwaddr(sc, macaddr)) { device_printf(sc->dev, "can't get mac\n"); return (ENXIO); } /* Reset */ reg = READ4(sc, BUS_MODE); reg |= (BUS_MODE_SWR); WRITE4(sc, BUS_MODE, reg); for (i = 0; i < MAC_RESET_TIMEOUT; i++) { if ((READ4(sc, BUS_MODE) & BUS_MODE_SWR) == 0) break; DELAY(10); } if (i >= MAC_RESET_TIMEOUT) { device_printf(sc->dev, "Can't reset DWC.\n"); return (ENXIO); } reg = READ4(sc, BUS_MODE); reg |= (BUS_MODE_EIGHTXPBL); reg |= (BUS_MODE_PBL_BEATS_8 << BUS_MODE_PBL_SHIFT); WRITE4(sc, BUS_MODE, reg); /* * DMA must be stop while changing descriptor list addresses. */ reg = READ4(sc, OPERATION_MODE); reg &= ~(MODE_ST | MODE_SR); WRITE4(sc, OPERATION_MODE, reg); if (setup_dma(sc)) return (ENXIO); /* Setup addresses */ WRITE4(sc, RX_DESCR_LIST_ADDR, sc->rxdesc_ring_paddr); WRITE4(sc, TX_DESCR_LIST_ADDR, sc->txdesc_ring_paddr); mtx_init(&sc->mtx, device_get_nameunit(sc->dev), MTX_NETWORK_LOCK, MTX_DEF); callout_init_mtx(&sc->dwc_callout, &sc->mtx, 0); /* Setup interrupt handler. */ error = bus_setup_intr(dev, sc->res[1], INTR_TYPE_NET | INTR_MPSAFE, NULL, dwc_intr, sc, &sc->intr_cookie); if (error != 0) { device_printf(dev, "could not setup interrupt handler.\n"); return (ENXIO); } /* Set up the ethernet interface. */ sc->ifp = ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_capabilities = IFCAP_VLAN_MTU; ifp->if_capenable = ifp->if_capabilities; ifp->if_start = dwc_txstart; ifp->if_ioctl = dwc_ioctl; ifp->if_init = dwc_init; IFQ_SET_MAXLEN(&ifp->if_snd, TX_DESC_COUNT - 1); ifp->if_snd.ifq_drv_maxlen = TX_DESC_COUNT - 1; IFQ_SET_READY(&ifp->if_snd); ifp->if_hdrlen = sizeof(struct ether_vlan_header); /* Attach the mii driver. */ error = mii_attach(dev, &sc->miibus, ifp, dwc_media_change, dwc_media_status, BMSR_DEFCAPMASK, MII_PHY_ANY, MII_OFFSET_ANY, 0); if (error != 0) { device_printf(dev, "PHY attach failed\n"); return (ENXIO); } sc->mii_softc = device_get_softc(sc->miibus); /* All ready to run, attach the ethernet interface. */ ether_ifattach(ifp, macaddr); sc->is_attached = true; return (0); } static int dwc_miibus_read_reg(device_t dev, int phy, int reg) { struct dwc_softc *sc; uint16_t mii; size_t cnt; int rv = 0; sc = device_get_softc(dev); mii = ((phy & GMII_ADDRESS_PA_MASK) << GMII_ADDRESS_PA_SHIFT) | ((reg & GMII_ADDRESS_GR_MASK) << GMII_ADDRESS_GR_SHIFT) | (sc->mii_clk << GMII_ADDRESS_CR_SHIFT) | GMII_ADDRESS_GB; /* Busy flag */ WRITE4(sc, GMII_ADDRESS, mii); for (cnt = 0; cnt < 1000; cnt++) { if (!(READ4(sc, GMII_ADDRESS) & GMII_ADDRESS_GB)) { rv = READ4(sc, GMII_DATA); break; } DELAY(10); } return rv; } static int dwc_miibus_write_reg(device_t dev, int phy, int reg, int val) { struct dwc_softc *sc; uint16_t mii; size_t cnt; sc = device_get_softc(dev); mii = ((phy & GMII_ADDRESS_PA_MASK) << GMII_ADDRESS_PA_SHIFT) | ((reg & GMII_ADDRESS_GR_MASK) << GMII_ADDRESS_GR_SHIFT) | (sc->mii_clk << GMII_ADDRESS_CR_SHIFT) | GMII_ADDRESS_GB | GMII_ADDRESS_GW; WRITE4(sc, GMII_DATA, val); WRITE4(sc, GMII_ADDRESS, mii); for (cnt = 0; cnt < 1000; cnt++) { if (!(READ4(sc, GMII_ADDRESS) & GMII_ADDRESS_GB)) { break; } DELAY(10); } return (0); } static void dwc_miibus_statchg(device_t dev) { struct dwc_softc *sc; struct mii_data *mii; uint32_t reg; /* * Called by the MII bus driver when the PHY establishes * link to set the MAC interface registers. */ sc = device_get_softc(dev); DWC_ASSERT_LOCKED(sc); mii = sc->mii_softc; if (mii->mii_media_status & IFM_ACTIVE) sc->link_is_up = true; else sc->link_is_up = false; reg = READ4(sc, MAC_CONFIGURATION); switch (IFM_SUBTYPE(mii->mii_media_active)) { case IFM_1000_T: case IFM_1000_SX: reg &= ~(CONF_FES | CONF_PS); break; case IFM_100_TX: reg |= (CONF_FES | CONF_PS); break; case IFM_10_T: reg &= ~(CONF_FES); reg |= (CONF_PS); break; case IFM_NONE: sc->link_is_up = false; return; default: sc->link_is_up = false; device_printf(dev, "Unsupported media %u\n", IFM_SUBTYPE(mii->mii_media_active)); return; } if ((IFM_OPTIONS(mii->mii_media_active) & IFM_FDX) != 0) reg |= (CONF_DM); else reg &= ~(CONF_DM); WRITE4(sc, MAC_CONFIGURATION, reg); } static device_method_t dwc_methods[] = { DEVMETHOD(device_probe, dwc_probe), DEVMETHOD(device_attach, dwc_attach), /* MII Interface */ DEVMETHOD(miibus_readreg, dwc_miibus_read_reg), DEVMETHOD(miibus_writereg, dwc_miibus_write_reg), DEVMETHOD(miibus_statchg, dwc_miibus_statchg), { 0, 0 } }; static driver_t dwc_driver = { "dwc", dwc_methods, sizeof(struct dwc_softc), }; static devclass_t dwc_devclass; DRIVER_MODULE(dwc, simplebus, dwc_driver, dwc_devclass, 0, 0); DRIVER_MODULE(miibus, dwc, miibus_driver, miibus_devclass, 0, 0); MODULE_DEPEND(dwc, ether, 1, 1, 1); MODULE_DEPEND(dwc, miibus, 1, 1, 1); Index: projects/iosched/sys/netinet/sctp.h =================================================================== --- projects/iosched/sys/netinet/sctp.h (revision 287721) +++ projects/iosched/sys/netinet/sctp.h (revision 287722) @@ -1,633 +1,637 @@ /*- * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef _NETINET_SCTP_H_ #define _NETINET_SCTP_H_ #include #define SCTP_PACKED __attribute__((packed)) /* * SCTP protocol - RFC4960. */ struct sctphdr { uint16_t src_port; /* source port */ uint16_t dest_port; /* destination port */ uint32_t v_tag; /* verification tag of packet */ uint32_t checksum; /* CRC32C checksum */ /* chunks follow... */ } SCTP_PACKED; /* * SCTP Chunks */ struct sctp_chunkhdr { uint8_t chunk_type; /* chunk type */ uint8_t chunk_flags; /* chunk flags */ uint16_t chunk_length; /* chunk length */ /* optional params follow */ } SCTP_PACKED; /* * SCTP chunk parameters */ struct sctp_paramhdr { uint16_t param_type; /* parameter type */ uint16_t param_length; /* parameter length */ } SCTP_PACKED; /* * user socket options: socket API defined */ /* * read-write options */ #define SCTP_RTOINFO 0x00000001 #define SCTP_ASSOCINFO 0x00000002 #define SCTP_INITMSG 0x00000003 #define SCTP_NODELAY 0x00000004 #define SCTP_AUTOCLOSE 0x00000005 #define SCTP_SET_PEER_PRIMARY_ADDR 0x00000006 #define SCTP_PRIMARY_ADDR 0x00000007 #define SCTP_ADAPTATION_LAYER 0x00000008 /* same as above */ #define SCTP_ADAPTION_LAYER 0x00000008 #define SCTP_DISABLE_FRAGMENTS 0x00000009 #define SCTP_PEER_ADDR_PARAMS 0x0000000a #define SCTP_DEFAULT_SEND_PARAM 0x0000000b /* ancillary data/notification interest options */ #define SCTP_EVENTS 0x0000000c /* deprecated */ /* Without this applied we will give V4 and V6 addresses on a V6 socket */ #define SCTP_I_WANT_MAPPED_V4_ADDR 0x0000000d #define SCTP_MAXSEG 0x0000000e #define SCTP_DELAYED_SACK 0x0000000f #define SCTP_FRAGMENT_INTERLEAVE 0x00000010 #define SCTP_PARTIAL_DELIVERY_POINT 0x00000011 /* authentication support */ #define SCTP_AUTH_CHUNK 0x00000012 #define SCTP_AUTH_KEY 0x00000013 #define SCTP_HMAC_IDENT 0x00000014 #define SCTP_AUTH_ACTIVE_KEY 0x00000015 #define SCTP_AUTH_DELETE_KEY 0x00000016 #define SCTP_USE_EXT_RCVINFO 0x00000017 #define SCTP_AUTO_ASCONF 0x00000018 /* rw */ #define SCTP_MAXBURST 0x00000019 /* rw */ #define SCTP_MAX_BURST 0x00000019 /* rw */ /* assoc level context */ #define SCTP_CONTEXT 0x0000001a /* rw */ /* explicit EOR signalling */ #define SCTP_EXPLICIT_EOR 0x0000001b #define SCTP_REUSE_PORT 0x0000001c /* rw */ #define SCTP_AUTH_DEACTIVATE_KEY 0x0000001d #define SCTP_EVENT 0x0000001e #define SCTP_RECVRCVINFO 0x0000001f #define SCTP_RECVNXTINFO 0x00000020 #define SCTP_DEFAULT_SNDINFO 0x00000021 #define SCTP_DEFAULT_PRINFO 0x00000022 #define SCTP_PEER_ADDR_THLDS 0x00000023 #define SCTP_REMOTE_UDP_ENCAPS_PORT 0x00000024 #define SCTP_ECN_SUPPORTED 0x00000025 #define SCTP_PR_SUPPORTED 0x00000026 #define SCTP_AUTH_SUPPORTED 0x00000027 #define SCTP_ASCONF_SUPPORTED 0x00000028 #define SCTP_RECONFIG_SUPPORTED 0x00000029 #define SCTP_NRSACK_SUPPORTED 0x00000030 #define SCTP_PKTDROP_SUPPORTED 0x00000031 #define SCTP_MAX_CWND 0x00000032 /* * read-only options */ #define SCTP_STATUS 0x00000100 #define SCTP_GET_PEER_ADDR_INFO 0x00000101 /* authentication support */ #define SCTP_PEER_AUTH_CHUNKS 0x00000102 #define SCTP_LOCAL_AUTH_CHUNKS 0x00000103 #define SCTP_GET_ASSOC_NUMBER 0x00000104 /* ro */ #define SCTP_GET_ASSOC_ID_LIST 0x00000105 /* ro */ #define SCTP_TIMEOUTS 0x00000106 #define SCTP_PR_STREAM_STATUS 0x00000107 #define SCTP_PR_ASSOC_STATUS 0x00000108 /* * user socket options: BSD implementation specific */ /* * Blocking I/O is enabled on any TCP type socket by default. For the UDP * model if this is turned on then the socket buffer is shared for send * resources amongst all associations. The default for the UDP model is that * is SS_NBIO is set. Which means all associations have a separate send * limit BUT they will NOT ever BLOCK instead you will get an error back * EAGAIN if you try to send too much. If you want the blocking semantics you * set this option at the cost of sharing one socket send buffer size amongst * all associations. Peeled off sockets turn this option off and block. But * since both TCP and peeled off sockets have only one assoc per socket this * is fine. It probably does NOT make sense to set this on SS_NBIO on a TCP * model OR peeled off UDP model, but we do allow you to do so. You just use * the normal syscall to toggle SS_NBIO the way you want. * * Blocking I/O is controlled by the SS_NBIO flag on the socket state so_state * field. */ #define SCTP_ENABLE_STREAM_RESET 0x00000900 /* struct * sctp_assoc_value */ #define SCTP_RESET_STREAMS 0x00000901 /* struct * sctp_reset_streams */ #define SCTP_RESET_ASSOC 0x00000902 /* sctp_assoc_t */ #define SCTP_ADD_STREAMS 0x00000903 /* struct * sctp_add_streams */ /* For enable stream reset */ #define SCTP_ENABLE_RESET_STREAM_REQ 0x00000001 #define SCTP_ENABLE_RESET_ASSOC_REQ 0x00000002 #define SCTP_ENABLE_CHANGE_ASSOC_REQ 0x00000004 #define SCTP_ENABLE_VALUE_MASK 0x00000007 /* For reset streams */ #define SCTP_STREAM_RESET_INCOMING 0x00000001 #define SCTP_STREAM_RESET_OUTGOING 0x00000002 /* here on down are more implementation specific */ #define SCTP_SET_DEBUG_LEVEL 0x00001005 #define SCTP_CLR_STAT_LOG 0x00001007 /* CMT ON/OFF socket option */ #define SCTP_CMT_ON_OFF 0x00001200 #define SCTP_CMT_USE_DAC 0x00001201 /* JRS - Pluggable Congestion Control Socket option */ #define SCTP_PLUGGABLE_CC 0x00001202 /* RS - Pluggable Stream Scheduling Socket option */ #define SCTP_PLUGGABLE_SS 0x00001203 #define SCTP_SS_VALUE 0x00001204 #define SCTP_CC_OPTION 0x00001205 /* Options for CC * modules */ /* read only */ #define SCTP_GET_SNDBUF_USE 0x00001101 #define SCTP_GET_STAT_LOG 0x00001103 #define SCTP_PCB_STATUS 0x00001104 #define SCTP_GET_NONCE_VALUES 0x00001105 /* Special hook for dynamically setting primary for all assoc's, * this is a write only option that requires root privilege. */ #define SCTP_SET_DYNAMIC_PRIMARY 0x00002001 /* VRF (virtual router feature) and multi-VRF support * options. VRF's provide splits within a router * that give the views of multiple routers. A * standard host, without VRF support, is just * a single VRF. If VRF's are supported then * the transport must be VRF aware. This means * that every socket call coming in must be directed * within the endpoint to one of the VRF's it belongs * to. The endpoint, before binding, may select * the "default" VRF it is in by using a set socket * option with SCTP_VRF_ID. This will also * get propagated to the default VRF. Once the * endpoint binds an address then it CANNOT add * additional VRF's to become a Multi-VRF endpoint. * * Before BINDING additional VRF's can be added with * the SCTP_ADD_VRF_ID call or deleted with * SCTP_DEL_VRF_ID. * * Associations are ALWAYS contained inside a single * VRF. They cannot reside in two (or more) VRF's. Incoming * packets, assuming the router is VRF aware, can always * tell us what VRF they arrived on. A host not supporting * any VRF's will find that the packets always arrived on the * single VRF that the host has. * */ #define SCTP_VRF_ID 0x00003001 #define SCTP_ADD_VRF_ID 0x00003002 #define SCTP_GET_VRF_IDS 0x00003003 #define SCTP_GET_ASOC_VRF 0x00003004 #define SCTP_DEL_VRF_ID 0x00003005 /* * If you enable packet logging you can get * a poor mans ethereal output in binary * form. Note this is a compile option to * the kernel, SCTP_PACKET_LOGGING, and * without it in your kernel you * will get a EOPNOTSUPP */ #define SCTP_GET_PACKET_LOG 0x00004001 /* * hidden implementation specific options these are NOT user visible (should * move out of sctp.h) */ /* sctp_bindx() flags as hidden socket options */ #define SCTP_BINDX_ADD_ADDR 0x00008001 #define SCTP_BINDX_REM_ADDR 0x00008002 /* Hidden socket option that gets the addresses */ #define SCTP_GET_PEER_ADDRESSES 0x00008003 #define SCTP_GET_LOCAL_ADDRESSES 0x00008004 /* return the total count in bytes needed to hold all local addresses bound */ #define SCTP_GET_LOCAL_ADDR_SIZE 0x00008005 /* Return the total count in bytes needed to hold the remote address */ #define SCTP_GET_REMOTE_ADDR_SIZE 0x00008006 /* hidden option for connectx */ #define SCTP_CONNECT_X 0x00008007 /* hidden option for connectx_delayed, part of sendx */ #define SCTP_CONNECT_X_DELAYED 0x00008008 #define SCTP_CONNECT_X_COMPLETE 0x00008009 /* hidden socket option based sctp_peeloff */ #define SCTP_PEELOFF 0x0000800a /* the real worker for sctp_getaddrlen() */ #define SCTP_GET_ADDR_LEN 0x0000800b /* Debug things that need to be purged */ #define SCTP_SET_INITIAL_DBG_SEQ 0x00009f00 /* JRS - Supported congestion control modules for pluggable * congestion control */ /* Standard TCP Congestion Control */ #define SCTP_CC_RFC2581 0x00000000 /* High Speed TCP Congestion Control (Floyd) */ #define SCTP_CC_HSTCP 0x00000001 /* HTCP Congestion Control */ #define SCTP_CC_HTCP 0x00000002 /* RTCC Congestion Control - RFC2581 plus */ #define SCTP_CC_RTCC 0x00000003 #define SCTP_CC_OPT_RTCC_SETMODE 0x00002000 #define SCTP_CC_OPT_USE_DCCC_ECN 0x00002001 #define SCTP_CC_OPT_STEADY_STEP 0x00002002 #define SCTP_CMT_OFF 0 #define SCTP_CMT_BASE 1 #define SCTP_CMT_RPV1 2 #define SCTP_CMT_RPV2 3 #define SCTP_CMT_MPTCP 4 #define SCTP_CMT_MAX SCTP_CMT_MPTCP /* RS - Supported stream scheduling modules for pluggable * stream scheduling */ /* Default simple round-robin */ #define SCTP_SS_DEFAULT 0x00000000 /* Real round-robin */ #define SCTP_SS_ROUND_ROBIN 0x00000001 /* Real round-robin per packet */ #define SCTP_SS_ROUND_ROBIN_PACKET 0x00000002 /* Priority */ #define SCTP_SS_PRIORITY 0x00000003 /* Fair Bandwidth */ #define SCTP_SS_FAIR_BANDWITH 0x00000004 /* First-come, first-serve */ #define SCTP_SS_FIRST_COME 0x00000005 /* fragment interleave constants * setting must be one of these or * EINVAL returned. */ #define SCTP_FRAG_LEVEL_0 0x00000000 #define SCTP_FRAG_LEVEL_1 0x00000001 #define SCTP_FRAG_LEVEL_2 0x00000002 /* * user state values */ #define SCTP_CLOSED 0x0000 #define SCTP_BOUND 0x1000 #define SCTP_LISTEN 0x2000 #define SCTP_COOKIE_WAIT 0x0002 #define SCTP_COOKIE_ECHOED 0x0004 #define SCTP_ESTABLISHED 0x0008 #define SCTP_SHUTDOWN_SENT 0x0010 #define SCTP_SHUTDOWN_RECEIVED 0x0020 #define SCTP_SHUTDOWN_ACK_SENT 0x0040 #define SCTP_SHUTDOWN_PENDING 0x0080 /* * SCTP operational error codes (user visible) */ #define SCTP_CAUSE_NO_ERROR 0x0000 #define SCTP_CAUSE_INVALID_STREAM 0x0001 #define SCTP_CAUSE_MISSING_PARAM 0x0002 #define SCTP_CAUSE_STALE_COOKIE 0x0003 #define SCTP_CAUSE_OUT_OF_RESC 0x0004 #define SCTP_CAUSE_UNRESOLVABLE_ADDR 0x0005 #define SCTP_CAUSE_UNRECOG_CHUNK 0x0006 #define SCTP_CAUSE_INVALID_PARAM 0x0007 #define SCTP_CAUSE_UNRECOG_PARAM 0x0008 #define SCTP_CAUSE_NO_USER_DATA 0x0009 #define SCTP_CAUSE_COOKIE_IN_SHUTDOWN 0x000a #define SCTP_CAUSE_RESTART_W_NEWADDR 0x000b #define SCTP_CAUSE_USER_INITIATED_ABT 0x000c #define SCTP_CAUSE_PROTOCOL_VIOLATION 0x000d /* Error causes from RFC5061 */ #define SCTP_CAUSE_DELETING_LAST_ADDR 0x00a0 #define SCTP_CAUSE_RESOURCE_SHORTAGE 0x00a1 #define SCTP_CAUSE_DELETING_SRC_ADDR 0x00a2 #define SCTP_CAUSE_ILLEGAL_ASCONF_ACK 0x00a3 #define SCTP_CAUSE_REQUEST_REFUSED 0x00a4 /* Error causes from nat-draft */ #define SCTP_CAUSE_NAT_COLLIDING_STATE 0x00b0 #define SCTP_CAUSE_NAT_MISSING_STATE 0x00b1 /* Error causes from RFC4895 */ #define SCTP_CAUSE_UNSUPPORTED_HMACID 0x0105 /* * error cause parameters (user visible) */ struct sctp_gen_error_cause { uint16_t code; uint16_t length; uint8_t info[]; } SCTP_PACKED; struct sctp_error_cause { uint16_t code; uint16_t length; /* optional cause-specific info may follow */ } SCTP_PACKED; struct sctp_error_invalid_stream { - struct sctp_error_cause cause; /* code=SCTP_ERROR_INVALID_STREAM */ + struct sctp_error_cause cause; /* code=SCTP_CAUSE_INVALID_STREAM */ uint16_t stream_id; /* stream id of the DATA in error */ uint16_t reserved; } SCTP_PACKED; struct sctp_error_missing_param { - struct sctp_error_cause cause; /* code=SCTP_ERROR_MISSING_PARAM */ + struct sctp_error_cause cause; /* code=SCTP_CAUSE_MISSING_PARAM */ uint32_t num_missing_params; /* number of missing parameters */ - /* uint16_t param_type's follow */ + uint16_t type[]; } SCTP_PACKED; struct sctp_error_stale_cookie { - struct sctp_error_cause cause; /* code=SCTP_ERROR_STALE_COOKIE */ + struct sctp_error_cause cause; /* code=SCTP_CAUSE_STALE_COOKIE */ uint32_t stale_time; /* time in usec of staleness */ } SCTP_PACKED; struct sctp_error_out_of_resource { - struct sctp_error_cause cause; /* code=SCTP_ERROR_OUT_OF_RESOURCES */ + struct sctp_error_cause cause; /* code=SCTP_CAUSE_OUT_OF_RESOURCES */ } SCTP_PACKED; struct sctp_error_unresolv_addr { - struct sctp_error_cause cause; /* code=SCTP_ERROR_UNRESOLVABLE_ADDR */ - + struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNRESOLVABLE_ADDR */ } SCTP_PACKED; struct sctp_error_unrecognized_chunk { - struct sctp_error_cause cause; /* code=SCTP_ERROR_UNRECOG_CHUNK */ + struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNRECOG_CHUNK */ struct sctp_chunkhdr ch;/* header from chunk in error */ } SCTP_PACKED; struct sctp_error_no_user_data { struct sctp_error_cause cause; /* code=SCTP_CAUSE_NO_USER_DATA */ uint32_t tsn; /* TSN of the empty data chunk */ } SCTP_PACKED; + +struct sctp_error_auth_invalid_hmac { + struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNSUPPORTED_HMACID */ + uint16_t hmac_id; +} SCTP_PACKED; /* * Main SCTP chunk types we place these here so natd and f/w's in user land * can find them. */ /************0x00 series ***********/ #define SCTP_DATA 0x00 #define SCTP_INITIATION 0x01 #define SCTP_INITIATION_ACK 0x02 #define SCTP_SELECTIVE_ACK 0x03 #define SCTP_HEARTBEAT_REQUEST 0x04 #define SCTP_HEARTBEAT_ACK 0x05 #define SCTP_ABORT_ASSOCIATION 0x06 #define SCTP_SHUTDOWN 0x07 #define SCTP_SHUTDOWN_ACK 0x08 #define SCTP_OPERATION_ERROR 0x09 #define SCTP_COOKIE_ECHO 0x0a #define SCTP_COOKIE_ACK 0x0b #define SCTP_ECN_ECHO 0x0c #define SCTP_ECN_CWR 0x0d #define SCTP_SHUTDOWN_COMPLETE 0x0e /* RFC4895 */ #define SCTP_AUTHENTICATION 0x0f /* EY nr_sack chunk id*/ #define SCTP_NR_SELECTIVE_ACK 0x10 /************0x40 series ***********/ /************0x80 series ***********/ /* RFC5061 */ #define SCTP_ASCONF_ACK 0x80 /* draft-ietf-stewart-pktdrpsctp */ #define SCTP_PACKET_DROPPED 0x81 /* draft-ietf-stewart-strreset-xxx */ #define SCTP_STREAM_RESET 0x82 /* RFC4820 */ #define SCTP_PAD_CHUNK 0x84 /************0xc0 series ***********/ /* RFC3758 */ #define SCTP_FORWARD_CUM_TSN 0xc0 /* RFC5061 */ #define SCTP_ASCONF 0xc1 /* ABORT and SHUTDOWN COMPLETE FLAG */ #define SCTP_HAD_NO_TCB 0x01 /* Packet dropped flags */ #define SCTP_FROM_MIDDLE_BOX SCTP_HAD_NO_TCB #define SCTP_BADCRC 0x02 #define SCTP_PACKET_TRUNCATED 0x04 /* Flag for ECN -CWR */ #define SCTP_CWR_REDUCE_OVERRIDE 0x01 #define SCTP_CWR_IN_SAME_WINDOW 0x02 #define SCTP_SAT_NETWORK_MIN 400 /* min ms for RTT to set satellite * time */ #define SCTP_SAT_NETWORK_BURST_INCR 2 /* how many times to multiply maxburst * in sat */ /* Data Chuck Specific Flags */ #define SCTP_DATA_FRAG_MASK 0x03 #define SCTP_DATA_MIDDLE_FRAG 0x00 #define SCTP_DATA_LAST_FRAG 0x01 #define SCTP_DATA_FIRST_FRAG 0x02 #define SCTP_DATA_NOT_FRAG 0x03 #define SCTP_DATA_UNORDERED 0x04 #define SCTP_DATA_SACK_IMMEDIATELY 0x08 /* ECN Nonce: SACK Chunk Specific Flags */ #define SCTP_SACK_NONCE_SUM 0x01 /* CMT DAC algorithm SACK flag */ #define SCTP_SACK_CMT_DAC 0x80 /* * PCB flags (in sctp_flags bitmask). * Note the features and flags are meant * for use by netstat. */ #define SCTP_PCB_FLAGS_UDPTYPE 0x00000001 #define SCTP_PCB_FLAGS_TCPTYPE 0x00000002 #define SCTP_PCB_FLAGS_BOUNDALL 0x00000004 #define SCTP_PCB_FLAGS_ACCEPTING 0x00000008 #define SCTP_PCB_FLAGS_UNBOUND 0x00000010 #define SCTP_PCB_FLAGS_CLOSE_IP 0x00040000 #define SCTP_PCB_FLAGS_WAS_CONNECTED 0x00080000 #define SCTP_PCB_FLAGS_WAS_ABORTED 0x00100000 /* TCP model support */ #define SCTP_PCB_FLAGS_CONNECTED 0x00200000 #define SCTP_PCB_FLAGS_IN_TCPPOOL 0x00400000 #define SCTP_PCB_FLAGS_DONT_WAKE 0x00800000 #define SCTP_PCB_FLAGS_WAKEOUTPUT 0x01000000 #define SCTP_PCB_FLAGS_WAKEINPUT 0x02000000 #define SCTP_PCB_FLAGS_BOUND_V6 0x04000000 #define SCTP_PCB_FLAGS_BLOCKING_IO 0x08000000 #define SCTP_PCB_FLAGS_SOCKET_GONE 0x10000000 #define SCTP_PCB_FLAGS_SOCKET_ALLGONE 0x20000000 #define SCTP_PCB_FLAGS_SOCKET_CANT_READ 0x40000000 /* flags to copy to new PCB */ #define SCTP_PCB_COPY_FLAGS (SCTP_PCB_FLAGS_BOUNDALL|\ SCTP_PCB_FLAGS_WAKEINPUT|\ SCTP_PCB_FLAGS_BOUND_V6) /* * PCB Features (in sctp_features bitmask) */ #define SCTP_PCB_FLAGS_DO_NOT_PMTUD 0x0000000000000001 #define SCTP_PCB_FLAGS_EXT_RCVINFO 0x0000000000000002 /* deprecated */ #define SCTP_PCB_FLAGS_DONOT_HEARTBEAT 0x0000000000000004 #define SCTP_PCB_FLAGS_FRAG_INTERLEAVE 0x0000000000000008 #define SCTP_PCB_FLAGS_INTERLEAVE_STRMS 0x0000000000000010 #define SCTP_PCB_FLAGS_DO_ASCONF 0x0000000000000020 #define SCTP_PCB_FLAGS_AUTO_ASCONF 0x0000000000000040 #define SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE 0x0000000000000080 /* socket options */ #define SCTP_PCB_FLAGS_NODELAY 0x0000000000000100 #define SCTP_PCB_FLAGS_AUTOCLOSE 0x0000000000000200 #define SCTP_PCB_FLAGS_RECVDATAIOEVNT 0x0000000000000400 /* deprecated */ #define SCTP_PCB_FLAGS_RECVASSOCEVNT 0x0000000000000800 #define SCTP_PCB_FLAGS_RECVPADDREVNT 0x0000000000001000 #define SCTP_PCB_FLAGS_RECVPEERERR 0x0000000000002000 #define SCTP_PCB_FLAGS_RECVSENDFAILEVNT 0x0000000000004000 /* deprecated */ #define SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT 0x0000000000008000 #define SCTP_PCB_FLAGS_ADAPTATIONEVNT 0x0000000000010000 #define SCTP_PCB_FLAGS_PDAPIEVNT 0x0000000000020000 #define SCTP_PCB_FLAGS_AUTHEVNT 0x0000000000040000 #define SCTP_PCB_FLAGS_STREAM_RESETEVNT 0x0000000000080000 #define SCTP_PCB_FLAGS_NO_FRAGMENT 0x0000000000100000 #define SCTP_PCB_FLAGS_EXPLICIT_EOR 0x0000000000400000 #define SCTP_PCB_FLAGS_NEEDS_MAPPED_V4 0x0000000000800000 #define SCTP_PCB_FLAGS_MULTIPLE_ASCONFS 0x0000000001000000 #define SCTP_PCB_FLAGS_PORTREUSE 0x0000000002000000 #define SCTP_PCB_FLAGS_DRYEVNT 0x0000000004000000 #define SCTP_PCB_FLAGS_RECVRCVINFO 0x0000000008000000 #define SCTP_PCB_FLAGS_RECVNXTINFO 0x0000000010000000 #define SCTP_PCB_FLAGS_ASSOC_RESETEVNT 0x0000000020000000 #define SCTP_PCB_FLAGS_STREAM_CHANGEEVNT 0x0000000040000000 #define SCTP_PCB_FLAGS_RECVNSENDFAILEVNT 0x0000000080000000 /*- * mobility_features parameters (by micchie).Note * these features are applied against the * sctp_mobility_features flags.. not the sctp_features * flags. */ #define SCTP_MOBILITY_BASE 0x00000001 #define SCTP_MOBILITY_FASTHANDOFF 0x00000002 #define SCTP_MOBILITY_PRIM_DELETED 0x00000004 #define SCTP_SMALLEST_PMTU 512 /* smallest pmtu allowed when disabling PMTU * discovery */ #undef SCTP_PACKED #include /* This dictates the size of the packet * collection buffer. This only applies * if SCTP_PACKET_LOGGING is enabled in * your config. */ #define SCTP_PACKET_LOG_SIZE 65536 /* Maximum delays and such a user can set for options that * take ms. */ #define SCTP_MAX_SACK_DELAY 500 /* per RFC4960 */ #define SCTP_MAX_HB_INTERVAL 14400000 /* 4 hours in ms */ #define SCTP_MAX_COOKIE_LIFE 3600000 /* 1 hour in ms */ /* Types of logging/KTR tracing that can be enabled via the * sysctl net.inet.sctp.sctp_logging. You must also enable * SUBSYS tracing. * Note that you must have the SCTP option in the kernel * to enable these as well. */ #define SCTP_BLK_LOGGING_ENABLE 0x00000001 #define SCTP_CWND_MONITOR_ENABLE 0x00000002 #define SCTP_CWND_LOGGING_ENABLE 0x00000004 #define SCTP_FLIGHT_LOGGING_ENABLE 0x00000020 #define SCTP_FR_LOGGING_ENABLE 0x00000040 #define SCTP_LOCK_LOGGING_ENABLE 0x00000080 #define SCTP_MAP_LOGGING_ENABLE 0x00000100 #define SCTP_MBCNT_LOGGING_ENABLE 0x00000200 #define SCTP_MBUF_LOGGING_ENABLE 0x00000400 #define SCTP_NAGLE_LOGGING_ENABLE 0x00000800 #define SCTP_RECV_RWND_LOGGING_ENABLE 0x00001000 #define SCTP_RTTVAR_LOGGING_ENABLE 0x00002000 #define SCTP_SACK_LOGGING_ENABLE 0x00004000 #define SCTP_SACK_RWND_LOGGING_ENABLE 0x00008000 #define SCTP_SB_LOGGING_ENABLE 0x00010000 #define SCTP_STR_LOGGING_ENABLE 0x00020000 #define SCTP_WAKE_LOGGING_ENABLE 0x00040000 #define SCTP_LOG_MAXBURST_ENABLE 0x00080000 #define SCTP_LOG_RWND_ENABLE 0x00100000 #define SCTP_LOG_SACK_ARRIVALS_ENABLE 0x00200000 #define SCTP_LTRACE_CHUNK_ENABLE 0x00400000 #define SCTP_LTRACE_ERROR_ENABLE 0x00800000 #define SCTP_LAST_PACKET_TRACING 0x01000000 #define SCTP_THRESHOLD_LOGGING 0x02000000 #define SCTP_LOG_AT_SEND_2_SCTP 0x04000000 #define SCTP_LOG_AT_SEND_2_OUTQ 0x08000000 #define SCTP_LOG_TRY_ADVANCE 0x10000000 #endif /* !_NETINET_SCTP_H_ */ Index: projects/iosched/sys/netinet/sctp_auth.c =================================================================== --- projects/iosched/sys/netinet/sctp_auth.c (revision 287721) +++ projects/iosched/sys/netinet/sctp_auth.c (revision 287722) @@ -1,2026 +1,2025 @@ /*- * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #ifdef SCTP_DEBUG #define SCTP_AUTH_DEBUG (SCTP_BASE_SYSCTL(sctp_debug_on) & SCTP_DEBUG_AUTH1) #define SCTP_AUTH_DEBUG2 (SCTP_BASE_SYSCTL(sctp_debug_on) & SCTP_DEBUG_AUTH2) #endif /* SCTP_DEBUG */ void sctp_clear_chunklist(sctp_auth_chklist_t * chklist) { bzero(chklist, sizeof(*chklist)); /* chklist->num_chunks = 0; */ } sctp_auth_chklist_t * sctp_alloc_chunklist(void) { sctp_auth_chklist_t *chklist; SCTP_MALLOC(chklist, sctp_auth_chklist_t *, sizeof(*chklist), SCTP_M_AUTH_CL); if (chklist == NULL) { SCTPDBG(SCTP_DEBUG_AUTH1, "sctp_alloc_chunklist: failed to get memory!\n"); } else { sctp_clear_chunklist(chklist); } return (chklist); } void sctp_free_chunklist(sctp_auth_chklist_t * list) { if (list != NULL) SCTP_FREE(list, SCTP_M_AUTH_CL); } sctp_auth_chklist_t * sctp_copy_chunklist(sctp_auth_chklist_t * list) { sctp_auth_chklist_t *new_list; if (list == NULL) return (NULL); /* get a new list */ new_list = sctp_alloc_chunklist(); if (new_list == NULL) return (NULL); /* copy it */ bcopy(list, new_list, sizeof(*new_list)); return (new_list); } /* * add a chunk to the required chunks list */ int sctp_auth_add_chunk(uint8_t chunk, sctp_auth_chklist_t * list) { if (list == NULL) return (-1); /* is chunk restricted? */ if ((chunk == SCTP_INITIATION) || (chunk == SCTP_INITIATION_ACK) || (chunk == SCTP_SHUTDOWN_COMPLETE) || (chunk == SCTP_AUTHENTICATION)) { return (-1); } if (list->chunks[chunk] == 0) { list->chunks[chunk] = 1; list->num_chunks++; SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: added chunk %u (0x%02x) to Auth list\n", chunk, chunk); } return (0); } /* * delete a chunk from the required chunks list */ int sctp_auth_delete_chunk(uint8_t chunk, sctp_auth_chklist_t * list) { if (list == NULL) return (-1); if (list->chunks[chunk] == 1) { list->chunks[chunk] = 0; list->num_chunks--; SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: deleted chunk %u (0x%02x) from Auth list\n", chunk, chunk); } return (0); } size_t sctp_auth_get_chklist_size(const sctp_auth_chklist_t * list) { if (list == NULL) return (0); else return (list->num_chunks); } /* * return the current number and list of required chunks caller must * guarantee ptr has space for up to 256 bytes */ int sctp_serialize_auth_chunks(const sctp_auth_chklist_t * list, uint8_t * ptr) { int i, count = 0; if (list == NULL) return (0); for (i = 0; i < 256; i++) { if (list->chunks[i] != 0) { *ptr++ = i; count++; } } return (count); } int sctp_pack_auth_chunks(const sctp_auth_chklist_t * list, uint8_t * ptr) { int i, size = 0; if (list == NULL) return (0); if (list->num_chunks <= 32) { /* just list them, one byte each */ for (i = 0; i < 256; i++) { if (list->chunks[i] != 0) { *ptr++ = i; size++; } } } else { int index, offset; /* pack into a 32 byte bitfield */ for (i = 0; i < 256; i++) { if (list->chunks[i] != 0) { index = i / 8; offset = i % 8; ptr[index] |= (1 << offset); } } size = 32; } return (size); } int sctp_unpack_auth_chunks(const uint8_t * ptr, uint8_t num_chunks, sctp_auth_chklist_t * list) { int i; int size; if (list == NULL) return (0); if (num_chunks <= 32) { /* just pull them, one byte each */ for (i = 0; i < num_chunks; i++) { (void)sctp_auth_add_chunk(*ptr++, list); } size = num_chunks; } else { int index, offset; /* unpack from a 32 byte bitfield */ for (index = 0; index < 32; index++) { for (offset = 0; offset < 8; offset++) { if (ptr[index] & (1 << offset)) { (void)sctp_auth_add_chunk((index * 8) + offset, list); } } } size = 32; } return (size); } /* * allocate structure space for a key of length keylen */ sctp_key_t * sctp_alloc_key(uint32_t keylen) { sctp_key_t *new_key; SCTP_MALLOC(new_key, sctp_key_t *, sizeof(*new_key) + keylen, SCTP_M_AUTH_KY); if (new_key == NULL) { /* out of memory */ return (NULL); } new_key->keylen = keylen; return (new_key); } void sctp_free_key(sctp_key_t * key) { if (key != NULL) SCTP_FREE(key, SCTP_M_AUTH_KY); } void sctp_print_key(sctp_key_t * key, const char *str) { uint32_t i; if (key == NULL) { SCTP_PRINTF("%s: [Null key]\n", str); return; } SCTP_PRINTF("%s: len %u, ", str, key->keylen); if (key->keylen) { for (i = 0; i < key->keylen; i++) SCTP_PRINTF("%02x", key->key[i]); SCTP_PRINTF("\n"); } else { SCTP_PRINTF("[Null key]\n"); } } void sctp_show_key(sctp_key_t * key, const char *str) { uint32_t i; if (key == NULL) { SCTP_PRINTF("%s: [Null key]\n", str); return; } SCTP_PRINTF("%s: len %u, ", str, key->keylen); if (key->keylen) { for (i = 0; i < key->keylen; i++) SCTP_PRINTF("%02x", key->key[i]); SCTP_PRINTF("\n"); } else { SCTP_PRINTF("[Null key]\n"); } } static uint32_t sctp_get_keylen(sctp_key_t * key) { if (key != NULL) return (key->keylen); else return (0); } /* * generate a new random key of length 'keylen' */ sctp_key_t * sctp_generate_random_key(uint32_t keylen) { sctp_key_t *new_key; new_key = sctp_alloc_key(keylen); if (new_key == NULL) { /* out of memory */ return (NULL); } SCTP_READ_RANDOM(new_key->key, keylen); new_key->keylen = keylen; return (new_key); } sctp_key_t * sctp_set_key(uint8_t * key, uint32_t keylen) { sctp_key_t *new_key; new_key = sctp_alloc_key(keylen); if (new_key == NULL) { /* out of memory */ return (NULL); } bcopy(key, new_key->key, keylen); return (new_key); } /*- * given two keys of variable size, compute which key is "larger/smaller" * returns: 1 if key1 > key2 * -1 if key1 < key2 * 0 if key1 = key2 */ static int sctp_compare_key(sctp_key_t * key1, sctp_key_t * key2) { uint32_t maxlen; uint32_t i; uint32_t key1len, key2len; uint8_t *key_1, *key_2; uint8_t val1, val2; /* sanity/length check */ key1len = sctp_get_keylen(key1); key2len = sctp_get_keylen(key2); if ((key1len == 0) && (key2len == 0)) return (0); else if (key1len == 0) return (-1); else if (key2len == 0) return (1); if (key1len < key2len) { maxlen = key2len; } else { maxlen = key1len; } key_1 = key1->key; key_2 = key2->key; /* check for numeric equality */ for (i = 0; i < maxlen; i++) { /* left-pad with zeros */ val1 = (i < (maxlen - key1len)) ? 0 : *(key_1++); val2 = (i < (maxlen - key2len)) ? 0 : *(key_2++); if (val1 > val2) { return (1); } else if (val1 < val2) { return (-1); } } /* keys are equal value, so check lengths */ if (key1len == key2len) return (0); else if (key1len < key2len) return (-1); else return (1); } /* * generate the concatenated keying material based on the two keys and the * shared key (if available). draft-ietf-tsvwg-auth specifies the specific * order for concatenation */ sctp_key_t * sctp_compute_hashkey(sctp_key_t * key1, sctp_key_t * key2, sctp_key_t * shared) { uint32_t keylen; sctp_key_t *new_key; uint8_t *key_ptr; keylen = sctp_get_keylen(key1) + sctp_get_keylen(key2) + sctp_get_keylen(shared); if (keylen > 0) { /* get space for the new key */ new_key = sctp_alloc_key(keylen); if (new_key == NULL) { /* out of memory */ return (NULL); } new_key->keylen = keylen; key_ptr = new_key->key; } else { /* all keys empty/null?! */ return (NULL); } /* concatenate the keys */ if (sctp_compare_key(key1, key2) <= 0) { /* key is shared + key1 + key2 */ if (sctp_get_keylen(shared)) { bcopy(shared->key, key_ptr, shared->keylen); key_ptr += shared->keylen; } if (sctp_get_keylen(key1)) { bcopy(key1->key, key_ptr, key1->keylen); key_ptr += key1->keylen; } if (sctp_get_keylen(key2)) { bcopy(key2->key, key_ptr, key2->keylen); } } else { /* key is shared + key2 + key1 */ if (sctp_get_keylen(shared)) { bcopy(shared->key, key_ptr, shared->keylen); key_ptr += shared->keylen; } if (sctp_get_keylen(key2)) { bcopy(key2->key, key_ptr, key2->keylen); key_ptr += key2->keylen; } if (sctp_get_keylen(key1)) { bcopy(key1->key, key_ptr, key1->keylen); } } return (new_key); } sctp_sharedkey_t * sctp_alloc_sharedkey(void) { sctp_sharedkey_t *new_key; SCTP_MALLOC(new_key, sctp_sharedkey_t *, sizeof(*new_key), SCTP_M_AUTH_KY); if (new_key == NULL) { /* out of memory */ return (NULL); } new_key->keyid = 0; new_key->key = NULL; new_key->refcount = 1; new_key->deactivated = 0; return (new_key); } void sctp_free_sharedkey(sctp_sharedkey_t * skey) { if (skey == NULL) return; if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&skey->refcount)) { if (skey->key != NULL) sctp_free_key(skey->key); SCTP_FREE(skey, SCTP_M_AUTH_KY); } } sctp_sharedkey_t * sctp_find_sharedkey(struct sctp_keyhead *shared_keys, uint16_t key_id) { sctp_sharedkey_t *skey; LIST_FOREACH(skey, shared_keys, next) { if (skey->keyid == key_id) return (skey); } return (NULL); } int sctp_insert_sharedkey(struct sctp_keyhead *shared_keys, sctp_sharedkey_t * new_skey) { sctp_sharedkey_t *skey; if ((shared_keys == NULL) || (new_skey == NULL)) return (EINVAL); /* insert into an empty list? */ if (LIST_EMPTY(shared_keys)) { LIST_INSERT_HEAD(shared_keys, new_skey, next); return (0); } /* insert into the existing list, ordered by key id */ LIST_FOREACH(skey, shared_keys, next) { if (new_skey->keyid < skey->keyid) { /* insert it before here */ LIST_INSERT_BEFORE(skey, new_skey, next); return (0); } else if (new_skey->keyid == skey->keyid) { /* replace the existing key */ /* verify this key *can* be replaced */ if ((skey->deactivated) && (skey->refcount > 1)) { SCTPDBG(SCTP_DEBUG_AUTH1, "can't replace shared key id %u\n", new_skey->keyid); return (EBUSY); } SCTPDBG(SCTP_DEBUG_AUTH1, "replacing shared key id %u\n", new_skey->keyid); LIST_INSERT_BEFORE(skey, new_skey, next); LIST_REMOVE(skey, next); sctp_free_sharedkey(skey); return (0); } if (LIST_NEXT(skey, next) == NULL) { /* belongs at the end of the list */ LIST_INSERT_AFTER(skey, new_skey, next); return (0); } } /* shouldn't reach here */ return (0); } void sctp_auth_key_acquire(struct sctp_tcb *stcb, uint16_t key_id) { sctp_sharedkey_t *skey; /* find the shared key */ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, key_id); /* bump the ref count */ if (skey) { atomic_add_int(&skey->refcount, 1); SCTPDBG(SCTP_DEBUG_AUTH2, "%s: stcb %p key %u refcount acquire to %d\n", __FUNCTION__, (void *)stcb, key_id, skey->refcount); } } void sctp_auth_key_release(struct sctp_tcb *stcb, uint16_t key_id, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { sctp_sharedkey_t *skey; /* find the shared key */ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, key_id); /* decrement the ref count */ if (skey) { SCTPDBG(SCTP_DEBUG_AUTH2, "%s: stcb %p key %u refcount release to %d\n", __FUNCTION__, (void *)stcb, key_id, skey->refcount); /* see if a notification should be generated */ if ((skey->refcount <= 2) && (skey->deactivated)) { /* notify ULP that key is no longer used */ sctp_ulp_notify(SCTP_NOTIFY_AUTH_FREE_KEY, stcb, key_id, 0, so_locked); SCTPDBG(SCTP_DEBUG_AUTH2, "%s: stcb %p key %u no longer used, %d\n", __FUNCTION__, (void *)stcb, key_id, skey->refcount); } sctp_free_sharedkey(skey); } } static sctp_sharedkey_t * sctp_copy_sharedkey(const sctp_sharedkey_t * skey) { sctp_sharedkey_t *new_skey; if (skey == NULL) return (NULL); new_skey = sctp_alloc_sharedkey(); if (new_skey == NULL) return (NULL); if (skey->key != NULL) new_skey->key = sctp_set_key(skey->key->key, skey->key->keylen); else new_skey->key = NULL; new_skey->keyid = skey->keyid; return (new_skey); } int sctp_copy_skeylist(const struct sctp_keyhead *src, struct sctp_keyhead *dest) { sctp_sharedkey_t *skey, *new_skey; int count = 0; if ((src == NULL) || (dest == NULL)) return (0); LIST_FOREACH(skey, src, next) { new_skey = sctp_copy_sharedkey(skey); if (new_skey != NULL) { (void)sctp_insert_sharedkey(dest, new_skey); count++; } } return (count); } sctp_hmaclist_t * sctp_alloc_hmaclist(uint16_t num_hmacs) { sctp_hmaclist_t *new_list; int alloc_size; alloc_size = sizeof(*new_list) + num_hmacs * sizeof(new_list->hmac[0]); SCTP_MALLOC(new_list, sctp_hmaclist_t *, alloc_size, SCTP_M_AUTH_HL); if (new_list == NULL) { /* out of memory */ return (NULL); } new_list->max_algo = num_hmacs; new_list->num_algo = 0; return (new_list); } void sctp_free_hmaclist(sctp_hmaclist_t * list) { if (list != NULL) { SCTP_FREE(list, SCTP_M_AUTH_HL); list = NULL; } } int sctp_auth_add_hmacid(sctp_hmaclist_t * list, uint16_t hmac_id) { int i; if (list == NULL) return (-1); if (list->num_algo == list->max_algo) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: HMAC id list full, ignoring add %u\n", hmac_id); return (-1); } if ((hmac_id != SCTP_AUTH_HMAC_ID_SHA1) && (hmac_id != SCTP_AUTH_HMAC_ID_SHA256)) { return (-1); } /* Now is it already in the list */ for (i = 0; i < list->num_algo; i++) { if (list->hmac[i] == hmac_id) { /* already in list */ return (-1); } } SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: add HMAC id %u to list\n", hmac_id); list->hmac[list->num_algo++] = hmac_id; return (0); } sctp_hmaclist_t * sctp_copy_hmaclist(sctp_hmaclist_t * list) { sctp_hmaclist_t *new_list; int i; if (list == NULL) return (NULL); /* get a new list */ new_list = sctp_alloc_hmaclist(list->max_algo); if (new_list == NULL) return (NULL); /* copy it */ new_list->max_algo = list->max_algo; new_list->num_algo = list->num_algo; for (i = 0; i < list->num_algo; i++) new_list->hmac[i] = list->hmac[i]; return (new_list); } sctp_hmaclist_t * sctp_default_supported_hmaclist(void) { sctp_hmaclist_t *new_list; new_list = sctp_alloc_hmaclist(2); if (new_list == NULL) return (NULL); /* We prefer SHA256, so list it first */ (void)sctp_auth_add_hmacid(new_list, SCTP_AUTH_HMAC_ID_SHA256); (void)sctp_auth_add_hmacid(new_list, SCTP_AUTH_HMAC_ID_SHA1); return (new_list); } /*- * HMAC algos are listed in priority/preference order * find the best HMAC id to use for the peer based on local support */ uint16_t sctp_negotiate_hmacid(sctp_hmaclist_t * peer, sctp_hmaclist_t * local) { int i, j; if ((local == NULL) || (peer == NULL)) return (SCTP_AUTH_HMAC_ID_RSVD); for (i = 0; i < peer->num_algo; i++) { for (j = 0; j < local->num_algo; j++) { if (peer->hmac[i] == local->hmac[j]) { /* found the "best" one */ SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: negotiated peer HMAC id %u\n", peer->hmac[i]); return (peer->hmac[i]); } } } /* didn't find one! */ return (SCTP_AUTH_HMAC_ID_RSVD); } /*- * serialize the HMAC algo list and return space used * caller must guarantee ptr has appropriate space */ int sctp_serialize_hmaclist(sctp_hmaclist_t * list, uint8_t * ptr) { int i; uint16_t hmac_id; if (list == NULL) return (0); for (i = 0; i < list->num_algo; i++) { hmac_id = htons(list->hmac[i]); bcopy(&hmac_id, ptr, sizeof(hmac_id)); ptr += sizeof(hmac_id); } return (list->num_algo * sizeof(hmac_id)); } int sctp_verify_hmac_param(struct sctp_auth_hmac_algo *hmacs, uint32_t num_hmacs) { uint32_t i; for (i = 0; i < num_hmacs; i++) { if (ntohs(hmacs->hmac_ids[i]) == SCTP_AUTH_HMAC_ID_SHA1) { return (0); } } return (-1); } sctp_authinfo_t * sctp_alloc_authinfo(void) { sctp_authinfo_t *new_authinfo; SCTP_MALLOC(new_authinfo, sctp_authinfo_t *, sizeof(*new_authinfo), SCTP_M_AUTH_IF); if (new_authinfo == NULL) { /* out of memory */ return (NULL); } bzero(new_authinfo, sizeof(*new_authinfo)); return (new_authinfo); } void sctp_free_authinfo(sctp_authinfo_t * authinfo) { if (authinfo == NULL) return; if (authinfo->random != NULL) sctp_free_key(authinfo->random); if (authinfo->peer_random != NULL) sctp_free_key(authinfo->peer_random); if (authinfo->assoc_key != NULL) sctp_free_key(authinfo->assoc_key); if (authinfo->recv_key != NULL) sctp_free_key(authinfo->recv_key); /* We are NOT dynamically allocating authinfo's right now... */ /* SCTP_FREE(authinfo, SCTP_M_AUTH_??); */ } uint32_t sctp_get_auth_chunk_len(uint16_t hmac_algo) { int size; size = sizeof(struct sctp_auth_chunk) + sctp_get_hmac_digest_len(hmac_algo); return (SCTP_SIZE32(size)); } uint32_t sctp_get_hmac_digest_len(uint16_t hmac_algo) { switch (hmac_algo) { case SCTP_AUTH_HMAC_ID_SHA1: return (SCTP_AUTH_DIGEST_LEN_SHA1); case SCTP_AUTH_HMAC_ID_SHA256: return (SCTP_AUTH_DIGEST_LEN_SHA256); default: /* unknown HMAC algorithm: can't do anything */ return (0); } /* end switch */ } static inline int sctp_get_hmac_block_len(uint16_t hmac_algo) { switch (hmac_algo) { case SCTP_AUTH_HMAC_ID_SHA1: return (64); case SCTP_AUTH_HMAC_ID_SHA256: return (64); case SCTP_AUTH_HMAC_ID_RSVD: default: /* unknown HMAC algorithm: can't do anything */ return (0); } /* end switch */ } static void sctp_hmac_init(uint16_t hmac_algo, sctp_hash_context_t * ctx) { switch (hmac_algo) { case SCTP_AUTH_HMAC_ID_SHA1: SCTP_SHA1_INIT(&ctx->sha1); break; case SCTP_AUTH_HMAC_ID_SHA256: SCTP_SHA256_INIT(&ctx->sha256); break; case SCTP_AUTH_HMAC_ID_RSVD: default: /* unknown HMAC algorithm: can't do anything */ return; } /* end switch */ } static void sctp_hmac_update(uint16_t hmac_algo, sctp_hash_context_t * ctx, uint8_t * text, uint32_t textlen) { switch (hmac_algo) { case SCTP_AUTH_HMAC_ID_SHA1: SCTP_SHA1_UPDATE(&ctx->sha1, text, textlen); break; case SCTP_AUTH_HMAC_ID_SHA256: SCTP_SHA256_UPDATE(&ctx->sha256, text, textlen); break; case SCTP_AUTH_HMAC_ID_RSVD: default: /* unknown HMAC algorithm: can't do anything */ return; } /* end switch */ } static void sctp_hmac_final(uint16_t hmac_algo, sctp_hash_context_t * ctx, uint8_t * digest) { switch (hmac_algo) { case SCTP_AUTH_HMAC_ID_SHA1: SCTP_SHA1_FINAL(digest, &ctx->sha1); break; case SCTP_AUTH_HMAC_ID_SHA256: SCTP_SHA256_FINAL(digest, &ctx->sha256); break; case SCTP_AUTH_HMAC_ID_RSVD: default: /* unknown HMAC algorithm: can't do anything */ return; } /* end switch */ } /*- * Keyed-Hashing for Message Authentication: FIPS 198 (RFC 2104) * * Compute the HMAC digest using the desired hash key, text, and HMAC * algorithm. Resulting digest is placed in 'digest' and digest length * is returned, if the HMAC was performed. * * WARNING: it is up to the caller to supply sufficient space to hold the * resultant digest. */ uint32_t sctp_hmac(uint16_t hmac_algo, uint8_t * key, uint32_t keylen, uint8_t * text, uint32_t textlen, uint8_t * digest) { uint32_t digestlen; uint32_t blocklen; sctp_hash_context_t ctx; uint8_t ipad[128], opad[128]; /* keyed hash inner/outer pads */ uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; uint32_t i; /* sanity check the material and length */ if ((key == NULL) || (keylen == 0) || (text == NULL) || (textlen == 0) || (digest == NULL)) { /* can't do HMAC with empty key or text or digest store */ return (0); } /* validate the hmac algo and get the digest length */ digestlen = sctp_get_hmac_digest_len(hmac_algo); if (digestlen == 0) return (0); /* hash the key if it is longer than the hash block size */ blocklen = sctp_get_hmac_block_len(hmac_algo); if (keylen > blocklen) { sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, key, keylen); sctp_hmac_final(hmac_algo, &ctx, temp); /* set the hashed key as the key */ keylen = digestlen; key = temp; } /* initialize the inner/outer pads with the key and "append" zeroes */ bzero(ipad, blocklen); bzero(opad, blocklen); bcopy(key, ipad, keylen); bcopy(key, opad, keylen); /* XOR the key with ipad and opad values */ for (i = 0; i < blocklen; i++) { ipad[i] ^= 0x36; opad[i] ^= 0x5c; } /* perform inner hash */ sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, ipad, blocklen); sctp_hmac_update(hmac_algo, &ctx, text, textlen); sctp_hmac_final(hmac_algo, &ctx, temp); /* perform outer hash */ sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, opad, blocklen); sctp_hmac_update(hmac_algo, &ctx, temp, digestlen); sctp_hmac_final(hmac_algo, &ctx, digest); return (digestlen); } /* mbuf version */ uint32_t sctp_hmac_m(uint16_t hmac_algo, uint8_t * key, uint32_t keylen, struct mbuf *m, uint32_t m_offset, uint8_t * digest, uint32_t trailer) { uint32_t digestlen; uint32_t blocklen; sctp_hash_context_t ctx; uint8_t ipad[128], opad[128]; /* keyed hash inner/outer pads */ uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; uint32_t i; struct mbuf *m_tmp; /* sanity check the material and length */ if ((key == NULL) || (keylen == 0) || (m == NULL) || (digest == NULL)) { /* can't do HMAC with empty key or text or digest store */ return (0); } /* validate the hmac algo and get the digest length */ digestlen = sctp_get_hmac_digest_len(hmac_algo); if (digestlen == 0) return (0); /* hash the key if it is longer than the hash block size */ blocklen = sctp_get_hmac_block_len(hmac_algo); if (keylen > blocklen) { sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, key, keylen); sctp_hmac_final(hmac_algo, &ctx, temp); /* set the hashed key as the key */ keylen = digestlen; key = temp; } /* initialize the inner/outer pads with the key and "append" zeroes */ bzero(ipad, blocklen); bzero(opad, blocklen); bcopy(key, ipad, keylen); bcopy(key, opad, keylen); /* XOR the key with ipad and opad values */ for (i = 0; i < blocklen; i++) { ipad[i] ^= 0x36; opad[i] ^= 0x5c; } /* perform inner hash */ sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, ipad, blocklen); /* find the correct starting mbuf and offset (get start of text) */ m_tmp = m; while ((m_tmp != NULL) && (m_offset >= (uint32_t) SCTP_BUF_LEN(m_tmp))) { m_offset -= SCTP_BUF_LEN(m_tmp); m_tmp = SCTP_BUF_NEXT(m_tmp); } /* now use the rest of the mbuf chain for the text */ while (m_tmp != NULL) { if ((SCTP_BUF_NEXT(m_tmp) == NULL) && trailer) { sctp_hmac_update(hmac_algo, &ctx, mtod(m_tmp, uint8_t *) + m_offset, SCTP_BUF_LEN(m_tmp) - (trailer + m_offset)); } else { sctp_hmac_update(hmac_algo, &ctx, mtod(m_tmp, uint8_t *) + m_offset, SCTP_BUF_LEN(m_tmp) - m_offset); } /* clear the offset since it's only for the first mbuf */ m_offset = 0; m_tmp = SCTP_BUF_NEXT(m_tmp); } sctp_hmac_final(hmac_algo, &ctx, temp); /* perform outer hash */ sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, opad, blocklen); sctp_hmac_update(hmac_algo, &ctx, temp, digestlen); sctp_hmac_final(hmac_algo, &ctx, digest); return (digestlen); } /*- * verify the HMAC digest using the desired hash key, text, and HMAC * algorithm. * Returns -1 on error, 0 on success. */ int sctp_verify_hmac(uint16_t hmac_algo, uint8_t * key, uint32_t keylen, uint8_t * text, uint32_t textlen, uint8_t * digest, uint32_t digestlen) { uint32_t len; uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; /* sanity check the material and length */ if ((key == NULL) || (keylen == 0) || (text == NULL) || (textlen == 0) || (digest == NULL)) { /* can't do HMAC with empty key or text or digest */ return (-1); } len = sctp_get_hmac_digest_len(hmac_algo); if ((len == 0) || (digestlen != len)) return (-1); /* compute the expected hash */ if (sctp_hmac(hmac_algo, key, keylen, text, textlen, temp) != len) return (-1); if (memcmp(digest, temp, digestlen) != 0) return (-1); else return (0); } /* * computes the requested HMAC using a key struct (which may be modified if * the keylen exceeds the HMAC block len). */ uint32_t sctp_compute_hmac(uint16_t hmac_algo, sctp_key_t * key, uint8_t * text, uint32_t textlen, uint8_t * digest) { uint32_t digestlen; uint32_t blocklen; sctp_hash_context_t ctx; uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; /* sanity check */ if ((key == NULL) || (text == NULL) || (textlen == 0) || (digest == NULL)) { /* can't do HMAC with empty key or text or digest store */ return (0); } /* validate the hmac algo and get the digest length */ digestlen = sctp_get_hmac_digest_len(hmac_algo); if (digestlen == 0) return (0); /* hash the key if it is longer than the hash block size */ blocklen = sctp_get_hmac_block_len(hmac_algo); if (key->keylen > blocklen) { sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, key->key, key->keylen); sctp_hmac_final(hmac_algo, &ctx, temp); /* save the hashed key as the new key */ key->keylen = digestlen; bcopy(temp, key->key, key->keylen); } return (sctp_hmac(hmac_algo, key->key, key->keylen, text, textlen, digest)); } /* mbuf version */ uint32_t sctp_compute_hmac_m(uint16_t hmac_algo, sctp_key_t * key, struct mbuf *m, uint32_t m_offset, uint8_t * digest) { uint32_t digestlen; uint32_t blocklen; sctp_hash_context_t ctx; uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; /* sanity check */ if ((key == NULL) || (m == NULL) || (digest == NULL)) { /* can't do HMAC with empty key or text or digest store */ return (0); } /* validate the hmac algo and get the digest length */ digestlen = sctp_get_hmac_digest_len(hmac_algo); if (digestlen == 0) return (0); /* hash the key if it is longer than the hash block size */ blocklen = sctp_get_hmac_block_len(hmac_algo); if (key->keylen > blocklen) { sctp_hmac_init(hmac_algo, &ctx); sctp_hmac_update(hmac_algo, &ctx, key->key, key->keylen); sctp_hmac_final(hmac_algo, &ctx, temp); /* save the hashed key as the new key */ key->keylen = digestlen; bcopy(temp, key->key, key->keylen); } return (sctp_hmac_m(hmac_algo, key->key, key->keylen, m, m_offset, digest, 0)); } int sctp_auth_is_supported_hmac(sctp_hmaclist_t * list, uint16_t id) { int i; if ((list == NULL) || (id == SCTP_AUTH_HMAC_ID_RSVD)) return (0); for (i = 0; i < list->num_algo; i++) if (list->hmac[i] == id) return (1); /* not in the list */ return (0); } /*- * clear any cached key(s) if they match the given key id on an association. * the cached key(s) will be recomputed and re-cached at next use. * ASSUMES TCB_LOCK is already held */ void sctp_clear_cachedkeys(struct sctp_tcb *stcb, uint16_t keyid) { if (stcb == NULL) return; if (keyid == stcb->asoc.authinfo.assoc_keyid) { sctp_free_key(stcb->asoc.authinfo.assoc_key); stcb->asoc.authinfo.assoc_key = NULL; } if (keyid == stcb->asoc.authinfo.recv_keyid) { sctp_free_key(stcb->asoc.authinfo.recv_key); stcb->asoc.authinfo.recv_key = NULL; } } /*- * clear any cached key(s) if they match the given key id for all assocs on * an endpoint. * ASSUMES INP_WLOCK is already held */ void sctp_clear_cachedkeys_ep(struct sctp_inpcb *inp, uint16_t keyid) { struct sctp_tcb *stcb; if (inp == NULL) return; /* clear the cached keys on all assocs on this instance */ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { SCTP_TCB_LOCK(stcb); sctp_clear_cachedkeys(stcb, keyid); SCTP_TCB_UNLOCK(stcb); } } /*- * delete a shared key from an association * ASSUMES TCB_LOCK is already held */ int sctp_delete_sharedkey(struct sctp_tcb *stcb, uint16_t keyid) { sctp_sharedkey_t *skey; if (stcb == NULL) return (-1); /* is the keyid the assoc active sending key */ if (keyid == stcb->asoc.authinfo.active_keyid) return (-1); /* does the key exist? */ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid); if (skey == NULL) return (-1); /* are there other refcount holders on the key? */ if (skey->refcount > 1) return (-1); /* remove it */ LIST_REMOVE(skey, next); sctp_free_sharedkey(skey); /* frees skey->key as well */ /* clear any cached keys */ sctp_clear_cachedkeys(stcb, keyid); return (0); } /*- * deletes a shared key from the endpoint * ASSUMES INP_WLOCK is already held */ int sctp_delete_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid) { sctp_sharedkey_t *skey; if (inp == NULL) return (-1); /* is the keyid the active sending key on the endpoint */ if (keyid == inp->sctp_ep.default_keyid) return (-1); /* does the key exist? */ skey = sctp_find_sharedkey(&inp->sctp_ep.shared_keys, keyid); if (skey == NULL) return (-1); /* endpoint keys are not refcounted */ /* remove it */ LIST_REMOVE(skey, next); sctp_free_sharedkey(skey); /* frees skey->key as well */ /* clear any cached keys */ sctp_clear_cachedkeys_ep(inp, keyid); return (0); } /*- * set the active key on an association * ASSUMES TCB_LOCK is already held */ int sctp_auth_setactivekey(struct sctp_tcb *stcb, uint16_t keyid) { sctp_sharedkey_t *skey = NULL; /* find the key on the assoc */ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid); if (skey == NULL) { /* that key doesn't exist */ return (-1); } if ((skey->deactivated) && (skey->refcount > 1)) { /* can't reactivate a deactivated key with other refcounts */ return (-1); } /* set the (new) active key */ stcb->asoc.authinfo.active_keyid = keyid; /* reset the deactivated flag */ skey->deactivated = 0; return (0); } /*- * set the active key on an endpoint * ASSUMES INP_WLOCK is already held */ int sctp_auth_setactivekey_ep(struct sctp_inpcb *inp, uint16_t keyid) { sctp_sharedkey_t *skey; /* find the key */ skey = sctp_find_sharedkey(&inp->sctp_ep.shared_keys, keyid); if (skey == NULL) { /* that key doesn't exist */ return (-1); } inp->sctp_ep.default_keyid = keyid; return (0); } /*- * deactivates a shared key from the association * ASSUMES INP_WLOCK is already held */ int sctp_deact_sharedkey(struct sctp_tcb *stcb, uint16_t keyid) { sctp_sharedkey_t *skey; if (stcb == NULL) return (-1); /* is the keyid the assoc active sending key */ if (keyid == stcb->asoc.authinfo.active_keyid) return (-1); /* does the key exist? */ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid); if (skey == NULL) return (-1); /* are there other refcount holders on the key? */ if (skey->refcount == 1) { /* no other users, send a notification for this key */ sctp_ulp_notify(SCTP_NOTIFY_AUTH_FREE_KEY, stcb, keyid, 0, SCTP_SO_LOCKED); } /* mark the key as deactivated */ skey->deactivated = 1; return (0); } /*- * deactivates a shared key from the endpoint * ASSUMES INP_WLOCK is already held */ int sctp_deact_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid) { sctp_sharedkey_t *skey; if (inp == NULL) return (-1); /* is the keyid the active sending key on the endpoint */ if (keyid == inp->sctp_ep.default_keyid) return (-1); /* does the key exist? */ skey = sctp_find_sharedkey(&inp->sctp_ep.shared_keys, keyid); if (skey == NULL) return (-1); /* endpoint keys are not refcounted */ /* remove it */ LIST_REMOVE(skey, next); sctp_free_sharedkey(skey); /* frees skey->key as well */ return (0); } /* * get local authentication parameters from cookie (from INIT-ACK) */ void sctp_auth_get_cookie_params(struct sctp_tcb *stcb, struct mbuf *m, uint32_t offset, uint32_t length) { struct sctp_paramhdr *phdr, tmp_param; uint16_t plen, ptype; uint8_t random_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_random *p_random = NULL; uint16_t random_len = 0; uint8_t hmacs_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_hmac_algo *hmacs = NULL; uint16_t hmacs_len = 0; uint8_t chunks_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_chunk_list *chunks = NULL; uint16_t num_chunks = 0; sctp_key_t *new_key; uint32_t keylen; /* convert to upper bound */ length += offset; phdr = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *) & tmp_param); while (phdr != NULL) { ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); if ((plen == 0) || (offset + plen > length)) break; if (ptype == SCTP_RANDOM) { if (plen > sizeof(random_store)) break; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)random_store, min(plen, sizeof(random_store))); if (phdr == NULL) return; /* save the random and length for the key */ p_random = (struct sctp_auth_random *)phdr; random_len = plen - sizeof(*p_random); } else if (ptype == SCTP_HMAC_LIST) { uint16_t num_hmacs; uint16_t i; if (plen > sizeof(hmacs_store)) break; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)hmacs_store, min(plen, sizeof(hmacs_store))); if (phdr == NULL) return; /* save the hmacs list and num for the key */ hmacs = (struct sctp_auth_hmac_algo *)phdr; hmacs_len = plen - sizeof(*hmacs); num_hmacs = hmacs_len / sizeof(hmacs->hmac_ids[0]); if (stcb->asoc.local_hmacs != NULL) sctp_free_hmaclist(stcb->asoc.local_hmacs); stcb->asoc.local_hmacs = sctp_alloc_hmaclist(num_hmacs); if (stcb->asoc.local_hmacs != NULL) { for (i = 0; i < num_hmacs; i++) { (void)sctp_auth_add_hmacid(stcb->asoc.local_hmacs, ntohs(hmacs->hmac_ids[i])); } } } else if (ptype == SCTP_CHUNK_LIST) { int i; if (plen > sizeof(chunks_store)) break; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)chunks_store, min(plen, sizeof(chunks_store))); if (phdr == NULL) return; chunks = (struct sctp_auth_chunk_list *)phdr; num_chunks = plen - sizeof(*chunks); /* save chunks list and num for the key */ if (stcb->asoc.local_auth_chunks != NULL) sctp_clear_chunklist(stcb->asoc.local_auth_chunks); else stcb->asoc.local_auth_chunks = sctp_alloc_chunklist(); for (i = 0; i < num_chunks; i++) { (void)sctp_auth_add_chunk(chunks->chunk_types[i], stcb->asoc.local_auth_chunks); } } /* get next parameter */ offset += SCTP_SIZE32(plen); if (offset + sizeof(struct sctp_paramhdr) > length) break; phdr = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *) & tmp_param); } /* concatenate the full random key */ keylen = sizeof(*p_random) + random_len + sizeof(*hmacs) + hmacs_len; if (chunks != NULL) { keylen += sizeof(*chunks) + num_chunks; } new_key = sctp_alloc_key(keylen); if (new_key != NULL) { /* copy in the RANDOM */ if (p_random != NULL) { keylen = sizeof(*p_random) + random_len; bcopy(p_random, new_key->key, keylen); } /* append in the AUTH chunks */ if (chunks != NULL) { bcopy(chunks, new_key->key + keylen, sizeof(*chunks) + num_chunks); keylen += sizeof(*chunks) + num_chunks; } /* append in the HMACs */ if (hmacs != NULL) { bcopy(hmacs, new_key->key + keylen, sizeof(*hmacs) + hmacs_len); } } if (stcb->asoc.authinfo.random != NULL) sctp_free_key(stcb->asoc.authinfo.random); stcb->asoc.authinfo.random = new_key; stcb->asoc.authinfo.random_len = random_len; sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.assoc_keyid); sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.recv_keyid); /* negotiate what HMAC to use for the peer */ stcb->asoc.peer_hmac_id = sctp_negotiate_hmacid(stcb->asoc.peer_hmacs, stcb->asoc.local_hmacs); /* copy defaults from the endpoint */ /* FIX ME: put in cookie? */ stcb->asoc.authinfo.active_keyid = stcb->sctp_ep->sctp_ep.default_keyid; /* copy out the shared key list (by reference) from the endpoint */ (void)sctp_copy_skeylist(&stcb->sctp_ep->sctp_ep.shared_keys, &stcb->asoc.shared_keys); } /* * compute and fill in the HMAC digest for a packet */ void sctp_fill_hmac_digest_m(struct mbuf *m, uint32_t auth_offset, struct sctp_auth_chunk *auth, struct sctp_tcb *stcb, uint16_t keyid) { uint32_t digestlen; sctp_sharedkey_t *skey; sctp_key_t *key; if ((stcb == NULL) || (auth == NULL)) return; /* zero the digest + chunk padding */ digestlen = sctp_get_hmac_digest_len(stcb->asoc.peer_hmac_id); bzero(auth->hmac, SCTP_SIZE32(digestlen)); /* is the desired key cached? */ if ((keyid != stcb->asoc.authinfo.assoc_keyid) || (stcb->asoc.authinfo.assoc_key == NULL)) { if (stcb->asoc.authinfo.assoc_key != NULL) { /* free the old cached key */ sctp_free_key(stcb->asoc.authinfo.assoc_key); } skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid); /* the only way skey is NULL is if null key id 0 is used */ if (skey != NULL) key = skey->key; else key = NULL; /* compute a new assoc key and cache it */ stcb->asoc.authinfo.assoc_key = sctp_compute_hashkey(stcb->asoc.authinfo.random, stcb->asoc.authinfo.peer_random, key); stcb->asoc.authinfo.assoc_keyid = keyid; SCTPDBG(SCTP_DEBUG_AUTH1, "caching key id %u\n", stcb->asoc.authinfo.assoc_keyid); #ifdef SCTP_DEBUG if (SCTP_AUTH_DEBUG) sctp_print_key(stcb->asoc.authinfo.assoc_key, "Assoc Key"); #endif } /* set in the active key id */ auth->shared_key_id = htons(keyid); /* compute and fill in the digest */ (void)sctp_compute_hmac_m(stcb->asoc.peer_hmac_id, stcb->asoc.authinfo.assoc_key, m, auth_offset, auth->hmac); } static void sctp_bzero_m(struct mbuf *m, uint32_t m_offset, uint32_t size) { struct mbuf *m_tmp; uint8_t *data; /* sanity check */ if (m == NULL) return; /* find the correct starting mbuf and offset (get start position) */ m_tmp = m; while ((m_tmp != NULL) && (m_offset >= (uint32_t) SCTP_BUF_LEN(m_tmp))) { m_offset -= SCTP_BUF_LEN(m_tmp); m_tmp = SCTP_BUF_NEXT(m_tmp); } /* now use the rest of the mbuf chain */ while ((m_tmp != NULL) && (size > 0)) { data = mtod(m_tmp, uint8_t *) + m_offset; if (size > (uint32_t) SCTP_BUF_LEN(m_tmp)) { bzero(data, SCTP_BUF_LEN(m_tmp)); size -= SCTP_BUF_LEN(m_tmp); } else { bzero(data, size); size = 0; } /* clear the offset since it's only for the first mbuf */ m_offset = 0; m_tmp = SCTP_BUF_NEXT(m_tmp); } } /*- * process the incoming Authentication chunk * return codes: * -1 on any authentication error * 0 on authentication verification */ int sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *auth, struct mbuf *m, uint32_t offset) { uint16_t chunklen; uint16_t shared_key_id; uint16_t hmac_id; sctp_sharedkey_t *skey; uint32_t digestlen; uint8_t digest[SCTP_AUTH_DIGEST_LEN_MAX]; uint8_t computed_digest[SCTP_AUTH_DIGEST_LEN_MAX]; /* auth is checked for NULL by caller */ chunklen = ntohs(auth->ch.chunk_length); if (chunklen < sizeof(*auth)) { SCTP_STAT_INCR(sctps_recvauthfailed); return (-1); } SCTP_STAT_INCR(sctps_recvauth); /* get the auth params */ shared_key_id = ntohs(auth->shared_key_id); hmac_id = ntohs(auth->hmac_id); SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP AUTH Chunk: shared key %u, HMAC id %u\n", shared_key_id, hmac_id); /* is the indicated HMAC supported? */ if (!sctp_auth_is_supported_hmac(stcb->asoc.local_hmacs, hmac_id)) { - struct mbuf *m_err; - struct sctp_auth_invalid_hmac *err; + struct mbuf *op_err; + struct sctp_error_auth_invalid_hmac *cause; SCTP_STAT_INCR(sctps_recvivalhmacid); SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP Auth: unsupported HMAC id %u\n", hmac_id); /* * report this in an Error Chunk: Unsupported HMAC * Identifier */ - m_err = sctp_get_mbuf_for_msg(sizeof(*err), 0, M_NOWAIT, - 1, MT_HEADER); - if (m_err != NULL) { + op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_auth_invalid_hmac), + 0, M_NOWAIT, 1, MT_HEADER); + if (op_err != NULL) { /* pre-reserve some space */ - SCTP_BUF_RESV_UF(m_err, sizeof(struct sctp_chunkhdr)); + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); /* fill in the error */ - err = mtod(m_err, struct sctp_auth_invalid_hmac *); - bzero(err, sizeof(*err)); - err->ph.param_type = htons(SCTP_CAUSE_UNSUPPORTED_HMACID); - err->ph.param_length = htons(sizeof(*err)); - err->hmac_id = ntohs(hmac_id); - SCTP_BUF_LEN(m_err) = sizeof(*err); + cause = mtod(op_err, struct sctp_error_auth_invalid_hmac *); + cause->cause.code = htons(SCTP_CAUSE_UNSUPPORTED_HMACID); + cause->cause.length = htons(sizeof(struct sctp_error_auth_invalid_hmac)); + cause->hmac_id = ntohs(hmac_id); + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_auth_invalid_hmac); /* queue it */ - sctp_queue_op_err(stcb, m_err); + sctp_queue_op_err(stcb, op_err); } return (-1); } /* get the indicated shared key, if available */ if ((stcb->asoc.authinfo.recv_key == NULL) || (stcb->asoc.authinfo.recv_keyid != shared_key_id)) { /* find the shared key on the assoc first */ skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, shared_key_id); /* if the shared key isn't found, discard the chunk */ if (skey == NULL) { SCTP_STAT_INCR(sctps_recvivalkeyid); SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP Auth: unknown key id %u\n", shared_key_id); return (-1); } /* generate a notification if this is a new key id */ if (stcb->asoc.authinfo.recv_keyid != shared_key_id) /* * sctp_ulp_notify(SCTP_NOTIFY_AUTH_NEW_KEY, stcb, * shared_key_id, (void * *)stcb->asoc.authinfo.recv_keyid); */ sctp_notify_authentication(stcb, SCTP_AUTH_NEW_KEY, shared_key_id, stcb->asoc.authinfo.recv_keyid, SCTP_SO_NOT_LOCKED); /* compute a new recv assoc key and cache it */ if (stcb->asoc.authinfo.recv_key != NULL) sctp_free_key(stcb->asoc.authinfo.recv_key); stcb->asoc.authinfo.recv_key = sctp_compute_hashkey(stcb->asoc.authinfo.random, stcb->asoc.authinfo.peer_random, skey->key); stcb->asoc.authinfo.recv_keyid = shared_key_id; #ifdef SCTP_DEBUG if (SCTP_AUTH_DEBUG) sctp_print_key(stcb->asoc.authinfo.recv_key, "Recv Key"); #endif } /* validate the digest length */ digestlen = sctp_get_hmac_digest_len(hmac_id); if (chunklen < (sizeof(*auth) + digestlen)) { /* invalid digest length */ SCTP_STAT_INCR(sctps_recvauthfailed); SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP Auth: chunk too short for HMAC\n"); return (-1); } /* save a copy of the digest, zero the pseudo header, and validate */ bcopy(auth->hmac, digest, digestlen); sctp_bzero_m(m, offset + sizeof(*auth), SCTP_SIZE32(digestlen)); (void)sctp_compute_hmac_m(hmac_id, stcb->asoc.authinfo.recv_key, m, offset, computed_digest); /* compare the computed digest with the one in the AUTH chunk */ if (memcmp(digest, computed_digest, digestlen) != 0) { SCTP_STAT_INCR(sctps_recvauthfailed); SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP Auth: HMAC digest check failed\n"); return (-1); } return (0); } /* * Generate NOTIFICATION */ void sctp_notify_authentication(struct sctp_tcb *stcb, uint32_t indication, uint16_t keyid, uint16_t alt_keyid, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { struct mbuf *m_notify; struct sctp_authkey_event *auth; struct sctp_queued_to_read *control; if ((stcb == NULL) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) ) { /* If the socket is gone we are out of here */ return; } if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_AUTHEVNT)) /* event not enabled */ return; m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_authkey_event), 0, M_NOWAIT, 1, MT_HEADER); if (m_notify == NULL) /* no space left */ return; SCTP_BUF_LEN(m_notify) = 0; auth = mtod(m_notify, struct sctp_authkey_event *); memset(auth, 0, sizeof(struct sctp_authkey_event)); auth->auth_type = SCTP_AUTHENTICATION_EVENT; auth->auth_flags = 0; auth->auth_length = sizeof(*auth); auth->auth_keynumber = keyid; auth->auth_altkeynumber = alt_keyid; auth->auth_indication = indication; auth->auth_assoc_id = sctp_get_associd(stcb); SCTP_BUF_LEN(m_notify) = sizeof(*auth); SCTP_BUF_NEXT(m_notify) = NULL; /* append to socket */ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control == NULL) { /* no memory */ sctp_m_freem(m_notify); return; } control->spec_flags = M_NOTIFICATION; control->length = SCTP_BUF_LEN(m_notify); /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked); } /*- * validates the AUTHentication related parameters in an INIT/INIT-ACK * Note: currently only used for INIT as INIT-ACK is handled inline * with sctp_load_addresses_from_init() */ int sctp_validate_init_auth_params(struct mbuf *m, int offset, int limit) { struct sctp_paramhdr *phdr, parm_buf; uint16_t ptype, plen; int peer_supports_asconf = 0; int peer_supports_auth = 0; int got_random = 0, got_hmacs = 0, got_chklist = 0; uint8_t saw_asconf = 0; uint8_t saw_asconf_ack = 0; /* go through each of the params. */ phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf)); while (phdr) { ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); if (offset + plen > limit) { break; } if (plen < sizeof(struct sctp_paramhdr)) { break; } if (ptype == SCTP_SUPPORTED_CHUNK_EXT) { /* A supported extension chunk */ struct sctp_supported_chunk_types_param *pr_supported; uint8_t local_store[SCTP_PARAM_BUFFER_SIZE]; int num_ent, i; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&local_store, min(plen, sizeof(local_store))); if (phdr == NULL) { return (-1); } pr_supported = (struct sctp_supported_chunk_types_param *)phdr; num_ent = plen - sizeof(struct sctp_paramhdr); for (i = 0; i < num_ent; i++) { switch (pr_supported->chunk_types[i]) { case SCTP_ASCONF: case SCTP_ASCONF_ACK: peer_supports_asconf = 1; break; default: /* one we don't care about */ break; } } } else if (ptype == SCTP_RANDOM) { got_random = 1; /* enforce the random length */ if (plen != (sizeof(struct sctp_auth_random) + SCTP_AUTH_RANDOM_SIZE_REQUIRED)) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: invalid RANDOM len\n"); return (-1); } } else if (ptype == SCTP_HMAC_LIST) { uint8_t store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_hmac_algo *hmacs; int num_hmacs; if (plen > sizeof(store)) break; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)store, min(plen, sizeof(store))); if (phdr == NULL) return (-1); hmacs = (struct sctp_auth_hmac_algo *)phdr; num_hmacs = (plen - sizeof(*hmacs)) / sizeof(hmacs->hmac_ids[0]); /* validate the hmac list */ if (sctp_verify_hmac_param(hmacs, num_hmacs)) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: invalid HMAC param\n"); return (-1); } got_hmacs = 1; } else if (ptype == SCTP_CHUNK_LIST) { int i, num_chunks; uint8_t chunks_store[SCTP_SMALL_CHUNK_STORE]; /* did the peer send a non-empty chunk list? */ struct sctp_auth_chunk_list *chunks = NULL; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)chunks_store, min(plen, sizeof(chunks_store))); if (phdr == NULL) return (-1); /*- * Flip through the list and mark that the * peer supports asconf/asconf_ack. */ chunks = (struct sctp_auth_chunk_list *)phdr; num_chunks = plen - sizeof(*chunks); for (i = 0; i < num_chunks; i++) { /* record asconf/asconf-ack if listed */ if (chunks->chunk_types[i] == SCTP_ASCONF) saw_asconf = 1; if (chunks->chunk_types[i] == SCTP_ASCONF_ACK) saw_asconf_ack = 1; } if (num_chunks) got_chklist = 1; } offset += SCTP_SIZE32(plen); if (offset >= limit) { break; } phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf)); } /* validate authentication required parameters */ if (got_random && got_hmacs) { peer_supports_auth = 1; } else { peer_supports_auth = 0; } if (!peer_supports_auth && got_chklist) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: peer sent chunk list w/o AUTH\n"); return (-1); } if (peer_supports_asconf && !peer_supports_auth) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: peer supports ASCONF but not AUTH\n"); return (-1); } else if ((peer_supports_asconf) && (peer_supports_auth) && ((saw_asconf == 0) || (saw_asconf_ack == 0))) { return (-2); } return (0); } void sctp_initialize_auth_params(struct sctp_inpcb *inp, struct sctp_tcb *stcb) { uint16_t chunks_len = 0; uint16_t hmacs_len = 0; uint16_t random_len = SCTP_AUTH_RANDOM_SIZE_DEFAULT; sctp_key_t *new_key; uint16_t keylen; /* initialize hmac list from endpoint */ stcb->asoc.local_hmacs = sctp_copy_hmaclist(inp->sctp_ep.local_hmacs); if (stcb->asoc.local_hmacs != NULL) { hmacs_len = stcb->asoc.local_hmacs->num_algo * sizeof(stcb->asoc.local_hmacs->hmac[0]); } /* initialize auth chunks list from endpoint */ stcb->asoc.local_auth_chunks = sctp_copy_chunklist(inp->sctp_ep.local_auth_chunks); if (stcb->asoc.local_auth_chunks != NULL) { int i; for (i = 0; i < 256; i++) { if (stcb->asoc.local_auth_chunks->chunks[i]) chunks_len++; } } /* copy defaults from the endpoint */ stcb->asoc.authinfo.active_keyid = inp->sctp_ep.default_keyid; /* copy out the shared key list (by reference) from the endpoint */ (void)sctp_copy_skeylist(&inp->sctp_ep.shared_keys, &stcb->asoc.shared_keys); /* now set the concatenated key (random + chunks + hmacs) */ /* key includes parameter headers */ keylen = (3 * sizeof(struct sctp_paramhdr)) + random_len + chunks_len + hmacs_len; new_key = sctp_alloc_key(keylen); if (new_key != NULL) { struct sctp_paramhdr *ph; int plen; /* generate and copy in the RANDOM */ ph = (struct sctp_paramhdr *)new_key->key; ph->param_type = htons(SCTP_RANDOM); plen = sizeof(*ph) + random_len; ph->param_length = htons(plen); SCTP_READ_RANDOM(new_key->key + sizeof(*ph), random_len); keylen = plen; /* append in the AUTH chunks */ /* NOTE: currently we always have chunks to list */ ph = (struct sctp_paramhdr *)(new_key->key + keylen); ph->param_type = htons(SCTP_CHUNK_LIST); plen = sizeof(*ph) + chunks_len; ph->param_length = htons(plen); keylen += sizeof(*ph); if (stcb->asoc.local_auth_chunks) { int i; for (i = 0; i < 256; i++) { if (stcb->asoc.local_auth_chunks->chunks[i]) new_key->key[keylen++] = i; } } /* append in the HMACs */ ph = (struct sctp_paramhdr *)(new_key->key + keylen); ph->param_type = htons(SCTP_HMAC_LIST); plen = sizeof(*ph) + hmacs_len; ph->param_length = htons(plen); keylen += sizeof(*ph); (void)sctp_serialize_hmaclist(stcb->asoc.local_hmacs, new_key->key + keylen); } if (stcb->asoc.authinfo.random != NULL) sctp_free_key(stcb->asoc.authinfo.random); stcb->asoc.authinfo.random = new_key; stcb->asoc.authinfo.random_len = random_len; } Index: projects/iosched/sys/netinet/sctp_header.h =================================================================== --- projects/iosched/sys/netinet/sctp_header.h (revision 287721) +++ projects/iosched/sys/netinet/sctp_header.h (revision 287722) @@ -1,613 +1,562 @@ /*- * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef _NETINET_SCTP_HEADER_H_ #define _NETINET_SCTP_HEADER_H_ #include #include #include #define SCTP_PACKED __attribute__((packed)) /* * Parameter structures */ struct sctp_ipv4addr_param { struct sctp_paramhdr ph;/* type=SCTP_IPV4_PARAM_TYPE, len=8 */ uint32_t addr; /* IPV4 address */ } SCTP_PACKED; #define SCTP_V6_ADDR_BYTES 16 struct sctp_ipv6addr_param { struct sctp_paramhdr ph;/* type=SCTP_IPV6_PARAM_TYPE, len=20 */ uint8_t addr[SCTP_V6_ADDR_BYTES]; /* IPV6 address */ } SCTP_PACKED; /* Cookie Preservative */ struct sctp_cookie_perserve_param { struct sctp_paramhdr ph;/* type=SCTP_COOKIE_PRESERVE, len=8 */ uint32_t time; /* time in ms to extend cookie */ } SCTP_PACKED; #define SCTP_ARRAY_MIN_LEN 1 /* Host Name Address */ struct sctp_host_name_param { struct sctp_paramhdr ph;/* type=SCTP_HOSTNAME_ADDRESS */ char name[SCTP_ARRAY_MIN_LEN]; /* host name */ } SCTP_PACKED; /* * This is the maximum padded size of a s-a-p * so paramheadr + 3 address types (6 bytes) + 2 byte pad = 12 */ #define SCTP_MAX_ADDR_PARAMS_SIZE 12 /* supported address type */ struct sctp_supported_addr_param { struct sctp_paramhdr ph;/* type=SCTP_SUPPORTED_ADDRTYPE */ uint16_t addr_type[2]; /* array of supported address types */ } SCTP_PACKED; /* heartbeat info parameter */ struct sctp_heartbeat_info_param { struct sctp_paramhdr ph; uint32_t time_value_1; uint32_t time_value_2; uint32_t random_value1; uint32_t random_value2; uint8_t addr_family; uint8_t addr_len; /* make sure that this structure is 4 byte aligned */ uint8_t padding[2]; char address[SCTP_ADDRMAX]; } SCTP_PACKED; /* draft-ietf-tsvwg-prsctp */ /* PR-SCTP supported parameter */ struct sctp_prsctp_supported_param { struct sctp_paramhdr ph; } SCTP_PACKED; /* draft-ietf-tsvwg-addip-sctp */ struct sctp_asconf_paramhdr { /* an ASCONF "parameter" */ struct sctp_paramhdr ph;/* a SCTP parameter header */ uint32_t correlation_id;/* correlation id for this param */ } SCTP_PACKED; struct sctp_asconf_addr_param { /* an ASCONF address parameter */ struct sctp_asconf_paramhdr aph; /* asconf "parameter" */ struct sctp_ipv6addr_param addrp; /* max storage size */ } SCTP_PACKED; struct sctp_asconf_tag_param { /* an ASCONF NAT-Vtag parameter */ struct sctp_asconf_paramhdr aph; /* asconf "parameter" */ uint32_t local_vtag; uint32_t remote_vtag; } SCTP_PACKED; struct sctp_asconf_addrv4_param { /* an ASCONF address (v4) parameter */ struct sctp_asconf_paramhdr aph; /* asconf "parameter" */ struct sctp_ipv4addr_param addrp; /* max storage size */ } SCTP_PACKED; #define SCTP_MAX_SUPPORTED_EXT 256 struct sctp_supported_chunk_types_param { struct sctp_paramhdr ph;/* type = 0x8008 len = x */ uint8_t chunk_types[]; } SCTP_PACKED; /* * Structures for DATA chunks */ struct sctp_data { uint32_t tsn; uint16_t stream_id; uint16_t stream_sequence; uint32_t protocol_id; /* user data follows */ } SCTP_PACKED; struct sctp_data_chunk { struct sctp_chunkhdr ch; struct sctp_data dp; } SCTP_PACKED; /* * Structures for the control chunks */ /* Initiate (INIT)/Initiate Ack (INIT ACK) */ struct sctp_init { uint32_t initiate_tag; /* initiate tag */ uint32_t a_rwnd; /* a_rwnd */ uint16_t num_outbound_streams; /* OS */ uint16_t num_inbound_streams; /* MIS */ uint32_t initial_tsn; /* I-TSN */ /* optional param's follow */ } SCTP_PACKED; #define SCTP_IDENTIFICATION_SIZE 16 #define SCTP_ADDRESS_SIZE 4 #define SCTP_RESERVE_SPACE 6 /* state cookie header */ struct sctp_state_cookie { /* this is our definition... */ uint8_t identification[SCTP_IDENTIFICATION_SIZE]; /* id of who we are */ struct timeval time_entered; /* the time I built cookie */ uint32_t cookie_life; /* life I will award this cookie */ uint32_t tie_tag_my_vtag; /* my tag in old association */ uint32_t tie_tag_peer_vtag; /* peers tag in old association */ uint32_t peers_vtag; /* peers tag in INIT (for quick ref) */ uint32_t my_vtag; /* my tag in INIT-ACK (for quick ref) */ uint32_t address[SCTP_ADDRESS_SIZE]; /* 4 ints/128 bits */ uint32_t addr_type; /* address type */ uint32_t laddress[SCTP_ADDRESS_SIZE]; /* my local from address */ uint32_t laddr_type; /* my local from address type */ uint32_t scope_id; /* v6 scope id for link-locals */ uint16_t peerport; /* port address of the peer in the INIT */ uint16_t myport; /* my port address used in the INIT */ uint8_t ipv4_addr_legal;/* Are V4 addr legal? */ uint8_t ipv6_addr_legal;/* Are V6 addr legal? */ uint8_t local_scope; /* IPv6 local scope flag */ uint8_t site_scope; /* IPv6 site scope flag */ uint8_t ipv4_scope; /* IPv4 private addr scope */ uint8_t loopback_scope; /* loopback scope information */ uint8_t reserved[SCTP_RESERVE_SPACE]; /* Align to 64 bits */ /* * at the end is tacked on the INIT chunk and the INIT-ACK chunk * (minus the cookie). */ } SCTP_PACKED; - -/* Used for NAT state error cause */ -struct sctp_missing_nat_state { - uint16_t cause; - uint16_t length; - uint8_t data[]; -} SCTP_PACKED; - - -struct sctp_inv_mandatory_param { - uint16_t cause; - uint16_t length; - uint32_t num_param; - uint16_t param; - /* - * We include this to 0 it since only a missing cookie will cause - * this error. - */ - uint16_t resv; -} SCTP_PACKED; - -struct sctp_unresolv_addr { - uint16_t cause; - uint16_t length; - uint16_t addr_type; - uint16_t reserved; /* Only one invalid addr type */ -} SCTP_PACKED; - /* state cookie parameter */ struct sctp_state_cookie_param { struct sctp_paramhdr ph; struct sctp_state_cookie cookie; } SCTP_PACKED; struct sctp_init_chunk { struct sctp_chunkhdr ch; struct sctp_init init; } SCTP_PACKED; struct sctp_init_msg { struct sctphdr sh; struct sctp_init_chunk msg; } SCTP_PACKED; /* ... used for both INIT and INIT ACK */ #define sctp_init_ack sctp_init #define sctp_init_ack_chunk sctp_init_chunk #define sctp_init_ack_msg sctp_init_msg /* Selective Ack (SACK) */ struct sctp_gap_ack_block { uint16_t start; /* Gap Ack block start */ uint16_t end; /* Gap Ack block end */ } SCTP_PACKED; struct sctp_sack { uint32_t cum_tsn_ack; /* cumulative TSN Ack */ uint32_t a_rwnd; /* updated a_rwnd of sender */ uint16_t num_gap_ack_blks; /* number of Gap Ack blocks */ uint16_t num_dup_tsns; /* number of duplicate TSNs */ /* struct sctp_gap_ack_block's follow */ /* uint32_t duplicate_tsn's follow */ } SCTP_PACKED; struct sctp_sack_chunk { struct sctp_chunkhdr ch; struct sctp_sack sack; } SCTP_PACKED; struct sctp_nr_sack { uint32_t cum_tsn_ack; /* cumulative TSN Ack */ uint32_t a_rwnd; /* updated a_rwnd of sender */ uint16_t num_gap_ack_blks; /* number of Gap Ack blocks */ uint16_t num_nr_gap_ack_blks; /* number of NR Gap Ack blocks */ uint16_t num_dup_tsns; /* number of duplicate TSNs */ uint16_t reserved; /* not currently used */ /* struct sctp_gap_ack_block's follow */ /* uint32_t duplicate_tsn's follow */ } SCTP_PACKED; struct sctp_nr_sack_chunk { struct sctp_chunkhdr ch; struct sctp_nr_sack nr_sack; } SCTP_PACKED; /* Heartbeat Request (HEARTBEAT) */ struct sctp_heartbeat { struct sctp_heartbeat_info_param hb_info; } SCTP_PACKED; struct sctp_heartbeat_chunk { struct sctp_chunkhdr ch; struct sctp_heartbeat heartbeat; } SCTP_PACKED; /* ... used for Heartbeat Ack (HEARTBEAT ACK) */ #define sctp_heartbeat_ack sctp_heartbeat #define sctp_heartbeat_ack_chunk sctp_heartbeat_chunk /* Abort Asssociation (ABORT) */ struct sctp_abort_chunk { struct sctp_chunkhdr ch; /* optional error cause may follow */ } SCTP_PACKED; struct sctp_abort_msg { struct sctphdr sh; struct sctp_abort_chunk msg; } SCTP_PACKED; /* Shutdown Association (SHUTDOWN) */ struct sctp_shutdown_chunk { struct sctp_chunkhdr ch; uint32_t cumulative_tsn_ack; } SCTP_PACKED; /* Shutdown Acknowledgment (SHUTDOWN ACK) */ struct sctp_shutdown_ack_chunk { struct sctp_chunkhdr ch; } SCTP_PACKED; /* Operation Error (ERROR) */ struct sctp_error_chunk { struct sctp_chunkhdr ch; /* optional error causes follow */ } SCTP_PACKED; /* Cookie Echo (COOKIE ECHO) */ struct sctp_cookie_echo_chunk { struct sctp_chunkhdr ch; struct sctp_state_cookie cookie; } SCTP_PACKED; /* Cookie Acknowledgment (COOKIE ACK) */ struct sctp_cookie_ack_chunk { struct sctp_chunkhdr ch; } SCTP_PACKED; /* Explicit Congestion Notification Echo (ECNE) */ struct old_sctp_ecne_chunk { struct sctp_chunkhdr ch; uint32_t tsn; } SCTP_PACKED; struct sctp_ecne_chunk { struct sctp_chunkhdr ch; uint32_t tsn; uint32_t num_pkts_since_cwr; } SCTP_PACKED; /* Congestion Window Reduced (CWR) */ struct sctp_cwr_chunk { struct sctp_chunkhdr ch; uint32_t tsn; } SCTP_PACKED; /* Shutdown Complete (SHUTDOWN COMPLETE) */ struct sctp_shutdown_complete_chunk { struct sctp_chunkhdr ch; } SCTP_PACKED; -/* Oper error holding a stale cookie */ -struct sctp_stale_cookie_msg { - struct sctp_paramhdr ph;/* really an error cause */ - uint32_t time_usec; -} SCTP_PACKED; - struct sctp_adaptation_layer_indication { struct sctp_paramhdr ph; uint32_t indication; } SCTP_PACKED; -struct sctp_cookie_while_shutting_down { - struct sctphdr sh; - struct sctp_chunkhdr ch; - struct sctp_paramhdr ph;/* really an error cause */ -} SCTP_PACKED; - -struct sctp_shutdown_complete_msg { - struct sctphdr sh; - struct sctp_shutdown_complete_chunk shut_cmp; -} SCTP_PACKED; - /* * draft-ietf-tsvwg-addip-sctp */ /* Address/Stream Configuration Change (ASCONF) */ struct sctp_asconf_chunk { struct sctp_chunkhdr ch; uint32_t serial_number; /* lookup address parameter (mandatory) */ /* asconf parameters follow */ } SCTP_PACKED; /* Address/Stream Configuration Acknowledge (ASCONF ACK) */ struct sctp_asconf_ack_chunk { struct sctp_chunkhdr ch; uint32_t serial_number; /* asconf parameters follow */ } SCTP_PACKED; /* draft-ietf-tsvwg-prsctp */ /* Forward Cumulative TSN (FORWARD TSN) */ struct sctp_forward_tsn_chunk { struct sctp_chunkhdr ch; uint32_t new_cumulative_tsn; /* stream/sequence pairs (sctp_strseq) follow */ } SCTP_PACKED; struct sctp_strseq { uint16_t stream; uint16_t sequence; } SCTP_PACKED; struct sctp_forward_tsn_msg { struct sctphdr sh; struct sctp_forward_tsn_chunk msg; } SCTP_PACKED; /* should be a multiple of 4 - 1 aka 3/7/11 etc. */ #define SCTP_NUM_DB_TO_VERIFY 31 struct sctp_chunk_desc { uint8_t chunk_type; uint8_t data_bytes[SCTP_NUM_DB_TO_VERIFY]; uint32_t tsn_ifany; } SCTP_PACKED; struct sctp_pktdrop_chunk { struct sctp_chunkhdr ch; uint32_t bottle_bw; uint32_t current_onq; uint16_t trunc_len; uint16_t reserved; uint8_t data[]; } SCTP_PACKED; /**********STREAM RESET STUFF ******************/ struct sctp_stream_reset_request { struct sctp_paramhdr ph; uint32_t request_seq; } SCTP_PACKED; struct sctp_stream_reset_out_request { struct sctp_paramhdr ph; uint32_t request_seq; /* monotonically increasing seq no */ uint32_t response_seq; /* if a response, the resp seq no */ uint32_t send_reset_at_tsn; /* last TSN I assigned outbound */ uint16_t list_of_streams[]; /* if not all list of streams */ } SCTP_PACKED; struct sctp_stream_reset_in_request { struct sctp_paramhdr ph; uint32_t request_seq; uint16_t list_of_streams[]; /* if not all list of streams */ } SCTP_PACKED; struct sctp_stream_reset_tsn_request { struct sctp_paramhdr ph; uint32_t request_seq; } SCTP_PACKED; struct sctp_stream_reset_response { struct sctp_paramhdr ph; uint32_t response_seq; /* if a response, the resp seq no */ uint32_t result; } SCTP_PACKED; struct sctp_stream_reset_response_tsn { struct sctp_paramhdr ph; uint32_t response_seq; /* if a response, the resp seq no */ uint32_t result; uint32_t senders_next_tsn; uint32_t receivers_next_tsn; } SCTP_PACKED; struct sctp_stream_reset_add_strm { struct sctp_paramhdr ph; uint32_t request_seq; uint16_t number_of_streams; uint16_t reserved; } SCTP_PACKED; #define SCTP_STREAM_RESET_RESULT_NOTHING_TO_DO 0x00000000 /* XXX: unused */ #define SCTP_STREAM_RESET_RESULT_PERFORMED 0x00000001 #define SCTP_STREAM_RESET_RESULT_DENIED 0x00000002 #define SCTP_STREAM_RESET_RESULT_ERR__WRONG_SSN 0x00000003 /* XXX: unused */ #define SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS 0x00000004 #define SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO 0x00000005 #define SCTP_STREAM_RESET_RESULT_IN_PROGRESS 0x00000006 /* XXX: unused */ /* * convience structures, note that if you are making a request for specific * streams then the request will need to be an overlay structure. */ struct sctp_stream_reset_tsn_req { struct sctp_chunkhdr ch; struct sctp_stream_reset_tsn_request sr_req; } SCTP_PACKED; struct sctp_stream_reset_resp { struct sctp_chunkhdr ch; struct sctp_stream_reset_response sr_resp; } SCTP_PACKED; /* respone only valid with a TSN request */ struct sctp_stream_reset_resp_tsn { struct sctp_chunkhdr ch; struct sctp_stream_reset_response_tsn sr_resp; } SCTP_PACKED; /****************************************************/ /* * Authenticated chunks support draft-ietf-tsvwg-sctp-auth */ /* Should we make the max be 32? */ #define SCTP_RANDOM_MAX_SIZE 256 struct sctp_auth_random { struct sctp_paramhdr ph;/* type = 0x8002 */ uint8_t random_data[]; } SCTP_PACKED; struct sctp_auth_chunk_list { struct sctp_paramhdr ph;/* type = 0x8003 */ uint8_t chunk_types[]; } SCTP_PACKED; struct sctp_auth_hmac_algo { struct sctp_paramhdr ph;/* type = 0x8004 */ uint16_t hmac_ids[]; } SCTP_PACKED; struct sctp_auth_chunk { struct sctp_chunkhdr ch; uint16_t shared_key_id; uint16_t hmac_id; uint8_t hmac[]; } SCTP_PACKED; - -struct sctp_auth_invalid_hmac { - struct sctp_paramhdr ph; - uint16_t hmac_id; - uint16_t padding; -} SCTP_PACKED; /* * we pre-reserve enough room for a ECNE or CWR AND a SACK with no missing * pieces. If ENCE is missing we could have a couple of blocks. This way we * optimize so we MOST likely can bundle a SACK/ECN with the smallest size * data chunk I will split into. We could increase throughput slightly by * taking out these two but the 24-sack/8-CWR i.e. 32 bytes I pre-reserve I * feel is worth it for now. */ #ifndef SCTP_MAX_OVERHEAD #ifdef INET6 #define SCTP_MAX_OVERHEAD (sizeof(struct sctp_data_chunk) + \ sizeof(struct sctphdr) + \ sizeof(struct sctp_ecne_chunk) + \ sizeof(struct sctp_sack_chunk) + \ sizeof(struct ip6_hdr)) #define SCTP_MED_OVERHEAD (sizeof(struct sctp_data_chunk) + \ sizeof(struct sctphdr) + \ sizeof(struct ip6_hdr)) #define SCTP_MIN_OVERHEAD (sizeof(struct ip6_hdr) + \ sizeof(struct sctphdr)) #else #define SCTP_MAX_OVERHEAD (sizeof(struct sctp_data_chunk) + \ sizeof(struct sctphdr) + \ sizeof(struct sctp_ecne_chunk) + \ sizeof(struct sctp_sack_chunk) + \ sizeof(struct ip)) #define SCTP_MED_OVERHEAD (sizeof(struct sctp_data_chunk) + \ sizeof(struct sctphdr) + \ sizeof(struct ip)) #define SCTP_MIN_OVERHEAD (sizeof(struct ip) + \ sizeof(struct sctphdr)) #endif /* INET6 */ #endif /* !SCTP_MAX_OVERHEAD */ #define SCTP_MED_V4_OVERHEAD (sizeof(struct sctp_data_chunk) + \ sizeof(struct sctphdr) + \ sizeof(struct ip)) #define SCTP_MIN_V4_OVERHEAD (sizeof(struct ip) + \ sizeof(struct sctphdr)) #undef SCTP_PACKED #endif /* !__sctp_header_h__ */ Index: projects/iosched/sys/netinet/sctp_indata.c =================================================================== --- projects/iosched/sys/netinet/sctp_indata.c (revision 287721) +++ projects/iosched/sys/netinet/sctp_indata.c (revision 287722) @@ -1,5339 +1,5325 @@ /*- * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include /* * NOTES: On the outbound side of things I need to check the sack timer to * see if I should generate a sack into the chunk queue (if I have data to * send that is and will be sending it .. for bundling. * * The callback in sctp_usrreq.c will get called when the socket is read from. * This will cause sctp_service_queues() to get called on the top entry in * the list. */ void sctp_set_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc) { asoc->my_rwnd = sctp_calc_rwnd(stcb, asoc); } /* Calculate what the rwnd would be */ uint32_t sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc) { uint32_t calc = 0; /* * This is really set wrong with respect to a 1-2-m socket. Since * the sb_cc is the count that everyone as put up. When we re-write * sctp_soreceive then we will fix this so that ONLY this * associations data is taken into account. */ if (stcb->sctp_socket == NULL) return (calc); if (stcb->asoc.sb_cc == 0 && asoc->size_on_reasm_queue == 0 && asoc->size_on_all_streams == 0) { /* Full rwnd granted */ calc = max(SCTP_SB_LIMIT_RCV(stcb->sctp_socket), SCTP_MINIMAL_RWND); return (calc); } /* get actual space */ calc = (uint32_t) sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv); /* * take out what has NOT been put on socket queue and we yet hold * for putting up. */ calc = sctp_sbspace_sub(calc, (uint32_t) (asoc->size_on_reasm_queue + asoc->cnt_on_reasm_queue * MSIZE)); calc = sctp_sbspace_sub(calc, (uint32_t) (asoc->size_on_all_streams + asoc->cnt_on_all_streams * MSIZE)); if (calc == 0) { /* out of space */ return (calc); } /* what is the overhead of all these rwnd's */ calc = sctp_sbspace_sub(calc, stcb->asoc.my_rwnd_control_len); /* * If the window gets too small due to ctrl-stuff, reduce it to 1, * even it is 0. SWS engaged */ if (calc < stcb->asoc.my_rwnd_control_len) { calc = 1; } return (calc); } /* * Build out our readq entry based on the incoming packet. */ struct sctp_queued_to_read * sctp_build_readq_entry(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t tsn, uint32_t ppid, uint32_t context, uint16_t stream_no, uint16_t stream_seq, uint8_t flags, struct mbuf *dm) { struct sctp_queued_to_read *read_queue_e = NULL; sctp_alloc_a_readq(stcb, read_queue_e); if (read_queue_e == NULL) { goto failed_build; } read_queue_e->sinfo_stream = stream_no; read_queue_e->sinfo_ssn = stream_seq; read_queue_e->sinfo_flags = (flags << 8); read_queue_e->sinfo_ppid = ppid; read_queue_e->sinfo_context = context; read_queue_e->sinfo_timetolive = 0; read_queue_e->sinfo_tsn = tsn; read_queue_e->sinfo_cumtsn = tsn; read_queue_e->sinfo_assoc_id = sctp_get_associd(stcb); read_queue_e->whoFrom = net; read_queue_e->length = 0; atomic_add_int(&net->ref_count, 1); read_queue_e->data = dm; read_queue_e->spec_flags = 0; read_queue_e->tail_mbuf = NULL; read_queue_e->aux_data = NULL; read_queue_e->stcb = stcb; read_queue_e->port_from = stcb->rport; read_queue_e->do_not_ref_stcb = 0; read_queue_e->end_added = 0; read_queue_e->some_taken = 0; read_queue_e->pdapi_aborted = 0; failed_build: return (read_queue_e); } /* * Build out our readq entry based on the incoming packet. */ static struct sctp_queued_to_read * sctp_build_readq_entry_chk(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk) { struct sctp_queued_to_read *read_queue_e = NULL; sctp_alloc_a_readq(stcb, read_queue_e); if (read_queue_e == NULL) { goto failed_build; } read_queue_e->sinfo_stream = chk->rec.data.stream_number; read_queue_e->sinfo_ssn = chk->rec.data.stream_seq; read_queue_e->sinfo_flags = (chk->rec.data.rcv_flags << 8); read_queue_e->sinfo_ppid = chk->rec.data.payloadtype; read_queue_e->sinfo_context = stcb->asoc.context; read_queue_e->sinfo_timetolive = 0; read_queue_e->sinfo_tsn = chk->rec.data.TSN_seq; read_queue_e->sinfo_cumtsn = chk->rec.data.TSN_seq; read_queue_e->sinfo_assoc_id = sctp_get_associd(stcb); read_queue_e->whoFrom = chk->whoTo; read_queue_e->aux_data = NULL; read_queue_e->length = 0; atomic_add_int(&chk->whoTo->ref_count, 1); read_queue_e->data = chk->data; read_queue_e->tail_mbuf = NULL; read_queue_e->stcb = stcb; read_queue_e->port_from = stcb->rport; read_queue_e->spec_flags = 0; read_queue_e->do_not_ref_stcb = 0; read_queue_e->end_added = 0; read_queue_e->some_taken = 0; read_queue_e->pdapi_aborted = 0; failed_build: return (read_queue_e); } struct mbuf * sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo) { struct sctp_extrcvinfo *seinfo; struct sctp_sndrcvinfo *outinfo; struct sctp_rcvinfo *rcvinfo; struct sctp_nxtinfo *nxtinfo; struct cmsghdr *cmh; struct mbuf *ret; int len; int use_extended; int provide_nxt; if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT) && sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVRCVINFO) && sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) { /* user does not want any ancillary data */ return (NULL); } len = 0; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO)) { len += CMSG_SPACE(sizeof(struct sctp_rcvinfo)); } seinfo = (struct sctp_extrcvinfo *)sinfo; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO) && (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_AVAIL)) { provide_nxt = 1; len += CMSG_SPACE(sizeof(struct sctp_rcvinfo)); } else { provide_nxt = 0; } if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) { if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) { use_extended = 1; len += CMSG_SPACE(sizeof(struct sctp_extrcvinfo)); } else { use_extended = 0; len += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo)); } } else { use_extended = 0; } ret = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); if (ret == NULL) { /* No space */ return (ret); } SCTP_BUF_LEN(ret) = 0; /* We need a CMSG header followed by the struct */ cmh = mtod(ret, struct cmsghdr *); /* * Make sure that there is no un-initialized padding between the * cmsg header and cmsg data and after the cmsg data. */ memset(cmh, 0, len); if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO)) { cmh->cmsg_level = IPPROTO_SCTP; cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_rcvinfo)); cmh->cmsg_type = SCTP_RCVINFO; rcvinfo = (struct sctp_rcvinfo *)CMSG_DATA(cmh); rcvinfo->rcv_sid = sinfo->sinfo_stream; rcvinfo->rcv_ssn = sinfo->sinfo_ssn; rcvinfo->rcv_flags = sinfo->sinfo_flags; rcvinfo->rcv_ppid = sinfo->sinfo_ppid; rcvinfo->rcv_tsn = sinfo->sinfo_tsn; rcvinfo->rcv_cumtsn = sinfo->sinfo_cumtsn; rcvinfo->rcv_context = sinfo->sinfo_context; rcvinfo->rcv_assoc_id = sinfo->sinfo_assoc_id; cmh = (struct cmsghdr *)((caddr_t)cmh + CMSG_SPACE(sizeof(struct sctp_rcvinfo))); SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_rcvinfo)); } if (provide_nxt) { cmh->cmsg_level = IPPROTO_SCTP; cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_nxtinfo)); cmh->cmsg_type = SCTP_NXTINFO; nxtinfo = (struct sctp_nxtinfo *)CMSG_DATA(cmh); nxtinfo->nxt_sid = seinfo->sreinfo_next_stream; nxtinfo->nxt_flags = 0; if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_IS_UNORDERED) { nxtinfo->nxt_flags |= SCTP_UNORDERED; } if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_IS_NOTIFICATION) { nxtinfo->nxt_flags |= SCTP_NOTIFICATION; } if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_ISCOMPLETE) { nxtinfo->nxt_flags |= SCTP_COMPLETE; } nxtinfo->nxt_ppid = seinfo->sreinfo_next_ppid; nxtinfo->nxt_length = seinfo->sreinfo_next_length; nxtinfo->nxt_assoc_id = seinfo->sreinfo_next_aid; cmh = (struct cmsghdr *)((caddr_t)cmh + CMSG_SPACE(sizeof(struct sctp_nxtinfo))); SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_nxtinfo)); } if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) { cmh->cmsg_level = IPPROTO_SCTP; outinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmh); if (use_extended) { cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_extrcvinfo)); cmh->cmsg_type = SCTP_EXTRCV; memcpy(outinfo, sinfo, sizeof(struct sctp_extrcvinfo)); SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_extrcvinfo)); } else { cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); cmh->cmsg_type = SCTP_SNDRCV; *outinfo = *sinfo; SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo)); } } return (ret); } static void sctp_mark_non_revokable(struct sctp_association *asoc, uint32_t tsn) { uint32_t gap, i, cumackp1; int fnd = 0; if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) { return; } cumackp1 = asoc->cumulative_tsn + 1; if (SCTP_TSN_GT(cumackp1, tsn)) { /* * this tsn is behind the cum ack and thus we don't need to * worry about it being moved from one to the other. */ return; } SCTP_CALC_TSN_TO_GAP(gap, tsn, asoc->mapping_array_base_tsn); if (!SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) { SCTP_PRINTF("gap:%x tsn:%x\n", gap, tsn); sctp_print_mapping_array(asoc); #ifdef INVARIANTS panic("Things are really messed up now!!"); #endif } SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { asoc->highest_tsn_inside_nr_map = tsn; } if (tsn == asoc->highest_tsn_inside_map) { /* We must back down to see what the new highest is */ for (i = tsn - 1; SCTP_TSN_GE(i, asoc->mapping_array_base_tsn); i--) { SCTP_CALC_TSN_TO_GAP(gap, i, asoc->mapping_array_base_tsn); if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) { asoc->highest_tsn_inside_map = i; fnd = 1; break; } } if (!fnd) { asoc->highest_tsn_inside_map = asoc->mapping_array_base_tsn - 1; } } } /* * We are delivering currently from the reassembly queue. We must continue to * deliver until we either: 1) run out of space. 2) run out of sequential * TSN's 3) hit the SCTP_DATA_LAST_FRAG flag. */ static void sctp_service_reassembly(struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_tmit_chunk *chk, *nchk; uint16_t nxt_todel; uint16_t stream_no; int end = 0; int cntDel; struct sctp_queued_to_read *control, *ctl, *nctl; if (stcb == NULL) return; cntDel = stream_no = 0; if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) || (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) { /* socket above is long gone or going.. */ abandon: asoc->fragmented_delivery_inprogress = 0; TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) { TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); asoc->size_on_reasm_queue -= chk->send_size; sctp_ucount_decr(asoc->cnt_on_reasm_queue); /* * Lose the data pointer, since its in the socket * buffer */ if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } /* Now free the address and data */ sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); /* sa_ignore FREED_MEMORY */ } return; } SCTP_TCB_LOCK_ASSERT(stcb); TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) { if (chk->rec.data.TSN_seq != (asoc->tsn_last_delivered + 1)) { /* Can't deliver more :< */ return; } stream_no = chk->rec.data.stream_number; nxt_todel = asoc->strmin[stream_no].last_sequence_delivered + 1; if (nxt_todel != chk->rec.data.stream_seq && (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) { /* * Not the next sequence to deliver in its stream OR * unordered */ return; } if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { control = sctp_build_readq_entry_chk(stcb, chk); if (control == NULL) { /* out of memory? */ return; } /* save it off for our future deliveries */ stcb->asoc.control_pdapi = control; if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) end = 1; else end = 0; sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq); sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, end, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); cntDel++; } else { if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) end = 1; else end = 0; sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq); if (sctp_append_to_readq(stcb->sctp_ep, stcb, stcb->asoc.control_pdapi, chk->data, end, chk->rec.data.TSN_seq, &stcb->sctp_socket->so_rcv)) { /* * something is very wrong, either * control_pdapi is NULL, or the tail_mbuf * is corrupt, or there is a EOM already on * the mbuf chain. */ if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { goto abandon; } else { #ifdef INVARIANTS if ((stcb->asoc.control_pdapi == NULL) || (stcb->asoc.control_pdapi->tail_mbuf == NULL)) { panic("This should not happen control_pdapi NULL?"); } /* if we did not panic, it was a EOM */ panic("Bad chunking ??"); #else if ((stcb->asoc.control_pdapi == NULL) || (stcb->asoc.control_pdapi->tail_mbuf == NULL)) { SCTP_PRINTF("This should not happen control_pdapi NULL?\n"); } SCTP_PRINTF("Bad chunking ??\n"); SCTP_PRINTF("Dumping re-assembly queue this will probably hose the association\n"); #endif goto abandon; } } cntDel++; } /* pull it we did it */ TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) { asoc->fragmented_delivery_inprogress = 0; if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) { asoc->strmin[stream_no].last_sequence_delivered++; } if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0) { SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs); } } else if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { /* * turn the flag back on since we just delivered * yet another one. */ asoc->fragmented_delivery_inprogress = 1; } asoc->tsn_of_pdapi_last_delivered = chk->rec.data.TSN_seq; asoc->last_flags_delivered = chk->rec.data.rcv_flags; asoc->last_strm_seq_delivered = chk->rec.data.stream_seq; asoc->last_strm_no_delivered = chk->rec.data.stream_number; asoc->tsn_last_delivered = chk->rec.data.TSN_seq; asoc->size_on_reasm_queue -= chk->send_size; sctp_ucount_decr(asoc->cnt_on_reasm_queue); /* free up the chk */ chk->data = NULL; sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); if (asoc->fragmented_delivery_inprogress == 0) { /* * Now lets see if we can deliver the next one on * the stream */ struct sctp_stream_in *strm; strm = &asoc->strmin[stream_no]; nxt_todel = strm->last_sequence_delivered + 1; TAILQ_FOREACH_SAFE(ctl, &strm->inqueue, next, nctl) { /* Deliver more if we can. */ if (nxt_todel == ctl->sinfo_ssn) { TAILQ_REMOVE(&strm->inqueue, ctl, next); asoc->size_on_all_streams -= ctl->length; sctp_ucount_decr(asoc->cnt_on_all_streams); strm->last_sequence_delivered++; sctp_mark_non_revokable(asoc, ctl->sinfo_tsn); sctp_add_to_readq(stcb->sctp_ep, stcb, ctl, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); } else { break; } nxt_todel = strm->last_sequence_delivered + 1; } break; } } } /* * Queue the chunk either right into the socket buffer if it is the next one * to go OR put it in the correct place in the delivery queue. If we do * append to the so_buf, keep doing so until we are out of order. One big * question still remains, what to do when the socket buffer is FULL?? */ static void sctp_queue_data_to_stream(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_queued_to_read *control, int *abort_flag) { /* * FIX-ME maybe? What happens when the ssn wraps? If we are getting * all the data in one stream this could happen quite rapidly. One * could use the TSN to keep track of things, but this scheme breaks * down in the other type of stream useage that could occur. Send a * single msg to stream 0, send 4Billion messages to stream 1, now * send a message to stream 0. You have a situation where the TSN * has wrapped but not in the stream. Is this worth worrying about * or should we just change our queue sort at the bottom to be by * TSN. * * Could it also be legal for a peer to send ssn 1 with TSN 2 and ssn 2 * with TSN 1? If the peer is doing some sort of funky TSN/SSN * assignment this could happen... and I don't see how this would be * a violation. So for now I am undecided an will leave the sort by * SSN alone. Maybe a hybred approach is the answer * */ struct sctp_stream_in *strm; struct sctp_queued_to_read *at; int queue_needed; uint16_t nxt_todel; struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; queue_needed = 1; asoc->size_on_all_streams += control->length; sctp_ucount_incr(asoc->cnt_on_all_streams); strm = &asoc->strmin[control->sinfo_stream]; nxt_todel = strm->last_sequence_delivered + 1; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_INTO_STRD); } SCTPDBG(SCTP_DEBUG_INDATA1, "queue to stream called for sid:%u ssn:%u tsn:%u lastdel:%u nxt:%u\n", (uint32_t) control->sinfo_stream, (uint32_t) control->sinfo_ssn, (uint32_t) control->sinfo_tsn, (uint32_t) strm->last_sequence_delivered, (uint32_t) nxt_todel); if (SCTP_SSN_GE(strm->last_sequence_delivered, control->sinfo_ssn)) { /* The incoming sseq is behind where we last delivered? */ SCTPDBG(SCTP_DEBUG_INDATA1, "Duplicate S-SEQ:%d delivered:%d from peer, Abort association\n", control->sinfo_ssn, strm->last_sequence_delivered); protocol_error: /* * throw it in the stream so it gets cleaned up in * association destruction */ TAILQ_INSERT_HEAD(&strm->inqueue, control, next); snprintf(msg, sizeof(msg), "Delivered SSN=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", strm->last_sequence_delivered, control->sinfo_tsn, control->sinfo_stream, control->sinfo_ssn); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_1; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return; } #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { SCTP_SOCKET_UNLOCK(so, 1); return; } #endif if (nxt_todel == control->sinfo_ssn) { /* can be delivered right away? */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_IMMED_DEL); } /* EY it wont be queued if it could be delivered directly */ queue_needed = 0; asoc->size_on_all_streams -= control->length; sctp_ucount_decr(asoc->cnt_on_all_streams); strm->last_sequence_delivered++; sctp_mark_non_revokable(asoc, control->sinfo_tsn); sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_LOCKED); TAILQ_FOREACH_SAFE(control, &strm->inqueue, next, at) { /* all delivered */ nxt_todel = strm->last_sequence_delivered + 1; if (nxt_todel == control->sinfo_ssn) { TAILQ_REMOVE(&strm->inqueue, control, next); asoc->size_on_all_streams -= control->length; sctp_ucount_decr(asoc->cnt_on_all_streams); strm->last_sequence_delivered++; /* * We ignore the return of deliver_data here * since we always can hold the chunk on the * d-queue. And we have a finite number that * can be delivered from the strq. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_IMMED_DEL); } sctp_mark_non_revokable(asoc, control->sinfo_tsn); sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_LOCKED); continue; } break; } } if (queue_needed) { /* * Ok, we did not deliver this guy, find the correct place * to put it on the queue. */ if (SCTP_TSN_GE(asoc->cumulative_tsn, control->sinfo_tsn)) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif goto protocol_error; } if (TAILQ_EMPTY(&strm->inqueue)) { /* Empty queue */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_INSERT_HD); } TAILQ_INSERT_HEAD(&strm->inqueue, control, next); } else { TAILQ_FOREACH(at, &strm->inqueue, next) { if (SCTP_SSN_GT(at->sinfo_ssn, control->sinfo_ssn)) { /* * one in queue is bigger than the * new one, insert before this one */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { sctp_log_strm_del(control, at, SCTP_STR_LOG_FROM_INSERT_MD); } TAILQ_INSERT_BEFORE(at, control, next); break; } else if (at->sinfo_ssn == control->sinfo_ssn) { /* * Gak, He sent me a duplicate str * seq number */ /* * foo bar, I guess I will just free * this new guy, should we abort * too? FIX ME MAYBE? Or it COULD be * that the SSN's have wrapped. * Maybe I should compare to TSN * somehow... sigh for now just blow * away the chunk! */ if (control->data) sctp_m_freem(control->data); control->data = NULL; asoc->size_on_all_streams -= control->length; sctp_ucount_decr(asoc->cnt_on_all_streams); if (control->whoFrom) { sctp_free_remote_addr(control->whoFrom); control->whoFrom = NULL; } sctp_free_a_readq(stcb, control); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif return; } else { if (TAILQ_NEXT(at, next) == NULL) { /* * We are at the end, insert * it after this one */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { sctp_log_strm_del(control, at, SCTP_STR_LOG_FROM_INSERT_TL); } TAILQ_INSERT_AFTER(&strm->inqueue, at, control, next); break; } } } } } #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } /* * Returns two things: You get the total size of the deliverable parts of the * first fragmented message on the reassembly queue. And you get a 1 back if * all of the message is ready or a 0 back if the message is still incomplete */ static int sctp_is_all_msg_on_reasm(struct sctp_association *asoc, uint32_t * t_size) { struct sctp_tmit_chunk *chk; uint32_t tsn; *t_size = 0; chk = TAILQ_FIRST(&asoc->reasmqueue); if (chk == NULL) { /* nothing on the queue */ return (0); } if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0) { /* Not a first on the queue */ return (0); } tsn = chk->rec.data.TSN_seq; TAILQ_FOREACH(chk, &asoc->reasmqueue, sctp_next) { if (tsn != chk->rec.data.TSN_seq) { return (0); } *t_size += chk->send_size; if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) { return (1); } tsn++; } return (0); } static void sctp_deliver_reasm_check(struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_tmit_chunk *chk; uint16_t nxt_todel; uint32_t tsize, pd_point; doit_again: chk = TAILQ_FIRST(&asoc->reasmqueue); if (chk == NULL) { /* Huh? */ asoc->size_on_reasm_queue = 0; asoc->cnt_on_reasm_queue = 0; return; } if (asoc->fragmented_delivery_inprogress == 0) { nxt_todel = asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered + 1; if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) && (nxt_todel == chk->rec.data.stream_seq || (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED))) { /* * Yep the first one is here and its ok to deliver * but should we? */ if (stcb->sctp_socket) { pd_point = min(SCTP_SB_LIMIT_RCV(stcb->sctp_socket) >> SCTP_PARTIAL_DELIVERY_SHIFT, stcb->sctp_ep->partial_delivery_point); } else { pd_point = stcb->sctp_ep->partial_delivery_point; } if (sctp_is_all_msg_on_reasm(asoc, &tsize) || (tsize >= pd_point)) { /* * Yes, we setup to start reception, by * backing down the TSN just in case we * can't deliver. If we */ asoc->fragmented_delivery_inprogress = 1; asoc->tsn_last_delivered = chk->rec.data.TSN_seq - 1; asoc->str_of_pdapi = chk->rec.data.stream_number; asoc->ssn_of_pdapi = chk->rec.data.stream_seq; asoc->pdapi_ppid = chk->rec.data.payloadtype; asoc->fragment_flags = chk->rec.data.rcv_flags; sctp_service_reassembly(stcb, asoc); } } } else { /* * Service re-assembly will deliver stream data queued at * the end of fragmented delivery.. but it wont know to go * back and call itself again... we do that here with the * got doit_again */ sctp_service_reassembly(stcb, asoc); if (asoc->fragmented_delivery_inprogress == 0) { /* * finished our Fragmented delivery, could be more * waiting? */ goto doit_again; } } } /* * Dump onto the re-assembly queue, in its proper place. After dumping on the * queue, see if anthing can be delivered. If so pull it off (or as much as * we can. If we run out of space then we must dump what we can and set the * appropriate flag to say we queued what we could. */ static void sctp_queue_data_for_reasm(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_tmit_chunk *chk, int *abort_flag) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; uint32_t cum_ackp1, prev_tsn, post_tsn; struct sctp_tmit_chunk *at, *prev, *next; prev = next = NULL; cum_ackp1 = asoc->tsn_last_delivered + 1; if (TAILQ_EMPTY(&asoc->reasmqueue)) { /* This is the first one on the queue */ TAILQ_INSERT_HEAD(&asoc->reasmqueue, chk, sctp_next); /* * we do not check for delivery of anything when only one * fragment is here */ asoc->size_on_reasm_queue = chk->send_size; sctp_ucount_incr(asoc->cnt_on_reasm_queue); if (chk->rec.data.TSN_seq == cum_ackp1) { if (asoc->fragmented_delivery_inprogress == 0 && (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) != SCTP_DATA_FIRST_FRAG) { /* * An empty queue, no delivery inprogress, * we hit the next one and it does NOT have * a FIRST fragment mark. */ SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, its not first, no fragmented delivery in progress\n"); snprintf(msg, sizeof(msg), "Expected B-bit for TSN=%8.8x, SID=%4.4x, SSN=%4.4x", chk->rec.data.TSN_seq, chk->rec.data.stream_number, chk->rec.data.stream_seq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_2; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; } else if (asoc->fragmented_delivery_inprogress && (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == SCTP_DATA_FIRST_FRAG) { /* * We are doing a partial delivery and the * NEXT chunk MUST be either the LAST or * MIDDLE fragment NOT a FIRST */ SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS a first and fragmented delivery in progress\n"); snprintf(msg, sizeof(msg), "Didn't expect B-bit for TSN=%8.8x, SID=%4.4x, SSN=%4.4x", chk->rec.data.TSN_seq, chk->rec.data.stream_number, chk->rec.data.stream_seq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_3; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; } else if (asoc->fragmented_delivery_inprogress) { /* * Here we are ok with a MIDDLE or LAST * piece */ if (chk->rec.data.stream_number != asoc->str_of_pdapi) { /* Got to be the right STR No */ SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS not same stream number %d vs %d\n", chk->rec.data.stream_number, asoc->str_of_pdapi); snprintf(msg, sizeof(msg), "Expected SID=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", asoc->str_of_pdapi, chk->rec.data.TSN_seq, chk->rec.data.stream_number, chk->rec.data.stream_seq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_4; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; } else if ((asoc->fragment_flags & SCTP_DATA_UNORDERED) != SCTP_DATA_UNORDERED && chk->rec.data.stream_seq != asoc->ssn_of_pdapi) { /* Got to be the right STR Seq */ SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS not same stream seq %d vs %d\n", chk->rec.data.stream_seq, asoc->ssn_of_pdapi); snprintf(msg, sizeof(msg), "Expected SSN=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", asoc->ssn_of_pdapi, chk->rec.data.TSN_seq, chk->rec.data.stream_number, chk->rec.data.stream_seq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_5; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; } } } return; } /* Find its place */ TAILQ_FOREACH(at, &asoc->reasmqueue, sctp_next) { if (SCTP_TSN_GT(at->rec.data.TSN_seq, chk->rec.data.TSN_seq)) { /* * one in queue is bigger than the new one, insert * before this one */ /* A check */ asoc->size_on_reasm_queue += chk->send_size; sctp_ucount_incr(asoc->cnt_on_reasm_queue); next = at; TAILQ_INSERT_BEFORE(at, chk, sctp_next); break; } else if (at->rec.data.TSN_seq == chk->rec.data.TSN_seq) { /* Gak, He sent me a duplicate str seq number */ /* * foo bar, I guess I will just free this new guy, * should we abort too? FIX ME MAYBE? Or it COULD be * that the SSN's have wrapped. Maybe I should * compare to TSN somehow... sigh for now just blow * away the chunk! */ if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; } else { prev = at; if (TAILQ_NEXT(at, sctp_next) == NULL) { /* * We are at the end, insert it after this * one */ /* check it first */ asoc->size_on_reasm_queue += chk->send_size; sctp_ucount_incr(asoc->cnt_on_reasm_queue); TAILQ_INSERT_AFTER(&asoc->reasmqueue, at, chk, sctp_next); break; } } } /* Now the audits */ if (prev) { prev_tsn = chk->rec.data.TSN_seq - 1; if (prev_tsn == prev->rec.data.TSN_seq) { /* * Ok the one I am dropping onto the end is the * NEXT. A bit of valdiation here. */ if ((prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == SCTP_DATA_FIRST_FRAG || (prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == SCTP_DATA_MIDDLE_FRAG) { /* * Insert chk MUST be a MIDDLE or LAST * fragment */ if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == SCTP_DATA_FIRST_FRAG) { SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - It can be a midlle or last but not a first\n"); SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it's a FIRST!\n"); snprintf(msg, sizeof(msg), "Can't handle B-bit, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", chk->rec.data.TSN_seq, chk->rec.data.stream_number, chk->rec.data.stream_seq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_6; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return; } if (chk->rec.data.stream_number != prev->rec.data.stream_number) { /* * Huh, need the correct STR here, * they must be the same. */ SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, Evil plot, sid:%d not the same as at:%d\n", chk->rec.data.stream_number, prev->rec.data.stream_number); snprintf(msg, sizeof(msg), "Expect SID=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", prev->rec.data.stream_number, chk->rec.data.TSN_seq, chk->rec.data.stream_number, chk->rec.data.stream_seq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_7; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return; } if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) != (prev->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) { /* * Huh, need the same ordering here, * they must be the same. */ SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, Evil plot, U-bit not constant\n"); snprintf(msg, sizeof(msg), "Expect U-bit=%d for TSN=%8.8x, got U-bit=%d", (prev->rec.data.rcv_flags & SCTP_DATA_UNORDERED) ? 1 : 0, chk->rec.data.TSN_seq, (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) ? 1 : 0); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_8; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return; } if ((prev->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0 && chk->rec.data.stream_seq != prev->rec.data.stream_seq) { /* * Huh, need the correct STR here, * they must be the same. */ SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, Evil plot, sseq:%d not the same as at:%d\n", chk->rec.data.stream_seq, prev->rec.data.stream_seq); snprintf(msg, sizeof(msg), "Expect SSN=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", prev->rec.data.stream_seq, chk->rec.data.TSN_seq, chk->rec.data.stream_number, chk->rec.data.stream_seq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_9; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return; } } else if ((prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == SCTP_DATA_LAST_FRAG) { /* Insert chk MUST be a FIRST */ if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) != SCTP_DATA_FIRST_FRAG) { SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, evil plot, its not FIRST and it must be!\n"); snprintf(msg, sizeof(msg), "Expect B-bit, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", chk->rec.data.TSN_seq, chk->rec.data.stream_number, chk->rec.data.stream_seq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_10; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return; } } } } if (next) { post_tsn = chk->rec.data.TSN_seq + 1; if (post_tsn == next->rec.data.TSN_seq) { /* * Ok the one I am inserting ahead of is my NEXT * one. A bit of valdiation here. */ if (next->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { /* Insert chk MUST be a last fragment */ if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) != SCTP_DATA_LAST_FRAG) { SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Next is FIRST, we must be LAST\n"); SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, its not a last!\n"); snprintf(msg, sizeof(msg), "Expect only E-bit, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", chk->rec.data.TSN_seq, chk->rec.data.stream_number, chk->rec.data.stream_seq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_11; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return; } } else if ((next->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == SCTP_DATA_MIDDLE_FRAG || (next->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == SCTP_DATA_LAST_FRAG) { /* * Insert chk CAN be MIDDLE or FIRST NOT * LAST */ if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == SCTP_DATA_LAST_FRAG) { SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Next is a MIDDLE/LAST\n"); SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, new prev chunk is a LAST\n"); snprintf(msg, sizeof(msg), "Didn't expect E-bit, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", chk->rec.data.TSN_seq, chk->rec.data.stream_number, chk->rec.data.stream_seq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_12; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return; } if (chk->rec.data.stream_number != next->rec.data.stream_number) { /* * Huh, need the correct STR here, * they must be the same. */ SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Gak, Evil plot, ssn:%d not the same as at:%d\n", chk->rec.data.stream_number, next->rec.data.stream_number); snprintf(msg, sizeof(msg), "Required SID %4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", next->rec.data.stream_number, chk->rec.data.TSN_seq, chk->rec.data.stream_number, chk->rec.data.stream_seq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_13; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return; } if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) != (next->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) { /* * Huh, need the same ordering here, * they must be the same. */ SCTPDBG(SCTP_DEBUG_INDATA1, "Next check - Gak, Evil plot, U-bit not constant\n"); snprintf(msg, sizeof(msg), "Expect U-bit=%d for TSN=%8.8x, got U-bit=%d", (next->rec.data.rcv_flags & SCTP_DATA_UNORDERED) ? 1 : 0, chk->rec.data.TSN_seq, (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) ? 1 : 0); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_14; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return; } if ((next->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0 && chk->rec.data.stream_seq != next->rec.data.stream_seq) { /* * Huh, need the correct STR here, * they must be the same. */ SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Gak, Evil plot, sseq:%d not the same as at:%d\n", chk->rec.data.stream_seq, next->rec.data.stream_seq); snprintf(msg, sizeof(msg), "Required SSN %4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", next->rec.data.stream_seq, chk->rec.data.TSN_seq, chk->rec.data.stream_number, chk->rec.data.stream_seq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_15; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return; } } } } /* Do we need to do some delivery? check */ sctp_deliver_reasm_check(stcb, asoc); } /* * This is an unfortunate routine. It checks to make sure a evil guy is not * stuffing us full of bad packet fragments. A broken peer could also do this * but this is doubtful. It is to bad I must worry about evil crackers sigh * :< more cycles. */ static int sctp_does_tsn_belong_to_reasm(struct sctp_association *asoc, uint32_t TSN_seq) { struct sctp_tmit_chunk *at; uint32_t tsn_est; TAILQ_FOREACH(at, &asoc->reasmqueue, sctp_next) { if (SCTP_TSN_GT(TSN_seq, at->rec.data.TSN_seq)) { /* is it one bigger? */ tsn_est = at->rec.data.TSN_seq + 1; if (tsn_est == TSN_seq) { /* yep. It better be a last then */ if ((at->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) != SCTP_DATA_LAST_FRAG) { /* * Ok this guy belongs next to a guy * that is NOT last, it should be a * middle/last, not a complete * chunk. */ return (1); } else { /* * This guy is ok since its a LAST * and the new chunk is a fully * self- contained one. */ return (0); } } } else if (TSN_seq == at->rec.data.TSN_seq) { /* Software error since I have a dup? */ return (1); } else { /* * Ok, 'at' is larger than new chunk but does it * need to be right before it. */ tsn_est = TSN_seq + 1; if (tsn_est == at->rec.data.TSN_seq) { /* Yep, It better be a first */ if ((at->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) != SCTP_DATA_FIRST_FRAG) { return (1); } else { return (0); } } } } return (0); } static int sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, struct mbuf **m, int offset, struct sctp_data_chunk *ch, int chk_length, struct sctp_nets *net, uint32_t * high_tsn, int *abort_flag, int *break_flag, int last_chunk) { /* Process a data chunk */ /* struct sctp_tmit_chunk *chk; */ struct sctp_tmit_chunk *chk; uint32_t tsn, gap; struct mbuf *dmbuf; int the_len; int need_reasm_check = 0; uint16_t strmno, strmseq; struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; struct sctp_queued_to_read *control; int ordered; uint32_t protocol_id; uint8_t chunk_flags; struct sctp_stream_reset_list *liste; chk = NULL; tsn = ntohl(ch->dp.tsn); chunk_flags = ch->ch.chunk_flags; if ((chunk_flags & SCTP_DATA_SACK_IMMEDIATELY) == SCTP_DATA_SACK_IMMEDIATELY) { asoc->send_sack = 1; } protocol_id = ch->dp.protocol_id; ordered = ((chunk_flags & SCTP_DATA_UNORDERED) == 0); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(tsn, asoc->cumulative_tsn, asoc->highest_tsn_inside_map, SCTP_MAP_TSN_ENTERS); } if (stcb == NULL) { return (0); } SCTP_LTRACE_CHK(stcb->sctp_ep, stcb, ch->ch.chunk_type, tsn); if (SCTP_TSN_GE(asoc->cumulative_tsn, tsn)) { /* It is a duplicate */ SCTP_STAT_INCR(sctps_recvdupdata); if (asoc->numduptsns < SCTP_MAX_DUP_TSNS) { /* Record a dup for the next outbound sack */ asoc->dup_tsns[asoc->numduptsns] = tsn; asoc->numduptsns++; } asoc->send_sack = 1; return (0); } /* Calculate the number of TSN's between the base and this TSN */ SCTP_CALC_TSN_TO_GAP(gap, tsn, asoc->mapping_array_base_tsn); if (gap >= (SCTP_MAPPING_ARRAY << 3)) { /* Can't hold the bit in the mapping at max array, toss it */ return (0); } if (gap >= (uint32_t) (asoc->mapping_array_size << 3)) { SCTP_TCB_LOCK_ASSERT(stcb); if (sctp_expand_mapping_array(asoc, gap)) { /* Can't expand, drop it */ return (0); } } if (SCTP_TSN_GT(tsn, *high_tsn)) { *high_tsn = tsn; } /* See if we have received this one already */ if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap) || SCTP_IS_TSN_PRESENT(asoc->nr_mapping_array, gap)) { SCTP_STAT_INCR(sctps_recvdupdata); if (asoc->numduptsns < SCTP_MAX_DUP_TSNS) { /* Record a dup for the next outbound sack */ asoc->dup_tsns[asoc->numduptsns] = tsn; asoc->numduptsns++; } asoc->send_sack = 1; return (0); } /* * Check to see about the GONE flag, duplicates would cause a sack * to be sent up above */ if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET))) { /* * wait a minute, this guy is gone, there is no longer a * receiver. Send peer an ABORT! */ op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return (0); } /* * Now before going further we see if there is room. If NOT then we * MAY let one through only IF this TSN is the one we are waiting * for on a partial delivery API. */ /* now do the tests */ if (((asoc->cnt_on_all_streams + asoc->cnt_on_reasm_queue + asoc->cnt_msg_on_sb) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)) || (((int)asoc->my_rwnd) <= 0)) { /* * When we have NO room in the rwnd we check to make sure * the reader is doing its job... */ if (stcb->sctp_socket->so_rcv.sb_cc) { /* some to read, wake-up */ #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { /* assoc was freed while we were unlocked */ SCTP_SOCKET_UNLOCK(so, 1); return (0); } #endif sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } /* now is it in the mapping array of what we have accepted? */ if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_map) && SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { /* Nope not in the valid range dump it */ sctp_set_rwnd(stcb, asoc); if ((asoc->cnt_on_all_streams + asoc->cnt_on_reasm_queue + asoc->cnt_msg_on_sb) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)) { SCTP_STAT_INCR(sctps_datadropchklmt); } else { SCTP_STAT_INCR(sctps_datadroprwnd); } *break_flag = 1; return (0); } } strmno = ntohs(ch->dp.stream_id); if (strmno >= asoc->streamincnt) { - struct sctp_paramhdr *phdr; - struct mbuf *mb; + struct sctp_error_invalid_stream *cause; - mb = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) * 2), + op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_invalid_stream), 0, M_NOWAIT, 1, MT_DATA); - if (mb != NULL) { + if (op_err != NULL) { /* add some space up front so prepend will work well */ - SCTP_BUF_RESV_UF(mb, sizeof(struct sctp_chunkhdr)); - phdr = mtod(mb, struct sctp_paramhdr *); + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); + cause = mtod(op_err, struct sctp_error_invalid_stream *); /* * Error causes are just param's and this one has * two back to back phdr, one with the error type * and size, the other with the streamid and a rsvd */ - SCTP_BUF_LEN(mb) = (sizeof(struct sctp_paramhdr) * 2); - phdr->param_type = htons(SCTP_CAUSE_INVALID_STREAM); - phdr->param_length = - htons(sizeof(struct sctp_paramhdr) * 2); - phdr++; - /* We insert the stream in the type field */ - phdr->param_type = ch->dp.stream_id; - /* And set the length to 0 for the rsvd field */ - phdr->param_length = 0; - sctp_queue_op_err(stcb, mb); + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_invalid_stream); + cause->cause.code = htons(SCTP_CAUSE_INVALID_STREAM); + cause->cause.length = htons(sizeof(struct sctp_error_invalid_stream)); + cause->stream_id = ch->dp.stream_id; + cause->reserved = htons(0); + sctp_queue_op_err(stcb, op_err); } SCTP_STAT_INCR(sctps_badsid); SCTP_TCB_LOCK_ASSERT(stcb); SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { asoc->highest_tsn_inside_nr_map = tsn; } if (tsn == (asoc->cumulative_tsn + 1)) { /* Update cum-ack */ asoc->cumulative_tsn = tsn; } return (0); } /* * Before we continue lets validate that we are not being fooled by * an evil attacker. We can only have 4k chunks based on our TSN * spread allowed by the mapping array 512 * 8 bits, so there is no * way our stream sequence numbers could have wrapped. We of course * only validate the FIRST fragment so the bit must be set. */ strmseq = ntohs(ch->dp.stream_sequence); #ifdef SCTP_ASOCLOG_OF_TSNS SCTP_TCB_LOCK_ASSERT(stcb); if (asoc->tsn_in_at >= SCTP_TSN_LOG_SIZE) { asoc->tsn_in_at = 0; asoc->tsn_in_wrapped = 1; } asoc->in_tsnlog[asoc->tsn_in_at].tsn = tsn; asoc->in_tsnlog[asoc->tsn_in_at].strm = strmno; asoc->in_tsnlog[asoc->tsn_in_at].seq = strmseq; asoc->in_tsnlog[asoc->tsn_in_at].sz = chk_length; asoc->in_tsnlog[asoc->tsn_in_at].flgs = chunk_flags; asoc->in_tsnlog[asoc->tsn_in_at].stcb = (void *)stcb; asoc->in_tsnlog[asoc->tsn_in_at].in_pos = asoc->tsn_in_at; asoc->in_tsnlog[asoc->tsn_in_at].in_out = 1; asoc->tsn_in_at++; #endif if ((chunk_flags & SCTP_DATA_FIRST_FRAG) && (TAILQ_EMPTY(&asoc->resetHead)) && (chunk_flags & SCTP_DATA_UNORDERED) == 0 && SCTP_SSN_GE(asoc->strmin[strmno].last_sequence_delivered, strmseq)) { /* The incoming sseq is behind where we last delivered? */ SCTPDBG(SCTP_DEBUG_INDATA1, "EVIL/Broken-Dup S-SEQ:%d delivered:%d from peer, Abort!\n", strmseq, asoc->strmin[strmno].last_sequence_delivered); snprintf(msg, sizeof(msg), "Delivered SSN=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", asoc->strmin[strmno].last_sequence_delivered, tsn, strmno, strmseq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_16; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return (0); } /************************************ * From here down we may find ch-> invalid * so its a good idea NOT to use it. *************************************/ the_len = (chk_length - sizeof(struct sctp_data_chunk)); if (last_chunk == 0) { dmbuf = SCTP_M_COPYM(*m, (offset + sizeof(struct sctp_data_chunk)), the_len, M_NOWAIT); #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(dmbuf, SCTP_MBUF_ICOPY); } #endif } else { /* We can steal the last chunk */ int l_len; dmbuf = *m; /* lop off the top part */ m_adj(dmbuf, (offset + sizeof(struct sctp_data_chunk))); if (SCTP_BUF_NEXT(dmbuf) == NULL) { l_len = SCTP_BUF_LEN(dmbuf); } else { /* * need to count up the size hopefully does not hit * this to often :-0 */ struct mbuf *lat; l_len = 0; for (lat = dmbuf; lat; lat = SCTP_BUF_NEXT(lat)) { l_len += SCTP_BUF_LEN(lat); } } if (l_len > the_len) { /* Trim the end round bytes off too */ m_adj(dmbuf, -(l_len - the_len)); } } if (dmbuf == NULL) { SCTP_STAT_INCR(sctps_nomem); return (0); } if ((chunk_flags & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG && asoc->fragmented_delivery_inprogress == 0 && TAILQ_EMPTY(&asoc->resetHead) && ((ordered == 0) || ((uint16_t) (asoc->strmin[strmno].last_sequence_delivered + 1) == strmseq && TAILQ_EMPTY(&asoc->strmin[strmno].inqueue)))) { /* Candidate for express delivery */ /* * Its not fragmented, No PD-API is up, Nothing in the * delivery queue, Its un-ordered OR ordered and the next to * deliver AND nothing else is stuck on the stream queue, * And there is room for it in the socket buffer. Lets just * stuff it up the buffer.... */ /* It would be nice to avoid this copy if we could :< */ sctp_alloc_a_readq(stcb, control); sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn, protocol_id, strmno, strmseq, chunk_flags, dmbuf); if (control == NULL) { goto failed_express_del; } SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { asoc->highest_tsn_inside_nr_map = tsn; } sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); if ((chunk_flags & SCTP_DATA_UNORDERED) == 0) { /* for ordered, bump what we delivered */ asoc->strmin[strmno].last_sequence_delivered++; } SCTP_STAT_INCR(sctps_recvexpress); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { sctp_log_strm_del_alt(stcb, tsn, strmseq, strmno, SCTP_STR_LOG_FROM_EXPRS_DEL); } control = NULL; goto finish_express_del; } failed_express_del: /* If we reach here this is a new chunk */ chk = NULL; control = NULL; /* Express for fragmented delivery? */ if ((asoc->fragmented_delivery_inprogress) && (stcb->asoc.control_pdapi) && (asoc->str_of_pdapi == strmno) && (asoc->ssn_of_pdapi == strmseq) ) { control = stcb->asoc.control_pdapi; if ((chunk_flags & SCTP_DATA_FIRST_FRAG) == SCTP_DATA_FIRST_FRAG) { /* Can't be another first? */ goto failed_pdapi_express_del; } if (tsn == (control->sinfo_tsn + 1)) { /* Yep, we can add it on */ int end = 0; if (chunk_flags & SCTP_DATA_LAST_FRAG) { end = 1; } if (sctp_append_to_readq(stcb->sctp_ep, stcb, control, dmbuf, end, tsn, &stcb->sctp_socket->so_rcv)) { SCTP_PRINTF("Append fails end:%d\n", end); goto failed_pdapi_express_del; } SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { asoc->highest_tsn_inside_nr_map = tsn; } SCTP_STAT_INCR(sctps_recvexpressm); asoc->tsn_last_delivered = tsn; asoc->fragment_flags = chunk_flags; asoc->tsn_of_pdapi_last_delivered = tsn; asoc->last_flags_delivered = chunk_flags; asoc->last_strm_seq_delivered = strmseq; asoc->last_strm_no_delivered = strmno; if (end) { /* clean up the flags and such */ asoc->fragmented_delivery_inprogress = 0; if ((chunk_flags & SCTP_DATA_UNORDERED) == 0) { asoc->strmin[strmno].last_sequence_delivered++; } stcb->asoc.control_pdapi = NULL; if (TAILQ_EMPTY(&asoc->reasmqueue) == 0) { /* * There could be another message * ready */ need_reasm_check = 1; } } control = NULL; goto finish_express_del; } } failed_pdapi_express_del: control = NULL; if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) { SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { asoc->highest_tsn_inside_nr_map = tsn; } } else { SCTP_SET_TSN_PRESENT(asoc->mapping_array, gap); if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_map)) { asoc->highest_tsn_inside_map = tsn; } } if ((chunk_flags & SCTP_DATA_NOT_FRAG) != SCTP_DATA_NOT_FRAG) { sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* No memory so we drop the chunk */ SCTP_STAT_INCR(sctps_nomem); if (last_chunk == 0) { /* we copied it, free the copy */ sctp_m_freem(dmbuf); } return (0); } chk->rec.data.TSN_seq = tsn; chk->no_fr_allowed = 0; chk->rec.data.stream_seq = strmseq; chk->rec.data.stream_number = strmno; chk->rec.data.payloadtype = protocol_id; chk->rec.data.context = stcb->asoc.context; chk->rec.data.doing_fast_retransmit = 0; chk->rec.data.rcv_flags = chunk_flags; chk->asoc = asoc; chk->send_size = the_len; chk->whoTo = net; atomic_add_int(&net->ref_count, 1); chk->data = dmbuf; } else { sctp_alloc_a_readq(stcb, control); sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn, protocol_id, strmno, strmseq, chunk_flags, dmbuf); if (control == NULL) { /* No memory so we drop the chunk */ SCTP_STAT_INCR(sctps_nomem); if (last_chunk == 0) { /* we copied it, free the copy */ sctp_m_freem(dmbuf); } return (0); } control->length = the_len; } /* Mark it as received */ /* Now queue it where it belongs */ if (control != NULL) { /* First a sanity check */ if (asoc->fragmented_delivery_inprogress) { /* * Ok, we have a fragmented delivery in progress if * this chunk is next to deliver OR belongs in our * view to the reassembly, the peer is evil or * broken. */ uint32_t estimate_tsn; estimate_tsn = asoc->tsn_last_delivered + 1; if (TAILQ_EMPTY(&asoc->reasmqueue) && (estimate_tsn == control->sinfo_tsn)) { /* Evil/Broke peer */ sctp_m_freem(control->data); control->data = NULL; if (control->whoFrom) { sctp_free_remote_addr(control->whoFrom); control->whoFrom = NULL; } sctp_free_a_readq(stcb, control); snprintf(msg, sizeof(msg), "Reas. queue emtpy, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", tsn, strmno, strmseq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_17; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; if (last_chunk) { *m = NULL; } return (0); } else { if (sctp_does_tsn_belong_to_reasm(asoc, control->sinfo_tsn)) { sctp_m_freem(control->data); control->data = NULL; if (control->whoFrom) { sctp_free_remote_addr(control->whoFrom); control->whoFrom = NULL; } sctp_free_a_readq(stcb, control); snprintf(msg, sizeof(msg), "PD ongoing, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", tsn, strmno, strmseq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_18; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; if (last_chunk) { *m = NULL; } return (0); } } } else { /* No PDAPI running */ if (!TAILQ_EMPTY(&asoc->reasmqueue)) { /* * Reassembly queue is NOT empty validate * that this tsn does not need to be in * reasembly queue. If it does then our peer * is broken or evil. */ if (sctp_does_tsn_belong_to_reasm(asoc, control->sinfo_tsn)) { sctp_m_freem(control->data); control->data = NULL; if (control->whoFrom) { sctp_free_remote_addr(control->whoFrom); control->whoFrom = NULL; } sctp_free_a_readq(stcb, control); snprintf(msg, sizeof(msg), "No PD ongoing, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", tsn, strmno, strmseq); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_19; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; if (last_chunk) { *m = NULL; } return (0); } } } /* ok, if we reach here we have passed the sanity checks */ if (chunk_flags & SCTP_DATA_UNORDERED) { /* queue directly into socket buffer */ sctp_mark_non_revokable(asoc, control->sinfo_tsn); sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); } else { /* * Special check for when streams are resetting. We * could be more smart about this and check the * actual stream to see if it is not being reset.. * that way we would not create a HOLB when amongst * streams being reset and those not being reset. * * We take complete messages that have a stream reset * intervening (aka the TSN is after where our * cum-ack needs to be) off and put them on a * pending_reply_queue. The reassembly ones we do * not have to worry about since they are all sorted * and proceessed by TSN order. It is only the * singletons I must worry about. */ if (((liste = TAILQ_FIRST(&asoc->resetHead)) != NULL) && SCTP_TSN_GT(tsn, liste->tsn)) { /* * yep its past where we need to reset... go * ahead and queue it. */ if (TAILQ_EMPTY(&asoc->pending_reply_queue)) { /* first one on */ TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next); } else { struct sctp_queued_to_read *ctlOn, *nctlOn; unsigned char inserted = 0; TAILQ_FOREACH_SAFE(ctlOn, &asoc->pending_reply_queue, next, nctlOn) { if (SCTP_TSN_GT(control->sinfo_tsn, ctlOn->sinfo_tsn)) { continue; } else { /* found it */ TAILQ_INSERT_BEFORE(ctlOn, control, next); inserted = 1; break; } } if (inserted == 0) { /* * must be put at end, use * prevP (all setup from * loop) to setup nextP. */ TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next); } } } else { sctp_queue_data_to_stream(stcb, asoc, control, abort_flag); if (*abort_flag) { if (last_chunk) { *m = NULL; } return (0); } } } } else { /* Into the re-assembly queue */ sctp_queue_data_for_reasm(stcb, asoc, chk, abort_flag); if (*abort_flag) { /* * the assoc is now gone and chk was put onto the * reasm queue, which has all been freed. */ if (last_chunk) { *m = NULL; } return (0); } } finish_express_del: if (tsn == (asoc->cumulative_tsn + 1)) { /* Update cum-ack */ asoc->cumulative_tsn = tsn; } if (last_chunk) { *m = NULL; } if (ordered) { SCTP_STAT_INCR_COUNTER64(sctps_inorderchunks); } else { SCTP_STAT_INCR_COUNTER64(sctps_inunorderchunks); } SCTP_STAT_INCR(sctps_recvdata); /* Set it present please */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { sctp_log_strm_del_alt(stcb, tsn, strmseq, strmno, SCTP_STR_LOG_FROM_MARK_TSN); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(asoc->mapping_array_base_tsn, asoc->cumulative_tsn, asoc->highest_tsn_inside_map, SCTP_MAP_PREPARE_SLIDE); } /* check the special flag for stream resets */ if (((liste = TAILQ_FIRST(&asoc->resetHead)) != NULL) && SCTP_TSN_GE(asoc->cumulative_tsn, liste->tsn)) { /* * we have finished working through the backlogged TSN's now * time to reset streams. 1: call reset function. 2: free * pending_reply space 3: distribute any chunks in * pending_reply_queue. */ struct sctp_queued_to_read *ctl, *nctl; sctp_reset_in_stream(stcb, liste->number_entries, liste->list_of_streams); TAILQ_REMOVE(&asoc->resetHead, liste, next_resp); sctp_send_deferred_reset_response(stcb, liste, SCTP_STREAM_RESET_RESULT_PERFORMED); SCTP_FREE(liste, SCTP_M_STRESET); /* sa_ignore FREED_MEMORY */ liste = TAILQ_FIRST(&asoc->resetHead); if (TAILQ_EMPTY(&asoc->resetHead)) { /* All can be removed */ TAILQ_FOREACH_SAFE(ctl, &asoc->pending_reply_queue, next, nctl) { TAILQ_REMOVE(&asoc->pending_reply_queue, ctl, next); sctp_queue_data_to_stream(stcb, asoc, ctl, abort_flag); if (*abort_flag) { return (0); } } } else { TAILQ_FOREACH_SAFE(ctl, &asoc->pending_reply_queue, next, nctl) { if (SCTP_TSN_GT(ctl->sinfo_tsn, liste->tsn)) { break; } /* * if ctl->sinfo_tsn is <= liste->tsn we can * process it which is the NOT of * ctl->sinfo_tsn > liste->tsn */ TAILQ_REMOVE(&asoc->pending_reply_queue, ctl, next); sctp_queue_data_to_stream(stcb, asoc, ctl, abort_flag); if (*abort_flag) { return (0); } } } /* * Now service re-assembly to pick up anything that has been * held on reassembly queue? */ sctp_deliver_reasm_check(stcb, asoc); need_reasm_check = 0; } if (need_reasm_check) { /* Another one waits ? */ sctp_deliver_reasm_check(stcb, asoc); } return (1); } int8_t sctp_map_lookup_tab[256] = { 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 8 }; void sctp_slide_mapping_arrays(struct sctp_tcb *stcb) { /* * Now we also need to check the mapping array in a couple of ways. * 1) Did we move the cum-ack point? * * When you first glance at this you might think that all entries that * make up the postion of the cum-ack would be in the nr-mapping * array only.. i.e. things up to the cum-ack are always * deliverable. Thats true with one exception, when its a fragmented * message we may not deliver the data until some threshold (or all * of it) is in place. So we must OR the nr_mapping_array and * mapping_array to get a true picture of the cum-ack. */ struct sctp_association *asoc; int at; uint8_t val; int slide_from, slide_end, lgap, distance; uint32_t old_cumack, old_base, old_highest, highest_tsn; asoc = &stcb->asoc; old_cumack = asoc->cumulative_tsn; old_base = asoc->mapping_array_base_tsn; old_highest = asoc->highest_tsn_inside_map; /* * We could probably improve this a small bit by calculating the * offset of the current cum-ack as the starting point. */ at = 0; for (slide_from = 0; slide_from < stcb->asoc.mapping_array_size; slide_from++) { val = asoc->nr_mapping_array[slide_from] | asoc->mapping_array[slide_from]; if (val == 0xff) { at += 8; } else { /* there is a 0 bit */ at += sctp_map_lookup_tab[val]; break; } } asoc->cumulative_tsn = asoc->mapping_array_base_tsn + (at - 1); if (SCTP_TSN_GT(asoc->cumulative_tsn, asoc->highest_tsn_inside_map) && SCTP_TSN_GT(asoc->cumulative_tsn, asoc->highest_tsn_inside_nr_map)) { #ifdef INVARIANTS panic("huh, cumack 0x%x greater than high-tsn 0x%x in map", asoc->cumulative_tsn, asoc->highest_tsn_inside_map); #else SCTP_PRINTF("huh, cumack 0x%x greater than high-tsn 0x%x in map - should panic?\n", asoc->cumulative_tsn, asoc->highest_tsn_inside_map); sctp_print_mapping_array(asoc); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(0, 6, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } asoc->highest_tsn_inside_map = asoc->cumulative_tsn; asoc->highest_tsn_inside_nr_map = asoc->cumulative_tsn; #endif } if (SCTP_TSN_GT(asoc->highest_tsn_inside_nr_map, asoc->highest_tsn_inside_map)) { highest_tsn = asoc->highest_tsn_inside_nr_map; } else { highest_tsn = asoc->highest_tsn_inside_map; } if ((asoc->cumulative_tsn == highest_tsn) && (at >= 8)) { /* The complete array was completed by a single FR */ /* highest becomes the cum-ack */ int clr; #ifdef INVARIANTS unsigned int i; #endif /* clear the array */ clr = ((at + 7) >> 3); if (clr > asoc->mapping_array_size) { clr = asoc->mapping_array_size; } memset(asoc->mapping_array, 0, clr); memset(asoc->nr_mapping_array, 0, clr); #ifdef INVARIANTS for (i = 0; i < asoc->mapping_array_size; i++) { if ((asoc->mapping_array[i]) || (asoc->nr_mapping_array[i])) { SCTP_PRINTF("Error Mapping array's not clean at clear\n"); sctp_print_mapping_array(asoc); } } #endif asoc->mapping_array_base_tsn = asoc->cumulative_tsn + 1; asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map = asoc->cumulative_tsn; } else if (at >= 8) { /* we can slide the mapping array down */ /* slide_from holds where we hit the first NON 0xff byte */ /* * now calculate the ceiling of the move using our highest * TSN value */ SCTP_CALC_TSN_TO_GAP(lgap, highest_tsn, asoc->mapping_array_base_tsn); slide_end = (lgap >> 3); if (slide_end < slide_from) { sctp_print_mapping_array(asoc); #ifdef INVARIANTS panic("impossible slide"); #else SCTP_PRINTF("impossible slide lgap:%x slide_end:%x slide_from:%x? at:%d\n", lgap, slide_end, slide_from, at); return; #endif } if (slide_end > asoc->mapping_array_size) { #ifdef INVARIANTS panic("would overrun buffer"); #else SCTP_PRINTF("Gak, would have overrun map end:%d slide_end:%d\n", asoc->mapping_array_size, slide_end); slide_end = asoc->mapping_array_size; #endif } distance = (slide_end - slide_from) + 1; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(old_base, old_cumack, old_highest, SCTP_MAP_PREPARE_SLIDE); sctp_log_map((uint32_t) slide_from, (uint32_t) slide_end, (uint32_t) lgap, SCTP_MAP_SLIDE_FROM); } if (distance + slide_from > asoc->mapping_array_size || distance < 0) { /* * Here we do NOT slide forward the array so that * hopefully when more data comes in to fill it up * we will be able to slide it forward. Really I * don't think this should happen :-0 */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map((uint32_t) distance, (uint32_t) slide_from, (uint32_t) asoc->mapping_array_size, SCTP_MAP_SLIDE_NONE); } } else { int ii; for (ii = 0; ii < distance; ii++) { asoc->mapping_array[ii] = asoc->mapping_array[slide_from + ii]; asoc->nr_mapping_array[ii] = asoc->nr_mapping_array[slide_from + ii]; } for (ii = distance; ii < asoc->mapping_array_size; ii++) { asoc->mapping_array[ii] = 0; asoc->nr_mapping_array[ii] = 0; } if (asoc->highest_tsn_inside_map + 1 == asoc->mapping_array_base_tsn) { asoc->highest_tsn_inside_map += (slide_from << 3); } if (asoc->highest_tsn_inside_nr_map + 1 == asoc->mapping_array_base_tsn) { asoc->highest_tsn_inside_nr_map += (slide_from << 3); } asoc->mapping_array_base_tsn += (slide_from << 3); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(asoc->mapping_array_base_tsn, asoc->cumulative_tsn, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } } } } void sctp_sack_check(struct sctp_tcb *stcb, int was_a_gap) { struct sctp_association *asoc; uint32_t highest_tsn; asoc = &stcb->asoc; if (SCTP_TSN_GT(asoc->highest_tsn_inside_nr_map, asoc->highest_tsn_inside_map)) { highest_tsn = asoc->highest_tsn_inside_nr_map; } else { highest_tsn = asoc->highest_tsn_inside_map; } /* * Now we need to see if we need to queue a sack or just start the * timer (if allowed). */ if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) { /* * Ok special case, in SHUTDOWN-SENT case. here we maker * sure SACK timer is off and instead send a SHUTDOWN and a * SACK */ if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_INDATA + SCTP_LOC_20); } sctp_send_shutdown(stcb, ((stcb->asoc.alternate) ? stcb->asoc.alternate : stcb->asoc.primary_destination)); sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED); } else { int is_a_gap; /* is there a gap now ? */ is_a_gap = SCTP_TSN_GT(highest_tsn, stcb->asoc.cumulative_tsn); /* * CMT DAC algorithm: increase number of packets received * since last ack */ stcb->asoc.cmt_dac_pkts_rcvd++; if ((stcb->asoc.send_sack == 1) || /* We need to send a * SACK */ ((was_a_gap) && (is_a_gap == 0)) || /* was a gap, but no * longer is one */ (stcb->asoc.numduptsns) || /* we have dup's */ (is_a_gap) || /* is still a gap */ (stcb->asoc.delayed_ack == 0) || /* Delayed sack disabled */ (stcb->asoc.data_pkts_seen >= stcb->asoc.sack_freq) /* hit limit of pkts */ ) { if ((stcb->asoc.sctp_cmt_on_off > 0) && (SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) && (stcb->asoc.send_sack == 0) && (stcb->asoc.numduptsns == 0) && (stcb->asoc.delayed_ack) && (!SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer))) { /* * CMT DAC algorithm: With CMT, delay acks * even in the face of * * reordering. Therefore, if acks that do not * have to be sent because of the above * reasons, will be delayed. That is, acks * that would have been sent due to gap * reports will be delayed with DAC. Start * the delayed ack timer. */ sctp_timer_start(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL); } else { /* * Ok we must build a SACK since the timer * is pending, we got our first packet OR * there are gaps or duplicates. */ (void)SCTP_OS_TIMER_STOP(&stcb->asoc.dack_timer.timer); sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED); } } else { if (!SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) { sctp_timer_start(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL); } } } } void sctp_service_queues(struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_tmit_chunk *chk; uint32_t tsize, pd_point; uint16_t nxt_todel; if (asoc->fragmented_delivery_inprogress) { sctp_service_reassembly(stcb, asoc); } /* Can we proceed further, i.e. the PD-API is complete */ if (asoc->fragmented_delivery_inprogress) { /* no */ return; } /* * Now is there some other chunk I can deliver from the reassembly * queue. */ doit_again: chk = TAILQ_FIRST(&asoc->reasmqueue); if (chk == NULL) { asoc->size_on_reasm_queue = 0; asoc->cnt_on_reasm_queue = 0; return; } nxt_todel = asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered + 1; if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) && ((nxt_todel == chk->rec.data.stream_seq) || (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED))) { /* * Yep the first one is here. We setup to start reception, * by backing down the TSN just in case we can't deliver. */ /* * Before we start though either all of the message should * be here or the socket buffer max or nothing on the * delivery queue and something can be delivered. */ if (stcb->sctp_socket) { pd_point = min(SCTP_SB_LIMIT_RCV(stcb->sctp_socket) >> SCTP_PARTIAL_DELIVERY_SHIFT, stcb->sctp_ep->partial_delivery_point); } else { pd_point = stcb->sctp_ep->partial_delivery_point; } if (sctp_is_all_msg_on_reasm(asoc, &tsize) || (tsize >= pd_point)) { asoc->fragmented_delivery_inprogress = 1; asoc->tsn_last_delivered = chk->rec.data.TSN_seq - 1; asoc->str_of_pdapi = chk->rec.data.stream_number; asoc->ssn_of_pdapi = chk->rec.data.stream_seq; asoc->pdapi_ppid = chk->rec.data.payloadtype; asoc->fragment_flags = chk->rec.data.rcv_flags; sctp_service_reassembly(stcb, asoc); if (asoc->fragmented_delivery_inprogress == 0) { goto doit_again; } } } } int sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t * high_tsn) { struct sctp_data_chunk *ch, chunk_buf; struct sctp_association *asoc; int num_chunks = 0; /* number of control chunks processed */ int stop_proc = 0; int chk_length, break_flag, last_chunk; int abort_flag = 0, was_a_gap; struct mbuf *m; uint32_t highest_tsn; /* set the rwnd */ sctp_set_rwnd(stcb, &stcb->asoc); m = *mm; SCTP_TCB_LOCK_ASSERT(stcb); asoc = &stcb->asoc; if (SCTP_TSN_GT(asoc->highest_tsn_inside_nr_map, asoc->highest_tsn_inside_map)) { highest_tsn = asoc->highest_tsn_inside_nr_map; } else { highest_tsn = asoc->highest_tsn_inside_map; } was_a_gap = SCTP_TSN_GT(highest_tsn, stcb->asoc.cumulative_tsn); /* * setup where we got the last DATA packet from for any SACK that * may need to go out. Don't bump the net. This is done ONLY when a * chunk is assigned. */ asoc->last_data_chunk_from = net; /*- * Now before we proceed we must figure out if this is a wasted * cluster... i.e. it is a small packet sent in and yet the driver * underneath allocated a full cluster for it. If so we must copy it * to a smaller mbuf and free up the cluster mbuf. This will help * with cluster starvation. Note for __Panda__ we don't do this * since it has clusters all the way down to 64 bytes. */ if (SCTP_BUF_LEN(m) < (long)MLEN && SCTP_BUF_NEXT(m) == NULL) { /* we only handle mbufs that are singletons.. not chains */ m = sctp_get_mbuf_for_msg(SCTP_BUF_LEN(m), 0, M_NOWAIT, 1, MT_DATA); if (m) { /* ok lets see if we can copy the data up */ caddr_t *from, *to; /* get the pointers and copy */ to = mtod(m, caddr_t *); from = mtod((*mm), caddr_t *); memcpy(to, from, SCTP_BUF_LEN((*mm))); /* copy the length and free up the old */ SCTP_BUF_LEN(m) = SCTP_BUF_LEN((*mm)); sctp_m_freem(*mm); /* sucess, back copy */ *mm = m; } else { /* We are in trouble in the mbuf world .. yikes */ m = *mm; } } /* get pointer to the first chunk header */ ch = (struct sctp_data_chunk *)sctp_m_getptr(m, *offset, sizeof(struct sctp_data_chunk), (uint8_t *) & chunk_buf); if (ch == NULL) { return (1); } /* * process all DATA chunks... */ *high_tsn = asoc->cumulative_tsn; break_flag = 0; asoc->data_pkts_seen++; while (stop_proc == 0) { /* validate chunk length */ chk_length = ntohs(ch->ch.chunk_length); if (length - *offset < chk_length) { /* all done, mutulated chunk */ stop_proc = 1; continue; } if (ch->ch.chunk_type == SCTP_DATA) { if ((size_t)chk_length < sizeof(struct sctp_data_chunk)) { /* * Need to send an abort since we had a * invalid data chunk. */ struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; snprintf(msg, sizeof(msg), "DATA chunk of length %d", chk_length); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_21; sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); return (2); } if ((size_t)chk_length == sizeof(struct sctp_data_chunk)) { /* * Need to send an abort since we had an * empty data chunk. */ struct mbuf *op_err; op_err = sctp_generate_no_user_data_cause(ch->dp.tsn); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_22; sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); return (2); } #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xB1, 0); #endif if (SCTP_SIZE32(chk_length) == (length - *offset)) { last_chunk = 1; } else { last_chunk = 0; } if (sctp_process_a_data_chunk(stcb, asoc, mm, *offset, ch, chk_length, net, high_tsn, &abort_flag, &break_flag, last_chunk)) { num_chunks++; } if (abort_flag) return (2); if (break_flag) { /* * Set because of out of rwnd space and no * drop rep space left. */ stop_proc = 1; continue; } } else { /* not a data chunk in the data region */ switch (ch->ch.chunk_type) { case SCTP_INITIATION: case SCTP_INITIATION_ACK: case SCTP_SELECTIVE_ACK: case SCTP_NR_SELECTIVE_ACK: case SCTP_HEARTBEAT_REQUEST: case SCTP_HEARTBEAT_ACK: case SCTP_ABORT_ASSOCIATION: case SCTP_SHUTDOWN: case SCTP_SHUTDOWN_ACK: case SCTP_OPERATION_ERROR: case SCTP_COOKIE_ECHO: case SCTP_COOKIE_ACK: case SCTP_ECN_ECHO: case SCTP_ECN_CWR: case SCTP_SHUTDOWN_COMPLETE: case SCTP_AUTHENTICATION: case SCTP_ASCONF_ACK: case SCTP_PACKET_DROPPED: case SCTP_STREAM_RESET: case SCTP_FORWARD_CUM_TSN: case SCTP_ASCONF: /* * Now, what do we do with KNOWN chunks that * are NOT in the right place? * * For now, I do nothing but ignore them. We * may later want to add sysctl stuff to * switch out and do either an ABORT() or * possibly process them. */ if (SCTP_BASE_SYSCTL(sctp_strict_data_order)) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; snprintf(msg, sizeof(msg), "DATA chunk followed by chunk of type %2.2x", ch->ch.chunk_type); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); return (2); } break; default: /* unknown chunk type, use bit rules */ if (ch->ch.chunk_type & 0x40) { /* Add a error report to the queue */ - struct mbuf *merr; - struct sctp_paramhdr *phd; + struct mbuf *op_err; + struct sctp_gen_error_cause *cause; - merr = sctp_get_mbuf_for_msg(sizeof(*phd), 0, M_NOWAIT, 1, MT_DATA); - if (merr) { - phd = mtod(merr, struct sctp_paramhdr *); - /* - * We cheat and use param - * type since we did not - * bother to define a error - * cause struct. They are - * the same basic format - * with different names. - */ - phd->param_type = - htons(SCTP_CAUSE_UNRECOG_CHUNK); - phd->param_length = - htons(chk_length + sizeof(*phd)); - SCTP_BUF_LEN(merr) = sizeof(*phd); - SCTP_BUF_NEXT(merr) = SCTP_M_COPYM(m, *offset, chk_length, M_NOWAIT); - if (SCTP_BUF_NEXT(merr)) { - sctp_queue_op_err(stcb, merr); + op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_gen_error_cause), + 0, M_NOWAIT, 1, MT_DATA); + if (op_err != NULL) { + cause = mtod(op_err, struct sctp_gen_error_cause *); + cause->code = htons(SCTP_CAUSE_UNRECOG_CHUNK); + cause->length = htons(chk_length + sizeof(struct sctp_gen_error_cause)); + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_gen_error_cause); + SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(m, *offset, chk_length, M_NOWAIT); + if (SCTP_BUF_NEXT(op_err) != NULL) { + sctp_queue_op_err(stcb, op_err); } else { - sctp_m_freem(merr); + sctp_m_freem(op_err); } } } if ((ch->ch.chunk_type & 0x80) == 0) { /* discard the rest of this packet */ stop_proc = 1; } /* else skip this bad chunk and * continue... */ break; } /* switch of chunk type */ } *offset += SCTP_SIZE32(chk_length); if ((*offset >= length) || stop_proc) { /* no more data left in the mbuf chain */ stop_proc = 1; continue; } ch = (struct sctp_data_chunk *)sctp_m_getptr(m, *offset, sizeof(struct sctp_data_chunk), (uint8_t *) & chunk_buf); if (ch == NULL) { *offset = length; stop_proc = 1; continue; } } if (break_flag) { /* * we need to report rwnd overrun drops. */ sctp_send_packet_dropped(stcb, net, *mm, length, iphlen, 0); } if (num_chunks) { /* * Did we get data, if so update the time for auto-close and * give peer credit for being alive. */ SCTP_STAT_INCR(sctps_recvpktwithdata); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INDATA, __LINE__); } stcb->asoc.overall_error_count = 0; (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_last_rcvd); } /* now service all of the reassm queue if needed */ if (!(TAILQ_EMPTY(&asoc->reasmqueue))) sctp_service_queues(stcb, asoc); if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) { /* Assure that we ack right away */ stcb->asoc.send_sack = 1; } /* Start a sack timer or QUEUE a SACK for sending */ sctp_sack_check(stcb, was_a_gap); return (0); } static int sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1, uint32_t last_tsn, uint16_t frag_strt, uint16_t frag_end, int nr_sacking, int *num_frs, uint32_t * biggest_newly_acked_tsn, uint32_t * this_sack_lowest_newack, int *rto_ok) { struct sctp_tmit_chunk *tp1; unsigned int theTSN; int j, wake_him = 0, circled = 0; /* Recover the tp1 we last saw */ tp1 = *p_tp1; if (tp1 == NULL) { tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue); } for (j = frag_strt; j <= frag_end; j++) { theTSN = j + last_tsn; while (tp1) { if (tp1->rec.data.doing_fast_retransmit) (*num_frs) += 1; /*- * CMT: CUCv2 algorithm. For each TSN being * processed from the sent queue, track the * next expected pseudo-cumack, or * rtx_pseudo_cumack, if required. Separate * cumack trackers for first transmissions, * and retransmissions. */ if ((tp1->sent < SCTP_DATAGRAM_RESEND) && (tp1->whoTo->find_pseudo_cumack == 1) && (tp1->snd_count == 1)) { tp1->whoTo->pseudo_cumack = tp1->rec.data.TSN_seq; tp1->whoTo->find_pseudo_cumack = 0; } if ((tp1->sent < SCTP_DATAGRAM_RESEND) && (tp1->whoTo->find_rtx_pseudo_cumack == 1) && (tp1->snd_count > 1)) { tp1->whoTo->rtx_pseudo_cumack = tp1->rec.data.TSN_seq; tp1->whoTo->find_rtx_pseudo_cumack = 0; } if (tp1->rec.data.TSN_seq == theTSN) { if (tp1->sent != SCTP_DATAGRAM_UNSENT) { /*- * must be held until * cum-ack passes */ if (tp1->sent < SCTP_DATAGRAM_RESEND) { /*- * If it is less than RESEND, it is * now no-longer in flight. * Higher values may already be set * via previous Gap Ack Blocks... * i.e. ACKED or RESEND. */ if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, *biggest_newly_acked_tsn)) { *biggest_newly_acked_tsn = tp1->rec.data.TSN_seq; } /*- * CMT: SFR algo (and HTNA) - set * saw_newack to 1 for dest being * newly acked. update * this_sack_highest_newack if * appropriate. */ if (tp1->rec.data.chunk_was_revoked == 0) tp1->whoTo->saw_newack = 1; if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, tp1->whoTo->this_sack_highest_newack)) { tp1->whoTo->this_sack_highest_newack = tp1->rec.data.TSN_seq; } /*- * CMT DAC algo: also update * this_sack_lowest_newack */ if (*this_sack_lowest_newack == 0) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { sctp_log_sack(*this_sack_lowest_newack, last_tsn, tp1->rec.data.TSN_seq, 0, 0, SCTP_LOG_TSN_ACKED); } *this_sack_lowest_newack = tp1->rec.data.TSN_seq; } /*- * CMT: CUCv2 algorithm. If (rtx-)pseudo-cumack for corresp * dest is being acked, then we have a new (rtx-)pseudo-cumack. Set * new_(rtx_)pseudo_cumack to TRUE so that the cwnd for this dest can be * updated. Also trigger search for the next expected (rtx-)pseudo-cumack. * Separate pseudo_cumack trackers for first transmissions and * retransmissions. */ if (tp1->rec.data.TSN_seq == tp1->whoTo->pseudo_cumack) { if (tp1->rec.data.chunk_was_revoked == 0) { tp1->whoTo->new_pseudo_cumack = 1; } tp1->whoTo->find_pseudo_cumack = 1; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, tp1->whoTo, tp1->rec.data.TSN_seq, SCTP_CWND_LOG_FROM_SACK); } if (tp1->rec.data.TSN_seq == tp1->whoTo->rtx_pseudo_cumack) { if (tp1->rec.data.chunk_was_revoked == 0) { tp1->whoTo->new_pseudo_cumack = 1; } tp1->whoTo->find_rtx_pseudo_cumack = 1; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { sctp_log_sack(*biggest_newly_acked_tsn, last_tsn, tp1->rec.data.TSN_seq, frag_strt, frag_end, SCTP_LOG_TSN_ACKED); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_GAP, tp1->whoTo->flight_size, tp1->book_size, (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } sctp_flight_size_decrease(tp1); if (stcb->asoc.cc_functions.sctp_cwnd_update_tsn_acknowledged) { (*stcb->asoc.cc_functions.sctp_cwnd_update_tsn_acknowledged) (tp1->whoTo, tp1); } sctp_total_flight_decrease(stcb, tp1); tp1->whoTo->net_ack += tp1->send_size; if (tp1->snd_count < 2) { /*- * True non-retransmited chunk */ tp1->whoTo->net_ack2 += tp1->send_size; /*- * update RTO too ? */ if (tp1->do_rtt) { if (*rto_ok) { tp1->whoTo->RTO = sctp_calculate_rto(stcb, &stcb->asoc, tp1->whoTo, &tp1->sent_rcv_time, sctp_align_safe_nocopy, SCTP_RTT_FROM_DATA); *rto_ok = 0; } if (tp1->whoTo->rto_needed == 0) { tp1->whoTo->rto_needed = 1; } tp1->do_rtt = 0; } } } if (tp1->sent <= SCTP_DATAGRAM_RESEND) { if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, stcb->asoc.this_sack_highest_gap)) { stcb->asoc.this_sack_highest_gap = tp1->rec.data.TSN_seq; } if (tp1->sent == SCTP_DATAGRAM_RESEND) { sctp_ucount_decr(stcb->asoc.sent_queue_retran_cnt); #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xB2, (stcb->asoc.sent_queue_retran_cnt & 0x000000ff)); #endif } } /*- * All chunks NOT UNSENT fall through here and are marked * (leave PR-SCTP ones that are to skip alone though) */ if ((tp1->sent != SCTP_FORWARD_TSN_SKIP) && (tp1->sent != SCTP_DATAGRAM_NR_ACKED)) { tp1->sent = SCTP_DATAGRAM_MARKED; } if (tp1->rec.data.chunk_was_revoked) { /* deflate the cwnd */ tp1->whoTo->cwnd -= tp1->book_size; tp1->rec.data.chunk_was_revoked = 0; } /* NR Sack code here */ if (nr_sacking && (tp1->sent != SCTP_DATAGRAM_NR_ACKED)) { if (stcb->asoc.strmout[tp1->rec.data.stream_number].chunks_on_queues > 0) { stcb->asoc.strmout[tp1->rec.data.stream_number].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", tp1->rec.data.stream_number); #endif } tp1->sent = SCTP_DATAGRAM_NR_ACKED; if (tp1->data) { /* * sa_ignore * NO_NULL_CHK */ sctp_free_bufspace(stcb, &stcb->asoc, tp1, 1); sctp_m_freem(tp1->data); tp1->data = NULL; } wake_him++; } } break; } /* if (tp1->TSN_seq == theTSN) */ if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, theTSN)) { break; } tp1 = TAILQ_NEXT(tp1, sctp_next); if ((tp1 == NULL) && (circled == 0)) { circled++; tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue); } } /* end while (tp1) */ if (tp1 == NULL) { circled = 0; tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue); } /* In case the fragments were not in order we must reset */ } /* end for (j = fragStart */ *p_tp1 = tp1; return (wake_him); /* Return value only used for nr-sack */ } static int sctp_handle_segments(struct mbuf *m, int *offset, struct sctp_tcb *stcb, struct sctp_association *asoc, uint32_t last_tsn, uint32_t * biggest_tsn_acked, uint32_t * biggest_newly_acked_tsn, uint32_t * this_sack_lowest_newack, int num_seg, int num_nr_seg, int *rto_ok) { struct sctp_gap_ack_block *frag, block; struct sctp_tmit_chunk *tp1; int i; int num_frs = 0; int chunk_freed; int non_revocable; uint16_t frag_strt, frag_end, prev_frag_end; tp1 = TAILQ_FIRST(&asoc->sent_queue); prev_frag_end = 0; chunk_freed = 0; for (i = 0; i < (num_seg + num_nr_seg); i++) { if (i == num_seg) { prev_frag_end = 0; tp1 = TAILQ_FIRST(&asoc->sent_queue); } frag = (struct sctp_gap_ack_block *)sctp_m_getptr(m, *offset, sizeof(struct sctp_gap_ack_block), (uint8_t *) & block); *offset += sizeof(block); if (frag == NULL) { return (chunk_freed); } frag_strt = ntohs(frag->start); frag_end = ntohs(frag->end); if (frag_strt > frag_end) { /* This gap report is malformed, skip it. */ continue; } if (frag_strt <= prev_frag_end) { /* This gap report is not in order, so restart. */ tp1 = TAILQ_FIRST(&asoc->sent_queue); } if (SCTP_TSN_GT((last_tsn + frag_end), *biggest_tsn_acked)) { *biggest_tsn_acked = last_tsn + frag_end; } if (i < num_seg) { non_revocable = 0; } else { non_revocable = 1; } if (sctp_process_segment_range(stcb, &tp1, last_tsn, frag_strt, frag_end, non_revocable, &num_frs, biggest_newly_acked_tsn, this_sack_lowest_newack, rto_ok)) { chunk_freed = 1; } prev_frag_end = frag_end; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { if (num_frs) sctp_log_fr(*biggest_tsn_acked, *biggest_newly_acked_tsn, last_tsn, SCTP_FR_LOG_BIGGEST_TSNS); } return (chunk_freed); } static void sctp_check_for_revoked(struct sctp_tcb *stcb, struct sctp_association *asoc, uint32_t cumack, uint32_t biggest_tsn_acked) { struct sctp_tmit_chunk *tp1; TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) { if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, cumack)) { /* * ok this guy is either ACK or MARKED. If it is * ACKED it has been previously acked but not this * time i.e. revoked. If it is MARKED it was ACK'ed * again. */ if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, biggest_tsn_acked)) { break; } if (tp1->sent == SCTP_DATAGRAM_ACKED) { /* it has been revoked */ tp1->sent = SCTP_DATAGRAM_SENT; tp1->rec.data.chunk_was_revoked = 1; /* * We must add this stuff back in to assure * timers and such get started. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_UP_REVOKE, tp1->whoTo->flight_size, tp1->book_size, (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } sctp_flight_size_increase(tp1); sctp_total_flight_increase(stcb, tp1); /* * We inflate the cwnd to compensate for our * artificial inflation of the flight_size. */ tp1->whoTo->cwnd += tp1->book_size; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { sctp_log_sack(asoc->last_acked_seq, cumack, tp1->rec.data.TSN_seq, 0, 0, SCTP_LOG_TSN_REVOKED); } } else if (tp1->sent == SCTP_DATAGRAM_MARKED) { /* it has been re-acked in this SACK */ tp1->sent = SCTP_DATAGRAM_ACKED; } } if (tp1->sent == SCTP_DATAGRAM_UNSENT) break; } } static void sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc, uint32_t biggest_tsn_acked, uint32_t biggest_tsn_newly_acked, uint32_t this_sack_lowest_newack, int accum_moved) { struct sctp_tmit_chunk *tp1; int strike_flag = 0; struct timeval now; int tot_retrans = 0; uint32_t sending_seq; struct sctp_nets *net; int num_dests_sacked = 0; /* * select the sending_seq, this is either the next thing ready to be * sent but not transmitted, OR, the next seq we assign. */ tp1 = TAILQ_FIRST(&stcb->asoc.send_queue); if (tp1 == NULL) { sending_seq = asoc->sending_seq; } else { sending_seq = tp1->rec.data.TSN_seq; } /* CMT DAC algo: finding out if SACK is a mixed SACK */ if ((asoc->sctp_cmt_on_off > 0) && SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) { TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (net->saw_newack) num_dests_sacked++; } } if (stcb->asoc.prsctp_supported) { (void)SCTP_GETTIME_TIMEVAL(&now); } TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) { strike_flag = 0; if (tp1->no_fr_allowed) { /* this one had a timeout or something */ continue; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { if (tp1->sent < SCTP_DATAGRAM_RESEND) sctp_log_fr(biggest_tsn_newly_acked, tp1->rec.data.TSN_seq, tp1->sent, SCTP_FR_LOG_CHECK_STRIKE); } if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, biggest_tsn_acked) || tp1->sent == SCTP_DATAGRAM_UNSENT) { /* done */ break; } if (stcb->asoc.prsctp_supported) { if ((PR_SCTP_TTL_ENABLED(tp1->flags)) && tp1->sent < SCTP_DATAGRAM_ACKED) { /* Is it expired? */ if (timevalcmp(&now, &tp1->rec.data.timetodrop, >)) { /* Yes so drop it */ if (tp1->data != NULL) { (void)sctp_release_pr_sctp_chunk(stcb, tp1, 1, SCTP_SO_NOT_LOCKED); } continue; } } } if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, asoc->this_sack_highest_gap)) { /* we are beyond the tsn in the sack */ break; } if (tp1->sent >= SCTP_DATAGRAM_RESEND) { /* either a RESEND, ACKED, or MARKED */ /* skip */ if (tp1->sent == SCTP_FORWARD_TSN_SKIP) { /* Continue strikin FWD-TSN chunks */ tp1->rec.data.fwd_tsn_cnt++; } continue; } /* * CMT : SFR algo (covers part of DAC and HTNA as well) */ if (tp1->whoTo && tp1->whoTo->saw_newack == 0) { /* * No new acks were receieved for data sent to this * dest. Therefore, according to the SFR algo for * CMT, no data sent to this dest can be marked for * FR using this SACK. */ continue; } else if (tp1->whoTo && SCTP_TSN_GT(tp1->rec.data.TSN_seq, tp1->whoTo->this_sack_highest_newack)) { /* * CMT: New acks were receieved for data sent to * this dest. But no new acks were seen for data * sent after tp1. Therefore, according to the SFR * algo for CMT, tp1 cannot be marked for FR using * this SACK. This step covers part of the DAC algo * and the HTNA algo as well. */ continue; } /* * Here we check to see if we were have already done a FR * and if so we see if the biggest TSN we saw in the sack is * smaller than the recovery point. If so we don't strike * the tsn... otherwise we CAN strike the TSN. */ /* * @@@ JRI: Check for CMT if (accum_moved && * asoc->fast_retran_loss_recovery && (sctp_cmt_on_off == * 0)) { */ if (accum_moved && asoc->fast_retran_loss_recovery) { /* * Strike the TSN if in fast-recovery and cum-ack * moved. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { sctp_log_fr(biggest_tsn_newly_acked, tp1->rec.data.TSN_seq, tp1->sent, SCTP_FR_LOG_STRIKE_CHUNK); } if (tp1->sent < SCTP_DATAGRAM_RESEND) { tp1->sent++; } if ((asoc->sctp_cmt_on_off > 0) && SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) { /* * CMT DAC algorithm: If SACK flag is set to * 0, then lowest_newack test will not pass * because it would have been set to the * cumack earlier. If not already to be * rtx'd, If not a mixed sack and if tp1 is * not between two sacked TSNs, then mark by * one more. NOTE that we are marking by one * additional time since the SACK DAC flag * indicates that two packets have been * received after this missing TSN. */ if ((tp1->sent < SCTP_DATAGRAM_RESEND) && (num_dests_sacked == 1) && SCTP_TSN_GT(this_sack_lowest_newack, tp1->rec.data.TSN_seq)) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { sctp_log_fr(16 + num_dests_sacked, tp1->rec.data.TSN_seq, tp1->sent, SCTP_FR_LOG_STRIKE_CHUNK); } tp1->sent++; } } } else if ((tp1->rec.data.doing_fast_retransmit) && (asoc->sctp_cmt_on_off == 0)) { /* * For those that have done a FR we must take * special consideration if we strike. I.e the * biggest_newly_acked must be higher than the * sending_seq at the time we did the FR. */ if ( #ifdef SCTP_FR_TO_ALTERNATE /* * If FR's go to new networks, then we must only do * this for singly homed asoc's. However if the FR's * go to the same network (Armando's work) then its * ok to FR multiple times. */ (asoc->numnets < 2) #else (1) #endif ) { if (SCTP_TSN_GE(biggest_tsn_newly_acked, tp1->rec.data.fast_retran_tsn)) { /* * Strike the TSN, since this ack is * beyond where things were when we * did a FR. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { sctp_log_fr(biggest_tsn_newly_acked, tp1->rec.data.TSN_seq, tp1->sent, SCTP_FR_LOG_STRIKE_CHUNK); } if (tp1->sent < SCTP_DATAGRAM_RESEND) { tp1->sent++; } strike_flag = 1; if ((asoc->sctp_cmt_on_off > 0) && SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) { /* * CMT DAC algorithm: If * SACK flag is set to 0, * then lowest_newack test * will not pass because it * would have been set to * the cumack earlier. If * not already to be rtx'd, * If not a mixed sack and * if tp1 is not between two * sacked TSNs, then mark by * one more. NOTE that we * are marking by one * additional time since the * SACK DAC flag indicates * that two packets have * been received after this * missing TSN. */ if ((tp1->sent < SCTP_DATAGRAM_RESEND) && (num_dests_sacked == 1) && SCTP_TSN_GT(this_sack_lowest_newack, tp1->rec.data.TSN_seq)) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { sctp_log_fr(32 + num_dests_sacked, tp1->rec.data.TSN_seq, tp1->sent, SCTP_FR_LOG_STRIKE_CHUNK); } if (tp1->sent < SCTP_DATAGRAM_RESEND) { tp1->sent++; } } } } } /* * JRI: TODO: remove code for HTNA algo. CMT's SFR * algo covers HTNA. */ } else if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, biggest_tsn_newly_acked)) { /* * We don't strike these: This is the HTNA * algorithm i.e. we don't strike If our TSN is * larger than the Highest TSN Newly Acked. */ ; } else { /* Strike the TSN */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { sctp_log_fr(biggest_tsn_newly_acked, tp1->rec.data.TSN_seq, tp1->sent, SCTP_FR_LOG_STRIKE_CHUNK); } if (tp1->sent < SCTP_DATAGRAM_RESEND) { tp1->sent++; } if ((asoc->sctp_cmt_on_off > 0) && SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) { /* * CMT DAC algorithm: If SACK flag is set to * 0, then lowest_newack test will not pass * because it would have been set to the * cumack earlier. If not already to be * rtx'd, If not a mixed sack and if tp1 is * not between two sacked TSNs, then mark by * one more. NOTE that we are marking by one * additional time since the SACK DAC flag * indicates that two packets have been * received after this missing TSN. */ if ((tp1->sent < SCTP_DATAGRAM_RESEND) && (num_dests_sacked == 1) && SCTP_TSN_GT(this_sack_lowest_newack, tp1->rec.data.TSN_seq)) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { sctp_log_fr(48 + num_dests_sacked, tp1->rec.data.TSN_seq, tp1->sent, SCTP_FR_LOG_STRIKE_CHUNK); } tp1->sent++; } } } if (tp1->sent == SCTP_DATAGRAM_RESEND) { struct sctp_nets *alt; /* fix counts and things */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_RSND, (tp1->whoTo ? (tp1->whoTo->flight_size) : 0), tp1->book_size, (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } if (tp1->whoTo) { tp1->whoTo->net_ack++; sctp_flight_size_decrease(tp1); if (stcb->asoc.cc_functions.sctp_cwnd_update_tsn_acknowledged) { (*stcb->asoc.cc_functions.sctp_cwnd_update_tsn_acknowledged) (tp1->whoTo, tp1); } } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { sctp_log_rwnd(SCTP_INCREASE_PEER_RWND, asoc->peers_rwnd, tp1->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)); } /* add back to the rwnd */ asoc->peers_rwnd += (tp1->send_size + SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)); /* remove from the total flight */ sctp_total_flight_decrease(stcb, tp1); if ((stcb->asoc.prsctp_supported) && (PR_SCTP_RTX_ENABLED(tp1->flags))) { /* * Has it been retransmitted tv_sec times? - * we store the retran count there. */ if (tp1->snd_count > tp1->rec.data.timetodrop.tv_sec) { /* Yes, so drop it */ if (tp1->data != NULL) { (void)sctp_release_pr_sctp_chunk(stcb, tp1, 1, SCTP_SO_NOT_LOCKED); } /* Make sure to flag we had a FR */ tp1->whoTo->net_ack++; continue; } } /* * SCTP_PRINTF("OK, we are now ready to FR this * guy\n"); */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { sctp_log_fr(tp1->rec.data.TSN_seq, tp1->snd_count, 0, SCTP_FR_MARKED); } if (strike_flag) { /* This is a subsequent FR */ SCTP_STAT_INCR(sctps_sendmultfastretrans); } sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); if (asoc->sctp_cmt_on_off > 0) { /* * CMT: Using RTX_SSTHRESH policy for CMT. * If CMT is being used, then pick dest with * largest ssthresh for any retransmission. */ tp1->no_fr_allowed = 1; alt = tp1->whoTo; /* sa_ignore NO_NULL_CHK */ if (asoc->sctp_cmt_pf > 0) { /* * JRS 5/18/07 - If CMT PF is on, * use the PF version of * find_alt_net() */ alt = sctp_find_alternate_net(stcb, alt, 2); } else { /* * JRS 5/18/07 - If only CMT is on, * use the CMT version of * find_alt_net() */ /* sa_ignore NO_NULL_CHK */ alt = sctp_find_alternate_net(stcb, alt, 1); } if (alt == NULL) { alt = tp1->whoTo; } /* * CUCv2: If a different dest is picked for * the retransmission, then new * (rtx-)pseudo_cumack needs to be tracked * for orig dest. Let CUCv2 track new (rtx-) * pseudo-cumack always. */ if (tp1->whoTo) { tp1->whoTo->find_pseudo_cumack = 1; tp1->whoTo->find_rtx_pseudo_cumack = 1; } } else {/* CMT is OFF */ #ifdef SCTP_FR_TO_ALTERNATE /* Can we find an alternate? */ alt = sctp_find_alternate_net(stcb, tp1->whoTo, 0); #else /* * default behavior is to NOT retransmit * FR's to an alternate. Armando Caro's * paper details why. */ alt = tp1->whoTo; #endif } tp1->rec.data.doing_fast_retransmit = 1; tot_retrans++; /* mark the sending seq for possible subsequent FR's */ /* * SCTP_PRINTF("Marking TSN for FR new value %x\n", * (uint32_t)tpi->rec.data.TSN_seq); */ if (TAILQ_EMPTY(&asoc->send_queue)) { /* * If the queue of send is empty then its * the next sequence number that will be * assigned so we subtract one from this to * get the one we last sent. */ tp1->rec.data.fast_retran_tsn = sending_seq; } else { /* * If there are chunks on the send queue * (unsent data that has made it from the * stream queues but not out the door, we * take the first one (which will have the * lowest TSN) and subtract one to get the * one we last sent. */ struct sctp_tmit_chunk *ttt; ttt = TAILQ_FIRST(&asoc->send_queue); tp1->rec.data.fast_retran_tsn = ttt->rec.data.TSN_seq; } if (tp1->do_rtt) { /* * this guy had a RTO calculation pending on * it, cancel it */ if ((tp1->whoTo != NULL) && (tp1->whoTo->rto_needed == 0)) { tp1->whoTo->rto_needed = 1; } tp1->do_rtt = 0; } if (alt != tp1->whoTo) { /* yes, there is an alternate. */ sctp_free_remote_addr(tp1->whoTo); /* sa_ignore FREED_MEMORY */ tp1->whoTo = alt; atomic_add_int(&alt->ref_count, 1); } } } } struct sctp_tmit_chunk * sctp_try_advance_peer_ack_point(struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_tmit_chunk *tp1, *tp2, *a_adv = NULL; struct timeval now; int now_filled = 0; if (asoc->prsctp_supported == 0) { return (NULL); } TAILQ_FOREACH_SAFE(tp1, &asoc->sent_queue, sctp_next, tp2) { if (tp1->sent != SCTP_FORWARD_TSN_SKIP && tp1->sent != SCTP_DATAGRAM_RESEND && tp1->sent != SCTP_DATAGRAM_NR_ACKED) { /* no chance to advance, out of here */ break; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { if ((tp1->sent == SCTP_FORWARD_TSN_SKIP) || (tp1->sent == SCTP_DATAGRAM_NR_ACKED)) { sctp_misc_ints(SCTP_FWD_TSN_CHECK, asoc->advanced_peer_ack_point, tp1->rec.data.TSN_seq, 0, 0); } } if (!PR_SCTP_ENABLED(tp1->flags)) { /* * We can't fwd-tsn past any that are reliable aka * retransmitted until the asoc fails. */ break; } if (!now_filled) { (void)SCTP_GETTIME_TIMEVAL(&now); now_filled = 1; } /* * now we got a chunk which is marked for another * retransmission to a PR-stream but has run out its chances * already maybe OR has been marked to skip now. Can we skip * it if its a resend? */ if (tp1->sent == SCTP_DATAGRAM_RESEND && (PR_SCTP_TTL_ENABLED(tp1->flags))) { /* * Now is this one marked for resend and its time is * now up? */ if (timevalcmp(&now, &tp1->rec.data.timetodrop, >)) { /* Yes so drop it */ if (tp1->data) { (void)sctp_release_pr_sctp_chunk(stcb, tp1, 1, SCTP_SO_NOT_LOCKED); } } else { /* * No, we are done when hit one for resend * whos time as not expired. */ break; } } /* * Ok now if this chunk is marked to drop it we can clean up * the chunk, advance our peer ack point and we can check * the next chunk. */ if ((tp1->sent == SCTP_FORWARD_TSN_SKIP) || (tp1->sent == SCTP_DATAGRAM_NR_ACKED)) { /* advance PeerAckPoint goes forward */ if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, asoc->advanced_peer_ack_point)) { asoc->advanced_peer_ack_point = tp1->rec.data.TSN_seq; a_adv = tp1; } else if (tp1->rec.data.TSN_seq == asoc->advanced_peer_ack_point) { /* No update but we do save the chk */ a_adv = tp1; } } else { /* * If it is still in RESEND we can advance no * further */ break; } } return (a_adv); } static int sctp_fs_audit(struct sctp_association *asoc) { struct sctp_tmit_chunk *chk; int inflight = 0, resend = 0, inbetween = 0, acked = 0, above = 0; int ret; #ifndef INVARIANTS int entry_flight, entry_cnt; #endif ret = 0; #ifndef INVARIANTS entry_flight = asoc->total_flight; entry_cnt = asoc->total_flight_count; #endif if (asoc->pr_sctp_cnt >= asoc->sent_queue_cnt) return (0); TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { if (chk->sent < SCTP_DATAGRAM_RESEND) { SCTP_PRINTF("Chk TSN:%u size:%d inflight cnt:%d\n", chk->rec.data.TSN_seq, chk->send_size, chk->snd_count); inflight++; } else if (chk->sent == SCTP_DATAGRAM_RESEND) { resend++; } else if (chk->sent < SCTP_DATAGRAM_ACKED) { inbetween++; } else if (chk->sent > SCTP_DATAGRAM_ACKED) { above++; } else { acked++; } } if ((inflight > 0) || (inbetween > 0)) { #ifdef INVARIANTS panic("Flight size-express incorrect? \n"); #else SCTP_PRINTF("asoc->total_flight:%d cnt:%d\n", entry_flight, entry_cnt); SCTP_PRINTF("Flight size-express incorrect F:%d I:%d R:%d Ab:%d ACK:%d\n", inflight, inbetween, resend, above, acked); ret = 1; #endif } return (ret); } static void sctp_window_probe_recovery(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_tmit_chunk *tp1) { tp1->window_probe = 0; if ((tp1->sent >= SCTP_DATAGRAM_ACKED) || (tp1->data == NULL)) { /* TSN's skipped we do NOT move back. */ sctp_misc_ints(SCTP_FLIGHT_LOG_DWN_WP_FWD, tp1->whoTo ? tp1->whoTo->flight_size : 0, tp1->book_size, (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); return; } /* First setup this by shrinking flight */ if (stcb->asoc.cc_functions.sctp_cwnd_update_tsn_acknowledged) { (*stcb->asoc.cc_functions.sctp_cwnd_update_tsn_acknowledged) (tp1->whoTo, tp1); } sctp_flight_size_decrease(tp1); sctp_total_flight_decrease(stcb, tp1); /* Now mark for resend */ tp1->sent = SCTP_DATAGRAM_RESEND; sctp_ucount_incr(asoc->sent_queue_retran_cnt); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_WP, tp1->whoTo->flight_size, tp1->book_size, (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } } void sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, uint32_t rwnd, int *abort_now, int ecne_seen) { struct sctp_nets *net; struct sctp_association *asoc; struct sctp_tmit_chunk *tp1, *tp2; uint32_t old_rwnd; int win_probe_recovery = 0; int win_probe_recovered = 0; int j, done_once = 0; int rto_ok = 1; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_SACK_ARRIVALS_ENABLE) { sctp_misc_ints(SCTP_SACK_LOG_EXPRESS, cumack, rwnd, stcb->asoc.last_acked_seq, stcb->asoc.peers_rwnd); } SCTP_TCB_LOCK_ASSERT(stcb); #ifdef SCTP_ASOCLOG_OF_TSNS stcb->asoc.cumack_log[stcb->asoc.cumack_log_at] = cumack; stcb->asoc.cumack_log_at++; if (stcb->asoc.cumack_log_at > SCTP_TSN_LOG_SIZE) { stcb->asoc.cumack_log_at = 0; } #endif asoc = &stcb->asoc; old_rwnd = asoc->peers_rwnd; if (SCTP_TSN_GT(asoc->last_acked_seq, cumack)) { /* old ack */ return; } else if (asoc->last_acked_seq == cumack) { /* Window update sack */ asoc->peers_rwnd = sctp_sbspace_sub(rwnd, (uint32_t) (asoc->total_flight + (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)))); if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { /* SWS sender side engages */ asoc->peers_rwnd = 0; } if (asoc->peers_rwnd > old_rwnd) { goto again; } return; } /* First setup for CC stuff */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (SCTP_TSN_GT(cumack, net->cwr_window_tsn)) { /* Drag along the window_tsn for cwr's */ net->cwr_window_tsn = cumack; } net->prev_cwnd = net->cwnd; net->net_ack = 0; net->net_ack2 = 0; /* * CMT: Reset CUC and Fast recovery algo variables before * SACK processing */ net->new_pseudo_cumack = 0; net->will_exit_fast_recovery = 0; if (stcb->asoc.cc_functions.sctp_cwnd_prepare_net_for_sack) { (*stcb->asoc.cc_functions.sctp_cwnd_prepare_net_for_sack) (stcb, net); } } if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) { uint32_t send_s; if (!TAILQ_EMPTY(&asoc->sent_queue)) { tp1 = TAILQ_LAST(&asoc->sent_queue, sctpchunk_listhead); send_s = tp1->rec.data.TSN_seq + 1; } else { send_s = asoc->sending_seq; } if (SCTP_TSN_GE(cumack, send_s)) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; *abort_now = 1; /* XXX */ snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal than TSN %8.8x", cumack, send_s); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_23; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); return; } } asoc->this_sack_highest_gap = cumack; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INDATA, __LINE__); } stcb->asoc.overall_error_count = 0; if (SCTP_TSN_GT(cumack, asoc->last_acked_seq)) { /* process the new consecutive TSN first */ TAILQ_FOREACH_SAFE(tp1, &asoc->sent_queue, sctp_next, tp2) { if (SCTP_TSN_GE(cumack, tp1->rec.data.TSN_seq)) { if (tp1->sent == SCTP_DATAGRAM_UNSENT) { SCTP_PRINTF("Warning, an unsent is now acked?\n"); } if (tp1->sent < SCTP_DATAGRAM_ACKED) { /* * If it is less than ACKED, it is * now no-longer in flight. Higher * values may occur during marking */ if (tp1->sent < SCTP_DATAGRAM_RESEND) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_CA, tp1->whoTo->flight_size, tp1->book_size, (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } sctp_flight_size_decrease(tp1); if (stcb->asoc.cc_functions.sctp_cwnd_update_tsn_acknowledged) { (*stcb->asoc.cc_functions.sctp_cwnd_update_tsn_acknowledged) (tp1->whoTo, tp1); } /* sa_ignore NO_NULL_CHK */ sctp_total_flight_decrease(stcb, tp1); } tp1->whoTo->net_ack += tp1->send_size; if (tp1->snd_count < 2) { /* * True non-retransmited * chunk */ tp1->whoTo->net_ack2 += tp1->send_size; /* update RTO too? */ if (tp1->do_rtt) { if (rto_ok) { tp1->whoTo->RTO = /* * sa_ignore * NO_NULL_CH * K */ sctp_calculate_rto(stcb, asoc, tp1->whoTo, &tp1->sent_rcv_time, sctp_align_safe_nocopy, SCTP_RTT_FROM_DATA); rto_ok = 0; } if (tp1->whoTo->rto_needed == 0) { tp1->whoTo->rto_needed = 1; } tp1->do_rtt = 0; } } /* * CMT: CUCv2 algorithm. From the * cumack'd TSNs, for each TSN being * acked for the first time, set the * following variables for the * corresp destination. * new_pseudo_cumack will trigger a * cwnd update. * find_(rtx_)pseudo_cumack will * trigger search for the next * expected (rtx-)pseudo-cumack. */ tp1->whoTo->new_pseudo_cumack = 1; tp1->whoTo->find_pseudo_cumack = 1; tp1->whoTo->find_rtx_pseudo_cumack = 1; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { /* sa_ignore NO_NULL_CHK */ sctp_log_cwnd(stcb, tp1->whoTo, tp1->rec.data.TSN_seq, SCTP_CWND_LOG_FROM_SACK); } } if (tp1->sent == SCTP_DATAGRAM_RESEND) { sctp_ucount_decr(asoc->sent_queue_retran_cnt); } if (tp1->rec.data.chunk_was_revoked) { /* deflate the cwnd */ tp1->whoTo->cwnd -= tp1->book_size; tp1->rec.data.chunk_was_revoked = 0; } if (tp1->sent != SCTP_DATAGRAM_NR_ACKED) { if (asoc->strmout[tp1->rec.data.stream_number].chunks_on_queues > 0) { asoc->strmout[tp1->rec.data.stream_number].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", tp1->rec.data.stream_number); #endif } } TAILQ_REMOVE(&asoc->sent_queue, tp1, sctp_next); if (tp1->data) { /* sa_ignore NO_NULL_CHK */ sctp_free_bufspace(stcb, asoc, tp1, 1); sctp_m_freem(tp1->data); tp1->data = NULL; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { sctp_log_sack(asoc->last_acked_seq, cumack, tp1->rec.data.TSN_seq, 0, 0, SCTP_LOG_FREE_SENT); } asoc->sent_queue_cnt--; sctp_free_a_chunk(stcb, tp1, SCTP_SO_NOT_LOCKED); } else { break; } } } /* sa_ignore NO_NULL_CHK */ if (stcb->sctp_socket) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif SOCKBUF_LOCK(&stcb->sctp_socket->so_snd); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_WAKE_LOGGING_ENABLE) { /* sa_ignore NO_NULL_CHK */ sctp_wakeup_log(stcb, 1, SCTP_WAKESND_FROM_SACK); } #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { /* assoc was freed while we were unlocked */ SCTP_SOCKET_UNLOCK(so, 1); return; } #endif sctp_sowwakeup_locked(stcb->sctp_ep, stcb->sctp_socket); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_WAKE_LOGGING_ENABLE) { sctp_wakeup_log(stcb, 1, SCTP_NOWAKE_FROM_SACK); } } /* JRS - Use the congestion control given in the CC module */ if ((asoc->last_acked_seq != cumack) && (ecne_seen == 0)) { TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (net->net_ack2 > 0) { /* * Karn's rule applies to clearing error * count, this is optional. */ net->error_count = 0; if (!(net->dest_state & SCTP_ADDR_REACHABLE)) { /* addr came good */ net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, (void *)net, SCTP_SO_NOT_LOCKED); } if (net == stcb->asoc.primary_destination) { if (stcb->asoc.alternate) { /* * release the alternate, * primary is good */ sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } } if (net->dest_state & SCTP_ADDR_PF) { net->dest_state &= ~SCTP_ADDR_PF; sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_24); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); asoc->cc_functions.sctp_cwnd_update_exit_pf(stcb, net); /* Done with this net */ net->net_ack = 0; } /* restore any doubled timers */ net->RTO = (net->lastsa >> SCTP_RTT_SHIFT) + net->lastsv; if (net->RTO < stcb->asoc.minrto) { net->RTO = stcb->asoc.minrto; } if (net->RTO > stcb->asoc.maxrto) { net->RTO = stcb->asoc.maxrto; } } } asoc->cc_functions.sctp_cwnd_update_after_sack(stcb, asoc, 1, 0, 0); } asoc->last_acked_seq = cumack; if (TAILQ_EMPTY(&asoc->sent_queue)) { /* nothing left in-flight */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { net->flight_size = 0; net->partial_bytes_acked = 0; } asoc->total_flight = 0; asoc->total_flight_count = 0; } /* RWND update */ asoc->peers_rwnd = sctp_sbspace_sub(rwnd, (uint32_t) (asoc->total_flight + (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)))); if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { /* SWS sender side engages */ asoc->peers_rwnd = 0; } if (asoc->peers_rwnd > old_rwnd) { win_probe_recovery = 1; } /* Now assure a timer where data is queued at */ again: j = 0; TAILQ_FOREACH(net, &asoc->nets, sctp_next) { int to_ticks; if (win_probe_recovery && (net->window_probe)) { win_probe_recovered = 1; /* * Find first chunk that was used with window probe * and clear the sent */ /* sa_ignore FREED_MEMORY */ TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) { if (tp1->window_probe) { /* move back to data send queue */ sctp_window_probe_recovery(stcb, asoc, tp1); break; } } } if (net->RTO == 0) { to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto); } else { to_ticks = MSEC_TO_TICKS(net->RTO); } if (net->flight_size) { j++; (void)SCTP_OS_TIMER_START(&net->rxt_timer.timer, to_ticks, sctp_timeout_handler, &net->rxt_timer); if (net->window_probe) { net->window_probe = 0; } } else { if (net->window_probe) { /* * In window probes we must assure a timer * is still running there */ net->window_probe = 0; if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { SCTP_OS_TIMER_START(&net->rxt_timer.timer, to_ticks, sctp_timeout_handler, &net->rxt_timer); } } else if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_25); } } } if ((j == 0) && (!TAILQ_EMPTY(&asoc->sent_queue)) && (asoc->sent_queue_retran_cnt == 0) && (win_probe_recovered == 0) && (done_once == 0)) { /* * huh, this should not happen unless all packets are * PR-SCTP and marked to skip of course. */ if (sctp_fs_audit(asoc)) { TAILQ_FOREACH(net, &asoc->nets, sctp_next) { net->flight_size = 0; } asoc->total_flight = 0; asoc->total_flight_count = 0; asoc->sent_queue_retran_cnt = 0; TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) { if (tp1->sent < SCTP_DATAGRAM_RESEND) { sctp_flight_size_increase(tp1); sctp_total_flight_increase(stcb, tp1); } else if (tp1->sent == SCTP_DATAGRAM_RESEND) { sctp_ucount_incr(asoc->sent_queue_retran_cnt); } } } done_once = 1; goto again; } /**********************************/ /* Now what about shutdown issues */ /**********************************/ if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue)) { /* nothing left on sendqueue.. consider done */ /* clean up */ if ((asoc->stream_queue_cnt == 1) && ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED)) && (asoc->locked_on_sending) ) { struct sctp_stream_queue_pending *sp; /* * I may be in a state where we got all across.. but * cannot write more due to a shutdown... we abort * since the user did not indicate EOR in this case. * The sp will be cleaned during free of the asoc. */ sp = TAILQ_LAST(&((asoc->locked_on_sending)->outqueue), sctp_streamhead); if ((sp) && (sp->length == 0)) { /* Let cleanup code purge it */ if (sp->msg_is_complete) { asoc->stream_queue_cnt--; } else { asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; asoc->locked_on_sending = NULL; asoc->stream_queue_cnt--; } } } if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) && (asoc->stream_queue_cnt == 0)) { if (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT) { /* Need to abort here */ struct mbuf *op_err; abort_out_now: *abort_now = 1; /* XXX */ op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_26; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); } else { struct sctp_nets *netp; if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_stop_timers_for_shutdown(stcb); if (asoc->alternate) { netp = asoc->alternate; } else { netp = asoc->primary_destination; } sctp_send_shutdown(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); } } else if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) && (asoc->stream_queue_cnt == 0)) { struct sctp_nets *netp; if (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT) { goto abort_out_now; } SCTP_STAT_DECR_GAUGE32(sctps_currestab); SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_stop_timers_for_shutdown(stcb); if (asoc->alternate) { netp = asoc->alternate; } else { netp = asoc->primary_destination; } sctp_send_shutdown_ack(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, netp); } } /*********************************************/ /* Here we perform PR-SCTP procedures */ /* (section 4.2) */ /*********************************************/ /* C1. update advancedPeerAckPoint */ if (SCTP_TSN_GT(cumack, asoc->advanced_peer_ack_point)) { asoc->advanced_peer_ack_point = cumack; } /* PR-Sctp issues need to be addressed too */ if ((asoc->prsctp_supported) && (asoc->pr_sctp_cnt > 0)) { struct sctp_tmit_chunk *lchk; uint32_t old_adv_peer_ack_point; old_adv_peer_ack_point = asoc->advanced_peer_ack_point; lchk = sctp_try_advance_peer_ack_point(stcb, asoc); /* C3. See if we need to send a Fwd-TSN */ if (SCTP_TSN_GT(asoc->advanced_peer_ack_point, cumack)) { /* * ISSUE with ECN, see FWD-TSN processing. */ if (SCTP_TSN_GT(asoc->advanced_peer_ack_point, old_adv_peer_ack_point)) { send_forward_tsn(stcb, asoc); } else if (lchk) { /* try to FR fwd-tsn's that get lost too */ if (lchk->rec.data.fwd_tsn_cnt >= 3) { send_forward_tsn(stcb, asoc); } } } if (lchk) { /* Assure a timer is up */ sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, lchk->whoTo); } } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_RWND_LOGGING_ENABLE) { sctp_misc_ints(SCTP_SACK_RWND_UPDATE, rwnd, stcb->asoc.peers_rwnd, stcb->asoc.total_flight, stcb->asoc.total_output_queue_size); } } void sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, struct sctp_tcb *stcb, uint16_t num_seg, uint16_t num_nr_seg, uint16_t num_dup, int *abort_now, uint8_t flags, uint32_t cum_ack, uint32_t rwnd, int ecne_seen) { struct sctp_association *asoc; struct sctp_tmit_chunk *tp1, *tp2; uint32_t last_tsn, biggest_tsn_acked, biggest_tsn_newly_acked, this_sack_lowest_newack; uint16_t wake_him = 0; uint32_t send_s = 0; long j; int accum_moved = 0; int will_exit_fast_recovery = 0; uint32_t a_rwnd, old_rwnd; int win_probe_recovery = 0; int win_probe_recovered = 0; struct sctp_nets *net = NULL; int done_once; int rto_ok = 1; uint8_t reneged_all = 0; uint8_t cmt_dac_flag; /* * we take any chance we can to service our queues since we cannot * get awoken when the socket is read from :< */ /* * Now perform the actual SACK handling: 1) Verify that it is not an * old sack, if so discard. 2) If there is nothing left in the send * queue (cum-ack is equal to last acked) then you have a duplicate * too, update any rwnd change and verify no timers are running. * then return. 3) Process any new consequtive data i.e. cum-ack * moved process these first and note that it moved. 4) Process any * sack blocks. 5) Drop any acked from the queue. 6) Check for any * revoked blocks and mark. 7) Update the cwnd. 8) Nothing left, * sync up flightsizes and things, stop all timers and also check * for shutdown_pending state. If so then go ahead and send off the * shutdown. If in shutdown recv, send off the shutdown-ack and * start that timer, Ret. 9) Strike any non-acked things and do FR * procedure if needed being sure to set the FR flag. 10) Do pr-sctp * procedures. 11) Apply any FR penalties. 12) Assure we will SACK * if in shutdown_recv state. */ SCTP_TCB_LOCK_ASSERT(stcb); /* CMT DAC algo */ this_sack_lowest_newack = 0; SCTP_STAT_INCR(sctps_slowpath_sack); last_tsn = cum_ack; cmt_dac_flag = flags & SCTP_SACK_CMT_DAC; #ifdef SCTP_ASOCLOG_OF_TSNS stcb->asoc.cumack_log[stcb->asoc.cumack_log_at] = cum_ack; stcb->asoc.cumack_log_at++; if (stcb->asoc.cumack_log_at > SCTP_TSN_LOG_SIZE) { stcb->asoc.cumack_log_at = 0; } #endif a_rwnd = rwnd; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_SACK_ARRIVALS_ENABLE) { sctp_misc_ints(SCTP_SACK_LOG_NORMAL, cum_ack, rwnd, stcb->asoc.last_acked_seq, stcb->asoc.peers_rwnd); } old_rwnd = stcb->asoc.peers_rwnd; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INDATA, __LINE__); } stcb->asoc.overall_error_count = 0; asoc = &stcb->asoc; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { sctp_log_sack(asoc->last_acked_seq, cum_ack, 0, num_seg, num_dup, SCTP_LOG_NEW_SACK); } if ((num_dup) && (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE)) { uint16_t i; uint32_t *dupdata, dblock; for (i = 0; i < num_dup; i++) { dupdata = (uint32_t *) sctp_m_getptr(m, offset_dup + i * sizeof(uint32_t), sizeof(uint32_t), (uint8_t *) & dblock); if (dupdata == NULL) { break; } sctp_log_fr(*dupdata, 0, 0, SCTP_FR_DUPED); } } if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) { /* reality check */ if (!TAILQ_EMPTY(&asoc->sent_queue)) { tp1 = TAILQ_LAST(&asoc->sent_queue, sctpchunk_listhead); send_s = tp1->rec.data.TSN_seq + 1; } else { tp1 = NULL; send_s = asoc->sending_seq; } if (SCTP_TSN_GE(cum_ack, send_s)) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; /* * no way, we have not even sent this TSN out yet. * Peer is hopelessly messed up with us. */ SCTP_PRINTF("NEW cum_ack:%x send_s:%x is smaller or equal\n", cum_ack, send_s); if (tp1) { SCTP_PRINTF("Got send_s from tsn:%x + 1 of tp1:%p\n", tp1->rec.data.TSN_seq, (void *)tp1); } hopeless_peer: *abort_now = 1; /* XXX */ snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal than TSN %8.8x", cum_ack, send_s); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_27; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); return; } } /**********************/ /* 1) check the range */ /**********************/ if (SCTP_TSN_GT(asoc->last_acked_seq, last_tsn)) { /* acking something behind */ return; } /* update the Rwnd of the peer */ if (TAILQ_EMPTY(&asoc->sent_queue) && TAILQ_EMPTY(&asoc->send_queue) && (asoc->stream_queue_cnt == 0)) { /* nothing left on send/sent and strmq */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { sctp_log_rwnd_set(SCTP_SET_PEER_RWND_VIA_SACK, asoc->peers_rwnd, 0, 0, a_rwnd); } asoc->peers_rwnd = a_rwnd; if (asoc->sent_queue_retran_cnt) { asoc->sent_queue_retran_cnt = 0; } if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { /* SWS sender side engages */ asoc->peers_rwnd = 0; } /* stop any timers */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_28); net->partial_bytes_acked = 0; net->flight_size = 0; } asoc->total_flight = 0; asoc->total_flight_count = 0; return; } /* * We init netAckSz and netAckSz2 to 0. These are used to track 2 * things. The total byte count acked is tracked in netAckSz AND * netAck2 is used to track the total bytes acked that are un- * amibguious and were never retransmitted. We track these on a per * destination address basis. */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (SCTP_TSN_GT(cum_ack, net->cwr_window_tsn)) { /* Drag along the window_tsn for cwr's */ net->cwr_window_tsn = cum_ack; } net->prev_cwnd = net->cwnd; net->net_ack = 0; net->net_ack2 = 0; /* * CMT: Reset CUC and Fast recovery algo variables before * SACK processing */ net->new_pseudo_cumack = 0; net->will_exit_fast_recovery = 0; if (stcb->asoc.cc_functions.sctp_cwnd_prepare_net_for_sack) { (*stcb->asoc.cc_functions.sctp_cwnd_prepare_net_for_sack) (stcb, net); } } /* process the new consecutive TSN first */ TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) { if (SCTP_TSN_GE(last_tsn, tp1->rec.data.TSN_seq)) { if (tp1->sent != SCTP_DATAGRAM_UNSENT) { accum_moved = 1; if (tp1->sent < SCTP_DATAGRAM_ACKED) { /* * If it is less than ACKED, it is * now no-longer in flight. Higher * values may occur during marking */ if ((tp1->whoTo->dest_state & SCTP_ADDR_UNCONFIRMED) && (tp1->snd_count < 2)) { /* * If there was no retran * and the address is * un-confirmed and we sent * there and are now * sacked.. its confirmed, * mark it so. */ tp1->whoTo->dest_state &= ~SCTP_ADDR_UNCONFIRMED; } if (tp1->sent < SCTP_DATAGRAM_RESEND) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_CA, tp1->whoTo->flight_size, tp1->book_size, (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } sctp_flight_size_decrease(tp1); sctp_total_flight_decrease(stcb, tp1); if (stcb->asoc.cc_functions.sctp_cwnd_update_tsn_acknowledged) { (*stcb->asoc.cc_functions.sctp_cwnd_update_tsn_acknowledged) (tp1->whoTo, tp1); } } tp1->whoTo->net_ack += tp1->send_size; /* CMT SFR and DAC algos */ this_sack_lowest_newack = tp1->rec.data.TSN_seq; tp1->whoTo->saw_newack = 1; if (tp1->snd_count < 2) { /* * True non-retransmited * chunk */ tp1->whoTo->net_ack2 += tp1->send_size; /* update RTO too? */ if (tp1->do_rtt) { if (rto_ok) { tp1->whoTo->RTO = sctp_calculate_rto(stcb, asoc, tp1->whoTo, &tp1->sent_rcv_time, sctp_align_safe_nocopy, SCTP_RTT_FROM_DATA); rto_ok = 0; } if (tp1->whoTo->rto_needed == 0) { tp1->whoTo->rto_needed = 1; } tp1->do_rtt = 0; } } /* * CMT: CUCv2 algorithm. From the * cumack'd TSNs, for each TSN being * acked for the first time, set the * following variables for the * corresp destination. * new_pseudo_cumack will trigger a * cwnd update. * find_(rtx_)pseudo_cumack will * trigger search for the next * expected (rtx-)pseudo-cumack. */ tp1->whoTo->new_pseudo_cumack = 1; tp1->whoTo->find_pseudo_cumack = 1; tp1->whoTo->find_rtx_pseudo_cumack = 1; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { sctp_log_sack(asoc->last_acked_seq, cum_ack, tp1->rec.data.TSN_seq, 0, 0, SCTP_LOG_TSN_ACKED); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, tp1->whoTo, tp1->rec.data.TSN_seq, SCTP_CWND_LOG_FROM_SACK); } } if (tp1->sent == SCTP_DATAGRAM_RESEND) { sctp_ucount_decr(asoc->sent_queue_retran_cnt); #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xB3, (asoc->sent_queue_retran_cnt & 0x000000ff)); #endif } if (tp1->rec.data.chunk_was_revoked) { /* deflate the cwnd */ tp1->whoTo->cwnd -= tp1->book_size; tp1->rec.data.chunk_was_revoked = 0; } if (tp1->sent != SCTP_DATAGRAM_NR_ACKED) { tp1->sent = SCTP_DATAGRAM_ACKED; } } } else { break; } } biggest_tsn_newly_acked = biggest_tsn_acked = last_tsn; /* always set this up to cum-ack */ asoc->this_sack_highest_gap = last_tsn; if ((num_seg > 0) || (num_nr_seg > 0)) { /* * CMT: SFR algo (and HTNA) - this_sack_highest_newack has * to be greater than the cumack. Also reset saw_newack to 0 * for all dests. */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { net->saw_newack = 0; net->this_sack_highest_newack = last_tsn; } /* * thisSackHighestGap will increase while handling NEW * segments this_sack_highest_newack will increase while * handling NEWLY ACKED chunks. this_sack_lowest_newack is * used for CMT DAC algo. saw_newack will also change. */ if (sctp_handle_segments(m, &offset_seg, stcb, asoc, last_tsn, &biggest_tsn_acked, &biggest_tsn_newly_acked, &this_sack_lowest_newack, num_seg, num_nr_seg, &rto_ok)) { wake_him++; } if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) { /* * validate the biggest_tsn_acked in the gap acks if * strict adherence is wanted. */ if (SCTP_TSN_GE(biggest_tsn_acked, send_s)) { /* * peer is either confused or we are under * attack. We must abort. */ SCTP_PRINTF("Hopeless peer! biggest_tsn_acked:%x largest seq:%x\n", biggest_tsn_acked, send_s); goto hopeless_peer; } } } /*******************************************/ /* cancel ALL T3-send timer if accum moved */ /*******************************************/ if (asoc->sctp_cmt_on_off > 0) { TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (net->new_pseudo_cumack) sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_29); } } else { if (accum_moved) { TAILQ_FOREACH(net, &asoc->nets, sctp_next) { sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_30); } } } /********************************************/ /* drop the acked chunks from the sentqueue */ /********************************************/ asoc->last_acked_seq = cum_ack; TAILQ_FOREACH_SAFE(tp1, &asoc->sent_queue, sctp_next, tp2) { if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, cum_ack)) { break; } if (tp1->sent != SCTP_DATAGRAM_NR_ACKED) { if (asoc->strmout[tp1->rec.data.stream_number].chunks_on_queues > 0) { asoc->strmout[tp1->rec.data.stream_number].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", tp1->rec.data.stream_number); #endif } } TAILQ_REMOVE(&asoc->sent_queue, tp1, sctp_next); if (PR_SCTP_ENABLED(tp1->flags)) { if (asoc->pr_sctp_cnt != 0) asoc->pr_sctp_cnt--; } asoc->sent_queue_cnt--; if (tp1->data) { /* sa_ignore NO_NULL_CHK */ sctp_free_bufspace(stcb, asoc, tp1, 1); sctp_m_freem(tp1->data); tp1->data = NULL; if (asoc->prsctp_supported && PR_SCTP_BUF_ENABLED(tp1->flags)) { asoc->sent_queue_cnt_removeable--; } } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { sctp_log_sack(asoc->last_acked_seq, cum_ack, tp1->rec.data.TSN_seq, 0, 0, SCTP_LOG_FREE_SENT); } sctp_free_a_chunk(stcb, tp1, SCTP_SO_NOT_LOCKED); wake_him++; } if (TAILQ_EMPTY(&asoc->sent_queue) && (asoc->total_flight > 0)) { #ifdef INVARIANTS panic("Warning flight size is postive and should be 0"); #else SCTP_PRINTF("Warning flight size incorrect should be 0 is %d\n", asoc->total_flight); #endif asoc->total_flight = 0; } /* sa_ignore NO_NULL_CHK */ if ((wake_him) && (stcb->sctp_socket)) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif SOCKBUF_LOCK(&stcb->sctp_socket->so_snd); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_WAKE_LOGGING_ENABLE) { sctp_wakeup_log(stcb, wake_him, SCTP_WAKESND_FROM_SACK); } #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { /* assoc was freed while we were unlocked */ SCTP_SOCKET_UNLOCK(so, 1); return; } #endif sctp_sowwakeup_locked(stcb->sctp_ep, stcb->sctp_socket); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_WAKE_LOGGING_ENABLE) { sctp_wakeup_log(stcb, wake_him, SCTP_NOWAKE_FROM_SACK); } } if (asoc->fast_retran_loss_recovery && accum_moved) { if (SCTP_TSN_GE(asoc->last_acked_seq, asoc->fast_recovery_tsn)) { /* Setup so we will exit RFC2582 fast recovery */ will_exit_fast_recovery = 1; } } /* * Check for revoked fragments: * * if Previous sack - Had no frags then we can't have any revoked if * Previous sack - Had frag's then - If we now have frags aka * num_seg > 0 call sctp_check_for_revoked() to tell if peer revoked * some of them. else - The peer revoked all ACKED fragments, since * we had some before and now we have NONE. */ if (num_seg) { sctp_check_for_revoked(stcb, asoc, cum_ack, biggest_tsn_acked); asoc->saw_sack_with_frags = 1; } else if (asoc->saw_sack_with_frags) { int cnt_revoked = 0; /* Peer revoked all dg's marked or acked */ TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) { if (tp1->sent == SCTP_DATAGRAM_ACKED) { tp1->sent = SCTP_DATAGRAM_SENT; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_UP_REVOKE, tp1->whoTo->flight_size, tp1->book_size, (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } sctp_flight_size_increase(tp1); sctp_total_flight_increase(stcb, tp1); tp1->rec.data.chunk_was_revoked = 1; /* * To ensure that this increase in * flightsize, which is artificial, does not * throttle the sender, we also increase the * cwnd artificially. */ tp1->whoTo->cwnd += tp1->book_size; cnt_revoked++; } } if (cnt_revoked) { reneged_all = 1; } asoc->saw_sack_with_frags = 0; } if (num_nr_seg > 0) asoc->saw_sack_with_nr_frags = 1; else asoc->saw_sack_with_nr_frags = 0; /* JRS - Use the congestion control given in the CC module */ if (ecne_seen == 0) { TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (net->net_ack2 > 0) { /* * Karn's rule applies to clearing error * count, this is optional. */ net->error_count = 0; if (!(net->dest_state & SCTP_ADDR_REACHABLE)) { /* addr came good */ net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, (void *)net, SCTP_SO_NOT_LOCKED); } if (net == stcb->asoc.primary_destination) { if (stcb->asoc.alternate) { /* * release the alternate, * primary is good */ sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } } if (net->dest_state & SCTP_ADDR_PF) { net->dest_state &= ~SCTP_ADDR_PF; sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_31); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); asoc->cc_functions.sctp_cwnd_update_exit_pf(stcb, net); /* Done with this net */ net->net_ack = 0; } /* restore any doubled timers */ net->RTO = (net->lastsa >> SCTP_RTT_SHIFT) + net->lastsv; if (net->RTO < stcb->asoc.minrto) { net->RTO = stcb->asoc.minrto; } if (net->RTO > stcb->asoc.maxrto) { net->RTO = stcb->asoc.maxrto; } } } asoc->cc_functions.sctp_cwnd_update_after_sack(stcb, asoc, accum_moved, reneged_all, will_exit_fast_recovery); } if (TAILQ_EMPTY(&asoc->sent_queue)) { /* nothing left in-flight */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { /* stop all timers */ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32); net->flight_size = 0; net->partial_bytes_acked = 0; } asoc->total_flight = 0; asoc->total_flight_count = 0; } /**********************************/ /* Now what about shutdown issues */ /**********************************/ if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue)) { /* nothing left on sendqueue.. consider done */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { sctp_log_rwnd_set(SCTP_SET_PEER_RWND_VIA_SACK, asoc->peers_rwnd, 0, 0, a_rwnd); } asoc->peers_rwnd = a_rwnd; if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { /* SWS sender side engages */ asoc->peers_rwnd = 0; } /* clean up */ if ((asoc->stream_queue_cnt == 1) && ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED)) && (asoc->locked_on_sending) ) { struct sctp_stream_queue_pending *sp; /* * I may be in a state where we got all across.. but * cannot write more due to a shutdown... we abort * since the user did not indicate EOR in this case. */ sp = TAILQ_LAST(&((asoc->locked_on_sending)->outqueue), sctp_streamhead); if ((sp) && (sp->length == 0)) { asoc->locked_on_sending = NULL; if (sp->msg_is_complete) { asoc->stream_queue_cnt--; } else { asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; asoc->stream_queue_cnt--; } } } if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) && (asoc->stream_queue_cnt == 0)) { if (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT) { /* Need to abort here */ struct mbuf *op_err; abort_out_now: *abort_now = 1; /* XXX */ op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_33; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); return; } else { struct sctp_nets *netp; if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_stop_timers_for_shutdown(stcb); if (asoc->alternate) { netp = asoc->alternate; } else { netp = asoc->primary_destination; } sctp_send_shutdown(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); } return; } else if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) && (asoc->stream_queue_cnt == 0)) { struct sctp_nets *netp; if (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT) { goto abort_out_now; } SCTP_STAT_DECR_GAUGE32(sctps_currestab); SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_stop_timers_for_shutdown(stcb); if (asoc->alternate) { netp = asoc->alternate; } else { netp = asoc->primary_destination; } sctp_send_shutdown_ack(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, netp); return; } } /* * Now here we are going to recycle net_ack for a different use... * HEADS UP. */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { net->net_ack = 0; } /* * CMT DAC algorithm: If SACK DAC flag was 0, then no extra marking * to be done. Setting this_sack_lowest_newack to the cum_ack will * automatically ensure that. */ if ((asoc->sctp_cmt_on_off > 0) && SCTP_BASE_SYSCTL(sctp_cmt_use_dac) && (cmt_dac_flag == 0)) { this_sack_lowest_newack = cum_ack; } if ((num_seg > 0) || (num_nr_seg > 0)) { sctp_strike_gap_ack_chunks(stcb, asoc, biggest_tsn_acked, biggest_tsn_newly_acked, this_sack_lowest_newack, accum_moved); } /* JRS - Use the congestion control given in the CC module */ asoc->cc_functions.sctp_cwnd_update_after_fr(stcb, asoc); /* Now are we exiting loss recovery ? */ if (will_exit_fast_recovery) { /* Ok, we must exit fast recovery */ asoc->fast_retran_loss_recovery = 0; } if ((asoc->sat_t3_loss_recovery) && SCTP_TSN_GE(asoc->last_acked_seq, asoc->sat_t3_recovery_tsn)) { /* end satellite t3 loss recovery */ asoc->sat_t3_loss_recovery = 0; } /* * CMT Fast recovery */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (net->will_exit_fast_recovery) { /* Ok, we must exit fast recovery */ net->fast_retran_loss_recovery = 0; } } /* Adjust and set the new rwnd value */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { sctp_log_rwnd_set(SCTP_SET_PEER_RWND_VIA_SACK, asoc->peers_rwnd, asoc->total_flight, (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)), a_rwnd); } asoc->peers_rwnd = sctp_sbspace_sub(a_rwnd, (uint32_t) (asoc->total_flight + (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)))); if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { /* SWS sender side engages */ asoc->peers_rwnd = 0; } if (asoc->peers_rwnd > old_rwnd) { win_probe_recovery = 1; } /* * Now we must setup so we have a timer up for anyone with * outstanding data. */ done_once = 0; again: j = 0; TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (win_probe_recovery && (net->window_probe)) { win_probe_recovered = 1; /*- * Find first chunk that was used with * window probe and clear the event. Put * it back into the send queue as if has * not been sent. */ TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) { if (tp1->window_probe) { sctp_window_probe_recovery(stcb, asoc, tp1); break; } } } if (net->flight_size) { j++; if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net); } if (net->window_probe) { net->window_probe = 0; } } else { if (net->window_probe) { /* * In window probes we must assure a timer * is still running there */ if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net); } } else if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_34); } } } if ((j == 0) && (!TAILQ_EMPTY(&asoc->sent_queue)) && (asoc->sent_queue_retran_cnt == 0) && (win_probe_recovered == 0) && (done_once == 0)) { /* * huh, this should not happen unless all packets are * PR-SCTP and marked to skip of course. */ if (sctp_fs_audit(asoc)) { TAILQ_FOREACH(net, &asoc->nets, sctp_next) { net->flight_size = 0; } asoc->total_flight = 0; asoc->total_flight_count = 0; asoc->sent_queue_retran_cnt = 0; TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) { if (tp1->sent < SCTP_DATAGRAM_RESEND) { sctp_flight_size_increase(tp1); sctp_total_flight_increase(stcb, tp1); } else if (tp1->sent == SCTP_DATAGRAM_RESEND) { sctp_ucount_incr(asoc->sent_queue_retran_cnt); } } } done_once = 1; goto again; } /*********************************************/ /* Here we perform PR-SCTP procedures */ /* (section 4.2) */ /*********************************************/ /* C1. update advancedPeerAckPoint */ if (SCTP_TSN_GT(cum_ack, asoc->advanced_peer_ack_point)) { asoc->advanced_peer_ack_point = cum_ack; } /* C2. try to further move advancedPeerAckPoint ahead */ if ((asoc->prsctp_supported) && (asoc->pr_sctp_cnt > 0)) { struct sctp_tmit_chunk *lchk; uint32_t old_adv_peer_ack_point; old_adv_peer_ack_point = asoc->advanced_peer_ack_point; lchk = sctp_try_advance_peer_ack_point(stcb, asoc); /* C3. See if we need to send a Fwd-TSN */ if (SCTP_TSN_GT(asoc->advanced_peer_ack_point, cum_ack)) { /* * ISSUE with ECN, see FWD-TSN processing. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { sctp_misc_ints(SCTP_FWD_TSN_CHECK, 0xee, cum_ack, asoc->advanced_peer_ack_point, old_adv_peer_ack_point); } if (SCTP_TSN_GT(asoc->advanced_peer_ack_point, old_adv_peer_ack_point)) { send_forward_tsn(stcb, asoc); } else if (lchk) { /* try to FR fwd-tsn's that get lost too */ if (lchk->rec.data.fwd_tsn_cnt >= 3) { send_forward_tsn(stcb, asoc); } } } if (lchk) { /* Assure a timer is up */ sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, lchk->whoTo); } } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_RWND_LOGGING_ENABLE) { sctp_misc_ints(SCTP_SACK_RWND_UPDATE, a_rwnd, stcb->asoc.peers_rwnd, stcb->asoc.total_flight, stcb->asoc.total_output_queue_size); } } void sctp_update_acked(struct sctp_tcb *stcb, struct sctp_shutdown_chunk *cp, int *abort_flag) { /* Copy cum-ack */ uint32_t cum_ack, a_rwnd; cum_ack = ntohl(cp->cumulative_tsn_ack); /* Arrange so a_rwnd does NOT change */ a_rwnd = stcb->asoc.peers_rwnd + stcb->asoc.total_flight; /* Now call the express sack handling */ sctp_express_handle_sack(stcb, cum_ack, a_rwnd, abort_flag, 0); } static void sctp_kick_prsctp_reorder_queue(struct sctp_tcb *stcb, struct sctp_stream_in *strmin) { struct sctp_queued_to_read *ctl, *nctl; struct sctp_association *asoc; uint16_t tt; asoc = &stcb->asoc; tt = strmin->last_sequence_delivered; /* * First deliver anything prior to and including the stream no that * came in */ TAILQ_FOREACH_SAFE(ctl, &strmin->inqueue, next, nctl) { if (SCTP_SSN_GE(tt, ctl->sinfo_ssn)) { /* this is deliverable now */ TAILQ_REMOVE(&strmin->inqueue, ctl, next); /* subtract pending on streams */ asoc->size_on_all_streams -= ctl->length; sctp_ucount_decr(asoc->cnt_on_all_streams); /* deliver it to at least the delivery-q */ if (stcb->sctp_socket) { sctp_mark_non_revokable(asoc, ctl->sinfo_tsn); sctp_add_to_readq(stcb->sctp_ep, stcb, ctl, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED); } } else { /* no more delivery now. */ break; } } /* * now we must deliver things in queue the normal way if any are * now ready. */ tt = strmin->last_sequence_delivered + 1; TAILQ_FOREACH_SAFE(ctl, &strmin->inqueue, next, nctl) { if (tt == ctl->sinfo_ssn) { /* this is deliverable now */ TAILQ_REMOVE(&strmin->inqueue, ctl, next); /* subtract pending on streams */ asoc->size_on_all_streams -= ctl->length; sctp_ucount_decr(asoc->cnt_on_all_streams); /* deliver it to at least the delivery-q */ strmin->last_sequence_delivered = ctl->sinfo_ssn; if (stcb->sctp_socket) { sctp_mark_non_revokable(asoc, ctl->sinfo_tsn); sctp_add_to_readq(stcb->sctp_ep, stcb, ctl, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED); } tt = strmin->last_sequence_delivered + 1; } else { break; } } } static void sctp_flush_reassm_for_str_seq(struct sctp_tcb *stcb, struct sctp_association *asoc, uint16_t stream, uint16_t seq) { struct sctp_tmit_chunk *chk, *nchk; /* For each one on here see if we need to toss it */ /* * For now large messages held on the reasmqueue that are complete * will be tossed too. We could in theory do more work to spin * through and stop after dumping one msg aka seeing the start of a * new msg at the head, and call the delivery function... to see if * it can be delivered... But for now we just dump everything on the * queue. */ TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) { /* * Do not toss it if on a different stream or marked for * unordered delivery in which case the stream sequence * number has no meaning. */ if ((chk->rec.data.stream_number != stream) || ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == SCTP_DATA_UNORDERED)) { continue; } if (chk->rec.data.stream_seq == seq) { /* It needs to be tossed */ TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); if (SCTP_TSN_GT(chk->rec.data.TSN_seq, asoc->tsn_last_delivered)) { asoc->tsn_last_delivered = chk->rec.data.TSN_seq; asoc->str_of_pdapi = chk->rec.data.stream_number; asoc->ssn_of_pdapi = chk->rec.data.stream_seq; asoc->fragment_flags = chk->rec.data.rcv_flags; } asoc->size_on_reasm_queue -= chk->send_size; sctp_ucount_decr(asoc->cnt_on_reasm_queue); /* Clear up any stream problem */ if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) != SCTP_DATA_UNORDERED && SCTP_SSN_GT(chk->rec.data.stream_seq, asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered)) { /* * We must dump forward this streams * sequence number if the chunk is not * unordered that is being skipped. There is * a chance that if the peer does not * include the last fragment in its FWD-TSN * we WILL have a problem here since you * would have a partial chunk in queue that * may not be deliverable. Also if a Partial * delivery API as started the user may get * a partial chunk. The next read returning * a new chunk... really ugly but I see no * way around it! Maybe a notify?? */ asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered = chk->rec.data.stream_seq; } if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } else if (SCTP_SSN_GT(chk->rec.data.stream_seq, seq)) { /* * If the stream_seq is > than the purging one, we * are done */ break; } } } void sctp_handle_forward_tsn(struct sctp_tcb *stcb, struct sctp_forward_tsn_chunk *fwd, int *abort_flag, struct mbuf *m, int offset) { /* The pr-sctp fwd tsn */ /* * here we will perform all the data receiver side steps for * processing FwdTSN, as required in by pr-sctp draft: * * Assume we get FwdTSN(x): * * 1) update local cumTSN to x 2) try to further advance cumTSN to x + * others we have 3) examine and update re-ordering queue on * pr-in-streams 4) clean up re-assembly queue 5) Send a sack to * report where we are. */ struct sctp_association *asoc; uint32_t new_cum_tsn, gap; unsigned int i, fwd_sz, m_size; uint32_t str_seq; struct sctp_stream_in *strm; struct sctp_tmit_chunk *chk, *nchk; struct sctp_queued_to_read *ctl, *sv; asoc = &stcb->asoc; if ((fwd_sz = ntohs(fwd->ch.chunk_length)) < sizeof(struct sctp_forward_tsn_chunk)) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size too small/big fwd-tsn\n"); return; } m_size = (stcb->asoc.mapping_array_size << 3); /*************************************************************/ /* 1. Here we update local cumTSN and shift the bitmap array */ /*************************************************************/ new_cum_tsn = ntohl(fwd->new_cumulative_tsn); if (SCTP_TSN_GE(asoc->cumulative_tsn, new_cum_tsn)) { /* Already got there ... */ return; } /* * now we know the new TSN is more advanced, let's find the actual * gap */ SCTP_CALC_TSN_TO_GAP(gap, new_cum_tsn, asoc->mapping_array_base_tsn); asoc->cumulative_tsn = new_cum_tsn; if (gap >= m_size) { if ((long)gap > sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv)) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; /* * out of range (of single byte chunks in the rwnd I * give out). This must be an attacker. */ *abort_flag = 1; snprintf(msg, sizeof(msg), "New cum ack %8.8x too high, highest TSN %8.8x", new_cum_tsn, asoc->highest_tsn_inside_map); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_35; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); return; } SCTP_STAT_INCR(sctps_fwdtsn_map_over); memset(stcb->asoc.mapping_array, 0, stcb->asoc.mapping_array_size); asoc->mapping_array_base_tsn = new_cum_tsn + 1; asoc->highest_tsn_inside_map = new_cum_tsn; memset(stcb->asoc.nr_mapping_array, 0, stcb->asoc.mapping_array_size); asoc->highest_tsn_inside_nr_map = new_cum_tsn; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(0, 3, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } } else { SCTP_TCB_LOCK_ASSERT(stcb); for (i = 0; i <= gap; i++) { if (!SCTP_IS_TSN_PRESENT(asoc->mapping_array, i) && !SCTP_IS_TSN_PRESENT(asoc->nr_mapping_array, i)) { SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, i); if (SCTP_TSN_GT(asoc->mapping_array_base_tsn + i, asoc->highest_tsn_inside_nr_map)) { asoc->highest_tsn_inside_nr_map = asoc->mapping_array_base_tsn + i; } } } } /*************************************************************/ /* 2. Clear up re-assembly queue */ /*************************************************************/ /* * First service it if pd-api is up, just in case we can progress it * forward */ if (asoc->fragmented_delivery_inprogress) { sctp_service_reassembly(stcb, asoc); } /* For each one on here see if we need to toss it */ /* * For now large messages held on the reasmqueue that are complete * will be tossed too. We could in theory do more work to spin * through and stop after dumping one msg aka seeing the start of a * new msg at the head, and call the delivery function... to see if * it can be delivered... But for now we just dump everything on the * queue. */ TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) { if (SCTP_TSN_GE(new_cum_tsn, chk->rec.data.TSN_seq)) { /* It needs to be tossed */ TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); if (SCTP_TSN_GT(chk->rec.data.TSN_seq, asoc->tsn_last_delivered)) { asoc->tsn_last_delivered = chk->rec.data.TSN_seq; asoc->str_of_pdapi = chk->rec.data.stream_number; asoc->ssn_of_pdapi = chk->rec.data.stream_seq; asoc->fragment_flags = chk->rec.data.rcv_flags; } asoc->size_on_reasm_queue -= chk->send_size; sctp_ucount_decr(asoc->cnt_on_reasm_queue); /* Clear up any stream problem */ if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) != SCTP_DATA_UNORDERED && SCTP_SSN_GT(chk->rec.data.stream_seq, asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered)) { /* * We must dump forward this streams * sequence number if the chunk is not * unordered that is being skipped. There is * a chance that if the peer does not * include the last fragment in its FWD-TSN * we WILL have a problem here since you * would have a partial chunk in queue that * may not be deliverable. Also if a Partial * delivery API as started the user may get * a partial chunk. The next read returning * a new chunk... really ugly but I see no * way around it! Maybe a notify?? */ asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered = chk->rec.data.stream_seq; } if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } else { /* * Ok we have gone beyond the end of the fwd-tsn's * mark. */ break; } } /*******************************************************/ /* 3. Update the PR-stream re-ordering queues and fix */ /* delivery issues as needed. */ /*******************************************************/ fwd_sz -= sizeof(*fwd); if (m && fwd_sz) { /* New method. */ unsigned int num_str; struct sctp_strseq *stseq, strseqbuf; offset += sizeof(*fwd); SCTP_INP_READ_LOCK(stcb->sctp_ep); num_str = fwd_sz / sizeof(struct sctp_strseq); for (i = 0; i < num_str; i++) { uint16_t st; stseq = (struct sctp_strseq *)sctp_m_getptr(m, offset, sizeof(struct sctp_strseq), (uint8_t *) & strseqbuf); offset += sizeof(struct sctp_strseq); if (stseq == NULL) { break; } /* Convert */ st = ntohs(stseq->stream); stseq->stream = st; st = ntohs(stseq->sequence); stseq->sequence = st; /* now process */ /* * Ok we now look for the stream/seq on the read * queue where its not all delivered. If we find it * we transmute the read entry into a PDI_ABORTED. */ if (stseq->stream >= asoc->streamincnt) { /* screwed up streams, stop! */ break; } if ((asoc->str_of_pdapi == stseq->stream) && (asoc->ssn_of_pdapi == stseq->sequence)) { /* * If this is the one we were partially * delivering now then we no longer are. * Note this will change with the reassembly * re-write. */ asoc->fragmented_delivery_inprogress = 0; } sctp_flush_reassm_for_str_seq(stcb, asoc, stseq->stream, stseq->sequence); TAILQ_FOREACH(ctl, &stcb->sctp_ep->read_queue, next) { if ((ctl->sinfo_stream == stseq->stream) && (ctl->sinfo_ssn == stseq->sequence)) { str_seq = (stseq->stream << 16) | stseq->sequence; ctl->end_added = 1; ctl->pdapi_aborted = 1; sv = stcb->asoc.control_pdapi; stcb->asoc.control_pdapi = ctl; sctp_ulp_notify(SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION, stcb, SCTP_PARTIAL_DELIVERY_ABORTED, (void *)&str_seq, SCTP_SO_NOT_LOCKED); stcb->asoc.control_pdapi = sv; break; } else if ((ctl->sinfo_stream == stseq->stream) && SCTP_SSN_GT(ctl->sinfo_ssn, stseq->sequence)) { /* We are past our victim SSN */ break; } } strm = &asoc->strmin[stseq->stream]; if (SCTP_SSN_GT(stseq->sequence, strm->last_sequence_delivered)) { /* Update the sequence number */ strm->last_sequence_delivered = stseq->sequence; } /* now kick the stream the new way */ /* sa_ignore NO_NULL_CHK */ sctp_kick_prsctp_reorder_queue(stcb, strm); } SCTP_INP_READ_UNLOCK(stcb->sctp_ep); } /* * Now slide thing forward. */ sctp_slide_mapping_arrays(stcb); if (!TAILQ_EMPTY(&asoc->reasmqueue)) { /* now lets kick out and check for more fragmented delivery */ /* sa_ignore NO_NULL_CHK */ sctp_deliver_reasm_check(stcb, &stcb->asoc); } } Index: projects/iosched/sys/netinet/sctp_input.c =================================================================== --- projects/iosched/sys/netinet/sctp_input.c (revision 287721) +++ projects/iosched/sys/netinet/sctp_input.c (revision 287722) @@ -1,6231 +1,6219 @@ /*- * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #endif #include static void sctp_stop_all_cookie_timers(struct sctp_tcb *stcb) { struct sctp_nets *net; /* * This now not only stops all cookie timers it also stops any INIT * timers as well. This will make sure that the timers are stopped * in all collision cases. */ SCTP_TCB_LOCK_ASSERT(stcb); TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->rxt_timer.type == SCTP_TIMER_TYPE_COOKIE) { sctp_timer_stop(SCTP_TIMER_TYPE_COOKIE, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_1); } else if (net->rxt_timer.type == SCTP_TIMER_TYPE_INIT) { sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_2); } } } /* INIT handler */ static void sctp_handle_init(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_chunk *cp, struct sctp_inpcb *inp, struct sctp_tcb *stcb, int *abort_no_unlock, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_init *init; struct mbuf *op_err; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_init: handling INIT tcb:%p\n", (void *)stcb); if (stcb == NULL) { SCTP_INP_RLOCK(inp); } /* validate length */ if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_chunk)) { op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } /* validate parameters */ init = &cp->init; if (init->initiate_tag == 0) { /* protocol error... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } if (ntohl(init->a_rwnd) < SCTP_MIN_RWND) { /* invalid parameter... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } if (init->num_inbound_streams == 0) { /* protocol error... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } if (init->num_outbound_streams == 0) { /* protocol error... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } if (sctp_validate_init_auth_params(m, offset + sizeof(*cp), offset + ntohs(cp->ch.chunk_length))) { /* auth parameter(s) error... send abort */ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Problem with AUTH parameters"); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } /* * We are only accepting if we have a socket with positive * so_qlimit. */ if ((stcb == NULL) && ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_socket == NULL) || (inp->sctp_socket->so_qlimit == 0))) { /* * FIX ME ?? What about TCP model and we have a * match/restart case? Actually no fix is needed. the lookup * will always find the existing assoc so stcb would not be * NULL. It may be questionable to do this since we COULD * just send back the INIT-ACK and hope that the app did * accept()'s by the time the COOKIE was sent. But there is * a price to pay for COOKIE generation and I don't want to * pay it on the chance that the app will actually do some * accepts(). The App just looses and should NOT be in this * state :-) */ if (SCTP_BASE_SYSCTL(sctp_blackhole) == 0) { op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "No listener"); sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); } goto outnow; } if ((stcb != NULL) && (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT)) { SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending SHUTDOWN-ACK\n"); sctp_send_shutdown_ack(stcb, NULL); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); } else { SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending INIT-ACK\n"); sctp_send_initiate_ack(inp, stcb, m, iphlen, offset, src, dst, sh, cp, mflowtype, mflowid, vrf_id, port, ((stcb == NULL) ? SCTP_HOLDS_LOCK : SCTP_NOT_LOCKED)); } outnow: if (stcb == NULL) { SCTP_INP_RUNLOCK(inp); } } /* * process peer "INIT/INIT-ACK" chunk returns value < 0 on error */ int sctp_is_there_unsent_data(struct sctp_tcb *stcb, int so_locked #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ) { int unsent_data = 0; unsigned int i; struct sctp_stream_queue_pending *sp; struct sctp_association *asoc; /* * This function returns the number of streams that have true unsent * data on them. Note that as it looks through it will clean up any * places that have old data that has been sent but left at top of * stream queue. */ asoc = &stcb->asoc; SCTP_TCB_SEND_LOCK(stcb); if (!stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { /* Check to see if some data queued */ for (i = 0; i < stcb->asoc.streamoutcnt; i++) { /* sa_ignore FREED_MEMORY */ sp = TAILQ_FIRST(&stcb->asoc.strmout[i].outqueue); if (sp == NULL) { continue; } if ((sp->msg_is_complete) && (sp->length == 0) && (sp->sender_all_done)) { /* * We are doing differed cleanup. Last time * through when we took all the data the * sender_all_done was not set. */ if (sp->put_last_out == 0) { SCTP_PRINTF("Gak, put out entire msg with NO end!-1\n"); SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d\n", sp->sender_all_done, sp->length, sp->msg_is_complete, sp->put_last_out); } atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1); TAILQ_REMOVE(&stcb->asoc.strmout[i].outqueue, sp, next); if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; } sctp_free_a_strmoq(stcb, sp, so_locked); } else { unsent_data++; break; } } } SCTP_TCB_SEND_UNLOCK(stcb); return (unsent_data); } static int sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb) { struct sctp_init *init; struct sctp_association *asoc; struct sctp_nets *lnet; unsigned int i; init = &cp->init; asoc = &stcb->asoc; /* save off parameters */ asoc->peer_vtag = ntohl(init->initiate_tag); asoc->peers_rwnd = ntohl(init->a_rwnd); /* init tsn's */ asoc->highest_tsn_inside_map = asoc->asconf_seq_in = ntohl(init->initial_tsn) - 1; if (!TAILQ_EMPTY(&asoc->nets)) { /* update any ssthresh's that may have a default */ TAILQ_FOREACH(lnet, &asoc->nets, sctp_next) { lnet->ssthresh = asoc->peers_rwnd; if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) { sctp_log_cwnd(stcb, lnet, 0, SCTP_CWND_INITIALIZATION); } } } SCTP_TCB_SEND_LOCK(stcb); if (asoc->pre_open_streams > ntohs(init->num_inbound_streams)) { unsigned int newcnt; struct sctp_stream_out *outs; struct sctp_stream_queue_pending *sp, *nsp; struct sctp_tmit_chunk *chk, *nchk; /* abandon the upper streams */ newcnt = ntohs(init->num_inbound_streams); TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) { if (chk->rec.data.stream_number >= newcnt) { TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next); asoc->send_queue_cnt--; if (asoc->strmout[chk->rec.data.stream_number].chunks_on_queues > 0) { asoc->strmout[chk->rec.data.stream_number].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", chk->rec.data.stream_number); #endif } if (chk->data != NULL) { sctp_free_bufspace(stcb, asoc, chk, 1); sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb, 0, chk, SCTP_SO_NOT_LOCKED); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); /* sa_ignore FREED_MEMORY */ } } if (asoc->strmout) { for (i = newcnt; i < asoc->pre_open_streams; i++) { outs = &asoc->strmout[i]; TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) { TAILQ_REMOVE(&outs->outqueue, sp, next); asoc->stream_queue_cnt--; sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb, 0, sp, SCTP_SO_NOT_LOCKED); if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; } if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } /* Free the chunk */ sctp_free_a_strmoq(stcb, sp, SCTP_SO_NOT_LOCKED); /* sa_ignore FREED_MEMORY */ } outs->state = SCTP_STREAM_CLOSED; } } /* cut back the count */ asoc->pre_open_streams = newcnt; } SCTP_TCB_SEND_UNLOCK(stcb); asoc->streamoutcnt = asoc->pre_open_streams; for (i = 0; i < asoc->streamoutcnt; i++) { asoc->strmout[i].state = SCTP_STREAM_OPEN; } /* EY - nr_sack: initialize highest tsn in nr_mapping_array */ asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(0, 5, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } /* This is the next one we expect */ asoc->str_reset_seq_in = asoc->asconf_seq_in + 1; asoc->mapping_array_base_tsn = ntohl(init->initial_tsn); asoc->tsn_last_delivered = asoc->cumulative_tsn = asoc->asconf_seq_in; asoc->advanced_peer_ack_point = asoc->last_acked_seq; /* open the requested streams */ if (asoc->strmin != NULL) { /* Free the old ones */ struct sctp_queued_to_read *ctl, *nctl; for (i = 0; i < asoc->streamincnt; i++) { TAILQ_FOREACH_SAFE(ctl, &asoc->strmin[i].inqueue, next, nctl) { TAILQ_REMOVE(&asoc->strmin[i].inqueue, ctl, next); sctp_free_remote_addr(ctl->whoFrom); ctl->whoFrom = NULL; sctp_m_freem(ctl->data); ctl->data = NULL; sctp_free_a_readq(stcb, ctl); } } SCTP_FREE(asoc->strmin, SCTP_M_STRMI); } if (asoc->max_inbound_streams > ntohs(init->num_outbound_streams)) { asoc->streamincnt = ntohs(init->num_outbound_streams); } else { asoc->streamincnt = asoc->max_inbound_streams; } SCTP_MALLOC(asoc->strmin, struct sctp_stream_in *, asoc->streamincnt * sizeof(struct sctp_stream_in), SCTP_M_STRMI); if (asoc->strmin == NULL) { /* we didn't get memory for the streams! */ SCTPDBG(SCTP_DEBUG_INPUT2, "process_init: couldn't get memory for the streams!\n"); return (-1); } for (i = 0; i < asoc->streamincnt; i++) { asoc->strmin[i].stream_no = i; asoc->strmin[i].last_sequence_delivered = 0xffff; TAILQ_INIT(&asoc->strmin[i].inqueue); asoc->strmin[i].delivery_started = 0; } /* * load_address_from_init will put the addresses into the * association when the COOKIE is processed or the INIT-ACK is * processed. Both types of COOKIE's existing and new call this * routine. It will remove addresses that are no longer in the * association (for the restarting case where addresses are * removed). Up front when the INIT arrives we will discard it if it * is a restart and new addresses have been added. */ /* sa_ignore MEMLEAK */ return (0); } /* * INIT-ACK message processing/consumption returns value < 0 on error */ static int sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id) { struct sctp_association *asoc; struct mbuf *op_err; int retval, abort_flag; uint32_t initack_limit; int nat_friendly = 0; /* First verify that we have no illegal param's */ abort_flag = 0; op_err = sctp_arethere_unrecognized_parameters(m, (offset + sizeof(struct sctp_init_chunk)), &abort_flag, (struct sctp_chunkhdr *)cp, &nat_friendly); if (abort_flag) { /* Send an abort and notify peer */ sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_no_unlock = 1; return (-1); } asoc = &stcb->asoc; asoc->peer_supports_nat = (uint8_t) nat_friendly; /* process the peer's parameters in the INIT-ACK */ retval = sctp_process_init((struct sctp_init_chunk *)cp, stcb); if (retval < 0) { return (retval); } initack_limit = offset + ntohs(cp->ch.chunk_length); /* load all addresses */ if ((retval = sctp_load_addresses_from_init(stcb, m, (offset + sizeof(struct sctp_init_chunk)), initack_limit, src, dst, NULL))) { op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Problem with address parameters"); SCTPDBG(SCTP_DEBUG_INPUT1, "Load addresses from INIT causes an abort %d\n", retval); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } /* if the peer doesn't support asconf, flush the asconf queue */ if (asoc->asconf_supported == 0) { struct sctp_asconf_addr *param, *nparam; TAILQ_FOREACH_SAFE(param, &asoc->asconf_queue, next, nparam) { TAILQ_REMOVE(&asoc->asconf_queue, param, next); SCTP_FREE(param, SCTP_M_ASC_ADDR); } } stcb->asoc.peer_hmac_id = sctp_negotiate_hmacid(stcb->asoc.peer_hmacs, stcb->asoc.local_hmacs); if (op_err) { sctp_queue_op_err(stcb, op_err); /* queuing will steal away the mbuf chain to the out queue */ op_err = NULL; } /* extract the cookie and queue it to "echo" it back... */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; net->error_count = 0; /* * Cancel the INIT timer, We do this first before queueing the * cookie. We always cancel at the primary to assue that we are * canceling the timer started by the INIT which always goes to the * primary. */ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb, asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3); /* calculate the RTO */ net->RTO = sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, sctp_align_safe_nocopy, SCTP_RTT_FROM_NON_DATA); retval = sctp_send_cookie_echo(m, offset, stcb, net); if (retval < 0) { /* * No cookie, we probably should send a op error. But in any * case if there is no cookie in the INIT-ACK, we can * abandon the peer, its broke. */ if (retval == -3) { + uint16_t len; + + len = (uint16_t) (sizeof(struct sctp_error_missing_param) + sizeof(uint16_t)); /* We abort with an error of missing mandatory param */ - op_err = sctp_generate_cause(SCTP_CAUSE_MISSING_PARAM, ""); - if (op_err) { - /* - * Expand beyond to include the mandatory - * param cookie - */ - struct sctp_inv_mandatory_param *mp; + op_err = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); + if (op_err != NULL) { + struct sctp_error_missing_param *cause; - SCTP_BUF_LEN(op_err) = - sizeof(struct sctp_inv_mandatory_param); - mp = mtod(op_err, - struct sctp_inv_mandatory_param *); + SCTP_BUF_LEN(op_err) = len; + cause = mtod(op_err, struct sctp_error_missing_param *); /* Subtract the reserved param */ - mp->length = - htons(sizeof(struct sctp_inv_mandatory_param) - 2); - mp->num_param = htonl(1); - mp->param = htons(SCTP_STATE_COOKIE); - mp->resv = 0; + cause->cause.code = htons(SCTP_CAUSE_MISSING_PARAM); + cause->cause.length = htons(len); + cause->num_missing_params = htonl(1); + cause->type[0] = htons(SCTP_STATE_COOKIE); } sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; } return (retval); } return (0); } static void sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net) { union sctp_sockstore store; struct sctp_nets *r_net, *f_net; struct timeval tv; int req_prim = 0; uint16_t old_error_counter; if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_heartbeat_chunk)) { /* Invalid length */ return; } memset(&store, 0, sizeof(store)); switch (cp->heartbeat.hb_info.addr_family) { #ifdef INET case AF_INET: if (cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in)) { store.sin.sin_family = cp->heartbeat.hb_info.addr_family; store.sin.sin_len = cp->heartbeat.hb_info.addr_len; store.sin.sin_port = stcb->rport; memcpy(&store.sin.sin_addr, cp->heartbeat.hb_info.address, sizeof(store.sin.sin_addr)); } else { return; } break; #endif #ifdef INET6 case AF_INET6: if (cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in6)) { store.sin6.sin6_family = cp->heartbeat.hb_info.addr_family; store.sin6.sin6_len = cp->heartbeat.hb_info.addr_len; store.sin6.sin6_port = stcb->rport; memcpy(&store.sin6.sin6_addr, cp->heartbeat.hb_info.address, sizeof(struct in6_addr)); } else { return; } break; #endif default: return; } r_net = sctp_findnet(stcb, &store.sa); if (r_net == NULL) { SCTPDBG(SCTP_DEBUG_INPUT1, "Huh? I can't find the address I sent it to, discard\n"); return; } if ((r_net && (r_net->dest_state & SCTP_ADDR_UNCONFIRMED)) && (r_net->heartbeat_random1 == cp->heartbeat.hb_info.random_value1) && (r_net->heartbeat_random2 == cp->heartbeat.hb_info.random_value2)) { /* * If the its a HB and it's random value is correct when can * confirm the destination. */ r_net->dest_state &= ~SCTP_ADDR_UNCONFIRMED; if (r_net->dest_state & SCTP_ADDR_REQ_PRIMARY) { stcb->asoc.primary_destination = r_net; r_net->dest_state &= ~SCTP_ADDR_REQ_PRIMARY; f_net = TAILQ_FIRST(&stcb->asoc.nets); if (f_net != r_net) { /* * first one on the list is NOT the primary * sctp_cmpaddr() is much more efficent if * the primary is the first on the list, * make it so. */ TAILQ_REMOVE(&stcb->asoc.nets, r_net, sctp_next); TAILQ_INSERT_HEAD(&stcb->asoc.nets, r_net, sctp_next); } req_prim = 1; } sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, stcb, 0, (void *)r_net, SCTP_SO_NOT_LOCKED); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_4); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net); } old_error_counter = r_net->error_count; r_net->error_count = 0; r_net->hb_responded = 1; tv.tv_sec = cp->heartbeat.hb_info.time_value_1; tv.tv_usec = cp->heartbeat.hb_info.time_value_2; /* Now lets do a RTO with this */ r_net->RTO = sctp_calculate_rto(stcb, &stcb->asoc, r_net, &tv, sctp_align_safe_nocopy, SCTP_RTT_FROM_NON_DATA); if (!(r_net->dest_state & SCTP_ADDR_REACHABLE)) { r_net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, (void *)r_net, SCTP_SO_NOT_LOCKED); } if (r_net->dest_state & SCTP_ADDR_PF) { r_net->dest_state &= ~SCTP_ADDR_PF; stcb->asoc.cc_functions.sctp_cwnd_update_exit_pf(stcb, net); } if (old_error_counter > 0) { sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_5); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net); } if (r_net == stcb->asoc.primary_destination) { if (stcb->asoc.alternate) { /* release the alternate, primary is good */ sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } } /* Mobility adaptation */ if (req_prim) { if ((sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE) || sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) && sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_PRIM_DELETED)) { sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_INPUT + SCTP_LOC_6); if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_assoc_immediate_retrans(stcb, stcb->asoc.primary_destination); } if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) { sctp_move_chunks_from_net(stcb, stcb->asoc.deleted_primary); } sctp_delete_prim_timer(stcb->sctp_ep, stcb, stcb->asoc.deleted_primary); } } } static int sctp_handle_nat_colliding_state(struct sctp_tcb *stcb) { /* * return 0 means we want you to proceed with the abort non-zero * means no abort processing */ struct sctpasochead *head; if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_WAIT) { /* generate a new vtag and send init */ LIST_REMOVE(stcb, sctp_asocs); stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); return (1); } if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) { /* * treat like a case where the cookie expired i.e.: - dump * current cookie. - generate a new vtag. - resend init. */ /* generate a new vtag and send init */ LIST_REMOVE(stcb, sctp_asocs); stcb->asoc.state &= ~SCTP_STATE_COOKIE_ECHOED; stcb->asoc.state |= SCTP_STATE_COOKIE_WAIT; sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, &stcb->asoc); stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); return (1); } return (0); } static int sctp_handle_nat_missing_state(struct sctp_tcb *stcb, struct sctp_nets *net) { /* * return 0 means we want you to proceed with the abort non-zero * means no abort processing */ if (stcb->asoc.auth_supported == 0) { SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_nat_missing_state: Peer does not support AUTH, cannot send an asconf\n"); return (0); } sctp_asconf_send_nat_state_update(stcb, net); return (1); } static void sctp_handle_abort(struct sctp_abort_chunk *abort, struct sctp_tcb *stcb, struct sctp_nets *net) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif uint16_t len; uint16_t error; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_abort: handling ABORT\n"); if (stcb == NULL) return; len = ntohs(abort->ch.chunk_length); if (len > sizeof(struct sctp_chunkhdr)) { /* * Need to check the cause codes for our two magic nat * aborts which don't kill the assoc necessarily. */ - struct sctp_missing_nat_state *natc; + struct sctp_gen_error_cause *cause; - natc = (struct sctp_missing_nat_state *)(abort + 1); - error = ntohs(natc->cause); + cause = (struct sctp_gen_error_cause *)(abort + 1); + error = ntohs(cause->code); if (error == SCTP_CAUSE_NAT_COLLIDING_STATE) { SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state abort flags:%x\n", abort->ch.chunk_flags); if (sctp_handle_nat_colliding_state(stcb)) { return; } } else if (error == SCTP_CAUSE_NAT_MISSING_STATE) { SCTPDBG(SCTP_DEBUG_INPUT2, "Received missing state abort flags:%x\n", abort->ch.chunk_flags); if (sctp_handle_nat_missing_state(stcb, net)) { return; } } } else { error = 0; } /* stop any receive timers */ sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_7); /* notify user of the abort and clean up... */ sctp_abort_notification(stcb, 1, error, abort, SCTP_SO_NOT_LOCKED); /* free the tcb */ SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } #ifdef SCTP_ASOCLOG_OF_TSNS sctp_print_out_track_log(stcb); #endif #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif stcb->asoc.state |= SCTP_STATE_WAS_ABORTED; (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_8); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_abort: finished\n"); } static void sctp_start_net_timers(struct sctp_tcb *stcb) { uint32_t cnt_hb_sent; struct sctp_nets *net; cnt_hb_sent = 0; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* * For each network start: 1) A pmtu timer. 2) A HB timer 3) * If the dest in unconfirmed send a hb as well if under * max_hb_burst have been sent. */ sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, stcb->sctp_ep, stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); if ((net->dest_state & SCTP_ADDR_UNCONFIRMED) && (cnt_hb_sent < SCTP_BASE_SYSCTL(sctp_hb_maxburst))) { sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED); cnt_hb_sent++; } } if (cnt_hb_sent) { sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_COOKIE_ACK, SCTP_SO_NOT_LOCKED); } } static void sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_flag) { struct sctp_association *asoc; int some_on_streamwheel; int old_state; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown: handling SHUTDOWN\n"); if (stcb == NULL) return; asoc = &stcb->asoc; if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { return; } if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_shutdown_chunk)) { /* Shutdown NOT the expected size */ return; } old_state = SCTP_GET_STATE(asoc); sctp_update_acked(stcb, cp, abort_flag); if (*abort_flag) { return; } if (asoc->control_pdapi) { /* * With a normal shutdown we assume the end of last record. */ SCTP_INP_READ_LOCK(stcb->sctp_ep); asoc->control_pdapi->end_added = 1; asoc->control_pdapi->pdapi_aborted = 1; asoc->control_pdapi = NULL; SCTP_INP_READ_UNLOCK(stcb->sctp_ep); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { /* assoc was freed while we were unlocked */ SCTP_SOCKET_UNLOCK(so, 1); return; } #endif sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } /* goto SHUTDOWN_RECEIVED state to block new requests */ if (stcb->sctp_socket) { if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT)) { SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_RECEIVED); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); /* * notify upper layer that peer has initiated a * shutdown */ sctp_ulp_notify(SCTP_NOTIFY_PEER_SHUTDOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); /* reset time */ (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); } } if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) { /* * stop the shutdown timer, since we WILL move to * SHUTDOWN-ACK-SENT. */ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_9); } /* Now is there unsent data on a stream somewhere? */ some_on_streamwheel = sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED); if (!TAILQ_EMPTY(&asoc->send_queue) || !TAILQ_EMPTY(&asoc->sent_queue) || some_on_streamwheel) { /* By returning we will push more data out */ return; } else { /* no outstanding data to send, so move on... */ /* send SHUTDOWN-ACK */ /* move to SHUTDOWN-ACK-SENT state */ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) { SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT); sctp_stop_timers_for_shutdown(stcb); sctp_send_shutdown_ack(stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net); } else if (old_state == SCTP_STATE_SHUTDOWN_ACK_SENT) { sctp_send_shutdown_ack(stcb, net); } } } static void sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED, struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_association *asoc; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; so = SCTP_INP_SO(stcb->sctp_ep); #endif SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_ack: handling SHUTDOWN ACK\n"); if (stcb == NULL) return; asoc = &stcb->asoc; /* process according to association state */ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { /* unexpected SHUTDOWN-ACK... do OOTB handling... */ sctp_send_shutdown_complete(stcb, net, 1); SCTP_TCB_UNLOCK(stcb); return; } if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { /* unexpected SHUTDOWN-ACK... so ignore... */ SCTP_TCB_UNLOCK(stcb); return; } if (asoc->control_pdapi) { /* * With a normal shutdown we assume the end of last record. */ SCTP_INP_READ_LOCK(stcb->sctp_ep); asoc->control_pdapi->end_added = 1; asoc->control_pdapi->pdapi_aborted = 1; asoc->control_pdapi = NULL; SCTP_INP_READ_UNLOCK(stcb->sctp_ep); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { /* assoc was freed while we were unlocked */ SCTP_SOCKET_UNLOCK(so, 1); return; } #endif sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } #ifdef INVARIANTS if (!TAILQ_EMPTY(&asoc->send_queue) || !TAILQ_EMPTY(&asoc->sent_queue) || !stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { panic("Queues are not empty when handling SHUTDOWN-ACK"); } #endif /* stop the timer */ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_10); /* send SHUTDOWN-COMPLETE */ sctp_send_shutdown_complete(stcb, net, 0); /* notify upper layer protocol */ if (stcb->sctp_socket) { if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { stcb->sctp_socket->so_snd.sb_cc = 0; } sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } SCTP_STAT_INCR_COUNTER32(sctps_shutdown); /* free the TCB but first save off the ep */ #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_11); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } /* * Skip past the param header and then we will find the chunk that caused the * problem. There are two possiblities ASCONF or FWD-TSN other than that and * our peer must be broken. */ static void sctp_process_unrecog_chunk(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr, struct sctp_nets *net) { struct sctp_chunkhdr *chk; chk = (struct sctp_chunkhdr *)((caddr_t)phdr + sizeof(*phdr)); switch (chk->chunk_type) { case SCTP_ASCONF_ACK: case SCTP_ASCONF: sctp_asconf_cleanup(stcb, net); break; case SCTP_FORWARD_CUM_TSN: stcb->asoc.prsctp_supported = 0; break; default: SCTPDBG(SCTP_DEBUG_INPUT2, "Peer does not support chunk type %d(%x)??\n", chk->chunk_type, (uint32_t) chk->chunk_type); break; } } /* * Skip past the param header and then we will find the param that caused the * problem. There are a number of param's in a ASCONF OR the prsctp param * these will turn of specific features. * XXX: Is this the right thing to do? */ static void sctp_process_unrecog_param(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr) { struct sctp_paramhdr *pbad; pbad = phdr + 1; switch (ntohs(pbad->param_type)) { /* pr-sctp draft */ case SCTP_PRSCTP_SUPPORTED: stcb->asoc.prsctp_supported = 0; break; case SCTP_SUPPORTED_CHUNK_EXT: break; /* draft-ietf-tsvwg-addip-sctp */ case SCTP_HAS_NAT_SUPPORT: stcb->asoc.peer_supports_nat = 0; break; case SCTP_ADD_IP_ADDRESS: case SCTP_DEL_IP_ADDRESS: case SCTP_SET_PRIM_ADDR: stcb->asoc.asconf_supported = 0; break; case SCTP_SUCCESS_REPORT: case SCTP_ERROR_CAUSE_IND: SCTPDBG(SCTP_DEBUG_INPUT2, "Huh, the peer does not support success? or error cause?\n"); SCTPDBG(SCTP_DEBUG_INPUT2, "Turning off ASCONF to this strange peer\n"); stcb->asoc.asconf_supported = 0; break; default: SCTPDBG(SCTP_DEBUG_INPUT2, "Peer does not support param type %d(%x)??\n", pbad->param_type, (uint32_t) pbad->param_type); break; } } static int sctp_handle_error(struct sctp_chunkhdr *ch, struct sctp_tcb *stcb, struct sctp_nets *net) { int chklen; struct sctp_paramhdr *phdr; uint16_t error, error_type; uint16_t error_len; struct sctp_association *asoc; int adjust; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif /* parse through all of the errors and process */ asoc = &stcb->asoc; phdr = (struct sctp_paramhdr *)((caddr_t)ch + sizeof(struct sctp_chunkhdr)); chklen = ntohs(ch->chunk_length) - sizeof(struct sctp_chunkhdr); error = 0; while ((size_t)chklen >= sizeof(struct sctp_paramhdr)) { /* Process an Error Cause */ error_type = ntohs(phdr->param_type); error_len = ntohs(phdr->param_length); if ((error_len > chklen) || (error_len == 0)) { /* invalid param length for this param */ SCTPDBG(SCTP_DEBUG_INPUT1, "Bogus length in error param- chunk left:%d errorlen:%d\n", chklen, error_len); return (0); } if (error == 0) { /* report the first error cause */ error = error_type; } switch (error_type) { case SCTP_CAUSE_INVALID_STREAM: case SCTP_CAUSE_MISSING_PARAM: case SCTP_CAUSE_INVALID_PARAM: case SCTP_CAUSE_NO_USER_DATA: SCTPDBG(SCTP_DEBUG_INPUT1, "Software error we got a %d back? We have a bug :/ (or do they?)\n", error_type); break; case SCTP_CAUSE_NAT_COLLIDING_STATE: SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state abort flags:%x\n", ch->chunk_flags); if (sctp_handle_nat_colliding_state(stcb)) { return (0); } break; case SCTP_CAUSE_NAT_MISSING_STATE: SCTPDBG(SCTP_DEBUG_INPUT2, "Received missing state abort flags:%x\n", ch->chunk_flags); if (sctp_handle_nat_missing_state(stcb, net)) { return (0); } break; case SCTP_CAUSE_STALE_COOKIE: /* * We only act if we have echoed a cookie and are * waiting. */ if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) { int *p; p = (int *)((caddr_t)phdr + sizeof(*phdr)); /* Save the time doubled */ asoc->cookie_preserve_req = ntohl(*p) << 1; asoc->stale_cookie_count++; if (asoc->stale_cookie_count > asoc->max_init_times) { sctp_abort_notification(stcb, 0, 0, NULL, SCTP_SO_NOT_LOCKED); /* now free the asoc */ #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_12); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif return (-1); } /* blast back to INIT state */ sctp_toss_old_cookies(stcb, &stcb->asoc); asoc->state &= ~SCTP_STATE_COOKIE_ECHOED; asoc->state |= SCTP_STATE_COOKIE_WAIT; sctp_stop_all_cookie_timers(stcb); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); } break; case SCTP_CAUSE_UNRESOLVABLE_ADDR: /* * Nothing we can do here, we don't do hostname * addresses so if the peer does not like my IPv6 * (or IPv4 for that matter) it does not matter. If * they don't support that type of address, they can * NOT possibly get that packet type... i.e. with no * IPv6 you can't recieve a IPv6 packet. so we can * safely ignore this one. If we ever added support * for HOSTNAME Addresses, then we would need to do * something here. */ break; case SCTP_CAUSE_UNRECOG_CHUNK: sctp_process_unrecog_chunk(stcb, phdr, net); break; case SCTP_CAUSE_UNRECOG_PARAM: sctp_process_unrecog_param(stcb, phdr); break; case SCTP_CAUSE_COOKIE_IN_SHUTDOWN: /* * We ignore this since the timer will drive out a * new cookie anyway and there timer will drive us * to send a SHUTDOWN_COMPLETE. We can't send one * here since we don't have their tag. */ break; case SCTP_CAUSE_DELETING_LAST_ADDR: case SCTP_CAUSE_RESOURCE_SHORTAGE: case SCTP_CAUSE_DELETING_SRC_ADDR: /* * We should NOT get these here, but in a * ASCONF-ACK. */ SCTPDBG(SCTP_DEBUG_INPUT2, "Peer sends ASCONF errors in a Operational Error?<%d>?\n", error_type); break; case SCTP_CAUSE_OUT_OF_RESC: /* * And what, pray tell do we do with the fact that * the peer is out of resources? Not really sure we * could do anything but abort. I suspect this * should have came WITH an abort instead of in a * OP-ERROR. */ break; default: SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_handle_error: unknown error type = 0x%xh\n", error_type); break; } adjust = SCTP_SIZE32(error_len); chklen -= adjust; phdr = (struct sctp_paramhdr *)((caddr_t)phdr + adjust); } sctp_ulp_notify(SCTP_NOTIFY_REMOTE_ERROR, stcb, error, ch, SCTP_SO_NOT_LOCKED); return (0); } static int sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id) { struct sctp_init_ack *init_ack; struct mbuf *op_err; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_init_ack: handling INIT-ACK\n"); if (stcb == NULL) { SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_init_ack: TCB is null\n"); return (-1); } if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_ack_chunk)) { /* Invalid length */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } init_ack = &cp->init; /* validate parameters */ if (init_ack->initiate_tag == 0) { /* protocol error... send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } if (ntohl(init_ack->a_rwnd) < SCTP_MIN_RWND) { /* protocol error... send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } if (init_ack->num_inbound_streams == 0) { /* protocol error... send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } if (init_ack->num_outbound_streams == 0) { /* protocol error... send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } /* process according to association state... */ switch (stcb->asoc.state & SCTP_STATE_MASK) { case SCTP_STATE_COOKIE_WAIT: /* this is the expected state for this chunk */ /* process the INIT-ACK parameters */ if (stcb->asoc.primary_destination->dest_state & SCTP_ADDR_UNCONFIRMED) { /* * The primary is where we sent the INIT, we can * always consider it confirmed when the INIT-ACK is * returned. Do this before we load addresses * though. */ stcb->asoc.primary_destination->dest_state &= ~SCTP_ADDR_UNCONFIRMED; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, stcb, 0, (void *)stcb->asoc.primary_destination, SCTP_SO_NOT_LOCKED); } if (sctp_process_init_ack(m, iphlen, offset, src, dst, sh, cp, stcb, net, abort_no_unlock, mflowtype, mflowid, vrf_id) < 0) { /* error in parsing parameters */ return (-1); } /* update our state */ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to COOKIE-ECHOED state\n"); SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_ECHOED); /* reset the RTO calc */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); /* * collapse the init timer back in case of a exponential * backoff */ sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, stcb->sctp_ep, stcb, net); /* * the send at the end of the inbound data processing will * cause the cookie to be sent */ break; case SCTP_STATE_SHUTDOWN_SENT: /* incorrect state... discard */ break; case SCTP_STATE_COOKIE_ECHOED: /* incorrect state... discard */ break; case SCTP_STATE_OPEN: /* incorrect state... discard */ break; case SCTP_STATE_EMPTY: case SCTP_STATE_INUSE: default: /* incorrect state... discard */ return (-1); break; } SCTPDBG(SCTP_DEBUG_INPUT1, "Leaving handle-init-ack end\n"); return (0); } static struct sctp_tcb * sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len, struct sctp_inpcb *inp, struct sctp_nets **netp, struct sockaddr *init_src, int *notification, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port); /* * handle a state cookie for an existing association m: input packet mbuf * chain-- assumes a pullup on IP/SCTP/COOKIE-ECHO chunk note: this is a * "split" mbuf and the cookie signature does not exist offset: offset into * mbuf to the cookie-echo chunk */ static struct sctp_tcb * sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets **netp, struct sockaddr *init_src, int *notification, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_association *asoc; struct sctp_init_chunk *init_cp, init_buf; struct sctp_init_ack_chunk *initack_cp, initack_buf; struct sctp_nets *net; struct mbuf *op_err; int init_offset, initack_offset, i; int retval; int spec_flag = 0; uint32_t how_indx; #if defined(SCTP_DETAILED_STR_STATS) int j; #endif net = *netp; /* I know that the TCB is non-NULL from the caller */ asoc = &stcb->asoc; for (how_indx = 0; how_indx < sizeof(asoc->cookie_how); how_indx++) { if (asoc->cookie_how[how_indx] == 0) break; } if (how_indx < sizeof(asoc->cookie_how)) { asoc->cookie_how[how_indx] = 1; } if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) { /* SHUTDOWN came in after sending INIT-ACK */ sctp_send_shutdown_ack(stcb, stcb->asoc.primary_destination); op_err = sctp_generate_cause(SCTP_CAUSE_COOKIE_IN_SHUTDOWN, ""); sctp_send_operr_to(src, dst, sh, cookie->peers_vtag, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, net->port); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 2; return (NULL); } /* * find and validate the INIT chunk in the cookie (peer's info) the * INIT should start after the cookie-echo header struct (chunk * header, state cookie header struct) */ init_offset = offset += sizeof(struct sctp_cookie_echo_chunk); init_cp = (struct sctp_init_chunk *) sctp_m_getptr(m, init_offset, sizeof(struct sctp_init_chunk), (uint8_t *) & init_buf); if (init_cp == NULL) { /* could not pull a INIT chunk in cookie */ return (NULL); } if (init_cp->ch.chunk_type != SCTP_INITIATION) { return (NULL); } /* * find and validate the INIT-ACK chunk in the cookie (my info) the * INIT-ACK follows the INIT chunk */ initack_offset = init_offset + SCTP_SIZE32(ntohs(init_cp->ch.chunk_length)); initack_cp = (struct sctp_init_ack_chunk *) sctp_m_getptr(m, initack_offset, sizeof(struct sctp_init_ack_chunk), (uint8_t *) & initack_buf); if (initack_cp == NULL) { /* could not pull INIT-ACK chunk in cookie */ return (NULL); } if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) { return (NULL); } if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && (ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag)) { /* * case D in Section 5.2.4 Table 2: MMAA process accordingly * to get into the OPEN state */ if (ntohl(initack_cp->init.initial_tsn) != asoc->init_seq_number) { /*- * Opps, this means that we somehow generated two vtag's * the same. I.e. we did: * Us Peer * <---INIT(tag=a)------ * ----INIT-ACK(tag=t)--> * ----INIT(tag=t)------> *1 * <---INIT-ACK(tag=a)--- * <----CE(tag=t)------------- *2 * * At point *1 we should be generating a different * tag t'. Which means we would throw away the CE and send * ours instead. Basically this is case C (throw away side). */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 17; return (NULL); } switch (SCTP_GET_STATE(asoc)) { case SCTP_STATE_COOKIE_WAIT: case SCTP_STATE_COOKIE_ECHOED: /* * INIT was sent but got a COOKIE_ECHO with the * correct tags... just accept it...but we must * process the init so that we can make sure we have * the right seq no's. */ /* First we must process the INIT !! */ retval = sctp_process_init(init_cp, stcb); if (retval < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 3; return (NULL); } /* we have already processed the INIT so no problem */ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_13); sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_14); /* update current state */ if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) SCTP_STAT_INCR_COUNTER32(sctps_activeestab); else SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); } SCTP_STAT_INCR_GAUGE32(sctps_currestab); sctp_stop_all_cookie_timers(stcb); if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && (inp->sctp_socket->so_qlimit == 0) ) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif /* * Here is where collision would go if we * did a connect() and instead got a * init/init-ack/cookie done before the * init-ack came back.. */ stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { SCTP_SOCKET_UNLOCK(so, 1); return (NULL); } #endif soisconnected(stcb->sctp_socket); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } /* notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_UP; /* * since we did not send a HB make sure we don't * double things */ net->hb_responded = 1; net->RTO = sctp_calculate_rto(stcb, asoc, net, &cookie->time_entered, sctp_align_unsafe_makecopy, SCTP_RTT_FROM_NON_DATA); if (stcb->asoc.sctp_autoclose_ticks && (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE))) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); } break; default: /* * we're in the OPEN state (or beyond), so peer must * have simply lost the COOKIE-ACK */ break; } /* end switch */ sctp_stop_all_cookie_timers(stcb); /* * We ignore the return code here.. not sure if we should * somehow abort.. but we do have an existing asoc. This * really should not fail. */ if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, src, dst, init_src)) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 4; return (NULL); } /* respond with a COOKIE-ACK */ sctp_toss_old_cookies(stcb, asoc); sctp_send_cookie_ack(stcb); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 5; return (stcb); } if (ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag && ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag && cookie->tie_tag_my_vtag == 0 && cookie->tie_tag_peer_vtag == 0) { /* * case C in Section 5.2.4 Table 2: XMOO silently discard */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 6; return (NULL); } /* * If nat support, and the below and stcb is established, send back * a ABORT(colliding state) if we are established. */ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) && (asoc->peer_supports_nat) && ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) || (asoc->peer_vtag == 0)))) { /* * Special case - Peer's support nat. We may have two init's * that we gave out the same tag on since one was not * established.. i.e. we get INIT from host-1 behind the nat * and we respond tag-a, we get a INIT from host-2 behind * the nat and we get tag-a again. Then we bring up host-1 * (or 2's) assoc, Then comes the cookie from hsot-2 (or 1). * Now we have colliding state. We must send an abort here * with colliding state indication. */ op_err = sctp_generate_cause(SCTP_CAUSE_NAT_COLLIDING_STATE, ""); sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); return (NULL); } if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) || (asoc->peer_vtag == 0))) { /* * case B in Section 5.2.4 Table 2: MXAA or MOAA my info * should be ok, re-accept peer info */ if (ntohl(initack_cp->init.initial_tsn) != asoc->init_seq_number) { /* * Extension of case C. If we hit this, then the * random number generator returned the same vtag * when we first sent our INIT-ACK and when we later * sent our INIT. The side with the seq numbers that * are different will be the one that normnally * would have hit case C. This in effect "extends" * our vtags in this collision case to be 64 bits. * The same collision could occur aka you get both * vtag and seq number the same twice in a row.. but * is much less likely. If it did happen then we * would proceed through and bring up the assoc.. we * may end up with the wrong stream setup however.. * which would be bad.. but there is no way to * tell.. until we send on a stream that does not * exist :-) */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 7; return (NULL); } if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 8; sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_15); sctp_stop_all_cookie_timers(stcb); /* * since we did not send a HB make sure we don't double * things */ net->hb_responded = 1; if (stcb->asoc.sctp_autoclose_ticks && sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); } asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd); asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams); if (ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) { /* * Ok the peer probably discarded our data (if we * echoed a cookie+data). So anything on the * sent_queue should be marked for retransmit, we * may not get something to kick us so it COULD * still take a timeout to move these.. but it can't * hurt to mark them. */ struct sctp_tmit_chunk *chk; TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { if (chk->sent < SCTP_DATAGRAM_RESEND) { chk->sent = SCTP_DATAGRAM_RESEND; sctp_flight_size_decrease(chk); sctp_total_flight_decrease(stcb, chk); sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); spec_flag++; } } } /* process the INIT info (peer's info) */ retval = sctp_process_init(init_cp, stcb); if (retval < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 9; return (NULL); } if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, src, dst, init_src)) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 10; return (NULL); } if ((asoc->state & SCTP_STATE_COOKIE_WAIT) || (asoc->state & SCTP_STATE_COOKIE_ECHOED)) { *notification = SCTP_NOTIFY_ASSOC_UP; if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && (inp->sctp_socket->so_qlimit == 0)) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { SCTP_SOCKET_UNLOCK(so, 1); return (NULL); } #endif soisconnected(stcb->sctp_socket); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) SCTP_STAT_INCR_COUNTER32(sctps_activeestab); else SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); } else if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { SCTP_STAT_INCR_COUNTER32(sctps_restartestab); } else { SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); } SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); } sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, asoc); sctp_send_cookie_ack(stcb); if (spec_flag) { /* * only if we have retrans set do we do this. What * this call does is get only the COOKIE-ACK out and * then when we return the normal call to * sctp_chunk_output will get the retrans out behind * this. */ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_COOKIE_ACK, SCTP_SO_NOT_LOCKED); } if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 11; return (stcb); } if ((ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag && ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) && cookie->tie_tag_my_vtag == asoc->my_vtag_nonce && cookie->tie_tag_peer_vtag == asoc->peer_vtag_nonce && cookie->tie_tag_peer_vtag != 0) { struct sctpasochead *head; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif if (asoc->peer_supports_nat) { /* * This is a gross gross hack. Just call the * cookie_new code since we are allowing a duplicate * association. I hope this works... */ return (sctp_process_cookie_new(m, iphlen, offset, src, dst, sh, cookie, cookie_len, inp, netp, init_src, notification, auth_skipped, auth_offset, auth_len, mflowtype, mflowid, vrf_id, port)); } /* * case A in Section 5.2.4 Table 2: XXMM (peer restarted) */ /* temp code */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 12; sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_16); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_17); /* notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_RESTART; atomic_add_int(&stcb->asoc.refcnt, 1); if ((SCTP_GET_STATE(asoc) != SCTP_STATE_OPEN) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT)) { SCTP_STAT_INCR_GAUGE32(sctps_currestab); } if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { SCTP_STAT_INCR_GAUGE32(sctps_restartestab); } else if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) { SCTP_STAT_INCR_GAUGE32(sctps_collisionestab); } if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); } else if (!(asoc->state & SCTP_STATE_SHUTDOWN_SENT)) { /* move to OPEN state, if not in SHUTDOWN_SENT */ SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); } asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams); asoc->init_seq_number = ntohl(initack_cp->init.initial_tsn); asoc->sending_seq = asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number; asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1; asoc->asconf_seq_in = asoc->last_acked_seq = asoc->init_seq_number - 1; asoc->str_reset_seq_in = asoc->init_seq_number; asoc->advanced_peer_ack_point = asoc->last_acked_seq; if (asoc->mapping_array) { memset(asoc->mapping_array, 0, asoc->mapping_array_size); } if (asoc->nr_mapping_array) { memset(asoc->nr_mapping_array, 0, asoc->mapping_array_size); } SCTP_TCB_UNLOCK(stcb); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); SCTP_SOCKET_LOCK(so, 1); #endif SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(stcb->sctp_ep); SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); /* send up all the data */ SCTP_TCB_SEND_LOCK(stcb); sctp_report_all_outbound(stcb, 0, 1, SCTP_SO_LOCKED); for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].chunks_on_queues = 0; #if defined(SCTP_DETAILED_STR_STATS) for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) { asoc->strmout[i].abandoned_sent[j] = 0; asoc->strmout[i].abandoned_unsent[j] = 0; } #else asoc->strmout[i].abandoned_sent[0] = 0; asoc->strmout[i].abandoned_unsent[0] = 0; #endif stcb->asoc.strmout[i].stream_no = i; stcb->asoc.strmout[i].next_sequence_send = 0; stcb->asoc.strmout[i].last_msg_incomplete = 0; } /* process the INIT-ACK info (my info) */ asoc->my_vtag = ntohl(initack_cp->init.initiate_tag); asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd); /* pull from vtag hash */ LIST_REMOVE(stcb, sctp_asocs); /* re-insert to new vtag position */ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); SCTP_TCB_SEND_UNLOCK(stcb); SCTP_INP_WUNLOCK(stcb->sctp_ep); SCTP_INP_INFO_WUNLOCK(); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif asoc->total_flight = 0; asoc->total_flight_count = 0; /* process the INIT info (peer's info) */ retval = sctp_process_init(init_cp, stcb); if (retval < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 13; return (NULL); } /* * since we did not send a HB make sure we don't double * things */ net->hb_responded = 1; if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, src, dst, init_src)) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 14; return (NULL); } /* respond with a COOKIE-ACK */ sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, asoc); sctp_send_cookie_ack(stcb); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 15; return (stcb); } if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 16; /* all other cases... */ return (NULL); } /* * handle a state cookie for a new association m: input packet mbuf chain-- * assumes a pullup on IP/SCTP/COOKIE-ECHO chunk note: this is a "split" mbuf * and the cookie signature does not exist offset: offset into mbuf to the * cookie-echo chunk length: length of the cookie chunk to: where the init * was from returns a new TCB */ static struct sctp_tcb * sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len, struct sctp_inpcb *inp, struct sctp_nets **netp, struct sockaddr *init_src, int *notification, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_tcb *stcb; struct sctp_init_chunk *init_cp, init_buf; struct sctp_init_ack_chunk *initack_cp, initack_buf; union sctp_sockstore store; struct sctp_association *asoc; int init_offset, initack_offset, initack_limit; int retval; int error = 0; uint8_t auth_chunk_buf[SCTP_PARAM_BUFFER_SIZE]; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; so = SCTP_INP_SO(inp); #endif /* * find and validate the INIT chunk in the cookie (peer's info) the * INIT should start after the cookie-echo header struct (chunk * header, state cookie header struct) */ init_offset = offset + sizeof(struct sctp_cookie_echo_chunk); init_cp = (struct sctp_init_chunk *) sctp_m_getptr(m, init_offset, sizeof(struct sctp_init_chunk), (uint8_t *) & init_buf); if (init_cp == NULL) { /* could not pull a INIT chunk in cookie */ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: could not pull INIT chunk hdr\n"); return (NULL); } if (init_cp->ch.chunk_type != SCTP_INITIATION) { SCTPDBG(SCTP_DEBUG_INPUT1, "HUH? process_cookie_new: could not find INIT chunk!\n"); return (NULL); } initack_offset = init_offset + SCTP_SIZE32(ntohs(init_cp->ch.chunk_length)); /* * find and validate the INIT-ACK chunk in the cookie (my info) the * INIT-ACK follows the INIT chunk */ initack_cp = (struct sctp_init_ack_chunk *) sctp_m_getptr(m, initack_offset, sizeof(struct sctp_init_ack_chunk), (uint8_t *) & initack_buf); if (initack_cp == NULL) { /* could not pull INIT-ACK chunk in cookie */ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: could not pull INIT-ACK chunk hdr\n"); return (NULL); } if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) { return (NULL); } /* * NOTE: We can't use the INIT_ACK's chk_length to determine the * "initack_limit" value. This is because the chk_length field * includes the length of the cookie, but the cookie is omitted when * the INIT and INIT_ACK are tacked onto the cookie... */ initack_limit = offset + cookie_len; /* * now that we know the INIT/INIT-ACK are in place, create a new TCB * and popluate */ /* * Here we do a trick, we set in NULL for the proc/thread argument. * We do this since in effect we only use the p argument when the * socket is unbound and we must do an implicit bind. Since we are * getting a cookie, we cannot be unbound. */ stcb = sctp_aloc_assoc(inp, init_src, &error, ntohl(initack_cp->init.initiate_tag), vrf_id, (struct thread *)NULL ); if (stcb == NULL) { struct mbuf *op_err; /* memory problem? */ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: no room for another TCB!\n"); op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); return (NULL); } /* get the correct sctp_nets */ if (netp) *netp = sctp_findnet(stcb, init_src); asoc = &stcb->asoc; /* get scope variables out of cookie */ asoc->scope.ipv4_local_scope = cookie->ipv4_scope; asoc->scope.site_scope = cookie->site_scope; asoc->scope.local_scope = cookie->local_scope; asoc->scope.loopback_scope = cookie->loopback_scope; if ((asoc->scope.ipv4_addr_legal != cookie->ipv4_addr_legal) || (asoc->scope.ipv6_addr_legal != cookie->ipv6_addr_legal)) { struct mbuf *op_err; /* * Houston we have a problem. The EP changed while the * cookie was in flight. Only recourse is to abort the * association. */ atomic_add_int(&stcb->asoc.refcnt, 1); op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_18); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif atomic_subtract_int(&stcb->asoc.refcnt, 1); return (NULL); } /* process the INIT-ACK info (my info) */ asoc->my_vtag = ntohl(initack_cp->init.initiate_tag); asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd); asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams); asoc->init_seq_number = ntohl(initack_cp->init.initial_tsn); asoc->sending_seq = asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number; asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1; asoc->asconf_seq_in = asoc->last_acked_seq = asoc->init_seq_number - 1; asoc->str_reset_seq_in = asoc->init_seq_number; asoc->advanced_peer_ack_point = asoc->last_acked_seq; /* process the INIT info (peer's info) */ if (netp) retval = sctp_process_init(init_cp, stcb); else retval = 0; if (retval < 0) { atomic_add_int(&stcb->asoc.refcnt, 1); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_19); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif atomic_subtract_int(&stcb->asoc.refcnt, 1); return (NULL); } /* load all addresses */ if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, src, dst, init_src)) { atomic_add_int(&stcb->asoc.refcnt, 1); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_20); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif atomic_subtract_int(&stcb->asoc.refcnt, 1); return (NULL); } /* * verify any preceding AUTH chunk that was skipped */ /* pull the local authentication parameters from the cookie/init-ack */ sctp_auth_get_cookie_params(stcb, m, initack_offset + sizeof(struct sctp_init_ack_chunk), initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk))); if (auth_skipped) { struct sctp_auth_chunk *auth; auth = (struct sctp_auth_chunk *) sctp_m_getptr(m, auth_offset, auth_len, auth_chunk_buf); if ((auth == NULL) || sctp_handle_auth(stcb, auth, m, auth_offset)) { /* auth HMAC failed, dump the assoc and packet */ SCTPDBG(SCTP_DEBUG_AUTH1, "COOKIE-ECHO: AUTH failed\n"); atomic_add_int(&stcb->asoc.refcnt, 1); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_21); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif atomic_subtract_int(&stcb->asoc.refcnt, 1); return (NULL); } else { /* remaining chunks checked... good to go */ stcb->asoc.authenticated = 1; } } /* update current state */ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n"); SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); } sctp_stop_all_cookie_timers(stcb); SCTP_STAT_INCR_COUNTER32(sctps_passiveestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); /* * if we're doing ASCONFs, check to see if we have any new local * addresses that need to get added to the peer (eg. addresses * changed while cookie echo in flight). This needs to be done * after we go to the OPEN state to do the correct asconf * processing. else, make sure we have the correct addresses in our * lists */ /* warning, we re-use sin, sin6, sa_store here! */ /* pull in local_address (our "from" address) */ switch (cookie->laddr_type) { #ifdef INET case SCTP_IPV4_ADDRESS: /* source addr is IPv4 */ memset(&store.sin, 0, sizeof(struct sockaddr_in)); store.sin.sin_family = AF_INET; store.sin.sin_len = sizeof(struct sockaddr_in); store.sin.sin_addr.s_addr = cookie->laddress[0]; break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: /* source addr is IPv6 */ memset(&store.sin6, 0, sizeof(struct sockaddr_in6)); store.sin6.sin6_family = AF_INET6; store.sin6.sin6_len = sizeof(struct sockaddr_in6); store.sin6.sin6_scope_id = cookie->scope_id; memcpy(&store.sin6.sin6_addr, cookie->laddress, sizeof(struct in6_addr)); break; #endif default: atomic_add_int(&stcb->asoc.refcnt, 1); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_22); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif atomic_subtract_int(&stcb->asoc.refcnt, 1); return (NULL); } /* set up to notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_UP; if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && (inp->sctp_socket->so_qlimit == 0)) { /* * This is an endpoint that called connect() how it got a * cookie that is NEW is a bit of a mystery. It must be that * the INIT was sent, but before it got there.. a complete * INIT/INIT-ACK/COOKIE arrived. But of course then it * should have went to the other code.. not here.. oh well.. * a bit of protection is worth having.. */ stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { SCTP_SOCKET_UNLOCK(so, 1); return (NULL); } #endif soisconnected(stcb->sctp_socket); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } else if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (inp->sctp_socket->so_qlimit)) { /* * We don't want to do anything with this one. Since it is * the listening guy. The timer will get started for * accepted connections in the caller. */ ; } /* since we did not send a HB make sure we don't double things */ if ((netp) && (*netp)) (*netp)->hb_responded = 1; if (stcb->asoc.sctp_autoclose_ticks && sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); } (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); if ((netp) && (*netp)) { /* calculate the RTT and set the encaps port */ (*netp)->RTO = sctp_calculate_rto(stcb, asoc, *netp, &cookie->time_entered, sctp_align_unsafe_makecopy, SCTP_RTT_FROM_NON_DATA); (*netp)->port = port; } /* respond with a COOKIE-ACK */ sctp_send_cookie_ack(stcb); /* * check the address lists for any ASCONFs that need to be sent * AFTER the cookie-ack is sent */ sctp_check_address_list(stcb, m, initack_offset + sizeof(struct sctp_init_ack_chunk), initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk)), &store.sa, cookie->local_scope, cookie->site_scope, cookie->ipv4_scope, cookie->loopback_scope); return (stcb); } /* * CODE LIKE THIS NEEDS TO RUN IF the peer supports the NAT extension, i.e * we NEED to make sure we are not already using the vtag. If so we * need to send back an ABORT-TRY-AGAIN-WITH-NEW-TAG No middle box bit! head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag, SCTP_BASE_INFO(hashasocmark))]; LIST_FOREACH(stcb, head, sctp_asocs) { if ((stcb->asoc.my_vtag == tag) && (stcb->rport == rport) && (inp == stcb->sctp_ep)) { -- SEND ABORT - TRY AGAIN -- } } */ /* * handles a COOKIE-ECHO message stcb: modified to either a new or left as * existing (non-NULL) TCB */ static struct mbuf * sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_cookie_echo_chunk *cp, struct sctp_inpcb **inp_p, struct sctp_tcb **stcb, struct sctp_nets **netp, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, struct sctp_tcb **locked_tcb, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_state_cookie *cookie; struct sctp_tcb *l_stcb = *stcb; struct sctp_inpcb *l_inp; struct sockaddr *to; struct sctp_pcb *ep; struct mbuf *m_sig; uint8_t calc_sig[SCTP_SIGNATURE_SIZE], tmp_sig[SCTP_SIGNATURE_SIZE]; uint8_t *sig; uint8_t cookie_ok = 0; unsigned int sig_offset, cookie_offset; unsigned int cookie_len; struct timeval now; struct timeval time_expires; int notification = 0; struct sctp_nets *netl; int had_a_existing_tcb = 0; int send_int_conf = 0; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_cookie: handling COOKIE-ECHO\n"); if (inp_p == NULL) { return (NULL); } cookie = &cp->cookie; cookie_offset = offset + sizeof(struct sctp_chunkhdr); cookie_len = ntohs(cp->ch.chunk_length); if ((cookie->peerport != sh->src_port) && (cookie->myport != sh->dest_port) && (cookie->my_vtag != sh->v_tag)) { /* * invalid ports or bad tag. Note that we always leave the * v_tag in the header in network order and when we stored * it in the my_vtag slot we also left it in network order. * This maintains the match even though it may be in the * opposite byte order of the machine :-> */ return (NULL); } if (cookie_len < sizeof(struct sctp_cookie_echo_chunk) + sizeof(struct sctp_init_chunk) + sizeof(struct sctp_init_ack_chunk) + SCTP_SIGNATURE_SIZE) { /* cookie too small */ return (NULL); } /* * split off the signature into its own mbuf (since it should not be * calculated in the sctp_hmac_m() call). */ sig_offset = offset + cookie_len - SCTP_SIGNATURE_SIZE; m_sig = m_split(m, sig_offset, M_NOWAIT); if (m_sig == NULL) { /* out of memory or ?? */ return (NULL); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(m_sig, SCTP_MBUF_SPLIT); } #endif /* * compute the signature/digest for the cookie */ ep = &(*inp_p)->sctp_ep; l_inp = *inp_p; if (l_stcb) { SCTP_TCB_UNLOCK(l_stcb); } SCTP_INP_RLOCK(l_inp); if (l_stcb) { SCTP_TCB_LOCK(l_stcb); } /* which cookie is it? */ if ((cookie->time_entered.tv_sec < (long)ep->time_of_secret_change) && (ep->current_secret_number != ep->last_secret_number)) { /* it's the old cookie */ (void)sctp_hmac_m(SCTP_HMAC, (uint8_t *) ep->secret_key[(int)ep->last_secret_number], SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); } else { /* it's the current cookie */ (void)sctp_hmac_m(SCTP_HMAC, (uint8_t *) ep->secret_key[(int)ep->current_secret_number], SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); } /* get the signature */ SCTP_INP_RUNLOCK(l_inp); sig = (uint8_t *) sctp_m_getptr(m_sig, 0, SCTP_SIGNATURE_SIZE, (uint8_t *) & tmp_sig); if (sig == NULL) { /* couldn't find signature */ sctp_m_freem(m_sig); return (NULL); } /* compare the received digest with the computed digest */ if (memcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) != 0) { /* try the old cookie? */ if ((cookie->time_entered.tv_sec == (long)ep->time_of_secret_change) && (ep->current_secret_number != ep->last_secret_number)) { /* compute digest with old */ (void)sctp_hmac_m(SCTP_HMAC, (uint8_t *) ep->secret_key[(int)ep->last_secret_number], SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); /* compare */ if (memcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) == 0) cookie_ok = 1; } } else { cookie_ok = 1; } /* * Now before we continue we must reconstruct our mbuf so that * normal processing of any other chunks will work. */ { struct mbuf *m_at; m_at = m; while (SCTP_BUF_NEXT(m_at) != NULL) { m_at = SCTP_BUF_NEXT(m_at); } SCTP_BUF_NEXT(m_at) = m_sig; } if (cookie_ok == 0) { SCTPDBG(SCTP_DEBUG_INPUT2, "handle_cookie_echo: cookie signature validation failed!\n"); SCTPDBG(SCTP_DEBUG_INPUT2, "offset = %u, cookie_offset = %u, sig_offset = %u\n", (uint32_t) offset, cookie_offset, sig_offset); return (NULL); } /* * check the cookie timestamps to be sure it's not stale */ (void)SCTP_GETTIME_TIMEVAL(&now); /* Expire time is in Ticks, so we convert to seconds */ time_expires.tv_sec = cookie->time_entered.tv_sec + TICKS_TO_SEC(cookie->cookie_life); time_expires.tv_usec = cookie->time_entered.tv_usec; /* * TODO sctp_constants.h needs alternative time macros when _KERNEL * is undefined. */ if (timevalcmp(&now, &time_expires, >)) { /* cookie is stale! */ struct mbuf *op_err; - struct sctp_stale_cookie_msg *scm; + struct sctp_error_stale_cookie *cause; uint32_t tim; - op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_stale_cookie_msg), + op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_stale_cookie), 0, M_NOWAIT, 1, MT_DATA); if (op_err == NULL) { /* FOOBAR */ return (NULL); } /* Set the len */ - SCTP_BUF_LEN(op_err) = sizeof(struct sctp_stale_cookie_msg); - scm = mtod(op_err, struct sctp_stale_cookie_msg *); - scm->ph.param_type = htons(SCTP_CAUSE_STALE_COOKIE); - scm->ph.param_length = htons((sizeof(struct sctp_paramhdr) + + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_stale_cookie); + cause = mtod(op_err, struct sctp_error_stale_cookie *); + cause->cause.code = htons(SCTP_CAUSE_STALE_COOKIE); + cause->cause.length = htons((sizeof(struct sctp_paramhdr) + (sizeof(uint32_t)))); /* seconds to usec */ tim = (now.tv_sec - time_expires.tv_sec) * 1000000; /* add in usec */ if (tim == 0) tim = now.tv_usec - cookie->time_entered.tv_usec; - scm->time_usec = htonl(tim); + cause->stale_time = htonl(tim); sctp_send_operr_to(src, dst, sh, cookie->peers_vtag, op_err, mflowtype, mflowid, l_inp->fibnum, vrf_id, port); return (NULL); } /* * Now we must see with the lookup address if we have an existing * asoc. This will only happen if we were in the COOKIE-WAIT state * and a INIT collided with us and somewhere the peer sent the * cookie on another address besides the single address our assoc * had for him. In this case we will have one of the tie-tags set at * least AND the address field in the cookie can be used to look it * up. */ to = NULL; switch (cookie->addr_type) { #ifdef INET6 case SCTP_IPV6_ADDRESS: memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_port = sh->src_port; sin6.sin6_scope_id = cookie->scope_id; memcpy(&sin6.sin6_addr.s6_addr, cookie->address, sizeof(sin6.sin6_addr.s6_addr)); to = (struct sockaddr *)&sin6; break; #endif #ifdef INET case SCTP_IPV4_ADDRESS: memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = sh->src_port; sin.sin_addr.s_addr = cookie->address[0]; to = (struct sockaddr *)&sin; break; #endif default: /* This should not happen */ return (NULL); } if (*stcb == NULL) { /* Yep, lets check */ *stcb = sctp_findassociation_ep_addr(inp_p, to, netp, dst, NULL); if (*stcb == NULL) { /* * We should have only got back the same inp. If we * got back a different ep we have a problem. The * original findep got back l_inp and now */ if (l_inp != *inp_p) { SCTP_PRINTF("Bad problem find_ep got a diff inp then special_locate?\n"); } } else { if (*locked_tcb == NULL) { /* * In this case we found the assoc only * after we locked the create lock. This * means we are in a colliding case and we * must make sure that we unlock the tcb if * its one of the cases where we throw away * the incoming packets. */ *locked_tcb = *stcb; /* * We must also increment the inp ref count * since the ref_count flags was set when we * did not find the TCB, now we found it * which reduces the refcount.. we must * raise it back out to balance it all :-) */ SCTP_INP_INCR_REF((*stcb)->sctp_ep); if ((*stcb)->sctp_ep != l_inp) { SCTP_PRINTF("Huh? ep:%p diff then l_inp:%p?\n", (void *)(*stcb)->sctp_ep, (void *)l_inp); } } } } cookie_len -= SCTP_SIGNATURE_SIZE; if (*stcb == NULL) { /* this is the "normal" case... get a new TCB */ *stcb = sctp_process_cookie_new(m, iphlen, offset, src, dst, sh, cookie, cookie_len, *inp_p, netp, to, ¬ification, auth_skipped, auth_offset, auth_len, mflowtype, mflowid, vrf_id, port); } else { /* this is abnormal... cookie-echo on existing TCB */ had_a_existing_tcb = 1; *stcb = sctp_process_cookie_existing(m, iphlen, offset, src, dst, sh, cookie, cookie_len, *inp_p, *stcb, netp, to, ¬ification, auth_skipped, auth_offset, auth_len, mflowtype, mflowid, vrf_id, port); } if (*stcb == NULL) { /* still no TCB... must be bad cookie-echo */ return (NULL); } if (*netp != NULL) { (*netp)->flowtype = mflowtype; (*netp)->flowid = mflowid; } /* * Ok, we built an association so confirm the address we sent the * INIT-ACK to. */ netl = sctp_findnet(*stcb, to); /* * This code should in theory NOT run but */ if (netl == NULL) { /* TSNH! Huh, why do I need to add this address here? */ if (sctp_add_remote_addr(*stcb, to, NULL, SCTP_DONOT_SETSCOPE, SCTP_IN_COOKIE_PROC)) { return (NULL); } netl = sctp_findnet(*stcb, to); } if (netl) { if (netl->dest_state & SCTP_ADDR_UNCONFIRMED) { netl->dest_state &= ~SCTP_ADDR_UNCONFIRMED; (void)sctp_set_primary_addr((*stcb), (struct sockaddr *)NULL, netl); send_int_conf = 1; } } sctp_start_net_timers(*stcb); if ((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { if (!had_a_existing_tcb || (((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) { /* * If we have a NEW cookie or the connect never * reached the connected state during collision we * must do the TCP accept thing. */ struct socket *so, *oso; struct sctp_inpcb *inp; if (notification == SCTP_NOTIFY_ASSOC_RESTART) { /* * For a restart we will keep the same * socket, no need to do anything. I THINK!! */ sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED); if (send_int_conf) { sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED); } return (m); } oso = (*inp_p)->sctp_socket; atomic_add_int(&(*stcb)->asoc.refcnt, 1); SCTP_TCB_UNLOCK((*stcb)); CURVNET_SET(oso->so_vnet); so = sonewconn(oso, 0 ); CURVNET_RESTORE(); SCTP_TCB_LOCK((*stcb)); atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); if (so == NULL) { struct mbuf *op_err; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *pcb_so; #endif /* Too many sockets */ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: no room for another socket!\n"); op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(*inp_p, NULL, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) pcb_so = SCTP_INP_SO(*inp_p); atomic_add_int(&(*stcb)->asoc.refcnt, 1); SCTP_TCB_UNLOCK((*stcb)); SCTP_SOCKET_LOCK(pcb_so, 1); SCTP_TCB_LOCK((*stcb)); atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); #endif (void)sctp_free_assoc(*inp_p, *stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_23); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(pcb_so, 1); #endif return (NULL); } inp = (struct sctp_inpcb *)so->so_pcb; SCTP_INP_INCR_REF(inp); /* * We add the unbound flag here so that if we get an * soabort() before we get the move_pcb done, we * will properly cleanup. */ inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE | SCTP_PCB_FLAGS_CONNECTED | SCTP_PCB_FLAGS_IN_TCPPOOL | SCTP_PCB_FLAGS_UNBOUND | (SCTP_PCB_COPY_FLAGS & (*inp_p)->sctp_flags) | SCTP_PCB_FLAGS_DONT_WAKE); inp->sctp_features = (*inp_p)->sctp_features; inp->sctp_mobility_features = (*inp_p)->sctp_mobility_features; inp->sctp_socket = so; inp->sctp_frag_point = (*inp_p)->sctp_frag_point; inp->max_cwnd = (*inp_p)->max_cwnd; inp->sctp_cmt_on_off = (*inp_p)->sctp_cmt_on_off; inp->ecn_supported = (*inp_p)->ecn_supported; inp->prsctp_supported = (*inp_p)->prsctp_supported; inp->auth_supported = (*inp_p)->auth_supported; inp->asconf_supported = (*inp_p)->asconf_supported; inp->reconfig_supported = (*inp_p)->reconfig_supported; inp->nrsack_supported = (*inp_p)->nrsack_supported; inp->pktdrop_supported = (*inp_p)->pktdrop_supported; inp->partial_delivery_point = (*inp_p)->partial_delivery_point; inp->sctp_context = (*inp_p)->sctp_context; inp->local_strreset_support = (*inp_p)->local_strreset_support; inp->fibnum = (*inp_p)->fibnum; inp->inp_starting_point_for_iterator = NULL; /* * copy in the authentication parameters from the * original endpoint */ if (inp->sctp_ep.local_hmacs) sctp_free_hmaclist(inp->sctp_ep.local_hmacs); inp->sctp_ep.local_hmacs = sctp_copy_hmaclist((*inp_p)->sctp_ep.local_hmacs); if (inp->sctp_ep.local_auth_chunks) sctp_free_chunklist(inp->sctp_ep.local_auth_chunks); inp->sctp_ep.local_auth_chunks = sctp_copy_chunklist((*inp_p)->sctp_ep.local_auth_chunks); /* * Now we must move it from one hash table to * another and get the tcb in the right place. */ /* * This is where the one-2-one socket is put into * the accept state waiting for the accept! */ if (*stcb) { (*stcb)->asoc.state |= SCTP_STATE_IN_ACCEPT_QUEUE; } sctp_move_pcb_and_assoc(*inp_p, inp, *stcb); atomic_add_int(&(*stcb)->asoc.refcnt, 1); SCTP_TCB_UNLOCK((*stcb)); sctp_pull_off_control_to_new_inp((*inp_p), inp, *stcb, 0); SCTP_TCB_LOCK((*stcb)); atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); /* * now we must check to see if we were aborted while * the move was going on and the lock/unlock * happened. */ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* * yep it was, we leave the assoc attached * to the socket since the sctp_inpcb_free() * call will send an abort for us. */ SCTP_INP_DECR_REF(inp); return (NULL); } SCTP_INP_DECR_REF(inp); /* Switch over to the new guy */ *inp_p = inp; sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED); if (send_int_conf) { sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED); } /* * Pull it from the incomplete queue and wake the * guy */ #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) atomic_add_int(&(*stcb)->asoc.refcnt, 1); SCTP_TCB_UNLOCK((*stcb)); SCTP_SOCKET_LOCK(so, 1); #endif soisconnected(so); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_LOCK((*stcb)); atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); SCTP_SOCKET_UNLOCK(so, 1); #endif return (m); } } if (notification) { sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } if (send_int_conf) { sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED); } return (m); } static void sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED, struct sctp_tcb *stcb, struct sctp_nets *net) { /* cp must not be used, others call this without a c-ack :-) */ struct sctp_association *asoc; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_cookie_ack: handling COOKIE-ACK\n"); if ((stcb == NULL) || (net == NULL)) { return; } asoc = &stcb->asoc; sctp_stop_all_cookie_timers(stcb); /* process according to association state */ if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) { /* state change only needed when I am in right state */ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n"); SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); sctp_start_net_timers(stcb); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); } /* update RTO */ SCTP_STAT_INCR_COUNTER32(sctps_activeestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); if (asoc->overall_error_count == 0) { net->RTO = sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, sctp_align_safe_nocopy, SCTP_RTT_FROM_NON_DATA); } (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); sctp_ulp_notify(SCTP_NOTIFY_ASSOC_UP, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif if ((stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) == 0) { soisconnected(stcb->sctp_socket); } #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } /* * since we did not send a HB make sure we don't double * things */ net->hb_responded = 1; if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { /* * We don't need to do the asconf thing, nor hb or * autoclose if the socket is closed. */ goto closed_socket; } sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); if (stcb->asoc.sctp_autoclose_ticks && sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_AUTOCLOSE)) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, stcb->sctp_ep, stcb, NULL); } /* * send ASCONF if parameters are pending and ASCONFs are * allowed (eg. addresses changed when init/cookie echo were * in flight) */ if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF)) && (stcb->asoc.asconf_supported == 1) && (!TAILQ_EMPTY(&stcb->asoc.asconf_queue))) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, stcb->asoc.primary_destination); #else sctp_send_asconf(stcb, stcb->asoc.primary_destination, SCTP_ADDR_NOT_LOCKED); #endif } } closed_socket: /* Toss the cookie if I can */ sctp_toss_old_cookies(stcb, asoc); if (!TAILQ_EMPTY(&asoc->sent_queue)) { /* Restart the timer if we have pending data */ struct sctp_tmit_chunk *chk; chk = TAILQ_FIRST(&asoc->sent_queue); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, chk->whoTo); } } static void sctp_handle_ecn_echo(struct sctp_ecne_chunk *cp, struct sctp_tcb *stcb) { struct sctp_nets *net; struct sctp_tmit_chunk *lchk; struct sctp_ecne_chunk bkup; uint8_t override_bit; uint32_t tsn, window_data_tsn; int len; unsigned int pkt_cnt; len = ntohs(cp->ch.chunk_length); if ((len != sizeof(struct sctp_ecne_chunk)) && (len != sizeof(struct old_sctp_ecne_chunk))) { return; } if (len == sizeof(struct old_sctp_ecne_chunk)) { /* Its the old format */ memcpy(&bkup, cp, sizeof(struct old_sctp_ecne_chunk)); bkup.num_pkts_since_cwr = htonl(1); cp = &bkup; } SCTP_STAT_INCR(sctps_recvecne); tsn = ntohl(cp->tsn); pkt_cnt = ntohl(cp->num_pkts_since_cwr); lchk = TAILQ_LAST(&stcb->asoc.send_queue, sctpchunk_listhead); if (lchk == NULL) { window_data_tsn = stcb->asoc.sending_seq - 1; } else { window_data_tsn = lchk->rec.data.TSN_seq; } /* Find where it was sent to if possible. */ net = NULL; TAILQ_FOREACH(lchk, &stcb->asoc.sent_queue, sctp_next) { if (lchk->rec.data.TSN_seq == tsn) { net = lchk->whoTo; net->ecn_prev_cwnd = lchk->rec.data.cwnd_at_send; break; } if (SCTP_TSN_GT(lchk->rec.data.TSN_seq, tsn)) { break; } } if (net == NULL) { /* * What to do. A previous send of a CWR was possibly lost. * See how old it is, we may have it marked on the actual * net. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (tsn == net->last_cwr_tsn) { /* Found him, send it off */ break; } } if (net == NULL) { /* * If we reach here, we need to send a special CWR * that says hey, we did this a long time ago and * you lost the response. */ net = TAILQ_FIRST(&stcb->asoc.nets); if (net == NULL) { /* TSNH */ return; } override_bit = SCTP_CWR_REDUCE_OVERRIDE; } else { override_bit = 0; } } else { override_bit = 0; } if (SCTP_TSN_GT(tsn, net->cwr_window_tsn) && ((override_bit & SCTP_CWR_REDUCE_OVERRIDE) == 0)) { /* * JRS - Use the congestion control given in the pluggable * CC module */ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo(stcb, net, 0, pkt_cnt); /* * We reduce once every RTT. So we will only lower cwnd at * the next sending seq i.e. the window_data_tsn */ net->cwr_window_tsn = window_data_tsn; net->ecn_ce_pkt_cnt += pkt_cnt; net->lost_cnt = pkt_cnt; net->last_cwr_tsn = tsn; } else { override_bit |= SCTP_CWR_IN_SAME_WINDOW; if (SCTP_TSN_GT(tsn, net->last_cwr_tsn) && ((override_bit & SCTP_CWR_REDUCE_OVERRIDE) == 0)) { /* * Another loss in the same window update how many * marks/packets lost we have had. */ int cnt = 1; if (pkt_cnt > net->lost_cnt) { /* Should be the case */ cnt = (pkt_cnt - net->lost_cnt); net->ecn_ce_pkt_cnt += cnt; } net->lost_cnt = pkt_cnt; net->last_cwr_tsn = tsn; /* * Most CC functions will ignore this call, since we * are in-window yet of the initial CE the peer saw. */ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo(stcb, net, 1, cnt); } } /* * We always send a CWR this way if our previous one was lost our * peer will get an update, or if it is not time again to reduce we * still get the cwr to the peer. Note we set the override when we * could not find the TSN on the chunk or the destination network. */ sctp_send_cwr(stcb, net, net->last_cwr_tsn, override_bit); } static void sctp_handle_ecn_cwr(struct sctp_cwr_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net) { /* * Here we get a CWR from the peer. We must look in the outqueue and * make sure that we have a covered ECNE in the control chunk part. * If so remove it. */ struct sctp_tmit_chunk *chk; struct sctp_ecne_chunk *ecne; int override; uint32_t cwr_tsn; cwr_tsn = ntohl(cp->tsn); override = cp->ch.chunk_flags & SCTP_CWR_REDUCE_OVERRIDE; TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) { if (chk->rec.chunk_id.id != SCTP_ECN_ECHO) { continue; } if ((override == 0) && (chk->whoTo != net)) { /* Must be from the right src unless override is set */ continue; } ecne = mtod(chk->data, struct sctp_ecne_chunk *); if (SCTP_TSN_GE(cwr_tsn, ntohl(ecne->tsn))) { /* this covers this ECNE, we can remove it */ stcb->asoc.ecn_echo_cnt_onq--; TAILQ_REMOVE(&stcb->asoc.control_send_queue, chk, sctp_next); sctp_m_freem(chk->data); chk->data = NULL; stcb->asoc.ctrl_queue_cnt--; sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); if (override == 0) { break; } } } } static void sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp SCTP_UNUSED, struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_association *asoc; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_complete: handling SHUTDOWN-COMPLETE\n"); if (stcb == NULL) return; asoc = &stcb->asoc; /* process according to association state */ if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) { /* unexpected SHUTDOWN-COMPLETE... so ignore... */ SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_complete: not in SCTP_STATE_SHUTDOWN_ACK_SENT --- ignore\n"); SCTP_TCB_UNLOCK(stcb); return; } /* notify upper layer protocol */ if (stcb->sctp_socket) { sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } #ifdef INVARIANTS if (!TAILQ_EMPTY(&asoc->send_queue) || !TAILQ_EMPTY(&asoc->sent_queue) || !stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { panic("Queues are not empty when handling SHUTDOWN-COMPLETE"); } #endif /* stop the timer */ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_24); SCTP_STAT_INCR_COUNTER32(sctps_shutdown); /* free the TCB */ SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_complete: calls free-asoc\n"); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(stcb->sctp_ep); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_25); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif return; } static int process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc, struct sctp_nets *net, uint8_t flg) { switch (desc->chunk_type) { case SCTP_DATA: /* find the tsn to resend (possibly */ { uint32_t tsn; struct sctp_tmit_chunk *tp1; tsn = ntohl(desc->tsn_ifany); TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) { if (tp1->rec.data.TSN_seq == tsn) { /* found it */ break; } if (SCTP_TSN_GT(tp1->rec.data.TSN_seq, tsn)) { /* not found */ tp1 = NULL; break; } } if (tp1 == NULL) { /* * Do it the other way , aka without paying * attention to queue seq order. */ SCTP_STAT_INCR(sctps_pdrpdnfnd); TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) { if (tp1->rec.data.TSN_seq == tsn) { /* found it */ break; } } } if (tp1 == NULL) { SCTP_STAT_INCR(sctps_pdrptsnnf); } if ((tp1) && (tp1->sent < SCTP_DATAGRAM_ACKED)) { uint8_t *ddp; if (((flg & SCTP_BADCRC) == 0) && ((flg & SCTP_FROM_MIDDLE_BOX) == 0)) { return (0); } if ((stcb->asoc.peers_rwnd == 0) && ((flg & SCTP_FROM_MIDDLE_BOX) == 0)) { SCTP_STAT_INCR(sctps_pdrpdiwnp); return (0); } if (stcb->asoc.peers_rwnd == 0 && (flg & SCTP_FROM_MIDDLE_BOX)) { SCTP_STAT_INCR(sctps_pdrpdizrw); return (0); } ddp = (uint8_t *) (mtod(tp1->data, caddr_t)+ sizeof(struct sctp_data_chunk)); { unsigned int iii; for (iii = 0; iii < sizeof(desc->data_bytes); iii++) { if (ddp[iii] != desc->data_bytes[iii]) { SCTP_STAT_INCR(sctps_pdrpbadd); return (-1); } } } if (tp1->do_rtt) { /* * this guy had a RTO calculation * pending on it, cancel it */ if (tp1->whoTo->rto_needed == 0) { tp1->whoTo->rto_needed = 1; } tp1->do_rtt = 0; } SCTP_STAT_INCR(sctps_pdrpmark); if (tp1->sent != SCTP_DATAGRAM_RESEND) sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); /* * mark it as if we were doing a FR, since * we will be getting gap ack reports behind * the info from the router. */ tp1->rec.data.doing_fast_retransmit = 1; /* * mark the tsn with what sequences can * cause a new FR. */ if (TAILQ_EMPTY(&stcb->asoc.send_queue)) { tp1->rec.data.fast_retran_tsn = stcb->asoc.sending_seq; } else { tp1->rec.data.fast_retran_tsn = (TAILQ_FIRST(&stcb->asoc.send_queue))->rec.data.TSN_seq; } /* restart the timer */ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, tp1->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_26); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, tp1->whoTo); /* fix counts and things */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PDRP, tp1->whoTo->flight_size, tp1->book_size, (uintptr_t) stcb, tp1->rec.data.TSN_seq); } if (tp1->sent < SCTP_DATAGRAM_RESEND) { sctp_flight_size_decrease(tp1); sctp_total_flight_decrease(stcb, tp1); } tp1->sent = SCTP_DATAGRAM_RESEND; } { /* audit code */ unsigned int audit; audit = 0; TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) { if (tp1->sent == SCTP_DATAGRAM_RESEND) audit++; } TAILQ_FOREACH(tp1, &stcb->asoc.control_send_queue, sctp_next) { if (tp1->sent == SCTP_DATAGRAM_RESEND) audit++; } if (audit != stcb->asoc.sent_queue_retran_cnt) { SCTP_PRINTF("**Local Audit finds cnt:%d asoc cnt:%d\n", audit, stcb->asoc.sent_queue_retran_cnt); #ifndef SCTP_AUDITING_ENABLED stcb->asoc.sent_queue_retran_cnt = audit; #endif } } } break; case SCTP_ASCONF: { struct sctp_tmit_chunk *asconf; TAILQ_FOREACH(asconf, &stcb->asoc.control_send_queue, sctp_next) { if (asconf->rec.chunk_id.id == SCTP_ASCONF) { break; } } if (asconf) { if (asconf->sent != SCTP_DATAGRAM_RESEND) sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); asconf->sent = SCTP_DATAGRAM_RESEND; asconf->snd_count--; } } break; case SCTP_INITIATION: /* resend the INIT */ stcb->asoc.dropped_special_cnt++; if (stcb->asoc.dropped_special_cnt < SCTP_RETRY_DROPPED_THRESH) { /* * If we can get it in, in a few attempts we do * this, otherwise we let the timer fire. */ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_27); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); } break; case SCTP_SELECTIVE_ACK: case SCTP_NR_SELECTIVE_ACK: /* resend the sack */ sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED); break; case SCTP_HEARTBEAT_REQUEST: /* resend a demand HB */ if ((stcb->asoc.overall_error_count + 3) < stcb->asoc.max_send_times) { /* * Only retransmit if we KNOW we wont destroy the * tcb */ sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED); } break; case SCTP_SHUTDOWN: sctp_send_shutdown(stcb, net); break; case SCTP_SHUTDOWN_ACK: sctp_send_shutdown_ack(stcb, net); break; case SCTP_COOKIE_ECHO: { struct sctp_tmit_chunk *cookie; cookie = NULL; TAILQ_FOREACH(cookie, &stcb->asoc.control_send_queue, sctp_next) { if (cookie->rec.chunk_id.id == SCTP_COOKIE_ECHO) { break; } } if (cookie) { if (cookie->sent != SCTP_DATAGRAM_RESEND) sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); cookie->sent = SCTP_DATAGRAM_RESEND; sctp_stop_all_cookie_timers(stcb); } } break; case SCTP_COOKIE_ACK: sctp_send_cookie_ack(stcb); break; case SCTP_ASCONF_ACK: /* resend last asconf ack */ sctp_send_asconf_ack(stcb); break; case SCTP_FORWARD_CUM_TSN: send_forward_tsn(stcb, &stcb->asoc); break; /* can't do anything with these */ case SCTP_PACKET_DROPPED: case SCTP_INITIATION_ACK: /* this should not happen */ case SCTP_HEARTBEAT_ACK: case SCTP_ABORT_ASSOCIATION: case SCTP_OPERATION_ERROR: case SCTP_SHUTDOWN_COMPLETE: case SCTP_ECN_ECHO: case SCTP_ECN_CWR: default: break; } return (0); } void sctp_reset_in_stream(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t * list) { uint32_t i; uint16_t temp; /* * We set things to 0xffff since this is the last delivered sequence * and we will be sending in 0 after the reset. */ if (number_entries) { for (i = 0; i < number_entries; i++) { temp = ntohs(list[i]); if (temp >= stcb->asoc.streamincnt) { continue; } stcb->asoc.strmin[temp].last_sequence_delivered = 0xffff; } } else { list = NULL; for (i = 0; i < stcb->asoc.streamincnt; i++) { stcb->asoc.strmin[i].last_sequence_delivered = 0xffff; } } sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_RECV, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED); } static void sctp_reset_out_streams(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t * list) { uint32_t i; uint16_t temp; if (number_entries > 0) { for (i = 0; i < number_entries; i++) { temp = ntohs(list[i]); if (temp >= stcb->asoc.streamoutcnt) { /* no such stream */ continue; } stcb->asoc.strmout[temp].next_sequence_send = 0; } } else { for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].next_sequence_send = 0; } } sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_SEND, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED); } static void sctp_reset_clear_pending(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t * list) { uint32_t i; uint16_t temp; if (number_entries > 0) { for (i = 0; i < number_entries; i++) { temp = ntohs(list[i]); if (temp >= stcb->asoc.streamoutcnt) { /* no such stream */ continue; } stcb->asoc.strmout[temp].state = SCTP_STREAM_OPEN; } } else { for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].state = SCTP_STREAM_OPEN; } } } struct sctp_stream_reset_request * sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chunk **bchk) { struct sctp_association *asoc; struct sctp_chunkhdr *ch; struct sctp_stream_reset_request *r; struct sctp_tmit_chunk *chk; int len, clen; asoc = &stcb->asoc; if (TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { asoc->stream_reset_outstanding = 0; return (NULL); } if (stcb->asoc.str_reset == NULL) { asoc->stream_reset_outstanding = 0; return (NULL); } chk = stcb->asoc.str_reset; if (chk->data == NULL) { return (NULL); } if (bchk) { /* he wants a copy of the chk pointer */ *bchk = chk; } clen = chk->send_size; ch = mtod(chk->data, struct sctp_chunkhdr *); r = (struct sctp_stream_reset_request *)(ch + 1); if (ntohl(r->request_seq) == seq) { /* found it */ return (r); } len = SCTP_SIZE32(ntohs(r->ph.param_length)); if (clen > (len + (int)sizeof(struct sctp_chunkhdr))) { /* move to the next one, there can only be a max of two */ r = (struct sctp_stream_reset_request *)((caddr_t)r + len); if (ntohl(r->request_seq) == seq) { return (r); } } /* that seq is not here */ return (NULL); } static void sctp_clean_up_stream_reset(struct sctp_tcb *stcb) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk = stcb->asoc.str_reset; if (stcb->asoc.str_reset == NULL) { return; } asoc = &stcb->asoc; sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_28); TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } asoc->ctrl_queue_cnt--; sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); /* sa_ignore NO_NULL_CHK */ stcb->asoc.str_reset = NULL; } static int sctp_handle_stream_reset_response(struct sctp_tcb *stcb, uint32_t seq, uint32_t action, struct sctp_stream_reset_response *respin) { uint16_t type; int lparm_len; struct sctp_association *asoc = &stcb->asoc; struct sctp_tmit_chunk *chk; struct sctp_stream_reset_request *req_param; struct sctp_stream_reset_out_request *req_out_param; struct sctp_stream_reset_in_request *req_in_param; uint32_t number_entries; if (asoc->stream_reset_outstanding == 0) { /* duplicate */ return (0); } if (seq == stcb->asoc.str_reset_seq_out) { req_param = sctp_find_stream_reset(stcb, seq, &chk); if (req_param != NULL) { stcb->asoc.str_reset_seq_out++; type = ntohs(req_param->ph.param_type); lparm_len = ntohs(req_param->ph.param_length); if (type == SCTP_STR_RESET_OUT_REQUEST) { int no_clear = 0; req_out_param = (struct sctp_stream_reset_out_request *)req_param; number_entries = (lparm_len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t); asoc->stream_reset_out_is_outstanding = 0; if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) { /* do it */ sctp_reset_out_streams(stcb, number_entries, req_out_param->list_of_streams); } else if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_OUT, stcb, number_entries, req_out_param->list_of_streams, SCTP_SO_NOT_LOCKED); } else if (action == SCTP_STREAM_RESET_RESULT_IN_PROGRESS) { /* * Set it up so we don't stop * retransmitting */ stcb->asoc.str_reset_seq_out--; asoc->stream_reset_out_is_outstanding = 1; no_clear = 1; } else { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_OUT, stcb, number_entries, req_out_param->list_of_streams, SCTP_SO_NOT_LOCKED); } if (no_clear == 0) { sctp_reset_clear_pending(stcb, number_entries, req_out_param->list_of_streams); } } else if (type == SCTP_STR_RESET_IN_REQUEST) { req_in_param = (struct sctp_stream_reset_in_request *)req_param; number_entries = (lparm_len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t); if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_IN, stcb, number_entries, req_in_param->list_of_streams, SCTP_SO_NOT_LOCKED); } else if (action != SCTP_STREAM_RESET_RESULT_PERFORMED) { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_IN, stcb, number_entries, req_in_param->list_of_streams, SCTP_SO_NOT_LOCKED); } } else if (type == SCTP_STR_RESET_ADD_OUT_STREAMS) { /* Ok we now may have more streams */ int num_stream; num_stream = stcb->asoc.strm_pending_add_size; if (num_stream > (stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt)) { /* TSNH */ num_stream = stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt; } stcb->asoc.strm_pending_add_size = 0; if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) { /* Put the new streams into effect */ int i; for (i = asoc->streamoutcnt; i < (asoc->streamoutcnt + num_stream); i++) { asoc->strmout[i].state = SCTP_STREAM_OPEN; } asoc->streamoutcnt += num_stream; sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, 0); } else if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, SCTP_STREAM_CHANGE_DENIED); } else { sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, SCTP_STREAM_CHANGE_FAILED); } } else if (type == SCTP_STR_RESET_ADD_IN_STREAMS) { if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, SCTP_STREAM_CHANGE_DENIED); } else if (action != SCTP_STREAM_RESET_RESULT_PERFORMED) { sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, SCTP_STREAM_CHANGE_FAILED); } } else if (type == SCTP_STR_RESET_TSN_REQUEST) { /** * a) Adopt the new in tsn. * b) reset the map * c) Adopt the new out-tsn */ struct sctp_stream_reset_response_tsn *resp; struct sctp_forward_tsn_chunk fwdtsn; int abort_flag = 0; if (respin == NULL) { /* huh ? */ return (0); } if (ntohs(respin->ph.param_length) < sizeof(struct sctp_stream_reset_response_tsn)) { return (0); } if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) { resp = (struct sctp_stream_reset_response_tsn *)respin; asoc->stream_reset_outstanding--; fwdtsn.ch.chunk_length = htons(sizeof(struct sctp_forward_tsn_chunk)); fwdtsn.ch.chunk_type = SCTP_FORWARD_CUM_TSN; fwdtsn.new_cumulative_tsn = htonl(ntohl(resp->senders_next_tsn) - 1); sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0); if (abort_flag) { return (1); } stcb->asoc.highest_tsn_inside_map = (ntohl(resp->senders_next_tsn) - 1); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(0, 7, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } stcb->asoc.tsn_last_delivered = stcb->asoc.cumulative_tsn = stcb->asoc.highest_tsn_inside_map; stcb->asoc.mapping_array_base_tsn = ntohl(resp->senders_next_tsn); memset(stcb->asoc.mapping_array, 0, stcb->asoc.mapping_array_size); stcb->asoc.highest_tsn_inside_nr_map = stcb->asoc.highest_tsn_inside_map; memset(stcb->asoc.nr_mapping_array, 0, stcb->asoc.mapping_array_size); stcb->asoc.sending_seq = ntohl(resp->receivers_next_tsn); stcb->asoc.last_acked_seq = stcb->asoc.cumulative_tsn; sctp_reset_out_streams(stcb, 0, (uint16_t *) NULL); sctp_reset_in_stream(stcb, 0, (uint16_t *) NULL); sctp_notify_stream_reset_tsn(stcb, stcb->asoc.sending_seq, (stcb->asoc.mapping_array_base_tsn + 1), 0); } else if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_notify_stream_reset_tsn(stcb, stcb->asoc.sending_seq, (stcb->asoc.mapping_array_base_tsn + 1), SCTP_ASSOC_RESET_DENIED); } else { sctp_notify_stream_reset_tsn(stcb, stcb->asoc.sending_seq, (stcb->asoc.mapping_array_base_tsn + 1), SCTP_ASSOC_RESET_FAILED); } } /* get rid of the request and get the request flags */ if (asoc->stream_reset_outstanding == 0) { sctp_clean_up_stream_reset(stcb); } } } if (asoc->stream_reset_outstanding == 0) { sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_NOT_LOCKED); } return (0); } static void sctp_handle_str_reset_request_in(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_in_request *req, int trunc) { uint32_t seq; int len, i; int number_entries; uint16_t temp; /* * peer wants me to send a str-reset to him for my outgoing seq's if * seq_in is right. */ struct sctp_association *asoc = &stcb->asoc; seq = ntohl(req->request_seq); if (asoc->str_reset_seq_in == seq) { asoc->last_reset_action[1] = asoc->last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_RESET_STREAM_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (trunc) { /* Can't do it, since they exceeded our buffer size */ asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (stcb->asoc.stream_reset_out_is_outstanding == 0) { len = ntohs(req->ph.param_length); number_entries = ((len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t)); if (number_entries) { for (i = 0; i < number_entries; i++) { temp = ntohs(req->list_of_streams[i]); if (temp >= stcb->asoc.streamoutcnt) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; goto bad_boy; } req->list_of_streams[i] = temp; } for (i = 0; i < number_entries; i++) { if (stcb->asoc.strmout[req->list_of_streams[i]].state == SCTP_STREAM_OPEN) { stcb->asoc.strmout[req->list_of_streams[i]].state = SCTP_STREAM_RESET_PENDING; } } } else { /* Its all */ for (i = 0; i < stcb->asoc.streamoutcnt; i++) { if (stcb->asoc.strmout[i].state == SCTP_STREAM_OPEN) stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_PENDING; } } asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; } else { /* Can't do it, since we have sent one out */ asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS; } bad_boy: sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); asoc->str_reset_seq_in++; } else if (asoc->str_reset_seq_in - 1 == seq) { sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); } else if (asoc->str_reset_seq_in - 2 == seq) { sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_NOT_LOCKED); } static int sctp_handle_str_reset_request_tsn(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_tsn_request *req) { /* reset all in and out and update the tsn */ /* * A) reset my str-seq's on in and out. B) Select a receive next, * and set cum-ack to it. Also process this selected number as a * fwd-tsn as well. C) set in the response my next sending seq. */ struct sctp_forward_tsn_chunk fwdtsn; struct sctp_association *asoc = &stcb->asoc; int abort_flag = 0; uint32_t seq; seq = ntohl(req->request_seq); if (asoc->str_reset_seq_in == seq) { asoc->last_reset_action[1] = stcb->asoc.last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else { fwdtsn.ch.chunk_length = htons(sizeof(struct sctp_forward_tsn_chunk)); fwdtsn.ch.chunk_type = SCTP_FORWARD_CUM_TSN; fwdtsn.ch.chunk_flags = 0; fwdtsn.new_cumulative_tsn = htonl(stcb->asoc.highest_tsn_inside_map + 1); sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0); if (abort_flag) { return (1); } asoc->highest_tsn_inside_map += SCTP_STREAM_RESET_TSN_DELTA; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(0, 10, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } asoc->tsn_last_delivered = asoc->cumulative_tsn = asoc->highest_tsn_inside_map; asoc->mapping_array_base_tsn = asoc->highest_tsn_inside_map + 1; memset(asoc->mapping_array, 0, asoc->mapping_array_size); asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map; memset(asoc->nr_mapping_array, 0, asoc->mapping_array_size); atomic_add_int(&asoc->sending_seq, 1); /* save off historical data for retrans */ asoc->last_sending_seq[1] = asoc->last_sending_seq[0]; asoc->last_sending_seq[0] = asoc->sending_seq; asoc->last_base_tsnsent[1] = asoc->last_base_tsnsent[0]; asoc->last_base_tsnsent[0] = asoc->mapping_array_base_tsn; sctp_reset_out_streams(stcb, 0, (uint16_t *) NULL); sctp_reset_in_stream(stcb, 0, (uint16_t *) NULL); asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; sctp_notify_stream_reset_tsn(stcb, asoc->sending_seq, (asoc->mapping_array_base_tsn + 1), 0); } sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[0], asoc->last_sending_seq[0], asoc->last_base_tsnsent[0]); asoc->str_reset_seq_in++; } else if (asoc->str_reset_seq_in - 1 == seq) { sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[0], asoc->last_sending_seq[0], asoc->last_base_tsnsent[0]); } else if (asoc->str_reset_seq_in - 2 == seq) { sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[1], asoc->last_sending_seq[1], asoc->last_base_tsnsent[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } return (0); } static void sctp_handle_str_reset_request_out(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_out_request *req, int trunc) { uint32_t seq, tsn; int number_entries, len; struct sctp_association *asoc = &stcb->asoc; seq = ntohl(req->request_seq); /* now if its not a duplicate we process it */ if (asoc->str_reset_seq_in == seq) { len = ntohs(req->ph.param_length); number_entries = ((len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t)); /* * the sender is resetting, handle the list issue.. we must * a) verify if we can do the reset, if so no problem b) If * we can't do the reset we must copy the request. c) queue * it, and setup the data in processor to trigger it off * when needed and dequeue all the queued data. */ tsn = ntohl(req->send_reset_at_tsn); /* move the reset action back one */ asoc->last_reset_action[1] = asoc->last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_RESET_STREAM_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (trunc) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (SCTP_TSN_GE(asoc->cumulative_tsn, tsn)) { /* we can do it now */ sctp_reset_in_stream(stcb, number_entries, req->list_of_streams); asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; } else { /* * we must queue it up and thus wait for the TSN's * to arrive that are at or before tsn */ struct sctp_stream_reset_list *liste; int siz; siz = sizeof(struct sctp_stream_reset_list) + (number_entries * sizeof(uint16_t)); SCTP_MALLOC(liste, struct sctp_stream_reset_list *, siz, SCTP_M_STRESET); if (liste == NULL) { /* gak out of memory */ asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); return; } liste->seq = seq; liste->tsn = tsn; liste->number_entries = number_entries; memcpy(&liste->list_of_streams, req->list_of_streams, number_entries * sizeof(uint16_t)); TAILQ_INSERT_TAIL(&asoc->resetHead, liste, next_resp); asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_IN_PROGRESS; } sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); asoc->str_reset_seq_in++; } else if ((asoc->str_reset_seq_in - 1) == seq) { /* * one seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); } else if ((asoc->str_reset_seq_in - 2) == seq) { /* * two seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } } static void sctp_handle_str_reset_add_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_add_strm *str_add) { /* * Peer is requesting to add more streams. If its within our * max-streams we will allow it. */ uint32_t num_stream, i; uint32_t seq; struct sctp_association *asoc = &stcb->asoc; struct sctp_queued_to_read *ctl, *nctl; /* Get the number. */ seq = ntohl(str_add->request_seq); num_stream = ntohs(str_add->number_of_streams); /* Now what would be the new total? */ if (asoc->str_reset_seq_in == seq) { num_stream += stcb->asoc.streamincnt; stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if ((num_stream > stcb->asoc.max_inbound_streams) || (num_stream > 0xffff)) { /* We must reject it they ask for to many */ denied: stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else { /* Ok, we can do that :-) */ struct sctp_stream_in *oldstrm; /* save off the old */ oldstrm = stcb->asoc.strmin; SCTP_MALLOC(stcb->asoc.strmin, struct sctp_stream_in *, (num_stream * sizeof(struct sctp_stream_in)), SCTP_M_STRMI); if (stcb->asoc.strmin == NULL) { stcb->asoc.strmin = oldstrm; goto denied; } /* copy off the old data */ for (i = 0; i < stcb->asoc.streamincnt; i++) { TAILQ_INIT(&stcb->asoc.strmin[i].inqueue); stcb->asoc.strmin[i].stream_no = i; stcb->asoc.strmin[i].last_sequence_delivered = oldstrm[i].last_sequence_delivered; stcb->asoc.strmin[i].delivery_started = oldstrm[i].delivery_started; /* now anything on those queues? */ TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].inqueue, next, nctl) { TAILQ_REMOVE(&oldstrm[i].inqueue, ctl, next); TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].inqueue, ctl, next); } } /* Init the new streams */ for (i = stcb->asoc.streamincnt; i < num_stream; i++) { TAILQ_INIT(&stcb->asoc.strmin[i].inqueue); stcb->asoc.strmin[i].stream_no = i; stcb->asoc.strmin[i].last_sequence_delivered = 0xffff; stcb->asoc.strmin[i].delivery_started = 0; } SCTP_FREE(oldstrm, SCTP_M_STRMI); /* update the size */ stcb->asoc.streamincnt = num_stream; stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, 0); } sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); asoc->str_reset_seq_in++; } else if ((asoc->str_reset_seq_in - 1) == seq) { /* * one seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); } else if ((asoc->str_reset_seq_in - 2) == seq) { /* * two seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } } static void sctp_handle_str_reset_add_out_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_add_strm *str_add) { /* * Peer is requesting to add more streams. If its within our * max-streams we will allow it. */ uint16_t num_stream; uint32_t seq; struct sctp_association *asoc = &stcb->asoc; /* Get the number. */ seq = ntohl(str_add->request_seq); num_stream = ntohs(str_add->number_of_streams); /* Now what would be the new total? */ if (asoc->str_reset_seq_in == seq) { stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (stcb->asoc.stream_reset_outstanding) { /* We must reject it we have something pending */ stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS; } else { /* Ok, we can do that :-) */ int mychk; mychk = stcb->asoc.streamoutcnt; mychk += num_stream; if (mychk < 0x10000) { stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; if (sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, 1, num_stream, 0, 1)) { stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } } else { stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } } sctp_add_stream_reset_result(chk, seq, stcb->asoc.last_reset_action[0]); asoc->str_reset_seq_in++; } else if ((asoc->str_reset_seq_in - 1) == seq) { /* * one seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); } else if ((asoc->str_reset_seq_in - 2) == seq) { /* * two seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } } #ifdef __GNUC__ __attribute__((noinline)) #endif static int sctp_handle_stream_reset(struct sctp_tcb *stcb, struct mbuf *m, int offset, struct sctp_chunkhdr *ch_req) { uint16_t remaining_length, param_len, ptype; struct sctp_paramhdr pstore; uint8_t cstore[SCTP_CHUNK_BUFFER_SIZE]; uint32_t seq = 0; int num_req = 0; int trunc = 0; struct sctp_tmit_chunk *chk; struct sctp_chunkhdr *ch; struct sctp_paramhdr *ph; int ret_code = 0; int num_param = 0; /* now it may be a reset or a reset-response */ remaining_length = ntohs(ch_req->chunk_length) - sizeof(struct sctp_chunkhdr); /* setup for adding the response */ sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { return (ret_code); } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_STREAM_RESET; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->no_fr_allowed = 0; chk->book_size = chk->send_size = sizeof(struct sctp_chunkhdr); chk->book_size_scale = 0; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { strres_nochunk: if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return (ret_code); } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); /* setup chunk parameters */ chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->whoTo = NULL; ch = mtod(chk->data, struct sctp_chunkhdr *); ch->chunk_type = SCTP_STREAM_RESET; ch->chunk_flags = 0; ch->chunk_length = htons(chk->send_size); SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size); offset += sizeof(struct sctp_chunkhdr); while (remaining_length >= sizeof(struct sctp_paramhdr)) { ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(pstore), (uint8_t *) & pstore); if (ph == NULL) { /* TSNH */ break; } param_len = ntohs(ph->param_length); if ((param_len > remaining_length) || (param_len < (sizeof(struct sctp_paramhdr) + sizeof(uint32_t)))) { /* bad parameter length */ break; } ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, min(param_len, sizeof(cstore)), (uint8_t *) & cstore); if (ph == NULL) { /* TSNH */ break; } ptype = ntohs(ph->param_type); num_param++; if (param_len > sizeof(cstore)) { trunc = 1; } else { trunc = 0; } if (num_param > SCTP_MAX_RESET_PARAMS) { /* hit the max of parameters already sorry.. */ break; } if (ptype == SCTP_STR_RESET_OUT_REQUEST) { struct sctp_stream_reset_out_request *req_out; if (param_len < sizeof(struct sctp_stream_reset_out_request)) { break; } req_out = (struct sctp_stream_reset_out_request *)ph; num_req++; if (stcb->asoc.stream_reset_outstanding) { seq = ntohl(req_out->response_seq); if (seq == stcb->asoc.str_reset_seq_out) { /* implicit ack */ (void)sctp_handle_stream_reset_response(stcb, seq, SCTP_STREAM_RESET_RESULT_PERFORMED, NULL); } } sctp_handle_str_reset_request_out(stcb, chk, req_out, trunc); } else if (ptype == SCTP_STR_RESET_ADD_OUT_STREAMS) { struct sctp_stream_reset_add_strm *str_add; if (param_len < sizeof(struct sctp_stream_reset_add_strm)) { break; } str_add = (struct sctp_stream_reset_add_strm *)ph; num_req++; sctp_handle_str_reset_add_strm(stcb, chk, str_add); } else if (ptype == SCTP_STR_RESET_ADD_IN_STREAMS) { struct sctp_stream_reset_add_strm *str_add; if (param_len < sizeof(struct sctp_stream_reset_add_strm)) { break; } str_add = (struct sctp_stream_reset_add_strm *)ph; num_req++; sctp_handle_str_reset_add_out_strm(stcb, chk, str_add); } else if (ptype == SCTP_STR_RESET_IN_REQUEST) { struct sctp_stream_reset_in_request *req_in; num_req++; req_in = (struct sctp_stream_reset_in_request *)ph; sctp_handle_str_reset_request_in(stcb, chk, req_in, trunc); } else if (ptype == SCTP_STR_RESET_TSN_REQUEST) { struct sctp_stream_reset_tsn_request *req_tsn; num_req++; req_tsn = (struct sctp_stream_reset_tsn_request *)ph; if (sctp_handle_str_reset_request_tsn(stcb, chk, req_tsn)) { ret_code = 1; goto strres_nochunk; } /* no more */ break; } else if (ptype == SCTP_STR_RESET_RESPONSE) { struct sctp_stream_reset_response *resp; uint32_t result; if (param_len < sizeof(struct sctp_stream_reset_response)) { break; } resp = (struct sctp_stream_reset_response *)ph; seq = ntohl(resp->response_seq); result = ntohl(resp->result); if (sctp_handle_stream_reset_response(stcb, seq, result, resp)) { ret_code = 1; goto strres_nochunk; } } else { break; } offset += SCTP_SIZE32(param_len); if (remaining_length >= SCTP_SIZE32(param_len)) { remaining_length -= SCTP_SIZE32(param_len); } else { remaining_length = 0; } } if (num_req == 0) { /* we have no response free the stuff */ goto strres_nochunk; } /* ok we have a chunk to link in */ TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); stcb->asoc.ctrl_queue_cnt++; return (ret_code); } /* * Handle a router or endpoints report of a packet loss, there are two ways * to handle this, either we get the whole packet and must disect it * ourselves (possibly with truncation and or corruption) or it is a summary * from a middle box that did the disectting for us. */ static void sctp_handle_packet_dropped(struct sctp_pktdrop_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t limit) { uint32_t bottle_bw, on_queue; uint16_t trunc_len; unsigned int chlen; unsigned int at; struct sctp_chunk_desc desc; struct sctp_chunkhdr *ch; chlen = ntohs(cp->ch.chunk_length); chlen -= sizeof(struct sctp_pktdrop_chunk); /* XXX possible chlen underflow */ if (chlen == 0) { ch = NULL; if (cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX) SCTP_STAT_INCR(sctps_pdrpbwrpt); } else { ch = (struct sctp_chunkhdr *)(cp->data + sizeof(struct sctphdr)); chlen -= sizeof(struct sctphdr); /* XXX possible chlen underflow */ memset(&desc, 0, sizeof(desc)); } trunc_len = (uint16_t) ntohs(cp->trunc_len); if (trunc_len > limit) { trunc_len = limit; } /* now the chunks themselves */ while ((ch != NULL) && (chlen >= sizeof(struct sctp_chunkhdr))) { desc.chunk_type = ch->chunk_type; /* get amount we need to move */ at = ntohs(ch->chunk_length); if (at < sizeof(struct sctp_chunkhdr)) { /* corrupt chunk, maybe at the end? */ SCTP_STAT_INCR(sctps_pdrpcrupt); break; } if (trunc_len == 0) { /* we are supposed to have all of it */ if (at > chlen) { /* corrupt skip it */ SCTP_STAT_INCR(sctps_pdrpcrupt); break; } } else { /* is there enough of it left ? */ if (desc.chunk_type == SCTP_DATA) { if (chlen < (sizeof(struct sctp_data_chunk) + sizeof(desc.data_bytes))) { break; } } else { if (chlen < sizeof(struct sctp_chunkhdr)) { break; } } } if (desc.chunk_type == SCTP_DATA) { /* can we get out the tsn? */ if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX)) SCTP_STAT_INCR(sctps_pdrpmbda); if (chlen >= (sizeof(struct sctp_data_chunk) + sizeof(uint32_t))) { /* yep */ struct sctp_data_chunk *dcp; uint8_t *ddp; unsigned int iii; dcp = (struct sctp_data_chunk *)ch; ddp = (uint8_t *) (dcp + 1); for (iii = 0; iii < sizeof(desc.data_bytes); iii++) { desc.data_bytes[iii] = ddp[iii]; } desc.tsn_ifany = dcp->dp.tsn; } else { /* nope we are done. */ SCTP_STAT_INCR(sctps_pdrpnedat); break; } } else { if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX)) SCTP_STAT_INCR(sctps_pdrpmbct); } if (process_chunk_drop(stcb, &desc, net, cp->ch.chunk_flags)) { SCTP_STAT_INCR(sctps_pdrppdbrk); break; } if (SCTP_SIZE32(at) > chlen) { break; } chlen -= SCTP_SIZE32(at); if (chlen < sizeof(struct sctp_chunkhdr)) { /* done, none left */ break; } ch = (struct sctp_chunkhdr *)((caddr_t)ch + SCTP_SIZE32(at)); } /* Now update any rwnd --- possibly */ if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX) == 0) { /* From a peer, we get a rwnd report */ uint32_t a_rwnd; SCTP_STAT_INCR(sctps_pdrpfehos); bottle_bw = ntohl(cp->bottle_bw); on_queue = ntohl(cp->current_onq); if (bottle_bw && on_queue) { /* a rwnd report is in here */ if (bottle_bw > on_queue) a_rwnd = bottle_bw - on_queue; else a_rwnd = 0; if (a_rwnd == 0) stcb->asoc.peers_rwnd = 0; else { if (a_rwnd > stcb->asoc.total_flight) { stcb->asoc.peers_rwnd = a_rwnd - stcb->asoc.total_flight; } else { stcb->asoc.peers_rwnd = 0; } if (stcb->asoc.peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { /* SWS sender side engages */ stcb->asoc.peers_rwnd = 0; } } } } else { SCTP_STAT_INCR(sctps_pdrpfmbox); } /* now middle boxes in sat networks get a cwnd bump */ if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX) && (stcb->asoc.sat_t3_loss_recovery == 0) && (stcb->asoc.sat_network)) { /* * This is debateable but for sat networks it makes sense * Note if a T3 timer has went off, we will prohibit any * changes to cwnd until we exit the t3 loss recovery. */ stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped(stcb, net, cp, &bottle_bw, &on_queue); } } /* * handles all control chunks in a packet inputs: - m: mbuf chain, assumed to * still contain IP/SCTP header - stcb: is the tcb found for this packet - * offset: offset into the mbuf chain to first chunkhdr - length: is the * length of the complete packet outputs: - length: modified to remaining * length after control processing - netp: modified to new sctp_nets after * cookie-echo processing - return NULL to discard the packet (ie. no asoc, * bad packet,...) otherwise return the tcb for this packet */ #ifdef __GNUC__ __attribute__((noinline)) #endif static struct sctp_tcb * sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets **netp, int *fwd_tsn_seen, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { struct sctp_association *asoc; struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; uint32_t vtag_in; int num_chunks = 0; /* number of control chunks processed */ uint32_t chk_length; int ret; int abort_no_unlock = 0; int ecne_seen = 0; /* * How big should this be, and should it be alloc'd? Lets try the * d-mtu-ceiling for now (2k) and that should hopefully work ... * until we get into jumbo grams and such.. */ uint8_t chunk_buf[SCTP_CHUNK_BUFFER_SIZE]; struct sctp_tcb *locked_tcb = stcb; int got_auth = 0; uint32_t auth_offset = 0, auth_len = 0; int auth_skipped = 0; int asconf_cnt = 0; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_control: iphlen=%u, offset=%u, length=%u stcb:%p\n", iphlen, *offset, length, (void *)stcb); /* validate chunk header length... */ if (ntohs(ch->chunk_length) < sizeof(*ch)) { SCTPDBG(SCTP_DEBUG_INPUT1, "Invalid header length %d\n", ntohs(ch->chunk_length)); if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } /* * validate the verification tag */ vtag_in = ntohl(sh->v_tag); if (locked_tcb) { SCTP_TCB_LOCK_ASSERT(locked_tcb); } if (ch->chunk_type == SCTP_INITIATION) { SCTPDBG(SCTP_DEBUG_INPUT1, "Its an INIT of len:%d vtag:%x\n", ntohs(ch->chunk_length), vtag_in); if (vtag_in != 0) { /* protocol error- silently discard... */ SCTP_STAT_INCR(sctps_badvtag); if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } } else if (ch->chunk_type != SCTP_COOKIE_ECHO) { /* * If there is no stcb, skip the AUTH chunk and process * later after a stcb is found (to validate the lookup was * valid. */ if ((ch->chunk_type == SCTP_AUTHENTICATION) && (stcb == NULL) && (inp->auth_supported == 1)) { /* save this chunk for later processing */ auth_skipped = 1; auth_offset = *offset; auth_len = ntohs(ch->chunk_length); /* (temporarily) move past this chunk */ *offset += SCTP_SIZE32(auth_len); if (*offset >= length) { /* no more data left in the mbuf chain */ *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, sizeof(struct sctp_chunkhdr), chunk_buf); } if (ch == NULL) { /* Help */ *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } if (ch->chunk_type == SCTP_COOKIE_ECHO) { goto process_control_chunks; } /* * first check if it's an ASCONF with an unknown src addr we * need to look inside to find the association */ if (ch->chunk_type == SCTP_ASCONF && stcb == NULL) { struct sctp_chunkhdr *asconf_ch = ch; uint32_t asconf_offset = 0, asconf_len = 0; /* inp's refcount may be reduced */ SCTP_INP_INCR_REF(inp); asconf_offset = *offset; do { asconf_len = ntohs(asconf_ch->chunk_length); if (asconf_len < sizeof(struct sctp_asconf_paramhdr)) break; stcb = sctp_findassociation_ep_asconf(m, *offset, dst, sh, &inp, netp, vrf_id); if (stcb != NULL) break; asconf_offset += SCTP_SIZE32(asconf_len); asconf_ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, asconf_offset, sizeof(struct sctp_chunkhdr), chunk_buf); } while (asconf_ch != NULL && asconf_ch->chunk_type == SCTP_ASCONF); if (stcb == NULL) { /* * reduce inp's refcount if not reduced in * sctp_findassociation_ep_asconf(). */ SCTP_INP_DECR_REF(inp); } else { locked_tcb = stcb; } /* now go back and verify any auth chunk to be sure */ if (auth_skipped && (stcb != NULL)) { struct sctp_auth_chunk *auth; auth = (struct sctp_auth_chunk *) sctp_m_getptr(m, auth_offset, auth_len, chunk_buf); got_auth = 1; auth_skipped = 0; if ((auth == NULL) || sctp_handle_auth(stcb, auth, m, auth_offset)) { /* auth HMAC failed so dump it */ *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } else { /* remaining chunks are HMAC checked */ stcb->asoc.authenticated = 1; } } } if (stcb == NULL) { snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __FUNCTION__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); /* no association, so it's out of the blue... */ sctp_handle_ootb(m, iphlen, *offset, src, dst, sh, inp, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } asoc = &stcb->asoc; /* ABORT and SHUTDOWN can use either v_tag... */ if ((ch->chunk_type == SCTP_ABORT_ASSOCIATION) || (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) || (ch->chunk_type == SCTP_PACKET_DROPPED)) { /* Take the T-bit always into account. */ if ((((ch->chunk_flags & SCTP_HAD_NO_TCB) == 0) && (vtag_in == asoc->my_vtag)) || (((ch->chunk_flags & SCTP_HAD_NO_TCB) == SCTP_HAD_NO_TCB) && (vtag_in == asoc->peer_vtag))) { /* this is valid */ } else { /* drop this packet... */ SCTP_STAT_INCR(sctps_badvtag); if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } } else if (ch->chunk_type == SCTP_SHUTDOWN_ACK) { if (vtag_in != asoc->my_vtag) { /* * this could be a stale SHUTDOWN-ACK or the * peer never got the SHUTDOWN-COMPLETE and * is still hung; we have started a new asoc * but it won't complete until the shutdown * is completed */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __FUNCTION__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, *offset, src, dst, sh, inp, op_err, mflowtype, mflowid, fibnum, vrf_id, port); return (NULL); } } else { /* for all other chunks, vtag must match */ if (vtag_in != asoc->my_vtag) { /* invalid vtag... */ SCTPDBG(SCTP_DEBUG_INPUT3, "invalid vtag: %xh, expect %xh\n", vtag_in, asoc->my_vtag); SCTP_STAT_INCR(sctps_badvtag); if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } } } /* end if !SCTP_COOKIE_ECHO */ /* * process all control chunks... */ if (((ch->chunk_type == SCTP_SELECTIVE_ACK) || (ch->chunk_type == SCTP_NR_SELECTIVE_ACK) || (ch->chunk_type == SCTP_HEARTBEAT_REQUEST)) && (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED)) { /* implied cookie-ack.. we must have lost the ack */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, *netp); } process_control_chunks: while (IS_SCTP_CONTROL(ch)) { /* validate chunk length */ chk_length = ntohs(ch->chunk_length); SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_process_control: processing a chunk type=%u, len=%u\n", ch->chunk_type, chk_length); SCTP_LTRACE_CHK(inp, stcb, ch->chunk_type, chk_length); if (chk_length < sizeof(*ch) || (*offset + (int)chk_length) > length) { *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } SCTP_STAT_INCR_COUNTER64(sctps_incontrolchunks); /* * INIT-ACK only gets the init ack "header" portion only * because we don't have to process the peer's COOKIE. All * others get a complete chunk. */ if ((ch->chunk_type == SCTP_INITIATION_ACK) || (ch->chunk_type == SCTP_INITIATION)) { /* get an init-ack chunk */ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, sizeof(struct sctp_init_ack_chunk), chunk_buf); if (ch == NULL) { *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } } else { /* For cookies and all other chunks. */ if (chk_length > sizeof(chunk_buf)) { /* * use just the size of the chunk buffer so * the front part of our chunks fit in * contiguous space up to the chunk buffer * size (508 bytes). For chunks that need to * get more than that they must use the * sctp_m_getptr() function or other means * (e.g. know how to parse mbuf chains). * Cookies do this already. */ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, (sizeof(chunk_buf) - 4), chunk_buf); if (ch == NULL) { *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } } else { /* We can fit it all */ ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, chk_length, chunk_buf); if (ch == NULL) { SCTP_PRINTF("sctp_process_control: Can't get the all data....\n"); *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } } } num_chunks++; /* Save off the last place we got a control from */ if (stcb != NULL) { if (((netp != NULL) && (*netp != NULL)) || (ch->chunk_type == SCTP_ASCONF)) { /* * allow last_control to be NULL if * ASCONF... ASCONF processing will find the * right net later */ if ((netp != NULL) && (*netp != NULL)) stcb->asoc.last_control_chunk_from = *netp; } } #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xB0, ch->chunk_type); #endif /* check to see if this chunk required auth, but isn't */ if ((stcb != NULL) && (stcb->asoc.auth_supported == 1) && sctp_auth_is_required_chunk(ch->chunk_type, stcb->asoc.local_auth_chunks) && !stcb->asoc.authenticated) { /* "silently" ignore */ SCTP_STAT_INCR(sctps_recvauthmissing); goto next_chunk; } switch (ch->chunk_type) { case SCTP_INITIATION: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_INIT\n"); /* The INIT chunk must be the only chunk. */ if ((num_chunks > 1) || (length - *offset > (int)SCTP_SIZE32(chk_length))) { /* RFC 4960 requires that no ABORT is sent */ *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } /* Honor our resource limit. */ if (chk_length > SCTP_LARGEST_INIT_ACCEPTED) { op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); *offset = length; return (NULL); } sctp_handle_init(m, iphlen, *offset, src, dst, sh, (struct sctp_init_chunk *)ch, inp, stcb, &abort_no_unlock, mflowtype, mflowid, vrf_id, port); *offset = length; if ((!abort_no_unlock) && (locked_tcb)) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); break; case SCTP_PAD_CHUNK: break; case SCTP_INITIATION_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_INIT-ACK\n"); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ if ((stcb) && (stcb->asoc.total_output_queue_size)) { ; } else { if (locked_tcb != stcb) { /* Very unlikely */ SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; if (stcb) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(inp); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_29); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif } return (NULL); } } /* The INIT-ACK chunk must be the only chunk. */ if ((num_chunks > 1) || (length - *offset > (int)SCTP_SIZE32(chk_length))) { *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } if ((netp) && (*netp)) { ret = sctp_handle_init_ack(m, iphlen, *offset, src, dst, sh, (struct sctp_init_ack_chunk *)ch, stcb, *netp, &abort_no_unlock, mflowtype, mflowid, vrf_id); } else { ret = -1; } *offset = length; if (abort_no_unlock) { return (NULL); } /* * Special case, I must call the output routine to * get the cookie echoed */ if ((stcb != NULL) && (ret == 0)) { sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); } if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); break; case SCTP_SELECTIVE_ACK: { struct sctp_sack_chunk *sack; int abort_now = 0; uint32_t a_rwnd, cum_ack; uint16_t num_seg, num_dup; uint8_t flags; int offset_seg, offset_dup; SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SACK\n"); SCTP_STAT_INCR(sctps_recvsacks); if (stcb == NULL) { SCTPDBG(SCTP_DEBUG_INDATA1, "No stcb when processing SACK chunk\n"); break; } if (chk_length < sizeof(struct sctp_sack_chunk)) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size on SACK chunk, too small\n"); break; } if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) { /*- * If we have sent a shutdown-ack, we will pay no * attention to a sack sent in to us since * we don't care anymore. */ break; } sack = (struct sctp_sack_chunk *)ch; flags = ch->chunk_flags; cum_ack = ntohl(sack->sack.cum_tsn_ack); num_seg = ntohs(sack->sack.num_gap_ack_blks); num_dup = ntohs(sack->sack.num_dup_tsns); a_rwnd = (uint32_t) ntohl(sack->sack.a_rwnd); if (sizeof(struct sctp_sack_chunk) + num_seg * sizeof(struct sctp_gap_ack_block) + num_dup * sizeof(uint32_t) != chk_length) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size of SACK chunk\n"); break; } offset_seg = *offset + sizeof(struct sctp_sack_chunk); offset_dup = offset_seg + num_seg * sizeof(struct sctp_gap_ack_block); SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SACK process cum_ack:%x num_seg:%d a_rwnd:%d\n", cum_ack, num_seg, a_rwnd); stcb->asoc.seen_a_sack_this_pkt = 1; if ((stcb->asoc.pr_sctp_cnt == 0) && (num_seg == 0) && SCTP_TSN_GE(cum_ack, stcb->asoc.last_acked_seq) && (stcb->asoc.saw_sack_with_frags == 0) && (stcb->asoc.saw_sack_with_nr_frags == 0) && (!TAILQ_EMPTY(&stcb->asoc.sent_queue)) ) { /* * We have a SIMPLE sack having no * prior segments and data on sent * queue to be acked.. Use the * faster path sack processing. We * also allow window update sacks * with no missing segments to go * this way too. */ sctp_express_handle_sack(stcb, cum_ack, a_rwnd, &abort_now, ecne_seen); } else { if (netp && *netp) sctp_handle_sack(m, offset_seg, offset_dup, stcb, num_seg, 0, num_dup, &abort_now, flags, cum_ack, a_rwnd, ecne_seen); } if (abort_now) { /* ABORT signal from sack processing */ *offset = length; return (NULL); } if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.stream_queue_cnt == 0)) { sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } } break; /* * EY - nr_sack: If the received chunk is an * nr_sack chunk */ case SCTP_NR_SELECTIVE_ACK: { struct sctp_nr_sack_chunk *nr_sack; int abort_now = 0; uint32_t a_rwnd, cum_ack; uint16_t num_seg, num_nr_seg, num_dup; uint8_t flags; int offset_seg, offset_dup; SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_NR_SACK\n"); SCTP_STAT_INCR(sctps_recvsacks); if (stcb == NULL) { SCTPDBG(SCTP_DEBUG_INDATA1, "No stcb when processing NR-SACK chunk\n"); break; } if (stcb->asoc.nrsack_supported == 0) { goto unknown_chunk; } if (chk_length < sizeof(struct sctp_nr_sack_chunk)) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size on NR-SACK chunk, too small\n"); break; } if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) { /*- * If we have sent a shutdown-ack, we will pay no * attention to a sack sent in to us since * we don't care anymore. */ break; } nr_sack = (struct sctp_nr_sack_chunk *)ch; flags = ch->chunk_flags; cum_ack = ntohl(nr_sack->nr_sack.cum_tsn_ack); num_seg = ntohs(nr_sack->nr_sack.num_gap_ack_blks); num_nr_seg = ntohs(nr_sack->nr_sack.num_nr_gap_ack_blks); num_dup = ntohs(nr_sack->nr_sack.num_dup_tsns); a_rwnd = (uint32_t) ntohl(nr_sack->nr_sack.a_rwnd); if (sizeof(struct sctp_nr_sack_chunk) + (num_seg + num_nr_seg) * sizeof(struct sctp_gap_ack_block) + num_dup * sizeof(uint32_t) != chk_length) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size of NR_SACK chunk\n"); break; } offset_seg = *offset + sizeof(struct sctp_nr_sack_chunk); offset_dup = offset_seg + num_seg * sizeof(struct sctp_gap_ack_block); SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_NR_SACK process cum_ack:%x num_seg:%d a_rwnd:%d\n", cum_ack, num_seg, a_rwnd); stcb->asoc.seen_a_sack_this_pkt = 1; if ((stcb->asoc.pr_sctp_cnt == 0) && (num_seg == 0) && (num_nr_seg == 0) && SCTP_TSN_GE(cum_ack, stcb->asoc.last_acked_seq) && (stcb->asoc.saw_sack_with_frags == 0) && (stcb->asoc.saw_sack_with_nr_frags == 0) && (!TAILQ_EMPTY(&stcb->asoc.sent_queue))) { /* * We have a SIMPLE sack having no * prior segments and data on sent * queue to be acked. Use the faster * path sack processing. We also * allow window update sacks with no * missing segments to go this way * too. */ sctp_express_handle_sack(stcb, cum_ack, a_rwnd, &abort_now, ecne_seen); } else { if (netp && *netp) sctp_handle_sack(m, offset_seg, offset_dup, stcb, num_seg, num_nr_seg, num_dup, &abort_now, flags, cum_ack, a_rwnd, ecne_seen); } if (abort_now) { /* ABORT signal from sack processing */ *offset = length; return (NULL); } if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.stream_queue_cnt == 0)) { sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } } break; case SCTP_HEARTBEAT_REQUEST: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT\n"); if ((stcb) && netp && *netp) { SCTP_STAT_INCR(sctps_recvheartbeat); sctp_send_heartbeat_ack(stcb, m, *offset, chk_length, *netp); /* He's alive so give him credit */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; } break; case SCTP_HEARTBEAT_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT-ACK\n"); if ((stcb == NULL) || (chk_length != sizeof(struct sctp_heartbeat_chunk))) { /* Its not ours */ *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } /* He's alive so give him credit */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; SCTP_STAT_INCR(sctps_recvheartbeatack); if (netp && *netp) sctp_handle_heartbeat_ack((struct sctp_heartbeat_chunk *)ch, stcb, *netp); break; case SCTP_ABORT_ASSOCIATION: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ABORT, stcb %p\n", (void *)stcb); if ((stcb) && netp && *netp) sctp_handle_abort((struct sctp_abort_chunk *)ch, stcb, *netp); *offset = length; return (NULL); break; case SCTP_SHUTDOWN: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN, stcb %p\n", (void *)stcb); if ((stcb == NULL) || (chk_length != sizeof(struct sctp_shutdown_chunk))) { *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } if (netp && *netp) { int abort_flag = 0; sctp_handle_shutdown((struct sctp_shutdown_chunk *)ch, stcb, *netp, &abort_flag); if (abort_flag) { *offset = length; return (NULL); } } break; case SCTP_SHUTDOWN_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN-ACK, stcb %p\n", (void *)stcb); if ((stcb) && (netp) && (*netp)) sctp_handle_shutdown_ack((struct sctp_shutdown_ack_chunk *)ch, stcb, *netp); *offset = length; return (NULL); break; case SCTP_OPERATION_ERROR: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_OP-ERR\n"); if ((stcb) && netp && *netp && sctp_handle_error(ch, stcb, *netp) < 0) { *offset = length; return (NULL); } break; case SCTP_COOKIE_ECHO: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_COOKIE-ECHO, stcb %p\n", (void *)stcb); if ((stcb) && (stcb->asoc.total_output_queue_size)) { ; } else { if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ abend: if (stcb) { SCTP_TCB_UNLOCK(stcb); } *offset = length; return (NULL); } } /* * First are we accepting? We do this again here * since it is possible that a previous endpoint WAS * listening responded to a INIT-ACK and then * closed. We opened and bound.. and are now no * longer listening. */ if ((stcb == NULL) && (inp->sctp_socket->so_qlen >= inp->sctp_socket->so_qlimit)) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit))) { op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); } *offset = length; return (NULL); } else { struct mbuf *ret_buf; struct sctp_inpcb *linp; if (stcb) { linp = NULL; } else { linp = inp; } if (linp) { SCTP_ASOC_CREATE_LOCK(linp); if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { SCTP_ASOC_CREATE_UNLOCK(linp); goto abend; } } if (netp) { ret_buf = sctp_handle_cookie_echo(m, iphlen, *offset, src, dst, sh, (struct sctp_cookie_echo_chunk *)ch, &inp, &stcb, netp, auth_skipped, auth_offset, auth_len, &locked_tcb, mflowtype, mflowid, vrf_id, port); } else { ret_buf = NULL; } if (linp) { SCTP_ASOC_CREATE_UNLOCK(linp); } if (ret_buf == NULL) { if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } SCTPDBG(SCTP_DEBUG_INPUT3, "GAK, null buffer\n"); *offset = length; return (NULL); } /* if AUTH skipped, see if it verified... */ if (auth_skipped) { got_auth = 1; auth_skipped = 0; } if (!TAILQ_EMPTY(&stcb->asoc.sent_queue)) { /* * Restart the timer if we have * pending data */ struct sctp_tmit_chunk *chk; chk = TAILQ_FIRST(&stcb->asoc.sent_queue); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, chk->whoTo); } } break; case SCTP_COOKIE_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_COOKIE-ACK, stcb %p\n", (void *)stcb); if ((stcb == NULL) || chk_length != sizeof(struct sctp_cookie_ack_chunk)) { if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ if ((stcb) && (stcb->asoc.total_output_queue_size)) { ; } else if (stcb) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(inp); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_30); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif *offset = length; return (NULL); } } /* He's alive so give him credit */ if ((stcb) && netp && *netp) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, *netp); } break; case SCTP_ECN_ECHO: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN-ECHO\n"); /* He's alive so give him credit */ if ((stcb == NULL) || (chk_length != sizeof(struct sctp_ecne_chunk))) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } if (stcb) { if (stcb->asoc.ecn_supported == 0) { goto unknown_chunk; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_ecn_echo((struct sctp_ecne_chunk *)ch, stcb); ecne_seen = 1; } break; case SCTP_ECN_CWR: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN-CWR\n"); /* He's alive so give him credit */ if ((stcb == NULL) || (chk_length != sizeof(struct sctp_cwr_chunk))) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } if (stcb) { if (stcb->asoc.ecn_supported == 0) { goto unknown_chunk; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_ecn_cwr((struct sctp_cwr_chunk *)ch, stcb, *netp); } break; case SCTP_SHUTDOWN_COMPLETE: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN-COMPLETE, stcb %p\n", (void *)stcb); /* must be first and only chunk */ if ((num_chunks > 1) || (length - *offset > (int)SCTP_SIZE32(chk_length))) { *offset = length; if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } return (NULL); } if ((stcb) && netp && *netp) { sctp_handle_shutdown_complete((struct sctp_shutdown_complete_chunk *)ch, stcb, *netp); } *offset = length; return (NULL); break; case SCTP_ASCONF: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF\n"); /* He's alive so give him credit */ if (stcb) { if (stcb->asoc.asconf_supported == 0) { goto unknown_chunk; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_asconf(m, *offset, src, (struct sctp_asconf_chunk *)ch, stcb, asconf_cnt == 0); asconf_cnt++; } break; case SCTP_ASCONF_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF-ACK\n"); if (chk_length < sizeof(struct sctp_asconf_ack_chunk)) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } if ((stcb) && netp && *netp) { if (stcb->asoc.asconf_supported == 0) { goto unknown_chunk; } /* He's alive so give him credit */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_asconf_ack(m, *offset, (struct sctp_asconf_ack_chunk *)ch, stcb, *netp, &abort_no_unlock); if (abort_no_unlock) return (NULL); } break; case SCTP_FORWARD_CUM_TSN: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_FWD-TSN\n"); if (chk_length < sizeof(struct sctp_forward_tsn_chunk)) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } /* He's alive so give him credit */ if (stcb) { int abort_flag = 0; if (stcb->asoc.prsctp_supported == 0) { goto unknown_chunk; } stcb->asoc.overall_error_count = 0; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } *fwd_tsn_seen = 1; if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(inp); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_31); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif *offset = length; return (NULL); } sctp_handle_forward_tsn(stcb, (struct sctp_forward_tsn_chunk *)ch, &abort_flag, m, *offset); if (abort_flag) { *offset = length; return (NULL); } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; } } break; case SCTP_STREAM_RESET: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_STREAM_RESET\n"); if (((stcb == NULL) || (ch == NULL) || (chk_length < sizeof(struct sctp_stream_reset_tsn_req)))) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } if (stcb->asoc.reconfig_supported == 0) { goto unknown_chunk; } if (sctp_handle_stream_reset(stcb, m, *offset, ch)) { /* stop processing */ *offset = length; return (NULL); } break; case SCTP_PACKET_DROPPED: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_PACKET_DROPPED\n"); /* re-get it all please */ if (chk_length < sizeof(struct sctp_pktdrop_chunk)) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } if (ch && (stcb) && netp && (*netp)) { if (stcb->asoc.pktdrop_supported == 0) { goto unknown_chunk; } sctp_handle_packet_dropped((struct sctp_pktdrop_chunk *)ch, stcb, *netp, min(chk_length, (sizeof(chunk_buf) - 4))); } break; case SCTP_AUTHENTICATION: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_AUTHENTICATION\n"); if (stcb == NULL) { /* save the first AUTH for later processing */ if (auth_skipped == 0) { auth_offset = *offset; auth_len = chk_length; auth_skipped = 1; } /* skip this chunk (temporarily) */ goto next_chunk; } if (stcb->asoc.auth_supported == 0) { goto unknown_chunk; } if ((chk_length < (sizeof(struct sctp_auth_chunk))) || (chk_length > (sizeof(struct sctp_auth_chunk) + SCTP_AUTH_DIGEST_LEN_MAX))) { /* Its not ours */ if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } if (got_auth == 1) { /* skip this chunk... it's already auth'd */ goto next_chunk; } got_auth = 1; if ((ch == NULL) || sctp_handle_auth(stcb, (struct sctp_auth_chunk *)ch, m, *offset)) { /* auth HMAC failed so dump the packet */ *offset = length; return (stcb); } else { /* remaining chunks are HMAC checked */ stcb->asoc.authenticated = 1; } break; default: unknown_chunk: /* it's an unknown chunk! */ if ((ch->chunk_type & 0x40) && (stcb != NULL)) { - struct mbuf *mm; - struct sctp_paramhdr *phd; + struct sctp_gen_error_cause *cause; int len; - mm = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), + op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_gen_error_cause), 0, M_NOWAIT, 1, MT_DATA); - if (mm) { + if (op_err != NULL) { len = min(SCTP_SIZE32(chk_length), (uint32_t) (length - *offset)); - phd = mtod(mm, struct sctp_paramhdr *); - /* - * We cheat and use param type since - * we did not bother to define a - * error cause struct. They are the - * same basic format with different - * names. - */ - phd->param_type = htons(SCTP_CAUSE_UNRECOG_CHUNK); - phd->param_length = htons(len + sizeof(*phd)); - SCTP_BUF_LEN(mm) = sizeof(*phd); - SCTP_BUF_NEXT(mm) = SCTP_M_COPYM(m, *offset, len, M_NOWAIT); - if (SCTP_BUF_NEXT(mm)) { + cause = mtod(op_err, struct sctp_gen_error_cause *); + cause->code = htons(SCTP_CAUSE_UNRECOG_CHUNK); + cause->length = htons(len + sizeof(struct sctp_gen_error_cause)); + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_gen_error_cause); + SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(m, *offset, len, M_NOWAIT); + if (SCTP_BUF_NEXT(op_err) != NULL) { #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(SCTP_BUF_NEXT(mm), SCTP_MBUF_ICOPY); } #endif - sctp_queue_op_err(stcb, mm); + sctp_queue_op_err(stcb, op_err); } else { - sctp_m_freem(mm); + sctp_m_freem(op_err); } } } if ((ch->chunk_type & 0x80) == 0) { /* discard this packet */ *offset = length; return (stcb); } /* else skip this bad chunk and continue... */ break; } /* switch (ch->chunk_type) */ next_chunk: /* get the next chunk */ *offset += SCTP_SIZE32(chk_length); if (*offset >= length) { /* no more data left in the mbuf chain */ break; } ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, sizeof(struct sctp_chunkhdr), chunk_buf); if (ch == NULL) { if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } *offset = length; return (NULL); } } /* while */ if (asconf_cnt > 0 && stcb != NULL) { sctp_send_asconf_ack(stcb); } return (stcb); } #ifdef INVARIANTS #ifdef __GNUC__ __attribute__((noinline)) #endif void sctp_validate_no_locks(struct sctp_inpcb *inp) { struct sctp_tcb *lstcb; LIST_FOREACH(lstcb, &inp->sctp_asoc_list, sctp_tcblist) { if (mtx_owned(&lstcb->tcb_mtx)) { panic("Own lock on stcb at return from input"); } } if (mtx_owned(&inp->inp_create_mtx)) { panic("Own create lock on inp"); } if (mtx_owned(&inp->inp_mtx)) { panic("Own inp lock on inp"); } } #endif /* * common input chunk processing (v4 and v6) */ void sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int length, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_chunkhdr *ch, #if !defined(SCTP_WITH_NO_CSUM) uint8_t compute_crc, #endif uint8_t ecn_bits, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { uint32_t high_tsn; int fwd_tsn_seen = 0, data_processed = 0; struct mbuf *m = *mm, *op_err; char msg[SCTP_DIAG_INFO_LEN]; int un_sent; int cnt_ctrl_ready = 0; struct sctp_inpcb *inp = NULL, *inp_decr = NULL; struct sctp_tcb *stcb = NULL; struct sctp_nets *net = NULL; SCTP_STAT_INCR(sctps_recvdatagrams); #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xE0, 1); sctp_auditing(0, inp, stcb, net); #endif #if !defined(SCTP_WITH_NO_CSUM) if (compute_crc != 0) { uint32_t check, calc_check; check = sh->checksum; sh->checksum = 0; calc_check = sctp_calculate_cksum(m, iphlen); sh->checksum = check; if (calc_check != check) { SCTPDBG(SCTP_DEBUG_INPUT1, "Bad CSUM on SCTP packet calc_check:%x check:%x m:%p mlen:%d iphlen:%d\n", calc_check, check, (void *)m, length, iphlen); stcb = sctp_findassociation_addr(m, offset, src, dst, sh, ch, &inp, &net, vrf_id); #if defined(INET) || defined(INET6) if ((net != NULL) && (port != 0)) { if (net->port == 0) { sctp_pathmtu_adjustment(stcb, net->mtu - sizeof(struct udphdr)); } net->port = port; } #endif if (net != NULL) { net->flowtype = mflowtype; net->flowid = mflowid; } if ((inp != NULL) && (stcb != NULL)) { sctp_send_packet_dropped(stcb, net, m, length, iphlen, 1); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_INPUT_ERROR, SCTP_SO_NOT_LOCKED); } else if ((inp != NULL) && (stcb == NULL)) { inp_decr = inp; } SCTP_STAT_INCR(sctps_badsum); SCTP_STAT_INCR_COUNTER32(sctps_checksumerrors); goto out; } } #endif /* Destination port of 0 is illegal, based on RFC4960. */ if (sh->dest_port == 0) { SCTP_STAT_INCR(sctps_hdrops); goto out; } stcb = sctp_findassociation_addr(m, offset, src, dst, sh, ch, &inp, &net, vrf_id); #if defined(INET) || defined(INET6) if ((net != NULL) && (port != 0)) { if (net->port == 0) { sctp_pathmtu_adjustment(stcb, net->mtu - sizeof(struct udphdr)); } net->port = port; } #endif if (net != NULL) { net->flowtype = mflowtype; net->flowid = mflowid; } if (inp == NULL) { SCTP_STAT_INCR(sctps_noport); if (badport_bandlim(BANDLIM_SCTP_OOTB) < 0) { goto out; } if (ch->chunk_type == SCTP_SHUTDOWN_ACK) { sctp_send_shutdown_complete2(src, dst, sh, mflowtype, mflowid, fibnum, vrf_id, port); goto out; } if (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) { goto out; } if (ch->chunk_type != SCTP_ABORT_ASSOCIATION) { if ((SCTP_BASE_SYSCTL(sctp_blackhole) == 0) || ((SCTP_BASE_SYSCTL(sctp_blackhole) == 1) && (ch->chunk_type != SCTP_INIT))) { op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Out of the blue"); sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, fibnum, vrf_id, port); } } goto out; } else if (stcb == NULL) { inp_decr = inp; } #ifdef IPSEC /*- * I very much doubt any of the IPSEC stuff will work but I have no * idea, so I will leave it in place. */ if (inp != NULL) { switch (dst->sa_family) { #ifdef INET case AF_INET: if (ipsec4_in_reject(m, &inp->ip_inp.inp)) { SCTP_STAT_INCR(sctps_hdrops); goto out; } break; #endif #ifdef INET6 case AF_INET6: if (ipsec6_in_reject(m, &inp->ip_inp.inp)) { SCTP_STAT_INCR(sctps_hdrops); goto out; } break; #endif default: break; } } #endif SCTPDBG(SCTP_DEBUG_INPUT1, "Ok, Common input processing called, m:%p iphlen:%d offset:%d length:%d stcb:%p\n", (void *)m, iphlen, offset, length, (void *)stcb); if (stcb) { /* always clear this before beginning a packet */ stcb->asoc.authenticated = 0; stcb->asoc.seen_a_sack_this_pkt = 0; SCTPDBG(SCTP_DEBUG_INPUT1, "stcb:%p state:%x\n", (void *)stcb, stcb->asoc.state); if ((stcb->asoc.state & SCTP_STATE_WAS_ABORTED) || (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED)) { /*- * If we hit here, we had a ref count * up when the assoc was aborted and the * timer is clearing out the assoc, we should * NOT respond to any packet.. its OOTB. */ SCTP_TCB_UNLOCK(stcb); stcb = NULL; snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __FUNCTION__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); goto out; } } if (IS_SCTP_CONTROL(ch)) { /* process the control portion of the SCTP packet */ /* sa_ignore NO_NULL_CHK */ stcb = sctp_process_control(m, iphlen, &offset, length, src, dst, sh, ch, inp, stcb, &net, &fwd_tsn_seen, mflowtype, mflowid, fibnum, vrf_id, port); if (stcb) { /* * This covers us if the cookie-echo was there and * it changes our INP. */ inp = stcb->sctp_ep; #if defined(INET) || defined(INET6) if ((net) && (port)) { if (net->port == 0) { sctp_pathmtu_adjustment(stcb, net->mtu - sizeof(struct udphdr)); } net->port = port; } #endif } } else { /* * no control chunks, so pre-process DATA chunks (these * checks are taken care of by control processing) */ /* * if DATA only packet, and auth is required, then punt... * can't have authenticated without any AUTH (control) * chunks */ if ((stcb != NULL) && (stcb->asoc.auth_supported == 1) && sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks)) { /* "silently" ignore */ SCTP_STAT_INCR(sctps_recvauthmissing); goto out; } if (stcb == NULL) { /* out of the blue DATA chunk */ snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __FUNCTION__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err, mflowtype, mflowid, fibnum, vrf_id, port); goto out; } if (stcb->asoc.my_vtag != ntohl(sh->v_tag)) { /* v_tag mismatch! */ SCTP_STAT_INCR(sctps_badvtag); goto out; } } if (stcb == NULL) { /* * no valid TCB for this packet, or we found it's a bad * packet while processing control, or we're done with this * packet (done or skip rest of data), so we drop it... */ goto out; } /* * DATA chunk processing */ /* plow through the data chunks while length > offset */ /* * Rest should be DATA only. Check authentication state if AUTH for * DATA is required. */ if ((length > offset) && (stcb != NULL) && (stcb->asoc.auth_supported == 1) && sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks) && !stcb->asoc.authenticated) { /* "silently" ignore */ SCTP_STAT_INCR(sctps_recvauthmissing); SCTPDBG(SCTP_DEBUG_AUTH1, "Data chunk requires AUTH, skipped\n"); goto trigger_send; } if (length > offset) { int retval; /* * First check to make sure our state is correct. We would * not get here unless we really did have a tag, so we don't * abort if this happens, just dump the chunk silently. */ switch (SCTP_GET_STATE(&stcb->asoc)) { case SCTP_STATE_COOKIE_ECHOED: /* * we consider data with valid tags in this state * shows us the cookie-ack was lost. Imply it was * there. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, net); break; case SCTP_STATE_COOKIE_WAIT: /* * We consider OOTB any data sent during asoc setup. */ snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __FUNCTION__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); goto out; /* sa_ignore NOTREACHED */ break; case SCTP_STATE_EMPTY: /* should not happen */ case SCTP_STATE_INUSE: /* should not happen */ case SCTP_STATE_SHUTDOWN_RECEIVED: /* This is a peer error */ case SCTP_STATE_SHUTDOWN_ACK_SENT: default: goto out; /* sa_ignore NOTREACHED */ break; case SCTP_STATE_OPEN: case SCTP_STATE_SHUTDOWN_SENT: break; } /* plow through the data chunks while length > offset */ retval = sctp_process_data(mm, iphlen, &offset, length, inp, stcb, net, &high_tsn); if (retval == 2) { /* * The association aborted, NO UNLOCK needed since * the association is destroyed. */ stcb = NULL; goto out; } data_processed = 1; /* * Anything important needs to have been m_copy'ed in * process_data */ } /* take care of ecn */ if ((data_processed == 1) && (stcb->asoc.ecn_supported == 1) && ((ecn_bits & SCTP_CE_BITS) == SCTP_CE_BITS)) { /* Yep, we need to add a ECNE */ sctp_send_ecn_echo(stcb, net, high_tsn); } if ((data_processed == 0) && (fwd_tsn_seen)) { int was_a_gap; uint32_t highest_tsn; if (SCTP_TSN_GT(stcb->asoc.highest_tsn_inside_nr_map, stcb->asoc.highest_tsn_inside_map)) { highest_tsn = stcb->asoc.highest_tsn_inside_nr_map; } else { highest_tsn = stcb->asoc.highest_tsn_inside_map; } was_a_gap = SCTP_TSN_GT(highest_tsn, stcb->asoc.cumulative_tsn); stcb->asoc.send_sack = 1; sctp_sack_check(stcb, was_a_gap); } else if (fwd_tsn_seen) { stcb->asoc.send_sack = 1; } /* trigger send of any chunks in queue... */ trigger_send: #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xE0, 2); sctp_auditing(1, inp, stcb, net); #endif SCTPDBG(SCTP_DEBUG_INPUT1, "Check for chunk output prw:%d tqe:%d tf=%d\n", stcb->asoc.peers_rwnd, TAILQ_EMPTY(&stcb->asoc.control_send_queue), stcb->asoc.total_flight); un_sent = (stcb->asoc.total_output_queue_size - stcb->asoc.total_flight); if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { cnt_ctrl_ready = stcb->asoc.ctrl_queue_cnt - stcb->asoc.ecn_echo_cnt_onq; } if (cnt_ctrl_ready || ((un_sent) && (stcb->asoc.peers_rwnd > 0 || (stcb->asoc.peers_rwnd <= 0 && stcb->asoc.total_flight == 0)))) { SCTPDBG(SCTP_DEBUG_INPUT3, "Calling chunk OUTPUT\n"); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); SCTPDBG(SCTP_DEBUG_INPUT3, "chunk OUTPUT returns\n"); } #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xE0, 3); sctp_auditing(2, inp, stcb, net); #endif out: if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } if (inp_decr != NULL) { /* reduce ref-count */ SCTP_INP_WLOCK(inp_decr); SCTP_INP_DECR_REF(inp_decr); SCTP_INP_WUNLOCK(inp_decr); } #ifdef INVARIANTS if (inp != NULL) { sctp_validate_no_locks(inp); } #endif return; } #ifdef INET void sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port) { struct mbuf *m; int iphlen; uint32_t vrf_id = 0; uint8_t ecn_bits; struct sockaddr_in src, dst; struct ip *ip; struct sctphdr *sh; struct sctp_chunkhdr *ch; int length, offset; #if !defined(SCTP_WITH_NO_CSUM) uint8_t compute_crc; #endif uint32_t mflowid; uint8_t mflowtype; uint16_t fibnum; iphlen = off; if (SCTP_GET_PKT_VRFID(i_pak, vrf_id)) { SCTP_RELEASE_PKT(i_pak); return; } m = SCTP_HEADER_TO_CHAIN(i_pak); #ifdef SCTP_MBUF_LOGGING /* Log in any input mbufs */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(m, SCTP_MBUF_INPUT); } #endif #ifdef SCTP_PACKET_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) { sctp_packet_log(m); } #endif SCTPDBG(SCTP_DEBUG_CRCOFFLOAD, "sctp_input(): Packet of length %d received on %s with csum_flags 0x%b.\n", m->m_pkthdr.len, if_name(m->m_pkthdr.rcvif), (int)m->m_pkthdr.csum_flags, CSUM_BITS); mflowid = m->m_pkthdr.flowid; mflowtype = M_HASHTYPE_GET(m); fibnum = M_GETFIB(m); SCTP_STAT_INCR(sctps_recvpackets); SCTP_STAT_INCR_COUNTER64(sctps_inpackets); /* Get IP, SCTP, and first chunk header together in the first mbuf. */ offset = iphlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); if (SCTP_BUF_LEN(m) < offset) { if ((m = m_pullup(m, offset)) == NULL) { SCTP_STAT_INCR(sctps_hdrops); return; } } ip = mtod(m, struct ip *); sh = (struct sctphdr *)((caddr_t)ip + iphlen); ch = (struct sctp_chunkhdr *)((caddr_t)sh + sizeof(struct sctphdr)); offset -= sizeof(struct sctp_chunkhdr); memset(&src, 0, sizeof(struct sockaddr_in)); src.sin_family = AF_INET; src.sin_len = sizeof(struct sockaddr_in); src.sin_port = sh->src_port; src.sin_addr = ip->ip_src; memset(&dst, 0, sizeof(struct sockaddr_in)); dst.sin_family = AF_INET; dst.sin_len = sizeof(struct sockaddr_in); dst.sin_port = sh->dest_port; dst.sin_addr = ip->ip_dst; length = ntohs(ip->ip_len); /* Validate mbuf chain length with IP payload length. */ if (SCTP_HEADER_LEN(m) != length) { SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_input() length:%d reported length:%d\n", length, SCTP_HEADER_LEN(m)); SCTP_STAT_INCR(sctps_hdrops); goto out; } /* SCTP does not allow broadcasts or multicasts */ if (IN_MULTICAST(ntohl(dst.sin_addr.s_addr))) { goto out; } if (SCTP_IS_IT_BROADCAST(dst.sin_addr, m)) { goto out; } ecn_bits = ip->ip_tos; #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_recvnocrc); #else if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) { SCTP_STAT_INCR(sctps_recvhwcrc); compute_crc = 0; } else { SCTP_STAT_INCR(sctps_recvswcrc); compute_crc = 1; } #endif sctp_common_input_processing(&m, iphlen, offset, length, (struct sockaddr *)&src, (struct sockaddr *)&dst, sh, ch, #if !defined(SCTP_WITH_NO_CSUM) compute_crc, #endif ecn_bits, mflowtype, mflowid, fibnum, vrf_id, port); out: if (m) { sctp_m_freem(m); } return; } #if defined(__FreeBSD__) && defined(SCTP_MCORE_INPUT) && defined(SMP) extern int *sctp_cpuarry; #endif int sctp_input(struct mbuf **mp, int *offp, int proto SCTP_UNUSED) { struct mbuf *m; int off; m = *mp; off = *offp; #if defined(__FreeBSD__) && defined(SCTP_MCORE_INPUT) && defined(SMP) if (mp_ncpus > 1) { struct ip *ip; struct sctphdr *sh; int offset; int cpu_to_use; uint32_t flowid, tag; if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { flowid = m->m_pkthdr.flowid; } else { /* * No flow id built by lower layers fix it so we * create one. */ offset = off + sizeof(struct sctphdr); if (SCTP_BUF_LEN(m) < offset) { if ((m = m_pullup(m, offset)) == NULL) { SCTP_STAT_INCR(sctps_hdrops); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); sh = (struct sctphdr *)((caddr_t)ip + off); tag = htonl(sh->v_tag); flowid = tag ^ ntohs(sh->dest_port) ^ ntohs(sh->src_port); m->m_pkthdr.flowid = flowid; M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); } cpu_to_use = sctp_cpuarry[flowid % mp_ncpus]; sctp_queue_to_mcore(m, off, cpu_to_use); return (IPPROTO_DONE); } #endif sctp_input_with_port(m, off, 0); return (IPPROTO_DONE); } #endif Index: projects/iosched/sys =================================================================== --- projects/iosched/sys (revision 287721) +++ projects/iosched/sys (revision 287722) Property changes on: projects/iosched/sys ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys:r287701-287721 Index: projects/iosched/usr.sbin/ctladm/ctladm.8 =================================================================== --- projects/iosched/usr.sbin/ctladm/ctladm.8 (revision 287721) +++ projects/iosched/usr.sbin/ctladm/ctladm.8 (revision 287722) @@ -1,1124 +1,1127 @@ .\" .\" Copyright (c) 2003 Silicon Graphics International Corp. +.\" Copyright (c) 2015 Alexander Motin .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions, and the following disclaimer, .\" without modification. .\" 2. Redistributions in binary form must reproduce at minimum a disclaimer .\" substantially similar to the "NO WARRANTY" disclaimer below .\" ("Disclaimer") and any redistribution must be conditioned upon .\" including a substantially similar Disclaimer requirement for further .\" binary redistribution. .\" .\" NO WARRANTY .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS .\" "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT .\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR .\" A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT .\" HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, .\" STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING .\" IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGES. .\" .\" ctladm utility man page. .\" .\" Author: Ken Merry .\" .\" $Id: //depot/users/kenm/FreeBSD-test2/usr.sbin/ctladm/ctladm.8#3 $ .\" $FreeBSD$ .\" -.Dd September 10, 2015 +.Dd September 12, 2015 .Dt CTLADM 8 .Os .Sh NAME .Nm ctladm .Nd CAM Target Layer control utility .Sh SYNOPSIS .Nm .Aq Ar command .Op lun .Op generic args .Op command args .Nm .Ic tur .Aq lun .Op general options .Nm .Ic inquiry .Aq lun .Op general options .Nm .Ic reqsense .Aq lun .Op general options .Nm .Ic reportluns .Aq lun .Op general options .Nm .Ic read .Aq lun .Op general options .Aq Fl l Ar lba .Aq Fl d Ar datalen .Aq Fl f Ar file|- .Aq Fl b Ar blocksize_bytes .Op Fl c Ar cdbsize .Op Fl N .Nm .Ic write .Aq lun .Op general options .Aq Fl l Ar lba .Aq Fl d Ar datalen .Aq Fl f Ar file|- .Aq Fl b Ar blocksize_bytes .Op Fl c Ar cdbsize .Op Fl N .Nm .Ic readcap .Aq lun .Op general options .Op Fl c Ar cdbsize .Nm .Ic modesense .Aq lun .Aq Fl m Ar page | Fl l .Op Fl P Ar pc .Op Fl d .Op Fl S Ar subpage .Op Fl c Ar size .Nm .Ic start .Aq lun .Op general options .Op Fl i .Op Fl o .Nm .Ic stop .Aq lun .Op general options .Op Fl i .Op Fl o .Nm .Ic synccache .Aq lun .Op general options .Op Fl l Ar lba .Op Fl b Ar blockcount .Op Fl r .Op Fl i .Op Fl c Ar cdbsize .Nm .Ic shutdown .Op general options .Nm .Ic startup .Op general options .Nm .Ic lunlist .Nm .Ic delay .Aq lun .Aq Fl l Ar datamove|done .Aq Fl t Ar secs .Op Fl T Ar oneshot|cont .Nm .Ic realsync Aq on|off|query .Nm .Ic setsync interval .Aq lun .Aq Fl i Ar interval .Nm .Ic getsync .Aq lun .Nm .Ic inject .Aq Fl i Ar action .Aq Fl p Ar pattern .Op Fl r Ar lba,len .Op Fl s Ar len fmt Op Ar args .Op Fl c .Op Fl d Ar delete_id .Nm .Ic create .Aq Fl b Ar backend .Op Fl B Ar blocksize .Op Fl d Ar device_id .Op Fl l Ar lun_id .Op Fl o Ar name=value .Op Fl s Ar size_bytes .Op Fl S Ar serial_num .Op Fl t Ar device_type .Nm .Ic remove .Aq Fl b Ar backend .Aq Fl l Ar lun_id .Op Fl o Ar name=value .Nm .Ic modify .Aq Fl b Ar backend .Aq Fl l Ar lun_id .Op Fl o Ar name=value .Aq Fl s Ar size_bytes .Nm .Ic devlist .Op Fl b Ar backend .Op Fl v .Op Fl x .Nm .Ic port .Op Fl l .Op Fl o Ar on|off .Op Fl w Ar wwpn .Op Fl W Ar wwnn .Op Fl p Ar targ_port .Op Fl t Ar fe_type .Op Fl q .Op Fl x .Nm .Ic portlist .Op Fl f Ar frontend .Op Fl i .Op Fl l .Op Fl p Ar targ_port .Op Fl q .Op Fl v .Op Fl x .Nm .Ic lunmap .Aq Fl p Ar targ_port .Op Fl l Ar pLUN .Op Fl L Ar cLUN .Nm .Ic dumpooa .Nm .Ic dumpstructs .Nm .Ic islist .Op Fl v .Op Fl x .Nm .Ic islogout .Aq Fl a | Fl c Ar connection-id | Fl i Ar name | Fl p Ar portal .Nm .Ic isterminate .Aq Fl a | Fl c Ar connection-id | Fl i Ar name | Fl p Ar portal .Nm .Ic help .Sh DESCRIPTION The .Nm utility is designed to provide a way to access and control the CAM Target Layer (CTL). It provides a way to send .Tn SCSI commands to the CTL layer, and also provides some meta-commands that utilize .Tn SCSI commands. (For instance, the .Ic lunlist command is implemented using the .Tn SCSI REPORT LUNS and INQUIRY commands.) .Pp The .Nm utility has a number of primary functions, many of which require a device identifier. The device identifier takes the following form: .Bl -tag -width 14n .It lun Specify the LUN number to operate on. .El Many of the primary functions of the .Nm utility take the following optional arguments: .Bl -tag -width 10n .It Fl C Ar retries Specify the number of times to retry a command in the event of failure. .It Fl D Ar device Specify the device to open. This allows opening a device other than the default device, .Pa /dev/cam/ctl , to be opened for sending commands. .It Fl I Ar id Specify the initiator number to use. By default, .Nm will use 7 as the initiator number. .El .Pp Primary commands: .Bl -tag -width 11n .It Ic tur Send the .Tn SCSI TEST UNIT READY command to the device and report whether or not it is ready. .It Ic inquiry Send the .Tn SCSI INQUIRY command to the device and display some of the returned inquiry data. .It Ic reqsense Send the .Tn SCSI REQUEST SENSE command to the device and display the returned sense information. .It Ic reportluns Send the .Tn SCSI REPORT LUNS command to the device and display supported LUNs. .It Ic read Send a .Tn SCSI READ command to the device, and write the requested data to a file or stdout. .Bl -tag -width 12n .It Fl l Ar lba Specify the starting Logical Block Address for the READ. This can be specified in decimal, octal (starting with 0), hexadecimal (starting with 0x) or any other base supported by .Xr strtoull 3 . .It Fl d Ar datalen Specify the length, in 512 byte blocks, of the READ request. .It Fl f Ar file Specify the destination for the data read by the READ command. Either a filename or .Sq - for stdout may be specified. .It Fl c Ar cdbsize Specify the minimum .Tn SCSI CDB (Command Data Block) size to be used for the READ request. Allowable values are 6, 10, 12 and 16. Depending upon the LBA and amount of data requested, a larger CDB size may be used to satisfy the request. (e.g., for LBAs above 0xffffffff, READ(16) must be used to satisfy the request.) .It Fl b Ar blocksize Specify the blocksize of the underlying .Tn SCSI device, so the transfer length can be calculated accurately. The blocksize can be obtained via the .Tn SCSI READ CAPACITY command. .It Fl N Do not copy data to .Nm from the kernel when doing a read, just execute the command without copying data. This is to be used for performance testing. .El .It Ic write Read data from a file or stdin, and write the data to the device using the .Tn SCSI WRITE command. .Bl -tag -width 12n .It Fl l Ar lba Specify the starting Logical Block Address for the WRITE. This can be specified in decimal, octal (starting with 0), hexadecimal (starting with 0x) or any other base supported by .Xr strtoull 3 . .It Fl d Ar atalen Specify the length, in 512 byte blocks, of the WRITE request. .It Fl f Ar file Specify the source for the data to be written by the WRITE command. Either a filename or .Sq - for stdin may be specified. .It Fl c Ar cdbsize Specify the minimum .Tn SCSI CDB (Command Data Block) size to be used for the READ request. Allowable values are 6, 10, 12 and 16. Depending upon the LBA and amount of data requested, a larger CDB size may be used to satisfy the request. (e.g., for LBAs above 0xffffffff, READ(16) must be used to satisfy the request.) .It Fl b Ar blocksize Specify the blocksize of the underlying .Tn SCSI device, so the transfer length can be calculated accurately. The blocksize can be obtained via the .Tn SCSI READ CAPACITY command. .It Fl N Do not copy data to .Nm to the kernel when doing a write, just execute the command without copying data. This is to be used for performance testing. .El .It Ic readcap Send the .Tn SCSI READ CAPACITY command to the device and display the device size and device block size. By default, READ CAPACITY(10) is used. If the device returns a maximum LBA of 0xffffffff, however, .Nm will automatically issue a READ CAPACITY(16), which is implemented as a service action of the SERVICE ACTION IN(16) opcode. The user can specify the minimum CDB size with the .Fl c argument. Valid values for the .Fl c option are 10 and 16. If a 10 byte CDB is specified, the request will be automatically reissued with a 16 byte CDB if the maximum LBA returned is 0xffffffff. .It Ic modesense Send a .Tn SCSI MODE SENSE command to the device, and display the requested mode page(s) or page list. .Bl -tag -width 10n .It Fl m Ar page Specify the mode page to display. This option and the .Fl l option are mutually exclusive. One of the two must be specified, though. Mode page numbers may be specified in decimal or hexadecimal. .It Fl l Request that the list of mode pages supported by the device be returned. This option and the .Fl m option are mutually exclusive. One of the two must be specified, though. .It Fl P Ar pc Specify the mode page control value. Possible values are: .Bl -tag -width 2n -compact .It 0 Current values. .It 1 Changeable value bitmask. .It 2 Default values. .It 3 Saved values. .El .It Fl d Disable block descriptors when sending the mode sense request. .It Fl S Ar subpage Specify the subpage used with the mode sense request. .It Fl c Ar cdbsize Specify the CDB size used for the mode sense request. Supported values are 6 and 10. .El .It Ic start Send the .Tn SCSI START STOP UNIT command to the specified LUN with the start bit set. .Bl -tag -width 4n .It Fl i Set the immediate bit in the CDB. Note that CTL does not support the immediate bit, so this is primarily useful for making sure that CTL returns the proper error. .It Fl o Set the Copan proprietary on/offline bit in the CDB. When this flag is used, the LUN will be marked online again (see the description of the .Ic shutdown and .Ic startup commands). When this flag is used with a start command, the LUN will NOT be spun up. You need to use a start command without the .Fl o flag to spin up the disks in the LUN. .El .It Ic stop Send the .Tn SCSI START STOP UNIT command to the specified LUN with the start bit cleared. We use an ordered tag to stop the LUN, so we can guarantee that all pending I/O executes before it is stopped. (CTL guarantees this anyway, but .Nm sends an ordered tag for completeness.) .Bl -tag -width 4n .It Fl i Set the immediate bit in the CDB. Note that CTL does not support the immediate bit, so this is primarily useful for making sure that CTL returns the proper error. .It Fl o Set the Copan proprietary on/offline bit in the CDB. When this flag is used, the LUN will be spun down and taken offline ("Logical unit not ready, manual intervention required"). See the description of the .Ic shutdown and .Ic startup options. .El .It Ic synccache Send the .Tn SCSI SYNCHRONIZE CACHE command to the device. By default, SYNCHRONIZE CACHE(10) is used. If the specified starting LBA is greater than 0xffffffff or the length is greater than 0xffff, though, SYNCHRONIZE CACHE(16) will be used. The 16 byte command will also be used if the user specifies a 16 byte CDB with the .Fl c argument. .Bl -tag -width 14n .It Fl l Ar lba Specify the starting LBA of the cache region to synchronize. This option is a no-op for CTL. If you send a SYNCHRONIZE CACHE command, it will sync the cache for the entire LUN. .It Fl b Ar blockcount Specify the length of the cache region to synchronize. This option is a no-op for CTL. If you send a SYNCHRONIZE CACHE command, it will sync the cache for the entire LUN. .It Fl r Specify relative addressing for the starting LBA. CTL does not support relative addressing, since it only works for linked commands, and CTL does not support linked commands. .It Fl i Tell the target to return status immediately after issuing the SYNCHRONIZE CACHE command rather than waiting for the cache to finish syncing. CTL does not support this bit. .It Fl c Ar cdbsize Specify the minimum CDB size. Valid values are 10 and 16 bytes. .El .It Ic shutdown Issue a .Tn SCSI START STOP UNIT command with the start bit cleared and the on/offline bit set to all direct access LUNs. This will spin down all direct access LUNs, and mark them offline ("Logical unit not ready, manual intervention required"). Once marked offline, the state can only be cleared by sending a START STOP UNIT command with the start bit set and the on/offline bit set. The .Nm commands .Ic startup and .Ic start will accomplish this. Note that the on/offline bit is a non-standard Copan extension to the .Tn SCSI START STOP UNIT command, so merely sending a normal start command from an initiator will not clear the condition. (This is by design.) .It Ic startup Issue a .Tn SCSI START STOP UNIT command with the start bit set and the on/offline bit set to all direct access LUNs. This will mark all direct access LUNs "online" again. It will not cause any LUNs to start up. A separate start command without the on/offline bit set is necessary for that. .It Ic lunlist List all LUNs registered with CTL. Because this command uses the ioctl port, it will only work when the FETDs (Front End Target Drivers) are enabled. This command is the equivalent of doing a REPORT LUNS on one LUN and then an INQUIRY on each LUN in the system. .It Ic delay Delay commands at the given location. There are two places where commands may be delayed currently: before data is transferred .Pq Dq datamove and just prior to sending status to the host .Pq Dq done . One of the two must be supplied as an argument to the .Fl l option. The .Fl t option must also be specified. .Bl -tag -width 12n .It Fl l Ar delayloc Delay command(s) at the specified location. This can either be at the data movement stage (datamove) or prior to command completion (done). .It Fl t Ar delaytime Delay command(s) for the specified number of seconds. This must be specified. If set to 0, it will clear out any previously set delay for this particular location (datamove or done). .It Fl T Ar delaytype Specify the delay type. By default, the .Ic delay option will delay the next command sent to the given LUN. With the .Fl T Ar cont option, every command will be delayed by the specified period of time. With the .Fl T Ar oneshot the next command sent to the given LUN will be delayed and all subsequent commands will be completed normally. This is the default. .El .It Ic realsync Query and control CTL's SYNCHRONIZE CACHE behavior. The .Sq query argument will show whether SYNCHRONIZE CACHE commands are being sent to the backend or not. The default is to send SYNCHRONIZE CACHE commands to the backend. The .Sq on argument will cause all SYNCHRONIZE CACHE commands sent to all LUNs to be sent to the backend. The .Sq off argument will cause all SYNCHRONIZE CACHE commands sent to all LUNs to be immediately returned to the initiator with successful status. .It Ic setsync For a given lun, only actually service every Nth SYNCHRONIZE CACHE command that is sent. This can be used for debugging the optimal time period for sending SYNCHRONIZE cache commands. An interval of 0 means that the cache will be flushed for this LUN every time a SYNCHRONIZE CACHE command is received. .Pp You must specify the LUN you want to modify. .It Ic getsync Get the interval at which we actually service the SYNCHRONIZE CACHE command, as set by the .Ic setsync command above. The reported number means that we will actually flush the cache on every Nth SYNCHRONIZE CACHE command. A value of 0 means that we will flush the cache every time. .Pp You must specify the LUN you want to query. .It Ic inject Inject the specified type of error for the LUN specified, when a command that matches the given pattern is seen. The sense data returned is in either fixed or descriptor format, depending upon the status of the D_SENSE bit in the control mode page (page 0xa) for the LUN. .Pp Errors are only injected for commands that have not already failed for other reasons. By default, only the first command matching the pattern specified is returned with the supplied error. .Pp If the .Fl c flag is specified, all commands matching the pattern will be returned with the specified error until the error injection command is deleted with .Fl d flag. .Bl -tag -width 17n .It Fl i Ar action Specify the error to return: .Bl -tag -width 10n .It aborted Return the next matching command on the specified LUN with the sense key ABORTED COMMAND (0x0b), and the ASC/ASCQ 0x45,0x00 ("Select or reselect failure"). .It mediumerr Return the next matching command on the specified LUN with the sense key MEDIUM ERROR (0x03) and the ASC/ASCQ 0x11,0x00 ("Unrecovered read error") for reads, or ASC/ASCQ 0x0c,0x02 ("Write error - auto reallocation failed") for write errors. .It ua Return the next matching command on the specified LUN with the sense key UNIT ATTENTION (0x06) and the ASC/ASCQ 0x29,0x00 ("POWER ON, RESET, OR BUS DEVICE RESET OCCURRED"). .It custom Return the next matching command on the specified LUN with the supplied sense data. The .Fl s argument must be specified. .El .It Fl p Ar pattern Specify which commands should be returned with the given error. .Bl -tag -width 10n .It read The error should apply to READ(6), READ(10), READ(12), READ(16), etc. .It write The error should apply to WRITE(6), WRITE(10), WRITE(12), WRITE(16), WRITE AND VERIFY(10), etc. .It rw The error should apply to both read and write type commands. .It readcap The error should apply to READ CAPACITY(10) and READ CAPACITY(16) commands. .It tur The error should apply to TEST UNIT READY commands. .It any The error should apply to any command. .El .It Fl r Ar lba,len Specify the starting lba and length of the range of LBAs which should trigger an error. This option is only applies when read and/or write patterns are specified. If used with other command types, the error will never be triggered. .It Fl s Ar len fmt Op Ar args Specify the sense data that is to be returned for custom actions. If the format is .Sq - , len bytes of sense data will be read from standard input and written to the sense buffer. If len is longer than 252 bytes (the maximum allowable .Tn SCSI sense data length), it will be truncated to that length. The sense data format is described in .Xr cam_cdparse 3 . .It Fl c The error injection should be persistent, instead of happening once. Persistent errors must be deleted with the .Fl d argument. .It Fl d Ar delete_id Delete the specified error injection serial number. The serial number is returned when the error is injected. .El .It Ic port Perform one of several CTL frontend port operations. Either get a list of frontend ports .Pq Fl l , turn one or more frontends on or off .Pq Fl o Ar on|off , or set the World Wide Node Name .Pq Fl w Ar wwnn or World Wide Port Name .Pq Fl W Ar wwpn for a given port. One of .Fl l , .Fl o , or .Fl w or .Fl W must be specified. The WWNN and WWPN may both be specified at the same time, but cannot be combined with enabling/disabling or listing ports. .Bl -tag -width 12n .It Fl l List all CTL frontend ports or a specific port type or number. .It Fl o Ar on|off Turn the specified CTL frontend ports off or on. If no port number or port type is specified, all ports are turned on or off. .It Fl p Ar targ_port Specify the frontend port number. The port numbers can be found in the frontend port list. .It Fl q Omit the header in the port list output. .It Fl t Ar fe_type Specify the frontend type. Currently defined port types are .Dq fc (Fibre Channel), .Dq scsi (Parallel SCSI), .Dq ioctl (CTL ioctl interface), and .Dq internal (CTL CAM SIM). .It Fl w Ar wwnn Set the World Wide Node Name for the given port. The .Fl n argument must be specified, since this is only possible to implement on a single port. As a general rule, the WWNN should be the same across all ports on the system. .It Fl W Ar wwpn Set the World Wide Port Name for the given port. The .Fl n argument must be specified, since this is only possible to implement on a single port. As a general rule, the WWPN must be different for every port in the system. .It Fl x Output the port list in XML format. .El .It Ic portlist List CTL frontend ports. .Bl -tag -width 12n .It Fl f Ar frontend Specify the frontend type. .It Fl i Report target and connected initiators addresses. .It Fl l Report LUN mapping. .It Fl p Ar targ_port Specify the frontend port number. .It Fl q Omit the header in the port list output. .It Fl v Enable verbose output (report all port options). .It Fl x Output the port list in XML format. .El .It Ic lunmap Change LUN mapping for specified port. If both .Ar pLUN and .Ar cLUN are specified -- LUN will be mapped. If .Ar pLUN is specified, but .Ar cLUN is not -- LUN will be unmapped. If neither .Ar pLUN nor .Ar cLUN are specified -- LUN mapping will be disabled, exposing all CTL LUNs. .Bl -tag -width 12n .It Fl p Ar targ_port Specify the frontend port number. .It Fl l Ar pLUN LUN number visible by specified port. .It Fl L Ar cLUN CTL LUN number. .El .It Ic dumpooa Dump the OOA (Order Of Arrival) queue for each LUN registered with CTL. .It Ic dumpstructs Dump the CTL structures to the console. .It Ic create Create a new LUN. The backend must be specified, and depending upon the backend requested, some of the other options may be required. If the LUN is created successfully, the LUN configuration will be displayed. If LUN creation fails, a message will be displayed describing the failure. .Bl -tag -width 14n .It Fl b Ar backend The .Fl b flag is required. This specifies the name backend to use when creating the LUN. Examples are .Dq ramdisk and .Dq block . .It Fl B Ar blocksize Specify the blocksize of the backend in bytes. .It Fl d Ar device_id Specify the LUN-associated string to use in the .Tn SCSI INQUIRY VPD page 0x83 data. .It Fl l Ar lun_id Request that a particular LUN number be assigned. If the requested LUN number is not available, the request will fail. .It Fl o Ar name=value Specify a backend-specific name/value pair. Multiple .Fl o arguments may be specified. Refer to the backend documentation for arguments that may be used. .It Fl s Ar size_bytes Specify the size of the LUN in bytes. Some backends may allow setting the size (e.g. the ramdisk backend) and for others the size may be implicit (e.g. the block backend). .It Fl S Ar serial_num Specify the serial number to be used in the .Tn SCSI INQUIRY VPD page 0x80 data. .It Fl t Ar device_type Specify the numeric SCSI device type to use when creating the LUN. For example, the Direct Access type is 0. If this flag is not used, the type of LUN created is backend-specific. Not all LUN types are supported. Currently CTL only supports Direct Access (type 0) and Processor (type 3) LUNs. The backend requested may or may not support all of the LUN types that CTL supports. .El .It Ic remove Remove a LUN. The backend must be specified, and the LUN number must also be specified. Backend-specific options may also be specified with the .Fl o flag. .Bl -tag -width 14n .It Fl b Ar backend Specify the backend that owns the LUN to be removed. Examples are .Dq ramdisk and .Dq block . .It Fl l Ar lun_id Specify the LUN number to remove. .It Fl o Ar name=value Specify a backend-specific name/value pair. Multiple .Fl o arguments may be specified. Refer to the backend documentation for arguments that may be used. .El .It Ic modify Modify a LUN size. The backend, the LUN number, and the size must be specified. .Bl -tag -width 14n .It Fl b Ar backend Specify the backend that owns the LUN to be removed. Examples are .Dq ramdisk and .Dq block . .It Fl l Ar lun_id Specify the LUN number to remove. .It Fl o Ar name=value Specify a backend-specific name/value pair. Multiple .Fl o arguments may be specified. Refer to the backend documentation for arguments that may be used. .It Fl s Ar size_bytes Specify the size of the LUN in bytes. For the .Dq block backend, an .Dq auto keyword may be passed instead; this will make CTL use the size of backing file or device. .El .It Ic devlist Get a list of all configured LUNs. This also includes the LUN size and blocksize, serial number and device ID. .Bl -tag -width 11n .It Fl b Ar backend Specify the backend. This restricts the LUN list to the named backend. Examples are .Dq ramdisk and .Dq block . .It Fl v Be verbose. This will also display any backend-specific LUN attributes in addition to the standard per-LUN information. .It Fl x Dump the raw XML. The LUN list information from the kernel comes in XML format, and this option allows the display of the raw XML data. This option and the .Fl v and .Fl b options are mutually exclusive. If you specify .Fl x , the entire LUN database is displayed in XML format. .El .It Ic islist Get a list of currently running iSCSI sessions. This includes initiator and target names and the unique connection IDs. .Bl -tag -width 11n .It Fl v Verbose mode. .It Fl x Dump the raw XML. The sessions list information from the kernel comes in XML format, and this option allows the display of the raw XML data. .El .It Ic islogout Ask the initiator to log out iSCSI sessions matching criteria. .Bl -tag -width 11n .It Fl a Log out all sessions. .It Fl c Specify connection ID. .It Fl i Specify initiator name. .It Fl p Specify initiator portal (hostname or IP address). .El .It Ic isterminate Forcibly terminate iSCSI sessions matching criteria. .Bl -tag -width 11n .It Fl a Terminate all sessions. .It Fl c Specify connection ID. .It Fl i Specify initiator name. .It Fl p Specify initiator portal (hostname or IP address). .El .It Ic help Display .Nm usage information. .El .Sh OPTIONS Number of additional configuration options may be specified for LUNs. Some options are global, others are backend-specific. .Pp Global options: .Bl -tag -width 12n .It Va vendor Specifies LUN vendor string up to 8 chars. .It Va product Specifies LUN product string up to 16 chars. .It Va revision Specifies LUN revision string up to 4 chars. .It Va scsiname Specifies LUN SCSI name string. .It Va eui Specifies LUN EUI-64 identifier. .It Va naa Specifies LUN NAA identifier. Either EUI or NAA identifier should be set to UNIQUE value to allow EXTENDED COPY command access the LUN. Non-unique LUN identifiers may lead to data corruption. +.It Va ha_role +Setting to "primary" or "secondary" overrides default role of the node +in HA cluster, set by kern.cam.ctl.ha_role sysctl. .It Va insecure_tpc Setting to "on" allows EXTENDED COPY command sent to this LUN access other LUNs on this host, not accessible otherwise. This allows to offload copying between different iSCSI targets residing on the same host in trusted environments. .It Va readcache Set to "off", disables read caching for the LUN, if supported by the backend. .It Va readonly Set to "on", blocks all media write operations to the LUN, reporting it as write protected. .It Va reordering Set to "unrestricted", allows target to process commands with SIMPLE task attribute in arbitrary order. Any data integrity exposures related to command sequence order shall be explicitly handled by the application client through the selection of appropriate commands and task attributes. The default value is "restricted". It improves data integrity, but may introduce some additional delays. .It Va serseq Set to "on" to serialize conseсutive reads/writes. Set to "read" to serialize conseсutive reads. Set to "off" to allow them be issued in parallel. Parallel issue of consecutive operations may confuse logic of the backing file system, hurting performance; but it may improve performance of backing stores without prefetch/write-back. .It Va pblocksize .It Va pblockoffset Specify physical block size and offset of the device. .It Va ublocksize .It Va ublockoffset Specify UNMAP block size and offset of the device. -.It Va rpm .It Va rpm Specifies medium rotation rate of the device: 0 -- not reported, 1 -- non-rotating (SSD), >1024 -- value in revolutions per minute. .It Va formfactor Specifies nominal form factor of the device: 0 -- not reported, 1 -- 5.25", 2 -- 3.5", 3 -- 2.5", 4 -- 1.8", 5 -- less then 1.8". .It Va unmap Set to "on", enables UNMAP support for the LUN, if supported by the backend. .It Va avail-threshold .It Va used-threshold .It Va pool-avail-threshold .It Va pool-used-threshold Set per-LUN/-pool thin provisioning soft thresholds. LUN will establish UNIT ATTENTION condition if its or pool available space get below configured avail values, or its or pool used space get above configured used values. Pool thresholds are working only for ZVOL-backed LUNs. .It Va writecache Set to "off", disables write caching for the LUN, if supported by the backend. .El .Pp Options specific for block backend: .Bl -tag -width 12n .It Va file Specifies file or device name to use for backing store. .It Va num_threads Specifies number of backend threads to use for this LUN. .El .Sh EXAMPLES .Dl ctladm tur 1 .Pp Send a .Tn SCSI TEST UNIT READY command to LUN 1. .Pp .Dl ctladm modesense 1 -l .Pp Display the list of mode pages supported by LUN 1. .Pp .Dl ctladm modesense 0 -m 10 -P 3 -d -c 10 .Pp Display the saved version of the Control mode page (page 10) on LUN 0. Disable fetching block descriptors, and use a 10 byte MODE SENSE command instead of the default 6 byte command. .Bd -literal ctladm read 2 -l 0 -d 1 -b 512 -f - > foo .Ed .Pp Read the first 512 byte block from LUN 2 and dump it to the file .Pa foo . .Bd -literal ctladm write 3 -l 0xff432140 -d 20 -b 512 -f /tmp/bar .Ed .Pp Read 10240 bytes from the file .Pa /tmp/bar and write it to LUN 3. starting at LBA 0xff432140. .Pp .Dl ctladm create -b ramdisk -s 10485760000000000 .Pp Create a LUN with the .Dq fake ramdisk as a backing store. The LUN will claim to have a size of approximately 10 terabytes. .Pp .Dl ctladm create -b block -o file=src/usr.sbin/ctladm/ctladm.8 .Pp Create a LUN using the block backend, and specify the file .Pa src/usr.sbin/ctladm/ctladm.8 as the backing store. The size of the LUN will be derived from the size of the file. .Pp .Dl ctladm create -b block -o file=src/usr.sbin/ctladm/ctladm.8 -S MYSERIAL321 -d MYDEVID123 .Pp Create a LUN using the block backend, specify the file .Pa src/usr.sbin/ctladm/ctladm.8 as the backing store, and specify the .Tn SCSI VPD page 0x80 and 0x83 serial number .Fl ( S ) and device ID .Fl ( d ) . .Pp .Dl ctladm remove -b block -l 12 .Pp Remove LUN 12, which is handled by the block backend, from the system. .Pp .Dl ctladm devlist .Pp List configured LUNs in the system, along with their backend and serial number. This works when the Front End Target Drivers are enabled or disabled. .Pp .Dl ctladm lunlist .Pp List all LUNs in the system, along with their inquiry data and device type. This only works when the FETDs are enabled, since the commands go through the ioctl port. .Pp .Dl ctladm inject 6 -i mediumerr -p read -r 0,512 -c .Pp Inject a medium error on LUN 6 for every read that covers the first 512 blocks of the LUN. .Bd -literal -offset indent ctladm inject 6 -i custom -p tur -s 18 "f0 0 02 s12 04 02" .Ed .Pp Inject a custom error on LUN 6 for the next TEST UNIT READY command only. This will result in a sense key of NOT READY (0x02), and an ASC/ASCQ of 0x04,0x02 ("Logical unit not ready, initializing command required"). .Sh SEE ALSO .Xr cam 3 , .Xr cam_cdbparse 3 , .Xr cam 4 , .Xr ctl 4 , .Xr xpt 4 , .Xr camcontrol 8 , .Xr ctld 8 , .Xr ctlstat 8 .Sh HISTORY The .Nm utility was originally written during the Winter/Spring of 2003 as an interface to CTL. .Sh AUTHORS .An Ken Merry Aq Mt ken@FreeBSD.org Index: projects/iosched/usr.sbin/pw/pw_user.c =================================================================== --- projects/iosched/usr.sbin/pw/pw_user.c (revision 287721) +++ projects/iosched/usr.sbin/pw/pw_user.c (revision 287722) @@ -1,1807 +1,1808 @@ /*- * Copyright (C) 1996 * David L. Nugent. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY DAVID L. NUGENT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL DAVID L. NUGENT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #ifndef lint static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pw.h" #include "bitmap.h" #include "psdate.h" #define LOGNAMESIZE (MAXLOGNAME-1) static char locked_str[] = "*LOCKED*"; static struct passwd fakeuser = { "nouser", "*", -1, -1, 0, "", "User &", "/nonexistent", "/bin/sh", 0, 0 }; static int print_user(struct passwd *pwd, bool pretty, bool v7); static uid_t pw_uidpolicy(struct userconf *cnf, intmax_t id); static uid_t pw_gidpolicy(struct userconf *cnf, char *grname, char *nam, gid_t prefer, bool dryrun); static char *pw_homepolicy(struct userconf * cnf, char *homedir, const char *user); static char *pw_shellpolicy(struct userconf * cnf); static char *pw_password(struct userconf * cnf, char const * user, bool dryrun); static char *shell_path(char const * path, char *shells[], char *sh); static void rmat(uid_t uid); static void rmopie(char const * name); static void mkdir_home_parents(int dfd, const char *dir) { struct stat st; char *dirs, *tmp; if (*dir != '/') errx(EX_DATAERR, "invalid base directory for home '%s'", dir); dir++; if (fstatat(dfd, dir, &st, 0) != -1) { if (S_ISDIR(st.st_mode)) return; errx(EX_OSFILE, "root home `/%s' is not a directory", dir); } dirs = strdup(dir); if (dirs == NULL) errx(EX_UNAVAILABLE, "out of memory"); tmp = strrchr(dirs, '/'); if (tmp == NULL) return; tmp[0] = '\0'; /* * This is a kludge especially for Joerg :) * If the home directory would be created in the root partition, then * we really create it under /usr which is likely to have more space. * But we create a symlink from cnf->home -> "/usr" -> cnf->home */ if (strchr(dirs, '/') == NULL) { asprintf(&tmp, "usr/%s", dirs); if (tmp == NULL) errx(EX_UNAVAILABLE, "out of memory"); if (mkdirat(dfd, tmp, _DEF_DIRMODE) != -1 || errno == EEXIST) { fchownat(dfd, tmp, 0, 0, 0); symlinkat(tmp, dfd, dirs); } free(tmp); } tmp = dirs; if (fstatat(dfd, dirs, &st, 0) == -1) { while ((tmp = strchr(tmp + 1, '/')) != NULL) { *tmp = '\0'; if (fstatat(dfd, dirs, &st, 0) == -1) { if (mkdirat(dfd, dirs, _DEF_DIRMODE) == -1) err(EX_OSFILE, "'%s' (root home parent) is not a directory", dirs); } *tmp = '/'; } } if (fstatat(dfd, dirs, &st, 0) == -1) { if (mkdirat(dfd, dirs, _DEF_DIRMODE) == -1) err(EX_OSFILE, "'%s' (root home parent) is not a directory", dirs); fchownat(dfd, dirs, 0, 0, 0); } free(dirs); } static void create_and_populate_homedir(struct userconf *cnf, struct passwd *pwd, const char *skeldir, mode_t homemode, bool update) { int skelfd = -1; /* Create home parents directories */ mkdir_home_parents(conf.rootfd, pwd->pw_dir); if (skeldir != NULL && *skeldir != '\0') { if (*skeldir == '/') skeldir++; skelfd = openat(conf.rootfd, skeldir, O_DIRECTORY|O_CLOEXEC); } copymkdir(conf.rootfd, pwd->pw_dir, skelfd, homemode, pwd->pw_uid, pwd->pw_gid, 0); pw_log(cnf, update ? M_UPDATE : M_ADD, W_USER, "%s(%ju) home %s made", pwd->pw_name, (uintmax_t)pwd->pw_uid, pwd->pw_dir); } static int pw_set_passwd(struct passwd *pwd, int fd, bool precrypted, bool update) { int b, istty; struct termios t, n; login_cap_t *lc; char line[_PASSWORD_LEN+1]; char *p; if (fd == '-') { if (!pwd->pw_passwd || *pwd->pw_passwd != '*') { pwd->pw_passwd = "*"; /* No access */ return (1); } return (0); } if ((istty = isatty(fd))) { if (tcgetattr(fd, &t) == -1) istty = 0; else { n = t; n.c_lflag &= ~(ECHO); tcsetattr(fd, TCSANOW, &n); printf("%s%spassword for user %s:", update ? "new " : "", precrypted ? "encrypted " : "", pwd->pw_name); fflush(stdout); } } b = read(fd, line, sizeof(line) - 1); if (istty) { /* Restore state */ tcsetattr(fd, TCSANOW, &t); fputc('\n', stdout); fflush(stdout); } if (b < 0) err(EX_IOERR, "-%c file descriptor", precrypted ? 'H' : 'h'); line[b] = '\0'; if ((p = strpbrk(line, "\r\n")) != NULL) *p = '\0'; if (!*line) errx(EX_DATAERR, "empty password read on file descriptor %d", fd); if (precrypted) { if (strchr(line, ':') != NULL) errx(EX_DATAERR, "bad encrypted password"); pwd->pw_passwd = strdup(line); } else { lc = login_getpwclass(pwd); if (lc == NULL || login_setcryptfmt(lc, "sha512", NULL) == NULL) warn("setting crypt(3) format"); login_close(lc); pwd->pw_passwd = pw_pwcrypt(line); } return (1); } static void perform_chgpwent(const char *name, struct passwd *pwd, char *nispasswd) { int rc; struct passwd *nispwd; /* duplicate for nis so that chgpwent is not modifying before NIS */ if (nispasswd && *nispasswd == '/') nispwd = pw_dup(pwd); rc = chgpwent(name, pwd); if (rc == -1) errx(EX_IOERR, "user '%s' does not exist (NIS?)", pwd->pw_name); else if (rc != 0) err(EX_IOERR, "passwd file update"); if (nispasswd && *nispasswd == '/') { rc = chgnispwent(nispasswd, name, nispwd); if (rc == -1) warn("User '%s' not found in NIS passwd", pwd->pw_name); else if (rc != 0) warn("NIS passwd update"); /* NOTE: NIS-only update errors are not fatal */ } } /* * The M_LOCK and M_UNLOCK functions simply add or remove * a "*LOCKED*" prefix from in front of the password to * prevent it decoding correctly, and therefore prevents * access. Of course, this only prevents access via * password authentication (not ssh, kerberos or any * other method that does not use the UNIX password) but * that is a known limitation. */ static int pw_userlock(char *arg1, int mode) { struct passwd *pwd = NULL; char *passtmp = NULL; char *name; bool locked = false; uid_t id; if (geteuid() != 0) errx(EX_NOPERM, "you must be root"); if (arg1 == NULL) errx(EX_DATAERR, "username or id required"); if (arg1[strspn(arg1, "0123456789")] == '\0') id = pw_checkid(arg1, UID_MAX); else name = arg1; pwd = (name != NULL) ? GETPWNAM(pw_checkname(name, 0)) : GETPWUID(id); if (pwd == NULL) { if (name == NULL) errx(EX_NOUSER, "no such uid `%ju'", (uintmax_t) id); errx(EX_NOUSER, "no such user `%s'", name); } if (name == NULL) name = pwd->pw_name; if (strncmp(pwd->pw_passwd, locked_str, sizeof(locked_str) -1) == 0) locked = true; if (mode == M_LOCK && locked) errx(EX_DATAERR, "user '%s' is already locked", pwd->pw_name); if (mode == M_UNLOCK && !locked) errx(EX_DATAERR, "user '%s' is not locked", pwd->pw_name); if (mode == M_LOCK) { asprintf(&passtmp, "%s%s", locked_str, pwd->pw_passwd); if (passtmp == NULL) /* disaster */ errx(EX_UNAVAILABLE, "out of memory"); pwd->pw_passwd = passtmp; } else { pwd->pw_passwd += sizeof(locked_str)-1; } perform_chgpwent(name, pwd, NULL); free(passtmp); return (EXIT_SUCCESS); } static uid_t pw_uidpolicy(struct userconf * cnf, intmax_t id) { struct passwd *pwd; struct bitmap bm; uid_t uid = (uid_t) - 1; /* * Check the given uid, if any */ if (id >= 0) { uid = (uid_t) id; if ((pwd = GETPWUID(uid)) != NULL && conf.checkduplicate) errx(EX_DATAERR, "uid `%ju' has already been allocated", (uintmax_t)pwd->pw_uid); return (uid); } /* * We need to allocate the next available uid under one of * two policies a) Grab the first unused uid b) Grab the * highest possible unused uid */ if (cnf->min_uid >= cnf->max_uid) { /* Sanity * claus^H^H^H^Hheck */ cnf->min_uid = 1000; cnf->max_uid = 32000; } bm = bm_alloc(cnf->max_uid - cnf->min_uid + 1); /* * Now, let's fill the bitmap from the password file */ SETPWENT(); while ((pwd = GETPWENT()) != NULL) if (pwd->pw_uid >= (uid_t) cnf->min_uid && pwd->pw_uid <= (uid_t) cnf->max_uid) bm_setbit(&bm, pwd->pw_uid - cnf->min_uid); ENDPWENT(); /* * Then apply the policy, with fallback to reuse if necessary */ if (cnf->reuse_uids || (uid = (uid_t) (bm_lastset(&bm) + cnf->min_uid + 1)) > cnf->max_uid) uid = (uid_t) (bm_firstunset(&bm) + cnf->min_uid); /* * Another sanity check */ if (uid < cnf->min_uid || uid > cnf->max_uid) errx(EX_SOFTWARE, "unable to allocate a new uid - range fully used"); bm_dealloc(&bm); return (uid); } static uid_t pw_gidpolicy(struct userconf *cnf, char *grname, char *nam, gid_t prefer, bool dryrun) { struct group *grp; gid_t gid = (uid_t) - 1; /* * Check the given gid, if any */ SETGRENT(); if (grname) { if ((grp = GETGRNAM(grname)) == NULL) { gid = pw_checkid(grname, GID_MAX); grp = GETGRGID(gid); } gid = grp->gr_gid; } else if ((grp = GETGRNAM(nam)) != NULL && (grp->gr_mem == NULL || grp->gr_mem[0] == NULL)) { gid = grp->gr_gid; /* Already created? Use it anyway... */ } else { intmax_t grid = -1; /* * We need to auto-create a group with the user's name. We * can send all the appropriate output to our sister routine * bit first see if we can create a group with gid==uid so we * can keep the user and group ids in sync. We purposely do * NOT check the gid range if we can force the sync. If the * user's name dups an existing group, then the group add * function will happily handle that case for us and exit. */ if (GETGRGID(prefer) == NULL) grid = prefer; if (dryrun) { gid = pw_groupnext(cnf, true); } else { if (grid == -1) grid = pw_groupnext(cnf, true); groupadd(cnf, nam, grid, NULL, -1, false, false, false); if ((grp = GETGRNAM(nam)) != NULL) gid = grp->gr_gid; } } ENDGRENT(); return (gid); } static char * pw_homepolicy(struct userconf * cnf, char *homedir, const char *user) { static char home[128]; if (homedir) return (homedir); if (cnf->home == NULL || *cnf->home == '\0') errx(EX_CONFIG, "no base home directory set"); snprintf(home, sizeof(home), "%s/%s", cnf->home, user); return (home); } static char * shell_path(char const * path, char *shells[], char *sh) { if (sh != NULL && (*sh == '/' || *sh == '\0')) return sh; /* specified full path or forced none */ else { char *p; char paths[_UC_MAXLINE]; /* * We need to search paths */ strlcpy(paths, path, sizeof(paths)); for (p = strtok(paths, ": \t\r\n"); p != NULL; p = strtok(NULL, ": \t\r\n")) { int i; static char shellpath[256]; if (sh != NULL) { snprintf(shellpath, sizeof(shellpath), "%s/%s", p, sh); if (access(shellpath, X_OK) == 0) return shellpath; } else for (i = 0; i < _UC_MAXSHELLS && shells[i] != NULL; i++) { snprintf(shellpath, sizeof(shellpath), "%s/%s", p, shells[i]); if (access(shellpath, X_OK) == 0) return shellpath; } } if (sh == NULL) errx(EX_OSFILE, "can't find shell `%s' in shell paths", sh); errx(EX_CONFIG, "no default shell available or defined"); return NULL; } } static char * pw_shellpolicy(struct userconf * cnf) { return shell_path(cnf->shelldir, cnf->shells, cnf->shell_default); } #define SALTSIZE 32 static char const chars[] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ./"; char * pw_pwcrypt(char *password) { int i; char salt[SALTSIZE + 1]; char *cryptpw; static char buf[256]; /* * Calculate a salt value */ for (i = 0; i < SALTSIZE; i++) salt[i] = chars[arc4random_uniform(sizeof(chars) - 1)]; salt[SALTSIZE] = '\0'; cryptpw = crypt(password, salt); if (cryptpw == NULL) errx(EX_CONFIG, "crypt(3) failure"); return strcpy(buf, cryptpw); } static char * pw_password(struct userconf * cnf, char const * user, bool dryrun) { int i, l; char pwbuf[32]; switch (cnf->default_password) { case -1: /* Random password */ l = (arc4random() % 8 + 8); /* 8 - 16 chars */ for (i = 0; i < l; i++) pwbuf[i] = chars[arc4random_uniform(sizeof(chars)-1)]; pwbuf[i] = '\0'; /* * We give this information back to the user */ if (conf.fd == -1 && !dryrun) { if (isatty(STDOUT_FILENO)) printf("Password for '%s' is: ", user); printf("%s\n", pwbuf); fflush(stdout); } break; case -2: /* No password at all! */ return ""; case 0: /* No login - default */ default: return "*"; case 1: /* user's name */ strlcpy(pwbuf, user, sizeof(pwbuf)); break; } return pw_pwcrypt(pwbuf); } static int print_user(struct passwd * pwd, bool pretty, bool v7) { int j; char *p; struct group *grp = GETGRGID(pwd->pw_gid); char uname[60] = "User &", office[60] = "[None]", wphone[60] = "[None]", hphone[60] = "[None]"; char acexpire[32] = "[None]", pwexpire[32] = "[None]"; struct tm * tptr; if (!pretty) { p = v7 ? pw_make_v7(pwd) : pw_make(pwd); printf("%s\n", p); free(p); return (EXIT_SUCCESS); } if ((p = strtok(pwd->pw_gecos, ",")) != NULL) { strlcpy(uname, p, sizeof(uname)); if ((p = strtok(NULL, ",")) != NULL) { strlcpy(office, p, sizeof(office)); if ((p = strtok(NULL, ",")) != NULL) { strlcpy(wphone, p, sizeof(wphone)); if ((p = strtok(NULL, "")) != NULL) { strlcpy(hphone, p, sizeof(hphone)); } } } } /* * Handle '&' in gecos field */ if ((p = strchr(uname, '&')) != NULL) { int l = strlen(pwd->pw_name); int m = strlen(p); memmove(p + l, p + 1, m); memmove(p, pwd->pw_name, l); *p = (char) toupper((unsigned char)*p); } if (pwd->pw_expire > (time_t)0 && (tptr = localtime(&pwd->pw_expire)) != NULL) strftime(acexpire, sizeof acexpire, "%c", tptr); if (pwd->pw_change > (time_t)0 && (tptr = localtime(&pwd->pw_change)) != NULL) strftime(pwexpire, sizeof pwexpire, "%c", tptr); printf("Login Name: %-15s #%-12ju Group: %-15s #%ju\n" " Full Name: %s\n" " Home: %-26.26s Class: %s\n" " Shell: %-26.26s Office: %s\n" "Work Phone: %-26.26s Home Phone: %s\n" "Acc Expire: %-26.26s Pwd Expire: %s\n", pwd->pw_name, (uintmax_t)pwd->pw_uid, grp ? grp->gr_name : "(invalid)", (uintmax_t)pwd->pw_gid, uname, pwd->pw_dir, pwd->pw_class, pwd->pw_shell, office, wphone, hphone, acexpire, pwexpire); SETGRENT(); j = 0; while ((grp=GETGRENT()) != NULL) { int i = 0; if (grp->gr_mem != NULL) { while (grp->gr_mem[i] != NULL) { if (strcmp(grp->gr_mem[i], pwd->pw_name)==0) { printf(j++ == 0 ? " Groups: %s" : ",%s", grp->gr_name); break; } ++i; } } } ENDGRENT(); printf("%s", j ? "\n" : ""); return (EXIT_SUCCESS); } char * pw_checkname(char *name, int gecos) { char showch[8]; const char *badchars, *ch, *showtype; int reject; ch = name; reject = 0; if (gecos) { /* See if the name is valid as a gecos (comment) field. */ badchars = ":!@"; showtype = "gecos field"; } else { /* See if the name is valid as a userid or group. */ badchars = " ,\t:+&#%$^()!@~*?<>=|\\/\""; showtype = "userid/group name"; /* Userids and groups can not have a leading '-'. */ if (*ch == '-') reject = 1; } if (!reject) { while (*ch) { if (strchr(badchars, *ch) != NULL || *ch < ' ' || *ch == 127) { reject = 1; break; } /* 8-bit characters are only allowed in GECOS fields */ if (!gecos && (*ch & 0x80)) { reject = 1; break; } ch++; } } /* * A `$' is allowed as the final character for userids and groups, * mainly for the benefit of samba. */ if (reject && !gecos) { if (*ch == '$' && *(ch + 1) == '\0') { reject = 0; ch++; } } if (reject) { snprintf(showch, sizeof(showch), (*ch >= ' ' && *ch < 127) ? "`%c'" : "0x%02x", *ch); errx(EX_DATAERR, "invalid character %s at position %td in %s", showch, (ch - name), showtype); } if (!gecos && (ch - name) > LOGNAMESIZE) errx(EX_USAGE, "name too long `%s' (max is %d)", name, LOGNAMESIZE); return (name); } static void rmat(uid_t uid) { DIR *d = opendir("/var/at/jobs"); if (d != NULL) { struct dirent *e; while ((e = readdir(d)) != NULL) { struct stat st; if (strncmp(e->d_name, ".lock", 5) != 0 && stat(e->d_name, &st) == 0 && !S_ISDIR(st.st_mode) && st.st_uid == uid) { char tmp[MAXPATHLEN]; snprintf(tmp, sizeof(tmp), "/usr/bin/atrm %s", e->d_name); system(tmp); } } closedir(d); } } static void rmopie(char const * name) { char tmp[1014]; FILE *fp; int fd; size_t len; off_t atofs = 0; if ((fd = openat(conf.rootfd, "etc/opiekeys", O_RDWR)) == -1) return; fp = fdopen(fd, "r+"); len = strlen(name); while (fgets(tmp, sizeof(tmp), fp) != NULL) { if (strncmp(name, tmp, len) == 0 && tmp[len]==' ') { /* Comment username out */ if (fseek(fp, atofs, SEEK_SET) == 0) fwrite("#", 1, 1, fp); break; } atofs = ftell(fp); } /* * If we got an error of any sort, don't update! */ fclose(fp); } int pw_user_next(int argc, char **argv, char *name __unused) { struct userconf *cnf = NULL; const char *cfg = NULL; int ch; bool quiet = false; uid_t next; while ((ch = getopt(argc, argv, "Cq")) != -1) { switch (ch) { case 'C': cfg = optarg; break; case 'q': quiet = true; break; } } if (quiet) freopen(_PATH_DEVNULL, "w", stderr); cnf = get_userconfig(cfg); next = pw_uidpolicy(cnf, -1); printf("%ju:", (uintmax_t)next); pw_groupnext(cnf, quiet); return (EXIT_SUCCESS); } int pw_user_show(int argc, char **argv, char *arg1) { struct passwd *pwd = NULL; char *name = NULL; intmax_t id = -1; int ch; bool all = false; bool pretty = false; bool force = false; bool v7 = false; bool quiet = false; if (arg1 != NULL) { if (arg1[strspn(arg1, "0123456789")] == '\0') id = pw_checkid(arg1, UID_MAX); else name = arg1; } while ((ch = getopt(argc, argv, "C:qn:u:FPa7")) != -1) { switch (ch) { case 'C': /* ignore compatibility */ break; case 'q': quiet = true; break; case 'n': name = optarg; break; case 'u': id = pw_checkid(optarg, UID_MAX); break; case 'F': force = true; break; case 'P': pretty = true; break; case 'a': all = true; break; case 7: v7 = true; break; } } if (quiet) freopen(_PATH_DEVNULL, "w", stderr); if (all) { SETPWENT(); while ((pwd = GETPWENT()) != NULL) print_user(pwd, pretty, v7); ENDPWENT(); return (EXIT_SUCCESS); } if (id < 0 && name == NULL) errx(EX_DATAERR, "username or id required"); pwd = (name != NULL) ? GETPWNAM(pw_checkname(name, 0)) : GETPWUID(id); if (pwd == NULL) { if (force) { pwd = &fakeuser; } else { if (name == NULL) errx(EX_NOUSER, "no such uid `%ju'", (uintmax_t) id); errx(EX_NOUSER, "no such user `%s'", name); } } return (print_user(pwd, pretty, v7)); } int pw_user_del(int argc, char **argv, char *arg1) { struct userconf *cnf = NULL; struct passwd *pwd = NULL; struct group *gr, *grp; char *name = NULL; char grname[MAXLOGNAME]; char *nispasswd = NULL; char file[MAXPATHLEN]; char home[MAXPATHLEN]; const char *cfg = NULL; struct stat st; intmax_t id = -1; int ch, rc; bool nis = false; bool deletehome = false; bool quiet = false; if (arg1 != NULL) { if (arg1[strspn(arg1, "0123456789")] == '\0') id = pw_checkid(arg1, UID_MAX); else name = arg1; } while ((ch = getopt(argc, argv, "C:qn:u:rYy:")) != -1) { switch (ch) { case 'C': cfg = optarg; break; case 'q': quiet = true; break; case 'n': name = optarg; break; case 'u': id = pw_checkid(optarg, UID_MAX); break; case 'r': deletehome = true; break; case 'y': nispasswd = optarg; break; case 'Y': nis = true; break; } } if (quiet) freopen(_PATH_DEVNULL, "w", stderr); if (id < 0 && name == NULL) errx(EX_DATAERR, "username or id required"); cnf = get_userconfig(cfg); if (nispasswd == NULL) nispasswd = cnf->nispasswd; pwd = (name != NULL) ? GETPWNAM(pw_checkname(name, 0)) : GETPWUID(id); if (pwd == NULL) { if (name == NULL) errx(EX_NOUSER, "no such uid `%ju'", (uintmax_t) id); errx(EX_NOUSER, "no such user `%s'", name); } if (PWF._altdir == PWF_REGULAR && ((pwd->pw_fields & _PWF_SOURCE) != _PWF_FILES)) { if ((pwd->pw_fields & _PWF_SOURCE) == _PWF_NIS) { if (!nis && nispasswd && *nispasswd != '/') errx(EX_NOUSER, "Cannot remove NIS user `%s'", name); } else { errx(EX_NOUSER, "Cannot remove non local user `%s'", name); } } id = pwd->pw_uid; if (name == NULL) name = pwd->pw_name; if (strcmp(pwd->pw_name, "root") == 0) errx(EX_DATAERR, "cannot remove user 'root'"); /* Remove opie record from /etc/opiekeys */ if (PWALTDIR() != PWF_ALT) rmopie(pwd->pw_name); if (!PWALTDIR()) { /* Remove crontabs */ snprintf(file, sizeof(file), "/var/cron/tabs/%s", pwd->pw_name); if (access(file, F_OK) == 0) { snprintf(file, sizeof(file), "crontab -u %s -r", pwd->pw_name); system(file); } } /* * Save these for later, since contents of pwd may be * invalidated by deletion */ snprintf(file, sizeof(file), "%s/%s", _PATH_MAILDIR, pwd->pw_name); strlcpy(home, pwd->pw_dir, sizeof(home)); gr = GETGRGID(pwd->pw_gid); if (gr != NULL) strlcpy(grname, gr->gr_name, LOGNAMESIZE); else grname[0] = '\0'; rc = delpwent(pwd); if (rc == -1) err(EX_IOERR, "user '%s' does not exist", pwd->pw_name); else if (rc != 0) err(EX_IOERR, "passwd update"); if (nis && nispasswd && *nispasswd=='/') { rc = delnispwent(nispasswd, name); if (rc == -1) warnx("WARNING: user '%s' does not exist in NIS passwd", pwd->pw_name); else if (rc != 0) warn("WARNING: NIS passwd update"); } grp = GETGRNAM(name); if (grp != NULL && (grp->gr_mem == NULL || *grp->gr_mem == NULL) && strcmp(name, grname) == 0) delgrent(GETGRNAM(name)); SETGRENT(); while ((grp = GETGRENT()) != NULL) { int i, j; char group[MAXLOGNAME]; if (grp->gr_mem == NULL) continue; for (i = 0; grp->gr_mem[i] != NULL; i++) { if (strcmp(grp->gr_mem[i], name) != 0) continue; for (j = i; grp->gr_mem[j] != NULL; j++) grp->gr_mem[j] = grp->gr_mem[j+1]; strlcpy(group, grp->gr_name, MAXLOGNAME); chggrent(group, grp); } } ENDGRENT(); pw_log(cnf, M_DELETE, W_USER, "%s(%ju) account removed", name, (uintmax_t)id); /* Remove mail file */ if (PWALTDIR() != PWF_ALT) unlinkat(conf.rootfd, file + 1, 0); /* Remove at jobs */ if (!PWALTDIR() && getpwuid(id) == NULL) rmat(id); /* Remove home directory and contents */ if (PWALTDIR() != PWF_ALT && deletehome && *home == '/' && GETPWUID(id) == NULL && fstatat(conf.rootfd, home + 1, &st, 0) != -1) { rm_r(conf.rootfd, home, id); pw_log(cnf, M_DELETE, W_USER, "%s(%ju) home '%s' %s" "removed", name, (uintmax_t)id, home, fstatat(conf.rootfd, home + 1, &st, 0) == -1 ? "" : "not " "completely "); } return (EXIT_SUCCESS); } int pw_user_lock(int argc, char **argv, char *arg1) { int ch; while ((ch = getopt(argc, argv, "Cq")) != -1) { switch (ch) { case 'C': case 'q': /* compatibility */ break; } } return (pw_userlock(arg1, M_LOCK)); } int pw_user_unlock(int argc, char **argv, char *arg1) { int ch; while ((ch = getopt(argc, argv, "Cq")) != -1) { switch (ch) { case 'C': case 'q': /* compatibility */ break; } } return (pw_userlock(arg1, M_UNLOCK)); } static struct group * group_from_name_or_id(char *name) { const char *errstr = NULL; struct group *grp; uintmax_t id; if ((grp = GETGRNAM(name)) == NULL) { id = strtounum(name, 0, GID_MAX, &errstr); if (errstr) errx(EX_NOUSER, "group `%s' does not exist", name); grp = GETGRGID(id); if (grp == NULL) errx(EX_NOUSER, "group `%s' does not exist", name); } return (grp); } static void split_groups(StringList **groups, char *groupsstr) { struct group *grp; char *p; char tok[] = ", \t"; for (p = strtok(groupsstr, tok); p != NULL; p = strtok(NULL, tok)) { grp = group_from_name_or_id(p); if (*groups == NULL) *groups = sl_init(); sl_add(*groups, newstr(grp->gr_name)); } } static void validate_grname(struct userconf *cnf, char *group) { struct group *grp; if (group == NULL || *group == '\0') { cnf->default_group = ""; return; } grp = group_from_name_or_id(group); cnf->default_group = newstr(grp->gr_name); } static mode_t validate_mode(char *mode) { mode_t m; void *set; if ((set = setmode(mode)) == NULL) errx(EX_DATAERR, "invalid directory creation mode '%s'", mode); m = getmode(set, _DEF_DIRMODE); free(set); return (m); } static void mix_config(struct userconf *cmdcnf, struct userconf *cfg) { if (cmdcnf->default_password == 0) cmdcnf->default_password = cfg->default_password; if (cmdcnf->reuse_uids == 0) cmdcnf->reuse_uids = cfg->reuse_uids; if (cmdcnf->reuse_gids == 0) cmdcnf->reuse_gids = cfg->reuse_gids; if (cmdcnf->nispasswd == NULL) cmdcnf->nispasswd = cfg->nispasswd; if (cmdcnf->dotdir == NULL) cmdcnf->dotdir = cfg->dotdir; if (cmdcnf->newmail == NULL) cmdcnf->newmail = cfg->newmail; if (cmdcnf->logfile == NULL) cmdcnf->logfile = cfg->logfile; if (cmdcnf->home == NULL) cmdcnf->home = cfg->home; if (cmdcnf->homemode == 0) cmdcnf->homemode = cfg->homemode; if (cmdcnf->shelldir == NULL) cmdcnf->shelldir = cfg->shelldir; if (cmdcnf->shells == NULL) cmdcnf->shells = cfg->shells; if (cmdcnf->shell_default == NULL) cmdcnf->shell_default = cfg->shell_default; if (cmdcnf->default_group == NULL) cmdcnf->default_group = cfg->default_group; if (cmdcnf->groups == NULL) cmdcnf->groups = cfg->groups; if (cmdcnf->default_class == NULL) cmdcnf->default_class = cfg->default_class; if (cmdcnf->min_uid == 0) cmdcnf->min_uid = cfg->min_uid; if (cmdcnf->max_uid == 0) cmdcnf->max_uid = cfg->max_uid; if (cmdcnf->min_gid == 0) cmdcnf->min_gid = cfg->min_gid; if (cmdcnf->max_gid == 0) cmdcnf->max_gid = cfg->max_gid; if (cmdcnf->expire_days == 0) cmdcnf->expire_days = cfg->expire_days; if (cmdcnf->password_days == 0) cmdcnf->password_days = cfg->password_days; } int pw_user_add(int argc, char **argv, char *arg1) { struct userconf *cnf, *cmdcnf; struct passwd *pwd; struct group *grp; struct stat st; char args[] = "C:qn:u:c:d:e:p:g:G:mM:k:s:oL:i:w:h:H:Db:NPy:Y"; char line[_PASSWORD_LEN+1], path[MAXPATHLEN]; char *gecos, *homedir, *skel, *walk, *userid, *groupid, *grname; char *default_passwd, *name, *p; const char *cfg; login_cap_t *lc; FILE *pfp, *fp; intmax_t id = -1; time_t now; int rc, ch, fd = -1; size_t i; bool dryrun, nis, pretty, quiet, createhome, precrypted, genconf; dryrun = nis = pretty = quiet = createhome = precrypted = false; genconf = false; gecos = homedir = skel = userid = groupid = default_passwd = NULL; grname = name = NULL; if ((cmdcnf = calloc(1, sizeof(struct userconf))) == NULL) err(EXIT_FAILURE, "calloc()"); if (arg1 != NULL) { if (arg1[strspn(arg1, "0123456789")] == '\0') id = pw_checkid(arg1, UID_MAX); else name = arg1; } while ((ch = getopt(argc, argv, args)) != -1) { switch (ch) { case 'C': cfg = optarg; break; case 'q': quiet = true; break; case 'n': name = optarg; break; case 'u': userid = optarg; break; case 'c': gecos = pw_checkname(optarg, 1); break; case 'd': homedir = optarg; break; case 'e': now = time(NULL); cmdcnf->expire_days = parse_date(now, optarg); break; case 'p': now = time(NULL); cmdcnf->password_days = parse_date(now, optarg); break; case 'g': validate_grname(cmdcnf, optarg); grname = optarg; break; case 'G': split_groups(&cmdcnf->groups, optarg); break; case 'm': createhome = true; break; case 'M': cmdcnf->homemode = validate_mode(optarg); break; case 'k': walk = skel = optarg; if (*walk == '/') walk++; if (fstatat(conf.rootfd, walk, &st, 0) == -1) errx(EX_OSFILE, "skeleton `%s' does not " "exists", skel); if (!S_ISDIR(st.st_mode)) errx(EX_OSFILE, "skeleton `%s' is not a " "directory", skel); cmdcnf->dotdir = skel; break; case 's': cmdcnf->shell_default = optarg; break; case 'o': conf.checkduplicate = false; break; case 'L': cmdcnf->default_class = pw_checkname(optarg, 0); break; case 'i': groupid = optarg; break; case 'w': default_passwd = optarg; break; case 'H': if (fd != -1) errx(EX_USAGE, "'-h' and '-H' are mutually " "exclusive options"); fd = pw_checkfd(optarg); precrypted = true; if (fd == '-') errx(EX_USAGE, "-H expects a file descriptor"); break; case 'h': if (fd != -1) errx(EX_USAGE, "'-h' and '-H' are mutually " "exclusive options"); fd = pw_checkfd(optarg); break; case 'D': genconf = true; break; case 'b': cmdcnf->home = optarg; break; case 'N': dryrun = true; break; case 'P': pretty = true; break; case 'y': cmdcnf->nispasswd = optarg; break; case 'Y': nis = true; break; } } if (geteuid() != 0 && ! dryrun) errx(EX_NOPERM, "you must be root"); if (quiet) freopen(_PATH_DEVNULL, "w", stderr); cnf = get_userconfig(cfg); mix_config(cmdcnf, cnf); if (default_passwd) cmdcnf->default_password = boolean_val(default_passwd, cnf->default_password); if (genconf) { if (name != NULL) errx(EX_DATAERR, "can't combine `-D' with `-n name'"); if (userid != NULL) { if ((p = strtok(userid, ", \t")) != NULL) cmdcnf->min_uid = pw_checkid(p, UID_MAX); if (cmdcnf->min_uid == 0) cmdcnf->min_uid = 1000; if ((p = strtok(NULL, " ,\t")) != NULL) cmdcnf->max_uid = pw_checkid(p, UID_MAX); if (cmdcnf->max_uid == 0) cmdcnf->max_uid = 32000; } if (groupid != NULL) { if ((p = strtok(groupid, ", \t")) != NULL) cmdcnf->min_gid = pw_checkid(p, GID_MAX); if (cmdcnf->min_gid == 0) cmdcnf->min_gid = 1000; if ((p = strtok(NULL, " ,\t")) != NULL) cmdcnf->max_gid = pw_checkid(p, GID_MAX); if (cmdcnf->max_gid == 0) cmdcnf->max_gid = 32000; } if (write_userconfig(cmdcnf, cfg)) return (EXIT_SUCCESS); err(EX_IOERR, "config update"); } if (userid) id = pw_checkid(userid, UID_MAX); if (id < 0 && name == NULL) errx(EX_DATAERR, "user name or id required"); if (name == NULL) errx(EX_DATAERR, "login name required"); if (GETPWNAM(name) != NULL) errx(EX_DATAERR, "login name `%s' already exists", name); pwd = &fakeuser; pwd->pw_name = name; pwd->pw_class = cmdcnf->default_class ? cmdcnf->default_class : ""; pwd->pw_uid = pw_uidpolicy(cmdcnf, id); pwd->pw_gid = pw_gidpolicy(cnf, grname, pwd->pw_name, (gid_t) pwd->pw_uid, dryrun); pwd->pw_change = cmdcnf->password_days; pwd->pw_expire = cmdcnf->expire_days; pwd->pw_dir = pw_homepolicy(cmdcnf, homedir, pwd->pw_name); pwd->pw_shell = pw_shellpolicy(cmdcnf); lc = login_getpwclass(pwd); if (lc == NULL || login_setcryptfmt(lc, "sha512", NULL) == NULL) warn("setting crypt(3) format"); login_close(lc); pwd->pw_passwd = pw_password(cmdcnf, pwd->pw_name, dryrun); if (pwd->pw_uid == 0 && strcmp(pwd->pw_name, "root") != 0) warnx("WARNING: new account `%s' has a uid of 0 " "(superuser access!)", pwd->pw_name); if (gecos) pwd->pw_gecos = gecos; if (fd != -1) pw_set_passwd(pwd, fd, precrypted, false); if (dryrun) return (print_user(pwd, pretty, false)); if ((rc = addpwent(pwd)) != 0) { if (rc == -1) errx(EX_IOERR, "user '%s' already exists", pwd->pw_name); else if (rc != 0) err(EX_IOERR, "passwd file update"); } if (nis && cmdcnf->nispasswd && *cmdcnf->nispasswd == '/') { printf("%s\n", cmdcnf->nispasswd); rc = addnispwent(cmdcnf->nispasswd, pwd); if (rc == -1) warnx("User '%s' already exists in NIS passwd", pwd->pw_name); else if (rc != 0) warn("NIS passwd update"); /* NOTE: we treat NIS-only update errors as non-fatal */ } if (cmdcnf->groups != NULL) { for (i = 0; i < cmdcnf->groups->sl_cur; i++) { grp = GETGRNAM(cmdcnf->groups->sl_str[i]); grp = gr_add(grp, pwd->pw_name); /* * grp can only be NULL in 2 cases: * - the new member is already a member * - a problem with memory occurs * in both cases we want to skip now. */ if (grp == NULL) continue; chggrent(grp->gr_name, grp); free(grp); } } pwd = GETPWNAM(name); if (pwd == NULL) errx(EX_NOUSER, "user '%s' disappeared during update", name); grp = GETGRGID(pwd->pw_gid); pw_log(cnf, M_ADD, W_USER, "%s(%ju):%s(%ju):%s:%s:%s", pwd->pw_name, (uintmax_t)pwd->pw_uid, grp ? grp->gr_name : "unknown", (uintmax_t)(grp ? grp->gr_gid : (uid_t)-1), pwd->pw_gecos, pwd->pw_dir, pwd->pw_shell); /* * let's touch and chown the user's mail file. This is not * strictly necessary under BSD with a 0755 maildir but it also * doesn't hurt anything to create the empty mailfile */ if (PWALTDIR() != PWF_ALT) { snprintf(path, sizeof(path), "%s/%s", _PATH_MAILDIR, pwd->pw_name); /* Preserve contents & mtime */ close(openat(conf.rootfd, path +1, O_RDWR | O_CREAT, 0600)); fchownat(conf.rootfd, path + 1, pwd->pw_uid, pwd->pw_gid, AT_SYMLINK_NOFOLLOW); } /* * Let's create and populate the user's home directory. Note * that this also `works' for editing users if -m is used, but * existing files will *not* be overwritten. */ if (PWALTDIR() != PWF_ALT && createhome && pwd->pw_dir && *pwd->pw_dir == '/' && pwd->pw_dir[1]) create_and_populate_homedir(cmdcnf, pwd, cmdcnf->dotdir, cmdcnf->homemode, false); if (!PWALTDIR() && cmdcnf->newmail && *cmdcnf->newmail && (fp = fopen(cnf->newmail, "r")) != NULL) { if ((pfp = popen(_PATH_SENDMAIL " -t", "w")) == NULL) warn("sendmail"); else { fprintf(pfp, "From: root\n" "To: %s\n" "Subject: Welcome!\n\n", pwd->pw_name); while (fgets(line, sizeof(line), fp) != NULL) { /* Do substitutions? */ fputs(line, pfp); } pclose(pfp); pw_log(cnf, M_ADD, W_USER, "%s(%ju) new user mail sent", pwd->pw_name, (uintmax_t)pwd->pw_uid); } fclose(fp); } if (nis && nis_update() == 0) pw_log(cnf, M_ADD, W_USER, "NIS maps updated"); return (EXIT_SUCCESS); } int pw_user_mod(int argc, char **argv, char *arg1) { struct userconf *cnf; struct passwd *pwd; struct group *grp; StringList *groups = NULL; char args[] = "C:qn:u:c:d:e:p:g:G:mM:l:k:s:w:L:h:H:NPYy:"; const char *cfg; char *gecos, *homedir, *grname, *name, *newname, *walk, *skel, *shell; char *passwd, *class, *nispasswd; login_cap_t *lc; struct stat st; intmax_t id = -1; int ch, fd = -1; size_t i, j; bool quiet, createhome, pretty, dryrun, nis, edited, docreatehome; bool precrypted; mode_t homemode = 0; time_t expire_days, password_days, now; expire_days = password_days = -1; gecos = homedir = grname = name = newname = skel = shell =NULL; passwd = NULL; class = nispasswd = NULL; quiet = createhome = pretty = dryrun = nis = precrypted = false; edited = docreatehome = false; if (arg1 != NULL) { if (arg1[strspn(arg1, "0123456789")] == '\0') id = pw_checkid(arg1, UID_MAX); else name = arg1; } while ((ch = getopt(argc, argv, args)) != -1) { switch (ch) { case 'C': cfg = optarg; break; case 'q': quiet = true; break; case 'n': name = optarg; break; case 'u': id = pw_checkid(optarg, UID_MAX); break; case 'c': gecos = pw_checkname(optarg, 1); break; case 'd': homedir = optarg; break; case 'e': now = time(NULL); expire_days = parse_date(now, optarg); break; case 'p': now = time(NULL); password_days = parse_date(now, optarg); break; case 'g': group_from_name_or_id(optarg); grname = optarg; break; case 'G': split_groups(&groups, optarg); break; case 'm': createhome = true; break; case 'M': homemode = validate_mode(optarg); break; case 'l': newname = optarg; break; case 'k': walk = skel = optarg; if (*walk == '/') walk++; if (fstatat(conf.rootfd, walk, &st, 0) == -1) errx(EX_OSFILE, "skeleton `%s' does not " "exists", skel); if (!S_ISDIR(st.st_mode)) errx(EX_OSFILE, "skeleton `%s' is not a " "directory", skel); break; case 's': shell = optarg; break; case 'w': passwd = optarg; break; case 'L': class = pw_checkname(optarg, 0); break; case 'H': if (fd != -1) errx(EX_USAGE, "'-h' and '-H' are mutually " "exclusive options"); fd = pw_checkfd(optarg); precrypted = true; if (fd == '-') errx(EX_USAGE, "-H expects a file descriptor"); break; case 'h': if (fd != -1) errx(EX_USAGE, "'-h' and '-H' are mutually " "exclusive options"); fd = pw_checkfd(optarg); break; case 'N': dryrun = true; break; case 'P': pretty = true; break; case 'y': nispasswd = optarg; break; case 'Y': nis = true; break; } } if (geteuid() != 0 && ! dryrun) errx(EX_NOPERM, "you must be root"); if (quiet) freopen(_PATH_DEVNULL, "w", stderr); cnf = get_userconfig(cfg); if (id < 0 && name == NULL) errx(EX_DATAERR, "username or id required"); pwd = (name != NULL) ? GETPWNAM(pw_checkname(name, 0)) : GETPWUID(id); if (pwd == NULL) { if (name == NULL) errx(EX_NOUSER, "no such uid `%ju'", (uintmax_t) id); errx(EX_NOUSER, "no such user `%s'", name); } if (name == NULL) name = pwd->pw_name; if (nis && nispasswd == NULL) nispasswd = cnf->nispasswd; if (PWF._altdir == PWF_REGULAR && ((pwd->pw_fields & _PWF_SOURCE) != _PWF_FILES)) { if ((pwd->pw_fields & _PWF_SOURCE) == _PWF_NIS) { if (!nis && nispasswd && *nispasswd != '/') errx(EX_NOUSER, "Cannot modify NIS user `%s'", name); } else { errx(EX_NOUSER, "Cannot modify non local user `%s'", name); } } if (newname) { if (strcmp(pwd->pw_name, "root") == 0) errx(EX_DATAERR, "can't rename `root' account"); if (strcmp(pwd->pw_name, newname) != 0) { pwd->pw_name = pw_checkname(newname, 0); edited = true; } } if (id > 0 && pwd->pw_uid != id) { pwd->pw_uid = id; edited = true; if (pwd->pw_uid != 0 && strcmp(pwd->pw_name, "root") == 0) errx(EX_DATAERR, "can't change uid of `root' account"); if (pwd->pw_uid == 0 && strcmp(pwd->pw_name, "root") != 0) warnx("WARNING: account `%s' will have a uid of 0 " "(superuser access!)", pwd->pw_name); } if (grname && pwd->pw_uid != 0) { grp = GETGRNAM(grname); if (grp == NULL) grp = GETGRGID(pw_checkid(grname, GID_MAX)); if (grp->gr_gid != pwd->pw_gid) { pwd->pw_gid = grp->gr_gid; edited = true; } } if (password_days >= 0 && pwd->pw_change != password_days) { pwd->pw_change = password_days; edited = true; } if (expire_days >= 0 && pwd->pw_expire != expire_days) { pwd->pw_expire = expire_days; edited = true; } if (shell) { shell = shell_path(cnf->shelldir, cnf->shells, shell); if (shell == NULL) shell = ""; if (strcmp(shell, pwd->pw_shell) != 0) { pwd->pw_shell = shell; edited = true; } } if (class && strcmp(pwd->pw_class, class) != 0) { pwd->pw_class = class; edited = true; } if (homedir && strcmp(pwd->pw_dir, homedir) != 0) { pwd->pw_dir = homedir; + edited = true; if (fstatat(conf.rootfd, pwd->pw_dir, &st, 0) == -1) { if (!createhome) warnx("WARNING: home `%s' does not exist", pwd->pw_dir); else docreatehome = true; } else if (!S_ISDIR(st.st_mode)) { warnx("WARNING: home `%s' is not a directory", pwd->pw_dir); } } if (passwd && conf.fd == -1) { lc = login_getpwclass(pwd); if (lc == NULL || login_setcryptfmt(lc, "sha512", NULL) == NULL) warn("setting crypt(3) format"); login_close(lc); cnf->default_password = boolean_val(passwd, cnf->default_password); pwd->pw_passwd = pw_password(cnf, pwd->pw_name, dryrun); edited = true; } if (gecos && strcmp(pwd->pw_gecos, gecos) != 0) { pwd->pw_gecos = gecos; edited = true; } if (fd != -1) edited = pw_set_passwd(pwd, fd, precrypted, true); if (dryrun) return (print_user(pwd, pretty, false)); if (edited) /* Only updated this if required */ perform_chgpwent(name, pwd, nis ? nispasswd : NULL); /* Now perform the needed changes concern groups */ if (groups != NULL) { /* Delete User from groups using old name */ SETGRENT(); while ((grp = GETGRENT()) != NULL) { if (grp->gr_mem == NULL) continue; for (i = 0; grp->gr_mem[i] != NULL; i++) { if (strcmp(grp->gr_mem[i] , name) != 0) continue; for (j = i; grp->gr_mem[j] != NULL ; j++) grp->gr_mem[j] = grp->gr_mem[j+1]; chggrent(grp->gr_name, grp); break; } } ENDGRENT(); /* Add the user to the needed groups */ for (i = 0; i < groups->sl_cur; i++) { grp = GETGRNAM(groups->sl_str[i]); grp = gr_add(grp, pwd->pw_name); if (grp == NULL) continue; chggrent(grp->gr_name, grp); free(grp); } } /* In case of rename we need to walk over the different groups */ if (newname) { SETGRENT(); while ((grp = GETGRENT()) != NULL) { if (grp->gr_mem == NULL) continue; for (i = 0; grp->gr_mem[i] != NULL; i++) { if (strcmp(grp->gr_mem[i], name) != 0) continue; grp->gr_mem[i] = newname; chggrent(grp->gr_name, grp); break; } } } /* go get a current version of pwd */ if (newname) name = newname; pwd = GETPWNAM(name); if (pwd == NULL) errx(EX_NOUSER, "user '%s' disappeared during update", name); grp = GETGRGID(pwd->pw_gid); pw_log(cnf, M_UPDATE, W_USER, "%s(%ju):%s(%ju):%s:%s:%s", pwd->pw_name, (uintmax_t)pwd->pw_uid, grp ? grp->gr_name : "unknown", (uintmax_t)(grp ? grp->gr_gid : (uid_t)-1), pwd->pw_gecos, pwd->pw_dir, pwd->pw_shell); /* * Let's create and populate the user's home directory. Note * that this also `works' for editing users if -m is used, but * existing files will *not* be overwritten. */ if (PWALTDIR() != PWF_ALT && docreatehome && pwd->pw_dir && *pwd->pw_dir == '/' && pwd->pw_dir[1]) { if (!skel) skel = cnf->dotdir; if (homemode == 0) homemode = cnf->homemode; create_and_populate_homedir(cnf, pwd, skel, homemode, true); } if (nis && nis_update() == 0) pw_log(cnf, M_UPDATE, W_USER, "NIS maps updated"); return (EXIT_SUCCESS); } Index: projects/iosched =================================================================== --- projects/iosched (revision 287721) +++ projects/iosched (revision 287722) Property changes on: projects/iosched ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r287701-287721