Index: user/alc/PQ_LAUNDRY/contrib/bsnmp/lib/snmp.c =================================================================== --- user/alc/PQ_LAUNDRY/contrib/bsnmp/lib/snmp.c (revision 307895) +++ user/alc/PQ_LAUNDRY/contrib/bsnmp/lib/snmp.c (revision 307896) @@ -1,1455 +1,1455 @@ /* * Copyright (c) 2001-2003 * Fraunhofer Institute for Open Communication Systems (FhG Fokus). * All rights reserved. * * Author: Harti Brandt * * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Shteryana Sotirova Shopova * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Begemot: bsnmp/lib/snmp.c,v 1.40 2005/10/04 14:32:42 brandt_h Exp $ * * SNMP */ #include #include #include #include #include #include #ifdef HAVE_STDINT_H #include #elif defined(HAVE_INTTYPES_H) #include #endif #include #include #include #include #include "asn1.h" #include "snmp.h" #include "snmppriv.h" static void snmp_error_func(const char *, ...); static void snmp_printf_func(const char *, ...); void (*snmp_error)(const char *, ...) = snmp_error_func; void (*snmp_printf)(const char *, ...) = snmp_printf_func; /* * Get the next variable binding from the list. * ASN errors on the sequence or the OID are always fatal. */ static enum asn_err get_var_binding(struct asn_buf *b, struct snmp_value *binding) { u_char type; asn_len_t len, trailer; enum asn_err err; if (asn_get_sequence(b, &len) != ASN_ERR_OK) { snmp_error("cannot parse varbind header"); return (ASN_ERR_FAILED); } /* temporary truncate the length so that the parser does not * eat up bytes behind the sequence in the case the encoding is * wrong of inner elements. */ trailer = b->asn_len - len; b->asn_len = len; if (asn_get_objid(b, &binding->var) != ASN_ERR_OK) { snmp_error("cannot parse binding objid"); return (ASN_ERR_FAILED); } if (asn_get_header(b, &type, &len) != ASN_ERR_OK) { snmp_error("cannot parse binding value header"); return (ASN_ERR_FAILED); } switch (type) { case ASN_TYPE_NULL: binding->syntax = SNMP_SYNTAX_NULL; err = asn_get_null_raw(b, len); break; case ASN_TYPE_INTEGER: binding->syntax = SNMP_SYNTAX_INTEGER; err = asn_get_integer_raw(b, len, &binding->v.integer); break; case ASN_TYPE_OCTETSTRING: binding->syntax = SNMP_SYNTAX_OCTETSTRING; binding->v.octetstring.octets = malloc(len); if (binding->v.octetstring.octets == NULL) { snmp_error("%s", strerror(errno)); return (ASN_ERR_FAILED); } binding->v.octetstring.len = len; err = asn_get_octetstring_raw(b, len, binding->v.octetstring.octets, &binding->v.octetstring.len); if (ASN_ERR_STOPPED(err)) { free(binding->v.octetstring.octets); binding->v.octetstring.octets = NULL; } break; case ASN_TYPE_OBJID: binding->syntax = SNMP_SYNTAX_OID; err = asn_get_objid_raw(b, len, &binding->v.oid); break; case ASN_CLASS_APPLICATION|ASN_APP_IPADDRESS: binding->syntax = SNMP_SYNTAX_IPADDRESS; err = asn_get_ipaddress_raw(b, len, binding->v.ipaddress); break; case ASN_CLASS_APPLICATION|ASN_APP_TIMETICKS: binding->syntax = SNMP_SYNTAX_TIMETICKS; err = asn_get_uint32_raw(b, len, &binding->v.uint32); break; case ASN_CLASS_APPLICATION|ASN_APP_COUNTER: binding->syntax = SNMP_SYNTAX_COUNTER; err = asn_get_uint32_raw(b, len, &binding->v.uint32); break; case ASN_CLASS_APPLICATION|ASN_APP_GAUGE: binding->syntax = SNMP_SYNTAX_GAUGE; err = asn_get_uint32_raw(b, len, &binding->v.uint32); break; case ASN_CLASS_APPLICATION|ASN_APP_COUNTER64: binding->syntax = SNMP_SYNTAX_COUNTER64; err = asn_get_counter64_raw(b, len, &binding->v.counter64); break; case ASN_CLASS_CONTEXT | ASN_EXCEPT_NOSUCHOBJECT: binding->syntax = SNMP_SYNTAX_NOSUCHOBJECT; err = asn_get_null_raw(b, len); break; case ASN_CLASS_CONTEXT | ASN_EXCEPT_NOSUCHINSTANCE: binding->syntax = SNMP_SYNTAX_NOSUCHINSTANCE; err = asn_get_null_raw(b, len); break; case ASN_CLASS_CONTEXT | ASN_EXCEPT_ENDOFMIBVIEW: binding->syntax = SNMP_SYNTAX_ENDOFMIBVIEW; err = asn_get_null_raw(b, len); break; default: if ((err = asn_skip(b, len)) == ASN_ERR_OK) err = ASN_ERR_TAG; snmp_error("bad binding value type 0x%x", type); break; } if (ASN_ERR_STOPPED(err)) { snmp_error("cannot parse binding value"); return (err); } if (b->asn_len != 0) snmp_error("ignoring junk at end of binding"); b->asn_len = trailer; return (err); } /* * Parse the different PDUs contents. Any ASN error in the outer components * are fatal. Only errors in variable values may be tolerated. If all * components can be parsed it returns either ASN_ERR_OK or the first * error that was found. */ enum asn_err snmp_parse_pdus_hdr(struct asn_buf *b, struct snmp_pdu *pdu, asn_len_t *lenp) { if (pdu->type == SNMP_PDU_TRAP) { if (asn_get_objid(b, &pdu->enterprise) != ASN_ERR_OK) { snmp_error("cannot parse trap enterprise"); return (ASN_ERR_FAILED); } if (asn_get_ipaddress(b, pdu->agent_addr) != ASN_ERR_OK) { snmp_error("cannot parse trap agent address"); return (ASN_ERR_FAILED); } if (asn_get_integer(b, &pdu->generic_trap) != ASN_ERR_OK) { snmp_error("cannot parse 'generic-trap'"); return (ASN_ERR_FAILED); } if (asn_get_integer(b, &pdu->specific_trap) != ASN_ERR_OK) { snmp_error("cannot parse 'specific-trap'"); return (ASN_ERR_FAILED); } if (asn_get_timeticks(b, &pdu->time_stamp) != ASN_ERR_OK) { snmp_error("cannot parse trap 'time-stamp'"); return (ASN_ERR_FAILED); } } else { if (asn_get_integer(b, &pdu->request_id) != ASN_ERR_OK) { snmp_error("cannot parse 'request-id'"); return (ASN_ERR_FAILED); } if (asn_get_integer(b, &pdu->error_status) != ASN_ERR_OK) { snmp_error("cannot parse 'error_status'"); return (ASN_ERR_FAILED); } if (asn_get_integer(b, &pdu->error_index) != ASN_ERR_OK) { snmp_error("cannot parse 'error_index'"); return (ASN_ERR_FAILED); } } if (asn_get_sequence(b, lenp) != ASN_ERR_OK) { snmp_error("cannot get varlist header"); return (ASN_ERR_FAILED); } return (ASN_ERR_OK); } static enum asn_err parse_pdus(struct asn_buf *b, struct snmp_pdu *pdu, int32_t *ip) { asn_len_t len, trailer; struct snmp_value *v; enum asn_err err, err1; err = snmp_parse_pdus_hdr(b, pdu, &len); if (ASN_ERR_STOPPED(err)) return (err); trailer = b->asn_len - len; v = pdu->bindings; err = ASN_ERR_OK; while (b->asn_len != 0) { if (pdu->nbindings == SNMP_MAX_BINDINGS) { snmp_error("too many bindings (> %u) in PDU", SNMP_MAX_BINDINGS); return (ASN_ERR_FAILED); } err1 = get_var_binding(b, v); if (ASN_ERR_STOPPED(err1)) return (ASN_ERR_FAILED); if (err1 != ASN_ERR_OK && err == ASN_ERR_OK) { err = err1; *ip = pdu->nbindings + 1; } pdu->nbindings++; v++; } b->asn_len = trailer; return (err); } static enum asn_err parse_secparams(struct asn_buf *b, struct snmp_pdu *pdu) { asn_len_t octs_len; u_char buf[256]; /* XXX: calc max possible size here */ struct asn_buf tb; memset(buf, 0, 256); tb.asn_ptr = buf; tb.asn_len = 256; - u_int len; + u_int len = 256; if (asn_get_octetstring(b, buf, &len) != ASN_ERR_OK) { snmp_error("cannot parse usm header"); return (ASN_ERR_FAILED); } tb.asn_len = len; if (asn_get_sequence(&tb, &octs_len) != ASN_ERR_OK) { snmp_error("cannot decode usm header"); return (ASN_ERR_FAILED); } octs_len = SNMP_ENGINE_ID_SIZ; if (asn_get_octetstring(&tb, (u_char *)&pdu->engine.engine_id, &octs_len) != ASN_ERR_OK) { snmp_error("cannot decode msg engine id"); return (ASN_ERR_FAILED); } pdu->engine.engine_len = octs_len; if (asn_get_integer(&tb, &pdu->engine.engine_boots) != ASN_ERR_OK) { snmp_error("cannot decode msg engine boots"); return (ASN_ERR_FAILED); } if (asn_get_integer(&tb, &pdu->engine.engine_time) != ASN_ERR_OK) { snmp_error("cannot decode msg engine time"); return (ASN_ERR_FAILED); } octs_len = SNMP_ADM_STR32_SIZ - 1; if (asn_get_octetstring(&tb, (u_char *)&pdu->user.sec_name, &octs_len) != ASN_ERR_OK) { snmp_error("cannot decode msg user name"); return (ASN_ERR_FAILED); } pdu->user.sec_name[octs_len] = '\0'; octs_len = sizeof(pdu->msg_digest); if (asn_get_octetstring(&tb, (u_char *)&pdu->msg_digest, &octs_len) != ASN_ERR_OK || ((pdu->flags & SNMP_MSG_AUTH_FLAG) != 0 && octs_len != sizeof(pdu->msg_digest))) { snmp_error("cannot decode msg authentication param"); return (ASN_ERR_FAILED); } octs_len = sizeof(pdu->msg_salt); if (asn_get_octetstring(&tb, (u_char *)&pdu->msg_salt, &octs_len) != ASN_ERR_OK ||((pdu->flags & SNMP_MSG_PRIV_FLAG) != 0 && octs_len != sizeof(pdu->msg_salt))) { snmp_error("cannot decode msg authentication param"); return (ASN_ERR_FAILED); } if ((pdu->flags & SNMP_MSG_AUTH_FLAG) != 0) { pdu->digest_ptr = b->asn_ptr - SNMP_USM_AUTH_SIZE; pdu->digest_ptr -= octs_len + ASN_MAXLENLEN; } return (ASN_ERR_OK); } static enum snmp_code pdu_encode_secparams(struct asn_buf *b, struct snmp_pdu *pdu) { u_char buf[256], *sptr; struct asn_buf tb; size_t auth_off, moved = 0; auth_off = 0; memset(buf, 0, 256); tb.asn_ptr = buf; tb.asn_len = 256; if (asn_put_temp_header(&tb, (ASN_TYPE_SEQUENCE|ASN_TYPE_CONSTRUCTED), &sptr) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (asn_put_octetstring(&tb, (u_char *)pdu->engine.engine_id, pdu->engine.engine_len) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (asn_put_integer(&tb, pdu->engine.engine_boots) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (asn_put_integer(&tb, pdu->engine.engine_time) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (asn_put_octetstring(&tb, (u_char *)pdu->user.sec_name, strlen(pdu->user.sec_name)) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if ((pdu->flags & SNMP_MSG_AUTH_FLAG) != 0) { auth_off = sizeof(buf) - tb.asn_len + ASN_MAXLENLEN; if (asn_put_octetstring(&tb, (u_char *)pdu->msg_digest, sizeof(pdu->msg_digest)) != ASN_ERR_OK) return (SNMP_CODE_FAILED); } else { if (asn_put_octetstring(&tb, (u_char *)pdu->msg_digest, 0) != ASN_ERR_OK) return (SNMP_CODE_FAILED); } if ((pdu->flags & SNMP_MSG_PRIV_FLAG) != 0) { if (asn_put_octetstring(&tb, (u_char *)pdu->msg_salt, sizeof(pdu->msg_salt)) != ASN_ERR_OK) return (SNMP_CODE_FAILED); } else { if (asn_put_octetstring(&tb, (u_char *)pdu->msg_salt, 0) != ASN_ERR_OK) return (SNMP_CODE_FAILED); } if (asn_commit_header(&tb, sptr, &moved) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if ((pdu->flags & SNMP_MSG_AUTH_FLAG) != 0) pdu->digest_ptr = b->asn_ptr + auth_off - moved; if (asn_put_octetstring(b, buf, sizeof(buf) - tb.asn_len) != ASN_ERR_OK) return (SNMP_CODE_FAILED); pdu->digest_ptr += ASN_MAXLENLEN; if ((pdu->flags & SNMP_MSG_PRIV_FLAG) != 0 && asn_put_temp_header(b, ASN_TYPE_OCTETSTRING, &pdu->encrypted_ptr) != ASN_ERR_OK) return (SNMP_CODE_FAILED); return (SNMP_CODE_OK); } /* * Decode the PDU except for the variable bindings itself. * If decoding fails because of a bad binding, but the rest can be * decoded, ip points to the index of the failed variable (errors * OORANGE, BADLEN or BADVERS). */ enum snmp_code snmp_pdu_decode(struct asn_buf *b, struct snmp_pdu *pdu, int32_t *ip) { enum snmp_code code; if ((code = snmp_pdu_decode_header(b, pdu)) != SNMP_CODE_OK) return (code); if (pdu->version == SNMP_V3) { if (pdu->security_model != SNMP_SECMODEL_USM) return (SNMP_CODE_FAILED); if ((code = snmp_pdu_decode_secmode(b, pdu)) != SNMP_CODE_OK) return (code); } code = snmp_pdu_decode_scoped(b, pdu, ip); switch (code) { case SNMP_CODE_FAILED: snmp_pdu_free(pdu); break; case SNMP_CODE_BADENC: if (pdu->version == SNMP_Verr) return (SNMP_CODE_BADVERS); default: break; } return (code); } enum snmp_code snmp_pdu_decode_header(struct asn_buf *b, struct snmp_pdu *pdu) { int32_t version; u_int octs_len; asn_len_t len; pdu->outer_ptr = b->asn_ptr; pdu->outer_len = b->asn_len; if (asn_get_sequence(b, &len) != ASN_ERR_OK) { snmp_error("cannot decode pdu header"); return (SNMP_CODE_FAILED); } if (b->asn_len < len) { snmp_error("outer sequence value too short"); return (SNMP_CODE_FAILED); } if (b->asn_len != len) { snmp_error("ignoring trailing junk in message"); b->asn_len = len; } if (asn_get_integer(b, &version) != ASN_ERR_OK) { snmp_error("cannot decode version"); return (SNMP_CODE_FAILED); } if (version == 0) pdu->version = SNMP_V1; else if (version == 1) pdu->version = SNMP_V2c; else if (version == 3) pdu->version = SNMP_V3; else { pdu->version = SNMP_Verr; snmp_error("unsupported SNMP version"); return (SNMP_CODE_BADENC); } if (pdu->version == SNMP_V3) { if (asn_get_sequence(b, &len) != ASN_ERR_OK) { snmp_error("cannot decode pdu global data header"); return (SNMP_CODE_FAILED); } if (asn_get_integer(b, &pdu->identifier) != ASN_ERR_OK) { snmp_error("cannot decode msg indetifier"); return (SNMP_CODE_FAILED); } if (asn_get_integer(b, &pdu->engine.max_msg_size) != ASN_ERR_OK) { snmp_error("cannot decode msg size"); return (SNMP_CODE_FAILED); } octs_len = 1; if (asn_get_octetstring(b, (u_char *)&pdu->flags, &octs_len) != ASN_ERR_OK) { snmp_error("cannot decode msg flags"); return (SNMP_CODE_FAILED); } if (asn_get_integer(b, &pdu->security_model) != ASN_ERR_OK) { snmp_error("cannot decode msg size"); return (SNMP_CODE_FAILED); } if (pdu->security_model != SNMP_SECMODEL_USM) return (SNMP_CODE_FAILED); if (parse_secparams(b, pdu) != ASN_ERR_OK) return (SNMP_CODE_FAILED); } else { octs_len = SNMP_COMMUNITY_MAXLEN; if (asn_get_octetstring(b, (u_char *)pdu->community, &octs_len) != ASN_ERR_OK) { snmp_error("cannot decode community"); return (SNMP_CODE_FAILED); } pdu->community[octs_len] = '\0'; } return (SNMP_CODE_OK); } enum snmp_code snmp_pdu_decode_scoped(struct asn_buf *b, struct snmp_pdu *pdu, int32_t *ip) { u_char type; asn_len_t len, trailer; enum asn_err err; if (pdu->version == SNMP_V3) { if (asn_get_sequence(b, &len) != ASN_ERR_OK) { snmp_error("cannot decode scoped pdu header"); return (SNMP_CODE_FAILED); } len = SNMP_ENGINE_ID_SIZ; if (asn_get_octetstring(b, (u_char *)&pdu->context_engine, &len) != ASN_ERR_OK) { snmp_error("cannot decode msg context engine"); return (SNMP_CODE_FAILED); } pdu->context_engine_len = len; len = SNMP_CONTEXT_NAME_SIZ; if (asn_get_octetstring(b, (u_char *)&pdu->context_name, &len) != ASN_ERR_OK) { snmp_error("cannot decode msg context name"); return (SNMP_CODE_FAILED); } pdu->context_name[len] = '\0'; } if (asn_get_header(b, &type, &len) != ASN_ERR_OK) { snmp_error("cannot get pdu header"); return (SNMP_CODE_FAILED); } if ((type & ~ASN_TYPE_MASK) != (ASN_TYPE_CONSTRUCTED | ASN_CLASS_CONTEXT)) { snmp_error("bad pdu header tag"); return (SNMP_CODE_FAILED); } pdu->type = type & ASN_TYPE_MASK; switch (pdu->type) { case SNMP_PDU_GET: case SNMP_PDU_GETNEXT: case SNMP_PDU_RESPONSE: case SNMP_PDU_SET: break; case SNMP_PDU_TRAP: if (pdu->version != SNMP_V1) { snmp_error("bad pdu type %u", pdu->type); return (SNMP_CODE_FAILED); } break; case SNMP_PDU_GETBULK: case SNMP_PDU_INFORM: case SNMP_PDU_TRAP2: case SNMP_PDU_REPORT: if (pdu->version == SNMP_V1) { snmp_error("bad pdu type %u", pdu->type); return (SNMP_CODE_FAILED); } break; default: snmp_error("bad pdu type %u", pdu->type); return (SNMP_CODE_FAILED); } trailer = b->asn_len - len; b->asn_len = len; err = parse_pdus(b, pdu, ip); if (ASN_ERR_STOPPED(err)) return (SNMP_CODE_FAILED); if (b->asn_len != 0) snmp_error("ignoring trailing junk after pdu"); b->asn_len = trailer; return (SNMP_CODE_OK); } enum snmp_code snmp_pdu_decode_secmode(struct asn_buf *b, struct snmp_pdu *pdu) { u_char type; enum snmp_code code; uint8_t digest[SNMP_USM_AUTH_SIZE]; if (pdu->user.auth_proto != SNMP_AUTH_NOAUTH && (pdu->flags & SNMP_MSG_AUTH_FLAG) == 0) return (SNMP_CODE_BADSECLEVEL); if ((code = snmp_pdu_calc_digest(pdu, digest)) != SNMP_CODE_OK) return (SNMP_CODE_FAILED); if (pdu->user.auth_proto != SNMP_AUTH_NOAUTH && memcmp(digest, pdu->msg_digest, sizeof(pdu->msg_digest)) != 0) return (SNMP_CODE_BADDIGEST); if (pdu->user.priv_proto != SNMP_PRIV_NOPRIV && (asn_get_header(b, &type, &pdu->scoped_len) != ASN_ERR_OK || type != ASN_TYPE_OCTETSTRING)) { snmp_error("cannot decode encrypted pdu"); return (SNMP_CODE_FAILED); } pdu->scoped_ptr = b->asn_ptr; if (pdu->user.priv_proto != SNMP_PRIV_NOPRIV && (pdu->flags & SNMP_MSG_PRIV_FLAG) == 0) return (SNMP_CODE_BADSECLEVEL); if ((code = snmp_pdu_decrypt(pdu)) != SNMP_CODE_OK) return (SNMP_CODE_FAILED); return (code); } /* * Check whether what we have is the complete PDU by snooping at the * enclosing structure header. This returns: * -1 if there are ASN.1 errors * 0 if we need more data * > 0 the length of this PDU */ int snmp_pdu_snoop(const struct asn_buf *b0) { u_int length; asn_len_t len; struct asn_buf b = *b0; /* <0x10|0x20> */ if (b.asn_len == 0) return (0); if (b.asn_cptr[0] != (ASN_TYPE_SEQUENCE | ASN_TYPE_CONSTRUCTED)) { asn_error(&b, "bad sequence type %u", b.asn_cptr[0]); return (-1); } b.asn_len--; b.asn_cptr++; if (b.asn_len == 0) return (0); if (*b.asn_cptr & 0x80) { /* long length */ length = *b.asn_cptr++ & 0x7f; b.asn_len--; if (length == 0) { asn_error(&b, "indefinite length not supported"); return (-1); } if (length > ASN_MAXLENLEN) { asn_error(&b, "long length too long (%u)", length); return (-1); } if (length > b.asn_len) return (0); len = 0; while (length--) { len = (len << 8) | *b.asn_cptr++; b.asn_len--; } } else { len = *b.asn_cptr++; b.asn_len--; } if (len > b.asn_len) return (0); return (len + b.asn_cptr - b0->asn_cptr); } /* * Encode the SNMP PDU without the variable bindings field. * We do this the rather uneffective way by * moving things around and assuming that the length field will never * use more than 2 bytes. * We need a number of pointers to apply the fixes afterwards. */ enum snmp_code snmp_pdu_encode_header(struct asn_buf *b, struct snmp_pdu *pdu) { enum asn_err err; u_char *v3_hdr_ptr; if (asn_put_temp_header(b, (ASN_TYPE_SEQUENCE|ASN_TYPE_CONSTRUCTED), &pdu->outer_ptr) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (pdu->version == SNMP_V1) err = asn_put_integer(b, 0); else if (pdu->version == SNMP_V2c) err = asn_put_integer(b, 1); else if (pdu->version == SNMP_V3) err = asn_put_integer(b, 3); else return (SNMP_CODE_BADVERS); if (err != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (pdu->version == SNMP_V3) { if (asn_put_temp_header(b, (ASN_TYPE_SEQUENCE | ASN_TYPE_CONSTRUCTED), &v3_hdr_ptr) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (asn_put_integer(b, pdu->identifier) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (asn_put_integer(b, pdu->engine.max_msg_size) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (pdu->type != SNMP_PDU_RESPONSE && pdu->type != SNMP_PDU_TRAP && pdu->type != SNMP_PDU_TRAP2 && pdu->type != SNMP_PDU_REPORT) pdu->flags |= SNMP_MSG_REPORT_FLAG; if (asn_put_octetstring(b, (u_char *)&pdu->flags, 1) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (asn_put_integer(b, pdu->security_model) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (asn_commit_header(b, v3_hdr_ptr, NULL) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (pdu->security_model != SNMP_SECMODEL_USM) return (SNMP_CODE_FAILED); if (pdu_encode_secparams(b, pdu) != SNMP_CODE_OK) return (SNMP_CODE_FAILED); /* View-based Access Conntrol information */ if (asn_put_temp_header(b, (ASN_TYPE_SEQUENCE | ASN_TYPE_CONSTRUCTED), &pdu->scoped_ptr) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (asn_put_octetstring(b, (u_char *)pdu->context_engine, pdu->context_engine_len) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (asn_put_octetstring(b, (u_char *)pdu->context_name, strlen(pdu->context_name)) != ASN_ERR_OK) return (SNMP_CODE_FAILED); } else { if (asn_put_octetstring(b, (u_char *)pdu->community, strlen(pdu->community)) != ASN_ERR_OK) return (SNMP_CODE_FAILED); } if (asn_put_temp_header(b, (ASN_TYPE_CONSTRUCTED | ASN_CLASS_CONTEXT | pdu->type), &pdu->pdu_ptr) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (pdu->type == SNMP_PDU_TRAP) { if (pdu->version != SNMP_V1 || asn_put_objid(b, &pdu->enterprise) != ASN_ERR_OK || asn_put_ipaddress(b, pdu->agent_addr) != ASN_ERR_OK || asn_put_integer(b, pdu->generic_trap) != ASN_ERR_OK || asn_put_integer(b, pdu->specific_trap) != ASN_ERR_OK || asn_put_timeticks(b, pdu->time_stamp) != ASN_ERR_OK) return (SNMP_CODE_FAILED); } else { if (pdu->version == SNMP_V1 && (pdu->type == SNMP_PDU_GETBULK || pdu->type == SNMP_PDU_INFORM || pdu->type == SNMP_PDU_TRAP2 || pdu->type == SNMP_PDU_REPORT)) return (SNMP_CODE_FAILED); if (asn_put_integer(b, pdu->request_id) != ASN_ERR_OK || asn_put_integer(b, pdu->error_status) != ASN_ERR_OK || asn_put_integer(b, pdu->error_index) != ASN_ERR_OK) return (SNMP_CODE_FAILED); } if (asn_put_temp_header(b, (ASN_TYPE_SEQUENCE|ASN_TYPE_CONSTRUCTED), &pdu->vars_ptr) != ASN_ERR_OK) return (SNMP_CODE_FAILED); return (SNMP_CODE_OK); } static enum asn_err snmp_pdu_fix_padd(struct asn_buf *b, struct snmp_pdu *pdu) { asn_len_t padlen; if (pdu->user.priv_proto == SNMP_PRIV_DES && pdu->scoped_len % 8 != 0) { padlen = 8 - (pdu->scoped_len % 8); if (asn_pad(b, padlen) != ASN_ERR_OK) return (ASN_ERR_FAILED); pdu->scoped_len += padlen; } return (ASN_ERR_OK); } enum snmp_code snmp_fix_encoding(struct asn_buf *b, struct snmp_pdu *pdu) { size_t moved = 0; enum snmp_code code; if (asn_commit_header(b, pdu->vars_ptr, NULL) != ASN_ERR_OK || asn_commit_header(b, pdu->pdu_ptr, NULL) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (pdu->version == SNMP_V3) { if (asn_commit_header(b, pdu->scoped_ptr, NULL) != ASN_ERR_OK) return (SNMP_CODE_FAILED); pdu->scoped_len = b->asn_ptr - pdu->scoped_ptr; if (snmp_pdu_fix_padd(b, pdu) != ASN_ERR_OK) return (SNMP_CODE_FAILED); if (pdu->security_model != SNMP_SECMODEL_USM) return (SNMP_CODE_FAILED); if (snmp_pdu_encrypt(pdu) != SNMP_CODE_OK) return (SNMP_CODE_FAILED); if (pdu->user.priv_proto != SNMP_PRIV_NOPRIV && asn_commit_header(b, pdu->encrypted_ptr, NULL) != ASN_ERR_OK) return (SNMP_CODE_FAILED); } if (asn_commit_header(b, pdu->outer_ptr, &moved) != ASN_ERR_OK) return (SNMP_CODE_FAILED); pdu->outer_len = b->asn_ptr - pdu->outer_ptr; pdu->digest_ptr -= moved; if (pdu->version == SNMP_V3) { if ((code = snmp_pdu_calc_digest(pdu, pdu->msg_digest)) != SNMP_CODE_OK) return (SNMP_CODE_FAILED); if ((pdu->flags & SNMP_MSG_AUTH_FLAG) != 0) memcpy(pdu->digest_ptr, pdu->msg_digest, sizeof(pdu->msg_digest)); } return (SNMP_CODE_OK); } /* * Encode a binding. Caller must ensure, that the syntax is ok for that version. * Be sure not to cobber b, when something fails. */ enum asn_err snmp_binding_encode(struct asn_buf *b, const struct snmp_value *binding) { u_char *ptr; enum asn_err err; struct asn_buf save = *b; if ((err = asn_put_temp_header(b, (ASN_TYPE_SEQUENCE | ASN_TYPE_CONSTRUCTED), &ptr)) != ASN_ERR_OK) { *b = save; return (err); } if ((err = asn_put_objid(b, &binding->var)) != ASN_ERR_OK) { *b = save; return (err); } switch (binding->syntax) { case SNMP_SYNTAX_NULL: err = asn_put_null(b); break; case SNMP_SYNTAX_INTEGER: err = asn_put_integer(b, binding->v.integer); break; case SNMP_SYNTAX_OCTETSTRING: err = asn_put_octetstring(b, binding->v.octetstring.octets, binding->v.octetstring.len); break; case SNMP_SYNTAX_OID: err = asn_put_objid(b, &binding->v.oid); break; case SNMP_SYNTAX_IPADDRESS: err = asn_put_ipaddress(b, binding->v.ipaddress); break; case SNMP_SYNTAX_TIMETICKS: err = asn_put_uint32(b, ASN_APP_TIMETICKS, binding->v.uint32); break; case SNMP_SYNTAX_COUNTER: err = asn_put_uint32(b, ASN_APP_COUNTER, binding->v.uint32); break; case SNMP_SYNTAX_GAUGE: err = asn_put_uint32(b, ASN_APP_GAUGE, binding->v.uint32); break; case SNMP_SYNTAX_COUNTER64: err = asn_put_counter64(b, binding->v.counter64); break; case SNMP_SYNTAX_NOSUCHOBJECT: err = asn_put_exception(b, ASN_EXCEPT_NOSUCHOBJECT); break; case SNMP_SYNTAX_NOSUCHINSTANCE: err = asn_put_exception(b, ASN_EXCEPT_NOSUCHINSTANCE); break; case SNMP_SYNTAX_ENDOFMIBVIEW: err = asn_put_exception(b, ASN_EXCEPT_ENDOFMIBVIEW); break; } if (err != ASN_ERR_OK) { *b = save; return (err); } err = asn_commit_header(b, ptr, NULL); if (err != ASN_ERR_OK) { *b = save; return (err); } return (ASN_ERR_OK); } /* * Encode an PDU. */ enum snmp_code snmp_pdu_encode(struct snmp_pdu *pdu, struct asn_buf *resp_b) { u_int idx; enum snmp_code err; if ((err = snmp_pdu_encode_header(resp_b, pdu)) != SNMP_CODE_OK) return (err); for (idx = 0; idx < pdu->nbindings; idx++) if (snmp_binding_encode(resp_b, &pdu->bindings[idx]) != ASN_ERR_OK) return (SNMP_CODE_FAILED); return (snmp_fix_encoding(resp_b, pdu)); } static void dump_binding(const struct snmp_value *b) { u_int i; char buf[ASN_OIDSTRLEN]; snmp_printf("%s=", asn_oid2str_r(&b->var, buf)); switch (b->syntax) { case SNMP_SYNTAX_NULL: snmp_printf("NULL"); break; case SNMP_SYNTAX_INTEGER: snmp_printf("INTEGER %d", b->v.integer); break; case SNMP_SYNTAX_OCTETSTRING: snmp_printf("OCTET STRING %lu:", b->v.octetstring.len); for (i = 0; i < b->v.octetstring.len; i++) snmp_printf(" %02x", b->v.octetstring.octets[i]); break; case SNMP_SYNTAX_OID: snmp_printf("OID %s", asn_oid2str_r(&b->v.oid, buf)); break; case SNMP_SYNTAX_IPADDRESS: snmp_printf("IPADDRESS %u.%u.%u.%u", b->v.ipaddress[0], b->v.ipaddress[1], b->v.ipaddress[2], b->v.ipaddress[3]); break; case SNMP_SYNTAX_COUNTER: snmp_printf("COUNTER %u", b->v.uint32); break; case SNMP_SYNTAX_GAUGE: snmp_printf("GAUGE %u", b->v.uint32); break; case SNMP_SYNTAX_TIMETICKS: snmp_printf("TIMETICKS %u", b->v.uint32); break; case SNMP_SYNTAX_COUNTER64: snmp_printf("COUNTER64 %lld", b->v.counter64); break; case SNMP_SYNTAX_NOSUCHOBJECT: snmp_printf("NoSuchObject"); break; case SNMP_SYNTAX_NOSUCHINSTANCE: snmp_printf("NoSuchInstance"); break; case SNMP_SYNTAX_ENDOFMIBVIEW: snmp_printf("EndOfMibView"); break; default: snmp_printf("UNKNOWN SYNTAX %u", b->syntax); break; } } static __inline void dump_bindings(const struct snmp_pdu *pdu) { u_int i; for (i = 0; i < pdu->nbindings; i++) { snmp_printf(" [%u]: ", i); dump_binding(&pdu->bindings[i]); snmp_printf("\n"); } } static __inline void dump_notrap(const struct snmp_pdu *pdu) { snmp_printf(" request_id=%d", pdu->request_id); snmp_printf(" error_status=%d", pdu->error_status); snmp_printf(" error_index=%d\n", pdu->error_index); dump_bindings(pdu); } void snmp_pdu_dump(const struct snmp_pdu *pdu) { char buf[ASN_OIDSTRLEN]; const char *vers; static const char *types[] = { [SNMP_PDU_GET] = "GET", [SNMP_PDU_GETNEXT] = "GETNEXT", [SNMP_PDU_RESPONSE] = "RESPONSE", [SNMP_PDU_SET] = "SET", [SNMP_PDU_TRAP] = "TRAPv1", [SNMP_PDU_GETBULK] = "GETBULK", [SNMP_PDU_INFORM] = "INFORM", [SNMP_PDU_TRAP2] = "TRAPv2", [SNMP_PDU_REPORT] = "REPORT", }; if (pdu->version == SNMP_V1) vers = "SNMPv1"; else if (pdu->version == SNMP_V2c) vers = "SNMPv2c"; else if (pdu->version == SNMP_V3) vers = "SNMPv3"; else vers = "v?"; switch (pdu->type) { case SNMP_PDU_TRAP: snmp_printf("%s %s '%s'", types[pdu->type], vers, pdu->community); snmp_printf(" enterprise=%s", asn_oid2str_r(&pdu->enterprise, buf)); snmp_printf(" agent_addr=%u.%u.%u.%u", pdu->agent_addr[0], pdu->agent_addr[1], pdu->agent_addr[2], pdu->agent_addr[3]); snmp_printf(" generic_trap=%d", pdu->generic_trap); snmp_printf(" specific_trap=%d", pdu->specific_trap); snmp_printf(" time-stamp=%u\n", pdu->time_stamp); dump_bindings(pdu); break; case SNMP_PDU_GET: case SNMP_PDU_GETNEXT: case SNMP_PDU_RESPONSE: case SNMP_PDU_SET: case SNMP_PDU_GETBULK: case SNMP_PDU_INFORM: case SNMP_PDU_TRAP2: case SNMP_PDU_REPORT: snmp_printf("%s %s '%s'", types[pdu->type], vers, pdu->community); dump_notrap(pdu); break; default: snmp_printf("bad pdu type %u\n", pdu->type); break; } } void snmp_value_free(struct snmp_value *value) { if (value->syntax == SNMP_SYNTAX_OCTETSTRING) free(value->v.octetstring.octets); value->syntax = SNMP_SYNTAX_NULL; } int snmp_value_copy(struct snmp_value *to, const struct snmp_value *from) { to->var = from->var; to->syntax = from->syntax; if (from->syntax == SNMP_SYNTAX_OCTETSTRING) { if ((to->v.octetstring.len = from->v.octetstring.len) == 0) to->v.octetstring.octets = NULL; else { to->v.octetstring.octets = malloc(to->v.octetstring.len); if (to->v.octetstring.octets == NULL) return (-1); (void)memcpy(to->v.octetstring.octets, from->v.octetstring.octets, to->v.octetstring.len); } } else to->v = from->v; return (0); } void snmp_pdu_init_secparams(struct snmp_pdu *pdu) { int32_t rval; if (pdu->user.auth_proto != SNMP_AUTH_NOAUTH) pdu->flags |= SNMP_MSG_AUTH_FLAG; switch (pdu->user.priv_proto) { case SNMP_PRIV_DES: memcpy(pdu->msg_salt, &pdu->engine.engine_boots, sizeof(pdu->engine.engine_boots)); rval = random(); memcpy(pdu->msg_salt + sizeof(pdu->engine.engine_boots), &rval, sizeof(int32_t)); pdu->flags |= SNMP_MSG_PRIV_FLAG; break; case SNMP_PRIV_AES: rval = random(); memcpy(pdu->msg_salt, &rval, sizeof(int32_t)); rval = random(); memcpy(pdu->msg_salt + sizeof(int32_t), &rval, sizeof(int32_t)); pdu->flags |= SNMP_MSG_PRIV_FLAG; break; default: break; } } void snmp_pdu_free(struct snmp_pdu *pdu) { u_int i; for (i = 0; i < pdu->nbindings; i++) snmp_value_free(&pdu->bindings[i]); } /* * Parse an ASCII SNMP value into the binary form */ int snmp_value_parse(const char *str, enum snmp_syntax syntax, union snmp_values *v) { char *end; switch (syntax) { case SNMP_SYNTAX_NULL: case SNMP_SYNTAX_NOSUCHOBJECT: case SNMP_SYNTAX_NOSUCHINSTANCE: case SNMP_SYNTAX_ENDOFMIBVIEW: if (*str != '\0') return (-1); return (0); case SNMP_SYNTAX_INTEGER: v->integer = strtoll(str, &end, 0); if (*end != '\0') return (-1); return (0); case SNMP_SYNTAX_OCTETSTRING: { u_long len; /* actual length of string */ u_long alloc; /* allocate length of string */ u_char *octs; /* actual octets */ u_long oct; /* actual octet */ u_char *nocts; /* to avoid memory leak */ u_char c; /* actual character */ # define STUFFC(C) \ if (alloc == len) { \ alloc += 100; \ if ((nocts = realloc(octs, alloc)) == NULL) { \ free(octs); \ return (-1); \ } \ octs = nocts; \ } \ octs[len++] = (C); len = alloc = 0; octs = NULL; if (*str == '"') { str++; while((c = *str++) != '\0') { if (c == '"') { if (*str != '\0') { free(octs); return (-1); } break; } if (c == '\\') { switch (c = *str++) { case '\\': break; case 'a': c = '\a'; break; case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'v': c = '\v'; break; case 'x': c = 0; if (!isxdigit(*str)) break; if (isdigit(*str)) c = *str++ - '0'; else if (isupper(*str)) c = *str++ - 'A' + 10; else c = *str++ - 'a' + 10; if (!isxdigit(*str)) break; if (isdigit(*str)) c += *str++ - '0'; else if (isupper(*str)) c += *str++ - 'A' + 10; else c += *str++ - 'a' + 10; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': c = *str++ - '0'; if (*str < '0' || *str > '7') break; c = *str++ - '0'; if (*str < '0' || *str > '7') break; c = *str++ - '0'; break; default: break; } } STUFFC(c); } } else { while (*str != '\0') { oct = strtoul(str, &end, 16); str = end; if (oct > 0xff) { free(octs); return (-1); } STUFFC(oct); if (*str == ':') str++; else if(*str != '\0') { free(octs); return (-1); } } } v->octetstring.octets = octs; v->octetstring.len = len; return (0); # undef STUFFC } case SNMP_SYNTAX_OID: { u_long subid; v->oid.len = 0; for (;;) { if (v->oid.len == ASN_MAXOIDLEN) return (-1); subid = strtoul(str, &end, 10); str = end; if (subid > ASN_MAXID) return (-1); v->oid.subs[v->oid.len++] = (asn_subid_t)subid; if (*str == '\0') break; if (*str != '.') return (-1); str++; } return (0); } case SNMP_SYNTAX_IPADDRESS: { struct hostent *he; u_long ip[4]; int n; if (sscanf(str, "%lu.%lu.%lu.%lu%n", &ip[0], &ip[1], &ip[2], &ip[3], &n) == 4 && (size_t)n == strlen(str) && ip[0] <= 0xff && ip[1] <= 0xff && ip[2] <= 0xff && ip[3] <= 0xff) { v->ipaddress[0] = (u_char)ip[0]; v->ipaddress[1] = (u_char)ip[1]; v->ipaddress[2] = (u_char)ip[2]; v->ipaddress[3] = (u_char)ip[3]; return (0); } if ((he = gethostbyname(str)) == NULL) return (-1); if (he->h_addrtype != AF_INET) return (-1); v->ipaddress[0] = he->h_addr[0]; v->ipaddress[1] = he->h_addr[1]; v->ipaddress[2] = he->h_addr[2]; v->ipaddress[3] = he->h_addr[3]; return (0); } case SNMP_SYNTAX_COUNTER: case SNMP_SYNTAX_GAUGE: case SNMP_SYNTAX_TIMETICKS: { uint64_t sub; sub = strtoull(str, &end, 0); if (*end != '\0' || sub > 0xffffffff) return (-1); v->uint32 = (uint32_t)sub; return (0); } case SNMP_SYNTAX_COUNTER64: v->counter64 = strtoull(str, &end, 0); if (*end != '\0') return (-1); return (0); } abort(); } static void snmp_error_func(const char *fmt, ...) { va_list ap; va_start(ap, fmt); fprintf(stderr, "SNMP: "); vfprintf(stderr, fmt, ap); fprintf(stderr, "\n"); va_end(ap); } static void snmp_printf_func(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); } Index: user/alc/PQ_LAUNDRY/lib/libc/powerpc/gen/Makefile.inc =================================================================== --- user/alc/PQ_LAUNDRY/lib/libc/powerpc/gen/Makefile.inc (revision 307895) +++ user/alc/PQ_LAUNDRY/lib/libc/powerpc/gen/Makefile.inc (revision 307896) @@ -1,7 +1,7 @@ # $FreeBSD$ -.include "${LIBC_SRC}/powerpc/gen/Makefile.common" +.include "${LIBC_SRCTOP}/powerpc/gen/Makefile.common" SRCS += fabs.S flt_rounds.c fpgetmask.c fpgetround.c \ fpgetsticky.c fpsetmask.c fpsetround.c \ _setjmp.S setjmp.S sigsetjmp.S Index: user/alc/PQ_LAUNDRY/lib/libnetbsd/sys/cdefs.h =================================================================== --- user/alc/PQ_LAUNDRY/lib/libnetbsd/sys/cdefs.h (revision 307895) +++ user/alc/PQ_LAUNDRY/lib/libnetbsd/sys/cdefs.h (revision 307896) @@ -1,72 +1,74 @@ /* $FreeBSD$ */ /*- * Copyright (c) 2012 SRI International * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _LIBNETBSD_SYS_CDEFS_H_ #define _LIBNETBSD_SYS_CDEFS_H_ #include_next +#ifndef __dead #ifdef __dead2 #define __dead __dead2 #else #define __dead #endif +#endif /* !__dead */ /* * The __CONCAT macro is used to concatenate parts of symbol names, e.g. * with "#define OLD(foo) __CONCAT(old,foo)", OLD(foo) produces oldfoo. * The __CONCAT macro is a bit tricky -- make sure you don't put spaces * in between its arguments. __CONCAT can also concatenate double-quoted * strings produced by the __STRING macro, but this only works with ANSI C. */ #define ___STRING(x) __STRING(x) #define ___CONCAT(x,y) __CONCAT(x,y) /* * The following macro is used to remove const cast-away warnings * from gcc -Wcast-qual; it should be used with caution because it * can hide valid errors; in particular most valid uses are in * situations where the API requires it, not to cast away string * constants. We don't use *intptr_t on purpose here and we are * explicit about unsigned long so that we don't have additional * dependencies. */ #define __UNCONST(a) ((void *)(unsigned long)(const void *)(a)) /* * Return the number of elements in a statically-allocated array, * __x. */ #define __arraycount(__x) (sizeof(__x) / sizeof(__x[0])) #endif /* _LIBNETBSD_SYS_CDEFS_H_ */ Index: user/alc/PQ_LAUNDRY/lib/libnetbsd/util.c =================================================================== --- user/alc/PQ_LAUNDRY/lib/libnetbsd/util.c (revision 307895) +++ user/alc/PQ_LAUNDRY/lib/libnetbsd/util.c (revision 307896) @@ -1,59 +1,60 @@ /*- * Copyright (c) 2012 SRI International * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include -#include + +#include "util.h" char * flags_to_string(u_long flags, const char *def) { char *str; str = fflagstostr(flags); if (*str == '\0') { free(str); str = strdup(def); } return (str); } int string_to_flags(char **stringp, u_long *setp, u_long *clrp) { return strtofflags(stringp, setp, clrp); } Index: user/alc/PQ_LAUNDRY/release/tools/arm.subr =================================================================== --- user/alc/PQ_LAUNDRY/release/tools/arm.subr (revision 307895) +++ user/alc/PQ_LAUNDRY/release/tools/arm.subr (revision 307896) @@ -1,137 +1,136 @@ #!/bin/sh #- # Copyright (c) 2015 The FreeBSD Foundation # All rights reserved. # # Portions of this software were developed by Glen Barber # under sponsorship from the FreeBSD Foundation. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # Common subroutines used to build arm/armv6 images. # # $FreeBSD$ # cleanup() { if [ -c "${DESTDIR}/dev/null" ]; then umount_loop ${DESTDIR}/dev 2>/dev/null fi umount_loop ${DESTDIR} if [ ! -z "${mddev}" ]; then mdconfig -d -u ${mddev} fi return 0 } umount_loop() { DIR=$1 i=0 sync while ! umount ${DIR}; do i=$(( $i + 1 )) if [ $i -ge 10 ]; then # This should never happen. But, it has happened. echo "Cannot umount(8) ${DIR}" echo "Something has gone horribly wrong." return 1 fi sleep 1 done return 0 } arm_create_disk() { # Create the target raw file and temporary work directory. chroot ${CHROOTDIR} gpart create -s ${PART_SCHEME} ${mddev} chroot ${CHROOTDIR} gpart add -t '!12' -a 63 -s ${FAT_SIZE} ${mddev} chroot ${CHROOTDIR} gpart set -a active -i 1 ${mddev} chroot ${CHROOTDIR} newfs_msdos -L msdosboot -F ${FAT_TYPE} /dev/${mddev}s1 chroot ${CHROOTDIR} gpart add -t freebsd ${mddev} chroot ${CHROOTDIR} gpart create -s bsd ${mddev}s2 chroot ${CHROOTDIR} gpart add -t freebsd-ufs -a 64k /dev/${mddev}s2 chroot ${CHROOTDIR} newfs -U -L rootfs /dev/${mddev}s2a return 0 } arm_create_user() { # Create a default user account 'freebsd' with the password 'freebsd', # and set the default password for the 'root' user to 'root'. chroot ${CHROOTDIR} /usr/sbin/pw -R ${DESTDIR} \ groupadd freebsd -g 1001 chroot ${CHROOTDIR} mkdir -p ${DESTDIR}/home/freebsd chroot ${CHROOTDIR} /usr/sbin/pw -R ${DESTDIR} \ useradd freebsd \ -m -M 0755 -w yes -n freebsd -u 1001 -g 1001 -G 0 \ -c 'FreeBSD User' -d '/home/freebsd' -s '/bin/csh' chroot ${CHROOTDIR} /usr/sbin/pw -R ${DESTDIR} \ usermod root -w yes - chroot ${CHROOTDIR} ln -s /home ${DESTDIR}/usr/home return 0 } arm_install_base() { chroot ${CHROOTDIR} mount /dev/${mddev}s2a ${DESTDIR} eval chroot ${CHROOTDIR} make -C ${WORLDDIR} \ TARGET=${EMBEDDED_TARGET} \ TARGET_ARCH=${EMBEDDED_TARGET_ARCH} \ DESTDIR=${DESTDIR} KERNCONF=${KERNEL} \ installworld installkernel distribution chroot ${CHROOTDIR} mkdir -p ${DESTDIR}/boot/msdos arm_create_user echo '# Custom /etc/fstab for FreeBSD embedded images' \ > ${CHROOTDIR}/${DESTDIR}/etc/fstab echo "/dev/ufs/rootfs / ufs rw 1 1" \ >> ${CHROOTDIR}/${DESTDIR}/etc/fstab echo "/dev/msdosfs/MSDOSBOOT /boot/msdos msdosfs rw,noatime 0 0" \ >> ${CHROOTDIR}/${DESTDIR}/etc/fstab echo "tmpfs /tmp tmpfs rw,mode=1777,size=50m 0 0" \ >> ${CHROOTDIR}/${DESTDIR}/etc/fstab local hostname hostname="$(echo ${KERNEL} | tr '[:upper:]' '[:lower:]')" echo "hostname=\"${hostname}\"" > ${CHROOTDIR}/${DESTDIR}/etc/rc.conf echo 'ifconfig_DEFAULT="DHCP"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf echo 'sshd_enable="YES"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf echo 'sendmail_enable="NONE"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf echo 'sendmail_submit_enable="NO"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf echo 'sendmail_outbound_enable="NO"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf echo 'sendmail_msp_queue_enable="NO"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf echo 'growfs_enable="YES"' >> ${CHROOTDIR}/${DESTDIR}/etc/rc.conf sync umount_loop ${CHROOTDIR}/${DESTDIR} return 0 } arm_install_uboot() { # Override in the arm/KERNEL.conf file. return 0 } Index: user/alc/PQ_LAUNDRY/share/mk/bsd.compiler.mk =================================================================== --- user/alc/PQ_LAUNDRY/share/mk/bsd.compiler.mk (revision 307895) +++ user/alc/PQ_LAUNDRY/share/mk/bsd.compiler.mk (revision 307896) @@ -1,197 +1,197 @@ # $FreeBSD$ # Setup variables for the compiler # # COMPILER_TYPE is the major type of compiler. Currently gcc and clang support # automatic detection. Other compiler types can be shoe-horned in, but require # explicit setting of the compiler type. The compiler type can also be set # explicitly if, say, you install gcc as clang... # # COMPILER_VERSION is a numeric constant equal to: # major * 10000 + minor * 100 + tiny # It too can be overriden on the command line. When testing it, be sure to # make sure that you are limiting the test to a specific compiler. Testing # against 30300 for gcc likely isn't what you wanted (since versions of gcc # prior to 4.2 likely have no prayer of working). # # COMPILER_FREEBSD_VERSION is the compiler's __FreeBSD_cc_version value. # # COMPILER_FEATURES will contain one or more of the following, based on # compiler support for that feature: # # - c++11 : supports full (or nearly full) C++11 programming environment. # # These variables with an X_ prefix will also be provided if XCC is set. # # This file may be included multiple times, but only has effect the first time. # .if !target(____) ____: .include # Handle ccache after CC is determined, but not if CC/CXX are already # overridden with a manual setup. .if ${MK_CCACHE_BUILD:Uno} == "yes" && \ !make(showconfig) && \ (${CC:M*ccache/world/*} == "" || ${CXX:M*ccache/world/*} == "") # CC is always prepended with the ccache wrapper rather than modifying # PATH since it is more clear that ccache is used and avoids wasting time # for mkdep/linking/asm builds. LOCALBASE?= /usr/local CCACHE_WRAPPER_PATH?= ${LOCALBASE}/libexec/ccache CCACHE_BIN?= ${LOCALBASE}/bin/ccache .if exists(${CCACHE_BIN}) # Export to ensure sub-makes can filter it out for mkdep/linking and # to chain down into kernel build which won't include this file. .export CCACHE_BIN # Expand and export some variables so they may be based on make vars. # This allows doing something like the following in the environment: # CCACHE_BASEDIR='${SRCTOP:H}' MAKEOBJDIRPREFIX='${SRCTOP:H}/obj' .for var in CCACHE_LOGFILE CCACHE_BASEDIR .if defined(${var}) ${var}:= ${${var}} .export ${var} .endif .endfor # Handle bootstrapped compiler changes properly by hashing their content # rather than checking mtime. For external compilers it should be safe # to use the more optimal mtime check. # XXX: CCACHE_COMPILERCHECK= string: .if ${CC:N${CCACHE_BIN}:[1]:M/*} == "" CCACHE_COMPILERCHECK?= content .else CCACHE_COMPILERCHECK?= mtime .endif .export CCACHE_COMPILERCHECK # Remove ccache from the PATH to prevent double calls and wasted CPP/LD time. PATH:= ${PATH:C,:?${CCACHE_WRAPPER_PATH}(/world)?(:$)?,,g} # Ensure no bogus CCACHE_PATH leaks in which might avoid the in-tree compiler. .if !empty(CCACHE_PATH) CCACHE_PATH= .export CCACHE_PATH .endif # Override various toolchain vars. .for var in CC CXX HOST_CC HOST_CXX .if defined(${var}) && ${${var}:M${CCACHE_BIN}} == "" ${var}:= ${CCACHE_BIN} ${${var}} .endif .endfor # GCC does not need the CCACHE_CPP2 hack enabled by default in devel/ccache. # The port enables it due to ccache passing preprocessed C to clang # which fails with -Wparentheses-equality, -Wtautological-compare, and # -Wself-assign on macro-expanded lines. .if defined(COMPILER_TYPE) && ${COMPILER_TYPE} == "gcc" CCACHE_NOCPP2= 1 .export CCACHE_NOCPP2 .endif # Canonicalize CCACHE_DIR for meta mode usage. .if !defined(CCACHE_DIR) CCACHE_DIR!= ${CCACHE_BIN} -p | awk '$$2 == "cache_dir" {print $$4}' .export CCACHE_DIR .endif .if !empty(CCACHE_DIR) && empty(.MAKE.META.IGNORE_PATHS:M${CCACHE_DIR}) CCACHE_DIR:= ${CCACHE_DIR:tA} .MAKE.META.IGNORE_PATHS+= ${CCACHE_DIR} .export CCACHE_DIR .endif # ccache doesn't affect build output so let it slide for meta mode # comparisons. .MAKE.META.IGNORE_PATHS+= ${CCACHE_BIN} ccache-print-options: .PHONY @${CCACHE_BIN} -p .endif # exists(${CCACHE_BIN}) .endif # ${MK_CCACHE_BUILD} == "yes" .for cc X_ in CC $${_empty_var_} XCC X_ .if ${cc} == "CC" || !empty(XCC) # Try to import COMPILER_TYPE and COMPILER_VERSION from parent make. # The value is only used/exported for the same environment that impacts # CC and COMPILER_* settings here. _exported_vars= ${X_}COMPILER_TYPE ${X_}COMPILER_VERSION \ ${X_}COMPILER_FREEBSD_VERSION ${X_}_cc_hash= ${${cc}}${MACHINE}${PATH} ${X_}_cc_hash:= ${${X_}_cc_hash:hash} # Only import if none of the vars are set somehow else. _can_export= yes .for var in ${_exported_vars} .if defined(${var}) _can_export= no .endif .endfor .if ${_can_export} == yes .for var in ${_exported_vars} .if defined(${var}.${${X_}_cc_hash}) ${var}= ${${var}.${${X_}_cc_hash}} .endif .endfor .endif .if ${cc} == "CC" || (${cc} == "XCC" && ${XCC} != ${CC}) .if ${MACHINE} == "common" # common is a pseudo machine for architecture independent # generated files - thus there is no compiler. ${X_}COMPILER_TYPE= none ${X_}COMPILER_VERSION= 0 ${X_}COMPILER_FREEBSD_VERSION= 0 .elif !defined(${X_}COMPILER_TYPE) || !defined(${X_}COMPILER_VERSION) _v!= ${${cc}} --version || echo 0.0.0 .if !defined(${X_}COMPILER_TYPE) . if ${${cc}:T:M*gcc*} ${X_}COMPILER_TYPE:= gcc . elif ${${cc}:T:M*clang*} ${X_}COMPILER_TYPE:= clang . elif ${_v:Mgcc} ${X_}COMPILER_TYPE:= gcc . elif ${_v:M\(GCC\)} ${X_}COMPILER_TYPE:= gcc -. elif ${_v:Mclang} +. elif ${_v:Mclang} || ${_v:M(clang-*.*.*)} ${X_}COMPILER_TYPE:= clang . else .error Unable to determine compiler type for ${cc}=${${cc}}. Consider setting ${X_}COMPILER_TYPE. . endif .endif .if !defined(${X_}COMPILER_VERSION) ${X_}COMPILER_VERSION!=echo "${_v:M[1-9].[0-9]*}" | awk -F. '{print $$1 * 10000 + $$2 * 100 + $$3;}' .endif .undef _v .endif .if !defined(${X_}COMPILER_FREEBSD_VERSION) ${X_}COMPILER_FREEBSD_VERSION!= { echo "__FreeBSD_cc_version" | ${${cc}} -E - 2>/dev/null || echo __FreeBSD_cc_version; } | sed -n '$$p' # If we get a literal "__FreeBSD_cc_version" back then the compiler # is a non-FreeBSD build that doesn't support it or some other error # occurred. .if ${${X_}COMPILER_FREEBSD_VERSION} == "__FreeBSD_cc_version" ${X_}COMPILER_FREEBSD_VERSION= unknown .endif .endif .if ${${X_}COMPILER_TYPE} == "clang" || \ (${${X_}COMPILER_TYPE} == "gcc" && ${${X_}COMPILER_VERSION} >= 40800) ${X_}COMPILER_FEATURES= c++11 .else ${X_}COMPILER_FEATURES= .endif .else # Use CC's values X_COMPILER_TYPE= ${COMPILER_TYPE} X_COMPILER_VERSION= ${COMPILER_VERSION} X_COMPILER_FREEBSD_VERSION= ${COMPILER_FREEBSD_VERSION} X_COMPILER_FEATURES= ${COMPILER_FEATURES} .endif # ${cc} == "CC" || (${cc} == "XCC" && ${XCC} != ${CC}) # Export the values so sub-makes don't have to look them up again, using the # hash key computed above. .for var in ${_exported_vars} ${var}.${${X_}_cc_hash}:= ${${var}} .export-env ${var}.${${X_}_cc_hash} .undef ${var}.${${X_}_cc_hash} .endfor .endif # ${cc} == "CC" || !empty(XCC) .endfor # .for cc in CC XCC .endif # !target(____) Index: user/alc/PQ_LAUNDRY/sys/amd64/amd64/trap.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/amd64/amd64/trap.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/amd64/amd64/trap.c (revision 307896) @@ -1,944 +1,939 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * AMD64 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #ifdef KDTRACE_HOOKS #include #endif extern void __noinline trap(struct trapframe *frame); extern void trap_check(struct trapframe *frame); extern void syscall(struct trapframe *frame); void dblfault_handler(struct trapframe *frame); static int trap_pfault(struct trapframe *, int); static void trap_fatal(struct trapframe *, vm_offset_t); #define MAX_TRAP_MSG 32 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ "SIMD floating-point exception", /* 29 T_XMMFLT */ "reserved (unknown) fault", /* 30 T_RESERVED */ "", /* 31 unused (reserved) */ "DTrace pid return trap", /* 32 T_DTRACE_RET */ }; -static int panic_on_nmi = 1; -SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN, - &panic_on_nmi, 0, "Panic on NMI"); static int prot_fault_translation; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN, &prot_fault_translation, 0, "Select signal to deliver on protection fault"); static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { #ifdef KDTRACE_HOOKS struct reg regs; #endif struct thread *td = curthread; struct proc *p = td->td_proc; #ifdef KDB register_t dr6; #endif int i = 0, ucode = 0; u_int type; register_t addr = 0; ksiginfo_t ksi; PCPU_INC(cnt.v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } #endif /* SMP */ #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); goto out; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI. If the PMC module is * active, pass the 'rip' value to the PMC module's interrupt * handler. A non-zero return value from the handler means that * the NMI was consumed by it and we can return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(PCPU_GET(cpuid), frame) != 0) goto out; #endif #ifdef STACK if (stack_nmi_handler(frame) != 0) goto out; #endif } if (type == T_MCHK) { mca_intr(); goto out; } if ((frame->tf_rflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (TRAPF_USERMODE(frame)) uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We shouldn't enable interrupts while holding a * spin lock. */ if (td->td_md.md_spinlock_count == 0) enable_intr(); } } if (TRAPF_USERMODE(frame)) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_rip; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ i = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ enable_intr(); #ifdef KDTRACE_HOOKS if (type == T_BPTFLT) { fill_frame_regs(frame, ®s); if (dtrace_pid_probe_ptr != NULL && dtrace_pid_probe_ptr(®s) == 0) goto out; } #endif frame->tf_rflags &= ~PSL_T; i = SIGTRAP; ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT); break; case T_ARITHTRAP: /* arithmetic trap */ ucode = fputrap_x87(); if (ucode == -1) goto userout; i = SIGFPE; break; case T_PROTFLT: /* general protection fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_STKFLT: /* stack fault */ case T_SEGNPFLT: /* segment not present fault */ i = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: i = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: i = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ /* * Emulator can take care about this trap? */ if (*p->p_sysent->sv_trap != NULL && (*p->p_sysent->sv_trap)(td) == 0) goto userout; addr = frame->tf_addr; i = trap_pfault(frame, TRUE); if (i == -1) goto userout; if (i == 0) goto user; if (i == SIGSEGV) ucode = SEGV_MAPERR; else { if (prot_fault_translation == 0) { /* * Autodetect. * This check also covers the images * without the ABI-tag ELF note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && p->p_osrel >= P_OSREL_SIGSEGV) { i = SIGSEGV; ucode = SEGV_ACCERR; } else { i = SIGBUS; ucode = BUS_PAGE_FAULT; } } else if (prot_fault_translation == 1) { /* * Always compat mode. */ i = SIGBUS; ucode = BUS_PAGE_FAULT; } else { /* * Always SIGSEGV mode. */ i = SIGSEGV; ucode = SEGV_ACCERR; } } break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: - nmi_handle_intr(type, frame, true); + nmi_handle_intr(type, frame); break; #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: /* transparent fault (due to context switch "late") */ KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); fpudna(); goto userout; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = fputrap_sse(); if (ucode == -1) goto userout; i = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: enable_intr(); fill_frame_regs(frame, ®s); if (dtrace_return_probe_ptr != NULL && dtrace_return_probe_ptr(®s) == 0) goto out; break; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(frame, FALSE); goto out; case T_DNA: if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); fpudna(); goto out; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * For now, supporting kernel handler * registration for FPU traps is overkill. */ trap_fatal(frame, 0); goto out; case T_STKFLT: /* stack fault */ case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %rip's and %rsp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame->tf_rip == (long)doreti_iret) { frame->tf_rip = (long)doreti_iret_fault; goto out; } if (frame->tf_rip == (long)ld_ds) { frame->tf_rip = (long)ds_load_fault; goto out; } if (frame->tf_rip == (long)ld_es) { frame->tf_rip = (long)es_load_fault; goto out; } if (frame->tf_rip == (long)ld_fs) { frame->tf_rip = (long)fs_load_fault; goto out; } if (frame->tf_rip == (long)ld_gs) { frame->tf_rip = (long)gs_load_fault; goto out; } if (frame->tf_rip == (long)ld_gsbase) { frame->tf_rip = (long)gsbase_load_fault; goto out; } if (frame->tf_rip == (long)ld_fsbase) { frame->tf_rip = (long)fsbase_load_fault; goto out; } if (curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_rflags & PSL_NT) { frame->tf_rflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap()) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & ~0xf); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB /* XXX %dr6 is not quite reentrant. */ dr6 = rdr6(); load_dr6(dr6 & ~0x4000); if (kdb_trap(type, dr6, frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: - if (nmi_handle_intr(type, frame, false) || - !panic_on_nmi) - goto out; - /* FALLTHROUGH */ + nmi_handle_intr(type, frame); + goto out; #endif /* DEV_ISA */ } trap_fatal(frame, 0); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = i; ksi.ksi_code = ucode; ksi.ksi_trapno = type; ksi.ksi_addr = (void *)addr; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %lx code %d type %d " "addr 0x%lx rsp 0x%lx rip 0x%lx " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr, frame->tf_rsp, frame->tf_rip, fubyte((void *)(frame->tf_rip + 0)), fubyte((void *)(frame->tf_rip + 1)), fubyte((void *)(frame->tf_rip + 2)), fubyte((void *)(frame->tf_rip + 3)), fubyte((void *)(frame->tf_rip + 4)), fubyte((void *)(frame->tf_rip + 5)), fubyte((void *)(frame->tf_rip + 6)), fubyte((void *)(frame->tf_rip + 7))); } KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); userout: out: return; } /* * Ensure that we ignore any DTrace-induced faults. This function cannot * be instrumented, so it cannot generate such faults itself. */ void trap_check(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, frame->tf_trapno) != 0) return; #endif trap(frame); } static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; vm_map_t map; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; vm_offset_t eva = frame->tf_addr; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } va = trunc_page(eva); if (va >= VM_MIN_KERNEL_ADDRESS) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { map = &p->p_vmspace->vm_map; /* * When accessing a usermode address, kernel must be * ready to accept the page fault, and provide a * handling routine. Since accessing the address * without the handler is a bug, do not try to handle * it normally, and panic immediately. */ if (!usermode && (td->td_intr_nesting_level != 0 || curpcb->pcb_onfault == NULL)) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, ss; u_int type; struct soft_segment_descriptor softseg; char *msg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)], &softseg); if (type <= MAX_TRAP_MSG) msg = trap_msg[type]; else msg = "UNKNOWN"; printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, TRAPF_USERMODE(frame) ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s %s%s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_I ? "instruction" : "data", code & PGEX_RSV ? " rsv" : "", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%lx:0x%lx\n", frame->tf_cs & 0xffff, frame->tf_rip); ss = frame->tf_ss & 0xffff; printf("stack pointer = 0x%x:0x%lx\n", ss, frame->tf_rsp); printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_rflags & PSL_T) printf("trace trap, "); if (frame->tf_rflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_rflags & PSL_NT) printf("nested task, "); if (frame->tf_rflags & PSL_RF) printf("resume, "); printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); #ifdef KDB if (debugger_on_panic || kdb_active) if (kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). */ void dblfault_handler(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault\n"); printf("rip = 0x%lx\n", frame->tf_rip); printf("rsp = 0x%lx\n", frame->tf_rsp); printf("rbp = 0x%lx\n", frame->tf_rbp); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; register_t *argp; caddr_t params; int reg, regcnt, error; p = td->td_proc; frame = td->td_frame; reg = 0; regcnt = 6; params = (caddr_t)frame->tf_rsp + sizeof(register_t); sa->code = frame->tf_rax; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = frame->tf_rdi; reg++; regcnt--; } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]), ("Too many syscall arguments!")); error = 0; argp = &frame->tf_rdi; argp += reg; bcopy(argp, sa->args, sizeof(sa->args[0]) * regcnt); if (sa->narg > regcnt) { KASSERT(params != NULL, ("copyin args with no params!")); error = copyin(params, &sa->args[regcnt], (sa->narg - regcnt) * sizeof(sa->args[0])); } if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; } return (error); } #include "../../kern/subr_syscall.c" /* * System call handler for native binaries. The trap frame is already * set up by the assembler trampoline and a pointer to it is saved in * td_frame. */ void amd64_syscall(struct thread *td, int traced) { struct syscall_args sa; int error; ksiginfo_t ksi; #ifdef DIAGNOSTIC if (!TRAPF_USERMODE(td->td_frame)) { panic("syscall"); /* NOT REACHED */ } #endif error = syscallenter(td, &sa); /* * Traced syscall. */ if (__predict_false(traced)) { td->td_frame->tf_rflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)td->td_frame->tf_rip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returing with kernel FPU ctx leaked", syscallname(td->td_proc, sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, sa.code))); KASSERT(td->td_md.md_invl_gen.gen == 0, ("System call %s returning with leaked invl_gen %lu", syscallname(td->td_proc, sa.code), td->td_md.md_invl_gen.gen)); syscallret(td, error, &sa); /* * If the user-supplied value of %rip is not a canonical * address, then some CPUs will trigger a ring 0 #GP during * the sysret instruction. However, the fault handler would * execute in ring 0 with the user's %gs and %rsp which would * not be safe. Instead, use the full return path which * catches the problem safely. */ if (td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS) set_pcb_flags(td->td_pcb, PCB_FULL_IRET); } Index: user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_rsb.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_rsb.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_rsb.c (revision 307896) @@ -1,479 +1,498 @@ /*- * Copyright (c) 2016 Jared McNeill * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* - * Allwinner RSB (Reduced Serial Bus) + * Allwinner RSB (Reduced Serial Bus) and P2WI (Push-Pull Two Wire Interface) */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iicbus_if.h" #define RSB_CTRL 0x00 #define START_TRANS (1 << 7) #define GLOBAL_INT_ENB (1 << 1) #define SOFT_RESET (1 << 0) #define RSB_CCR 0x04 #define RSB_INTE 0x08 #define RSB_INTS 0x0c #define INT_TRANS_ERR_ID(x) (((x) >> 8) & 0xf) #define INT_LOAD_BSY (1 << 2) #define INT_TRANS_ERR (1 << 1) #define INT_TRANS_OVER (1 << 0) #define INT_MASK (INT_LOAD_BSY|INT_TRANS_ERR|INT_TRANS_OVER) #define RSB_DADDR0 0x10 #define RSB_DADDR1 0x14 #define RSB_DLEN 0x18 #define DLEN_READ (1 << 4) #define RSB_DATA0 0x1c #define RSB_DATA1 0x20 #define RSB_CMD 0x2c #define CMD_SRTA 0xe8 #define CMD_RD8 0x8b #define CMD_RD16 0x9c #define CMD_RD32 0xa6 #define CMD_WR8 0x4e #define CMD_WR16 0x59 #define CMD_WR32 0x63 #define RSB_DAR 0x30 #define DAR_RTA (0xff << 16) #define DAR_RTA_SHIFT 16 #define DAR_DA (0xffff << 0) #define DAR_DA_SHIFT 0 #define RSB_MAXLEN 8 #define RSB_RESET_RETRY 100 #define RSB_I2C_TIMEOUT hz #define RSB_ADDR_PMIC_PRIMARY 0x3a3 #define RSB_ADDR_PMIC_SECONDARY 0x745 #define RSB_ADDR_PERIPH_IC 0xe89 +#define A31_P2WI 1 +#define A23_RSB 2 + static struct ofw_compat_data compat_data[] = { - { "allwinner,sun8i-a23-rsb", 1 }, + { "allwinner,sun6i-a31-p2wi", A31_P2WI }, + { "allwinner,sun8i-a23-rsb", A23_RSB }, { NULL, 0 } }; static struct resource_spec rsb_spec[] = { { SYS_RES_MEMORY, 0, RF_ACTIVE }, { -1, 0 } }; /* * Device address to Run-time address mappings. * * Run-time address (RTA) is an 8-bit value used to address the device during * a read or write transaction. The following are valid RTAs: * 0x17 0x2d 0x3a 0x4e 0x59 0x63 0x74 0x8b 0x9c 0xa6 0xb1 0xc5 0xd2 0xe8 0xff * * Allwinner uses RTA 0x2d for the primary PMIC, 0x3a for the secondary PMIC, * and 0x4e for the peripheral IC (where applicable). */ static const struct { uint16_t addr; uint8_t rta; } rsb_rtamap[] = { { .addr = RSB_ADDR_PMIC_PRIMARY, .rta = 0x2d }, { .addr = RSB_ADDR_PMIC_SECONDARY, .rta = 0x3a }, { .addr = RSB_ADDR_PERIPH_IC, .rta = 0x4e }, { .addr = 0, .rta = 0 } }; struct rsb_softc { struct resource *res; struct mtx mtx; clk_t clk; hwreset_t rst; device_t iicbus; int busy; uint32_t status; uint16_t cur_addr; + int type; struct iic_msg *msg; }; #define RSB_LOCK(sc) mtx_lock(&(sc)->mtx) #define RSB_UNLOCK(sc) mtx_unlock(&(sc)->mtx) #define RSB_ASSERT_LOCKED(sc) mtx_assert(&(sc)->mtx, MA_OWNED) #define RSB_READ(sc, reg) bus_read_4((sc)->res, (reg)) #define RSB_WRITE(sc, reg, val) bus_write_4((sc)->res, (reg), (val)) static phandle_t rsb_get_node(device_t bus, device_t dev) { return (ofw_bus_get_node(bus)); } static int rsb_reset(device_t dev, u_char speed, u_char addr, u_char *oldaddr) { struct rsb_softc *sc; int retry; sc = device_get_softc(dev); RSB_LOCK(sc); /* Write soft-reset bit and wait for it to self-clear. */ RSB_WRITE(sc, RSB_CTRL, SOFT_RESET); for (retry = RSB_RESET_RETRY; retry > 0; retry--) if ((RSB_READ(sc, RSB_CTRL) & SOFT_RESET) == 0) break; RSB_UNLOCK(sc); if (retry == 0) { device_printf(dev, "soft reset timeout\n"); return (ETIMEDOUT); } return (IIC_ENOADDR); } static uint32_t rsb_encode(const uint8_t *buf, u_int len, u_int off) { uint32_t val; u_int n; val = 0; for (n = off; n < MIN(len, 4 + off); n++) val |= ((uint32_t)buf[n] << ((n - off) * NBBY)); return val; } static void rsb_decode(const uint32_t val, uint8_t *buf, u_int len, u_int off) { u_int n; for (n = off; n < MIN(len, 4 + off); n++) buf[n] = (val >> ((n - off) * NBBY)) & 0xff; } static int rsb_start(device_t dev) { struct rsb_softc *sc; int error, retry; sc = device_get_softc(dev); RSB_ASSERT_LOCKED(sc); /* Start the transfer */ RSB_WRITE(sc, RSB_CTRL, GLOBAL_INT_ENB | START_TRANS); /* Wait for transfer to complete */ error = ETIMEDOUT; for (retry = RSB_I2C_TIMEOUT; retry > 0; retry--) { sc->status |= RSB_READ(sc, RSB_INTS); if ((sc->status & INT_TRANS_OVER) != 0) { error = 0; break; } DELAY((1000 * hz) / RSB_I2C_TIMEOUT); } if (error == 0 && (sc->status & INT_TRANS_OVER) == 0) { device_printf(dev, "transfer error, status 0x%08x\n", sc->status); error = EIO; } return (error); } static int rsb_set_rta(device_t dev, uint16_t addr) { struct rsb_softc *sc; uint8_t rta; int i; sc = device_get_softc(dev); RSB_ASSERT_LOCKED(sc); /* Lookup run-time address for given device address */ for (rta = 0, i = 0; rsb_rtamap[i].rta != 0; i++) if (rsb_rtamap[i].addr == addr) { rta = rsb_rtamap[i].rta; break; } if (rta == 0) { device_printf(dev, "RTA not known for address %#x\n", addr); return (ENXIO); } /* Set run-time address */ RSB_WRITE(sc, RSB_INTS, RSB_READ(sc, RSB_INTS)); RSB_WRITE(sc, RSB_DAR, (addr << DAR_DA_SHIFT) | (rta << DAR_RTA_SHIFT)); RSB_WRITE(sc, RSB_CMD, CMD_SRTA); return (rsb_start(dev)); } static int rsb_transfer(device_t dev, struct iic_msg *msgs, uint32_t nmsgs) { struct rsb_softc *sc; uint32_t daddr[2], data[2], dlen; uint16_t device_addr; uint8_t cmd; int error; sc = device_get_softc(dev); /* - * RSB is not really an I2C or SMBus controller, so there are some - * restrictions imposed by the driver. + * P2WI and RSB are not really I2C or SMBus controllers, so there are + * some restrictions imposed by the driver. * * Transfers must contain exactly two messages. The first is always * a write, containing a single data byte offset. Data will either * be read from or written to the corresponding data byte in the * second message. The slave address in both messages must be the * same. */ if (nmsgs != 2 || (msgs[0].flags & IIC_M_RD) == IIC_M_RD || (msgs[0].slave >> 1) != (msgs[1].slave >> 1) || msgs[0].len != 1 || msgs[1].len > RSB_MAXLEN) return (EINVAL); - /* The controller can read or write 1, 2, or 4 bytes at a time. */ - if ((msgs[1].flags & IIC_M_RD) != 0) { - switch (msgs[1].len) { - case 1: - cmd = CMD_RD8; - break; - case 2: - cmd = CMD_RD16; - break; - case 4: - cmd = CMD_RD32; - break; - default: - return (EINVAL); + /* The RSB controller can read or write 1, 2, or 4 bytes at a time. */ + if (sc->type == A23_RSB) { + if ((msgs[1].flags & IIC_M_RD) != 0) { + switch (msgs[1].len) { + case 1: + cmd = CMD_RD8; + break; + case 2: + cmd = CMD_RD16; + break; + case 4: + cmd = CMD_RD32; + break; + default: + return (EINVAL); + } + } else { + switch (msgs[1].len) { + case 1: + cmd = CMD_WR8; + break; + case 2: + cmd = CMD_WR16; + break; + case 4: + cmd = CMD_WR32; + break; + default: + return (EINVAL); + } } - } else { - switch (msgs[1].len) { - case 1: - cmd = CMD_WR8; - break; - case 2: - cmd = CMD_WR16; - break; - case 4: - cmd = CMD_WR32; - break; - default: - return (EINVAL); - } } RSB_LOCK(sc); while (sc->busy) mtx_sleep(sc, &sc->mtx, 0, "i2cbuswait", 0); sc->busy = 1; sc->status = 0; /* Select current run-time address if necessary */ - device_addr = msgs[0].slave >> 1; - if (sc->cur_addr != device_addr) { - error = rsb_set_rta(dev, device_addr); - if (error != 0) - goto done; - sc->cur_addr = device_addr; - sc->status = 0; + if (sc->type == A23_RSB) { + device_addr = msgs[0].slave >> 1; + if (sc->cur_addr != device_addr) { + error = rsb_set_rta(dev, device_addr); + if (error != 0) + goto done; + sc->cur_addr = device_addr; + sc->status = 0; + } } /* Clear interrupt status */ RSB_WRITE(sc, RSB_INTS, RSB_READ(sc, RSB_INTS)); /* Program data access address registers */ daddr[0] = rsb_encode(msgs[0].buf, msgs[0].len, 0); RSB_WRITE(sc, RSB_DADDR0, daddr[0]); /* Write data */ if ((msgs[1].flags & IIC_M_RD) == 0) { data[0] = rsb_encode(msgs[1].buf, msgs[1].len, 0); RSB_WRITE(sc, RSB_DATA0, data[0]); } - /* Set command type */ - RSB_WRITE(sc, RSB_CMD, cmd); + /* Set command type for RSB */ + if (sc->type == A23_RSB) + RSB_WRITE(sc, RSB_CMD, cmd); /* Program data length register and transfer direction */ dlen = msgs[0].len - 1; if ((msgs[1].flags & IIC_M_RD) == IIC_M_RD) dlen |= DLEN_READ; RSB_WRITE(sc, RSB_DLEN, dlen); /* Start transfer */ error = rsb_start(dev); if (error != 0) goto done; /* Read data */ if ((msgs[1].flags & IIC_M_RD) == IIC_M_RD) { data[0] = RSB_READ(sc, RSB_DATA0); rsb_decode(data[0], msgs[1].buf, msgs[1].len, 0); } done: sc->msg = NULL; sc->busy = 0; wakeup(sc); RSB_UNLOCK(sc); return (error); } static int rsb_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); - if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0) + switch (ofw_bus_search_compatible(dev, compat_data)->ocd_data) { + case A23_RSB: + device_set_desc(dev, "Allwinner RSB"); + break; + case A31_P2WI: + device_set_desc(dev, "Allwinner P2WI"); + break; + default: return (ENXIO); + } - device_set_desc(dev, "Allwinner RSB"); return (BUS_PROBE_DEFAULT); } static int rsb_attach(device_t dev) { struct rsb_softc *sc; int error; sc = device_get_softc(dev); mtx_init(&sc->mtx, device_get_nameunit(dev), "rsb", MTX_DEF); + + sc->type = ofw_bus_search_compatible(dev, compat_data)->ocd_data; if (clk_get_by_ofw_index(dev, 0, 0, &sc->clk) == 0) { error = clk_enable(sc->clk); if (error != 0) { device_printf(dev, "cannot enable clock\n"); goto fail; } } if (hwreset_get_by_ofw_idx(dev, 0, 0, &sc->rst) == 0) { error = hwreset_deassert(sc->rst); if (error != 0) { device_printf(dev, "cannot de-assert reset\n"); goto fail; } } if (bus_alloc_resources(dev, rsb_spec, &sc->res) != 0) { device_printf(dev, "cannot allocate resources for device\n"); error = ENXIO; goto fail; } sc->iicbus = device_add_child(dev, "iicbus", -1); if (sc->iicbus == NULL) { device_printf(dev, "cannot add iicbus child device\n"); error = ENXIO; goto fail; } bus_generic_attach(dev); return (0); fail: bus_release_resources(dev, rsb_spec, &sc->res); if (sc->rst != NULL) hwreset_release(sc->rst); if (sc->clk != NULL) clk_release(sc->clk); mtx_destroy(&sc->mtx); return (error); } static device_method_t rsb_methods[] = { /* Device interface */ DEVMETHOD(device_probe, rsb_probe), DEVMETHOD(device_attach, rsb_attach), /* Bus interface */ DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource), DEVMETHOD(bus_release_resource, bus_generic_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_adjust_resource, bus_generic_adjust_resource), DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource), DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource), /* OFW methods */ DEVMETHOD(ofw_bus_get_node, rsb_get_node), /* iicbus interface */ DEVMETHOD(iicbus_callback, iicbus_null_callback), DEVMETHOD(iicbus_reset, rsb_reset), DEVMETHOD(iicbus_transfer, rsb_transfer), DEVMETHOD_END }; static driver_t rsb_driver = { "iichb", rsb_methods, sizeof(struct rsb_softc), }; static devclass_t rsb_devclass; EARLY_DRIVER_MODULE(iicbus, rsb, iicbus_driver, iicbus_devclass, 0, 0, BUS_PASS_RESOURCE + BUS_PASS_ORDER_MIDDLE); EARLY_DRIVER_MODULE(rsb, simplebus, rsb_driver, rsb_devclass, 0, 0, BUS_PASS_RESOURCE + BUS_PASS_ORDER_MIDDLE); MODULE_VERSION(rsb, 1); Index: user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_thermal.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_thermal.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_thermal.c (revision 307896) @@ -1,561 +1,574 @@ /*- * Copyright (c) 2016 Jared McNeill * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Allwinner thermal sensor controller */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include "cpufreq_if.h" #define THS_CTRL0 0x00 #define THS_CTRL1 0x04 #define ADC_CALI_EN (1 << 17) #define THS_CTRL2 0x40 #define SENSOR_ACQ1_SHIFT 16 #define SENSOR2_EN (1 << 2) #define SENSOR1_EN (1 << 1) #define SENSOR0_EN (1 << 0) #define THS_INTC 0x44 #define THS_INTS 0x48 #define THS2_DATA_IRQ_STS (1 << 10) #define THS1_DATA_IRQ_STS (1 << 9) #define THS0_DATA_IRQ_STS (1 << 8) #define SHUT_INT2_STS (1 << 6) #define SHUT_INT1_STS (1 << 5) #define SHUT_INT0_STS (1 << 4) #define ALARM_INT2_STS (1 << 2) #define ALARM_INT1_STS (1 << 1) #define ALARM_INT0_STS (1 << 0) #define THS_ALARM0_CTRL 0x50 #define ALARM_T_HOT_MASK 0xfff #define ALARM_T_HOT_SHIFT 16 #define ALARM_T_HYST_MASK 0xfff #define ALARM_T_HYST_SHIFT 0 #define THS_SHUTDOWN0_CTRL 0x60 #define SHUT_T_HOT_MASK 0xfff #define SHUT_T_HOT_SHIFT 16 #define THS_FILTER 0x70 #define THS_CALIB0 0x74 #define THS_CALIB1 0x78 #define THS_DATA0 0x80 #define THS_DATA1 0x84 #define THS_DATA2 0x88 #define DATA_MASK 0xfff #define A83T_ADC_ACQUIRE_TIME 0x17 #define A83T_FILTER 0x4 #define A83T_INTC 0x1000 #define A83T_TEMP_BASE 2719000 #define A83T_TEMP_MUL 1000 #define A83T_TEMP_DIV 14186 #define A83T_CLK_RATE 24000000 #define A64_ADC_ACQUIRE_TIME 0x190 #define A64_FILTER 0x6 #define A64_INTC 0x18000 #define A64_TEMP_BASE 2170000 #define A64_TEMP_MUL 1000 #define A64_TEMP_DIV 8560 #define A64_CLK_RATE 4000000 #define H3_ADC_ACQUIRE_TIME 0x3f #define H3_FILTER 0x6 #define H3_INTC 0x191000 #define H3_TEMP_BASE 2794000 #define H3_TEMP_MUL 1000 #define H3_TEMP_DIV -14882 #define H3_CLK_RATE 4000000 #define TEMP_C_TO_K 273 #define SENSOR_ENABLE_ALL (SENSOR0_EN|SENSOR1_EN|SENSOR2_EN) #define SHUT_INT_ALL (SHUT_INT0_STS|SHUT_INT1_STS|SHUT_INT2_STS) #define ALARM_INT_ALL (ALARM_INT0_STS) #define MAX_SENSORS 3 #define MAX_CF_LEVELS 64 #define THROTTLE_ENABLE_DEFAULT 1 /* Enable thermal throttling */ static int aw_thermal_throttle_enable = THROTTLE_ENABLE_DEFAULT; TUNABLE_INT("hw.aw_thermal.throttle_enable", &aw_thermal_throttle_enable); struct aw_thermal_sensor { const char *name; const char *desc; }; struct aw_thermal_config { struct aw_thermal_sensor sensors[MAX_SENSORS]; int nsensors; uint64_t clk_rate; uint32_t adc_acquire_time; int adc_cali_en; uint32_t filter; uint32_t intc; int (*to_temp)(uint32_t); int temp_base; int temp_mul; int temp_div; int calib0, calib1; uint32_t calib0_mask, calib1_mask; }; static int a83t_to_temp(uint32_t val) { return ((A83T_TEMP_BASE - (val * A83T_TEMP_MUL)) / A83T_TEMP_DIV); } static const struct aw_thermal_config a83t_config = { .nsensors = 3, .sensors = { [0] = { .name = "cluster0", .desc = "CPU cluster 0 temperature", }, [1] = { .name = "cluster1", .desc = "CPU cluster 1 temperature", }, [2] = { .name = "gpu", .desc = "GPU temperature", }, }, .clk_rate = A83T_CLK_RATE, .adc_acquire_time = A83T_ADC_ACQUIRE_TIME, .adc_cali_en = 1, .filter = A83T_FILTER, .intc = A83T_INTC, .to_temp = a83t_to_temp, .calib0 = 1, .calib0_mask = 0xffffffff, .calib1 = 1, .calib1_mask = 0xffffffff, }; static int a64_to_temp(uint32_t val) { return ((A64_TEMP_BASE - (val * A64_TEMP_MUL)) / A64_TEMP_DIV); } static const struct aw_thermal_config a64_config = { .nsensors = 3, .sensors = { [0] = { .name = "cpu", .desc = "CPU temperature", }, [1] = { .name = "gpu1", .desc = "GPU temperature 1", }, [2] = { .name = "gpu2", .desc = "GPU temperature 2", }, }, .clk_rate = A64_CLK_RATE, .adc_acquire_time = A64_ADC_ACQUIRE_TIME, .filter = A64_FILTER, .intc = A64_INTC, .to_temp = a64_to_temp, }; static int h3_to_temp(uint32_t val) { return (((int)(val * H3_TEMP_MUL) - H3_TEMP_BASE) / H3_TEMP_DIV); } static const struct aw_thermal_config h3_config = { .nsensors = 1, .sensors = { [0] = { .name = "cpu", .desc = "CPU temperature", }, }, .clk_rate = H3_CLK_RATE, .adc_acquire_time = H3_ADC_ACQUIRE_TIME, .filter = H3_FILTER, .intc = H3_INTC, .to_temp = h3_to_temp, .calib0 = 1, .calib0_mask = 0xfff, }; static struct ofw_compat_data compat_data[] = { { "allwinner,sun8i-a83t-ts", (uintptr_t)&a83t_config }, { "allwinner,sun8i-h3-ts", (uintptr_t)&h3_config }, { "allwinner,sun50i-a64-ts", (uintptr_t)&a64_config }, { NULL, (uintptr_t)NULL } }; #define THS_CONF(d) \ (void *)ofw_bus_search_compatible((d), compat_data)->ocd_data struct aw_thermal_softc { device_t dev; struct resource *res[2]; struct aw_thermal_config *conf; + struct task cf_task; int throttle; int min_freq; struct cf_level levels[MAX_CF_LEVELS]; eventhandler_tag cf_pre_tag; }; static struct resource_spec aw_thermal_spec[] = { { SYS_RES_MEMORY, 0, RF_ACTIVE }, { SYS_RES_IRQ, 0, RF_ACTIVE }, { -1, 0 } }; #define RD4(sc, reg) bus_read_4((sc)->res[0], (reg)) #define WR4(sc, reg, val) bus_write_4((sc)->res[0], (reg), (val)) static int aw_thermal_init(struct aw_thermal_softc *sc) { uint32_t calib0, calib1; int error; if (sc->conf->calib0 != 0 || sc->conf->calib1 != 0) { /* Read calibration settings from SRAM */ error = aw_sid_read_tscalib(&calib0, &calib1); if (error != 0) return (error); calib0 &= sc->conf->calib0_mask; calib1 &= sc->conf->calib1_mask; /* Write calibration settings to thermal controller */ if (sc->conf->calib0 != 0 && calib0 != 0) WR4(sc, THS_CALIB0, calib0); if (sc->conf->calib1 != 0 && calib1 != 0) WR4(sc, THS_CALIB1, calib1); } /* Configure ADC acquire time (CLK_IN/(N+1)) and enable sensors */ WR4(sc, THS_CTRL1, ADC_CALI_EN); WR4(sc, THS_CTRL0, sc->conf->adc_acquire_time); WR4(sc, THS_CTRL2, sc->conf->adc_acquire_time << SENSOR_ACQ1_SHIFT); /* Enable average filter */ WR4(sc, THS_FILTER, sc->conf->filter); /* Enable interrupts */ WR4(sc, THS_INTS, RD4(sc, THS_INTS)); WR4(sc, THS_INTC, sc->conf->intc | SHUT_INT_ALL | ALARM_INT_ALL); /* Enable sensors */ WR4(sc, THS_CTRL2, RD4(sc, THS_CTRL2) | SENSOR_ENABLE_ALL); return (0); } static int aw_thermal_gettemp(struct aw_thermal_softc *sc, int sensor) { uint32_t val; val = RD4(sc, THS_DATA0 + (sensor * 4)); return (sc->conf->to_temp(val) + TEMP_C_TO_K); } static int aw_thermal_getshut(struct aw_thermal_softc *sc, int sensor) { uint32_t val; val = RD4(sc, THS_SHUTDOWN0_CTRL + (sensor * 4)); val = (val >> SHUT_T_HOT_SHIFT) & SHUT_T_HOT_MASK; return (sc->conf->to_temp(val) + TEMP_C_TO_K); } static int aw_thermal_gethyst(struct aw_thermal_softc *sc, int sensor) { uint32_t val; val = RD4(sc, THS_ALARM0_CTRL + (sensor * 4)); val = (val >> ALARM_T_HYST_SHIFT) & ALARM_T_HYST_MASK; return (sc->conf->to_temp(val) + TEMP_C_TO_K); } static int aw_thermal_getalarm(struct aw_thermal_softc *sc, int sensor) { uint32_t val; val = RD4(sc, THS_ALARM0_CTRL + (sensor * 4)); val = (val >> ALARM_T_HOT_SHIFT) & ALARM_T_HOT_MASK; return (sc->conf->to_temp(val) + TEMP_C_TO_K); } static int aw_thermal_sysctl(SYSCTL_HANDLER_ARGS) { struct aw_thermal_softc *sc; int sensor, val; sc = arg1; sensor = arg2; val = aw_thermal_gettemp(sc, sensor); return sysctl_handle_opaque(oidp, &val, sizeof(val), req); } static void aw_thermal_throttle(struct aw_thermal_softc *sc, int enable) { device_t cf_dev; int count, error; if (enable == sc->throttle) return; if (enable != 0) { /* Set the lowest available frequency */ cf_dev = devclass_get_device(devclass_find("cpufreq"), 0); if (cf_dev == NULL) return; count = MAX_CF_LEVELS; error = CPUFREQ_LEVELS(cf_dev, sc->levels, &count); if (error != 0 || count == 0) return; sc->min_freq = sc->levels[count - 1].total_set.freq; error = CPUFREQ_SET(cf_dev, &sc->levels[count - 1], CPUFREQ_PRIO_USER); if (error != 0) return; } sc->throttle = enable; } static void +aw_thermal_cf_task(void *arg, int pending) +{ + struct aw_thermal_softc *sc; + + sc = arg; + + aw_thermal_throttle(sc, 1); +} + +static void aw_thermal_cf_pre_change(void *arg, const struct cf_level *level, int *status) { struct aw_thermal_softc *sc; int temp_cur, temp_alarm; sc = arg; if (aw_thermal_throttle_enable == 0 || sc->throttle == 0 || level->total_set.freq == sc->min_freq) return; temp_cur = aw_thermal_gettemp(sc, 0); temp_alarm = aw_thermal_getalarm(sc, 0); if (temp_cur < temp_alarm) aw_thermal_throttle(sc, 0); else *status = ENXIO; } static void aw_thermal_intr(void *arg) { struct aw_thermal_softc *sc; device_t dev; uint32_t ints; dev = arg; sc = device_get_softc(dev); ints = RD4(sc, THS_INTS); WR4(sc, THS_INTS, ints); if ((ints & SHUT_INT_ALL) != 0) { device_printf(dev, "WARNING - current temperature exceeds safe limits\n"); shutdown_nice(RB_POWEROFF); } if ((ints & ALARM_INT_ALL) != 0) - aw_thermal_throttle(sc, 1); + taskqueue_enqueue(taskqueue_thread, &sc->cf_task); } static int aw_thermal_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); if (THS_CONF(dev) == NULL) return (ENXIO); device_set_desc(dev, "Allwinner Thermal Sensor Controller"); return (BUS_PROBE_DEFAULT); } static int aw_thermal_attach(device_t dev) { struct aw_thermal_softc *sc; clk_t clk_ahb, clk_ths; hwreset_t rst; int i, error; void *ih; sc = device_get_softc(dev); clk_ahb = clk_ths = NULL; rst = NULL; ih = NULL; sc->conf = THS_CONF(dev); + TASK_INIT(&sc->cf_task, 0, aw_thermal_cf_task, sc); if (bus_alloc_resources(dev, aw_thermal_spec, sc->res) != 0) { device_printf(dev, "cannot allocate resources for device\n"); return (ENXIO); } if (clk_get_by_ofw_name(dev, 0, "ahb", &clk_ahb) == 0) { error = clk_enable(clk_ahb); if (error != 0) { device_printf(dev, "cannot enable ahb clock\n"); goto fail; } } if (clk_get_by_ofw_name(dev, 0, "ths", &clk_ths) == 0) { error = clk_set_freq(clk_ths, sc->conf->clk_rate, 0); if (error != 0) { device_printf(dev, "cannot set ths clock rate\n"); goto fail; } error = clk_enable(clk_ths); if (error != 0) { device_printf(dev, "cannot enable ths clock\n"); goto fail; } } if (hwreset_get_by_ofw_idx(dev, 0, 0, &rst) == 0) { error = hwreset_deassert(rst); if (error != 0) { device_printf(dev, "cannot de-assert reset\n"); goto fail; } } error = bus_setup_intr(dev, sc->res[1], INTR_TYPE_MISC | INTR_MPSAFE, NULL, aw_thermal_intr, dev, &ih); if (error != 0) { device_printf(dev, "cannot setup interrupt handler\n"); goto fail; } if (aw_thermal_init(sc) != 0) goto fail; for (i = 0; i < sc->conf->nsensors; i++) SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, sc->conf->sensors[i].name, CTLTYPE_INT | CTLFLAG_RD, sc, i, aw_thermal_sysctl, "IK0", sc->conf->sensors[i].desc); if (bootverbose) for (i = 0; i < sc->conf->nsensors; i++) { device_printf(dev, "#%d: alarm %dC hyst %dC shut %dC\n", i, aw_thermal_getalarm(sc, i) - TEMP_C_TO_K, aw_thermal_gethyst(sc, i) - TEMP_C_TO_K, aw_thermal_getshut(sc, i) - TEMP_C_TO_K); } sc->cf_pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change, aw_thermal_cf_pre_change, sc, EVENTHANDLER_PRI_FIRST); return (0); fail: if (ih != NULL) bus_teardown_intr(dev, sc->res[1], ih); if (rst != NULL) hwreset_release(rst); if (clk_ahb != NULL) clk_release(clk_ahb); if (clk_ths != NULL) clk_release(clk_ths); bus_release_resources(dev, aw_thermal_spec, sc->res); return (ENXIO); } static device_method_t aw_thermal_methods[] = { /* Device interface */ DEVMETHOD(device_probe, aw_thermal_probe), DEVMETHOD(device_attach, aw_thermal_attach), DEVMETHOD_END }; static driver_t aw_thermal_driver = { "aw_thermal", aw_thermal_methods, sizeof(struct aw_thermal_softc), }; static devclass_t aw_thermal_devclass; DRIVER_MODULE(aw_thermal, simplebus, aw_thermal_driver, aw_thermal_devclass, 0, 0); MODULE_VERSION(aw_thermal, 1); Index: user/alc/PQ_LAUNDRY/sys/arm/allwinner/files.allwinner =================================================================== --- user/alc/PQ_LAUNDRY/sys/arm/allwinner/files.allwinner (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/arm/allwinner/files.allwinner (revision 307896) @@ -1,57 +1,57 @@ # $FreeBSD$ kern/kern_clocksource.c standard arm/allwinner/a10_ahci.c optional ahci arm/allwinner/a10_codec.c optional sound arm/allwinner/a10_common.c standard arm/allwinner/a10_dmac.c standard arm/allwinner/a10_ehci.c optional ehci arm/allwinner/aw_usbphy.c optional ehci arm/allwinner/a10_gpio.c optional gpio arm/allwinner/a10_mmc.c optional mmc arm/allwinner/a10_sramc.c standard arm/allwinner/aw_nmi.c optional intrng arm/allwinner/aw_if_dwc.c optional dwc -arm/allwinner/aw_rsb.c optional rsb +arm/allwinner/aw_rsb.c optional rsb | p2wi arm/allwinner/aw_rtc.c standard arm/allwinner/aw_ts.c standard arm/allwinner/aw_wdog.c standard arm/allwinner/aw_machdep.c standard arm/allwinner/aw_mp.c optional smp arm/allwinner/axp209.c optional axp209 arm/allwinner/axp81x.c optional axp81x arm/allwinner/if_awg.c optional awg arm/allwinner/if_emac.c optional emac arm/allwinner/sunxi_dma_if.m standard dev/iicbus/twsi/a10_twsi.c optional twsi dev/usb/controller/generic_ohci.c optional ohci dev/usb/controller/generic_usb_if.m optional ohci arm/allwinner/aw_sid.c standard arm/allwinner/aw_thermal.c standard dev/iicbus/sy8106a.c optional sy8106a #arm/allwinner/console.c standard arm/allwinner/a10_fb.c optional vt arm/allwinner/a10_hdmi.c optional hdmi arm/allwinner/a10_hdmiaudio.c optional hdmi sound arm/arm/hdmi_if.m optional hdmi arm/allwinner/aw_reset.c standard arm/allwinner/aw_ccu.c standard arm/allwinner/clk/aw_ahbclk.c standard arm/allwinner/clk/aw_apbclk.c standard arm/allwinner/clk/aw_axiclk.c standard arm/allwinner/clk/aw_codecclk.c standard arm/allwinner/clk/aw_cpuclk.c standard arm/allwinner/clk/aw_cpusclk.c standard arm/allwinner/clk/aw_debeclk.c standard arm/allwinner/clk/aw_gate.c standard arm/allwinner/clk/aw_gmacclk.c standard arm/allwinner/clk/aw_hdmiclk.c standard arm/allwinner/clk/aw_lcdclk.c standard arm/allwinner/clk/aw_modclk.c standard arm/allwinner/clk/aw_mmcclk.c standard arm/allwinner/clk/aw_oscclk.c standard arm/allwinner/clk/aw_pll.c standard arm/allwinner/clk/aw_thsclk.c standard arm/allwinner/clk/aw_usbclk.c standard Index: user/alc/PQ_LAUNDRY/sys/arm/conf/GENERIC =================================================================== --- user/alc/PQ_LAUNDRY/sys/arm/conf/GENERIC (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/arm/conf/GENERIC (revision 307896) @@ -1,176 +1,178 @@ # # GENERICV6 -- Generic(ish) kernel config. # # For more information on this file, please read the config(5) manual page, # and/or the handbook section on Kernel Configuration Files: # # http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the ../../conf/NOTES and NOTES files. # If you are in doubt as to the purpose or necessity of a line, check first # in NOTES. # # $FreeBSD$ ident GENERIC cpu CPU_CORTEXA_MP machine arm armv6 makeoptions CONF_CFLAGS="-march=armv7a" makeoptions KERNVIRTADDR=0xc0000000 options KERNVIRTADDR=0xc0000000 include "std.armv6" files "../allwinner/files.allwinner" files "../allwinner/a20/files.a20" files "../allwinner/a31/files.a31" files "../allwinner/a83t/files.a83t" files "../allwinner/h3/files.h3" files "../broadcom/bcm2835/files.bcm2836" files "../broadcom/bcm2835/files.bcm283x" files "../nvidia/tegra124/files.tegra124" files "../qemu/files.qemu" options SOC_ALLWINNER_A20 options SOC_ALLWINNER_A31 options SOC_ALLWINNER_A31S options SOC_ALLWINNER_A83T options SOC_ALLWINNER_H3 options SOC_BCM2836 options SCHED_ULE # ULE scheduler options SMP # Enable multiple cores options PLATFORM options PLATFORM_SMP options MULTIDELAY options LINUX_BOOT_ABI # EXT_RESOURCES pseudo devices options EXT_RESOURCES device clk device phy device hwreset device regulator # CPU frequency control device cpufreq # Interrupt controller options INTRNG device gic # ARM Generic Timer device generic_timer # MMC/SD/SDIO Card slot support device sdhci # SD controller device mmc # mmc/sd bus device mmcsd # mmc/sd flash cards # ATA controllers device ahci # AHCI-compatible SATA controllers #device ata # Legacy ATA/SATA controllers # PCI options NEW_PCIB device pci # PCI NICs device re # RealTek 8139C+/8169/8169S/8110S # VirtIO device virtio device virtio_mmio device virtio_blk device vtnet # Console and misc device uart device uart_ns8250 device uart_snps device pl011 device pty device snp device md # Memory "disks" device random # Entropy device device psci # I2C support device iicbus device iic device twsi -device rsb +device rsb # Allwinner Reduced Serial Bus +device p2wi # Allwinner Push-Pull Two Wire device axp209 # AXP209 Power Management Unit device axp81x # AXP813/818 Power Management Unit device bcm2835_bsc device icee +device sy8106a # SY8106A Buck Regulator # GPIO device gpio device gpioled device gpioregulator # SPI device spibus device bcm2835_spi device scbus # SCSI bus (required for ATA/SCSI) device da # Direct Access (disks) device cd # CD device pass # Passthrough device (direct ATA/SCSI access) # USB support options USB_HOST_ALIGN=64 # Align usb buffers to cache line size. device usb #device uhci device ohci device ehci device dwcotg # DWC OTG controller device umass # Disks/Mass storage - Requires scbus and da device uhid # "Human Interface Devices" device ukbd # Allow keyboard like HIDs to control console # Ethernet device loop device ether device vlan # 802.1Q VLAN support device mii device bpf #device emac # 10/100 integrated EMAC controller device dwc # 10/100/1000 integrated GMAC controller device awg # 10/100/1000 integrated EMAC controller # USB ethernet support, requires miibus device smcphy device smsc device miibus # Sound support device sound # Framebuffer support device vt device kbdmux device ums device videomode device hdmi device vchiq # Pinmux device fdt_pinctrl # Extensible Firmware Interface options EFI # Flattened Device Tree options FDT # Configure using FDT/DTB data makeoptions MODULES_EXTRA="dtb/allwinner dtb/nvidia dtb/rpi" Index: user/alc/PQ_LAUNDRY/sys/boot/efi/libefi/Makefile =================================================================== --- user/alc/PQ_LAUNDRY/sys/boot/efi/libefi/Makefile (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/boot/efi/libefi/Makefile (revision 307896) @@ -1,44 +1,45 @@ # $FreeBSD$ LIB= efi INTERNALLIB= WARNS?= 2 SRCS= delay.c devpath.c efi_console.c efinet.c efipart.c env.c errno.c \ handles.c libefi.c .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" SRCS+= time.c .elif ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "arm" SRCS+= time_event.c .endif # We implement a slightly non-standard %S in that it always takes a # CHAR16 that's common in UEFI-land instead of a wchar_t. This only # seems to matter on arm64 where wchar_t defaults to an int instead # of a short. There's no good cast to use here so just ignore the # warnings for now. CWARNFLAGS.efinet.c+= -Wno-format .if ${MACHINE_CPUARCH} == "aarch64" CFLAGS+= -msoft-float -mgeneral-regs-only .endif .if ${MACHINE_ARCH} == "amd64" CFLAGS+= -fPIC -mno-red-zone .endif +CFLAGS+= -I${.CURDIR}/../../ficl -I${.CURDIR}/../../ficl/${MACHINE} CFLAGS+= -I${.CURDIR}/../include CFLAGS+= -I${.CURDIR}/../include/${MACHINE} CFLAGS+= -I${.CURDIR}/../../../../lib/libstand # Pick up the bootstrap header for some interface items CFLAGS+= -I${.CURDIR}/../../common # Handle FreeBSD specific %b and %D printf format specifiers CFLAGS+= ${FORMAT_EXTENSIONS} # Do not use TERM_EMU on arm and arm64 as it doesn't behave well with serial console .if ${MACHINE_CPUARCH} != "arm" && ${MACHINE_CPUARCH} != "aarch64" CFLAGS+= -DTERM_EMU .endif .include Index: user/alc/PQ_LAUNDRY/sys/boot/efi/libefi/env.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/boot/efi/libefi/env.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/boot/efi/libefi/env.c (revision 307896) @@ -1,55 +1,234 @@ /* * Copyright (c) 2015 Netflix, Inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include +#include #include #include +#include +#include "bootstrap.h" +#include "ficl.h" +int efi_variable_support = 1; + /* * Simple wrappers to the underlying UEFI functions. * See http://wiki.phoenix.com/wiki/index.php/EFI_RUNTIME_SERVICES * for details. */ EFI_STATUS efi_get_next_variable_name(UINTN *variable_name_size, CHAR16 *variable_name, EFI_GUID *vendor_guid) { return RS->GetNextVariableName(variable_name_size, variable_name, vendor_guid); } EFI_STATUS efi_get_variable(CHAR16 *variable_name, EFI_GUID *vendor_guid, UINT32 *attributes, UINTN *data_size, void *data) { return RS->GetVariable(variable_name, vendor_guid, attributes, data_size, data); } EFI_STATUS efi_set_variable(CHAR16 *variable_name, EFI_GUID *vendor_guid, UINT32 attributes, UINTN data_size, void *data) { return RS->SetVariable(variable_name, vendor_guid, attributes, data_size, data); } + +/* + * FreeBSD's loader interaction words and extras + * + * efi-setenv ( value n name n guid n attr -- 0 | -1) + * efi-getenv ( guid n addr n -- addr' n' | -1 ) + * efi-unsetenv ( name n guid n'' -- ) + */ + +/* + * efi-setenv + * efi-setenv ( value n name n guid n attr -- 0 | -1) + * + * Set environment variables using the SetVariable EFI runtime service. + * + * Value and guid are passed through in binary form (so guid needs to be + * converted to binary form from its string form). Name is converted from + * ASCII to CHAR16. Since ficl doesn't have support for internationalization, + * there's no native CHAR16 interface provided. + * + * attr is an int in the bitmask of the following attributes for this variable. + * + * 1 Non volatile + * 2 Boot service access + * 4 Run time access + * (corresponding to the same bits in the UEFI spec). + */ +void +ficlEfiSetenv(FICL_VM *pVM) +{ +#ifndef TESTMAIN + char *value = NULL, *guid = NULL; + CHAR16 *name = NULL; + int i; +#endif + char *namep, *valuep, *guidp; + int names, values, guids, attr; + int status; + uuid_t u; + uint32_t ustatus; + +#if FICL_ROBUST > 1 + vmCheckStack(pVM, 6, 0); +#endif + attr = stackPopINT(pVM->pStack); + guids = stackPopINT(pVM->pStack); + guidp = (char*)stackPopPtr(pVM->pStack); + names = stackPopINT(pVM->pStack); + namep = (char*)stackPopPtr(pVM->pStack); + values = stackPopINT(pVM->pStack); + valuep = (char*)stackPopPtr(pVM->pStack); + +#ifndef TESTMAIN + guid = (char*)ficlMalloc(guids); + if (guid != NULL) + vmThrowErr(pVM, "Error: out of memory"); + memcpy(guid, guidp, guids); + uuid_from_string(guid, &u, &ustatus); + if (ustatus != uuid_s_ok) { + stackPushINT(pVM->pStack, -1); + goto out; + } + + name = (CHAR16 *)ficlMalloc((names + 1) * sizeof(CHAR16)); + if (name == NULL) + vmThrowErr(pVM, "Error: out of memory"); + for (i = 0; i < names; i++) + name[i] = namep[i]; + name[names] = (CHAR16)0; + + value = (char*)ficlMalloc(values + 1); + if (value != NULL) + vmThrowErr(pVM, "Error: out of memory"); + memcpy(value, valuep, values); + + status = efi_set_variable(name, (EFI_GUID *)&u, attr, values, value); + if (status == EFI_SUCCESS) + stackPushINT(pVM->pStack, 0); + else + stackPushINT(pVM->pStack, -1); +out: + ficlFree(name); + ficlFree(value); + ficlFree(guid); +#endif + + return; +} + +void +ficlEfiGetenv(FICL_VM *pVM) +{ +#ifndef TESTMAIN + char *name, *value; +#endif + char *namep; + int names; + +#if FICL_ROBUST > 1 + vmCheckStack(pVM, 2, 2); +#endif + names = stackPopINT(pVM->pStack); + namep = (char*) stackPopPtr(pVM->pStack); + +#ifndef TESTMAIN + name = (char*) ficlMalloc(names+1); + if (!name) + vmThrowErr(pVM, "Error: out of memory"); + strncpy(name, namep, names); + name[names] = '\0'; + + value = getenv(name); + ficlFree(name); + + if(value != NULL) { + stackPushPtr(pVM->pStack, value); + stackPushINT(pVM->pStack, strlen(value)); + } else +#endif + stackPushINT(pVM->pStack, -1); + + return; +} + +void +ficlEfiUnsetenv(FICL_VM *pVM) +{ +#ifndef TESTMAIN + char *name; +#endif + char *namep; + int names; + +#if FICL_ROBUST > 1 + vmCheckStack(pVM, 2, 0); +#endif + names = stackPopINT(pVM->pStack); + namep = (char*) stackPopPtr(pVM->pStack); + +#ifndef TESTMAIN + name = (char*) ficlMalloc(names+1); + if (!name) + vmThrowErr(pVM, "Error: out of memory"); + strncpy(name, namep, names); + name[names] = '\0'; + + unsetenv(name); + ficlFree(name); +#endif + + return; +} + +/************************************************************************** +** Add FreeBSD UEFI platform extensions into the system dictionary +**************************************************************************/ +void ficlEfiCompilePlatform(FICL_SYSTEM *pSys) +{ + FICL_DICT *dp = pSys->dp; + assert (dp); + + dictAppendWord(dp, "efi-setenv", ficlEfiSetenv, FW_DEFAULT); + dictAppendWord(dp, "efi-getenv", ficlEfiGetenv, FW_DEFAULT); + dictAppendWord(dp, "efi-unsetenv", ficlEfiUnsetenv, FW_DEFAULT); + + /* Would like to export the EFI version, but this will do for now */ + ficlSetEnv(pSys, "efi-boot", 1); + + return; +} + +FICL_COMPILE_SET(ficlEfiCompilePlatform); Index: user/alc/PQ_LAUNDRY/sys/boot/efi/loader/main.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/boot/efi/loader/main.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/boot/efi/loader/main.c (revision 307896) @@ -1,1075 +1,1079 @@ /*- * Copyright (c) 2008-2010 Rui Paulo * Copyright (c) 2006 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #ifdef EFI_ZFS_BOOT #include #endif #include "loader_efi.h" extern char bootprog_name[]; extern char bootprog_rev[]; extern char bootprog_date[]; extern char bootprog_maker[]; +/* Force a reference to bring in EFI support from the library */ +extern int efi_variable_support; +int *dummy1 = &efi_variable_support; + struct arch_switch archsw; /* MI/MD interface boundary */ EFI_GUID acpi = ACPI_TABLE_GUID; EFI_GUID acpi20 = ACPI_20_TABLE_GUID; EFI_GUID devid = DEVICE_PATH_PROTOCOL; EFI_GUID imgid = LOADED_IMAGE_PROTOCOL; EFI_GUID mps = MPS_TABLE_GUID; EFI_GUID netid = EFI_SIMPLE_NETWORK_PROTOCOL; EFI_GUID smbios = SMBIOS_TABLE_GUID; EFI_GUID dxe = DXE_SERVICES_TABLE_GUID; EFI_GUID hoblist = HOB_LIST_TABLE_GUID; EFI_GUID memtype = MEMORY_TYPE_INFORMATION_TABLE_GUID; EFI_GUID debugimg = DEBUG_IMAGE_INFO_TABLE_GUID; EFI_GUID fdtdtb = FDT_TABLE_GUID; EFI_GUID inputid = SIMPLE_TEXT_INPUT_PROTOCOL; #ifdef EFI_ZFS_BOOT static void efi_zfs_probe(void); #endif /* * cpy8to16 copies a traditional C string into a CHAR16 string and * 0 terminates it. len is the size of *dst in bytes. */ static void cpy8to16(const char *src, CHAR16 *dst, size_t len) { len <<= 1; /* Assume CHAR16 is 2 bytes */ while (len > 0 && *src) { *dst++ = *src++; len--; } *dst++ = (CHAR16)0; } static void cpy16to8(const CHAR16 *src, char *dst, size_t len) { size_t i; for (i = 0; i < len && src[i]; i++) dst[i] = (char)src[i]; if (i < len) dst[i] = '\0'; } static int has_keyboard(void) { EFI_STATUS status; EFI_DEVICE_PATH *path; EFI_HANDLE *hin, *hin_end, *walker; UINTN sz; int retval = 0; /* * Find all the handles that support the SIMPLE_TEXT_INPUT_PROTOCOL and * do the typical dance to get the right sized buffer. */ sz = 0; hin = NULL; status = BS->LocateHandle(ByProtocol, &inputid, 0, &sz, 0); if (status == EFI_BUFFER_TOO_SMALL) { hin = (EFI_HANDLE *)malloc(sz); status = BS->LocateHandle(ByProtocol, &inputid, 0, &sz, hin); if (EFI_ERROR(status)) free(hin); } if (EFI_ERROR(status)) return retval; /* * Look at each of the handles. If it supports the device path protocol, * use it to get the device path for this handle. Then see if that * device path matches either the USB device path for keyboards or the * legacy device path for keyboards. */ hin_end = &hin[sz / sizeof(*hin)]; for (walker = hin; walker < hin_end; walker++) { status = BS->HandleProtocol(*walker, &devid, (VOID **)&path); if (EFI_ERROR(status)) continue; while (!IsDevicePathEnd(path)) { /* * Check for the ACPI keyboard node. All PNP3xx nodes * are keyboards of different flavors. Note: It is * unclear of there's always a keyboard node when * there's a keyboard controller, or if there's only one * when a keyboard is detected at boot. */ if (DevicePathType(path) == ACPI_DEVICE_PATH && (DevicePathSubType(path) == ACPI_DP || DevicePathSubType(path) == ACPI_EXTENDED_DP)) { ACPI_HID_DEVICE_PATH *acpi; acpi = (ACPI_HID_DEVICE_PATH *)(void *)path; if ((EISA_ID_TO_NUM(acpi->HID) & 0xff00) == 0x300 && (acpi->HID & 0xffff) == PNP_EISA_ID_CONST) { retval = 1; goto out; } /* * Check for USB keyboard node, if present. Unlike a * PS/2 keyboard, these definitely only appear when * connected to the system. */ } else if (DevicePathType(path) == MESSAGING_DEVICE_PATH && DevicePathSubType(path) == MSG_USB_CLASS_DP) { USB_CLASS_DEVICE_PATH *usb; usb = (USB_CLASS_DEVICE_PATH *)(void *)path; if (usb->DeviceClass == 3 && /* HID */ usb->DeviceSubClass == 1 && /* Boot devices */ usb->DeviceProtocol == 1) { /* Boot keyboards */ retval = 1; goto out; } } path = NextDevicePathNode(path); } } out: free(hin); return retval; } static int find_currdev(EFI_LOADED_IMAGE *img, struct devsw **dev, int *unit, uint64_t *extra) { EFI_DEVICE_PATH *devpath, *copy; EFI_HANDLE h; /* * Try the device handle from our loaded image first. If that * fails, use the device path from the loaded image and see if * any of the nodes in that path match one of the enumerated * handles. */ if (efi_handle_lookup(img->DeviceHandle, dev, unit, extra) == 0) return (0); copy = NULL; devpath = efi_lookup_image_devpath(IH); while (devpath != NULL) { h = efi_devpath_handle(devpath); if (h == NULL) break; if (efi_handle_lookup(h, dev, unit, extra) == 0) { if (copy != NULL) free(copy); return (0); } if (copy != NULL) free(copy); devpath = efi_lookup_devpath(h); if (devpath != NULL) { copy = efi_devpath_trim(devpath); devpath = copy; } } return (ENOENT); } EFI_STATUS main(int argc, CHAR16 *argv[]) { char var[128]; EFI_LOADED_IMAGE *img; EFI_GUID *guid; int i, j, vargood, unit, howto; struct devsw *dev; uint64_t pool_guid; UINTN k; int has_kbd; char buf[40]; archsw.arch_autoload = efi_autoload; archsw.arch_getdev = efi_getdev; archsw.arch_copyin = efi_copyin; archsw.arch_copyout = efi_copyout; archsw.arch_readin = efi_readin; #ifdef EFI_ZFS_BOOT /* Note this needs to be set before ZFS init. */ archsw.arch_zfs_probe = efi_zfs_probe; #endif /* Init the time source */ efi_time_init(); has_kbd = has_keyboard(); /* * XXX Chicken-and-egg problem; we want to have console output * early, but some console attributes may depend on reading from * eg. the boot device, which we can't do yet. We can use * printf() etc. once this is done. */ cons_probe(); /* * Initialise the block cache. Set the upper limit. */ bcache_init(32768, 512); /* * Parse the args to set the console settings, etc * boot1.efi passes these in, if it can read /boot.config or /boot/config * or iPXE may be setup to pass these in. * * Loop through the args, and for each one that contains an '=' that is * not the first character, add it to the environment. This allows * loader and kernel env vars to be passed on the command line. Convert * args from UCS-2 to ASCII (16 to 8 bit) as they are copied. */ howto = 0; for (i = 1; i < argc; i++) { if (argv[i][0] == '-') { for (j = 1; argv[i][j] != 0; j++) { int ch; ch = argv[i][j]; switch (ch) { case 'a': howto |= RB_ASKNAME; break; case 'd': howto |= RB_KDB; break; case 'D': howto |= RB_MULTIPLE; break; case 'h': howto |= RB_SERIAL; break; case 'm': howto |= RB_MUTE; break; case 'p': howto |= RB_PAUSE; break; case 'P': if (!has_kbd) howto |= RB_SERIAL | RB_MULTIPLE; break; case 'r': howto |= RB_DFLTROOT; break; case 's': howto |= RB_SINGLE; break; case 'S': if (argv[i][j + 1] == 0) { if (i + 1 == argc) { setenv("comconsole_speed", "115200", 1); } else { cpy16to8(&argv[i + 1][0], var, sizeof(var)); setenv("comconsole_speedspeed", var, 1); } i++; break; } else { cpy16to8(&argv[i][j + 1], var, sizeof(var)); setenv("comconsole_speed", var, 1); break; } case 'v': howto |= RB_VERBOSE; break; } } } else { vargood = 0; for (j = 0; argv[i][j] != 0; j++) { if (j == sizeof(var)) { vargood = 0; break; } if (j > 0 && argv[i][j] == '=') vargood = 1; var[j] = (char)argv[i][j]; } if (vargood) { var[j] = 0; putenv(var); } } } for (i = 0; howto_names[i].ev != NULL; i++) if (howto & howto_names[i].mask) setenv(howto_names[i].ev, "YES", 1); if (howto & RB_MULTIPLE) { if (howto & RB_SERIAL) setenv("console", "comconsole efi" , 1); else setenv("console", "efi comconsole" , 1); } else if (howto & RB_SERIAL) { setenv("console", "comconsole" , 1); } if (efi_copy_init()) { printf("failed to allocate staging area\n"); return (EFI_BUFFER_TOO_SMALL); } /* * March through the device switch probing for things. */ for (i = 0; devsw[i] != NULL; i++) if (devsw[i]->dv_init != NULL) (devsw[i]->dv_init)(); /* Get our loaded image protocol interface structure. */ BS->HandleProtocol(IH, &imgid, (VOID**)&img); printf("Command line arguments:"); for (i = 0; i < argc; i++) printf(" %S", argv[i]); printf("\n"); printf("Image base: 0x%lx\n", (u_long)img->ImageBase); printf("EFI version: %d.%02d\n", ST->Hdr.Revision >> 16, ST->Hdr.Revision & 0xffff); printf("EFI Firmware: %S (rev %d.%02d)\n", ST->FirmwareVendor, ST->FirmwareRevision >> 16, ST->FirmwareRevision & 0xffff); printf("\n"); printf("%s, Revision %s\n", bootprog_name, bootprog_rev); printf("(%s, %s)\n", bootprog_maker, bootprog_date); /* * Disable the watchdog timer. By default the boot manager sets * the timer to 5 minutes before invoking a boot option. If we * want to return to the boot manager, we have to disable the * watchdog timer and since we're an interactive program, we don't * want to wait until the user types "quit". The timer may have * fired by then. We don't care if this fails. It does not prevent * normal functioning in any way... */ BS->SetWatchdogTimer(0, 0, 0, NULL); if (find_currdev(img, &dev, &unit, &pool_guid) != 0) return (EFI_NOT_FOUND); switch (dev->dv_type) { #ifdef EFI_ZFS_BOOT case DEVT_ZFS: { struct zfs_devdesc currdev; currdev.d_dev = dev; currdev.d_unit = unit; currdev.d_type = currdev.d_dev->dv_type; currdev.d_opendata = NULL; currdev.pool_guid = pool_guid; currdev.root_guid = 0; env_setenv("currdev", EV_VOLATILE, efi_fmtdev(&currdev), efi_setcurrdev, env_nounset); env_setenv("loaddev", EV_VOLATILE, efi_fmtdev(&currdev), env_noset, env_nounset); init_zfs_bootenv(zfs_fmtdev(&currdev)); break; } #endif default: { struct devdesc currdev; currdev.d_dev = dev; currdev.d_unit = unit; currdev.d_opendata = NULL; currdev.d_type = currdev.d_dev->dv_type; env_setenv("currdev", EV_VOLATILE, efi_fmtdev(&currdev), efi_setcurrdev, env_nounset); env_setenv("loaddev", EV_VOLATILE, efi_fmtdev(&currdev), env_noset, env_nounset); break; } } snprintf(var, sizeof(var), "%d.%02d", ST->Hdr.Revision >> 16, ST->Hdr.Revision & 0xffff); env_setenv("efi-version", EV_VOLATILE, var, env_noset, env_nounset); setenv("LINES", "24", 1); /* optional */ for (k = 0; k < ST->NumberOfTableEntries; k++) { guid = &ST->ConfigurationTable[k].VendorGuid; if (!memcmp(guid, &smbios, sizeof(EFI_GUID))) { snprintf(buf, sizeof(buf), "%p", ST->ConfigurationTable[k].VendorTable); setenv("hint.smbios.0.mem", buf, 1); smbios_detect(ST->ConfigurationTable[k].VendorTable); break; } } interact(NULL); /* doesn't return */ return (EFI_SUCCESS); /* keep compiler happy */ } /* XXX move to lib stand ? */ static int wcscmp(CHAR16 *a, CHAR16 *b) { while (*a && *b && *a == *b) { a++; b++; } return *a - *b; } COMMAND_SET(reboot, "reboot", "reboot the system", command_reboot); static int command_reboot(int argc, char *argv[]) { int i; for (i = 0; devsw[i] != NULL; ++i) if (devsw[i]->dv_cleanup != NULL) (devsw[i]->dv_cleanup)(); RS->ResetSystem(EfiResetCold, EFI_SUCCESS, 23, (CHAR16 *)"Reboot from the loader"); /* NOTREACHED */ return (CMD_ERROR); } COMMAND_SET(quit, "quit", "exit the loader", command_quit); static int command_quit(int argc, char *argv[]) { exit(0); return (CMD_OK); } COMMAND_SET(memmap, "memmap", "print memory map", command_memmap); static int command_memmap(int argc, char *argv[]) { UINTN sz; EFI_MEMORY_DESCRIPTOR *map, *p; UINTN key, dsz; UINT32 dver; EFI_STATUS status; int i, ndesc; static char *types[] = { "Reserved", "LoaderCode", "LoaderData", "BootServicesCode", "BootServicesData", "RuntimeServicesCode", "RuntimeServicesData", "ConventionalMemory", "UnusableMemory", "ACPIReclaimMemory", "ACPIMemoryNVS", "MemoryMappedIO", "MemoryMappedIOPortSpace", "PalCode" }; sz = 0; status = BS->GetMemoryMap(&sz, 0, &key, &dsz, &dver); if (status != EFI_BUFFER_TOO_SMALL) { printf("Can't determine memory map size\n"); return (CMD_ERROR); } map = malloc(sz); status = BS->GetMemoryMap(&sz, map, &key, &dsz, &dver); if (EFI_ERROR(status)) { printf("Can't read memory map\n"); return (CMD_ERROR); } ndesc = sz / dsz; printf("%23s %12s %12s %8s %4s\n", "Type", "Physical", "Virtual", "#Pages", "Attr"); for (i = 0, p = map; i < ndesc; i++, p = NextMemoryDescriptor(p, dsz)) { printf("%23s %012jx %012jx %08jx ", types[p->Type], (uintmax_t)p->PhysicalStart, (uintmax_t)p->VirtualStart, (uintmax_t)p->NumberOfPages); if (p->Attribute & EFI_MEMORY_UC) printf("UC "); if (p->Attribute & EFI_MEMORY_WC) printf("WC "); if (p->Attribute & EFI_MEMORY_WT) printf("WT "); if (p->Attribute & EFI_MEMORY_WB) printf("WB "); if (p->Attribute & EFI_MEMORY_UCE) printf("UCE "); if (p->Attribute & EFI_MEMORY_WP) printf("WP "); if (p->Attribute & EFI_MEMORY_RP) printf("RP "); if (p->Attribute & EFI_MEMORY_XP) printf("XP "); printf("\n"); } return (CMD_OK); } COMMAND_SET(configuration, "configuration", "print configuration tables", command_configuration); static const char * guid_to_string(EFI_GUID *guid) { static char buf[40]; sprintf(buf, "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", guid->Data1, guid->Data2, guid->Data3, guid->Data4[0], guid->Data4[1], guid->Data4[2], guid->Data4[3], guid->Data4[4], guid->Data4[5], guid->Data4[6], guid->Data4[7]); return (buf); } static int command_configuration(int argc, char *argv[]) { UINTN i; printf("NumberOfTableEntries=%lu\n", (unsigned long)ST->NumberOfTableEntries); for (i = 0; i < ST->NumberOfTableEntries; i++) { EFI_GUID *guid; printf(" "); guid = &ST->ConfigurationTable[i].VendorGuid; if (!memcmp(guid, &mps, sizeof(EFI_GUID))) printf("MPS Table"); else if (!memcmp(guid, &acpi, sizeof(EFI_GUID))) printf("ACPI Table"); else if (!memcmp(guid, &acpi20, sizeof(EFI_GUID))) printf("ACPI 2.0 Table"); else if (!memcmp(guid, &smbios, sizeof(EFI_GUID))) printf("SMBIOS Table %p", ST->ConfigurationTable[i].VendorTable); else if (!memcmp(guid, &dxe, sizeof(EFI_GUID))) printf("DXE Table"); else if (!memcmp(guid, &hoblist, sizeof(EFI_GUID))) printf("HOB List Table"); else if (!memcmp(guid, &memtype, sizeof(EFI_GUID))) printf("Memory Type Information Table"); else if (!memcmp(guid, &debugimg, sizeof(EFI_GUID))) printf("Debug Image Info Table"); else if (!memcmp(guid, &fdtdtb, sizeof(EFI_GUID))) printf("FDT Table"); else printf("Unknown Table (%s)", guid_to_string(guid)); printf(" at %p\n", ST->ConfigurationTable[i].VendorTable); } return (CMD_OK); } COMMAND_SET(mode, "mode", "change or display EFI text modes", command_mode); static int command_mode(int argc, char *argv[]) { UINTN cols, rows; unsigned int mode; int i; char *cp; char rowenv[8]; EFI_STATUS status; SIMPLE_TEXT_OUTPUT_INTERFACE *conout; extern void HO(void); conout = ST->ConOut; if (argc > 1) { mode = strtol(argv[1], &cp, 0); if (cp[0] != '\0') { printf("Invalid mode\n"); return (CMD_ERROR); } status = conout->QueryMode(conout, mode, &cols, &rows); if (EFI_ERROR(status)) { printf("invalid mode %d\n", mode); return (CMD_ERROR); } status = conout->SetMode(conout, mode); if (EFI_ERROR(status)) { printf("couldn't set mode %d\n", mode); return (CMD_ERROR); } sprintf(rowenv, "%u", (unsigned)rows); setenv("LINES", rowenv, 1); HO(); /* set cursor */ return (CMD_OK); } printf("Current mode: %d\n", conout->Mode->Mode); for (i = 0; i <= conout->Mode->MaxMode; i++) { status = conout->QueryMode(conout, i, &cols, &rows); if (EFI_ERROR(status)) continue; printf("Mode %d: %u columns, %u rows\n", i, (unsigned)cols, (unsigned)rows); } if (i != 0) printf("Select a mode with the command \"mode \"\n"); return (CMD_OK); } #ifdef EFI_ZFS_BOOT COMMAND_SET(lszfs, "lszfs", "list child datasets of a zfs dataset", command_lszfs); static int command_lszfs(int argc, char *argv[]) { int err; if (argc != 2) { command_errmsg = "wrong number of arguments"; return (CMD_ERROR); } err = zfs_list(argv[1]); if (err != 0) { command_errmsg = strerror(err); return (CMD_ERROR); } return (CMD_OK); } COMMAND_SET(reloadbe, "reloadbe", "refresh the list of ZFS Boot Environments", command_reloadbe); static int command_reloadbe(int argc, char *argv[]) { int err; char *root; if (argc > 2) { command_errmsg = "wrong number of arguments"; return (CMD_ERROR); } if (argc == 2) { err = zfs_bootenv(argv[1]); } else { root = getenv("zfs_be_root"); if (root == NULL) { return (CMD_OK); } err = zfs_bootenv(root); } if (err != 0) { command_errmsg = strerror(err); return (CMD_ERROR); } return (CMD_OK); } #endif COMMAND_SET(efishow, "efi-show", "print some or all EFI variables", command_efi_show); static int efi_print_var(CHAR16 *varnamearg, EFI_GUID *matchguid, int lflag) { UINTN datasz, i; EFI_STATUS status; UINT32 attr; CHAR16 *data; char *str; uint32_t uuid_status; int is_ascii; datasz = 0; status = RS->GetVariable(varnamearg, matchguid, &attr, &datasz, NULL); if (status != EFI_BUFFER_TOO_SMALL) { printf("Can't get the variable: error %#lx\n", status); return (CMD_ERROR); } data = malloc(datasz); status = RS->GetVariable(varnamearg, matchguid, &attr, &datasz, data); if (status != EFI_SUCCESS) { printf("Can't get the variable: error %#lx\n", status); return (CMD_ERROR); } uuid_to_string((uuid_t *)matchguid, &str, &uuid_status); if (lflag) { printf("%s 0x%x %S", str, attr, varnamearg); } else { printf("%s 0x%x %S=", str, attr, varnamearg); is_ascii = 1; free(str); str = (char *)data; for (i = 0; i < datasz - 1; i++) { /* Quick hack to see if this ascii-ish string printable range plus tab, cr and lf */ if ((str[i] < 32 || str[i] > 126) && str[i] != 9 && str[i] != 10 && str[i] != 13) { is_ascii = 0; break; } } if (str[datasz - 1] != '\0') is_ascii = 0; if (is_ascii) printf("%s", str); else { for (i = 0; i < datasz / 2; i++) { if (isalnum(data[i]) || isspace(data[i])) printf("%c", data[i]); else printf("\\x%02x", data[i]); } } } free(data); if (pager_output("\n")) return (CMD_WARN); return (CMD_OK); } static int command_efi_show(int argc, char *argv[]) { /* * efi-show [-a] * print all the env * efi-show -u UUID * print all the env vars tagged with UUID * efi-show -v var * search all the env vars and print the ones matching var * eif-show -u UUID -v var * eif-show UUID var * print all the env vars that match UUID and var */ /* NB: We assume EFI_GUID is the same as uuid_t */ int aflag = 0, gflag = 0, lflag = 0, vflag = 0; int ch, rv; unsigned i; EFI_STATUS status; EFI_GUID varguid = { 0,0,0,{0,0,0,0,0,0,0,0} }; EFI_GUID matchguid = { 0,0,0,{0,0,0,0,0,0,0,0} }; uint32_t uuid_status; CHAR16 *varname; CHAR16 *newnm; CHAR16 varnamearg[128]; UINTN varalloc; UINTN varsz; while ((ch = getopt(argc, argv, "ag:lv:")) != -1) { switch (ch) { case 'a': aflag = 1; break; case 'g': gflag = 1; uuid_from_string(optarg, (uuid_t *)&matchguid, &uuid_status); if (uuid_status != uuid_s_ok) { printf("uid %s could not be parsed\n", optarg); return (CMD_ERROR); } break; case 'l': lflag = 1; break; case 'v': vflag = 1; if (strlen(optarg) >= nitems(varnamearg)) { printf("Variable %s is longer than %zd characters\n", optarg, nitems(varnamearg)); return (CMD_ERROR); } for (i = 0; i < strlen(optarg); i++) varnamearg[i] = optarg[i]; varnamearg[i] = 0; break; default: printf("Invalid argument %c\n", ch); return (CMD_ERROR); } } if (aflag && (gflag || vflag)) { printf("-a isn't compatible with -v or -u\n"); return (CMD_ERROR); } if (aflag && optind < argc) { printf("-a doesn't take any args"); return (CMD_ERROR); } if (optind == argc) aflag = 1; argc -= optind; argv += optind; pager_open(); if (vflag && gflag) { rv = efi_print_var(varnamearg, &matchguid, lflag); pager_close(); return (rv); } if (argc == 2) { optarg = argv[0]; if (strlen(optarg) >= nitems(varnamearg)) { printf("Variable %s is longer than %zd characters\n", optarg, nitems(varnamearg)); pager_close(); return (CMD_ERROR); } for (i = 0; i < strlen(optarg); i++) varnamearg[i] = optarg[i]; varnamearg[i] = 0; optarg = argv[1]; uuid_from_string(optarg, (uuid_t *)&matchguid, &uuid_status); if (uuid_status != uuid_s_ok) { printf("uid %s could not be parsed\n", optarg); pager_close(); return (CMD_ERROR); } rv = efi_print_var(varnamearg, &matchguid, lflag); pager_close(); return (rv); } - if (argc != 0) { - printf("Too many args\n"); + if (argc > 0) { + printf("Too many args %d\n", argc); pager_close(); return (CMD_ERROR); } /* * Initiate the search -- note the standard takes pain * to specify the initial call must be a poiner to a NULL * character. */ varalloc = 1024; varname = malloc(varalloc); if (varname == NULL) { printf("Can't allocate memory to get variables\n"); pager_close(); return (CMD_ERROR); } varname[0] = 0; while (1) { varsz = varalloc; status = RS->GetNextVariableName(&varsz, varname, &varguid); if (status == EFI_BUFFER_TOO_SMALL) { varalloc = varsz; newnm = malloc(varalloc); if (newnm == NULL) { printf("Can't allocate memory to get variables\n"); free(varname); pager_close(); return (CMD_ERROR); } memcpy(newnm, varname, varsz); free(varname); varname = newnm; continue; /* Try again with bigger buffer */ } if (status != EFI_SUCCESS) break; if (aflag) { if (efi_print_var(varname, &varguid, lflag) != CMD_OK) break; continue; } if (vflag) { if (wcscmp(varnamearg, varname) == 0) { if (efi_print_var(varname, &varguid, lflag) != CMD_OK) break; continue; } } if (gflag) { if (memcmp(&varguid, &matchguid, sizeof(varguid)) == 0) { if (efi_print_var(varname, &varguid, lflag) != CMD_OK) break; continue; } } } free(varname); pager_close(); return (CMD_OK); } COMMAND_SET(efiset, "efi-set", "set EFI variables", command_efi_set); static int command_efi_set(int argc, char *argv[]) { char *uuid, *var, *val; CHAR16 wvar[128]; EFI_GUID guid; uint32_t status; EFI_STATUS err; if (argc != 4) { printf("efi-set uuid var new-value\n"); return (CMD_ERROR); } uuid = argv[1]; var = argv[2]; val = argv[3]; uuid_from_string(uuid, (uuid_t *)&guid, &status); if (status != uuid_s_ok) { printf("Invalid uuid %s %d\n", uuid, status); return (CMD_ERROR); } cpy8to16(var, wvar, sizeof(wvar)); err = RS->SetVariable(wvar, &guid, EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_RUNTIME_ACCESS | EFI_VARIABLE_BOOTSERVICE_ACCESS, strlen(val) + 1, val); if (EFI_ERROR(err)) { printf("Failed to set variable: error %lu\n", EFI_ERROR_CODE(err)); return (CMD_ERROR); } return (CMD_OK); } COMMAND_SET(efiunset, "efi-unset", "delete / unset EFI variables", command_efi_unset); static int command_efi_unset(int argc, char *argv[]) { char *uuid, *var; CHAR16 wvar[128]; EFI_GUID guid; uint32_t status; EFI_STATUS err; if (argc != 3) { printf("efi-unset uuid var\n"); return (CMD_ERROR); } uuid = argv[1]; var = argv[2]; uuid_from_string(uuid, (uuid_t *)&guid, &status); if (status != uuid_s_ok) { printf("Invalid uuid %s\n", uuid); return (CMD_ERROR); } cpy8to16(var, wvar, sizeof(wvar)); err = RS->SetVariable(wvar, &guid, 0, 0, NULL); if (EFI_ERROR(err)) { printf("Failed to unset variable: error %lu\n", EFI_ERROR_CODE(err)); return (CMD_ERROR); } return (CMD_OK); } #ifdef LOADER_FDT_SUPPORT extern int command_fdt_internal(int argc, char *argv[]); /* * Since proper fdt command handling function is defined in fdt_loader_cmd.c, * and declaring it as extern is in contradiction with COMMAND_SET() macro * (which uses static pointer), we're defining wrapper function, which * calls the proper fdt handling routine. */ static int command_fdt(int argc, char *argv[]) { return (command_fdt_internal(argc, argv)); } COMMAND_SET(fdt, "fdt", "flattened device tree handling", command_fdt); #endif #ifdef EFI_ZFS_BOOT static void efi_zfs_probe(void) { EFI_HANDLE h; u_int unit; int i; char dname[SPECNAMELEN + 1]; uint64_t guid; unit = 0; h = efi_find_handle(&efipart_dev, 0); for (i = 0; h != NULL; h = efi_find_handle(&efipart_dev, ++i)) { snprintf(dname, sizeof(dname), "%s%d:", efipart_dev.dv_name, i); if (zfs_probe_dev(dname, &guid) == 0) (void)efi_handle_update_dev(h, &zfs_dev, unit++, guid); } } #endif Index: user/alc/PQ_LAUNDRY/sys/boot/fdt/dts/arm/olimex-a20-som-evb.dts =================================================================== --- user/alc/PQ_LAUNDRY/sys/boot/fdt/dts/arm/olimex-a20-som-evb.dts (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/boot/fdt/dts/arm/olimex-a20-som-evb.dts (revision 307896) @@ -1,43 +1,47 @@ /*- * Copyright (c) 2015 Emmanuel Vadot * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "sun7i-a20-olimex-som-evb.dts" #include "sun7i-a20-hdmi.dtsi" #include "xpowers-axp209.dtsi" / { soc@01c00000 { hdmi@01c16000 { status = "okay"; }; hdmiaudio { status = "okay"; }; }; }; + +&cpu0 { + cpu-supply = <®_dcdc2>; +}; Index: user/alc/PQ_LAUNDRY/sys/boot/ficl/efi.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/boot/ficl/efi.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/boot/ficl/efi.c (nonexistent) @@ -1,207 +0,0 @@ -/*- - * Copyright (c) 2014 Netflix, Inc - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -/******************************************************************* -** e f i . c -** Additional words for EFI -** -*******************************************************************/ - -#ifdef TESTMAIN -#include -#include -#include -#include -#include -#include -#include -#else -#include -#endif -#include "bootstrap.h" -#include -#include "ficl.h" - -/* - * FreeBSD's loader interaction words and extras - * - * efi-setenv ( value n name n guid n attr -- 0 | -1) - * efi-getenv ( guid n addr n -- addr' n' | -1 ) - * efi-unsetenv ( name n guid n'' -- ) - */ - -/* - * efi-setenv - * efi-setenv ( value n name n guid n attr -- 0 | -1) - * - * Set environment variables using the SetVariable EFI runtime service. - * - * Value and guid are passed through in binary form (so guid needs to be - * converted to binary form from its string form). Name is converted from - * ASCII to CHAR16. Since ficl doesn't have support for internationalization, - * there's no native CHAR16 interface provided. - * - * attr is an int in the bitmask of the following attributes for this variable. - * - * 1 Non volatile - * 2 Boot service access - * 4 Run time access - * (corresponding to the same bits in the UEFI spec). - */ -void -ficlEfiSetenv(FICL_VM *pVM) -{ -#ifndef TESTMAIN - char *value, *guid; - CHAR16 *name - int i; -#endif - char *namep, *valuep, *guidp; - int names, values, guids, attr; - -#if FICL_ROBUST > 1 - vmCheckStack(pVM, 6, 0); -#endif - attr = stackPopINT(pVM->pStack); - guids = stackPopINT(pVM->pStack); - guidp = (char*)stackPopPtr(pVM->pStack); - names = stackPopINT(pVM->pStack); - namep = (char*)stackPopPtr(pVM->pStack); - values = stackPopINT(pVM->pStack); - valuep = (char*)stackPopPtr(pVM->pStack); - -#ifndef TESTMAIN - guid = (char*)ficlMalloc(guids); - if (guid != NULL) - vmThrowErr(pVM, "Error: out of memory"); - memcpy(guid, guidp, guids); - - name = (char*)ficlMalloc((names + 1) * sizeof(CHAR16)); - if (name == NULL) - vmThrowErr(pVM, "Error: out of memory"); - for (i = 0; i < names; i++) - name[i] = namep[i]; - name[names] = (CHAR16)0; - - value = (char*)ficlMalloc(values + 1); - if (value != NULL) - vmThrowErr(pVM, "Error: out of memory"); - memcpy(value, valuep, values); - - status = efi_set_variable(name, guid, attr, value); - if (status == EFI_SUCCESS) - stackPushINT(pVM->pStack, 0); - else - stackPushINT(pVM->pStack, -1); - - ficlFree(name); - ficlFree(value); - ficlFree(guid); -#endif - - return; -} - -void -ficlEfiGetenv(FICL_VM *pVM) -{ -#ifndef TESTMAIN - char *name, *value; -#endif - char *namep; - int names; - -#if FICL_ROBUST > 1 - vmCheckStack(pVM, 2, 2); -#endif - names = stackPopINT(pVM->pStack); - namep = (char*) stackPopPtr(pVM->pStack); - -#ifndef TESTMAIN - name = (char*) ficlMalloc(names+1); - if (!name) - vmThrowErr(pVM, "Error: out of memory"); - strncpy(name, namep, names); - name[names] = '\0'; - - value = getenv(name); - ficlFree(name); - - if(value != NULL) { - stackPushPtr(pVM->pStack, value); - stackPushINT(pVM->pStack, strlen(value)); - } else -#endif - stackPushINT(pVM->pStack, -1); - - return; -} - -void -ficlEfiUnsetenv(FICL_VM *pVM) -{ -#ifndef TESTMAIN - char *name; -#endif - char *namep; - int names; - -#if FICL_ROBUST > 1 - vmCheckStack(pVM, 2, 0); -#endif - names = stackPopINT(pVM->pStack); - namep = (char*) stackPopPtr(pVM->pStack); - -#ifndef TESTMAIN - name = (char*) ficlMalloc(names+1); - if (!name) - vmThrowErr(pVM, "Error: out of memory"); - strncpy(name, namep, names); - name[names] = '\0'; - - unsetenv(name); - ficlFree(name); -#endif - - return; -} -/************************************************************************** - -** Build FreeBSD platform extensions into the system dictionary -**************************************************************************/ -void ficlEfiCompilePlatform(FICL_SYSTEM *pSys) -{ - FICL_DICT *dp = pSys->dp; - assert (dp); - - dictAppendWord(dp, "efi-setenv", ficlEfiSetenv, FW_DEFAULT); - dictAppendWord(dp, "efi-getenv", ficlEfiGetenv, FW_DEFAULT); - dictAppendWord(dp, "efi-unsetenv", ficlEfiUnsetenv, FW_DEFAULT); - - return; -} Property changes on: user/alc/PQ_LAUNDRY/sys/boot/ficl/efi.c ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/boot/ficl/loader.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/boot/ficl/loader.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/boot/ficl/loader.c (revision 307896) @@ -1,842 +1,841 @@ /*- * Copyright (c) 2000 Daniel Capo Sobral * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /******************************************************************* ** l o a d e r . c ** Additional FICL words designed for FreeBSD's loader ** *******************************************************************/ #ifdef TESTMAIN #include #include #include #include #include #include #include #else #include #endif #include "bootstrap.h" #include #include #include "ficl.h" /* FreeBSD's loader interaction words and extras * * setenv ( value n name n' -- ) * setenv? ( value n name n' flag -- ) * getenv ( addr n -- addr' n' | -1 ) * unsetenv ( addr n -- ) * copyin ( addr addr' len -- ) * copyout ( addr addr' len -- ) * findfile ( name len type len' -- addr ) * pnpdevices ( -- addr ) * pnphandlers ( -- addr ) * ccall ( [[...[p10] p9] ... p1] n addr -- result ) * uuid-from-string ( addr n -- addr' ) * uuid-to-string ( addr' -- addr n ) * .# ( value -- ) */ void ficlSetenv(FICL_VM *pVM) { #ifndef TESTMAIN char *name, *value; #endif char *namep, *valuep; int names, values; #if FICL_ROBUST > 1 vmCheckStack(pVM, 4, 0); #endif names = stackPopINT(pVM->pStack); namep = (char*) stackPopPtr(pVM->pStack); values = stackPopINT(pVM->pStack); valuep = (char*) stackPopPtr(pVM->pStack); #ifndef TESTMAIN name = (char*) ficlMalloc(names+1); if (!name) vmThrowErr(pVM, "Error: out of memory"); strncpy(name, namep, names); name[names] = '\0'; value = (char*) ficlMalloc(values+1); if (!value) vmThrowErr(pVM, "Error: out of memory"); strncpy(value, valuep, values); value[values] = '\0'; setenv(name, value, 1); ficlFree(name); ficlFree(value); #endif return; } void ficlSetenvq(FICL_VM *pVM) { #ifndef TESTMAIN char *name, *value; #endif char *namep, *valuep; int names, values, overwrite; #if FICL_ROBUST > 1 vmCheckStack(pVM, 5, 0); #endif overwrite = stackPopINT(pVM->pStack); names = stackPopINT(pVM->pStack); namep = (char*) stackPopPtr(pVM->pStack); values = stackPopINT(pVM->pStack); valuep = (char*) stackPopPtr(pVM->pStack); #ifndef TESTMAIN name = (char*) ficlMalloc(names+1); if (!name) vmThrowErr(pVM, "Error: out of memory"); strncpy(name, namep, names); name[names] = '\0'; value = (char*) ficlMalloc(values+1); if (!value) vmThrowErr(pVM, "Error: out of memory"); strncpy(value, valuep, values); value[values] = '\0'; setenv(name, value, overwrite); ficlFree(name); ficlFree(value); #endif return; } void ficlGetenv(FICL_VM *pVM) { #ifndef TESTMAIN char *name, *value; #endif char *namep; int names; #if FICL_ROBUST > 1 vmCheckStack(pVM, 2, 2); #endif names = stackPopINT(pVM->pStack); namep = (char*) stackPopPtr(pVM->pStack); #ifndef TESTMAIN name = (char*) ficlMalloc(names+1); if (!name) vmThrowErr(pVM, "Error: out of memory"); strncpy(name, namep, names); name[names] = '\0'; value = getenv(name); ficlFree(name); if(value != NULL) { stackPushPtr(pVM->pStack, value); stackPushINT(pVM->pStack, strlen(value)); } else #endif stackPushINT(pVM->pStack, -1); return; } void ficlUnsetenv(FICL_VM *pVM) { #ifndef TESTMAIN char *name; #endif char *namep; int names; #if FICL_ROBUST > 1 vmCheckStack(pVM, 2, 0); #endif names = stackPopINT(pVM->pStack); namep = (char*) stackPopPtr(pVM->pStack); #ifndef TESTMAIN name = (char*) ficlMalloc(names+1); if (!name) vmThrowErr(pVM, "Error: out of memory"); strncpy(name, namep, names); name[names] = '\0'; unsetenv(name); ficlFree(name); #endif return; } void ficlCopyin(FICL_VM *pVM) { void* src; vm_offset_t dest; size_t len; #if FICL_ROBUST > 1 vmCheckStack(pVM, 3, 0); #endif len = stackPopINT(pVM->pStack); dest = stackPopINT(pVM->pStack); src = stackPopPtr(pVM->pStack); #ifndef TESTMAIN archsw.arch_copyin(src, dest, len); #endif return; } void ficlCopyout(FICL_VM *pVM) { void* dest; vm_offset_t src; size_t len; #if FICL_ROBUST > 1 vmCheckStack(pVM, 3, 0); #endif len = stackPopINT(pVM->pStack); dest = stackPopPtr(pVM->pStack); src = stackPopINT(pVM->pStack); #ifndef TESTMAIN archsw.arch_copyout(src, dest, len); #endif return; } void ficlFindfile(FICL_VM *pVM) { #ifndef TESTMAIN char *name, *type; #endif char *namep, *typep; struct preloaded_file* fp; int names, types; #if FICL_ROBUST > 1 vmCheckStack(pVM, 4, 1); #endif types = stackPopINT(pVM->pStack); typep = (char*) stackPopPtr(pVM->pStack); names = stackPopINT(pVM->pStack); namep = (char*) stackPopPtr(pVM->pStack); #ifndef TESTMAIN name = (char*) ficlMalloc(names+1); if (!name) vmThrowErr(pVM, "Error: out of memory"); strncpy(name, namep, names); name[names] = '\0'; type = (char*) ficlMalloc(types+1); if (!type) vmThrowErr(pVM, "Error: out of memory"); strncpy(type, typep, types); type[types] = '\0'; fp = file_findfile(name, type); #else fp = NULL; #endif stackPushPtr(pVM->pStack, fp); return; } void ficlCcall(FICL_VM *pVM) { int (*func)(int, ...); int result, p[10]; int nparam, i; #if FICL_ROBUST > 1 vmCheckStack(pVM, 2, 0); #endif func = stackPopPtr(pVM->pStack); nparam = stackPopINT(pVM->pStack); #if FICL_ROBUST > 1 vmCheckStack(pVM, nparam, 1); #endif for (i = 0; i < nparam; i++) p[i] = stackPopINT(pVM->pStack); result = func(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9]); stackPushINT(pVM->pStack, result); return; } void ficlUuidFromString(FICL_VM *pVM) { #ifndef TESTMAIN char *uuid; uint32_t status; #endif char *uuidp; int uuids; uuid_t *u; #if FICL_ROBUST > 1 vmCheckStack(pVM, 2, 0); #endif uuids = stackPopINT(pVM->pStack); uuidp = (char *) stackPopPtr(pVM->pStack); #ifndef TESTMAIN uuid = (char *)ficlMalloc(uuids + 1); if (!uuid) vmThrowErr(pVM, "Error: out of memory"); strncpy(uuid, uuidp, uuids); uuid[uuids] = '\0'; u = (uuid_t *)ficlMalloc(sizeof (*u)); uuid_from_string(uuid, u, &status); ficlFree(uuid); if (status != uuid_s_ok) { ficlFree(u); u = NULL; } #else u = NULL; #endif stackPushPtr(pVM->pStack, u); return; } void ficlUuidToString(FICL_VM *pVM) { #ifndef TESTMAIN char *uuid; uint32_t status; #endif uuid_t *u; #if FICL_ROBUST > 1 vmCheckStack(pVM, 1, 0); #endif u = (uuid_t *)stackPopPtr(pVM->pStack); #ifndef TESTMAIN uuid_to_string(u, &uuid, &status); if (status != uuid_s_ok) { stackPushPtr(pVM->pStack, uuid); stackPushINT(pVM->pStack, strlen(uuid)); } else #endif stackPushINT(pVM->pStack, -1); return; } /************************************************************************** f i c l E x e c F D ** reads in text from file fd and passes it to ficlExec() * returns VM_OUTOFTEXT on success or the ficlExec() error code on * failure. */ #define nLINEBUF 256 int ficlExecFD(FICL_VM *pVM, int fd) { char cp[nLINEBUF]; int nLine = 0, rval = VM_OUTOFTEXT; char ch; CELL id; id = pVM->sourceID; pVM->sourceID.i = fd; /* feed each line to ficlExec */ while (1) { int status, i; i = 0; while ((status = read(fd, &ch, 1)) > 0 && ch != '\n') cp[i++] = ch; nLine++; if (!i) { if (status < 1) break; continue; } rval = ficlExecC(pVM, cp, i); if(rval != VM_QUIT && rval != VM_USEREXIT && rval != VM_OUTOFTEXT) { pVM->sourceID = id; return rval; } } /* ** Pass an empty line with SOURCE-ID == -1 to flush ** any pending REFILLs (as required by FILE wordset) */ pVM->sourceID.i = -1; ficlExec(pVM, ""); pVM->sourceID = id; return rval; } static void displayCellNoPad(FICL_VM *pVM) { CELL c; #if FICL_ROBUST > 1 vmCheckStack(pVM, 1, 0); #endif c = stackPop(pVM->pStack); ltoa((c).i, pVM->pad, pVM->base); vmTextOut(pVM, pVM->pad, 0); return; } /* isdir? - Return whether an fd corresponds to a directory. * * isdir? ( fd -- bool ) */ static void isdirQuestion(FICL_VM *pVM) { struct stat sb; FICL_INT flag; int fd; #if FICL_ROBUST > 1 vmCheckStack(pVM, 1, 1); #endif fd = stackPopINT(pVM->pStack); flag = FICL_FALSE; do { if (fd < 0) break; if (fstat(fd, &sb) < 0) break; if (!S_ISDIR(sb.st_mode)) break; flag = FICL_TRUE; } while (0); stackPushINT(pVM->pStack, flag); } /* fopen - open a file and return new fd on stack. * * fopen ( ptr count mode -- fd ) */ static void pfopen(FICL_VM *pVM) { int mode, fd, count; char *ptr, *name; #if FICL_ROBUST > 1 vmCheckStack(pVM, 3, 1); #endif mode = stackPopINT(pVM->pStack); /* get mode */ count = stackPopINT(pVM->pStack); /* get count */ ptr = stackPopPtr(pVM->pStack); /* get ptr */ if ((count < 0) || (ptr == NULL)) { stackPushINT(pVM->pStack, -1); return; } /* ensure that the string is null terminated */ name = (char *)malloc(count+1); bcopy(ptr,name,count); name[count] = 0; /* open the file */ fd = open(name, mode); free(name); stackPushINT(pVM->pStack, fd); return; } /* fclose - close a file who's fd is on stack. * * fclose ( fd -- ) */ static void pfclose(FICL_VM *pVM) { int fd; #if FICL_ROBUST > 1 vmCheckStack(pVM, 1, 0); #endif fd = stackPopINT(pVM->pStack); /* get fd */ if (fd != -1) close(fd); return; } /* fread - read file contents * * fread ( fd buf nbytes -- nread ) */ static void pfread(FICL_VM *pVM) { int fd, len; char *buf; #if FICL_ROBUST > 1 vmCheckStack(pVM, 3, 1); #endif len = stackPopINT(pVM->pStack); /* get number of bytes to read */ buf = stackPopPtr(pVM->pStack); /* get buffer */ fd = stackPopINT(pVM->pStack); /* get fd */ if (len > 0 && buf && fd != -1) stackPushINT(pVM->pStack, read(fd, buf, len)); else stackPushINT(pVM->pStack, -1); return; } /* freaddir - read directory contents * * freaddir ( fd -- ptr len TRUE | FALSE ) */ static void pfreaddir(FICL_VM *pVM) { #ifdef TESTMAIN static struct dirent dirent; struct stat sb; char *buf; off_t off, ptr; u_int blksz; int bufsz; #endif struct dirent *d; int fd; #if FICL_ROBUST > 1 vmCheckStack(pVM, 1, 3); #endif fd = stackPopINT(pVM->pStack); #if TESTMAIN /* * The readdirfd() function is specific to the loader environment. * We do the best we can to make freaddir work, but it's not at * all guaranteed. */ d = NULL; buf = NULL; do { if (fd == -1) break; if (fstat(fd, &sb) == -1) break; blksz = (sb.st_blksize) ? sb.st_blksize : getpagesize(); if ((blksz & (blksz - 1)) != 0) break; buf = malloc(blksz); if (buf == NULL) break; off = lseek(fd, 0LL, SEEK_CUR); if (off == -1) break; ptr = off; if (lseek(fd, 0, SEEK_SET) == -1) break; bufsz = getdents(fd, buf, blksz); while (bufsz > 0 && bufsz <= ptr) { ptr -= bufsz; bufsz = getdents(fd, buf, blksz); } if (bufsz <= 0) break; d = (void *)(buf + ptr); dirent = *d; off += d->d_reclen; d = (lseek(fd, off, SEEK_SET) != off) ? NULL : &dirent; } while (0); if (buf != NULL) free(buf); #else d = readdirfd(fd); #endif if (d != NULL) { stackPushPtr(pVM->pStack, d->d_name); stackPushINT(pVM->pStack, strlen(d->d_name)); stackPushINT(pVM->pStack, FICL_TRUE); } else { stackPushINT(pVM->pStack, FICL_FALSE); } } /* fload - interpret file contents * * fload ( fd -- ) */ static void pfload(FICL_VM *pVM) { int fd; #if FICL_ROBUST > 1 vmCheckStack(pVM, 1, 0); #endif fd = stackPopINT(pVM->pStack); /* get fd */ if (fd != -1) ficlExecFD(pVM, fd); return; } /* fwrite - write file contents * * fwrite ( fd buf nbytes -- nwritten ) */ static void pfwrite(FICL_VM *pVM) { int fd, len; char *buf; #if FICL_ROBUST > 1 vmCheckStack(pVM, 3, 1); #endif len = stackPopINT(pVM->pStack); /* get number of bytes to read */ buf = stackPopPtr(pVM->pStack); /* get buffer */ fd = stackPopINT(pVM->pStack); /* get fd */ if (len > 0 && buf && fd != -1) stackPushINT(pVM->pStack, write(fd, buf, len)); else stackPushINT(pVM->pStack, -1); return; } /* fseek - seek to a new position in a file * * fseek ( fd ofs whence -- pos ) */ static void pfseek(FICL_VM *pVM) { int fd, pos, whence; #if FICL_ROBUST > 1 vmCheckStack(pVM, 3, 1); #endif whence = stackPopINT(pVM->pStack); pos = stackPopINT(pVM->pStack); fd = stackPopINT(pVM->pStack); stackPushINT(pVM->pStack, lseek(fd, pos, whence)); return; } /* key - get a character from stdin * * key ( -- char ) */ static void key(FICL_VM *pVM) { #if FICL_ROBUST > 1 vmCheckStack(pVM, 0, 1); #endif stackPushINT(pVM->pStack, getchar()); return; } /* key? - check for a character from stdin (FACILITY) * * key? ( -- flag ) */ static void keyQuestion(FICL_VM *pVM) { #if FICL_ROBUST > 1 vmCheckStack(pVM, 0, 1); #endif #ifdef TESTMAIN /* XXX Since we don't fiddle with termios, let it always succeed... */ stackPushINT(pVM->pStack, FICL_TRUE); #else /* But here do the right thing. */ stackPushINT(pVM->pStack, ischar()? FICL_TRUE : FICL_FALSE); #endif return; } /* seconds - gives number of seconds since beginning of time * * beginning of time is defined as: * * BTX - number of seconds since midnight * FreeBSD - number of seconds since Jan 1 1970 * * seconds ( -- u ) */ static void pseconds(FICL_VM *pVM) { #if FICL_ROBUST > 1 vmCheckStack(pVM,0,1); #endif stackPushUNS(pVM->pStack, (FICL_UNS) time(NULL)); return; } /* ms - wait at least that many milliseconds (FACILITY) * * ms ( u -- ) * */ static void ms(FICL_VM *pVM) { #if FICL_ROBUST > 1 vmCheckStack(pVM,1,0); #endif #ifdef TESTMAIN usleep(stackPopUNS(pVM->pStack)*1000); #else delay(stackPopUNS(pVM->pStack)*1000); #endif return; } /* fkey - get a character from a file * * fkey ( file -- char ) */ static void fkey(FICL_VM *pVM) { int i, fd; char ch; #if FICL_ROBUST > 1 vmCheckStack(pVM, 1, 1); #endif fd = stackPopINT(pVM->pStack); i = read(fd, &ch, 1); stackPushINT(pVM->pStack, i > 0 ? ch : -1); return; } /* ** Retrieves free space remaining on the dictionary */ static void freeHeap(FICL_VM *pVM) { stackPushINT(pVM->pStack, dictCellsAvail(ficlGetDict(pVM->pSys))); } /******************* Increase dictionary size on-demand ******************/ static void ficlDictThreshold(FICL_VM *pVM) { stackPushPtr(pVM->pStack, &dictThreshold); } static void ficlDictIncrease(FICL_VM *pVM) { stackPushPtr(pVM->pStack, &dictIncrease); } /************************************************************************** f i c l C o m p i l e P l a t f o r m ** Build FreeBSD platform extensions into the system dictionary **************************************************************************/ void ficlCompilePlatform(FICL_SYSTEM *pSys) { ficlCompileFcn **fnpp; FICL_DICT *dp = pSys->dp; assert (dp); dictAppendWord(dp, ".#", displayCellNoPad, FW_DEFAULT); dictAppendWord(dp, "isdir?", isdirQuestion, FW_DEFAULT); dictAppendWord(dp, "fopen", pfopen, FW_DEFAULT); dictAppendWord(dp, "fclose", pfclose, FW_DEFAULT); dictAppendWord(dp, "fread", pfread, FW_DEFAULT); dictAppendWord(dp, "freaddir", pfreaddir, FW_DEFAULT); dictAppendWord(dp, "fload", pfload, FW_DEFAULT); dictAppendWord(dp, "fkey", fkey, FW_DEFAULT); dictAppendWord(dp, "fseek", pfseek, FW_DEFAULT); dictAppendWord(dp, "fwrite", pfwrite, FW_DEFAULT); dictAppendWord(dp, "key", key, FW_DEFAULT); dictAppendWord(dp, "key?", keyQuestion, FW_DEFAULT); dictAppendWord(dp, "ms", ms, FW_DEFAULT); dictAppendWord(dp, "seconds", pseconds, FW_DEFAULT); dictAppendWord(dp, "heap?", freeHeap, FW_DEFAULT); dictAppendWord(dp, "dictthreshold", ficlDictThreshold, FW_DEFAULT); dictAppendWord(dp, "dictincrease", ficlDictIncrease, FW_DEFAULT); dictAppendWord(dp, "setenv", ficlSetenv, FW_DEFAULT); dictAppendWord(dp, "setenv?", ficlSetenvq, FW_DEFAULT); dictAppendWord(dp, "getenv", ficlGetenv, FW_DEFAULT); dictAppendWord(dp, "unsetenv", ficlUnsetenv, FW_DEFAULT); dictAppendWord(dp, "copyin", ficlCopyin, FW_DEFAULT); dictAppendWord(dp, "copyout", ficlCopyout, FW_DEFAULT); dictAppendWord(dp, "findfile", ficlFindfile, FW_DEFAULT); dictAppendWord(dp, "ccall", ficlCcall, FW_DEFAULT); dictAppendWord(dp, "uuid-from-string", ficlUuidFromString, FW_DEFAULT); dictAppendWord(dp, "uuid-to-string", ficlUuidToString, FW_DEFAULT); - SET_FOREACH(fnpp, Xficl_compile_set) { + SET_FOREACH(fnpp, Xficl_compile_set) (*fnpp)(pSys); - } #if defined(PC98) ficlSetEnv(pSys, "arch-pc98", FICL_TRUE); #elif defined(__i386__) ficlSetEnv(pSys, "arch-i386", FICL_TRUE); ficlSetEnv(pSys, "arch-powerpc", FICL_FALSE); #elif defined(__powerpc__) ficlSetEnv(pSys, "arch-i386", FICL_FALSE); ficlSetEnv(pSys, "arch-powerpc", FICL_TRUE); #endif return; } Index: user/alc/PQ_LAUNDRY/sys/boot/forth/Makefile.inc =================================================================== --- user/alc/PQ_LAUNDRY/sys/boot/forth/Makefile.inc (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/boot/forth/Makefile.inc (revision 307896) @@ -1,25 +1,26 @@ # $FreeBSD$ FILES+= beastie.4th FILES+= brand.4th FILES+= brand-fbsd.4th FILES+= check-password.4th FILES+= color.4th FILES+= delay.4th +FILES+= efi.4th FILES+= frames.4th FILES+= loader.4th FILES+= loader.conf FILES+= loader.help FILES+= logo-beastie.4th FILES+= logo-beastiebw.4th FILES+= logo-fbsdbw.4th FILES+= logo-orb.4th FILES+= logo-orbbw.4th FILES+= menu.4th FILES+= menu-commands.4th FILES+= menusets.4th FILES+= screen.4th FILES+= shortcuts.4th FILES+= support.4th FILES+= version.4th FILESDIR_loader.conf= /boot/defaults Index: user/alc/PQ_LAUNDRY/sys/boot/forth/efi.4th =================================================================== --- user/alc/PQ_LAUNDRY/sys/boot/forth/efi.4th (nonexistent) +++ user/alc/PQ_LAUNDRY/sys/boot/forth/efi.4th (revision 307896) @@ -0,0 +1,30 @@ +\ Copyright (c) 2016 Netflix, Inc +\ All rights reserved. +\ +\ Redistribution and use in source and binary forms, with or without +\ modification, are permitted provided that the following conditions +\ are met: +\ 1. Redistributions of source code must retain the above copyright +\ notice, this list of conditions and the following disclaimer. +\ 2. Redistributions in binary form must reproduce the above copyright +\ notice, this list of conditions and the following disclaimer in the +\ documentation and/or other materials provided with the distribution. +\ +\ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +\ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +\ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +\ ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +\ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +\ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +\ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +\ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +\ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +\ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +\ SUCH DAMAGE. +\ +\ $FreeBSD$ + +only forth definitions + +\ Place holder for more functions +.( EFI boot environment) cr Property changes on: user/alc/PQ_LAUNDRY/sys/boot/forth/efi.4th ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: user/alc/PQ_LAUNDRY/sys/boot/forth/loader.4th =================================================================== --- user/alc/PQ_LAUNDRY/sys/boot/forth/loader.4th (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/boot/forth/loader.4th (revision 307896) @@ -1,263 +1,266 @@ \ Copyright (c) 1999 Daniel C. Sobral \ Copyright (c) 2011-2015 Devin Teske \ All rights reserved. \ \ Redistribution and use in source and binary forms, with or without \ modification, are permitted provided that the following conditions \ are met: \ 1. Redistributions of source code must retain the above copyright \ notice, this list of conditions and the following disclaimer. \ 2. Redistributions in binary form must reproduce the above copyright \ notice, this list of conditions and the following disclaimer in the \ documentation and/or other materials provided with the distribution. \ \ THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND \ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE \ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE \ ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE \ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL \ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS \ OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) \ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT \ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY \ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF \ SUCH DAMAGE. \ \ $FreeBSD$ only forth definitions s" arch-i386" environment? [if] [if] s" loader_version" environment? [if] 11 < [if] .( Loader version 1.1+ required) cr abort [then] [else] .( Could not get loader version!) cr abort [then] [then] [then] 256 dictthreshold ! \ 256 cells minimum free space 2048 dictincrease ! \ 2048 additional cells each time include /boot/support.4th include /boot/color.4th include /boot/delay.4th include /boot/check-password.4th +s" efi-boot" environment? [if] [if] + include /boot/efi.4th +[then] [then] only forth definitions : bootmsg ( -- ) loader_color? dup ( -- bool bool ) if 7 fg 4 bg then ." Booting..." if me then cr ; : try-menu-unset \ menu-unset may not be present s" beastie_disable" getenv dup -1 <> if s" YES" compare-insensitive 0= if exit then else drop then s" menu-unset" sfind if execute else drop then s" menusets-unset" sfind if execute else drop then ; only forth also support-functions also builtins definitions : boot 0= if ( interpreted ) get_arguments then \ Unload only if a path was passed dup if >r over r> swap c@ [char] - <> if 0 1 unload drop else s" kernelname" getenv? if ( a kernel has been loaded ) try-menu-unset bootmsg 1 boot exit then load_kernel_and_modules ?dup if exit then try-menu-unset bootmsg 0 1 boot exit then else s" kernelname" getenv? if ( a kernel has been loaded ) try-menu-unset bootmsg 1 boot exit then load_kernel_and_modules ?dup if exit then try-menu-unset bootmsg 0 1 boot exit then load_kernel_and_modules ?dup 0= if bootmsg 0 1 boot then ; \ ***** boot-conf \ \ Prepares to boot as specified by loaded configuration files. : boot-conf 0= if ( interpreted ) get_arguments then 0 1 unload drop load_kernel_and_modules ?dup 0= if 0 1 autoboot then ; also forth definitions previous builtin: boot builtin: boot-conf only forth definitions also support-functions \ ***** start \ \ Initializes support.4th global variables, sets loader_conf_files, \ processes conf files, and, if any one such file was successfully \ read to the end, loads kernel and modules. : start ( -- ) ( throws: abort & user-defined ) s" /boot/defaults/loader.conf" initialize include_conf_files include_nextboot_file \ If the user defined a post-initialize hook, call it now s" post-initialize" sfind if execute else drop then \ Will *NOT* try to load kernel and modules if no configuration file \ was successfully loaded! any_conf_read? if s" loader_delay" getenv -1 = if load_xen_throw load_kernel load_modules else drop ." Loading Kernel and Modules (Ctrl-C to Abort)" cr s" also support-functions" evaluate s" set delay_command='load_xen_throw load_kernel load_modules'" evaluate s" set delay_showdots" evaluate delay_execute then then ; \ ***** initialize \ \ Overrides support.4th initialization word with one that does \ everything start one does, short of loading the kernel and \ modules. Returns a flag. : initialize ( -- flag ) s" /boot/defaults/loader.conf" initialize include_conf_files include_nextboot_file \ If the user defined a post-initialize hook, call it now s" post-initialize" sfind if execute else drop then any_conf_read? ; \ ***** read-conf \ \ Read a configuration file, whose name was specified on the command \ line, if interpreted, or given on the stack, if compiled in. : (read-conf) ( addr len -- ) conf_files string= include_conf_files \ Will recurse on new loader_conf_files definitions ; : read-conf ( | addr len -- ) ( throws: abort & user-defined ) state @ if \ Compiling postpone (read-conf) else \ Interpreting bl parse (read-conf) then ; immediate \ show, enable, disable, toggle module loading. They all take module from \ the next word : set-module-flag ( module_addr val -- ) \ set and print flag over module.flag ! dup module.name strtype module.flag @ if ." will be loaded" else ." will not be loaded" then cr ; : enable-module find-module ?dup if true set-module-flag then ; : disable-module find-module ?dup if false set-module-flag then ; : toggle-module find-module ?dup if dup module.flag @ 0= set-module-flag then ; \ ***** show-module \ \ Show loading information about a module. : show-module ( -- ) find-module ?dup if show-one-module then ; \ Words to be used inside configuration files : retry false ; \ For use in load error commands : ignore true ; \ For use in load error commands \ Return to strict forth vocabulary : #type over - >r type r> spaces ; : .? 2 spaces 2swap 15 #type 2 spaces type cr ; \ Execute the ? command to print all the commands defined in \ C, then list the ones we support here. Please note that this \ doesn't use pager_* routines that the C implementation of ? \ does, so these will always appear, even if you stop early \ there. And they may cause the commands to scroll off the \ screen if the number of commands modulus LINES is close \ to LINEs.... : ? ['] ? execute s" boot-conf" s" load kernel and modules, then autoboot" .? s" read-conf" s" read a configuration file" .? s" enable-module" s" enable loading of a module" .? s" disable-module" s" disable loading of a module" .? s" toggle-module" s" toggle loading of a module" .? s" show-module" s" show module load data" .? s" try-include" s" try to load/interpret files" .? ; : try-include ( -- ) \ see loader.4th(8) ['] include ( -- xt ) \ get the execution token of `include' catch ( xt -- exception# | 0 ) if \ failed LF parse ( c -- s-addr/u ) 2drop \ advance >in to EOL (drop data) \ ... prevents words unused by `include' from being interpreted then ; immediate \ interpret immediately for access to `source' (aka tib) only forth definitions Index: user/alc/PQ_LAUNDRY/sys/contrib/rdma/krping/krping.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/contrib/rdma/krping/krping.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/contrib/rdma/krping/krping.c (revision 307896) @@ -1,3347 +1,3347 @@ /* * Copyright (c) 2005 Ammasso, Inc. All rights reserved. * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "krping.h" #include "getopt.h" extern int krping_debug; #define DEBUG_LOG(cb, x...) if (krping_debug) log(LOG_INFO, x) #define PRINTF(cb, x...) log(LOG_INFO, x) #define BIND_INFO 1 MODULE_AUTHOR("Steve Wise"); MODULE_DESCRIPTION("RDMA ping client/server"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(krping, 1); MODULE_DEPEND(krping, linuxkpi, 1, 1, 1); static __inline uint64_t get_cycles(void) { uint32_t low, high; __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); return (low | ((u_int64_t)high << 32)); } typedef uint64_t cycles_t; enum mem_type { DMA = 1, FASTREG = 2, MW = 3, MR = 4 }; static const struct krping_option krping_opts[] = { {"count", OPT_INT, 'C'}, {"size", OPT_INT, 'S'}, {"addr", OPT_STRING, 'a'}, {"port", OPT_INT, 'p'}, {"verbose", OPT_NOPARAM, 'v'}, {"validate", OPT_NOPARAM, 'V'}, {"server", OPT_NOPARAM, 's'}, {"client", OPT_NOPARAM, 'c'}, {"mem_mode", OPT_STRING, 'm'}, {"server_inv", OPT_NOPARAM, 'I'}, {"wlat", OPT_NOPARAM, 'l'}, {"rlat", OPT_NOPARAM, 'L'}, {"bw", OPT_NOPARAM, 'B'}, {"duplex", OPT_NOPARAM, 'd'}, {"txdepth", OPT_INT, 'T'}, {"poll", OPT_NOPARAM, 'P'}, {"local_dma_lkey", OPT_NOPARAM, 'Z'}, {"read_inv", OPT_NOPARAM, 'R'}, {"fr", OPT_INT, 'f'}, {NULL, 0, 0} }; #define htonll(x) cpu_to_be64((x)) #define ntohll(x) cpu_to_be64((x)) static struct mutex krping_mutex; /* * List of running krping threads. */ static LIST_HEAD(krping_cbs); /* * krping "ping/pong" loop: * client sends source rkey/addr/len * server receives source rkey/add/len * server rdma reads "ping" data from source * server sends "go ahead" on rdma read completion * client sends sink rkey/addr/len * server receives sink rkey/addr/len * server rdma writes "pong" data to sink * server sends "go ahead" on rdma write completion * */ /* * These states are used to signal events between the completion handler * and the main client or server thread. * * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV, * and RDMA_WRITE_COMPLETE for each ping. */ enum test_state { IDLE = 1, CONNECT_REQUEST, ADDR_RESOLVED, ROUTE_RESOLVED, CONNECTED, RDMA_READ_ADV, RDMA_READ_COMPLETE, RDMA_WRITE_ADV, RDMA_WRITE_COMPLETE, ERROR }; struct krping_rdma_info { uint64_t buf; uint32_t rkey; uint32_t size; }; /* * Default max buffer size for IO... */ #define RPING_BUFSIZE 128*1024 #define RPING_SQ_DEPTH 64 /* * Control block struct. */ struct krping_cb { void *cookie; int server; /* 0 iff client */ struct ib_cq *cq; struct ib_pd *pd; struct ib_qp *qp; enum mem_type mem; struct ib_mr *dma_mr; struct ib_fast_reg_page_list *page_list; int page_list_len; struct ib_send_wr fastreg_wr; struct ib_send_wr invalidate_wr; struct ib_mr *fastreg_mr; int server_invalidate; int read_inv; u8 key; struct ib_mw *mw; struct ib_mw_bind bind_attr; struct ib_recv_wr rq_wr; /* recv work request record */ struct ib_sge recv_sgl; /* recv single SGE */ struct krping_rdma_info recv_buf;/* malloc'd buffer */ u64 recv_dma_addr; DECLARE_PCI_UNMAP_ADDR(recv_mapping) struct ib_mr *recv_mr; struct ib_send_wr sq_wr; /* send work requrest record */ struct ib_sge send_sgl; struct krping_rdma_info send_buf;/* single send buf */ u64 send_dma_addr; DECLARE_PCI_UNMAP_ADDR(send_mapping) struct ib_mr *send_mr; struct ib_send_wr rdma_sq_wr; /* rdma work request record */ struct ib_sge rdma_sgl; /* rdma single SGE */ char *rdma_buf; /* used as rdma sink */ u64 rdma_dma_addr; DECLARE_PCI_UNMAP_ADDR(rdma_mapping) struct ib_mr *rdma_mr; uint32_t remote_rkey; /* remote guys RKEY */ uint64_t remote_addr; /* remote guys TO */ uint32_t remote_len; /* remote guys LEN */ char *start_buf; /* rdma read src */ u64 start_dma_addr; DECLARE_PCI_UNMAP_ADDR(start_mapping) struct ib_mr *start_mr; enum test_state state; /* used for cond/signalling */ wait_queue_head_t sem; struct krping_stats stats; uint16_t port; /* dst port in NBO */ struct in_addr addr; /* dst addr in NBO */ char *addr_str; /* dst addr string */ int verbose; /* verbose logging */ int count; /* ping count */ int size; /* ping data size */ int validate; /* validate ping data */ int wlat; /* run wlat test */ int rlat; /* run rlat test */ int bw; /* run bw test */ int duplex; /* run bw full duplex test */ int poll; /* poll or block for rlat test */ int txdepth; /* SQ depth */ int local_dma_lkey; /* use 0 for lkey */ int frtest; /* fastreg test */ int testnum; /* CM stuff */ struct rdma_cm_id *cm_id; /* connection on client side,*/ /* listener on server side. */ struct rdma_cm_id *child_cm_id; /* connection on server side */ struct list_head list; }; static int krping_cma_event_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { int ret; struct krping_cb *cb = cma_id->context; DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event, cma_id, (cma_id == cb->cm_id) ? "parent" : "child"); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: cb->state = ADDR_RESOLVED; ret = rdma_resolve_route(cma_id, 2000); if (ret) { PRINTF(cb, "rdma_resolve_route error %d\n", ret); wake_up_interruptible(&cb->sem); } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: cb->state = ROUTE_RESOLVED; cb->child_cm_id = cma_id; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_CONNECT_REQUEST: if (cb->state == IDLE) { cb->state = CONNECT_REQUEST; cb->child_cm_id = cma_id; } else { PRINTF(cb, "Received connection request in wrong state" " (%d)\n", cb->state); } DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id); wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ESTABLISHED: DEBUG_LOG(cb, "ESTABLISHED\n"); if (!cb->server) { cb->state = CONNECTED; } wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: PRINTF(cb, "cma event %d, error %d\n", event->event, event->status); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DISCONNECTED: PRINTF(cb, "DISCONNECT EVENT...\n"); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: PRINTF(cb, "cma detected device removal!!!!\n"); break; default: PRINTF(cb, "oof bad type!\n"); wake_up_interruptible(&cb->sem); break; } return 0; } static int server_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len); return -1; } cb->remote_rkey = ntohl(cb->recv_buf.rkey); cb->remote_addr = ntohll(cb->recv_buf.buf); cb->remote_len = ntohl(cb->recv_buf.size); DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n", cb->remote_rkey, (unsigned long long)cb->remote_addr, cb->remote_len); if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE) cb->state = RDMA_READ_ADV; else cb->state = RDMA_WRITE_ADV; return 0; } static int client_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len); return -1; } if (cb->state == RDMA_READ_ADV) cb->state = RDMA_WRITE_ADV; else cb->state = RDMA_WRITE_COMPLETE; return 0; } static void krping_cq_event_handler(struct ib_cq *cq, void *ctx) { struct krping_cb *cb = ctx; struct ib_wc wc; struct ib_recv_wr *bad_wr; int ret; BUG_ON(cb->cq != cq); if (cb->state == ERROR) { PRINTF(cb, "cq completion in ERROR state\n"); return; } if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { if (wc.status) { if (wc.status == IB_WC_WR_FLUSH_ERR) { DEBUG_LOG(cb, "cq flushed\n"); continue; } else { PRINTF(cb, "cq completion failed with " "wr_id %jx status %d opcode %d vender_err %x\n", (uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err); goto error; } } switch (wc.opcode) { case IB_WC_SEND: DEBUG_LOG(cb, "send completion\n"); cb->stats.send_bytes += cb->send_sgl.length; cb->stats.send_msgs++; break; case IB_WC_RDMA_WRITE: DEBUG_LOG(cb, "rdma write completion\n"); cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length; cb->stats.write_msgs++; cb->state = RDMA_WRITE_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RDMA_READ: DEBUG_LOG(cb, "rdma read completion\n"); cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length; cb->stats.read_msgs++; cb->state = RDMA_READ_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RECV: DEBUG_LOG(cb, "recv completion\n"); cb->stats.recv_bytes += sizeof(cb->recv_buf); cb->stats.recv_msgs++; if (cb->wlat || cb->rlat || cb->bw || cb->frtest) ret = server_recv(cb, &wc); else ret = cb->server ? server_recv(cb, &wc) : client_recv(cb, &wc); if (ret) { PRINTF(cb, "recv wc error: %d\n", ret); goto error; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "post recv error: %d\n", ret); goto error; } wake_up_interruptible(&cb->sem); break; default: PRINTF(cb, "%s:%d Unexpected opcode %d, Shutting down\n", __func__, __LINE__, wc.opcode); goto error; } } if (ret) { PRINTF(cb, "poll error %d\n", ret); goto error; } return; error: cb->state = ERROR; wake_up_interruptible(&cb->sem); } static int krping_accept(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; DEBUG_LOG(cb, "accepting client connection request\n"); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; ret = rdma_accept(cb->child_cm_id, &conn_param); if (ret) { PRINTF(cb, "rdma_accept error: %d\n", ret); return ret; } if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { PRINTF(cb, "wait for CONNECTED state %d\n", cb->state); return -1; } } return 0; } static void krping_setup_wr(struct krping_cb *cb) { cb->recv_sgl.addr = cb->recv_dma_addr; cb->recv_sgl.length = sizeof cb->recv_buf; if (cb->local_dma_lkey) cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey; else if (cb->mem == DMA) cb->recv_sgl.lkey = cb->dma_mr->lkey; else cb->recv_sgl.lkey = cb->recv_mr->lkey; cb->rq_wr.sg_list = &cb->recv_sgl; cb->rq_wr.num_sge = 1; cb->send_sgl.addr = cb->send_dma_addr; cb->send_sgl.length = sizeof cb->send_buf; if (cb->local_dma_lkey) cb->send_sgl.lkey = cb->qp->device->local_dma_lkey; else if (cb->mem == DMA) cb->send_sgl.lkey = cb->dma_mr->lkey; else cb->send_sgl.lkey = cb->send_mr->lkey; cb->sq_wr.opcode = IB_WR_SEND; cb->sq_wr.send_flags = IB_SEND_SIGNALED; cb->sq_wr.sg_list = &cb->send_sgl; cb->sq_wr.num_sge = 1; if (cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { cb->rdma_sgl.addr = cb->rdma_dma_addr; if (cb->mem == MR) cb->rdma_sgl.lkey = cb->rdma_mr->lkey; cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; cb->rdma_sq_wr.num_sge = 1; } switch(cb->mem) { case FASTREG: /* * A chain of 2 WRs, INVALDATE_MR + FAST_REG_MR. * both unsignaled. The client uses them to reregister * the rdma buffers with a new key each iteration. */ cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR; cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; cb->fastreg_wr.wr.fast_reg.length = cb->size; cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list; cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len; cb->invalidate_wr.next = &cb->fastreg_wr; cb->invalidate_wr.opcode = IB_WR_LOCAL_INV; break; case MW: cb->bind_attr.wr_id = 0xabbaabba; cb->bind_attr.send_flags = 0; /* unsignaled */ #ifdef BIND_INFO cb->bind_attr.bind_info.length = cb->size; #else cb->bind_attr.length = cb->size; #endif break; default: break; } } static int krping_setup_buffers(struct krping_cb *cb) { int ret; struct ib_phys_buf buf; u64 iovbase; DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb); - cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device, + cb->recv_dma_addr = ib_dma_map_single(cb->pd->device, &cb->recv_buf, sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr); - cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device, + cb->send_dma_addr = ib_dma_map_single(cb->pd->device, &cb->send_buf, sizeof(cb->send_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr); if (cb->mem == DMA) { cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE| IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE); if (IS_ERR(cb->dma_mr)) { DEBUG_LOG(cb, "reg_dmamr failed\n"); ret = PTR_ERR(cb->dma_mr); goto bail; } } else { if (!cb->local_dma_lkey) { buf.addr = cb->recv_dma_addr; buf.size = sizeof cb->recv_buf; DEBUG_LOG(cb, "recv buf dma_addr %jx size %d\n", (uintmax_t)buf.addr, (int)buf.size); iovbase = cb->recv_dma_addr; cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, IB_ACCESS_LOCAL_WRITE, &iovbase); if (IS_ERR(cb->recv_mr)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->recv_mr); goto bail; } buf.addr = cb->send_dma_addr; buf.size = sizeof cb->send_buf; DEBUG_LOG(cb, "send buf dma_addr %jx size %d\n", (uintmax_t)buf.addr, (int)buf.size); iovbase = cb->send_dma_addr; cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 0, &iovbase); if (IS_ERR(cb->send_mr)) { DEBUG_LOG(cb, "send_buf reg_mr failed\n"); ret = PTR_ERR(cb->send_mr); goto bail; } } } cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL); if (!cb->rdma_buf) { DEBUG_LOG(cb, "rdma_buf malloc failed\n"); ret = -ENOMEM; goto bail; } - cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device, + cb->rdma_dma_addr = ib_dma_map_single(cb->pd->device, cb->rdma_buf, cb->size, DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr); if (cb->mem != DMA) { switch (cb->mem) { case FASTREG: cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; cb->page_list = ib_alloc_fast_reg_page_list( cb->pd->device, cb->page_list_len); if (IS_ERR(cb->page_list)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->page_list); goto bail; } cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd, cb->page_list->max_page_list_len); if (IS_ERR(cb->fastreg_mr)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->fastreg_mr); goto bail; } DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p" " page_list_len %u\n", cb->fastreg_mr->rkey, cb->page_list, cb->page_list_len); break; case MW: cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1); if (IS_ERR(cb->mw)) { DEBUG_LOG(cb, "recv_buf alloc_mw failed\n"); ret = PTR_ERR(cb->mw); goto bail; } DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey); /*FALLTHROUGH*/ case MR: buf.addr = cb->rdma_dma_addr; buf.size = cb->size; iovbase = cb->rdma_dma_addr; cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, IB_ACCESS_LOCAL_WRITE| IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE, &iovbase); if (IS_ERR(cb->rdma_mr)) { DEBUG_LOG(cb, "rdma_buf reg_mr failed\n"); ret = PTR_ERR(cb->rdma_mr); goto bail; } DEBUG_LOG(cb, "rdma buf dma_addr %jx size %d mr rkey 0x%x\n", (uintmax_t)buf.addr, (int)buf.size, cb->rdma_mr->rkey); break; default: ret = -EINVAL; goto bail; break; } } if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { cb->start_buf = kmalloc(cb->size, GFP_KERNEL); if (!cb->start_buf) { DEBUG_LOG(cb, "start_buf malloc failed\n"); ret = -ENOMEM; goto bail; } - cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device, + cb->start_dma_addr = ib_dma_map_single(cb->pd->device, cb->start_buf, cb->size, DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr); if (cb->mem == MR || cb->mem == MW) { unsigned flags = IB_ACCESS_REMOTE_READ; if (cb->wlat || cb->rlat || cb->bw || cb->frtest) { flags |= IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; } buf.addr = cb->start_dma_addr; buf.size = cb->size; DEBUG_LOG(cb, "start buf dma_addr %jx size %d\n", (uintmax_t)buf.addr, (int)buf.size); iovbase = cb->start_dma_addr; cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, flags, &iovbase); if (IS_ERR(cb->start_mr)) { DEBUG_LOG(cb, "start_buf reg_mr failed\n"); ret = PTR_ERR(cb->start_mr); goto bail; } } } krping_setup_wr(cb); DEBUG_LOG(cb, "allocated & registered buffers...\n"); return 0; bail: if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr)) ib_dereg_mr(cb->fastreg_mr); if (cb->mw && !IS_ERR(cb->mw)) ib_dealloc_mw(cb->mw); if (cb->rdma_mr && !IS_ERR(cb->rdma_mr)) ib_dereg_mr(cb->rdma_mr); if (cb->page_list && !IS_ERR(cb->page_list)) ib_free_fast_reg_page_list(cb->page_list); if (cb->dma_mr && !IS_ERR(cb->dma_mr)) ib_dereg_mr(cb->dma_mr); if (cb->recv_mr && !IS_ERR(cb->recv_mr)) ib_dereg_mr(cb->recv_mr); if (cb->send_mr && !IS_ERR(cb->send_mr)) ib_dereg_mr(cb->send_mr); if (cb->rdma_buf) kfree(cb->rdma_buf); if (cb->start_buf) kfree(cb->start_buf); return ret; } static void krping_free_buffers(struct krping_cb *cb) { DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb); if (cb->dma_mr) ib_dereg_mr(cb->dma_mr); if (cb->send_mr) ib_dereg_mr(cb->send_mr); if (cb->recv_mr) ib_dereg_mr(cb->recv_mr); if (cb->rdma_mr) ib_dereg_mr(cb->rdma_mr); if (cb->start_mr) ib_dereg_mr(cb->start_mr); if (cb->fastreg_mr) ib_dereg_mr(cb->fastreg_mr); if (cb->mw) ib_dealloc_mw(cb->mw); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, recv_mapping), sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, send_mapping), sizeof(cb->send_buf), DMA_BIDIRECTIONAL); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, rdma_mapping), cb->size, DMA_BIDIRECTIONAL); kfree(cb->rdma_buf); if (cb->start_buf) { dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, start_mapping), cb->size, DMA_BIDIRECTIONAL); kfree(cb->start_buf); } } static int krping_create_qp(struct krping_cb *cb) { struct ib_qp_init_attr init_attr; int ret; memset(&init_attr, 0, sizeof(init_attr)); init_attr.cap.max_send_wr = cb->txdepth; init_attr.cap.max_recv_wr = 2; init_attr.cap.max_recv_sge = 1; init_attr.cap.max_send_sge = 1; init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = cb->cq; init_attr.recv_cq = cb->cq; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; if (cb->server) { ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->child_cm_id->qp; } else { ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->cm_id->qp; } return ret; } static void krping_free_qp(struct krping_cb *cb) { ib_destroy_qp(cb->qp); ib_destroy_cq(cb->cq); ib_dealloc_pd(cb->pd); } static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id) { int ret; cb->pd = ib_alloc_pd(cm_id->device); if (IS_ERR(cb->pd)) { PRINTF(cb, "ib_alloc_pd failed\n"); return PTR_ERR(cb->pd); } DEBUG_LOG(cb, "created pd %p\n", cb->pd); strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name)); cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, cb, cb->txdepth * 2, 0); if (IS_ERR(cb->cq)) { PRINTF(cb, "ib_create_cq failed\n"); ret = PTR_ERR(cb->cq); goto err1; } DEBUG_LOG(cb, "created cq %p\n", cb->cq); if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); if (ret) { PRINTF(cb, "ib_create_cq failed\n"); goto err2; } } ret = krping_create_qp(cb); if (ret) { PRINTF(cb, "krping_create_qp failed: %d\n", ret); goto err2; } DEBUG_LOG(cb, "created qp %p\n", cb->qp); return 0; err2: ib_destroy_cq(cb->cq); err1: ib_dealloc_pd(cb->pd); return ret; } /* * return the (possibly rebound) rkey for the rdma buffer. * FASTREG mode: invalidate and rebind via fastreg wr. * MW mode: rebind the MW. * other modes: just return the mr rkey. */ static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv) { u32 rkey = 0xffffffff; u64 p; struct ib_send_wr *bad_wr; int i; int ret; switch (cb->mem) { case FASTREG: cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey; /* * Update the fastreg key. */ ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key); cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey; /* * Update the fastreg WR with new buf info. */ if (buf == (u64)cb->start_dma_addr) cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ; else cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; cb->fastreg_wr.wr.fast_reg.iova_start = buf; p = (u64)(buf & PAGE_MASK); for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len; i++, p += PAGE_SIZE) { cb->page_list->page_list[i] = p; DEBUG_LOG(cb, "page_list[%d] 0x%jx\n", i, (uintmax_t)p); } DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u" " iova_start %jx page_list_len %u\n", post_inv, cb->fastreg_wr.wr.fast_reg.rkey, cb->fastreg_wr.wr.fast_reg.page_shift, (unsigned)cb->fastreg_wr.wr.fast_reg.length, (uintmax_t)cb->fastreg_wr.wr.fast_reg.iova_start, cb->fastreg_wr.wr.fast_reg.page_list_len); if (post_inv) ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr); else ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); cb->state = ERROR; } rkey = cb->fastreg_mr->rkey; break; case MW: /* * Update the MW with new buf info. */ if (buf == (u64)cb->start_dma_addr) { #ifdef BIND_INFO cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ; cb->bind_attr.bind_info.mr = cb->start_mr; #else cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ; cb->bind_attr.mr = cb->start_mr; #endif } else { #ifdef BIND_INFO cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE; cb->bind_attr.bind_info.mr = cb->rdma_mr; #else cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE; cb->bind_attr.mr = cb->rdma_mr; #endif } #ifdef BIND_INFO cb->bind_attr.bind_info.addr = buf; #else cb->bind_attr.addr = buf; #endif DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %jx mr rkey 0x%x\n", #ifdef BIND_INFO cb->mw->rkey, (uintmax_t)buf, cb->bind_attr.bind_info.mr->rkey); #else cb->mw->rkey, buf, cb->bind_attr.mr->rkey); #endif ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr); if (ret) { PRINTF(cb, "bind mw error %d\n", ret); cb->state = ERROR; } else rkey = cb->mw->rkey; break; case MR: if (buf == (u64)cb->start_dma_addr) rkey = cb->start_mr->rkey; else rkey = cb->rdma_mr->rkey; break; case DMA: rkey = cb->dma_mr->rkey; break; default: PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__); cb->state = ERROR; break; } return rkey; } static void krping_format_send(struct krping_cb *cb, u64 buf) { struct krping_rdma_info *info = &cb->send_buf; u32 rkey; /* * Client side will do fastreg or mw bind before * advertising the rdma buffer. Server side * sends have no data. */ if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate); info->buf = htonll(buf); info->rkey = htonl(rkey); info->size = htonl(cb->size); DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n", (unsigned long long)buf, rkey, cb->size); } } static void krping_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr, inv; int ret; while (1) { /* Wait for client's Start STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV); if (cb->state != RDMA_READ_ADV) { PRINTF(cb, "wait for RDMA_READ_ADV state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received sink adv\n"); cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->remote_len; cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1); /* Issue RDMA Read. */ if (cb->read_inv) cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV; else { cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; if (cb->mem == FASTREG) { /* * Immediately follow the read with a * fenced LOCAL_INV. */ cb->rdma_sq_wr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.ex.invalidate_rkey = cb->fastreg_mr->rkey; inv.send_flags = IB_SEND_FENCE; } } ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } cb->rdma_sq_wr.next = NULL; DEBUG_LOG(cb, "server posted rdma read req \n"); /* Wait for read completion */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_COMPLETE); if (cb->state != RDMA_READ_COMPLETE) { PRINTF(cb, "wait for RDMA_READ_COMPLETE state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received read complete\n"); /* Display data in recv buf */ if (cb->verbose) { if (strlen(cb->rdma_buf) > 128) { char msgbuf[128]; strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); PRINTF(cb, "server ping data stripped: %s\n", msgbuf); } else PRINTF(cb, "server ping data: %s\n", cb->rdma_buf); } /* Tell client to continue */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } DEBUG_LOG(cb, "server posted go ahead\n"); /* Wait for client's RDMA STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { PRINTF(cb, "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received sink adv\n"); /* RDMA Write echo data */ cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1; if (cb->local_dma_lkey) cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey; else cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0); DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n", cb->rdma_sq_wr.sg_list->lkey, (unsigned long long)cb->rdma_sq_wr.sg_list->addr, cb->rdma_sq_wr.sg_list->length); ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for completion */ ret = wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { PRINTF(cb, "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } DEBUG_LOG(cb, "server rdma write complete \n"); cb->state = CONNECTED; /* Tell client to begin again */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } DEBUG_LOG(cb, "server posted go ahead\n"); } } static void rlat_test(struct krping_cb *cb) { int scnt; int iters = cb->count; struct timeval start_tv, stop_tv; int ret; struct ib_wc wc; struct ib_send_wr *bad_wr; int ne; scnt = 0; cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; microtime(&start_tv); if (!cb->poll) { cb->state = RDMA_READ_ADV; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } while (scnt < iters) { cb->state = RDMA_READ_ADV; ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "Couldn't post send: ret=%d scnt %d\n", ret, scnt); return; } do { if (!cb->poll) { wait_event_interruptible(cb->sem, cb->state != RDMA_READ_ADV); if (cb->state == RDMA_READ_COMPLETE) { ne = 1; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } else { ne = -1; } } else ne = ib_poll_cq(cb->cq, 1, &wc); if (cb->state == ERROR) { PRINTF(cb, "state == ERROR...bailing scnt %d\n", scnt); return; } } while (ne == 0); if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (cb->poll && wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } ++scnt; } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size); } static void wlat_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters=cb->count; volatile char *poll_buf = (char *) cb->start_buf; char *buf = (char *)cb->rdma_buf; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start, *post_cycles_stop; cycles_t *poll_cycles_start, *poll_cycles_stop; cycles_t *last_poll_cycles_start; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters || rcnt < iters) { /* Wait till buffer changes. */ if (rcnt < iters && !(scnt < 1 && !cb->server)) { ++rcnt; while (*poll_buf != (char)rcnt) { if (cb->state == ERROR) { PRINTF(cb, "state = ERROR, bailing\n"); return; } } } if (scnt < iters) { struct ib_send_wr *bad_wr; *buf = (char)scnt+1; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send: scnt=%d\n", scnt); return; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); scnt++; } if (ccnt < iters) { struct ib_wc wc; int ne; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ++ccnt; if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); PRINTF(cb, "scnt=%d, rcnt=%d, ccnt=%d\n", scnt, rcnt, ccnt); return; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void bw_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters=cb->count; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start, *post_cycles_stop; cycles_t *poll_cycles_start, *poll_cycles_stop; cycles_t *last_poll_cycles_start; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters) { while (scnt < iters && scnt - ccnt < cb->txdepth) { struct ib_send_wr *bad_wr; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send: scnt=%d\n", scnt); return; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); ++scnt; } if (ccnt < iters) { int ne; struct ib_wc wc; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ccnt += 1; if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void krping_rlat_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_wlat_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } wlat_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_bw_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } if (cb->duplex) bw_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static int fastreg_supported(struct krping_cb *cb, int server) { struct ib_device *dev = server?cb->child_cm_id->device: cb->cm_id->device; struct ib_device_attr attr; int ret; ret = ib_query_device(dev, &attr); if (ret) { PRINTF(cb, "ib_query_device failed ret %d\n", ret); return 0; } if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%llx\n", (unsigned long long)attr.device_cap_flags); return 0; } DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%jx\n", (uintmax_t)attr.device_cap_flags); return 1; } static int krping_bind_server(struct krping_cb *cb) { struct sockaddr_in sin; int ret; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = cb->addr.s_addr; sin.sin_port = cb->port; ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin); if (ret) { PRINTF(cb, "rdma_bind_addr error %d\n", ret); return ret; } DEBUG_LOG(cb, "rdma_bind_addr successful\n"); DEBUG_LOG(cb, "rdma_listen\n"); ret = rdma_listen(cb->cm_id, 3); if (ret) { PRINTF(cb, "rdma_listen failed: %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST); if (cb->state != CONNECT_REQUEST) { PRINTF(cb, "wait for CONNECT_REQUEST state %d\n", cb->state); return -1; } if (cb->mem == FASTREG && !fastreg_supported(cb, 1)) return -EINVAL; return 0; } /* * sq-depth worth of fastreg + 0B read-inv pairs, reposting them as the reads * complete. * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. */ static void krping_fr_test5(struct krping_cb *cb) { struct ib_fast_reg_page_list **pl; struct ib_send_wr *fr, *read, *bad; struct ib_wc wc; struct ib_sge *sgl; u8 key = 0; struct ib_mr **mr; u8 **buf; dma_addr_t *dma_addr; int i; int ret; int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; time_t start; int count = 0; int scnt; int depth = cb->txdepth >> 1; if (!depth) { PRINTF(cb, "txdepth must be > 1 for this test!\n"); return; } pl = kzalloc(sizeof *pl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth); mr = kzalloc(sizeof *mr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth); fr = kzalloc(sizeof *fr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth); sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth); read = kzalloc(sizeof *read * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, read, sizeof *read * depth); buf = kzalloc(sizeof *buf * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth); dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth); if (!pl || !mr || !fr || !read || !sgl || !buf || !dma_addr) { PRINTF(cb, "kzalloc failed\n"); goto err1; } for (scnt = 0; scnt < depth; scnt++) { pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl[scnt])) { PRINTF(cb, "alloc_fr_page_list failed %ld\n", PTR_ERR(pl[scnt])); goto err2; } DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]); mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr[scnt])) { PRINTF(cb, "alloc_fr failed %ld\n", PTR_ERR(mr[scnt])); goto err2; } DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]); ib_update_fast_reg_key(mr[scnt], ++key); buf[scnt] = kmalloc(cb->size, GFP_KERNEL); if (!buf[scnt]) { PRINTF(cb, "kmalloc failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]); - dma_addr[scnt] = dma_map_single(cb->pd->device->dma_device, + dma_addr[scnt] = ib_dma_map_single(cb->pd->device, buf[scnt], cb->size, DMA_BIDIRECTIONAL); if (dma_mapping_error(cb->pd->device->dma_device, dma_addr[scnt])) { PRINTF(cb, "dma_map failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]); for (i=0; ipage_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE); DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n", __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]); } sgl[scnt].lkey = mr[scnt]->rkey; sgl[scnt].length = cb->size; sgl[scnt].addr = (u64)buf[scnt]; DEBUG_LOG(cb, "%s sgl[%u].lkey 0x%x length %u addr 0x%jx\n", __func__, scnt, sgl[scnt].lkey, sgl[scnt].length, (uintmax_t)sgl[scnt].addr); fr[scnt].opcode = IB_WR_FAST_REG_MR; fr[scnt].wr_id = scnt; fr[scnt].send_flags = 0; fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT; fr[scnt].wr.fast_reg.length = cb->size; fr[scnt].wr.fast_reg.page_list = pl[scnt]; fr[scnt].wr.fast_reg.page_list_len = plen; fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt]; fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey; fr[scnt].next = &read[scnt]; read[scnt].opcode = IB_WR_RDMA_READ_WITH_INV; read[scnt].wr_id = scnt; read[scnt].send_flags = IB_SEND_SIGNALED; read[scnt].wr.rdma.rkey = cb->remote_rkey; read[scnt].wr.rdma.remote_addr = cb->remote_addr; read[scnt].num_sge = 1; read[scnt].sg_list = &sgl[scnt]; ret = ib_post_send(cb->qp, &fr[scnt], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } start = time_uptime; DEBUG_LOG(cb, "%s starting IO.\n", __func__); while (!cb->count || cb->server || count < cb->count) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__, count); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, 1); if (cb->state == ERROR) break; start = time_uptime; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u wr_id %ju " "opcode %d\n", wc.status, (uintmax_t)wc.wr_id, wc.opcode); goto err2; } count++; if (count == cb->count) break; ib_update_fast_reg_key(mr[wc.wr_id], ++key); fr[wc.wr_id].wr.fast_reg.rkey = mr[wc.wr_id]->rkey; sgl[wc.wr_id].lkey = mr[wc.wr_id]->rkey; ret = ib_post_send(cb->qp, &fr[wc.wr_id], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } DEBUG_LOG(cb, "%s done!\n", __func__); err2: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u " "opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mrs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (mr[scnt]) { ib_dereg_mr(mr[scnt]); DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]); } } DEBUG_LOG(cb, "unmapping/freeing bufs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (buf[scnt]) { dma_unmap_single(cb->pd->device->dma_device, dma_addr[scnt], cb->size, DMA_BIDIRECTIONAL); kfree(buf[scnt]); DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]); } } DEBUG_LOG(cb, "destroying fr page lists!\n"); for (scnt = 0; scnt < depth; scnt++) { if (pl[scnt]) { DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]); ib_free_fast_reg_page_list(pl[scnt]); } } err1: if (pl) kfree(pl); if (mr) kfree(mr); if (fr) kfree(fr); if (read) kfree(read); if (sgl) kfree(sgl); if (buf) kfree(buf); if (dma_addr) kfree(dma_addr); } static void krping_fr_test_server(struct krping_cb *cb) { DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_fr_test5_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } if (cb->duplex) krping_fr_test5(cb); DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_fr_test5_client(struct krping_cb *cb) { struct ib_send_wr *bad; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to server */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); return krping_fr_test5(cb); } /* * sq-depth worth of write + fastreg + inv, reposting them as the invs * complete. * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. * If a count is given, then the last IO will have a bogus lkey in the * write work request. This reproduces a fw bug where the connection * will get stuck if a fastreg is processed while the ulptx is failing * the bad write. */ static void krping_fr_test6(struct krping_cb *cb) { struct ib_fast_reg_page_list **pl; struct ib_send_wr *fr, *write, *inv, *bad; struct ib_wc wc; struct ib_sge *sgl; u8 key = 0; struct ib_mr **mr; u8 **buf; dma_addr_t *dma_addr; int i; int ret; int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; unsigned long start; int count = 0; int scnt; int depth = cb->txdepth / 3; if (!depth) { PRINTF(cb, "txdepth must be > 3 for this test!\n"); return; } pl = kzalloc(sizeof *pl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth); mr = kzalloc(sizeof *mr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth); fr = kzalloc(sizeof *fr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth); sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth); write = kzalloc(sizeof *write * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, write, sizeof *write * depth); inv = kzalloc(sizeof *inv * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s inv %p size %zu\n", __func__, inv, sizeof *inv * depth); buf = kzalloc(sizeof *buf * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth); dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth); if (!pl || !mr || !fr || !write || !sgl || !buf || !dma_addr) { PRINTF(cb, "kzalloc failed\n"); goto err1; } for (scnt = 0; scnt < depth; scnt++) { pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl[scnt])) { PRINTF(cb, "alloc_fr_page_list failed %ld\n", PTR_ERR(pl[scnt])); goto err2; } DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]); mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr[scnt])) { PRINTF(cb, "alloc_fr failed %ld\n", PTR_ERR(mr[scnt])); goto err2; } DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]); ib_update_fast_reg_key(mr[scnt], ++key); buf[scnt] = kmalloc(cb->size, GFP_KERNEL); if (!buf[scnt]) { PRINTF(cb, "kmalloc failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]); - dma_addr[scnt] = dma_map_single(cb->pd->device->dma_device, + dma_addr[scnt] = ib_dma_map_single(cb->pd->device, buf[scnt], cb->size, DMA_BIDIRECTIONAL); if (dma_mapping_error(cb->pd->device->dma_device, dma_addr[scnt])) { PRINTF(cb, "dma_map failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]); for (i=0; ipage_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE); DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n", __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]); } write[scnt].opcode = IB_WR_RDMA_WRITE; write[scnt].wr_id = scnt; write[scnt].wr.rdma.rkey = cb->remote_rkey; write[scnt].wr.rdma.remote_addr = cb->remote_addr; write[scnt].num_sge = 1; write[scnt].sg_list = &cb->rdma_sgl; write[scnt].sg_list->length = cb->size; write[scnt].next = &fr[scnt]; fr[scnt].opcode = IB_WR_FAST_REG_MR; fr[scnt].wr_id = scnt; fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT; fr[scnt].wr.fast_reg.length = cb->size; fr[scnt].wr.fast_reg.page_list = pl[scnt]; fr[scnt].wr.fast_reg.page_list_len = plen; fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt]; fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey; fr[scnt].next = &inv[scnt]; inv[scnt].opcode = IB_WR_LOCAL_INV; inv[scnt].send_flags = IB_SEND_SIGNALED; inv[scnt].ex.invalidate_rkey = mr[scnt]->rkey; ret = ib_post_send(cb->qp, &write[scnt], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } start = time_uptime; DEBUG_LOG(cb, "%s starting IO.\n", __func__); while (!cb->count || cb->server || count < cb->count) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__, count); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, 1); if (cb->state == ERROR) break; start = time_uptime; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u wr_id %ju " "opcode %d\n", wc.status, (uintmax_t)wc.wr_id, wc.opcode); goto err2; } count++; if (count == (cb->count -1)) cb->rdma_sgl.lkey = 0x00dead; if (count == cb->count) break; ib_update_fast_reg_key(mr[wc.wr_id], ++key); fr[wc.wr_id].wr.fast_reg.rkey = mr[wc.wr_id]->rkey; inv[wc.wr_id].ex.invalidate_rkey = mr[wc.wr_id]->rkey; ret = ib_post_send(cb->qp, &write[wc.wr_id], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } else if (krping_sigpending()){ PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } DEBUG_LOG(cb, "%s done!\n", __func__); err2: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u " "opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mrs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (mr[scnt]) { ib_dereg_mr(mr[scnt]); DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]); } } DEBUG_LOG(cb, "unmapping/freeing bufs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (buf[scnt]) { dma_unmap_single(cb->pd->device->dma_device, dma_addr[scnt], cb->size, DMA_BIDIRECTIONAL); kfree(buf[scnt]); DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]); } } DEBUG_LOG(cb, "destroying fr page lists!\n"); for (scnt = 0; scnt < depth; scnt++) { if (pl[scnt]) { DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]); ib_free_fast_reg_page_list(pl[scnt]); } } err1: if (pl) kfree(pl); if (mr) kfree(mr); if (fr) kfree(fr); if (write) kfree(write); if (inv) kfree(inv); if (sgl) kfree(sgl); if (buf) kfree(buf); if (dma_addr) kfree(dma_addr); } static void krping_fr_test6_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } if (cb->duplex) krping_fr_test6(cb); DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_fr_test6_client(struct krping_cb *cb) { struct ib_send_wr *bad; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to server */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); return krping_fr_test6(cb); } static void krping_run_server(struct krping_cb *cb) { struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_server(cb); if (ret) return; ret = krping_setup_qp(cb, cb->child_cm_id); if (ret) { PRINTF(cb, "setup_qp failed: %d\n", ret); goto err0; } ret = krping_setup_buffers(cb); if (ret) { PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_accept(cb); if (ret) { PRINTF(cb, "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_server(cb); else if (cb->rlat) krping_rlat_test_server(cb); else if (cb->bw) krping_bw_test_server(cb); else if (cb->frtest) { switch (cb->testnum) { case 1: case 2: case 3: case 4: krping_fr_test_server(cb); break; case 5: krping_fr_test5_server(cb); break; case 6: krping_fr_test6_server(cb); break; default: PRINTF(cb, "unknown fr test %d\n", cb->testnum); goto err2; break; } } else krping_test_server(cb); rdma_disconnect(cb->child_cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); err0: rdma_destroy_id(cb->child_cm_id); } static void krping_test_client(struct krping_cb *cb) { int ping, start, cc, i, ret; struct ib_send_wr *bad_wr; unsigned char c; start = 65; for (ping = 0; !cb->count || ping < cb->count; ping++) { cb->state = RDMA_READ_ADV; /* Put some ascii text in the buffer. */ cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); for (i = cc, c = start; i < cb->size; i++) { cb->start_buf[i] = c; c++; if (c > 122) c = 65; } start++; if (start > 122) start = 65; cb->start_buf[cb->size - 1] = 0; krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); break; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for server to ACK */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { PRINTF(cb, "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } krping_format_send(cb, cb->rdma_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for the server to say the RDMA Write is complete. */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { PRINTF(cb, "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } if (cb->validate) if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { PRINTF(cb, "data mismatch!\n"); break; } if (cb->verbose) { if (strlen(cb->rdma_buf) > 128) { char msgbuf[128]; strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); PRINTF(cb, "ping data stripped: %s\n", msgbuf); } else PRINTF(cb, "ping data: %s\n", cb->rdma_buf); } #ifdef SLOW_KRPING wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); #endif } } static void krping_rlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } #if 0 { int i; struct timeval start, stop; time_t sec; suseconds_t usec; unsigned long long elapsed; struct ib_wc wc; struct ib_send_wr *bad_wr; int ne; cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = 0; cb->rdma_sq_wr.num_sge = 0; microtime(&start); for (i=0; i < 100000; i++) { if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send\n"); return; } do { ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } microtime(&stop); if (stop.tv_usec < start.tv_usec) { stop.tv_usec += 1000000; stop.tv_sec -= 1; } sec = stop.tv_sec - start.tv_sec; usec = stop.tv_usec - start.tv_usec; elapsed = sec * 1000000 + usec; PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed); } #endif rlat_test(cb); } static void krping_wlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } wlat_test(cb); } static void krping_bw_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } bw_test(cb); } /* * fastreg 2 valid different mrs and verify the completions. */ static void krping_fr_test1(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, *bad; struct ib_wc wc; struct ib_mr *mr1, *mr2; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; int count = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr1)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); goto err1; } mr2 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr2)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); goto err2; } for (i=0; ipage_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr_id = 1; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.rkey = mr1->rkey; DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } fr.wr.fast_reg.rkey = mr2->rkey; DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err3; } if (ret == 1) { DEBUG_LOG(cb, "completion status %u wr %s\n", wc.status, wc.wr_id == 1 ? "fr" : "inv"); count++; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err3; } wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); } while (count != 2); err3: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mr2!\n"); ib_dereg_mr(mr2); err2: DEBUG_LOG(cb, "destroying fr mr1!\n"); ib_dereg_mr(mr1); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); } /* * fastreg the same mr twice, 2nd one should produce error cqe. */ static void krping_fr_test2(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, *bad; struct ib_wc wc; struct ib_mr *mr1; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; int count = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr1)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); goto err1; } for (i=0; ipage_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr_id = 1; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.rkey = mr1->rkey; DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err3; } if (ret == 1) { DEBUG_LOG(cb, "completion status %u wr %s\n", wc.status, wc.wr_id == 1 ? "fr" : "inv"); count++; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err3; } wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); } while (count != 2); err3: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mr1!\n"); ib_dereg_mr(mr1); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); } /* * fastreg pipelined in a loop as fast as we can until the user interrupts. * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. */ static void krping_fr_test3(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, inv, *bad; struct ib_wc wc; u8 key = 0; struct ib_mr *mr; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; unsigned long start; int count = 0; int scnt = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); goto err1; } for (i=0; ipage_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.send_flags = IB_SEND_SIGNALED; DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth); start = time_uptime; while (1) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); if (cb->state == ERROR) break; start = time_uptime; } while (scnt < (cb->txdepth>>1)) { ib_update_fast_reg_key(mr, ++key); fr.wr.fast_reg.rkey = mr->rkey; inv.ex.invalidate_rkey = mr->rkey; size = arc4random() % cb->size; if (size == 0) size = cb->size; plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list_len = plen; ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } scnt+=2; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u\n", wc.status); goto err2; } count++; scnt--; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } err2: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "fr_test: done!\n"); ib_dereg_mr(mr); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); } /* * fastreg 1 and invalidate 1 mr and verify completion. */ static void krping_fr_test4(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, inv, *bad; struct ib_wc wc; struct ib_mr *mr1; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; int count = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr1)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl)); goto err1; } for (i=0; ipage_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr_id = 1; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.rkey = mr1->rkey; fr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.ex.invalidate_rkey = mr1->rkey; DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err3; } if (ret == 1) { DEBUG_LOG(cb, "completion status %u wr %s\n", wc.status, wc.wr_id == 1 ? "fr" : "inv"); count++; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err3; } wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); } while (count != 1); err3: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mr1!\n"); ib_dereg_mr(mr1); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); } static void krping_fr_test(struct krping_cb *cb) { switch (cb->testnum) { case 1: krping_fr_test1(cb); break; case 2: krping_fr_test2(cb); break; case 3: krping_fr_test3(cb); break; case 4: krping_fr_test4(cb); break; case 5: krping_fr_test5_client(cb); break; case 6: krping_fr_test6_client(cb); break; default: PRINTF(cb, "Unkown frtest num %u\n", cb->testnum); break; } } static int krping_connect_client(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; conn_param.retry_count = 10; ret = rdma_connect(cb->cm_id, &conn_param); if (ret) { PRINTF(cb, "rdma_connect error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { PRINTF(cb, "wait for CONNECTED state %d\n", cb->state); return -1; } DEBUG_LOG(cb, "rdma_connect successful\n"); return 0; } static int krping_bind_client(struct krping_cb *cb) { struct sockaddr_in sin; int ret; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = cb->addr.s_addr; sin.sin_port = cb->port; ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin, 2000); if (ret) { PRINTF(cb, "rdma_resolve_addr error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED); if (cb->state != ROUTE_RESOLVED) { PRINTF(cb, "addr/route resolution did not resolve: state %d\n", cb->state); return -EINTR; } if (cb->mem == FASTREG && !fastreg_supported(cb, 0)) return -EINVAL; DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n"); return 0; } static void krping_run_client(struct krping_cb *cb) { struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_client(cb); if (ret) return; ret = krping_setup_qp(cb, cb->cm_id); if (ret) { PRINTF(cb, "setup_qp failed: %d\n", ret); return; } ret = krping_setup_buffers(cb); if (ret) { PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_connect_client(cb); if (ret) { PRINTF(cb, "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_client(cb); else if (cb->rlat) krping_rlat_test_client(cb); else if (cb->bw) krping_bw_test_client(cb); else if (cb->frtest) krping_fr_test(cb); else krping_test_client(cb); rdma_disconnect(cb->cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); } int krping_doit(char *cmd, void *cookie) { struct krping_cb *cb; int op; int ret = 0; char *optarg; unsigned long optint; cb = kzalloc(sizeof(*cb), GFP_KERNEL); if (!cb) return -ENOMEM; mutex_lock(&krping_mutex); list_add_tail(&cb->list, &krping_cbs); mutex_unlock(&krping_mutex); cb->cookie = cookie; cb->server = -1; cb->state = IDLE; cb->size = 64; cb->txdepth = RPING_SQ_DEPTH; cb->mem = DMA; init_waitqueue_head(&cb->sem); while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, &optint)) != 0) { switch (op) { case 'a': cb->addr_str = optarg; DEBUG_LOG(cb, "ipaddr (%s)\n", optarg); if (!inet_aton(optarg, &cb->addr)) { PRINTF(cb, "bad addr string %s\n", optarg); ret = EINVAL; } break; case 'p': cb->port = htons(optint); DEBUG_LOG(cb, "port %d\n", (int)optint); break; case 'P': cb->poll = 1; DEBUG_LOG(cb, "server\n"); break; case 's': cb->server = 1; DEBUG_LOG(cb, "server\n"); break; case 'c': cb->server = 0; DEBUG_LOG(cb, "client\n"); break; case 'S': cb->size = optint; if ((cb->size < 1) || (cb->size > RPING_BUFSIZE)) { PRINTF(cb, "Invalid size %d " "(valid range is 1 to %d)\n", cb->size, RPING_BUFSIZE); ret = EINVAL; } else DEBUG_LOG(cb, "size %d\n", (int)optint); break; case 'C': cb->count = optint; if (cb->count < 0) { PRINTF(cb, "Invalid count %d\n", cb->count); ret = EINVAL; } else DEBUG_LOG(cb, "count %d\n", (int) cb->count); break; case 'v': cb->verbose++; DEBUG_LOG(cb, "verbose\n"); break; case 'V': cb->validate++; DEBUG_LOG(cb, "validate data\n"); break; case 'l': cb->wlat++; break; case 'L': cb->rlat++; break; case 'B': cb->bw++; break; case 'd': cb->duplex++; break; case 'm': if (!strncmp(optarg, "dma", 3)) cb->mem = DMA; else if (!strncmp(optarg, "fastreg", 7)) cb->mem = FASTREG; else if (!strncmp(optarg, "mw", 2)) cb->mem = MW; else if (!strncmp(optarg, "mr", 2)) cb->mem = MR; else { PRINTF(cb, "unknown mem mode %s. " "Must be dma, fastreg, mw, or mr\n", optarg); ret = -EINVAL; break; } break; case 'I': cb->server_invalidate = 1; break; case 'T': cb->txdepth = optint; DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth); break; case 'Z': cb->local_dma_lkey = 1; DEBUG_LOG(cb, "using local dma lkey\n"); break; case 'R': cb->read_inv = 1; DEBUG_LOG(cb, "using read-with-inv\n"); break; case 'f': cb->frtest = 1; cb->testnum = optint; DEBUG_LOG(cb, "fast-reg test!\n"); break; default: PRINTF(cb, "unknown opt %s\n", optarg); ret = -EINVAL; break; } } if (ret) goto out; if (cb->server == -1) { PRINTF(cb, "must be either client or server\n"); ret = -EINVAL; goto out; } if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) { PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n"); ret = -EINVAL; goto out; } if (cb->server_invalidate && cb->mem != FASTREG) { PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n"); ret = -EINVAL; goto out; } if (cb->read_inv && cb->mem != FASTREG) { PRINTF(cb, "read_inv only valid with fastreg mem_mode\n"); ret = -EINVAL; goto out; } if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw || cb->frtest)) { PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n"); ret = -EINVAL; goto out; } cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cb->cm_id)) { ret = PTR_ERR(cb->cm_id); PRINTF(cb, "rdma_create_id error %d\n", ret); goto out; } DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id); if (cb->server) krping_run_server(cb); else krping_run_client(cb); DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id); rdma_destroy_id(cb->cm_id); out: mutex_lock(&krping_mutex); list_del(&cb->list); mutex_unlock(&krping_mutex); kfree(cb); return ret; } void krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg) { struct krping_cb *cb; mutex_lock(&krping_mutex); list_for_each_entry(cb, &krping_cbs, list) (*f)(cb->pd ? &cb->stats : NULL, arg); mutex_unlock(&krping_mutex); } void krping_init(void) { mutex_init(&krping_mutex); } Index: user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_sge.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_sge.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_sge.c (revision 307896) @@ -1,5259 +1,5242 @@ /*- * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_NETMAP #include #include #include #include #include #endif #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" #include "t4_l2t.h" #include "t4_mp_ring.h" #ifdef T4_PKT_TIMESTAMP #define RX_COPY_THRESHOLD (MINCLSIZE - 8) #else #define RX_COPY_THRESHOLD MINCLSIZE #endif /* * Ethernet frames are DMA'd at this byte offset into the freelist buffer. * 0-7 are valid values. */ static int fl_pktshift = 2; TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift); /* * Pad ethernet payload up to this boundary. * -1: driver should figure out a good value. * 0: disable padding. * Any power of 2 from 32 to 4096 (both inclusive) is also a valid value. */ int fl_pad = -1; TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad); /* * Status page length. * -1: driver should figure out a good value. * 64 or 128 are the only other valid values. */ static int spg_len = -1; TUNABLE_INT("hw.cxgbe.spg_len", &spg_len); /* * Congestion drops. * -1: no congestion feedback (not recommended). * 0: backpressure the channel instead of dropping packets right away. * 1: no backpressure, drop packets for the congested queue immediately. */ static int cong_drop = 0; TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop); /* * Deliver multiple frames in the same free list buffer if they fit. * -1: let the driver decide whether to enable buffer packing or not. * 0: disable buffer packing. * 1: enable buffer packing. */ static int buffer_packing = -1; TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing); /* * Start next frame in a packed buffer at this boundary. * -1: driver should figure out a good value. * T4: driver will ignore this and use the same value as fl_pad above. * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value. */ static int fl_pack = -1; TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack); /* * Allow the driver to create mbuf(s) in a cluster allocated for rx. * 0: never; always allocate mbufs from the zone_mbuf UMA zone. * 1: ok to create mbuf(s) within a cluster if there is room. */ static int allow_mbufs_in_cluster = 1; TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster); /* * Largest rx cluster size that the driver is allowed to allocate. */ static int largest_rx_cluster = MJUM16BYTES; TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster); /* * Size of cluster allocation that's most likely to succeed. The driver will * fall back to this size if it fails to allocate clusters larger than this. */ static int safest_rx_cluster = PAGE_SIZE; TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster); struct txpkts { u_int wr_type; /* type 0 or type 1 */ u_int npkt; /* # of packets in this work request */ u_int plen; /* total payload (sum of all packets) */ u_int len16; /* # of 16B pieces used by this work request */ }; /* A packet's SGL. This + m_pkthdr has all info needed for tx */ struct sgl { struct sglist sg; struct sglist_seg seg[TX_SGL_SEGS]; }; static int service_iq(struct sge_iq *, int); static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *); static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int); static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *); static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t, uint16_t, char *); static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *, bus_addr_t *, void **); static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t, void *); static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *, int, int); static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *); static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_fl *); static int alloc_fwq(struct adapter *); static int free_fwq(struct adapter *); static int alloc_mgmtq(struct adapter *); static int free_mgmtq(struct adapter *); static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int, struct sysctl_oid *); static int free_rxq(struct vi_info *, struct sge_rxq *); #ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int, struct sysctl_oid *); static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *); #endif #ifdef DEV_NETMAP static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int, struct sysctl_oid *); static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *); static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int, struct sysctl_oid *); static int free_nm_txq(struct vi_info *, struct sge_nm_txq *); #endif static int ctrl_eq_alloc(struct adapter *, struct sge_eq *); static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); #ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); #endif static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *); static int free_eq(struct adapter *, struct sge_eq *); static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *, struct sysctl_oid *); static int free_wrq(struct adapter *, struct sge_wrq *); static int alloc_txq(struct vi_info *, struct sge_txq *, int, struct sysctl_oid *); static int free_txq(struct vi_info *, struct sge_txq *); static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int); static inline void ring_fl_db(struct adapter *, struct sge_fl *); static int refill_fl(struct adapter *, struct sge_fl *, int); static void refill_sfl(void *); static int alloc_fl_sdesc(struct sge_fl *); static void free_fl_sdesc(struct adapter *, struct sge_fl *); static void find_best_refill_source(struct adapter *, struct sge_fl *, int); static void find_safe_refill_source(struct adapter *, struct sge_fl *); static void add_fl_to_sfl(struct adapter *, struct sge_fl *); static inline void get_pkt_gl(struct mbuf *, struct sglist *); static inline u_int txpkt_len16(u_int, u_int); static inline u_int txpkt_vm_len16(u_int, u_int); static inline u_int txpkts0_len16(u_int); static inline u_int txpkts1_len16(void); static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *, struct mbuf *, u_int); static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *, struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int); static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int); static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int); static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *, struct mbuf *, const struct txpkts *, u_int); static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int); static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int); static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int); static inline uint16_t read_hw_cidx(struct sge_eq *); static inline u_int reclaimable_tx_desc(struct sge_eq *); static inline u_int total_available_tx_desc(struct sge_eq *); static u_int reclaim_tx_descs(struct sge_txq *, u_int); static void tx_reclaim(void *, int); static __be64 get_flit(struct sglist_seg *, int, int); static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *, struct mbuf *); static int handle_fw_msg(struct sge_iq *, const struct rss_header *, struct mbuf *); static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *); static void wrq_tx_drain(void *, int); static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *); static int sysctl_uint16(SYSCTL_HANDLER_ARGS); static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS); static int sysctl_tc(SYSCTL_HANDLER_ARGS); static counter_u64_t extfree_refs; static counter_u64_t extfree_rels; an_handler_t t4_an_handler; fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES]; cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS]; static int an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl) { #ifdef INVARIANTS panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl); #else log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)\n", __func__, iq, ctrl); #endif return (EDOOFUS); } int t4_register_an_handler(an_handler_t h) { uintptr_t *loc, new; new = h ? (uintptr_t)h : (uintptr_t)an_not_handled; loc = (uintptr_t *) &t4_an_handler; atomic_store_rel_ptr(loc, new); return (0); } static int fw_msg_not_handled(struct adapter *sc, const __be64 *rpl) { const struct cpl_fw6_msg *cpl = __containerof(rpl, struct cpl_fw6_msg, data[0]); #ifdef INVARIANTS panic("%s: fw_msg type %d", __func__, cpl->type); #else log(LOG_ERR, "%s: fw_msg type %d\n", __func__, cpl->type); #endif return (EDOOFUS); } int t4_register_fw_msg_handler(int type, fw_msg_handler_t h) { uintptr_t *loc, new; if (type >= nitems(t4_fw_msg_handler)) return (EINVAL); /* * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL * handler dispatch table. Reject any attempt to install a handler for * this subtype. */ if (type == FW_TYPE_RSSCPL || type == FW6_TYPE_RSSCPL) return (EINVAL); new = h ? (uintptr_t)h : (uintptr_t)fw_msg_not_handled; loc = (uintptr_t *) &t4_fw_msg_handler[type]; atomic_store_rel_ptr(loc, new); return (0); } static int cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { #ifdef INVARIANTS panic("%s: opcode 0x%02x on iq %p with payload %p", __func__, rss->opcode, iq, m); #else log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p\n", __func__, rss->opcode, iq, m); m_freem(m); #endif return (EDOOFUS); } int t4_register_cpl_handler(int opcode, cpl_handler_t h) { uintptr_t *loc, new; if (opcode >= nitems(t4_cpl_handler)) return (EINVAL); new = h ? (uintptr_t)h : (uintptr_t)cpl_not_handled; loc = (uintptr_t *) &t4_cpl_handler[opcode]; atomic_store_rel_ptr(loc, new); return (0); } /* * Called on MOD_LOAD. Validates and calculates the SGE tunables. */ void t4_sge_modload(void) { int i; if (fl_pktshift < 0 || fl_pktshift > 7) { printf("Invalid hw.cxgbe.fl_pktshift value (%d)," " using 2 instead.\n", fl_pktshift); fl_pktshift = 2; } if (spg_len != 64 && spg_len != 128) { int len; #if defined(__i386__) || defined(__amd64__) len = cpu_clflush_line_size > 64 ? 128 : 64; #else len = 64; #endif if (spg_len != -1) { printf("Invalid hw.cxgbe.spg_len value (%d)," " using %d instead.\n", spg_len, len); } spg_len = len; } if (cong_drop < -1 || cong_drop > 1) { printf("Invalid hw.cxgbe.cong_drop value (%d)," " using 0 instead.\n", cong_drop); cong_drop = 0; } extfree_refs = counter_u64_alloc(M_WAITOK); extfree_rels = counter_u64_alloc(M_WAITOK); counter_u64_zero(extfree_refs); counter_u64_zero(extfree_rels); t4_an_handler = an_not_handled; for (i = 0; i < nitems(t4_fw_msg_handler); i++) t4_fw_msg_handler[i] = fw_msg_not_handled; for (i = 0; i < nitems(t4_cpl_handler); i++) t4_cpl_handler[i] = cpl_not_handled; t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg); t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg); t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update); t4_register_cpl_handler(CPL_RX_PKT, t4_eth_rx); t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl); t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl); } void t4_sge_modunload(void) { counter_u64_free(extfree_refs); counter_u64_free(extfree_rels); } uint64_t t4_sge_extfree_refs(void) { uint64_t refs, rels; rels = counter_u64_fetch(extfree_rels); refs = counter_u64_fetch(extfree_refs); return (refs - rels); } static inline void setup_pad_and_pack_boundaries(struct adapter *sc) { uint32_t v, m; int pad, pack, pad_shift; pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT : X_INGPADBOUNDARY_SHIFT; pad = fl_pad; if (fl_pad < (1 << pad_shift) || fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) || !powerof2(fl_pad)) { /* * If there is any chance that we might use buffer packing and * the chip is a T4, then pick 64 as the pad/pack boundary. Set * it to the minimum allowed in all other cases. */ pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift; /* * For fl_pad = 0 we'll still write a reasonable value to the * register but all the freelists will opt out of padding. * We'll complain here only if the user tried to set it to a * value greater than 0 that was invalid. */ if (fl_pad > 0) { device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value" " (%d), using %d instead.\n", fl_pad, pad); } } m = V_INGPADBOUNDARY(M_INGPADBOUNDARY); v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift); t4_set_reg_field(sc, A_SGE_CONTROL, m, v); if (is_t4(sc)) { if (fl_pack != -1 && fl_pack != pad) { /* Complain but carry on. */ device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored," " using %d instead.\n", fl_pack, pad); } return; } pack = fl_pack; if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 || !powerof2(fl_pack)) { pack = max(sc->params.pci.mps, CACHE_LINE_SIZE); MPASS(powerof2(pack)); if (pack < 16) pack = 16; if (pack == 32) pack = 64; if (pack > 4096) pack = 4096; if (fl_pack != -1) { device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value" " (%d), using %d instead.\n", fl_pack, pack); } } m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY); if (pack == 16) v = V_INGPACKBOUNDARY(0); else v = V_INGPACKBOUNDARY(ilog2(pack) - 5); MPASS(!is_t4(sc)); /* T4 doesn't have SGE_CONTROL2 */ t4_set_reg_field(sc, A_SGE_CONTROL2, m, v); } /* * adap->params.vpd.cclk must be set up before this is called. */ void t4_tweak_chip_settings(struct adapter *sc) { int i; uint32_t v, m; int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200}; int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk; int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */ uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); static int sge_flbuf_sizes[] = { MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, MJUMPAGESIZE - CL_METADATA_SIZE, MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE, #endif MJUM9BYTES, MJUM16BYTES, MCLBYTES - MSIZE - CL_METADATA_SIZE, MJUM9BYTES - CL_METADATA_SIZE, MJUM16BYTES - CL_METADATA_SIZE, }; KASSERT(sc->flags & MASTER_PF, ("%s: trying to change chip settings when not master.", __func__)); m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | V_EGRSTATUSPAGESIZE(spg_len == 128); t4_set_reg_field(sc, A_SGE_CONTROL, m, v); setup_pad_and_pack_boundaries(sc); v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10); t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v); KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES, ("%s: hw buffer size table too big", __func__)); for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) { t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i), sge_flbuf_sizes[i]); } v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) | V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]); t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v); KASSERT(intr_timer[0] <= timer_max, ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0], timer_max)); for (i = 1; i < nitems(intr_timer); i++) { KASSERT(intr_timer[i] >= intr_timer[i - 1], ("%s: timers not listed in increasing order (%d)", __func__, i)); while (intr_timer[i] > timer_max) { if (i == nitems(intr_timer) - 1) { intr_timer[i] = timer_max; break; } intr_timer[i] += intr_timer[i - 1]; intr_timer[i] /= 2; } } v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) | V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1])); t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v); v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) | V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3])); t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v); v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) | V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5])); t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v); /* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */ v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v); /* * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP. These have been * chosen with MAXPHYS = 128K in mind. The largest DDP buffer that we * may have to deal with is MAXPHYS + 1 page. */ v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4); t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v); /* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */ m = v = F_TDDPTAGTCB | F_ISCSITAGTCB; t4_set_reg_field(sc, A_ULP_RX_CTL, m, v); m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; t4_set_reg_field(sc, A_TP_PARA_REG5, m, v); } /* * SGE wants the buffer to be at least 64B and then a multiple of 16. If * padding is in use, the buffer's start and end need to be aligned to the pad * boundary as well. We'll just make sure that the size is a multiple of the * boundary here, it is up to the buffer allocation code to make sure the start * of the buffer is aligned as well. */ static inline int hwsz_ok(struct adapter *sc, int hwsz) { int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1; return (hwsz >= 64 && (hwsz & mask) == 0); } /* * XXX: driver really should be able to deal with unexpected settings. */ int t4_read_chip_settings(struct adapter *sc) { struct sge *s = &sc->sge; struct sge_params *sp = &sc->params.sge; int i, j, n, rc = 0; uint32_t m, v, r; uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); static int sw_buf_sizes[] = { /* Sorted by size */ MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, #endif MJUM9BYTES, MJUM16BYTES }; struct sw_zone_info *swz, *safe_swz; struct hw_buf_info *hwb; m = F_RXPKTCPLMODE; v = F_RXPKTCPLMODE; r = sc->params.sge.sge_control; if ((r & m) != v) { device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r); rc = EINVAL; } /* * If this changes then every single use of PAGE_SHIFT in the driver * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift. */ if (sp->page_shift != PAGE_SHIFT) { device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r); rc = EINVAL; } /* Filter out unusable hw buffer sizes entirely (mark with -2). */ hwb = &s->hw_buf_info[0]; for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) { r = sc->params.sge.sge_fl_buffer_size[i]; hwb->size = r; hwb->zidx = hwsz_ok(sc, r) ? -1 : -2; hwb->next = -1; } /* * Create a sorted list in decreasing order of hw buffer sizes (and so * increasing order of spare area) for each software zone. * * If padding is enabled then the start and end of the buffer must align * to the pad boundary; if packing is enabled then they must align with * the pack boundary as well. Allocations from the cluster zones are * aligned to min(size, 4K), so the buffer starts at that alignment and * ends at hwb->size alignment. If mbuf inlining is allowed the * starting alignment will be reduced to MSIZE and the driver will * exercise appropriate caution when deciding on the best buffer layout * to use. */ n = 0; /* no usable buffer size to begin with */ swz = &s->sw_zone_info[0]; safe_swz = NULL; for (i = 0; i < SW_ZONE_SIZES; i++, swz++) { int8_t head = -1, tail = -1; swz->size = sw_buf_sizes[i]; swz->zone = m_getzone(swz->size); swz->type = m_gettype(swz->size); if (swz->size < PAGE_SIZE) { MPASS(powerof2(swz->size)); if (fl_pad && (swz->size % sp->pad_boundary != 0)) continue; } if (swz->size == safest_rx_cluster) safe_swz = swz; hwb = &s->hw_buf_info[0]; for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) { if (hwb->zidx != -1 || hwb->size > swz->size) continue; #ifdef INVARIANTS if (fl_pad) MPASS(hwb->size % sp->pad_boundary == 0); #endif hwb->zidx = i; if (head == -1) head = tail = j; else if (hwb->size < s->hw_buf_info[tail].size) { s->hw_buf_info[tail].next = j; tail = j; } else { int8_t *cur; struct hw_buf_info *t; for (cur = &head; *cur != -1; cur = &t->next) { t = &s->hw_buf_info[*cur]; if (hwb->size == t->size) { hwb->zidx = -2; break; } if (hwb->size > t->size) { hwb->next = *cur; *cur = j; break; } } } } swz->head_hwidx = head; swz->tail_hwidx = tail; if (tail != -1) { n++; if (swz->size - s->hw_buf_info[tail].size >= CL_METADATA_SIZE) sc->flags |= BUF_PACKING_OK; } } if (n == 0) { device_printf(sc->dev, "no usable SGE FL buffer size.\n"); rc = EINVAL; } s->safe_hwidx1 = -1; s->safe_hwidx2 = -1; if (safe_swz != NULL) { s->safe_hwidx1 = safe_swz->head_hwidx; for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) { int spare; hwb = &s->hw_buf_info[i]; #ifdef INVARIANTS if (fl_pad) MPASS(hwb->size % sp->pad_boundary == 0); #endif spare = safe_swz->size - hwb->size; if (spare >= CL_METADATA_SIZE) { s->safe_hwidx2 = i; break; } } } if (sc->flags & IS_VF) return (0); v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ); if (r != v) { device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r); rc = EINVAL; } m = v = F_TDDPTAGTCB; r = t4_read_reg(sc, A_ULP_RX_CTL); if ((r & m) != v) { device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r); rc = EINVAL; } m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; r = t4_read_reg(sc, A_TP_PARA_REG5); if ((r & m) != v) { device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r); rc = EINVAL; } t4_init_tp_params(sc); t4_read_mtu_tbl(sc, sc->params.mtus, NULL); t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd); return (rc); } int t4_create_dma_tag(struct adapter *sc) { int rc; rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE, BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmat); if (rc != 0) { device_printf(sc->dev, "failed to create main DMA tag: %d\n", rc); } return (rc); } void t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *children) { struct sge_params *sp = &sc->params.sge; SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes", CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A", "freelist buffer sizes"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD, NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD, NULL, sp->pad_boundary, "payload pad boundary (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD, NULL, sp->spg_len, "status page size (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD, NULL, cong_drop, "congestion drop setting"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD, NULL, sp->pack_boundary, "payload pack boundary (bytes)"); } int t4_destroy_dma_tag(struct adapter *sc) { if (sc->dmat) bus_dma_tag_destroy(sc->dmat); return (0); } /* * Allocate and initialize the firmware event queue and the management queue. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. Caller is responsible for cleanup in case this function fails. */ int t4_setup_adapter_queues(struct adapter *sc) { int rc; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); sysctl_ctx_init(&sc->ctx); sc->flags |= ADAP_SYSCTL_CTX; /* * Firmware event queue */ rc = alloc_fwq(sc); if (rc != 0) return (rc); /* * Management queue. This is just a control queue that uses the fwq as * its associated iq. */ if (!(sc->flags & IS_VF)) rc = alloc_mgmtq(sc); return (rc); } /* * Idempotent */ int t4_teardown_adapter_queues(struct adapter *sc) { ADAPTER_LOCK_ASSERT_NOTOWNED(sc); /* Do this before freeing the queue */ if (sc->flags & ADAP_SYSCTL_CTX) { sysctl_ctx_free(&sc->ctx); sc->flags &= ~ADAP_SYSCTL_CTX; } free_mgmtq(sc); free_fwq(sc); return (0); } static inline int first_vector(struct vi_info *vi) { struct adapter *sc = vi->pi->adapter; if (sc->intr_count == 1) return (0); return (vi->first_intr); } /* * Given an arbitrary "index," come up with an iq that can be used by other * queues (of this VI) for interrupt forwarding, SGE egress updates, etc. * The iq returned is guaranteed to be something that takes direct interrupts. */ static struct sge_iq * vi_intr_iq(struct vi_info *vi, int idx) { struct adapter *sc = vi->pi->adapter; struct sge *s = &sc->sge; struct sge_iq *iq = NULL; int nintr, i; if (sc->intr_count == 1) return (&sc->sge.fwq); nintr = vi->nintr; KASSERT(nintr != 0, ("%s: vi %p has no exclusive interrupts, total interrupts = %d", __func__, vi, sc->intr_count)); i = idx % nintr; if (vi->flags & INTR_RXQ) { if (i < vi->nrxq) { iq = &s->rxq[vi->first_rxq + i].iq; goto done; } i -= vi->nrxq; } #ifdef TCP_OFFLOAD if (vi->flags & INTR_OFLD_RXQ) { if (i < vi->nofldrxq) { iq = &s->ofld_rxq[vi->first_ofld_rxq + i].iq; goto done; } i -= vi->nofldrxq; } #endif panic("%s: vi %p, intr_flags 0x%lx, idx %d, total intr %d\n", __func__, vi, vi->flags & INTR_ALL, idx, nintr); done: MPASS(iq != NULL); KASSERT(iq->flags & IQ_INTR, ("%s: iq %p (vi %p, intr_flags 0x%lx, idx %d)", __func__, iq, vi, vi->flags & INTR_ALL, idx)); return (iq); } /* Maximum payload that can be delivered with a single iq descriptor */ static inline int mtu_to_max_payload(struct adapter *sc, int mtu, const int toe) { int payload; #ifdef TCP_OFFLOAD if (toe) { payload = sc->tt.rx_coalesce ? G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu; } else { #endif /* large enough even when hw VLAN extraction is disabled */ payload = sc->params.sge.fl_pktshift + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + mtu; #ifdef TCP_OFFLOAD } #endif return (payload); } int t4_setup_vi_queues(struct vi_info *vi) { int rc = 0, i, j, intr_idx, iqid; struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *ctrlq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif #ifdef DEV_NETMAP int saved_idx; struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; #endif char name[16]; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifnet *ifp = vi->ifp; struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); int maxp, mtu = ifp->if_mtu; /* Interrupt vector to start from (when using multiple vectors) */ intr_idx = first_vector(vi); #ifdef DEV_NETMAP saved_idx = intr_idx; if (ifp->if_capabilities & IFCAP_NETMAP) { /* netmap is supported with direct interrupts only. */ MPASS(vi->flags & INTR_RXQ); /* * We don't have buffers to back the netmap rx queues * right now so we create the queues in a way that * doesn't set off any congestion signal in the chip. */ oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq", CTLFLAG_RD, NULL, "rx queues"); for_each_nm_rxq(vi, i, nm_rxq) { rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq", CTLFLAG_RD, NULL, "tx queues"); for_each_nm_txq(vi, i, nm_txq) { iqid = vi->first_nm_rxq + (i % vi->nnmrxq); rc = alloc_nm_txq(vi, nm_txq, iqid, i, oid); if (rc != 0) goto done; } } /* Normal rx queues and netmap rx queues share the same interrupts. */ intr_idx = saved_idx; #endif /* * First pass over all NIC and TOE rx queues: * a) initialize iq and fl * b) allocate queue iff it will take direct interrupts. */ maxp = mtu_to_max_payload(sc, mtu, 0); if (vi->flags & INTR_RXQ) { oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); } for_each_rxq(vi, i, rxq) { init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq); snprintf(name, sizeof(name), "%s rxq%d-fl", device_get_nameunit(vi->dev), i); init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name); if (vi->flags & INTR_RXQ) { rxq->iq.flags |= IQ_INTR; rc = alloc_rxq(vi, rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } } #ifdef DEV_NETMAP if (ifp->if_capabilities & IFCAP_NETMAP) intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq); #endif #ifdef TCP_OFFLOAD maxp = mtu_to_max_payload(sc, mtu, 1); if (vi->flags & INTR_OFLD_RXQ) { oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD, NULL, "rx queues for offloaded TCP connections"); } for_each_ofld_rxq(vi, i, ofld_rxq) { init_iq(&ofld_rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq); snprintf(name, sizeof(name), "%s ofld_rxq%d-fl", device_get_nameunit(vi->dev), i); init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name); if (vi->flags & INTR_OFLD_RXQ) { ofld_rxq->iq.flags |= IQ_INTR; rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } } #endif /* * Second pass over all NIC and TOE rx queues. The queues forwarding * their interrupts are allocated now. */ j = 0; if (!(vi->flags & INTR_RXQ)) { oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); for_each_rxq(vi, i, rxq) { MPASS(!(rxq->iq.flags & IQ_INTR)); intr_idx = vi_intr_iq(vi, j)->abs_id; rc = alloc_rxq(vi, rxq, intr_idx, i, oid); if (rc != 0) goto done; j++; } } #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0 && !(vi->flags & INTR_OFLD_RXQ)) { oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD, NULL, "rx queues for offloaded TCP connections"); for_each_ofld_rxq(vi, i, ofld_rxq) { MPASS(!(ofld_rxq->iq.flags & IQ_INTR)); intr_idx = vi_intr_iq(vi, j)->abs_id; rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid); if (rc != 0) goto done; j++; } } #endif /* * Now the tx queues. Only one pass needed. */ oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD, NULL, "tx queues"); j = 0; for_each_txq(vi, i, txq) { iqid = vi_intr_iq(vi, j)->cntxt_id; snprintf(name, sizeof(name), "%s txq%d", device_get_nameunit(vi->dev), i); init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan, iqid, name); rc = alloc_txq(vi, txq, i, oid); if (rc != 0) goto done; j++; } #ifdef TCP_OFFLOAD oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq", CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections"); for_each_ofld_txq(vi, i, ofld_txq) { struct sysctl_oid *oid2; iqid = vi_intr_iq(vi, j)->cntxt_id; snprintf(name, sizeof(name), "%s ofld_txq%d", device_get_nameunit(vi->dev), i); init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan, iqid, name); snprintf(name, sizeof(name), "%d", i); oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, name, CTLFLAG_RD, NULL, "offload tx queue"); rc = alloc_wrq(sc, vi, ofld_txq, oid2); if (rc != 0) goto done; j++; } #endif /* * Finally, the control queue. */ if (!IS_MAIN_VI(vi) || sc->flags & IS_VF) goto done; oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD, NULL, "ctrl queue"); ctrlq = &sc->sge.ctrlq[pi->port_id]; iqid = vi_intr_iq(vi, 0)->cntxt_id; snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(vi->dev)); init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan, iqid, name); rc = alloc_wrq(sc, vi, ctrlq, oid); done: if (rc) t4_teardown_vi_queues(vi); return (rc); } /* * Idempotent */ int t4_teardown_vi_queues(struct vi_info *vi) { int i; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; #endif /* Do this before freeing the queues */ if (vi->flags & VI_SYSCTL_CTX) { sysctl_ctx_free(&vi->ctx); vi->flags &= ~VI_SYSCTL_CTX; } #ifdef DEV_NETMAP if (vi->ifp->if_capabilities & IFCAP_NETMAP) { for_each_nm_txq(vi, i, nm_txq) { free_nm_txq(vi, nm_txq); } for_each_nm_rxq(vi, i, nm_rxq) { free_nm_rxq(vi, nm_rxq); } } #endif /* * Take down all the tx queues first, as they reference the rx queues * (for egress updates, etc.). */ if (IS_MAIN_VI(vi) && !(sc->flags & IS_VF)) free_wrq(sc, &sc->sge.ctrlq[pi->port_id]); for_each_txq(vi, i, txq) { free_txq(vi, txq); } #ifdef TCP_OFFLOAD for_each_ofld_txq(vi, i, ofld_txq) { free_wrq(sc, ofld_txq); } #endif /* * Then take down the rx queues that forward their interrupts, as they * reference other rx queues. */ for_each_rxq(vi, i, rxq) { if ((rxq->iq.flags & IQ_INTR) == 0) free_rxq(vi, rxq); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { if ((ofld_rxq->iq.flags & IQ_INTR) == 0) free_ofld_rxq(vi, ofld_rxq); } #endif /* * Then take down the rx queues that take direct interrupts. */ for_each_rxq(vi, i, rxq) { if (rxq->iq.flags & IQ_INTR) free_rxq(vi, rxq); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { if (ofld_rxq->iq.flags & IQ_INTR) free_ofld_rxq(vi, ofld_rxq); } #endif return (0); } /* * Deals with errors and the firmware event queue. All data rx queues forward * their interrupt to the firmware event queue. */ void t4_intr_all(void *arg) { struct adapter *sc = arg; struct sge_iq *fwq = &sc->sge.fwq; t4_intr_err(arg); if (atomic_cmpset_int(&fwq->state, IQS_IDLE, IQS_BUSY)) { service_iq(fwq, 0); atomic_cmpset_int(&fwq->state, IQS_BUSY, IQS_IDLE); } } /* Deals with error interrupts */ void t4_intr_err(void *arg) { struct adapter *sc = arg; t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0); t4_slow_intr_handler(sc); } void t4_intr_evt(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq(iq, 0); atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } void t4_intr(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq(iq, 0); atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } void t4_vi_intr(void *arg) { struct irq *irq = arg; #ifdef DEV_NETMAP if (atomic_cmpset_int(&irq->nm_state, NM_ON, NM_BUSY)) { t4_nm_intr(irq->nm_rxq); atomic_cmpset_int(&irq->nm_state, NM_BUSY, NM_ON); } #endif if (irq->rxq != NULL) t4_intr(irq->rxq); } /* * Deals with anything and everything on the given ingress queue. */ static int service_iq(struct sge_iq *iq, int budget) { struct sge_iq *q; struct sge_rxq *rxq = iq_to_rxq(iq); /* Use iff iq is part of rxq */ struct sge_fl *fl; /* Use iff IQ_HAS_FL */ struct adapter *sc = iq->adapter; struct iq_desc *d = &iq->desc[iq->cidx]; int ndescs = 0, limit; int rsp_type, refill; uint32_t lq; uint16_t fl_hw_cidx; struct mbuf *m0; STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql); #if defined(INET) || defined(INET6) const struct timeval lro_timeout = {0, sc->lro_timeout}; #endif KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq)); limit = budget ? budget : iq->qsize / 16; if (iq->flags & IQ_HAS_FL) { fl = &rxq->fl; fl_hw_cidx = fl->hw_cidx; /* stable snapshot */ } else { fl = NULL; fl_hw_cidx = 0; /* to silence gcc warning */ } /* * We always come back and check the descriptor ring for new indirect * interrupts and other responses after running a single handler. */ for (;;) { while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) { rmb(); refill = 0; m0 = NULL; rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen); lq = be32toh(d->rsp.pldbuflen_qid); switch (rsp_type) { case X_RSPD_TYPE_FLBUF: KASSERT(iq->flags & IQ_HAS_FL, ("%s: data for an iq (%p) with no freelist", __func__, iq)); m0 = get_fl_payload(sc, fl, lq); if (__predict_false(m0 == NULL)) goto process_iql; refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2; #ifdef T4_PKT_TIMESTAMP /* * 60 bit timestamp for the payload is * *(uint64_t *)m0->m_pktdat. Note that it is * in the leading free-space in the mbuf. The * kernel can clobber it during a pullup, * m_copymdata, etc. You need to make sure that * the mbuf reaches you unmolested if you care * about the timestamp. */ *(uint64_t *)m0->m_pktdat = be64toh(ctrl->u.last_flit) & 0xfffffffffffffff; #endif /* fall through */ case X_RSPD_TYPE_CPL: KASSERT(d->rss.opcode < NUM_CPL_CMDS, ("%s: bad opcode %02x.", __func__, d->rss.opcode)); t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0); break; case X_RSPD_TYPE_INTR: /* * Interrupts should be forwarded only to queues * that are not forwarding their interrupts. * This means service_iq can recurse but only 1 * level deep. */ KASSERT(budget == 0, ("%s: budget %u, rsp_type %u", __func__, budget, rsp_type)); /* * There are 1K interrupt-capable queues (qids 0 * through 1023). A response type indicating a * forwarded interrupt with a qid >= 1K is an * iWARP async notification. */ if (lq >= 1024) { t4_an_handler(iq, &d->rsp); break; } q = sc->sge.iqmap[lq - sc->sge.iq_start - sc->sge.iq_base]; if (atomic_cmpset_int(&q->state, IQS_IDLE, IQS_BUSY)) { if (service_iq(q, q->qsize / 16) == 0) { atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); } else { STAILQ_INSERT_TAIL(&iql, q, link); } } break; default: KASSERT(0, ("%s: illegal response type %d on iq %p", __func__, rsp_type, iq)); log(LOG_ERR, "%s: illegal response type %d on iq %p", device_get_nameunit(sc->dev), rsp_type, iq); break; } d++; if (__predict_false(++iq->cidx == iq->sidx)) { iq->cidx = 0; iq->gen ^= F_RSPD_GEN; d = &iq->desc[0]; } if (__predict_false(++ndescs == limit)) { t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | V_INGRESSQID(iq->cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); ndescs = 0; #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED && sc->lro_timeout != 0) { tcp_lro_flush_inactive(&rxq->lro, &lro_timeout); } #endif if (budget) { if (iq->flags & IQ_HAS_FL) { FL_LOCK(fl); refill_fl(sc, fl, 32); FL_UNLOCK(fl); } return (EINPROGRESS); } } if (refill) { FL_LOCK(fl); refill_fl(sc, fl, 32); FL_UNLOCK(fl); fl_hw_cidx = fl->hw_cidx; } } process_iql: if (STAILQ_EMPTY(&iql)) break; /* * Process the head only, and send it to the back of the list if * it's still not done. */ q = STAILQ_FIRST(&iql); STAILQ_REMOVE_HEAD(&iql, link); if (service_iq(q, q->qsize / 8) == 0) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); else STAILQ_INSERT_TAIL(&iql, q, link); } #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED) { struct lro_ctrl *lro = &rxq->lro; tcp_lro_flush_all(lro); } #endif t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); if (iq->flags & IQ_HAS_FL) { int starved; FL_LOCK(fl); starved = refill_fl(sc, fl, 64); FL_UNLOCK(fl); if (__predict_false(starved != 0)) add_fl_to_sfl(sc, fl); } return (0); } static inline int cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll) { int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0; if (rc) MPASS(cll->region3 >= CL_METADATA_SIZE); return (rc); } static inline struct cluster_metadata * cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll, caddr_t cl) { if (cl_has_metadata(fl, cll)) { struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; return ((struct cluster_metadata *)(cl + swz->size) - 1); } return (NULL); } static void rxb_free(struct mbuf *m, void *arg1, void *arg2) { uma_zone_t zone = arg1; caddr_t cl = arg2; uma_zfree(zone, cl); counter_u64_add(extfree_rels, 1); } /* * The mbuf returned by this function could be allocated from zone_mbuf or * constructed in spare room in the cluster. * * The mbuf carries the payload in one of these ways * a) frame inside the mbuf (mbuf from zone_mbuf) * b) m_cljset (for clusters without metadata) zone_mbuf * c) m_extaddref (cluster with metadata) inline mbuf * d) m_extaddref (cluster with metadata) zone_mbuf */ static struct mbuf * get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset, int remaining) { struct mbuf *m; struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; struct cluster_layout *cll = &sd->cll; struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx]; struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl); int len, blen; caddr_t payload; blen = hwb->size - fl->rx_offset; /* max possible in this buf */ len = min(remaining, blen); payload = sd->cl + cll->region1 + fl->rx_offset; if (fl->flags & FL_BUF_PACKING) { const u_int l = fr_offset + len; const u_int pad = roundup2(l, fl->buf_boundary) - l; if (fl->rx_offset + len + pad < hwb->size) blen = len + pad; MPASS(fl->rx_offset + blen <= hwb->size); } else { MPASS(fl->rx_offset == 0); /* not packing */ } if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) { /* * Copy payload into a freshly allocated mbuf. */ m = fr_offset == 0 ? m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); fl->mbuf_allocated++; #ifdef T4_PKT_TIMESTAMP /* Leave room for a timestamp */ m->m_data += 8; #endif /* copy data to mbuf */ bcopy(payload, mtod(m, caddr_t), len); } else if (sd->nmbuf * MSIZE < cll->region1) { /* * There's spare room in the cluster for an mbuf. Create one * and associate it with the payload that's in the cluster. */ MPASS(clm != NULL); m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE); /* No bzero required */ if (m_init(m, M_NOWAIT, MT_DATA, fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE)) return (NULL); fl->mbuf_inlined++; m_extaddref(m, payload, blen, &clm->refcount, rxb_free, swz->zone, sd->cl); if (sd->nmbuf++ == 0) counter_u64_add(extfree_refs, 1); } else { /* * Grab an mbuf from zone_mbuf and associate it with the * payload in the cluster. */ m = fr_offset == 0 ? m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); fl->mbuf_allocated++; if (clm != NULL) { m_extaddref(m, payload, blen, &clm->refcount, rxb_free, swz->zone, sd->cl); if (sd->nmbuf++ == 0) counter_u64_add(extfree_refs, 1); } else { m_cljset(m, sd->cl, swz->type); sd->cl = NULL; /* consumed, not a recycle candidate */ } } if (fr_offset == 0) m->m_pkthdr.len = remaining; m->m_len = len; if (fl->flags & FL_BUF_PACKING) { fl->rx_offset += blen; MPASS(fl->rx_offset <= hwb->size); if (fl->rx_offset < hwb->size) return (m); /* without advancing the cidx */ } if (__predict_false(++fl->cidx % 8 == 0)) { uint16_t cidx = fl->cidx / 8; if (__predict_false(cidx == fl->sidx)) fl->cidx = cidx = 0; fl->hw_cidx = cidx; } fl->rx_offset = 0; return (m); } static struct mbuf * get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf) { struct mbuf *m0, *m, **pnext; u_int remaining; const u_int total = G_RSPD_LEN(len_newbuf); if (__predict_false(fl->flags & FL_BUF_RESUME)) { M_ASSERTPKTHDR(fl->m0); MPASS(fl->m0->m_pkthdr.len == total); MPASS(fl->remaining < total); m0 = fl->m0; pnext = fl->pnext; remaining = fl->remaining; fl->flags &= ~FL_BUF_RESUME; goto get_segment; } if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) { fl->rx_offset = 0; if (__predict_false(++fl->cidx % 8 == 0)) { uint16_t cidx = fl->cidx / 8; if (__predict_false(cidx == fl->sidx)) fl->cidx = cidx = 0; fl->hw_cidx = cidx; } } /* * Payload starts at rx_offset in the current hw buffer. Its length is * 'len' and it may span multiple hw buffers. */ m0 = get_scatter_segment(sc, fl, 0, total); if (m0 == NULL) return (NULL); remaining = total - m0->m_len; pnext = &m0->m_next; while (remaining > 0) { get_segment: MPASS(fl->rx_offset == 0); m = get_scatter_segment(sc, fl, total - remaining, remaining); if (__predict_false(m == NULL)) { fl->m0 = m0; fl->pnext = pnext; fl->remaining = remaining; fl->flags |= FL_BUF_RESUME; return (NULL); } *pnext = m; pnext = &m->m_next; remaining -= m->m_len; } *pnext = NULL; M_ASSERTPKTHDR(m0); return (m0); } static int t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0) { struct sge_rxq *rxq = iq_to_rxq(iq); struct ifnet *ifp = rxq->ifp; struct adapter *sc = iq->adapter; const struct cpl_rx_pkt *cpl = (const void *)(rss + 1); #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxq->lro; #endif static const int sw_hashtype[4][2] = { {M_HASHTYPE_NONE, M_HASHTYPE_NONE}, {M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6}, {M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6}, {M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6}, }; KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__, rss->opcode)); m0->m_pkthdr.len -= sc->params.sge.fl_pktshift; m0->m_len -= sc->params.sge.fl_pktshift; m0->m_data += sc->params.sge.fl_pktshift; m0->m_pkthdr.rcvif = ifp; M_HASHTYPE_SET(m0, sw_hashtype[rss->hash_type][rss->ipv6]); m0->m_pkthdr.flowid = be32toh(rss->hash_val); if (cpl->csum_calc && !cpl->err_vec) { if (ifp->if_capenable & IFCAP_RXCSUM && cpl->l2info & htobe32(F_RXF_IP)) { m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); rxq->rxcsum++; } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 && cpl->l2info & htobe32(F_RXF_IP6)) { m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR); rxq->rxcsum++; } if (__predict_false(cpl->ip_frag)) m0->m_pkthdr.csum_data = be16toh(cpl->csum); else m0->m_pkthdr.csum_data = 0xffff; } if (cpl->vlan_ex) { m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan); m0->m_flags |= M_VLANTAG; rxq->vlan_extraction++; } #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED && tcp_lro_rx(lro, m0, 0) == 0) { /* queued for LRO */ } else #endif ifp->if_input(ifp, m0); return (0); } /* * Must drain the wrq or make sure that someone else will. */ static void wrq_tx_drain(void *arg, int n) { struct sge_wrq *wrq = arg; struct sge_eq *eq = &wrq->eq; EQ_LOCK(eq); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(wrq->adapter, wrq); EQ_UNLOCK(eq); } static void drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq) { struct sge_eq *eq = &wrq->eq; u_int available, dbdiff; /* # of hardware descriptors */ u_int n; struct wrqe *wr; struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */ EQ_LOCK_ASSERT_OWNED(eq); MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs)); wr = STAILQ_FIRST(&wrq->wr_list); MPASS(wr != NULL); /* Must be called with something useful to do */ MPASS(eq->pidx == eq->dbidx); dbdiff = 0; do { eq->cidx = read_hw_cidx(eq); if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; MPASS(wr->wrq == wrq); n = howmany(wr->wr_len, EQ_ESIZE); if (available < n) break; dst = (void *)&eq->desc[eq->pidx]; if (__predict_true(eq->sidx - eq->pidx > n)) { /* Won't wrap, won't end exactly at the status page. */ bcopy(&wr->wr[0], dst, wr->wr_len); eq->pidx += n; } else { int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE; bcopy(&wr->wr[0], dst, first_portion); if (wr->wr_len > first_portion) { bcopy(&wr->wr[first_portion], &eq->desc[0], wr->wr_len - first_portion); } eq->pidx = n - (eq->sidx - eq->pidx); } wrq->tx_wrs_copied++; if (available < eq->sidx / 4 && atomic_cmpset_int(&eq->equiq, 0, 1)) { dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } dbdiff += n; if (dbdiff >= 16) { ring_eq_db(sc, eq, dbdiff); dbdiff = 0; } STAILQ_REMOVE_HEAD(&wrq->wr_list, link); free_wrqe(wr); MPASS(wrq->nwr_pending > 0); wrq->nwr_pending--; MPASS(wrq->ndesc_needed >= n); wrq->ndesc_needed -= n; } while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL); if (dbdiff) ring_eq_db(sc, eq, dbdiff); } /* * Doesn't fail. Holds on to work requests it can't send right away. */ void t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr) { #ifdef INVARIANTS struct sge_eq *eq = &wrq->eq; #endif EQ_LOCK_ASSERT_OWNED(eq); MPASS(wr != NULL); MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN); MPASS((wr->wr_len & 0x7) == 0); STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); wrq->nwr_pending++; wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE); if (!TAILQ_EMPTY(&wrq->incomplete_wrs)) return; /* commit_wrq_wr will drain wr_list as well. */ drain_wrq_wr_list(sc, wrq); /* Doorbell must have caught up to the pidx. */ MPASS(eq->pidx == eq->dbidx); } void t4_update_fl_bufsize(struct ifnet *ifp) { struct vi_info *vi = ifp->if_softc; struct adapter *sc = vi->pi->adapter; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif struct sge_fl *fl; int i, maxp, mtu = ifp->if_mtu; maxp = mtu_to_max_payload(sc, mtu, 0); for_each_rxq(vi, i, rxq) { fl = &rxq->fl; FL_LOCK(fl); find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #ifdef TCP_OFFLOAD maxp = mtu_to_max_payload(sc, mtu, 1); for_each_ofld_rxq(vi, i, ofld_rxq) { fl = &ofld_rxq->fl; FL_LOCK(fl); find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #endif } static inline int mbuf_nsegs(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.l5hlen > 0, ("%s: mbuf %p missing information on # of segments.", __func__, m)); return (m->m_pkthdr.l5hlen); } static inline void set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs) { M_ASSERTPKTHDR(m); m->m_pkthdr.l5hlen = nsegs; } static inline int mbuf_len16(struct mbuf *m) { int n; M_ASSERTPKTHDR(m); n = m->m_pkthdr.PH_loc.eight[0]; MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16); return (n); } static inline void set_mbuf_len16(struct mbuf *m, uint8_t len16) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[0] = len16; } static inline int needs_tso(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_pkthdr.csum_flags & CSUM_TSO) { KASSERT(m->m_pkthdr.tso_segsz > 0, ("%s: TSO requested in mbuf %p but MSS not provided", __func__, m)); return (1); } return (0); } static inline int needs_l3_csum(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) return (1); return (0); } static inline int needs_l4_csum(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) return (1); return (0); } static inline int needs_vlan_insertion(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_flags & M_VLANTAG) { KASSERT(m->m_pkthdr.ether_vtag != 0, ("%s: HWVLAN requested in mbuf %p but tag not provided", __func__, m)); return (1); } return (0); } static void * m_advance(struct mbuf **pm, int *poffset, int len) { struct mbuf *m = *pm; int offset = *poffset; uintptr_t p = 0; MPASS(len > 0); for (;;) { if (offset + len < m->m_len) { offset += len; p = mtod(m, uintptr_t) + offset; break; } len -= m->m_len - offset; m = m->m_next; offset = 0; MPASS(m != NULL); } *poffset = offset; *pm = m; return ((void *)p); } -static inline int -same_paddr(char *a, char *b) -{ - - if (a == b) - return (1); - else if (a != NULL && b != NULL) { - vm_offset_t x = (vm_offset_t)a; - vm_offset_t y = (vm_offset_t)b; - - if ((x & PAGE_MASK) == (y & PAGE_MASK) && - pmap_kextract(x) == pmap_kextract(y)) - return (1); - } - - return (0); -} - /* * Can deal with empty mbufs in the chain that have m_len = 0, but the chain * must have at least one mbuf that's not empty. */ static inline int count_mbuf_nsegs(struct mbuf *m) { - char *prev_end, *start; + vm_paddr_t lastb, next; + vm_offset_t va; int len, nsegs; MPASS(m != NULL); nsegs = 0; - prev_end = NULL; + lastb = 0; for (; m; m = m->m_next) { len = m->m_len; if (__predict_false(len == 0)) continue; - start = mtod(m, char *); - - nsegs += sglist_count(start, len); - if (same_paddr(prev_end, start)) + va = mtod(m, vm_offset_t); + next = pmap_kextract(va); + nsegs += sglist_count(m->m_data, len); + if (lastb + 1 == next) nsegs--; - prev_end = start + len; + lastb = pmap_kextract(va + len - 1); } MPASS(nsegs > 0); return (nsegs); } /* * Analyze the mbuf to determine its tx needs. The mbuf passed in may change: * a) caller can assume it's been freed if this function returns with an error. * b) it may get defragged up if the gather list is too long for the hardware. */ int parse_pkt(struct adapter *sc, struct mbuf **mp) { struct mbuf *m0 = *mp, *m; int rc, nsegs, defragged = 0, offset; struct ether_header *eh; void *l3hdr; #if defined(INET) || defined(INET6) struct tcphdr *tcp; #endif uint16_t eh_type; M_ASSERTPKTHDR(m0); if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) { rc = EINVAL; fail: m_freem(m0); *mp = NULL; return (rc); } restart: /* * First count the number of gather list segments in the payload. * Defrag the mbuf if nsegs exceeds the hardware limit. */ M_ASSERTPKTHDR(m0); MPASS(m0->m_pkthdr.len > 0); nsegs = count_mbuf_nsegs(m0); if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) { if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) { rc = EFBIG; goto fail; } *mp = m0 = m; /* update caller's copy after defrag */ goto restart; } if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) { m0 = m_pullup(m0, m0->m_pkthdr.len); if (m0 == NULL) { /* Should have left well enough alone. */ rc = EFBIG; goto fail; } *mp = m0; /* update caller's copy after pullup */ goto restart; } set_mbuf_nsegs(m0, nsegs); if (sc->flags & IS_VF) set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0))); else set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0))); if (!needs_tso(m0) && !(sc->flags & IS_VF && (needs_l3_csum(m0) || needs_l4_csum(m0)))) return (0); m = m0; eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); if (eh_type == ETHERTYPE_VLAN) { struct ether_vlan_header *evh = (void *)eh; eh_type = ntohs(evh->evl_proto); m0->m_pkthdr.l2hlen = sizeof(*evh); } else m0->m_pkthdr.l2hlen = sizeof(*eh); offset = 0; l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen); switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6 = l3hdr; MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP); m0->m_pkthdr.l3hlen = sizeof(*ip6); break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip = l3hdr; m0->m_pkthdr.l3hlen = ip->ip_hl * 4; break; } #endif default: panic("%s: ethertype 0x%04x unknown. if_cxgbe must be compiled" " with the same INET/INET6 options as the kernel.", __func__, eh_type); } #if defined(INET) || defined(INET6) if (needs_tso(m0)) { tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen); m0->m_pkthdr.l4hlen = tcp->th_off * 4; } #endif MPASS(m0 == *mp); return (0); } void * start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie) { struct sge_eq *eq = &wrq->eq; struct adapter *sc = wrq->adapter; int ndesc, available; struct wrqe *wr; void *w; MPASS(len16 > 0); ndesc = howmany(len16, EQ_ESIZE / 16); MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC); EQ_LOCK(eq); if (!STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); if (!STAILQ_EMPTY(&wrq->wr_list)) { slowpath: EQ_UNLOCK(eq); wr = alloc_wrqe(len16 * 16, wrq); if (__predict_false(wr == NULL)) return (NULL); cookie->pidx = -1; cookie->ndesc = ndesc; return (&wr->wr); } eq->cidx = read_hw_cidx(eq); if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; if (available < ndesc) goto slowpath; cookie->pidx = eq->pidx; cookie->ndesc = ndesc; TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link); w = &eq->desc[eq->pidx]; IDXINCR(eq->pidx, ndesc, eq->sidx); if (__predict_false(eq->pidx < ndesc - 1)) { w = &wrq->ss[0]; wrq->ss_pidx = cookie->pidx; wrq->ss_len = len16 * 16; } EQ_UNLOCK(eq); return (w); } void commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie) { struct sge_eq *eq = &wrq->eq; struct adapter *sc = wrq->adapter; int ndesc, pidx; struct wrq_cookie *prev, *next; if (cookie->pidx == -1) { struct wrqe *wr = __containerof(w, struct wrqe, wr); t4_wrq_tx(sc, wr); return; } ndesc = cookie->ndesc; /* Can be more than SGE_MAX_WR_NDESC here. */ pidx = cookie->pidx; MPASS(pidx >= 0 && pidx < eq->sidx); if (__predict_false(w == &wrq->ss[0])) { int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE; MPASS(wrq->ss_len > n); /* WR had better wrap around. */ bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n); bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n); wrq->tx_wrs_ss++; } else wrq->tx_wrs_direct++; EQ_LOCK(eq); prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link); next = TAILQ_NEXT(cookie, link); if (prev == NULL) { MPASS(pidx == eq->dbidx); if (next == NULL || ndesc >= 16) ring_eq_db(wrq->adapter, eq, ndesc); else { MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc); next->pidx = pidx; next->ndesc += ndesc; } } else { MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc); prev->ndesc += ndesc; } TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); #ifdef INVARIANTS if (TAILQ_EMPTY(&wrq->incomplete_wrs)) { /* Doorbell must have caught up to the pidx. */ MPASS(wrq->eq.pidx == wrq->eq.dbidx); } #endif EQ_UNLOCK(eq); } static u_int can_resume_eth_tx(struct mp_ring *r) { struct sge_eq *eq = r->cookie; return (total_available_tx_desc(eq) > eq->sidx / 8); } static inline int cannot_use_txpkts(struct mbuf *m) { /* maybe put a GL limit too, to avoid silliness? */ return (needs_tso(m)); } /* * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to * be consumed. Return the actual number consumed. 0 indicates a stall. */ static u_int eth_tx(struct mp_ring *r, u_int cidx, u_int pidx) { struct sge_txq *txq = r->cookie; struct sge_eq *eq = &txq->eq; struct ifnet *ifp = txq->ifp; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; u_int total, remaining; /* # of packets */ u_int available, dbdiff; /* # of hardware descriptors */ u_int n, next_cidx; struct mbuf *m0, *tail; struct txpkts txp; struct fw_eth_tx_pkts_wr *wr; /* any fw WR struct will do */ remaining = IDXDIFF(pidx, cidx, r->size); MPASS(remaining > 0); /* Must not be called without work to do. */ total = 0; TXQ_LOCK(txq); if (__predict_false((eq->flags & EQ_ENABLED) == 0)) { while (cidx != pidx) { m0 = r->items[cidx]; m_freem(m0); if (++cidx == r->size) cidx = 0; } reclaim_tx_descs(txq, 2048); total = remaining; goto done; } /* How many hardware descriptors do we have readily available. */ if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx); while (remaining > 0) { m0 = r->items[cidx]; M_ASSERTPKTHDR(m0); MPASS(m0->m_nextpkt == NULL); if (available < SGE_MAX_WR_NDESC) { available += reclaim_tx_descs(txq, 64); if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16)) break; /* out of descriptors */ } next_cidx = cidx + 1; if (__predict_false(next_cidx == r->size)) next_cidx = 0; wr = (void *)&eq->desc[eq->pidx]; if (sc->flags & IS_VF) { total++; remaining--; ETHER_BPF_MTAP(ifp, m0); n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0, available); } else if (remaining > 1 && try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) { /* pkts at cidx, next_cidx should both be in txp. */ MPASS(txp.npkt == 2); tail = r->items[next_cidx]; MPASS(tail->m_nextpkt == NULL); ETHER_BPF_MTAP(ifp, m0); ETHER_BPF_MTAP(ifp, tail); m0->m_nextpkt = tail; if (__predict_false(++next_cidx == r->size)) next_cidx = 0; while (next_cidx != pidx) { if (add_to_txpkts(r->items[next_cidx], &txp, available) != 0) break; tail->m_nextpkt = r->items[next_cidx]; tail = tail->m_nextpkt; ETHER_BPF_MTAP(ifp, tail); if (__predict_false(++next_cidx == r->size)) next_cidx = 0; } n = write_txpkts_wr(txq, wr, m0, &txp, available); total += txp.npkt; remaining -= txp.npkt; } else { total++; remaining--; ETHER_BPF_MTAP(ifp, m0); n = write_txpkt_wr(txq, (void *)wr, m0, available); } MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC); available -= n; dbdiff += n; IDXINCR(eq->pidx, n, eq->sidx); if (total_available_tx_desc(eq) < eq->sidx / 4 && atomic_cmpset_int(&eq->equiq, 0, 1)) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } if (dbdiff >= 16 && remaining >= 4) { ring_eq_db(sc, eq, dbdiff); available += reclaim_tx_descs(txq, 4 * dbdiff); dbdiff = 0; } cidx = next_cidx; } if (dbdiff != 0) { ring_eq_db(sc, eq, dbdiff); reclaim_tx_descs(txq, 32); } done: TXQ_UNLOCK(txq); return (total); } static inline void init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx, int qsize) { KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS, ("%s: bad tmr_idx %d", __func__, tmr_idx)); KASSERT(pktc_idx < SGE_NCOUNTERS, /* -ve is ok, means don't use */ ("%s: bad pktc_idx %d", __func__, pktc_idx)); iq->flags = 0; iq->adapter = sc; iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx); iq->intr_pktc_idx = SGE_NCOUNTERS - 1; if (pktc_idx >= 0) { iq->intr_params |= F_QINTR_CNT_EN; iq->intr_pktc_idx = pktc_idx; } iq->qsize = roundup2(qsize, 16); /* See FW_IQ_CMD/iqsize */ iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE; } static inline void init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name) { fl->qsize = qsize; fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE; strlcpy(fl->lockname, name, sizeof(fl->lockname)); if (sc->flags & BUF_PACKING_OK && ((!is_t4(sc) && buffer_packing) || /* T5+: enabled unless 0 */ (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */ fl->flags |= FL_BUF_PACKING; find_best_refill_source(sc, fl, maxp); find_safe_refill_source(sc, fl); } static inline void init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize, uint8_t tx_chan, uint16_t iqid, char *name) { KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype)); eq->flags = eqtype & EQ_TYPEMASK; eq->tx_chan = tx_chan; eq->iqid = iqid; eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE; strlcpy(eq->lockname, name, sizeof(eq->lockname)); } static int alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag, bus_dmamap_t *map, bus_addr_t *pa, void **va) { int rc; rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag); if (rc != 0) { device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc); goto done; } rc = bus_dmamem_alloc(*tag, va, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map); if (rc != 0) { device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc); goto done; } rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0); if (rc != 0) { device_printf(sc->dev, "cannot load DMA map: %d\n", rc); goto done; } done: if (rc) free_ring(sc, *tag, *map, *pa, *va); return (rc); } static int free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map, bus_addr_t pa, void *va) { if (pa) bus_dmamap_unload(tag, map); if (va) bus_dmamem_free(tag, va, map); if (tag) bus_dma_tag_destroy(tag); return (0); } /* * Allocates the ring for an ingress queue and an optional freelist. If the * freelist is specified it will be allocated and then associated with the * ingress queue. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. Caller is responsible for cleanup in case this function fails. * * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then * the intr_idx specifies the vector, starting from 0. Otherwise it specifies * the abs_id of the ingress queue to which its interrupts should be forwarded. */ static int alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl, int intr_idx, int cong) { int rc, i, cntxt_id; size_t len; struct fw_iq_cmd c; struct port_info *pi = vi->pi; struct adapter *sc = iq->adapter; struct sge_params *sp = &sc->params.sge; __be32 v = 0; len = iq->qsize * IQ_ESIZE; rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba, (void **)&iq->desc); if (rc != 0) return (rc); bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) | V_FW_IQ_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART | FW_LEN16(c)); /* Special handling for firmware event queue */ if (iq == &sc->sge.fwq) v |= F_FW_IQ_CMD_IQASYNCH; if (iq->flags & IQ_INTR) { KASSERT(intr_idx < sc->intr_count, ("%s: invalid direct intr_idx %d", __func__, intr_idx)); } else v |= F_FW_IQ_CMD_IQANDST; v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx); c.type_to_iqandstindex = htobe32(v | V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) | V_FW_IQ_CMD_VIID(vi->viid) | V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT)); c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) | F_FW_IQ_CMD_IQGTSMODE | V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) | V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4)); c.iqsize = htobe16(iq->qsize); c.iqaddr = htobe64(iq->ba); if (cong >= 0) c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN); if (fl) { mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF); len = fl->qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map, &fl->ba, (void **)&fl->desc); if (rc) return (rc); /* Allocate space for one software descriptor per buffer. */ rc = alloc_fl_sdesc(fl); if (rc != 0) { device_printf(sc->dev, "failed to setup fl software descriptors: %d\n", rc); return (rc); } if (fl->flags & FL_BUF_PACKING) { fl->lowat = roundup2(sp->fl_starve_threshold2, 8); fl->buf_boundary = sp->pack_boundary; } else { fl->lowat = roundup2(sp->fl_starve_threshold, 8); fl->buf_boundary = 16; } if (fl_pad && fl->buf_boundary < sp->pad_boundary) fl->buf_boundary = sp->pad_boundary; c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO | (fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) | (fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN : 0)); if (cong >= 0) { c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) | F_FW_IQ_CMD_FL0CONGCIF | F_FW_IQ_CMD_FL0CONGEN); } c.fl0dcaen_to_fl0cidxfthresh = htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ? X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B) | V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ? X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B)); c.fl0size = htobe16(fl->qsize); c.fl0addr = htobe64(fl->ba); } rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(sc->dev, "failed to create ingress queue: %d\n", rc); return (rc); } iq->cidx = 0; iq->gen = F_RSPD_GEN; iq->intr_next = iq->intr_params; iq->cntxt_id = be16toh(c.iqid); iq->abs_id = be16toh(c.physiqid); iq->flags |= IQ_ALLOCATED; cntxt_id = iq->cntxt_id - sc->sge.iq_start; if (cntxt_id >= sc->sge.niq) { panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.niq - 1); } sc->sge.iqmap[cntxt_id] = iq; if (fl) { u_int qid; iq->flags |= IQ_HAS_FL; fl->cntxt_id = be16toh(c.fl0id); fl->pidx = fl->cidx = 0; cntxt_id = fl->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) { panic("%s: fl->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); } sc->sge.eqmap[cntxt_id] = (void *)fl; qid = fl->cntxt_id; if (isset(&sc->doorbells, DOORBELL_UDB)) { uint32_t s_qpp = sc->params.sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (qid >> s_qpp) << PAGE_SHIFT; qid &= mask; if (qid < PAGE_SIZE / UDBS_SEG_SIZE) { udb += qid << UDBS_SEG_SHIFT; qid = 0; } fl->udb = (volatile void *)udb; } fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db; FL_LOCK(fl); /* Enough to make sure the SGE doesn't think it's starved */ refill_fl(sc, fl, fl->lowat); FL_UNLOCK(fl); } if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && cong >= 0) { uint32_t param, val; param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) | V_FW_PARAMS_PARAM_YZ(iq->cntxt_id); if (cong == 0) val = 1 << 19; else { val = 2 << 19; for (i = 0; i < 4; i++) { if (cong & (1 << i)) val |= 1 << (i << 2); } } rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { /* report error but carry on */ device_printf(sc->dev, "failed to set congestion manager context for " "ingress queue %d: %d\n", iq->cntxt_id, rc); } } /* Enable IQ interrupts */ atomic_store_rel_int(&iq->state, IQS_IDLE); t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) | V_INGRESSQID(iq->cntxt_id)); return (0); } static int free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl) { int rc; struct adapter *sc = iq->adapter; device_t dev; if (sc == NULL) return (0); /* nothing to do */ dev = vi ? vi->dev : sc->dev; if (iq->flags & IQ_ALLOCATED) { rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id, fl ? fl->cntxt_id : 0xffff, 0xffff); if (rc != 0) { device_printf(dev, "failed to free queue %p: %d\n", iq, rc); return (rc); } iq->flags &= ~IQ_ALLOCATED; } free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc); bzero(iq, sizeof(*iq)); if (fl) { free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba, fl->desc); if (fl->sdesc) free_fl_sdesc(sc, fl); if (mtx_initialized(&fl->fl_lock)) mtx_destroy(&fl->fl_lock); bzero(fl, sizeof(*fl)); } return (0); } static void add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_fl *fl) { struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, "freelist"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &fl->ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, fl->sidx * EQ_ESIZE + sc->params.sge.spg_len, "desc ring size in bytes"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the freelist"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL, fl_pad ? 1 : 0, "padding enabled"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL, fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx, 0, "consumer index"); if (fl->flags & FL_BUF_PACKING) { SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset", CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset"); } SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx, 0, "producer index"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated", CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined", CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated", CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled", CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled", CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)"); } static int alloc_fwq(struct adapter *sc) { int rc, intr_idx; struct sge_iq *fwq = &sc->sge.fwq; struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE); fwq->flags |= IQ_INTR; /* always */ if (sc->flags & IS_VF) intr_idx = 0; else { intr_idx = sc->intr_count > 1 ? 1 : 0; fwq->set_tcb_rpl = t4_filter_rpl; fwq->l2t_write_rpl = do_l2t_write_rpl; } rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1); if (rc != 0) { device_printf(sc->dev, "failed to create firmware event queue: %d\n", rc); return (rc); } oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD, NULL, "firmware event queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UAUTO(&sc->ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &fwq->ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(&sc->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, fwq->qsize * IQ_ESIZE, "descriptor ring size in bytes"); SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &fwq->abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &fwq->cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &fwq->cidx, 0, sysctl_uint16, "I", "consumer index"); return (0); } static int free_fwq(struct adapter *sc) { return free_iq_fl(NULL, &sc->sge.fwq, NULL); } static int alloc_mgmtq(struct adapter *sc) { int rc; struct sge_wrq *mgmtq = &sc->sge.mgmtq; char name[16]; struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD, NULL, "management queue"); snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev)); init_eq(sc, &mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan, sc->sge.fwq.cntxt_id, name); rc = alloc_wrq(sc, NULL, mgmtq, oid); if (rc != 0) { device_printf(sc->dev, "failed to create management queue: %d\n", rc); return (rc); } return (0); } static int free_mgmtq(struct adapter *sc) { return free_wrq(sc, &sc->sge.mgmtq); } int tnl_cong(struct port_info *pi, int drop) { if (drop == -1) return (-1); else if (drop == 1) return (0); else return (pi->rx_chan_map); } static int alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct adapter *sc = vi->pi->adapter; struct sysctl_oid_list *children; char name[16]; rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx, tnl_cong(vi->pi, cong_drop)); if (rc != 0) return (rc); if (idx == 0) sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id; else KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id, ("iq_base mismatch")); KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF, ("PF with non-zero iq_base")); /* * The freelist is just barely above the starvation threshold right now, * fill it up a bit more. */ FL_LOCK(&rxq->fl); refill_fl(sc, &rxq->fl, 128); FL_UNLOCK(&rxq->fl); #if defined(INET) || defined(INET6) rc = tcp_lro_init(&rxq->lro); if (rc != 0) return (rc); rxq->lro.ifp = vi->ifp; /* also indicates LRO init'ed */ if (vi->ifp->if_capenable & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; #endif rxq->ifp = vi->ifp; children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &rxq->iq.ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, rxq->iq.qsize * IQ_ESIZE, "descriptor ring size in bytes"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I", "consumer index"); #if defined(INET) || defined(INET6) SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD, &rxq->lro.lro_queued, 0, NULL); SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD, &rxq->lro.lro_flushed, 0, NULL); #endif SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD, &rxq->rxcsum, "# of times hardware assisted with checksum"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction", CTLFLAG_RD, &rxq->vlan_extraction, "# of times hardware extracted 802.1Q tag"); add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl); return (rc); } static int free_rxq(struct vi_info *vi, struct sge_rxq *rxq) { int rc; #if defined(INET) || defined(INET6) if (rxq->lro.ifp) { tcp_lro_free(&rxq->lro); rxq->lro.ifp = NULL; } #endif rc = free_iq_fl(vi, &rxq->iq, &rxq->fl); if (rc == 0) bzero(rxq, sizeof(*rxq)); return (rc); } #ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq, int intr_idx, int idx, struct sysctl_oid *oid) { struct port_info *pi = vi->pi; int rc; struct sysctl_oid_list *children; char name[16]; rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, pi->rx_chan_map); if (rc != 0) return (rc); children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &ofld_rxq->iq.ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, ofld_rxq->iq.qsize * IQ_ESIZE, "descriptor ring size in bytes"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I", "consumer index"); add_fl_sysctls(pi->adapter, &vi->ctx, oid, &ofld_rxq->fl); return (rc); } static int free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq) { int rc; rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl); if (rc == 0) bzero(ofld_rxq, sizeof(*ofld_rxq)); return (rc); } #endif #ifdef DEV_NETMAP static int alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct sysctl_oid_list *children; struct sysctl_ctx_list *ctx; char name[16]; size_t len; struct adapter *sc = vi->pi->adapter; struct netmap_adapter *na = NA(vi->ifp); MPASS(na != NULL); len = vi->qsize_rxq * IQ_ESIZE; rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map, &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc); if (rc != 0) return (rc); len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len; rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map, &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc); if (rc != 0) return (rc); nm_rxq->vi = vi; nm_rxq->nid = idx; nm_rxq->iq_cidx = 0; nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE; nm_rxq->iq_gen = F_RSPD_GEN; nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0; nm_rxq->fl_sidx = na->num_rx_desc; nm_rxq->intr_idx = intr_idx; ctx = &vi->ctx; children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I", "consumer index"); children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, "freelist"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the freelist"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &nm_rxq->fl_cidx, 0, "consumer index"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &nm_rxq->fl_pidx, 0, "producer index"); return (rc); } static int free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq) { struct adapter *sc = vi->pi->adapter; free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba, nm_rxq->iq_desc); free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba, nm_rxq->fl_desc); return (0); } static int alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx, struct sysctl_oid *oid) { int rc; size_t len; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct netmap_adapter *na = NA(vi->ifp); char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len; rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map, &nm_txq->ba, (void **)&nm_txq->desc); if (rc) return (rc); nm_txq->pidx = nm_txq->cidx = 0; nm_txq->sidx = na->num_tx_desc; nm_txq->nid = idx; nm_txq->iqidx = iqidx; nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) | V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) | V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid))); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "netmap tx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &nm_txq->cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I", "producer index"); return (rc); } static int free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq) { struct adapter *sc = vi->pi->adapter; free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba, nm_txq->desc); return (0); } #endif static int ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ctrl_cmd c; int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) | V_FW_EQ_CTRL_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC | F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c)); c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); c.physeqid_pkd = htobe32(0); c.fetchszm_to_iqid = htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_CTRL_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(sc->dev, "failed to create control queue %d: %d\n", eq->tx_chan, rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } static int eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_eth_cmd c; int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) | V_FW_EQ_ETH_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC | F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE | F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid)); c.fetchszm_to_iqid = htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | V_FW_EQ_ETH_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_ETH_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(vi->dev, "failed to create Ethernet egress queue: %d\n", rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd)); eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } #ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ofld_cmd c; int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) | V_FW_EQ_OFLD_CMD_VFN(0)); c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC | F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c)); c.fetchszm_to_iqid = htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_OFLD_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(vi->dev, "failed to create egress queue for TCP offload: %d\n", rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } #endif static int alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, qsize; size_t len; mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF); qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; len = qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map, &eq->ba, (void **)&eq->desc); if (rc) return (rc); eq->pidx = eq->cidx = 0; eq->equeqidx = eq->dbidx = 0; eq->doorbells = sc->doorbells; switch (eq->flags & EQ_TYPEMASK) { case EQ_CTRL: rc = ctrl_eq_alloc(sc, eq); break; case EQ_ETH: rc = eth_eq_alloc(sc, vi, eq); break; #ifdef TCP_OFFLOAD case EQ_OFLD: rc = ofld_eq_alloc(sc, vi, eq); break; #endif default: panic("%s: invalid eq type %d.", __func__, eq->flags & EQ_TYPEMASK); } if (rc != 0) { device_printf(sc->dev, "failed to allocate egress queue(%d): %d\n", eq->flags & EQ_TYPEMASK, rc); } if (isset(&eq->doorbells, DOORBELL_UDB) || isset(&eq->doorbells, DOORBELL_UDBWC) || isset(&eq->doorbells, DOORBELL_WCWR)) { uint32_t s_qpp = sc->params.sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT; /* pg offset */ eq->udb_qid = eq->cntxt_id & mask; /* id in page */ if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) clrbit(&eq->doorbells, DOORBELL_WCWR); else { udb += eq->udb_qid << UDBS_SEG_SHIFT; /* seg offset */ eq->udb_qid = 0; } eq->udb = (volatile void *)udb; } return (rc); } static int free_eq(struct adapter *sc, struct sge_eq *eq) { int rc; if (eq->flags & EQ_ALLOCATED) { switch (eq->flags & EQ_TYPEMASK) { case EQ_CTRL: rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; case EQ_ETH: rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; #ifdef TCP_OFFLOAD case EQ_OFLD: rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; #endif default: panic("%s: invalid eq type %d.", __func__, eq->flags & EQ_TYPEMASK); } if (rc != 0) { device_printf(sc->dev, "failed to free egress queue (%d): %d\n", eq->flags & EQ_TYPEMASK, rc); return (rc); } eq->flags &= ~EQ_ALLOCATED; } free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc); if (mtx_initialized(&eq->eq_lock)) mtx_destroy(&eq->eq_lock); bzero(eq, sizeof(*eq)); return (0); } static int alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq, struct sysctl_oid *oid) { int rc; struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); rc = alloc_eq(sc, vi, &wrq->eq); if (rc) return (rc); wrq->adapter = sc; TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq); TAILQ_INIT(&wrq->incomplete_wrs); STAILQ_INIT(&wrq->wr_list); wrq->nwr_pending = 0; wrq->ndesc_needed = 0; SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &wrq->eq.ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, wrq->eq.sidx * EQ_ESIZE + sc->params.sge.spg_len, "desc ring size in bytes"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &wrq->eq.cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I", "producer index"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL, wrq->eq.sidx, "status page index"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD, &wrq->tx_wrs_direct, "# of work requests (direct)"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD, &wrq->tx_wrs_copied, "# of work requests (copied)"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD, &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)"); return (rc); } static int free_wrq(struct adapter *sc, struct sge_wrq *wrq) { int rc; rc = free_eq(sc, &wrq->eq); if (rc) return (rc); bzero(wrq, sizeof(*wrq)); return (0); } static int alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx, struct sysctl_oid *oid) { int rc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_eq *eq = &txq->eq; char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx, M_CXGBE, M_WAITOK); if (rc != 0) { device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc); return (rc); } rc = alloc_eq(sc, vi, eq); if (rc != 0) { mp_ring_free(txq->r); txq->r = NULL; return (rc); } /* Can't fail after this point. */ if (idx == 0) sc->sge.eq_base = eq->abs_id - eq->cntxt_id; else KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id, ("eq_base mismatch")); KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF, ("PF with non-zero eq_base")); TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq); txq->ifp = vi->ifp; txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK); if (sc->flags & IS_VF) txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | V_TXPKT_INTF(pi->tx_chan)); else txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) | V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) | V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid))); txq->tc_idx = -1; txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE, M_ZERO | M_WAITOK); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "tx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &eq->ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, eq->sidx * EQ_ESIZE + sc->params.sge.spg_len, "desc ring size in bytes"); SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD, &eq->abs_id, 0, "absolute id of the queue"); SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &eq->cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I", "producer index"); SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL, eq->sidx, "status page index"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "tc", CTLTYPE_INT | CTLFLAG_RW, vi, idx, sysctl_tc, "I", "traffic class (-1 means none)"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD, &txq->txcsum, "# of times hardware assisted with checksum"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion", CTLFLAG_RD, &txq->vlan_insertion, "# of times hardware inserted 802.1Q tag"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD, &txq->tso_wrs, "# of TSO work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD, &txq->imm_wrs, "# of work requests with immediate data"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD, &txq->sgl_wrs, "# of work requests with direct SGL"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD, &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs", CTLFLAG_RD, &txq->txpkts0_wrs, "# of txpkts (type 0) work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs", CTLFLAG_RD, &txq->txpkts1_wrs, "# of txpkts (type 1) work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts", CTLFLAG_RD, &txq->txpkts0_pkts, "# of frames tx'd using type0 txpkts work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts", CTLFLAG_RD, &txq->txpkts1_pkts, "# of frames tx'd using type1 txpkts work requests"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues", CTLFLAG_RD, &txq->r->enqueues, "# of enqueues to the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops", CTLFLAG_RD, &txq->r->drops, "# of drops in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts", CTLFLAG_RD, &txq->r->starts, "# of normal consumer starts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls", CTLFLAG_RD, &txq->r->stalls, "# of consumer stalls in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts", CTLFLAG_RD, &txq->r->restarts, "# of consumer restarts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications", CTLFLAG_RD, &txq->r->abdications, "# of consumer abdications in the mp_ring for this queue"); return (0); } static int free_txq(struct vi_info *vi, struct sge_txq *txq) { int rc; struct adapter *sc = vi->pi->adapter; struct sge_eq *eq = &txq->eq; rc = free_eq(sc, eq); if (rc) return (rc); sglist_free(txq->gl); free(txq->sdesc, M_CXGBE); mp_ring_free(txq->r); bzero(txq, sizeof(*txq)); return (0); } static void oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *ba = arg; KASSERT(nseg == 1, ("%s meant for single segment mappings only.", __func__)); *ba = error ? 0 : segs->ds_addr; } static inline void ring_fl_db(struct adapter *sc, struct sge_fl *fl) { uint32_t n, v; n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx); MPASS(n > 0); wmb(); v = fl->dbval | V_PIDX(n); if (fl->udb) *fl->udb = htole32(v); else t4_write_reg(sc, sc->sge_kdoorbell_reg, v); IDXINCR(fl->dbidx, n, fl->sidx); } /* * Fills up the freelist by allocating up to 'n' buffers. Buffers that are * recycled do not count towards this allocation budget. * * Returns non-zero to indicate that this freelist should be added to the list * of starving freelists. */ static int refill_fl(struct adapter *sc, struct sge_fl *fl, int n) { __be64 *d; struct fl_sdesc *sd; uintptr_t pa; caddr_t cl; struct cluster_layout *cll; struct sw_zone_info *swz; struct cluster_metadata *clm; uint16_t max_pidx; uint16_t hw_cidx = fl->hw_cidx; /* stable snapshot */ FL_LOCK_ASSERT_OWNED(fl); /* * We always stop at the beginning of the hardware descriptor that's just * before the one with the hw cidx. This is to avoid hw pidx = hw cidx, * which would mean an empty freelist to the chip. */ max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1; if (fl->pidx == max_pidx * 8) return (0); d = &fl->desc[fl->pidx]; sd = &fl->sdesc[fl->pidx]; cll = &fl->cll_def; /* default layout */ swz = &sc->sge.sw_zone_info[cll->zidx]; while (n > 0) { if (sd->cl != NULL) { if (sd->nmbuf == 0) { /* * Fast recycle without involving any atomics on * the cluster's metadata (if the cluster has * metadata). This happens when all frames * received in the cluster were small enough to * fit within a single mbuf each. */ fl->cl_fast_recycled++; #ifdef INVARIANTS clm = cl_metadata(sc, fl, &sd->cll, sd->cl); if (clm != NULL) MPASS(clm->refcount == 1); #endif goto recycled_fast; } /* * Cluster is guaranteed to have metadata. Clusters * without metadata always take the fast recycle path * when they're recycled. */ clm = cl_metadata(sc, fl, &sd->cll, sd->cl); MPASS(clm != NULL); if (atomic_fetchadd_int(&clm->refcount, -1) == 1) { fl->cl_recycled++; counter_u64_add(extfree_rels, 1); goto recycled; } sd->cl = NULL; /* gave up my reference */ } MPASS(sd->cl == NULL); alloc: cl = uma_zalloc(swz->zone, M_NOWAIT); if (__predict_false(cl == NULL)) { if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 || fl->cll_def.zidx == fl->cll_alt.zidx) break; /* fall back to the safe zone */ cll = &fl->cll_alt; swz = &sc->sge.sw_zone_info[cll->zidx]; goto alloc; } fl->cl_allocated++; n--; pa = pmap_kextract((vm_offset_t)cl); pa += cll->region1; sd->cl = cl; sd->cll = *cll; *d = htobe64(pa | cll->hwidx); clm = cl_metadata(sc, fl, cll, cl); if (clm != NULL) { recycled: #ifdef INVARIANTS clm->sd = sd; #endif clm->refcount = 1; } sd->nmbuf = 0; recycled_fast: d++; sd++; if (__predict_false(++fl->pidx % 8 == 0)) { uint16_t pidx = fl->pidx / 8; if (__predict_false(pidx == fl->sidx)) { fl->pidx = 0; pidx = 0; sd = fl->sdesc; d = fl->desc; } if (pidx == max_pidx) break; if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4) ring_fl_db(sc, fl); } } if (fl->pidx / 8 != fl->dbidx) ring_fl_db(sc, fl); return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING)); } /* * Attempt to refill all starving freelists. */ static void refill_sfl(void *arg) { struct adapter *sc = arg; struct sge_fl *fl, *fl_temp; mtx_assert(&sc->sfl_lock, MA_OWNED); TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) { FL_LOCK(fl); refill_fl(sc, fl, 64); if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) { TAILQ_REMOVE(&sc->sfl, fl, link); fl->flags &= ~FL_STARVING; } FL_UNLOCK(fl); } if (!TAILQ_EMPTY(&sc->sfl)) callout_schedule(&sc->sfl_callout, hz / 5); } static int alloc_fl_sdesc(struct sge_fl *fl) { fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE, M_ZERO | M_WAITOK); return (0); } static void free_fl_sdesc(struct adapter *sc, struct sge_fl *fl) { struct fl_sdesc *sd; struct cluster_metadata *clm; struct cluster_layout *cll; int i; sd = fl->sdesc; for (i = 0; i < fl->sidx * 8; i++, sd++) { if (sd->cl == NULL) continue; cll = &sd->cll; clm = cl_metadata(sc, fl, cll, sd->cl); if (sd->nmbuf == 0) uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl); else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) { uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl); counter_u64_add(extfree_rels, 1); } sd->cl = NULL; } free(fl->sdesc, M_CXGBE); fl->sdesc = NULL; } static inline void get_pkt_gl(struct mbuf *m, struct sglist *gl) { int rc; M_ASSERTPKTHDR(m); sglist_reset(gl); rc = sglist_append_mbuf(gl, m); if (__predict_false(rc != 0)) { panic("%s: mbuf %p (%d segs) was vetted earlier but now fails " "with %d.", __func__, m, mbuf_nsegs(m), rc); } KASSERT(gl->sg_nseg == mbuf_nsegs(m), ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m, mbuf_nsegs(m), gl->sg_nseg)); KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS), ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__, gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)); } /* * len16 for a txpkt WR with a GL. Includes the firmware work request header. */ static inline u_int txpkt_len16(u_int nsegs, u_int tso) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); if (tso) n += sizeof(struct cpl_tx_pkt_lso_core); return (howmany(n, 16)); } /* * len16 for a txpkt_vm WR with a GL. Includes the firmware work * request header. */ static inline u_int txpkt_vm_len16(u_int nsegs, u_int tso) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = sizeof(struct fw_eth_tx_pkt_vm_wr) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); if (tso) n += sizeof(struct cpl_tx_pkt_lso_core); return (howmany(n, 16)); } /* * len16 for a txpkts type 0 WR with a GL. Does not include the firmware work * request header. */ static inline u_int txpkts0_len16(u_int nsegs) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); return (howmany(n, 16)); } /* * len16 for a txpkts type 1 WR with a GL. Does not include the firmware work * request header. */ static inline u_int txpkts1_len16(void) { u_int n; n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl); return (howmany(n, 16)); } static inline u_int imm_payload(u_int ndesc) { u_int n; n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) - sizeof(struct cpl_tx_pkt_core); return (n); } /* * Write a VM txpkt WR for this packet to the hardware descriptors, update the * software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. */ static u_int write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ uint64_t ctrl1; int csum_type, len16, ndesc, pktlen, nsegs; caddr_t dst; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m0); MPASS(available > 0 && available < eq->sidx); len16 = mbuf_len16(m0); nsegs = mbuf_nsegs(m0); pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); ndesc = howmany(len16, EQ_ESIZE / 16); MPASS(ndesc <= available); /* Firmware work request header */ MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3[0] = 0; wr->r3[1] = 0; /* * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci. * vlantci is ignored unless the ethtype is 0x8100, so it's * simpler to always copy it rather than making it * conditional. Also, it seems that we do not have to set * vlantci or fake the ethtype when doing VLAN tag insertion. */ m_copydata(m0, 0, sizeof(struct ether_header) + 2, wr->ethmacdst); csum_type = -1; if (needs_tso(m0)) { struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && m0->m_pkthdr.l4hlen > 0, ("%s: mbuf %p needs TSO but missing header lengths", __func__, m0)); ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header)) ctrl |= V_LSO_ETHHDR_LEN(1); if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_LSO_IPV6; lso->lso_ctrl = htobe32(ctrl); lso->ipid_ofst = htobe16(0); lso->mss = htobe16(m0->m_pkthdr.tso_segsz); lso->seqno_offset = htobe32(0); lso->len = htobe32(pktlen); if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) csum_type = TX_CSUM_TCPIP6; else csum_type = TX_CSUM_TCPIP; cpl = (void *)(lso + 1); txq->tso_wrs++; } else { if (m0->m_pkthdr.csum_flags & CSUM_IP_TCP) csum_type = TX_CSUM_TCPIP; else if (m0->m_pkthdr.csum_flags & CSUM_IP_UDP) csum_type = TX_CSUM_UDPIP; else if (m0->m_pkthdr.csum_flags & CSUM_IP6_TCP) csum_type = TX_CSUM_TCPIP6; else if (m0->m_pkthdr.csum_flags & CSUM_IP6_UDP) csum_type = TX_CSUM_UDPIP6; #if defined(INET) else if (m0->m_pkthdr.csum_flags & CSUM_IP) { /* * XXX: The firmware appears to stomp on the * fragment/flags field of the IP header when * using TX_CSUM_IP. Fall back to doing * software checksums. */ u_short *sump; struct mbuf *m; int offset; m = m0; offset = 0; sump = m_advance(&m, &offset, m0->m_pkthdr.l2hlen + offsetof(struct ip, ip_sum)); *sump = in_cksum_skip(m0, m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen, m0->m_pkthdr.l2hlen); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } #endif cpl = (void *)(wr + 1); } /* Checksum offload */ ctrl1 = 0; if (needs_l3_csum(m0) == 0) ctrl1 |= F_TXPKT_IPCSUM_DIS; if (csum_type >= 0) { KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0, ("%s: mbuf %p needs checksum offload but missing header lengths", __func__, m0)); if (chip_id(sc) <= CHELSIO_T5) { ctrl1 |= V_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen - ETHER_HDR_LEN); } else { ctrl1 |= V_T6_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen - ETHER_HDR_LEN); } ctrl1 |= V_TXPKT_IPHDR_LEN(m0->m_pkthdr.l3hlen); ctrl1 |= V_TXPKT_CSUM_TYPE(csum_type); } else ctrl1 |= F_TXPKT_L4CSUM_DIS; if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (needs_vlan_insertion(m0)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); /* SGL */ dst = (void *)(cpl + 1); /* * A packet using TSO will use up an entire descriptor for the * firmware work request header, LSO CPL, and TX_PKT_XT CPL. * If this descriptor is the last descriptor in the ring, wrap * around to the front of the ring explicitly for the start of * the sgl. */ if (dst == (void *)&eq->desc[eq->sidx]) { dst = (void *)&eq->desc[0]; write_gl_to_txd(txq, m0, &dst, 0); } else write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); txq->sgl_wrs++; txq->txpkt_wrs++; txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } /* * Write a txpkt WR for this packet to the hardware descriptors, update the * software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. */ static u_int write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr, struct mbuf *m0, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ uint64_t ctrl1; int len16, ndesc, pktlen, nsegs; caddr_t dst; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m0); MPASS(available > 0 && available < eq->sidx); len16 = mbuf_len16(m0); nsegs = mbuf_nsegs(m0); pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); else if (pktlen <= imm_payload(2) && available >= 2) { /* Immediate data. Recalculate len16 and set nsegs to 0. */ ctrl += pktlen; len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + pktlen, 16); nsegs = 0; } ndesc = howmany(len16, EQ_ESIZE / 16); MPASS(ndesc <= available); /* Firmware work request header */ MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; if (needs_tso(m0)) { struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && m0->m_pkthdr.l4hlen > 0, ("%s: mbuf %p needs TSO but missing header lengths", __func__, m0)); ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header)) ctrl |= V_LSO_ETHHDR_LEN(1); if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_LSO_IPV6; lso->lso_ctrl = htobe32(ctrl); lso->ipid_ofst = htobe16(0); lso->mss = htobe16(m0->m_pkthdr.tso_segsz); lso->seqno_offset = htobe32(0); lso->len = htobe32(pktlen); cpl = (void *)(lso + 1); txq->tso_wrs++; } else cpl = (void *)(wr + 1); /* Checksum offload */ ctrl1 = 0; if (needs_l3_csum(m0) == 0) ctrl1 |= F_TXPKT_IPCSUM_DIS; if (needs_l4_csum(m0) == 0) ctrl1 |= F_TXPKT_L4CSUM_DIS; if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (needs_vlan_insertion(m0)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); /* SGL */ dst = (void *)(cpl + 1); if (nsegs > 0) { write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); txq->sgl_wrs++; } else { struct mbuf *m; for (m = m0; m != NULL; m = m->m_next) { copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); #ifdef INVARIANTS pktlen -= m->m_len; #endif } #ifdef INVARIANTS KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen)); #endif txq->imm_wrs++; } txq->txpkt_wrs++; txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } static int try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available) { u_int needed, nsegs1, nsegs2, l1, l2; if (cannot_use_txpkts(m) || cannot_use_txpkts(n)) return (1); nsegs1 = mbuf_nsegs(m); nsegs2 = mbuf_nsegs(n); if (nsegs1 + nsegs2 == 2) { txp->wr_type = 1; l1 = l2 = txpkts1_len16(); } else { txp->wr_type = 0; l1 = txpkts0_len16(nsegs1); l2 = txpkts0_len16(nsegs2); } txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2; needed = howmany(txp->len16, EQ_ESIZE / 16); if (needed > SGE_MAX_WR_NDESC || needed > available) return (1); txp->plen = m->m_pkthdr.len + n->m_pkthdr.len; if (txp->plen > 65535) return (1); txp->npkt = 2; set_mbuf_len16(m, l1); set_mbuf_len16(n, l2); return (0); } static int add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available) { u_int plen, len16, needed, nsegs; MPASS(txp->wr_type == 0 || txp->wr_type == 1); nsegs = mbuf_nsegs(m); if (needs_tso(m) || (txp->wr_type == 1 && nsegs != 1)) return (1); plen = txp->plen + m->m_pkthdr.len; if (plen > 65535) return (1); if (txp->wr_type == 0) len16 = txpkts0_len16(nsegs); else len16 = txpkts1_len16(); needed = howmany(txp->len16 + len16, EQ_ESIZE / 16); if (needed > SGE_MAX_WR_NDESC || needed > available) return (1); txp->npkt++; txp->plen = plen; txp->len16 += len16; set_mbuf_len16(m, len16); return (0); } /* * Write a txpkts WR for the packets in txp to the hardware descriptors, update * the software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. */ static u_int write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr, struct mbuf *m0, const struct txpkts *txp, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; uint64_t ctrl1; int ndesc, checkwrap; struct mbuf *m; void *flitp; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(txp->npkt > 0); MPASS(txp->plen < 65536); MPASS(m0 != NULL); MPASS(m0->m_nextpkt != NULL); MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); MPASS(available > 0 && available < eq->sidx); ndesc = howmany(txp->len16, EQ_ESIZE / 16); MPASS(ndesc <= available); MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); ctrl = V_FW_WR_LEN16(txp->len16); wr->equiq_to_len16 = htobe32(ctrl); wr->plen = htobe16(txp->plen); wr->npkt = txp->npkt; wr->r3 = 0; wr->type = txp->wr_type; flitp = wr + 1; /* * At this point we are 16B into a hardware descriptor. If checkwrap is * set then we know the WR is going to wrap around somewhere. We'll * check for that at appropriate points. */ checkwrap = eq->sidx - ndesc < eq->pidx; for (m = m0; m != NULL; m = m->m_nextpkt) { if (txp->wr_type == 0) { struct ulp_txpkt *ulpmc; struct ulptx_idata *ulpsc; /* ULP master command */ ulpmc = flitp; ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid)); ulpmc->len = htobe32(mbuf_len16(m)); /* ULP subcommand */ ulpsc = (void *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | F_ULP_TX_SC_MORE); ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core)); cpl = (void *)(ulpsc + 1); if (checkwrap && (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx]) cpl = (void *)&eq->desc[0]; txq->txpkts0_pkts += txp->npkt; txq->txpkts0_wrs++; } else { cpl = flitp; txq->txpkts1_pkts += txp->npkt; txq->txpkts1_wrs++; } /* Checksum offload */ ctrl1 = 0; if (needs_l3_csum(m) == 0) ctrl1 |= F_TXPKT_IPCSUM_DIS; if (needs_l4_csum(m) == 0) ctrl1 |= F_TXPKT_L4CSUM_DIS; if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (needs_vlan_insertion(m)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(m->m_pkthdr.len); cpl->ctrl1 = htobe64(ctrl1); flitp = cpl + 1; if (checkwrap && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx]) flitp = (void *)&eq->desc[0]; write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap); } txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } /* * If the SGL ends on an address that is not 16 byte aligned, this function will * add a 0 filled flit at the end. */ static void write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap) { struct sge_eq *eq = &txq->eq; struct sglist *gl = txq->gl; struct sglist_seg *seg; __be64 *flitp, *wrap; struct ulptx_sgl *usgl; int i, nflits, nsegs; KASSERT(((uintptr_t)(*to) & 0xf) == 0, ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to)); MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); get_pkt_gl(m, gl); nsegs = gl->sg_nseg; MPASS(nsegs > 0); nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2; flitp = (__be64 *)(*to); wrap = (__be64 *)(&eq->desc[eq->sidx]); seg = &gl->sg_segs[0]; usgl = (void *)flitp; /* * We start at a 16 byte boundary somewhere inside the tx descriptor * ring, so we're at least 16 bytes away from the status page. There is * no chance of a wrap around in the middle of usgl (which is 16 bytes). */ usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); usgl->len0 = htobe32(seg->ss_len); usgl->addr0 = htobe64(seg->ss_paddr); seg++; if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) { /* Won't wrap around at all */ for (i = 0; i < nsegs - 1; i++, seg++) { usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); flitp += nflits; } else { /* Will wrap somewhere in the rest of the SGL */ /* 2 flits already written, write the rest flit by flit */ flitp = (void *)(usgl + 1); for (i = 0; i < nflits - 2; i++) { if (flitp == wrap) flitp = (void *)eq->desc; *flitp++ = get_flit(seg, nsegs - 1, i); } } if (nflits & 1) { MPASS(((uintptr_t)flitp) & 0xf); *flitp++ = 0; } MPASS((((uintptr_t)flitp) & 0xf) == 0); if (__predict_false(flitp == wrap)) *to = (void *)eq->desc; else *to = (void *)flitp; } static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)&eq->desc[eq->sidx])) { bcopy(from, *to, len); (*to) += len; } else { int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to); bcopy(from, *to, portion); from += portion; portion = len - portion; /* remaining */ bcopy(from, (void *)eq->desc, portion); (*to) = (caddr_t)eq->desc + portion; } } static inline void ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n) { u_int db; MPASS(n > 0); db = eq->doorbells; if (n > 1) clrbit(&db, DOORBELL_WCWR); wmb(); switch (ffs(db) - 1) { case DOORBELL_UDB: *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); break; case DOORBELL_WCWR: { volatile uint64_t *dst, *src; int i; /* * Queues whose 128B doorbell segment fits in the page do not * use relative qid (udb_qid is always 0). Only queues with * doorbell segments can do WCWR. */ KASSERT(eq->udb_qid == 0 && n == 1, ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p", __func__, eq->doorbells, n, eq->dbidx, eq)); dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET - UDBS_DB_OFFSET); i = eq->dbidx; src = (void *)&eq->desc[i]; while (src != (void *)&eq->desc[i + 1]) *dst++ = *src++; wmb(); break; } case DOORBELL_UDBWC: *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); wmb(); break; case DOORBELL_KDB: t4_write_reg(sc, sc->sge_kdoorbell_reg, V_QID(eq->cntxt_id) | V_PIDX(n)); break; } IDXINCR(eq->dbidx, n, eq->sidx); } static inline u_int reclaimable_tx_desc(struct sge_eq *eq) { uint16_t hw_cidx; hw_cidx = read_hw_cidx(eq); return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx)); } static inline u_int total_available_tx_desc(struct sge_eq *eq) { uint16_t hw_cidx, pidx; hw_cidx = read_hw_cidx(eq); pidx = eq->pidx; if (pidx == hw_cidx) return (eq->sidx - 1); else return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1); } static inline uint16_t read_hw_cidx(struct sge_eq *eq) { struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; uint16_t cidx = spg->cidx; /* stable snapshot */ return (be16toh(cidx)); } /* * Reclaim 'n' descriptors approximately. */ static u_int reclaim_tx_descs(struct sge_txq *txq, u_int n) { struct tx_sdesc *txsd; struct sge_eq *eq = &txq->eq; u_int can_reclaim, reclaimed; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(n > 0); reclaimed = 0; can_reclaim = reclaimable_tx_desc(eq); while (can_reclaim && reclaimed < n) { int ndesc; struct mbuf *m, *nextpkt; txsd = &txq->sdesc[eq->cidx]; ndesc = txsd->desc_used; /* Firmware doesn't return "partial" credits. */ KASSERT(can_reclaim >= ndesc, ("%s: unexpected number of credits: %d, %d", __func__, can_reclaim, ndesc)); for (m = txsd->m; m != NULL; m = nextpkt) { nextpkt = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); } reclaimed += ndesc; can_reclaim -= ndesc; IDXINCR(eq->cidx, ndesc, eq->sidx); } return (reclaimed); } static void tx_reclaim(void *arg, int n) { struct sge_txq *txq = arg; struct sge_eq *eq = &txq->eq; do { if (TXQ_TRYLOCK(txq) == 0) break; n = reclaim_tx_descs(txq, 32); if (eq->cidx == eq->pidx) eq->equeqidx = eq->pidx; TXQ_UNLOCK(txq); } while (n > 0); } static __be64 get_flit(struct sglist_seg *segs, int nsegs, int idx) { int i = (idx / 3) * 2; switch (idx % 3) { case 0: { __be64 rc; rc = htobe32(segs[i].ss_len); if (i + 1 < nsegs) rc |= (uint64_t)htobe32(segs[i + 1].ss_len) << 32; return (rc); } case 1: return (htobe64(segs[i].ss_paddr)); case 2: return (htobe64(segs[i + 1].ss_paddr)); } return (0); } static void find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp) { int8_t zidx, hwidx, idx; uint16_t region1, region3; int spare, spare_needed, n; struct sw_zone_info *swz; struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0]; /* * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize * large enough for the max payload and cluster metadata. Otherwise * settle for the largest bufsize that leaves enough room in the cluster * for metadata. * * Without buffer packing: Look for the smallest zone which has a * bufsize large enough for the max payload. Settle for the largest * bufsize available if there's nothing big enough for max payload. */ spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0; swz = &sc->sge.sw_zone_info[0]; hwidx = -1; for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) { if (swz->size > largest_rx_cluster) { if (__predict_true(hwidx != -1)) break; /* * This is a misconfiguration. largest_rx_cluster is * preventing us from finding a refill source. See * dev.t5nex..buffer_sizes to figure out why. */ device_printf(sc->dev, "largest_rx_cluster=%u leaves no" " refill source for fl %p (dma %u). Ignored.\n", largest_rx_cluster, fl, maxp); } for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) { hwb = &hwb_list[idx]; spare = swz->size - hwb->size; if (spare < spare_needed) continue; hwidx = idx; /* best option so far */ if (hwb->size >= maxp) { if ((fl->flags & FL_BUF_PACKING) == 0) goto done; /* stop looking (not packing) */ if (swz->size >= safest_rx_cluster) goto done; /* stop looking (packing) */ } break; /* keep looking, next zone */ } } done: /* A usable hwidx has been located. */ MPASS(hwidx != -1); hwb = &hwb_list[hwidx]; zidx = hwb->zidx; swz = &sc->sge.sw_zone_info[zidx]; region1 = 0; region3 = swz->size - hwb->size; /* * Stay within this zone and see if there is a better match when mbuf * inlining is allowed. Remember that the hwidx's are sorted in * decreasing order of size (so in increasing order of spare area). */ for (idx = hwidx; idx != -1; idx = hwb->next) { hwb = &hwb_list[idx]; spare = swz->size - hwb->size; if (allow_mbufs_in_cluster == 0 || hwb->size < maxp) break; /* * Do not inline mbufs if doing so would violate the pad/pack * boundary alignment requirement. */ if (fl_pad && (MSIZE % sc->params.sge.pad_boundary) != 0) continue; if (fl->flags & FL_BUF_PACKING && (MSIZE % sc->params.sge.pack_boundary) != 0) continue; if (spare < CL_METADATA_SIZE + MSIZE) continue; n = (spare - CL_METADATA_SIZE) / MSIZE; if (n > howmany(hwb->size, maxp)) break; hwidx = idx; if (fl->flags & FL_BUF_PACKING) { region1 = n * MSIZE; region3 = spare - region1; } else { region1 = MSIZE; region3 = spare - region1; break; } } KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES, ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp)); KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES, ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp)); KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 == sc->sge.sw_zone_info[zidx].size, ("%s: bad buffer layout for fl %p, maxp %d. " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); if (fl->flags & FL_BUF_PACKING || region1 > 0) { KASSERT(region3 >= CL_METADATA_SIZE, ("%s: no room for metadata. fl %p, maxp %d; " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); KASSERT(region1 % MSIZE == 0, ("%s: bad mbuf region for fl %p, maxp %d. " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); } fl->cll_def.zidx = zidx; fl->cll_def.hwidx = hwidx; fl->cll_def.region1 = region1; fl->cll_def.region3 = region3; } static void find_safe_refill_source(struct adapter *sc, struct sge_fl *fl) { struct sge *s = &sc->sge; struct hw_buf_info *hwb; struct sw_zone_info *swz; int spare; int8_t hwidx; if (fl->flags & FL_BUF_PACKING) hwidx = s->safe_hwidx2; /* with room for metadata */ else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) { hwidx = s->safe_hwidx2; hwb = &s->hw_buf_info[hwidx]; swz = &s->sw_zone_info[hwb->zidx]; spare = swz->size - hwb->size; /* no good if there isn't room for an mbuf as well */ if (spare < CL_METADATA_SIZE + MSIZE) hwidx = s->safe_hwidx1; } else hwidx = s->safe_hwidx1; if (hwidx == -1) { /* No fallback source */ fl->cll_alt.hwidx = -1; fl->cll_alt.zidx = -1; return; } hwb = &s->hw_buf_info[hwidx]; swz = &s->sw_zone_info[hwb->zidx]; spare = swz->size - hwb->size; fl->cll_alt.hwidx = hwidx; fl->cll_alt.zidx = hwb->zidx; if (allow_mbufs_in_cluster && (fl_pad == 0 || (MSIZE % sc->params.sge.pad_boundary) == 0)) fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE; else fl->cll_alt.region1 = 0; fl->cll_alt.region3 = spare - fl->cll_alt.region1; } static void add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl) { mtx_lock(&sc->sfl_lock); FL_LOCK(fl); if ((fl->flags & FL_DOOMED) == 0) { fl->flags |= FL_STARVING; TAILQ_INSERT_TAIL(&sc->sfl, fl, link); callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc); } FL_UNLOCK(fl); mtx_unlock(&sc->sfl_lock); } static void handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq) { struct sge_wrq *wrq = (void *)eq; atomic_readandclear_int(&eq->equiq); taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task); } static void handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq) { struct sge_txq *txq = (void *)eq; MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH); atomic_readandclear_int(&eq->equiq); mp_ring_check_drainage(txq->r, 0); taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task); } static int handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1); unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid)); struct adapter *sc = iq->adapter; struct sge *s = &sc->sge; struct sge_eq *eq; static void (*h[])(struct adapter *, struct sge_eq *) = {NULL, &handle_wrq_egr_update, &handle_eth_egr_update, &handle_wrq_egr_update}; KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); eq = s->eqmap[qid - s->eq_start - s->eq_base]; (*h[eq->flags & EQ_TYPEMASK])(sc, eq); return (0); } /* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */ CTASSERT(offsetof(struct cpl_fw4_msg, data) == \ offsetof(struct cpl_fw6_msg, data)); static int handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_fw6_msg *cpl = (const void *)(rss + 1); KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) { const struct rss_header *rss2; rss2 = (const struct rss_header *)&cpl->data[0]; return (t4_cpl_handler[rss2->opcode](iq, rss2, m)); } return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0])); } /** * t4_handle_wrerr_rpl - process a FW work request error message * @adap: the adapter * @rpl: start of the FW message */ static int t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl) { u8 opcode = *(const u8 *)rpl; const struct fw_error_cmd *e = (const void *)rpl; unsigned int i; if (opcode != FW_ERROR_CMD) { log(LOG_ERR, "%s: Received WRERR_RPL message with opcode %#x\n", device_get_nameunit(adap->dev), opcode); return (EINVAL); } log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev), G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" : "non-fatal"); switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) { case FW_ERROR_TYPE_EXCEPTION: log(LOG_ERR, "exception info:\n"); for (i = 0; i < nitems(e->u.exception.info); i++) log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ", be32toh(e->u.exception.info[i])); log(LOG_ERR, "\n"); break; case FW_ERROR_TYPE_HWMODULE: log(LOG_ERR, "HW module regaddr %08x regval %08x\n", be32toh(e->u.hwmodule.regaddr), be32toh(e->u.hwmodule.regval)); break; case FW_ERROR_TYPE_WR: log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n", be16toh(e->u.wr.cidx), G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)), G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)), be32toh(e->u.wr.eqid)); for (i = 0; i < nitems(e->u.wr.wrhdr); i++) log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ", e->u.wr.wrhdr[i]); log(LOG_ERR, "\n"); break; case FW_ERROR_TYPE_ACL: log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s", be16toh(e->u.acl.cidx), G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)), G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)), be32toh(e->u.acl.eqid), G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" : "MAC"); for (i = 0; i < nitems(e->u.acl.val); i++) log(LOG_ERR, " %02x", e->u.acl.val[i]); log(LOG_ERR, "\n"); break; default: log(LOG_ERR, "type %#x\n", G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))); return (EINVAL); } return (0); } static int sysctl_uint16(SYSCTL_HANDLER_ARGS) { uint16_t *id = arg1; int i = *id; return sysctl_handle_int(oidp, &i, 0, req); } static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS) { struct sge *s = arg1; struct hw_buf_info *hwb = &s->hw_buf_info[0]; struct sw_zone_info *swz = &s->sw_zone_info[0]; int i, rc; struct sbuf sb; char c; sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND); for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) { if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster) c = '*'; else c = '\0'; sbuf_printf(&sb, "%u%c ", hwb->size, c); } sbuf_trim(&sb); sbuf_finish(&sb); rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (rc); } static int sysctl_tc(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct port_info *pi; struct adapter *sc; struct sge_txq *txq; struct tx_sched_class *tc; int qidx = arg2, rc, tc_idx; uint32_t fw_queue, fw_class; MPASS(qidx >= 0 && qidx < vi->ntxq); pi = vi->pi; sc = pi->adapter; txq = &sc->sge.txq[vi->first_txq + qidx]; tc_idx = txq->tc_idx; rc = sysctl_handle_int(oidp, &tc_idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); /* Note that -1 is legitimate input (it means unbind). */ if (tc_idx < -1 || tc_idx >= sc->chip_params->nsched_cls) return (EINVAL); rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4stc"); if (rc) return (rc); if (tc_idx == txq->tc_idx) { rc = 0; /* No change, nothing to do. */ goto done; } fw_queue = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id); if (tc_idx == -1) fw_class = 0xffffffff; /* Unbind. */ else { /* * Bind to a different class. Ethernet txq's are only allowed * to bind to cl-rl mode-class for now. XXX: too restrictive. */ tc = &pi->tc[tc_idx]; if (tc->flags & TX_SC_OK && tc->params.level == SCHED_CLASS_LEVEL_CL_RL && tc->params.mode == SCHED_CLASS_MODE_CLASS) { /* Ok to proceed. */ fw_class = tc_idx; } else { rc = tc->flags & TX_SC_OK ? EBUSY : ENXIO; goto done; } } rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class); if (rc == 0) { if (txq->tc_idx != -1) { tc = &pi->tc[txq->tc_idx]; MPASS(tc->refcount > 0); tc->refcount--; } if (tc_idx != -1) { tc = &pi->tc[tc_idx]; tc->refcount++; } txq->tc_idx = tc_idx; } done: end_synchronized_op(sc, 0); return (rc); } Index: user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c (revision 307896) @@ -1,4233 +1,4234 @@ /*- * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2004-2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmbus_if.h" /* Short for Hyper-V network interface */ #define NETVSC_DEVNAME "hn" /* * It looks like offset 0 of buf is reserved to hold the softc pointer. * The sc pointer evidently not needed, and is not presently populated. * The packet offset is where the netvsc_packet starts in the buffer. */ #define HV_NV_SC_PTR_OFFSET_IN_BUF 0 #define HV_NV_PACKET_OFFSET_IN_BUF 16 /* YYY should get it from the underlying channel */ #define HN_TX_DESC_CNT 512 #define HN_LROENT_CNT_DEF 128 #define HN_RING_CNT_DEF_MAX 8 #define HN_RNDIS_PKT_LEN \ (sizeof(struct rndis_packet_msg) + \ HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE #define HN_TX_DATA_BOUNDARY PAGE_SIZE #define HN_TX_DATA_MAXSIZE IP_MAXPACKET #define HN_TX_DATA_SEGSIZE PAGE_SIZE /* -1 for RNDIS packet message */ #define HN_TX_DATA_SEGCNT_MAX (NETVSC_PACKET_MAXPAGE - 1) #define HN_DIRECT_TX_SIZE_DEF 128 #define HN_EARLY_TXEOF_THRESH 8 struct hn_txdesc { #ifndef HN_USE_TXDESC_BUFRING SLIST_ENTRY(hn_txdesc) link; #endif struct mbuf *m; struct hn_tx_ring *txr; int refs; uint32_t flags; /* HN_TXD_FLAG_ */ struct hn_send_ctx send_ctx; uint32_t chim_index; int chim_size; bus_dmamap_t data_dmap; bus_addr_t rndis_pkt_paddr; struct rndis_packet_msg *rndis_pkt; bus_dmamap_t rndis_pkt_dmap; }; #define HN_TXD_FLAG_ONLIST 0x1 #define HN_TXD_FLAG_DMAMAP 0x2 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) /* YYY 2*MTU is a bit rough, but should be good enough. */ #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) #define HN_LRO_ACKCNT_DEF 1 #define HN_LOCK_INIT(sc) \ sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) #define HN_LOCK(sc) sx_xlock(&(sc)->hn_lock) #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) #define HN_CSUM_IP_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) #define HN_CSUM_IP6_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) /* * Globals */ SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V network interface"); /* Trust tcp segements verification on host side. */ static int hn_trust_hosttcp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, &hn_trust_hosttcp, 0, "Trust tcp segement verification on host side, " "when csum info is missing (global setting)"); /* Trust udp datagrams verification on host side. */ static int hn_trust_hostudp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, &hn_trust_hostudp, 0, "Trust udp datagram verification on host side, " "when csum info is missing (global setting)"); /* Trust ip packets verification on host side. */ static int hn_trust_hostip = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, &hn_trust_hostip, 0, "Trust ip packet verification on host side, " "when csum info is missing (global setting)"); /* Limit TSO burst size */ static int hn_tso_maxlen = IP_MAXPACKET; SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, &hn_tso_maxlen, 0, "TSO burst limit"); /* Limit chimney send size */ static int hn_tx_chimney_size = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, &hn_tx_chimney_size, 0, "Chimney send packet size limit"); /* Limit the size of packet for direct transmission */ static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, &hn_lro_entry_count, 0, "LRO entry count"); #endif #endif static int hn_share_tx_taskq = 0; SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN, &hn_share_tx_taskq, 0, "Enable shared TX taskqueue"); static struct taskqueue *hn_tx_taskq; #ifndef HN_USE_TXDESC_BUFRING static int hn_use_txdesc_bufring = 0; #else static int hn_use_txdesc_bufring = 1; #endif SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); static int hn_bind_tx_taskq = -1; SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN, &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu"); static int hn_use_if_start = 0; SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, &hn_use_if_start, 0, "Use if_start TX method"); static int hn_chan_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, &hn_chan_cnt, 0, "# of channels to use; each channel has one RX ring and one TX ring"); static int hn_tx_ring_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, &hn_tx_ring_cnt, 0, "# of TX rings to use"); static int hn_tx_swq_depth = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); #if __FreeBSD_version >= 1100095 static u_int hn_lro_mbufq_depth = 0; SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); #endif static u_int hn_cpu_index; /* * Forward declarations */ static void hn_stop(struct hn_softc *sc); static void hn_init_locked(struct hn_softc *sc); static void hn_init(void *xsc); static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int hn_start_locked(struct hn_tx_ring *txr, int len); static void hn_start(struct ifnet *ifp); static void hn_start_txeof(struct hn_tx_ring *); static int hn_ifmedia_upd(struct ifnet *ifp); static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); #if __FreeBSD_version >= 1100099 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); #if __FreeBSD_version < 1100095 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); #else static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); static int hn_check_iplen(const struct mbuf *, int); static int hn_create_tx_ring(struct hn_softc *, int); static void hn_destroy_tx_ring(struct hn_tx_ring *); static int hn_create_tx_data(struct hn_softc *, int); static void hn_fixup_tx_data(struct hn_softc *); static void hn_destroy_tx_data(struct hn_softc *); static void hn_start_taskfunc(void *, int); static void hn_start_txeof_taskfunc(void *, int); static void hn_link_taskfunc(void *, int); static void hn_netchg_init_taskfunc(void *, int); static void hn_netchg_status_taskfunc(void *, int); static void hn_suspend_mgmt_taskfunc(void *, int); static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); static int hn_create_rx_data(struct hn_softc *sc, int); static void hn_destroy_rx_data(struct hn_softc *sc); static void hn_set_chim_size(struct hn_softc *, int); static void hn_set_tso_maxsize(struct hn_softc *, int, int); static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *); static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *); static int hn_attach_subchans(struct hn_softc *); static void hn_detach_allchans(struct hn_softc *); static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr); static void hn_set_ring_inuse(struct hn_softc *, int); static int hn_synth_attach(struct hn_softc *, int); static void hn_synth_detach(struct hn_softc *); static bool hn_tx_ring_pending(struct hn_tx_ring *); static void hn_suspend(struct hn_softc *); static void hn_suspend_data(struct hn_softc *); static void hn_suspend_mgmt(struct hn_softc *); static void hn_resume(struct hn_softc *); static void hn_resume_data(struct hn_softc *); static void hn_resume_mgmt(struct hn_softc *); static void hn_rx_drain(struct vmbus_channel *); static void hn_tx_resume(struct hn_softc *, int); static void hn_tx_ring_qflush(struct hn_tx_ring *); static int netvsc_detach(device_t dev); static void hn_link_status(struct hn_softc *); static int hn_sendpkt_rndis_sglist(struct hn_tx_ring *, struct hn_txdesc *); static int hn_sendpkt_rndis_chim(struct hn_tx_ring *, struct hn_txdesc *); static int hn_set_rxfilter(struct hn_softc *); static void hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt); static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkt); static void hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr); static void hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid); static int hn_transmit(struct ifnet *, struct mbuf *); static void hn_xmit_qflush(struct ifnet *); static int hn_xmit(struct hn_tx_ring *, int); static void hn_xmit_txeof(struct hn_tx_ring *); static void hn_xmit_taskfunc(void *, int); static void hn_xmit_txeof_taskfunc(void *, int); static const uint8_t hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa }; #if __FreeBSD_version >= 1100099 static void hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) { int i; for (i = 0; i < sc->hn_rx_ring_inuse; ++i) sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; } #endif static __inline int hn_nvs_send_rndis_sglist1(struct vmbus_channel *chan, uint32_t rndis_mtype, struct hn_send_ctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) { struct hn_nvs_rndis rndis; rndis.nvs_type = HN_NVS_TYPE_RNDIS; rndis.nvs_rndis_mtype = rndis_mtype; rndis.nvs_chim_idx = HN_NVS_CHIM_IDX_INVALID; rndis.nvs_chim_sz = 0; return (hn_nvs_send_sglist(chan, gpa, gpa_cnt, &rndis, sizeof(rndis), sndc)); } int hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan, struct hn_send_ctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) { return hn_nvs_send_rndis_sglist1(chan, HN_NVS_RNDIS_MTYPE_CTRL, sndc, gpa, gpa_cnt); } static int hn_sendpkt_rndis_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && txd->chim_size == 0, ("invalid rndis sglist txd")); return (hn_nvs_send_rndis_sglist1(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); } static int hn_sendpkt_rndis_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) { struct hn_nvs_rndis rndis; KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && txd->chim_size > 0, ("invalid rndis chim txd")); rndis.nvs_type = HN_NVS_TYPE_RNDIS; rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; rndis.nvs_chim_idx = txd->chim_index; rndis.nvs_chim_sz = txd->chim_size; return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, &rndis, sizeof(rndis), &txd->send_ctx)); } static int hn_set_rxfilter(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; uint32_t filter; int error = 0; HN_LOCK_ASSERT(sc); if (ifp->if_flags & IFF_PROMISC) { filter = NDIS_PACKET_TYPE_PROMISCUOUS; } else { filter = NDIS_PACKET_TYPE_DIRECTED; if (ifp->if_flags & IFF_BROADCAST) filter |= NDIS_PACKET_TYPE_BROADCAST; #ifdef notyet /* * See the comment in SIOCADDMULTI/SIOCDELMULTI. */ /* TODO: support multicast list */ if ((ifp->if_flags & IFF_ALLMULTI) || !TAILQ_EMPTY(&ifp->if_multiaddrs)) filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; #else /* Always enable ALLMULTI */ filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; #endif } if (sc->hn_rx_filter != filter) { error = hn_rndis_set_rxfilter(sc, filter); if (!error) sc->hn_rx_filter = filter; } return (error); } static int hn_get_txswq_depth(const struct hn_tx_ring *txr) { KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); if (hn_tx_swq_depth < txr->hn_txdesc_cnt) return txr->hn_txdesc_cnt; return hn_tx_swq_depth; } static int hn_rss_reconfig(struct hn_softc *sc) { int error; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return (ENXIO); /* * Disable RSS first. * * NOTE: * Direct reconfiguration by setting the UNCHG flags does * _not_ work properly. */ if (bootverbose) if_printf(sc->hn_ifp, "disable RSS\n"); error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); if (error) { if_printf(sc->hn_ifp, "RSS disable failed\n"); return (error); } /* * Reenable the RSS w/ the updated RSS key or indirect * table. */ if (bootverbose) if_printf(sc->hn_ifp, "reconfig RSS\n"); error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); if (error) { if_printf(sc->hn_ifp, "RSS reconfig failed\n"); return (error); } return (0); } static void hn_rss_ind_fixup(struct hn_softc *sc, int nchan) { struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int i; KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); /* * Check indirect table to make sure that all channels in it * can be used. */ for (i = 0; i < NDIS_HASH_INDCNT; ++i) { if (rss->rss_ind[i] >= nchan) { if_printf(sc->hn_ifp, "RSS indirect table %d fixup: %u -> %d\n", i, rss->rss_ind[i], nchan - 1); rss->rss_ind[i] = nchan - 1; } } } static int hn_ifmedia_upd(struct ifnet *ifp __unused) { return EOPNOTSUPP; } static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct hn_softc *sc = ifp->if_softc; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { ifmr->ifm_active |= IFM_NONE; return; } ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_10G_T | IFM_FDX; } /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */ static const struct hyperv_guid g_net_vsc_device_type = { .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} }; /* * Standard probe entry point. * */ static int netvsc_probe(device_t dev) { if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &g_net_vsc_device_type) == 0) { device_set_desc(dev, "Hyper-V Network Interface"); return BUS_PROBE_DEFAULT; } return ENXIO; } /* * Standard attach entry point. * * Called when the driver is loaded. It allocates needed resources, * and initializes the "hardware" and software. */ static int netvsc_attach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; uint8_t eaddr[ETHER_ADDR_LEN]; struct ifnet *ifp = NULL; int error, ring_cnt, tx_ring_cnt; sc->hn_dev = dev; sc->hn_prichan = vmbus_get_channel(dev); HN_LOCK_INIT(sc); /* * Setup taskqueue for transmission. */ if (hn_tx_taskq == NULL) { sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_tx_taskq); if (hn_bind_tx_taskq >= 0) { int cpu = hn_bind_tx_taskq; cpuset_t cpu_set; if (cpu > mp_ncpus - 1) cpu = mp_ncpus - 1; CPU_SETOF(cpu, &cpu_set); taskqueue_start_threads_cpuset(&sc->hn_tx_taskq, 1, PI_NET, &cpu_set, "%s tx", device_get_nameunit(dev)); } else { taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx", device_get_nameunit(dev)); } } else { sc->hn_tx_taskq = hn_tx_taskq; } /* * Setup taskqueue for mangement tasks, e.g. link status. */ sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", device_get_nameunit(dev)); TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, hn_netchg_status_taskfunc, sc); /* * Allocate ifnet and setup its name earlier, so that if_printf * can be used by functions, which will be called after * ether_ifattach(). */ ifp = sc->hn_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); /* * Initialize ifmedia earlier so that it can be unconditionally * destroyed, if error happened later on. */ ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); /* * Figure out the # of RX rings (ring_cnt) and the # of TX rings * to use (tx_ring_cnt). * * NOTE: * The # of RX rings to use is same as the # of channels to use. */ ring_cnt = hn_chan_cnt; if (ring_cnt <= 0) { /* Default */ ring_cnt = mp_ncpus; if (ring_cnt > HN_RING_CNT_DEF_MAX) ring_cnt = HN_RING_CNT_DEF_MAX; } else if (ring_cnt > mp_ncpus) { ring_cnt = mp_ncpus; } tx_ring_cnt = hn_tx_ring_cnt; if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) tx_ring_cnt = ring_cnt; if (hn_use_if_start) { /* ifnet.if_start only needs one TX ring. */ tx_ring_cnt = 1; } /* * Set the leader CPU for channels. */ sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; /* * Create enough TX/RX rings, even if only limited number of * channels can be allocated. */ error = hn_create_tx_data(sc, tx_ring_cnt); if (error) goto failed; error = hn_create_rx_data(sc, ring_cnt); if (error) goto failed; /* * Create transaction context for NVS and RNDIS transactions. */ sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); if (sc->hn_xact == NULL) goto failed; /* * Attach the synthetic parts, i.e. NVS and RNDIS. */ error = hn_synth_attach(sc, ETHERMTU); if (error) goto failed; error = hn_rndis_get_eaddr(sc, eaddr); if (error) goto failed; #if __FreeBSD_version >= 1100099 if (sc->hn_rx_ring_inuse > 1) { /* * Reduce TCP segment aggregation limit for multiple * RX rings to increase ACK timeliness. */ hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); } #endif /* * Fixup TX stuffs after synthetic parts are attached. */ hn_fixup_tx_data(sc); ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, &sc->hn_nvs_ver, 0, "NVS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_ndis_version_sysctl, "A", "NDIS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_caps_sysctl, "A", "capabilities"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_hwassist_sysctl, "A", "hwassist"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rxfilter_sysctl, "A", "rxfilter"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_key_sysctl, "IU", "RSS key"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_ind_sysctl, "IU", "RSS indirect table"); /* * Setup the ifmedia, which has been initialized earlier. */ ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); /* XXX ifmedia_set really should do this for us */ sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; /* * Setup the ifnet for this interface. */ + ifp->if_baudrate = IF_Gbps(10); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = hn_ioctl; ifp->if_init = hn_init; if (hn_use_if_start) { int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); ifp->if_start = hn_start; IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); ifp->if_snd.ifq_drv_maxlen = qdepth - 1; IFQ_SET_READY(&ifp->if_snd); } else { ifp->if_transmit = hn_transmit; ifp->if_qflush = hn_xmit_qflush; } ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO; #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; #endif if (sc->hn_caps & HN_CAP_VLAN) { /* XXX not sure about VLAN_MTU. */ ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; } ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; if (ifp->if_hwassist & HN_CSUM_IP_MASK) ifp->if_capabilities |= IFCAP_TXCSUM; if (ifp->if_hwassist & HN_CSUM_IP6_MASK) ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; if (sc->hn_caps & HN_CAP_TSO4) { ifp->if_capabilities |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_IP_TSO; } if (sc->hn_caps & HN_CAP_TSO6) { ifp->if_capabilities |= IFCAP_TSO6; ifp->if_hwassist |= CSUM_IP6_TSO; } /* Enable all available capabilities by default. */ ifp->if_capenable = ifp->if_capabilities; if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; ifp->if_hw_tsomaxsegsize = PAGE_SIZE; } ether_ifattach(ifp, eaddr); if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { if_printf(ifp, "TSO segcnt %u segsz %u\n", ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } /* Inform the upper layer about the long frame support. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); /* * Kick off link status check. */ sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; hn_link_status_update(sc); return (0); failed: if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) hn_synth_detach(sc); netvsc_detach(dev); return (error); } static int netvsc_detach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct ifnet *ifp = sc->hn_ifp; if (device_is_attached(dev)) { HN_LOCK(sc); if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_stop(sc); /* * NOTE: * hn_stop() only suspends data, so managment * stuffs have to be suspended manually here. */ hn_suspend_mgmt(sc); hn_synth_detach(sc); } HN_UNLOCK(sc); ether_ifdetach(ifp); } ifmedia_removeall(&sc->hn_media); hn_destroy_rx_data(sc); hn_destroy_tx_data(sc); if (sc->hn_tx_taskq != hn_tx_taskq) taskqueue_free(sc->hn_tx_taskq); taskqueue_free(sc->hn_mgmt_taskq0); if (sc->hn_xact != NULL) vmbus_xact_ctx_destroy(sc->hn_xact); if_free(ifp); HN_LOCK_DESTROY(sc); return (0); } /* * Standard shutdown entry point */ static int netvsc_shutdown(device_t dev) { return (0); } static void hn_link_status(struct hn_softc *sc) { uint32_t link_status; int error; error = hn_rndis_get_linkstatus(sc, &link_status); if (error) { /* XXX what to do? */ return; } if (link_status == NDIS_MEDIA_STATE_CONNECTED) sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; else sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; if_link_state_change(sc->hn_ifp, (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? LINK_STATE_UP : LINK_STATE_DOWN); } static void hn_link_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) return; hn_link_status(sc); } static void hn_netchg_init_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; /* Prevent any link status checks from running. */ sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; /* * Fake up a [link down --> link up] state change; 5 seconds * delay is used, which closely simulates miibus reaction * upon link down event. */ sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 5 * hz); } static void hn_netchg_status_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; /* Re-allow link status checks. */ sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; hn_link_status(sc); } void hn_link_status_update(struct hn_softc *sc) { if (sc->hn_mgmt_taskq != NULL) taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); } void hn_network_change(struct hn_softc *sc) { if (sc->hn_mgmt_taskq != NULL) taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); } static __inline int hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) { struct mbuf *m = *m_head; int error; KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { struct mbuf *m_new; m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); if (m_new == NULL) return ENOBUFS; else *m_head = m = m_new; txr->hn_tx_collapsed++; error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); } if (!error) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_PREWRITE); txd->flags |= HN_TXD_FLAG_DMAMAP; } return error; } static __inline int hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, ("put an onlist txd %#x", txd->flags)); KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); if (atomic_fetchadd_int(&txd->refs, -1) != 1) return 0; if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("chim txd uses dmamap")); hn_chim_free(txr->hn_sc, txd->chim_index); txd->chim_index = HN_NVS_CHIM_IDX_INVALID; } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->hn_tx_data_dtag, txd->data_dmap); txd->flags &= ~HN_TXD_FLAG_DMAMAP; } if (txd->m != NULL) { m_freem(txd->m); txd->m = NULL; } txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); KASSERT(txr->hn_txdesc_avail >= 0 && txr->hn_txdesc_avail < txr->hn_txdesc_cnt, ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail++; SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); mtx_unlock_spin(&txr->hn_txlist_spin); #else atomic_add_int(&txr->hn_txdesc_avail, 1); buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif return 1; } static __inline struct hn_txdesc * hn_txdesc_get(struct hn_tx_ring *txr) { struct hn_txdesc *txd; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); txd = SLIST_FIRST(&txr->hn_txlist); if (txd != NULL) { KASSERT(txr->hn_txdesc_avail > 0, ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail--; SLIST_REMOVE_HEAD(&txr->hn_txlist, link); } mtx_unlock_spin(&txr->hn_txlist_spin); #else txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); #endif if (txd != NULL) { #ifdef HN_USE_TXDESC_BUFRING atomic_subtract_int(&txr->hn_txdesc_avail, 1); #endif KASSERT(txd->m == NULL && txd->refs == 0 && txd->chim_index == HN_NVS_CHIM_IDX_INVALID && (txd->flags & HN_TXD_FLAG_ONLIST) && (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); txd->flags &= ~HN_TXD_FLAG_ONLIST; txd->refs = 1; } return txd; } static __inline void hn_txdesc_hold(struct hn_txdesc *txd) { /* 0->1 transition will never work */ KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs)); atomic_add_int(&txd->refs, 1); } static bool hn_tx_ring_pending(struct hn_tx_ring *txr) { bool pending = false; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) pending = true; mtx_unlock_spin(&txr->hn_txlist_spin); #else if (!buf_ring_full(txr->hn_txdesc_br)) pending = true; #endif return (pending); } static __inline void hn_txeof(struct hn_tx_ring *txr) { txr->hn_has_txeof = 0; txr->hn_txeof(txr); } static void hn_tx_done(struct hn_send_ctx *sndc, struct hn_softc *sc, struct vmbus_channel *chan, const void *data __unused, int dlen __unused) { struct hn_txdesc *txd = sndc->hn_cbarg; struct hn_tx_ring *txr; txr = txd->txr; KASSERT(txr->hn_chan == chan, ("channel mismatch, on chan%u, should be chan%u", vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan))); txr->hn_has_txeof = 1; hn_txdesc_put(txr, txd); ++txr->hn_txdone_cnt; if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { txr->hn_txdone_cnt = 0; if (txr->hn_oactive) hn_txeof(txr); } } void hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) { #if defined(INET) || defined(INET6) tcp_lro_flush_all(&rxr->hn_lro); #endif /* * NOTE: * 'txr' could be NULL, if multiple channels and * ifnet.if_start method are enabled. */ if (txr == NULL || !txr->hn_has_txeof) return; txr->hn_txdone_cnt = 0; hn_txeof(txr); } static __inline uint32_t hn_rndis_pktmsg_offset(uint32_t ofs) { KASSERT(ofs >= sizeof(struct rndis_packet_msg), ("invalid RNDIS packet msg offset %u", ofs)); return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); } /* * NOTE: * If this function fails, then both txd and m_head0 will be freed. */ static int hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) { bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; int error, nsegs, i; struct mbuf *m_head = *m_head0; struct rndis_packet_msg *pkt; uint32_t *pi_data; int pktlen; /* * extension points to the area reserved for the * rndis_filter_packet, which is placed just after * the netvsc_packet (and rppi struct, if present; * length is updated later). */ pkt = txd->rndis_pkt; pkt->rm_type = REMOTE_NDIS_PACKET_MSG; pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len; pkt->rm_dataoffset = sizeof(*pkt); pkt->rm_datalen = m_head->m_pkthdr.len; pkt->rm_pktinfooffset = sizeof(*pkt); pkt->rm_pktinfolen = 0; if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { /* * Set the hash value for this packet, so that the host could * dispatch the TX done event for this packet back to this TX * ring's channel. */ pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); *pi_data = txr->hn_tx_idx; } if (m_head->m_flags & M_VLANTAG) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); *pi_data = NDIS_VLAN_INFO_MAKE( EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { #if defined(INET6) || defined(INET) struct ether_vlan_header *eh; int ether_len; /* * XXX need m_pullup and use mtodo */ eh = mtod(m_head, struct ether_vlan_header*); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ether_len = ETHER_HDR_LEN; pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { struct ip *ip = (struct ip *)(m_head->m_data + ether_len); unsigned long iph_len = ip->ip_hl << 2; struct tcphdr *th = (struct tcphdr *)((caddr_t)ip + iph_len); ip->ip_len = 0; ip->ip_sum = 0; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0, m_head->m_pkthdr.tso_segsz); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { struct ip6_hdr *ip6 = (struct ip6_hdr *) (m_head->m_data + ether_len); struct tcphdr *th = (struct tcphdr *)(ip6 + 1); ip6->ip6_plen = 0; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0, m_head->m_pkthdr.tso_segsz); } #endif #endif /* INET6 || INET */ } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); if (m_head->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP)) { *pi_data = NDIS_TXCSUM_INFO_IPV6; } else { *pi_data = NDIS_TXCSUM_INFO_IPV4; if (m_head->m_pkthdr.csum_flags & CSUM_IP) *pi_data |= NDIS_TXCSUM_INFO_IPCS; } if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) *pi_data |= NDIS_TXCSUM_INFO_TCPCS; else if (m_head->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP6_UDP)) *pi_data |= NDIS_TXCSUM_INFO_UDPCS; } pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; /* Convert RNDIS packet message offsets */ pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset); pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); /* * Chimney send, if the packet could fit into one chimney buffer. */ if (pkt->rm_len < txr->hn_chim_size) { txr->hn_tx_chimney_tried++; txd->chim_index = hn_chim_alloc(txr->hn_sc); if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { uint8_t *dest = txr->hn_sc->hn_chim + (txd->chim_index * txr->hn_sc->hn_chim_szmax); memcpy(dest, pkt, pktlen); dest += pktlen; m_copydata(m_head, 0, m_head->m_pkthdr.len, dest); txd->chim_size = pkt->rm_len; txr->hn_gpa_cnt = 0; txr->hn_tx_chimney++; txr->hn_sendpkt = hn_sendpkt_rndis_chim; goto done; } } error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); if (error) { int freed; /* * This mbuf is not linked w/ the txd yet, so free it now. */ m_freem(m_head); *m_head0 = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon txdma error")); txr->hn_txdma_failed++; if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1); return error; } *m_head0 = m_head; /* +1 RNDIS packet message */ txr->hn_gpa_cnt = nsegs + 1; /* send packet with page buffer */ txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; txr->hn_gpa[0].gpa_len = pktlen; /* * Fill the page buffers with mbuf info after the page * buffer for RNDIS packet message. */ for (i = 0; i < nsegs; ++i) { struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; gpa->gpa_page = atop(segs[i].ds_addr); gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; gpa->gpa_len = segs[i].ds_len; } txd->chim_index = HN_NVS_CHIM_IDX_INVALID; txd->chim_size = 0; txr->hn_sendpkt = hn_sendpkt_rndis_sglist; done: txd->m = m_head; /* Set the completion routine */ hn_send_ctx_init(&txd->send_ctx, hn_tx_done, txd); return 0; } /* * NOTE: * If this function fails, then txd will be freed, but the mbuf * associated w/ the txd will _not_ be freed. */ static int hn_send_pkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) { int error, send_failed = 0; again: /* * Make sure that txd is not freed before ETHER_BPF_MTAP. */ hn_txdesc_hold(txd); error = txr->hn_sendpkt(txr, txd); if (!error) { ETHER_BPF_MTAP(ifp, txd->m); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if (!hn_use_if_start) { if_inc_counter(ifp, IFCOUNTER_OBYTES, txd->m->m_pkthdr.len); if (txd->m->m_flags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); } txr->hn_pkts++; } hn_txdesc_put(txr, txd); if (__predict_false(error)) { int freed; /* * This should "really rarely" happen. * * XXX Too many RX to be acked or too many sideband * commands to run? Ask netvsc_channel_rollup() * to kick start later. */ txr->hn_has_txeof = 1; if (!send_failed) { txr->hn_send_failed++; send_failed = 1; /* * Try sending again after set hn_has_txeof; * in case that we missed the last * netvsc_channel_rollup(). */ goto again; } if_printf(ifp, "send failed\n"); /* * Caller will perform further processing on the * associated mbuf, so don't free it in hn_txdesc_put(); * only unload it from the DMA map in hn_txdesc_put(), * if it was loaded. */ txd->m = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon send error")); txr->hn_send_failed++; } return error; } /* * Start a transmit of one or more packets */ static int hn_start_locked(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; KASSERT(hn_use_if_start, ("hn_start_locked is called, when if_start is disabled")); KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); mtx_assert(&txr->hn_tx_lock, MA_OWNED); if (__predict_false(txr->hn_suspended)) return 0; if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return 0; while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { struct hn_txdesc *txd; struct mbuf *m_head; int error; IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. */ IFQ_DRV_PREPEND(&ifp->if_snd, m_head); return 1; } txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; IFQ_DRV_PREPEND(&ifp->if_snd, m_head); atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } error = hn_encap(txr, txd, &m_head); if (error) { /* Both txd and m_head are freed */ continue; } error = hn_send_pkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ IFQ_DRV_PREPEND(&ifp->if_snd, m_head); atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } } return 0; } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. * There should be an equivalent in the kernel mbuf code, * but there does not appear to be one yet. * * Differs from m_append() in that additional mbufs are * allocated with cluster size MJUMPAGESIZE, and filled * accordingly. * * Return 1 if able to complete the job; otherwise 0. */ static int hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space; remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); if (n == NULL) break; n->m_len = min(MJUMPAGESIZE, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len; remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } #if defined(INET) || defined(INET6) static __inline int hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) { #if __FreeBSD_version >= 1100095 if (hn_lro_mbufq_depth) { tcp_lro_queue_mbuf(lc, m); return 0; } #endif return tcp_lro_rx(lc, m, 0); } #endif /* * Called when we receive a data packet from the "wire" on the * specified device * * Note: This is no longer used as a callback */ int hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, const struct hn_recvinfo *info) { struct ifnet *ifp = rxr->hn_ifp; struct mbuf *m_new; int size, do_lro = 0, do_csum = 1; int hash_type; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) return (0); /* * Bail out if packet contains more data than configured MTU. */ if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) { return (0); } else if (dlen <= MHLEN) { m_new = m_gethdr(M_NOWAIT, MT_DATA); if (m_new == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (0); } memcpy(mtod(m_new, void *), data, dlen); m_new->m_pkthdr.len = m_new->m_len = dlen; rxr->hn_small_pkts++; } else { /* * Get an mbuf with a cluster. For packets 2K or less, * get a standard 2K cluster. For anything larger, get a * 4K cluster. Any buffers larger than 4K can cause problems * if looped around to the Hyper-V TX channel, so avoid them. */ size = MCLBYTES; if (dlen > MCLBYTES) { /* 4096 */ size = MJUMPAGESIZE; } m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (0); } hv_m_append(m_new, dlen, data); } m_new->m_pkthdr.rcvif = ifp; if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) do_csum = 0; /* receive side checksum offload */ if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { /* IP csum offload */ if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); rxr->hn_csum_ip++; } /* TCP/UDP csum offload */ if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) rxr->hn_csum_tcp++; else rxr->hn_csum_udp++; } if ((info->csum_info & (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) do_lro = 1; } else { const struct ether_header *eh; uint16_t etype; int hoff; hoff = sizeof(*eh); if (m_new->m_len < hoff) goto skip; eh = mtod(m_new, struct ether_header *); etype = ntohs(eh->ether_type); if (etype == ETHERTYPE_VLAN) { const struct ether_vlan_header *evl; hoff = sizeof(*evl); if (m_new->m_len < hoff) goto skip; evl = mtod(m_new, struct ether_vlan_header *); etype = ntohs(evl->evl_proto); } if (etype == ETHERTYPE_IP) { int pr; pr = hn_check_iplen(m_new, hoff); if (pr == IPPROTO_TCP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_TCP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } do_lro = 1; } else if (pr == IPPROTO_UDP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_UDP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } } else if (pr != IPPROTO_DONE && do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); } } } skip: if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( NDIS_VLAN_INFO_ID(info->vlan_info), NDIS_VLAN_INFO_PRI(info->vlan_info), NDIS_VLAN_INFO_CFI(info->vlan_info)); m_new->m_flags |= M_VLANTAG; } if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { rxr->hn_rss_pkts++; m_new->m_pkthdr.flowid = info->hash_value; hash_type = M_HASHTYPE_OPAQUE_HASH; if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == NDIS_HASH_FUNCTION_TOEPLITZ) { uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); switch (type) { case NDIS_HASH_IPV4: hash_type = M_HASHTYPE_RSS_IPV4; break; case NDIS_HASH_TCP_IPV4: hash_type = M_HASHTYPE_RSS_TCP_IPV4; break; case NDIS_HASH_IPV6: hash_type = M_HASHTYPE_RSS_IPV6; break; case NDIS_HASH_IPV6_EX: hash_type = M_HASHTYPE_RSS_IPV6_EX; break; case NDIS_HASH_TCP_IPV6: hash_type = M_HASHTYPE_RSS_TCP_IPV6; break; case NDIS_HASH_TCP_IPV6_EX: hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; break; } } } else { m_new->m_pkthdr.flowid = rxr->hn_rx_idx; hash_type = M_HASHTYPE_OPAQUE; } M_HASHTYPE_SET(m_new, hash_type); /* * Note: Moved RX completion back to hv_nv_on_receive() so all * messages (not just data messages) will trigger a response. */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); rxr->hn_pkts++; if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxr->hn_lro; if (lro->lro_cnt) { rxr->hn_lro_tried++; if (hn_lro_rx(lro, m_new) == 0) { /* DONE! */ return 0; } } #endif } /* We're not holding the lock here, so don't release it */ (*ifp->if_input)(ifp, m_new); return (0); } static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct hn_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; int mask, error = 0; switch (cmd) { case SIOCSIFMTU: if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) { error = EINVAL; break; } HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if ((sc->hn_caps & HN_CAP_MTU) == 0) { /* Can't change MTU */ HN_UNLOCK(sc); error = EOPNOTSUPP; break; } if (ifp->if_mtu == ifr->ifr_mtu) { HN_UNLOCK(sc); break; } /* * Suspend this interface before the synthetic parts * are ripped. */ hn_suspend(sc); /* * Detach the synthetics parts, i.e. NVS and RNDIS. */ hn_synth_detach(sc); /* * Reattach the synthetic parts, i.e. NVS and RNDIS, * with the new MTU setting. */ error = hn_synth_attach(sc, ifr->ifr_mtu); if (error) { HN_UNLOCK(sc); break; } /* * Commit the requested MTU, after the synthetic parts * have been successfully attached. */ ifp->if_mtu = ifr->ifr_mtu; /* * Make sure that various parameters based on MTU are * still valid, after the MTU change. */ if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) hn_set_chim_size(sc, sc->hn_chim_szmax); hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); #if __FreeBSD_version >= 1100099 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); #endif /* * All done! Resume the interface now. */ hn_resume(sc); HN_UNLOCK(sc); break; case SIOCSIFFLAGS: HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_set_rxfilter(sc); else hn_init_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_stop(sc); } sc->hn_if_flags = ifp->if_flags; HN_UNLOCK(sc); break; case SIOCSIFCAP: HN_LOCK(sc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); else ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); else ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); } /* TODO: flip RNDIS offload parameters for RXCSUM. */ if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; #endif if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_IP_TSO; else ifp->if_hwassist &= ~CSUM_IP_TSO; } if (mask & IFCAP_TSO6) { ifp->if_capenable ^= IFCAP_TSO6; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_IP6_TSO; else ifp->if_hwassist &= ~CSUM_IP6_TSO; } HN_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: #ifdef notyet /* * XXX * Multicast uses mutex, while RNDIS RX filter setting * sleeps. We workaround this by always enabling * ALLMULTI. ALLMULTI would actually always be on, even * if we supported the SIOCADDMULTI/SIOCDELMULTI, since * we don't support multicast address list configuration * for this driver. */ HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_set_rxfilter(sc); HN_UNLOCK(sc); #endif break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void hn_stop(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); /* Clear RUNNING bit _before_ hn_suspend_data() */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); hn_suspend_data(sc); /* Clear OACTIVE bit. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; } /* * FreeBSD transmit entry point */ static void hn_start(struct ifnet *ifp) { struct hn_softc *sc = ifp->if_softc; struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } static void hn_start_txeof(struct hn_tx_ring *txr) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the OACTIVE earlier, with the hope, that * others could catch up. The task will clear the * flag again with the hn_tx_lock to avoid possible * races. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_init_locked(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; /* Configure RX filter */ hn_set_rxfilter(sc); /* Clear OACTIVE bit. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; /* Clear TX 'suspended' bit. */ hn_tx_resume(sc, sc->hn_tx_ring_inuse); /* Everything is ready; unleash! */ atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); } static void hn_init(void *xsc) { struct hn_softc *sc = xsc; HN_LOCK(sc); hn_init_locked(sc); HN_UNLOCK(sc); } #ifdef LATER /* * */ static void hn_watchdog(struct ifnet *ifp) { if_printf(ifp, "watchdog timeout -- resetting\n"); hn_init(ifp->if_softc); /* XXX */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } #endif #if __FreeBSD_version >= 1100099 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int lenlim; int error; lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; error = sysctl_handle_int(oidp, &lenlim, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || lenlim > TCP_LRO_LENGTH_MAX) { HN_UNLOCK(sc); return EINVAL; } hn_set_lro_lenlim(sc, lenlim); HN_UNLOCK(sc); return 0; } static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ackcnt, error, i; /* * lro_ackcnt_lim is append count limit, * +1 to turn it into aggregation limit. */ ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; error = sysctl_handle_int(oidp, &ackcnt, 0, req); if (error || req->newptr == NULL) return error; if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) return EINVAL; /* * Convert aggregation limit back to append * count limit. */ --ackcnt; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_inuse; ++i) sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; HN_UNLOCK(sc); return 0; } #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int hcsum = arg2; int on, error, i; on = 0; if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) on = 1; error = sysctl_handle_int(oidp, &on, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (on) rxr->hn_trust_hcsum |= hcsum; else rxr->hn_trust_hcsum &= ~hcsum; } HN_UNLOCK(sc); return 0; } static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int chim_size, error; chim_size = sc->hn_tx_ring[0].hn_chim_size; error = sysctl_handle_int(oidp, &chim_size, 0, req); if (error || req->newptr == NULL) return error; if (chim_size > sc->hn_chim_szmax || chim_size <= 0) return EINVAL; HN_LOCK(sc); hn_set_chim_size(sc, chim_size); HN_UNLOCK(sc); return 0; } #if __FreeBSD_version < 1100095 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((int *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((int *)((uint8_t *)rxr + ofs)) = 0; } return 0; } #else static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((uint64_t *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; } return 0; } #endif static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; u_long stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((u_long *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; *((u_long *)((uint8_t *)rxr + ofs)) = 0; } return 0; } static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_tx_ring *txr; u_long stat; stat = 0; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; stat += *((u_long *)((uint8_t *)txr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; *((u_long *)((uint8_t *)txr + ofs)) = 0; } return 0; } static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error, conf; struct hn_tx_ring *txr; txr = &sc->hn_tx_ring[0]; conf = *((int *)((uint8_t *)txr + ofs)); error = sysctl_handle_int(oidp, &conf, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; *((int *)((uint8_t *)txr + ofs)) = conf; } HN_UNLOCK(sc); return 0; } static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char verstr[16]; snprintf(verstr, sizeof(verstr), "%u.%u", HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); } static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char caps_str[128]; uint32_t caps; HN_LOCK(sc); caps = sc->hn_caps; HN_UNLOCK(sc); snprintf(caps_str, sizeof(caps_str), "%b", caps, "\020" "\001VLAN" "\002MTU" "\003IPCS" "\004TCP4CS" "\005TCP6CS" "\006UDP4CS" "\007UDP6CS" "\010TSO4" "\011TSO6" "\012HASHVAL"); return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); } static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char assist_str[128]; uint32_t hwassist; HN_LOCK(sc); hwassist = sc->hn_ifp->if_hwassist; HN_UNLOCK(sc); snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); } static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char filter_str[128]; uint32_t filter; HN_LOCK(sc); filter = sc->hn_rx_filter; HN_UNLOCK(sc); snprintf(filter_str, sizeof(filter_str), "%b", filter, NDIS_PACKET_TYPES); return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); } static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error || req->newptr == NULL) goto back; error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSKEY; if (sc->hn_rx_ring_inuse > 1) { error = hn_rss_reconfig(sc); } else { /* Not RSS capable, at least for now; just save the RSS key. */ error = 0; } back: HN_UNLOCK(sc); return (error); } static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error || req->newptr == NULL) goto back; /* * Don't allow RSS indirect table change, if this interface is not * RSS capable currently. */ if (sc->hn_rx_ring_inuse == 1) { error = EOPNOTSUPP; goto back; } error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSIND; hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse); error = hn_rss_reconfig(sc); back: HN_UNLOCK(sc); return (error); } static int hn_check_iplen(const struct mbuf *m, int hoff) { const struct ip *ip; int len, iphlen, iplen; const struct tcphdr *th; int thoff; /* TCP data offset */ len = hoff + sizeof(struct ip); /* The packet must be at least the size of an IP header. */ if (m->m_pkthdr.len < len) return IPPROTO_DONE; /* The fixed IP header must reside completely in the first mbuf. */ if (m->m_len < len) return IPPROTO_DONE; ip = mtodo(m, hoff); /* Bound check the packet's stated IP header length. */ iphlen = ip->ip_hl << 2; if (iphlen < sizeof(struct ip)) /* minimum header length */ return IPPROTO_DONE; /* The full IP header must reside completely in the one mbuf. */ if (m->m_len < hoff + iphlen) return IPPROTO_DONE; iplen = ntohs(ip->ip_len); /* * Check that the amount of data in the buffers is as * at least much as the IP header would have us expect. */ if (m->m_pkthdr.len < hoff + iplen) return IPPROTO_DONE; /* * Ignore IP fragments. */ if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) return IPPROTO_DONE; /* * The TCP/IP or UDP/IP header must be entirely contained within * the first fragment of a packet. */ switch (ip->ip_p) { case IPPROTO_TCP: if (iplen < iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); thoff = th->th_off << 2; if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + thoff) return IPPROTO_DONE; break; case IPPROTO_UDP: if (iplen < iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; break; default: if (iplen < iphlen) return IPPROTO_DONE; break; } return ip->ip_p; } static int hn_create_rx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; device_t dev = sc->hn_dev; #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 int lroent_cnt; #endif #endif int i; /* * Create RXBUF for reception. * * NOTE: * - It is shared by all channels. * - A large enough buffer is allocated, certain version of NVSes * may further limit the usable space. */ sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), PAGE_SIZE, 0, NETVSC_RECEIVE_BUFFER_SIZE, &sc->hn_rxbuf_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->hn_rxbuf == NULL) { device_printf(sc->hn_dev, "allocate rxbuf failed\n"); return (ENOMEM); } sc->hn_rx_ring_cnt = ring_cnt; sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, M_NETVSC, M_WAITOK | M_ZERO); #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 lroent_cnt = hn_lro_entry_count; if (lroent_cnt < TCP_LRO_ENTRIES) lroent_cnt = TCP_LRO_ENTRIES; if (bootverbose) device_printf(dev, "LRO: entry count %d\n", lroent_cnt); #endif #endif /* INET || INET6 */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); /* Create dev.hn.UNIT.rx sysctl tree */ sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), PAGE_SIZE, 0, NETVSC_DEVICE_RING_BUFFER_SIZE + NETVSC_DEVICE_RING_BUFFER_SIZE, &rxr->hn_br_dma, BUS_DMA_WAITOK); if (rxr->hn_br == NULL) { device_printf(dev, "allocate bufring failed\n"); return (ENOMEM); } if (hn_trust_hosttcp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; if (hn_trust_hostudp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; if (hn_trust_hostip) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; rxr->hn_ifp = sc->hn_ifp; if (i < sc->hn_tx_ring_cnt) rxr->hn_txr = &sc->hn_tx_ring[i]; rxr->hn_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK); rxr->hn_rx_idx = i; rxr->hn_rxbuf = sc->hn_rxbuf; /* * Initialize LRO. */ #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, hn_lro_mbufq_depth); #else tcp_lro_init(&rxr->hn_lro); rxr->hn_lro.ifp = sc->hn_ifp; #endif #if __FreeBSD_version >= 1100099 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; #endif #endif /* INET || INET6 */ if (sc->hn_rx_sysctl_tree != NULL) { char name[16]; /* * Create per RX ring sysctl tree: * dev.hn.UNIT.rx.RINGID */ snprintf(name, sizeof(name), "%d", i); rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (rxr->hn_rx_sysctl_tree != NULL) { SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "packets", CTLFLAG_RW, &rxr->hn_pkts, "# of packets received"); SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "rss_pkts", CTLFLAG_RW, &rxr->hn_rss_pkts, "# of packets w/ RSS info received"); } } } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_queued), #if __FreeBSD_version < 1100095 hn_rx_stat_int_sysctl, #else hn_rx_stat_u64_sysctl, #endif "LU", "LRO queued"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), #if __FreeBSD_version < 1100095 hn_rx_stat_int_sysctl, #else hn_rx_stat_u64_sysctl, #endif "LU", "LRO flushed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro_tried), hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); #if __FreeBSD_version >= 1100099 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_lenlim_sysctl, "IU", "Max # of data bytes to be aggregated by LRO"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_ackcnt_sysctl, "I", "Max # of ACKs to be aggregated by LRO"); #endif SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, hn_trust_hcsum_sysctl, "I", "Trust tcp segement verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, hn_trust_hcsum_sysctl, "I", "Trust udp datagram verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, hn_trust_hcsum_sysctl, "I", "Trust ip packet verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_ip), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_tcp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_udp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_trusted), hn_rx_stat_ulong_sysctl, "LU", "# of packets that we trust host's csum verification"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_small_pkts), hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); return (0); } static void hn_destroy_rx_data(struct hn_softc *sc) { int i; if (sc->hn_rxbuf != NULL) { hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); sc->hn_rxbuf = NULL; } if (sc->hn_rx_ring_cnt == 0) return; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (rxr->hn_br == NULL) continue; hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); rxr->hn_br = NULL; #if defined(INET) || defined(INET6) tcp_lro_free(&rxr->hn_lro); #endif free(rxr->hn_rdbuf, M_NETVSC); } free(sc->hn_rx_ring, M_NETVSC); sc->hn_rx_ring = NULL; sc->hn_rx_ring_cnt = 0; sc->hn_rx_ring_inuse = 0; } static int hn_create_tx_ring(struct hn_softc *sc, int id) { struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; device_t dev = sc->hn_dev; bus_dma_tag_t parent_dtag; int error, i; txr->hn_sc = sc; txr->hn_tx_idx = id; #ifndef HN_USE_TXDESC_BUFRING mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); #endif mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); txr->hn_txdesc_cnt = HN_TX_DESC_CNT; txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, M_NETVSC, M_WAITOK | M_ZERO); #ifndef HN_USE_TXDESC_BUFRING SLIST_INIT(&txr->hn_txlist); #else txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_NETVSC, M_WAITOK, &txr->hn_tx_lock); #endif txr->hn_tx_taskq = sc->hn_tx_taskq; if (hn_use_if_start) { txr->hn_txeof = hn_start_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); } else { int br_depth; txr->hn_txeof = hn_xmit_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); br_depth = hn_get_txswq_depth(txr); txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_NETVSC, M_WAITOK, &txr->hn_tx_lock); } txr->hn_direct_tx_size = hn_direct_tx_size; /* * Always schedule transmission instead of trying to do direct * transmission. This one gives the best performance so far. */ txr->hn_sched_tx = 1; parent_dtag = bus_get_dma_tag(dev); /* DMA tag for RNDIS packet messages. */ error = bus_dma_tag_create(parent_dtag, /* parent */ HN_RNDIS_PKT_ALIGN, /* alignment */ HN_RNDIS_PKT_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_RNDIS_PKT_LEN, /* maxsize */ 1, /* nsegments */ HN_RNDIS_PKT_LEN, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_rndis_dtag); if (error) { device_printf(dev, "failed to create rndis dmatag\n"); return error; } /* DMA tag for data. */ error = bus_dma_tag_create(parent_dtag, /* parent */ 1, /* alignment */ HN_TX_DATA_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_TX_DATA_MAXSIZE, /* maxsize */ HN_TX_DATA_SEGCNT_MAX, /* nsegments */ HN_TX_DATA_SEGSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_data_dtag); if (error) { device_printf(dev, "failed to create data dmatag\n"); return error; } for (i = 0; i < txr->hn_txdesc_cnt; ++i) { struct hn_txdesc *txd = &txr->hn_txdesc[i]; txd->txr = txr; txd->chim_index = HN_NVS_CHIM_IDX_INVALID; /* * Allocate and load RNDIS packet message. */ error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, (void **)&txd->rndis_pkt, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, &txd->rndis_pkt_dmap); if (error) { device_printf(dev, "failed to allocate rndis_packet_msg, %d\n", i); return error; } error = bus_dmamap_load(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap, txd->rndis_pkt, HN_RNDIS_PKT_LEN, hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, BUS_DMA_NOWAIT); if (error) { device_printf(dev, "failed to load rndis_packet_msg, %d\n", i); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } /* DMA map for TX data. */ error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, &txd->data_dmap); if (error) { device_printf(dev, "failed to allocate tx data dmamap\n"); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } /* All set, put it to list */ txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); #else buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif } txr->hn_txdesc_avail = txr->hn_txdesc_cnt; if (sc->hn_tx_sysctl_tree != NULL) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; char name[16]; /* * Create per TX ring sysctl tree: * dev.hn.UNIT.tx.RINGID */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); snprintf(name, sizeof(name), "%d", id); txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (txr->hn_tx_sysctl_tree != NULL) { child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", CTLFLAG_RD, &txr->hn_txdesc_avail, 0, "# of available TX descs"); if (!hn_use_if_start) { SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", CTLFLAG_RD, &txr->hn_oactive, 0, "over active"); } SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", CTLFLAG_RW, &txr->hn_pkts, "# of packets transmitted"); } } return 0; } static void hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) { struct hn_tx_ring *txr = txd->txr; KASSERT(txd->m == NULL, ("still has mbuf installed")); KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); } static void hn_destroy_tx_ring(struct hn_tx_ring *txr) { struct hn_txdesc *txd; if (txr->hn_txdesc == NULL) return; #ifndef HN_USE_TXDESC_BUFRING while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) { SLIST_REMOVE_HEAD(&txr->hn_txlist, link); hn_txdesc_dmamap_destroy(txd); } #else mtx_lock(&txr->hn_tx_lock); while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL) hn_txdesc_dmamap_destroy(txd); mtx_unlock(&txr->hn_tx_lock); #endif if (txr->hn_tx_data_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_data_dtag); if (txr->hn_tx_rndis_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); #ifdef HN_USE_TXDESC_BUFRING buf_ring_free(txr->hn_txdesc_br, M_NETVSC); #endif free(txr->hn_txdesc, M_NETVSC); txr->hn_txdesc = NULL; if (txr->hn_mbuf_br != NULL) buf_ring_free(txr->hn_mbuf_br, M_NETVSC); #ifndef HN_USE_TXDESC_BUFRING mtx_destroy(&txr->hn_txlist_spin); #endif mtx_destroy(&txr->hn_tx_lock); } static int hn_create_tx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int i; /* * Create TXBUF for chimney sending. * * NOTE: It is shared by all channels. */ sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), PAGE_SIZE, 0, NETVSC_SEND_BUFFER_SIZE, &sc->hn_chim_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->hn_chim == NULL) { device_printf(sc->hn_dev, "allocate txbuf failed\n"); return (ENOMEM); } sc->hn_tx_ring_cnt = ring_cnt; sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, M_NETVSC, M_WAITOK | M_ZERO); ctx = device_get_sysctl_ctx(sc->hn_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); /* Create dev.hn.UNIT.tx sysctl tree */ sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { int error; error = hn_create_tx_ring(sc, i); if (error) return error; } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_no_txdescs), hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_send_failed), hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_txdma_failed), hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_collapsed), hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, "# of total TX descs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", CTLFLAG_RD, &sc->hn_chim_szmax, 0, "Chimney send packet size upper boundary"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_chim_size_sysctl, "I", "Chimney send packet size limit"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_direct_tx_size), hn_tx_conf_int_sysctl, "I", "Size of the packet for direct transmission"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_sched_tx), hn_tx_conf_int_sysctl, "I", "Always schedule transmission " "instead of doing direct transmission"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); return 0; } static void hn_set_chim_size(struct hn_softc *sc, int chim_size) { int i; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_chim_size = chim_size; } static void hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) { struct ifnet *ifp = sc->hn_ifp; int tso_minlen; if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) return; KASSERT(sc->hn_ndis_tso_sgmin >= 2, ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); tso_minlen = sc->hn_ndis_tso_sgmin * mtu; KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && sc->hn_ndis_tso_szmax <= IP_MAXPACKET, ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); if (tso_maxlen < tso_minlen) tso_maxlen = tso_minlen; else if (tso_maxlen > IP_MAXPACKET) tso_maxlen = IP_MAXPACKET; if (tso_maxlen > sc->hn_ndis_tso_szmax) tso_maxlen = sc->hn_ndis_tso_szmax; ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); if (bootverbose) if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); } static void hn_fixup_tx_data(struct hn_softc *sc) { uint64_t csum_assist; int i; hn_set_chim_size(sc, sc->hn_chim_szmax); if (hn_tx_chimney_size > 0 && hn_tx_chimney_size < sc->hn_chim_szmax) hn_set_chim_size(sc, hn_tx_chimney_size); csum_assist = 0; if (sc->hn_caps & HN_CAP_IPCS) csum_assist |= CSUM_IP; if (sc->hn_caps & HN_CAP_TCP4CS) csum_assist |= CSUM_IP_TCP; if (sc->hn_caps & HN_CAP_UDP4CS) csum_assist |= CSUM_IP_UDP; #ifdef notyet if (sc->hn_caps & HN_CAP_TCP6CS) csum_assist |= CSUM_IP6_TCP; if (sc->hn_caps & HN_CAP_UDP6CS) csum_assist |= CSUM_IP6_UDP; #endif for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_csum_assist = csum_assist; if (sc->hn_caps & HN_CAP_HASHVAL) { /* * Support HASHVAL pktinfo on TX path. */ if (bootverbose) if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; } } static void hn_destroy_tx_data(struct hn_softc *sc) { int i; if (sc->hn_chim != NULL) { hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); sc->hn_chim = NULL; } if (sc->hn_tx_ring_cnt == 0) return; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) hn_destroy_tx_ring(&sc->hn_tx_ring[i]); free(sc->hn_tx_ring, M_NETVSC); sc->hn_tx_ring = NULL; sc->hn_tx_ring_cnt = 0; sc->hn_tx_ring_inuse = 0; } static void hn_start_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_start_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static int hn_xmit(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; struct mbuf *m_head; mtx_assert(&txr->hn_tx_lock, MA_OWNED); KASSERT(hn_use_if_start == 0, ("hn_xmit is called, when if_start is enabled")); if (__predict_false(txr->hn_suspended)) return 0; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) return 0; while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { struct hn_txdesc *txd; int error; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); return 1; } txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } error = hn_encap(txr, txd, &m_head); if (error) { /* Both txd and m_head are freed; discard */ drbr_advance(ifp, txr->hn_mbuf_br); continue; } error = hn_send_pkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } /* Sent */ drbr_advance(ifp, txr->hn_mbuf_br); } return 0; } static int hn_transmit(struct ifnet *ifp, struct mbuf *m) { struct hn_softc *sc = ifp->if_softc; struct hn_tx_ring *txr; int error, idx = 0; /* * Select the TX ring based on flowid */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; txr = &sc->hn_tx_ring[idx]; error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); if (error) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); return error; } if (txr->hn_oactive) return 0; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return 0; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); return 0; } static void hn_tx_ring_qflush(struct hn_tx_ring *txr) { struct mbuf *m; mtx_lock(&txr->hn_tx_lock); while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) m_freem(m); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_qflush(struct ifnet *ifp) { struct hn_softc *sc = ifp->if_softc; int i; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) hn_tx_ring_qflush(&sc->hn_tx_ring[i]); if_qflush(ifp); } static void hn_xmit_txeof(struct hn_tx_ring *txr) { if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; txr->hn_oactive = 0; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the oactive earlier, with the hope, that * others could catch up. The task will clear the * oactive again with the hn_tx_lock to avoid possible * races. */ txr->hn_oactive = 0; taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_xmit_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); txr->hn_oactive = 0; hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static int hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) { struct vmbus_chan_br cbr; struct hn_rx_ring *rxr; struct hn_tx_ring *txr = NULL; int idx, error; idx = vmbus_chan_subidx(chan); /* * Link this channel to RX/TX ring. */ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, ("RX ring %d already attached", idx)); rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; if (bootverbose) { if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } if (idx < sc->hn_tx_ring_inuse) { txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("TX ring %d already attached", idx)); txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; txr->hn_chan = chan; if (bootverbose) { if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } } /* Bind this channel to a proper CPU. */ vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus); /* * Open this channel */ cbr.cbr = rxr->hn_br; cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; cbr.cbr_txsz = NETVSC_DEVICE_RING_BUFFER_SIZE; cbr.cbr_rxsz = NETVSC_DEVICE_RING_BUFFER_SIZE; error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); if (error) { if_printf(sc->hn_ifp, "open chan%u failed: %d\n", vmbus_chan_id(chan), error); rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; if (txr != NULL) txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; } return (error); } static void hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) { struct hn_rx_ring *rxr; int idx; idx = vmbus_chan_subidx(chan); /* * Link this channel to RX/TX ring. */ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), ("RX ring %d is not attached", idx)); rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; if (idx < sc->hn_tx_ring_inuse) { struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), ("TX ring %d is not attached attached", idx)); txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; } /* * Close this channel. * * NOTE: * Channel closing does _not_ destroy the target channel. */ vmbus_chan_close(chan); } static int hn_attach_subchans(struct hn_softc *sc) { struct vmbus_channel **subchans; int subchan_cnt = sc->hn_rx_ring_inuse - 1; int i, error = 0; if (subchan_cnt == 0) return (0); /* Attach the sub-channels. */ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) { error = hn_chan_attach(sc, subchans[i]); if (error) break; } vmbus_subchan_rel(subchans, subchan_cnt); if (error) { if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); } else { if (bootverbose) { if_printf(sc->hn_ifp, "%d sub-channels attached\n", subchan_cnt); } } return (error); } static void hn_detach_allchans(struct hn_softc *sc) { struct vmbus_channel **subchans; int subchan_cnt = sc->hn_rx_ring_inuse - 1; int i; if (subchan_cnt == 0) goto back; /* Detach the sub-channels. */ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) hn_chan_detach(sc, subchans[i]); vmbus_subchan_rel(subchans, subchan_cnt); back: /* * Detach the primary channel, _after_ all sub-channels * are detached. */ hn_chan_detach(sc, sc->hn_prichan); /* Wait for sub-channels to be destroyed, if any. */ vmbus_subchan_drain(sc->hn_prichan); #ifdef INVARIANTS for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { KASSERT((sc->hn_rx_ring[i].hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, ("%dth RX ring is still attached", i)); } for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { KASSERT((sc->hn_tx_ring[i].hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("%dth TX ring is still attached", i)); } #endif } static int hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) { struct vmbus_channel **subchans; int nchan, rxr_cnt, error; nchan = *nsubch + 1; if (nchan == 1) { /* * Multiple RX/TX rings are not requested. */ *nsubch = 0; return (0); } /* * Query RSS capabilities, e.g. # of RX rings, and # of indirect * table entries. */ error = hn_rndis_query_rsscaps(sc, &rxr_cnt); if (error) { /* No RSS; this is benign. */ *nsubch = 0; return (0); } if (bootverbose) { if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", rxr_cnt, nchan); } if (nchan > rxr_cnt) nchan = rxr_cnt; if (nchan == 1) { if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); *nsubch = 0; return (0); } /* * Allocate sub-channels from NVS. */ *nsubch = nchan - 1; error = hn_nvs_alloc_subchans(sc, nsubch); if (error || *nsubch == 0) { /* Failed to allocate sub-channels. */ *nsubch = 0; return (0); } /* * Wait for all sub-channels to become ready before moving on. */ subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); vmbus_subchan_rel(subchans, *nsubch); return (0); } static int hn_synth_attach(struct hn_softc *sc, int mtu) { struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int error, nsubch, nchan, i; uint32_t old_caps; KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, ("synthetic parts were attached")); /* Save capabilities for later verification. */ old_caps = sc->hn_caps; sc->hn_caps = 0; /* * Attach the primary channel _before_ attaching NVS and RNDIS. */ error = hn_chan_attach(sc, sc->hn_prichan); if (error) return (error); /* * Attach NVS. */ error = hn_nvs_attach(sc, mtu); if (error) return (error); /* * Attach RNDIS _after_ NVS is attached. */ error = hn_rndis_attach(sc, mtu); if (error) return (error); /* * Make sure capabilities are not changed. */ if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", old_caps, sc->hn_caps); /* Restore old capabilities and abort. */ sc->hn_caps = old_caps; return ENXIO; } /* * Allocate sub-channels for multi-TX/RX rings. * * NOTE: * The # of RX rings that can be used is equivalent to the # of * channels to be requested. */ nsubch = sc->hn_rx_ring_cnt - 1; error = hn_synth_alloc_subchans(sc, &nsubch); if (error) return (error); nchan = nsubch + 1; if (nchan == 1) { /* Only the primary channel can be used; done */ goto back; } /* * Configure RSS key and indirect table _after_ all sub-channels * are allocated. */ if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { /* * RSS key is not set yet; set it to the default RSS key. */ if (bootverbose) if_printf(sc->hn_ifp, "setup default RSS key\n"); memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); sc->hn_flags |= HN_FLAG_HAS_RSSKEY; } if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { /* * RSS indirect table is not set yet; set it up in round- * robin fashion. */ if (bootverbose) { if_printf(sc->hn_ifp, "setup default RSS indirect " "table\n"); } /* TODO: Take ndis_rss_caps.ndis_nind into account. */ for (i = 0; i < NDIS_HASH_INDCNT; ++i) rss->rss_ind[i] = i % nchan; sc->hn_flags |= HN_FLAG_HAS_RSSIND; } else { /* * # of usable channels may be changed, so we have to * make sure that all entries in RSS indirect table * are valid. */ hn_rss_ind_fixup(sc, nchan); } error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); if (error) { /* * Failed to configure RSS key or indirect table; only * the primary channel can be used. */ nchan = 1; } back: /* * Set the # of TX/RX rings that could be used according to * the # of channels that NVS offered. */ hn_set_ring_inuse(sc, nchan); /* * Attach the sub-channels, if any. */ error = hn_attach_subchans(sc); if (error) return (error); sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; return (0); } /* * NOTE: * The interface must have been suspended though hn_suspend(), before * this function get called. */ static void hn_synth_detach(struct hn_softc *sc) { HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); /* Detach the RNDIS first. */ hn_rndis_detach(sc); /* Detach NVS. */ hn_nvs_detach(sc); /* Detach all of the channels. */ hn_detach_allchans(sc); sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; } static void hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) { KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, ("invalid ring count %d", ring_cnt)); if (sc->hn_tx_ring_cnt > ring_cnt) sc->hn_tx_ring_inuse = ring_cnt; else sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_rx_ring_inuse = ring_cnt; if (bootverbose) { if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); } } static void hn_rx_drain(struct vmbus_channel *chan) { while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan)) pause("waitch", 1); vmbus_chan_intr_drain(chan); } static void hn_suspend_data(struct hn_softc *sc) { struct vmbus_channel **subch = NULL; int i, nsubch; HN_LOCK_ASSERT(sc); /* * Suspend TX. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 1; mtx_unlock(&txr->hn_tx_lock); /* No one is able send more packets now. */ /* Wait for all pending sends to finish. */ while (hn_tx_ring_pending(txr)) pause("hnwtx", 1 /* 1 tick */); taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); } /* * Disable RX by clearing RX filter. */ hn_rndis_set_rxfilter(sc, 0); sc->hn_rx_filter = 0; /* * Give RNDIS enough time to flush all pending data packets. */ pause("waitrx", (200 * hz) / 1000); /* * Drain RX/TX bufrings and interrupts. */ nsubch = sc->hn_rx_ring_inuse - 1; if (nsubch > 0) subch = vmbus_subchan_get(sc->hn_prichan, nsubch); if (subch != NULL) { for (i = 0; i < nsubch; ++i) hn_rx_drain(subch[i]); } hn_rx_drain(sc->hn_prichan); if (subch != NULL) vmbus_subchan_rel(subch, nsubch); } static void hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) { ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; } static void hn_suspend_mgmt(struct hn_softc *sc) { struct task task; HN_LOCK_ASSERT(sc); /* * Make sure that hn_mgmt_taskq0 can nolonger be accessed * through hn_mgmt_taskq. */ TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); vmbus_chan_run_task(sc->hn_prichan, &task); /* * Make sure that all pending management tasks are completed. */ taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); taskqueue_drain_all(sc->hn_mgmt_taskq0); } static void hn_suspend(struct hn_softc *sc) { if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) hn_suspend_data(sc); hn_suspend_mgmt(sc); } static void hn_tx_resume(struct hn_softc *sc, int tx_ring_cnt) { int i; KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, ("invalid TX ring count %d", tx_ring_cnt)); for (i = 0; i < tx_ring_cnt; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 0; mtx_unlock(&txr->hn_tx_lock); } } static void hn_resume_data(struct hn_softc *sc) { int i; HN_LOCK_ASSERT(sc); /* * Re-enable RX. */ hn_set_rxfilter(sc); /* * Make sure to clear suspend status on "all" TX rings, * since hn_tx_ring_inuse can be changed after * hn_suspend_data(). */ hn_tx_resume(sc, sc->hn_tx_ring_cnt); if (!hn_use_if_start) { /* * Flush unused drbrs, since hn_tx_ring_inuse may be * reduced. */ for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) hn_tx_ring_qflush(&sc->hn_tx_ring[i]); } /* * Kick start TX. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; /* * Use txeof task, so that any pending oactive can be * cleared properly. */ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_resume_mgmt(struct hn_softc *sc) { sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; /* * Kick off network change detection, if it was pending. * If no network change was pending, start link status * checks, which is more lightweight than network change * detection. */ if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) hn_network_change(sc); else hn_link_status_update(sc); } static void hn_resume(struct hn_softc *sc) { if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) hn_resume_data(sc); hn_resume_mgmt(sc); } static void hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) { const struct hn_nvs_hdr *hdr; if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { if_printf(sc->hn_ifp, "invalid nvs notify\n"); return; } hdr = VMBUS_CHANPKT_CONST_DATA(pkt); if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { /* Useless; ignore */ return; } if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); } static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkt) { struct hn_send_ctx *sndc; sndc = (struct hn_send_ctx *)(uintptr_t)pkt->cph_xactid; sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), VMBUS_CHANPKT_DATALEN(pkt)); /* * NOTE: * 'sndc' CAN NOT be accessed anymore, since it can be freed by * its callback. */ } static void hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr) { const struct vmbus_chanpkt_rxbuf *pkt; const struct hn_nvs_hdr *nvs_hdr; int count, i, hlen; if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); return; } nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); /* Make sure that this is a RNDIS message. */ if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", nvs_hdr->nvs_type); return; } hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); if (__predict_false(hlen < sizeof(*pkt))) { if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); return; } pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", pkt->cp_rxbuf_id); return; } count = pkt->cp_rxbuf_cnt; if (__predict_false(hlen < __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); return; } /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ for (i = 0; i < count; ++i) { int ofs, len; ofs = pkt->cp_rxbuf[i].rb_ofs; len = pkt->cp_rxbuf[i].rb_len; if (__predict_false(ofs + len > NETVSC_RECEIVE_BUFFER_SIZE)) { if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " "ofs %d, len %d\n", i, ofs, len); continue; } hv_rf_on_receive(sc, rxr, rxr->hn_rxbuf + ofs, len); } /* * Moved completion call back here so that all received * messages (not just data messages) will trigger a response * message back to the host. */ hn_nvs_ack_rxbuf(chan, pkt->cp_hdr.cph_xactid); } /* * Net VSC on receive completion * * Send a receive completion packet to RNDIS device (ie NetVsp) */ static void hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid) { struct hn_nvs_rndis_ack ack; int retries = 0; int ret = 0; ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; ack.nvs_status = HN_NVS_STATUS_OK; retry_send_cmplt: /* Send the completion */ ret = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); if (ret == 0) { /* success */ /* no-op */ } else if (ret == EAGAIN) { /* no more room... wait a bit and attempt to retry 3 times */ retries++; if (retries < 4) { DELAY(100); goto retry_send_cmplt; } } } static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr) { struct hn_rx_ring *rxr = xrxr; struct hn_softc *sc = rxr->hn_ifp->if_softc; void *buffer; int bufferlen = NETVSC_PACKET_SIZE; buffer = rxr->hn_rdbuf; do { struct vmbus_chanpkt_hdr *pkt = buffer; uint32_t bytes_rxed; int ret; bytes_rxed = bufferlen; ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed); if (ret == 0) { switch (pkt->cph_type) { case VMBUS_CHANPKT_TYPE_COMP: hn_nvs_handle_comp(sc, chan, pkt); break; case VMBUS_CHANPKT_TYPE_RXBUF: hn_nvs_handle_rxbuf(sc, rxr, chan, pkt); break; case VMBUS_CHANPKT_TYPE_INBAND: hn_nvs_handle_notify(sc, pkt); break; default: if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", pkt->cph_type); break; } } else if (ret == ENOBUFS) { /* Handle large packet */ if (bufferlen > NETVSC_PACKET_SIZE) { free(buffer, M_NETVSC); buffer = NULL; } /* alloc new buffer */ buffer = malloc(bytes_rxed, M_NETVSC, M_NOWAIT); if (buffer == NULL) { if_printf(rxr->hn_ifp, "hv_cb malloc buffer failed, len=%u\n", bytes_rxed); bufferlen = 0; break; } bufferlen = bytes_rxed; } else { /* No more packets */ break; } } while (1); if (bufferlen > NETVSC_PACKET_SIZE) free(buffer, M_NETVSC); hv_rf_channel_rollup(rxr, rxr->hn_txr); } static void hn_tx_taskq_create(void *arg __unused) { if (!hn_share_tx_taskq) return; hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &hn_tx_taskq); if (hn_bind_tx_taskq >= 0) { int cpu = hn_bind_tx_taskq; cpuset_t cpu_set; if (cpu > mp_ncpus - 1) cpu = mp_ncpus - 1; CPU_SETOF(cpu, &cpu_set); taskqueue_start_threads_cpuset(&hn_tx_taskq, 1, PI_NET, &cpu_set, "hn tx"); } else { taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx"); } } SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST, hn_tx_taskq_create, NULL); static void hn_tx_taskq_destroy(void *arg __unused) { if (hn_tx_taskq != NULL) taskqueue_free(hn_tx_taskq); } SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST, hn_tx_taskq_destroy, NULL); static device_method_t netvsc_methods[] = { /* Device interface */ DEVMETHOD(device_probe, netvsc_probe), DEVMETHOD(device_attach, netvsc_attach), DEVMETHOD(device_detach, netvsc_detach), DEVMETHOD(device_shutdown, netvsc_shutdown), { 0, 0 } }; static driver_t netvsc_driver = { NETVSC_DEVNAME, netvsc_methods, sizeof(struct hn_softc) }; static devclass_t netvsc_devclass; DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0); MODULE_VERSION(hn, 1); MODULE_DEPEND(hn, vmbus, 1, 1, 1); Index: user/alc/PQ_LAUNDRY/sys/i386/i386/trap.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/i386/i386/trap.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/i386/i386/trap.c (revision 307896) @@ -1,1133 +1,1123 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_npx.h" #include "opt_stack.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #ifdef KDTRACE_HOOKS #include #endif extern void trap(struct trapframe *frame); extern void syscall(struct trapframe *frame); static int trap_pfault(struct trapframe *, int, vm_offset_t); static void trap_fatal(struct trapframe *, vm_offset_t); void dblfault_handler(void); extern inthand_t IDTVEC(lcall_syscall); #define MAX_TRAP_MSG 32 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ "SIMD floating-point exception", /* 29 T_XMMFLT */ "reserved (unknown) fault", /* 30 T_RESERVED */ "", /* 31 unused (reserved) */ "DTrace pid return trap", /* 32 T_DTRACE_RET */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) int has_f00f_bug = 0; /* Initialized so that it can be patched. */ #endif -#ifdef KDB -static int kdb_on_nmi = 1; -SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN, - &kdb_on_nmi, 0, "Go to KDB on NMI"); -#endif -static int panic_on_nmi = 1; -SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN, - &panic_on_nmi, 0, "Panic on NMI"); static int prot_fault_translation = 0; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW, &prot_fault_translation, 0, "Select signal to deliver on protection fault"); static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { #ifdef KDTRACE_HOOKS struct reg regs; #endif struct thread *td = curthread; struct proc *p = td->td_proc; #ifdef KDB register_t dr6; #endif int i = 0, ucode = 0; u_int type; register_t addr = 0; vm_offset_t eva; ksiginfo_t ksi; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif PCPU_INC(cnt.v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } #endif /* SMP */ #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); goto out; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI so we check for that first. * If the HWPMC module is active, 'pmc_hook' will point to * the function to be called. A non-zero return value from the * hook means that the NMI was consumed by it and that we can * return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(PCPU_GET(cpuid), frame) != 0) goto out; #endif #ifdef STACK if (stack_nmi_handler(frame) != 0) goto out; #endif } if (type == T_MCHK) { mca_intr(); goto out; } #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. */ if ((type == T_PROTFLT || type == T_PAGEFLT) && dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type)) goto out; #endif if ((frame->tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && type != T_TRCTRAP && frame->tf_eip != (int)cpu_switch_load_gs) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * Page faults need interrupts disabled until later, * and we shouldn't enable interrupts while holding * a spin lock. */ if (type != T_PAGEFLT && td->td_md.md_spinlock_count == 0) enable_intr(); } } eva = 0; if (type == T_PAGEFLT) { /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and conditionally * reenable interrupts. If we hold a spin lock, then * we must not reenable interrupts. This might be a * spurious page fault. */ eva = rcr2(); if (td->td_md.md_spinlock_count == 0) enable_intr(); } if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_eip; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ i = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ enable_intr(); #ifdef KDTRACE_HOOKS if (type == T_BPTFLT) { fill_frame_regs(frame, ®s); if (dtrace_pid_probe_ptr != NULL && dtrace_pid_probe_ptr(®s) == 0) goto out; } #endif user_trctrap_out: frame->tf_eflags &= ~PSL_T; i = SIGTRAP; ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT); break; case T_ARITHTRAP: /* arithmetic trap */ #ifdef DEV_NPX ucode = npxtrap_x87(); if (ucode == -1) goto userout; #else ucode = 0; #endif i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)frame); if (i == SIGTRAP) { type = T_TRCTRAP; load_dr6(rdr6() | 0x4000); goto user_trctrap_out; } if (i == 0) goto user; break; } i = SIGBUS; ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR; break; case T_SEGNPFLT: /* segment not present fault */ i = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: i = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: i = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(frame, TRUE, eva); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * The f00f hack workaround has triggered, so * treat the fault as an illegal instruction * (T_PRIVINFLT) instead of a page fault. */ type = frame->tf_trapno = T_PRIVINFLT; /* Proceed as in that case. */ ucode = ILL_PRVOPC; i = SIGILL; break; } #endif if (i == -1) goto userout; if (i == 0) goto user; if (i == SIGSEGV) ucode = SEGV_MAPERR; else { if (prot_fault_translation == 0) { /* * Autodetect. * This check also covers the images * without the ABI-tag ELF note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && p->p_osrel >= P_OSREL_SIGSEGV) { i = SIGSEGV; ucode = SEGV_ACCERR; } else { i = SIGBUS; ucode = BUS_PAGE_FAULT; } } else if (prot_fault_translation == 1) { /* * Always compat mode. */ i = SIGBUS; ucode = BUS_PAGE_FAULT; } else { /* * Always SIGSEGV mode. */ i = SIGSEGV; ucode = SEGV_ACCERR; } } addr = eva; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } goto userout; #else /* !POWERFAIL_NMI */ - nmi_handle_intr(type, frame, true); + nmi_handle_intr(type, frame); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: #ifdef DEV_NPX KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); /* transparent fault (due to context switch "late") */ if (npxdna()) goto userout; #endif uprintf("pid %d killed due to lack of floating point\n", p->p_pid); i = SIGKILL; ucode = 0; break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ #if defined(DEV_NPX) && !defined(CPU_DISABLE_SSE) && defined(I686_CPU) ucode = npxtrap_sse(); if (ucode == -1) goto userout; #else ucode = 0; #endif i = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: enable_intr(); fill_frame_regs(frame, ®s); if (dtrace_return_probe_ptr != NULL && dtrace_return_probe_ptr(®s) == 0) goto out; break; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(frame, FALSE, eva); goto out; case T_DNA: #ifdef DEV_NPX if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); if (npxdna()) goto out; #endif break; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * XXXKIB for now disable any FPU traps in kernel * handler registration seems to be overkill */ trap_fatal(frame, 0); goto out; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)frame); if (i == SIGTRAP) { type = T_TRCTRAP; load_dr6(rdr6() | 0x4000); goto kernel_trctrap; } if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (curpcb->pcb_flags & PCB_VM86CALL) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame->tf_eip == (int)cpu_switch_load_gs) { curpcb->pcb_gs = 0; #if 0 PROC_LOCK(p); kern_psignal(p, SIGBUS); PROC_UNLOCK(p); #endif goto out; } if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame->tf_eip == (int)doreti_iret) { frame->tf_eip = (int)doreti_iret_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_ds) { frame->tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_es) { frame->tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_fs) { frame->tf_eip = (int)doreti_popl_fs_fault; goto out; } if (curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_eflags & PSL_NT) { frame->tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ kernel_trctrap: if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame->tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap() && !(curpcb->pcb_flags & PCB_VM86CALL)) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & ~0xf); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB /* XXX %dr6 is not quite reentrant. */ dr6 = rdr6(); load_dr6(dr6 & ~0x4000); if (kdb_trap(type, dr6, frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } goto out; #else /* !POWERFAIL_NMI */ - if (nmi_handle_intr(type, frame, false) || - !panic_on_nmi) - goto out; - /* FALLTHROUGH */ + nmi_handle_intr(type, frame); + goto out; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } trap_fatal(frame, eva); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = i; ksi.ksi_code = ucode; ksi.ksi_addr = (void *)addr; ksi.ksi_trapno = type; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %x code %d type %d " "addr 0x%x esp 0x%08x eip 0x%08x " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr, frame->tf_esp, frame->tf_eip, fubyte((void *)(frame->tf_eip + 0)), fubyte((void *)(frame->tf_eip + 1)), fubyte((void *)(frame->tf_eip + 2)), fubyte((void *)(frame->tf_eip + 3)), fubyte((void *)(frame->tf_eip + 4)), fubyte((void *)(frame->tf_eip + 5)), fubyte((void *)(frame->tf_eip + 6)), fubyte((void *)(frame->tf_eip + 7))); } KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); userout: out: return; } static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; vm_map_t map; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return (-2); #endif if (usermode) goto nogo; map = kernel_map; } else { map = &p->p_vmspace->vm_map; /* * When accessing a user-space address, kernel must be * ready to accept the page fault, and provide a * handling routine. Since accessing the address * without the handler is a bug, do not try to handle * it normally, and panic immediately. */ if (!usermode && (td->td_intr_nesting_level != 0 || curpcb->pcb_onfault == NULL)) { trap_fatal(frame, eva); return (-1); } } /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; #if defined(PAE) || defined(PAE_TABLES) else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; #endif else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, ss, esp; u_int type; struct soft_segment_descriptor softseg; char *msg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) msg = trap_msg[type]; else msg = "UNKNOWN"; printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if (TF_HAS_STACKREGS(frame)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); #ifdef KDB if (debugger_on_panic || kdb_active) { frame->tf_err = eva; /* smuggle fault address to ddb */ if (kdb_trap(type, 0, frame)) { frame->tf_err = code; /* restore error code */ return; } frame->tf_err = code; /* restore error code */ } #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. */ void dblfault_handler() { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; caddr_t params; long tmp; int error; p = td->td_proc; frame = td->td_frame; params = (caddr_t)frame->tf_esp + sizeof(int); sa->code = frame->tf_eax; /* * Need to check if this is a 32 bit or 64 bit syscall. */ if (sa->code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ error = fueword(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(int); } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ error = fueword(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(quad_t); } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; if (params != NULL && sa->narg != 0) error = copyin(params, (caddr_t)sa->args, (u_int)(sa->narg * sizeof(int))); else error = 0; if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; } return (error); } #include "../../kern/subr_syscall.c" /* * syscall - system call request C handler. A system call is * essentially treated as a trap by reusing the frame layout. */ void syscall(struct trapframe *frame) { struct thread *td; struct syscall_args sa; register_t orig_tf_eflags; int error; ksiginfo_t ksi; #ifdef DIAGNOSTIC if (!(TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0)) { panic("syscall"); /* NOT REACHED */ } #endif orig_tf_eflags = frame->tf_eflags; td = curthread; td->td_frame = frame; error = syscallenter(td, &sa); /* * Traced syscall. */ if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { frame->tf_eflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)frame->tf_eip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", syscallname(td->td_proc, sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, sa.code))); syscallret(td, error, &sa); } Index: user/alc/PQ_LAUNDRY/sys/kern/subr_prf.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/kern/subr_prf.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/kern/subr_prf.c (revision 307896) @@ -1,1219 +1,1223 @@ /*- * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #ifdef _KERNEL #include "opt_ddb.h" #include "opt_printf.h" #endif /* _KERNEL */ #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif #include #include #ifdef DDB #include #endif /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ +#ifdef _KERNEL #include +#else +#include +#endif #ifdef _KERNEL #define TOCONS 0x01 #define TOTTY 0x02 #define TOLOG 0x04 /* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */ #define MAXNBUF (sizeof(intmax_t) * NBBY + 1) struct putchar_arg { int flags; int pri; struct tty *tty; char *p_bufr; size_t n_bufr; char *p_next; size_t remain; }; struct snprintf_arg { char *str; size_t remain; }; extern int log_open; static void msglogchar(int c, int pri); static void msglogstr(char *str, int pri, int filter_cr); static void putchar(int ch, void *arg); static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len, int upper); static void snprintf_func(int ch, void *arg); static int msgbufmapped; /* Set when safe to use msgbuf */ int msgbuftrigger; static int log_console_output = 1; SYSCTL_INT(_kern, OID_AUTO, log_console_output, CTLFLAG_RWTUN, &log_console_output, 0, "Duplicate console output to the syslog"); /* * See the comment in log_console() below for more explanation of this. */ static int log_console_add_linefeed; SYSCTL_INT(_kern, OID_AUTO, log_console_add_linefeed, CTLFLAG_RWTUN, &log_console_add_linefeed, 0, "log_console() adds extra newlines"); static int always_console_output; SYSCTL_INT(_kern, OID_AUTO, always_console_output, CTLFLAG_RWTUN, &always_console_output, 0, "Always output to console despite TIOCCONS"); /* * Warn that a system table is full. */ void tablefull(const char *tab) { log(LOG_ERR, "%s: table is full\n", tab); } /* * Uprintf prints to the controlling terminal for the current process. */ int uprintf(const char *fmt, ...) { va_list ap; struct putchar_arg pca; struct proc *p; struct thread *td; int retval; td = curthread; if (TD_IS_IDLETHREAD(td)) return (0); sx_slock(&proctree_lock); p = td->td_proc; PROC_LOCK(p); if ((p->p_flag & P_CONTROLT) == 0) { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); return (0); } SESS_LOCK(p->p_session); pca.tty = p->p_session->s_ttyp; SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); if (pca.tty == NULL) { sx_sunlock(&proctree_lock); return (0); } pca.flags = TOTTY; pca.p_bufr = NULL; va_start(ap, fmt); tty_lock(pca.tty); sx_sunlock(&proctree_lock); retval = kvprintf(fmt, putchar, &pca, 10, ap); tty_unlock(pca.tty); va_end(ap); return (retval); } /* * tprintf and vtprintf print on the controlling terminal associated with the * given session, possibly to the log as well. */ void tprintf(struct proc *p, int pri, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vtprintf(p, pri, fmt, ap); va_end(ap); } void vtprintf(struct proc *p, int pri, const char *fmt, va_list ap) { struct tty *tp = NULL; int flags = 0; struct putchar_arg pca; struct session *sess = NULL; sx_slock(&proctree_lock); if (pri != -1) flags |= TOLOG; if (p != NULL) { PROC_LOCK(p); if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { sess = p->p_session; sess_hold(sess); PROC_UNLOCK(p); tp = sess->s_ttyp; if (tp != NULL && tty_checkoutq(tp)) flags |= TOTTY; else tp = NULL; } else PROC_UNLOCK(p); } pca.pri = pri; pca.tty = tp; pca.flags = flags; pca.p_bufr = NULL; if (pca.tty != NULL) tty_lock(pca.tty); sx_sunlock(&proctree_lock); kvprintf(fmt, putchar, &pca, 10, ap); if (pca.tty != NULL) tty_unlock(pca.tty); if (sess != NULL) sess_release(sess); msgbuftrigger = 1; } /* * Ttyprintf displays a message on a tty; it should be used only by * the tty driver, or anything that knows the underlying tty will not * be revoke(2)'d away. Other callers should use tprintf. */ int ttyprintf(struct tty *tp, const char *fmt, ...) { va_list ap; struct putchar_arg pca; int retval; va_start(ap, fmt); pca.tty = tp; pca.flags = TOTTY; pca.p_bufr = NULL; retval = kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); return (retval); } static int _vprintf(int level, int flags, const char *fmt, va_list ap) { struct putchar_arg pca; int retval; #ifdef PRINTF_BUFR_SIZE char bufr[PRINTF_BUFR_SIZE]; #endif pca.tty = NULL; pca.pri = level; pca.flags = flags; #ifdef PRINTF_BUFR_SIZE pca.p_bufr = bufr; pca.p_next = pca.p_bufr; pca.n_bufr = sizeof(bufr); pca.remain = sizeof(bufr); *pca.p_next = '\0'; #else /* Don't buffer console output. */ pca.p_bufr = NULL; #endif retval = kvprintf(fmt, putchar, &pca, 10, ap); #ifdef PRINTF_BUFR_SIZE /* Write any buffered console/log output: */ if (*pca.p_bufr != '\0') { if (pca.flags & TOLOG) msglogstr(pca.p_bufr, level, /*filter_cr*/1); if (pca.flags & TOCONS) cnputs(pca.p_bufr); } #endif return (retval); } /* * Log writes to the log buffer, and guarantees not to sleep (so can be * called by interrupt routines). If there is no process reading the * log yet, it writes to the console also. */ void log(int level, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vlog(level, fmt, ap); va_end(ap); } void vlog(int level, const char *fmt, va_list ap) { (void)_vprintf(level, log_open ? TOLOG : TOCONS | TOLOG, fmt, ap); msgbuftrigger = 1; } #define CONSCHUNK 128 void log_console(struct uio *uio) { int c, error, nl; char *consbuffer; int pri; if (!log_console_output) return; pri = LOG_INFO | LOG_CONSOLE; uio = cloneuio(uio); consbuffer = malloc(CONSCHUNK, M_TEMP, M_WAITOK); nl = 0; while (uio->uio_resid > 0) { c = imin(uio->uio_resid, CONSCHUNK - 1); error = uiomove(consbuffer, c, uio); if (error != 0) break; /* Make sure we're NUL-terminated */ consbuffer[c] = '\0'; if (consbuffer[c - 1] == '\n') nl = 1; else nl = 0; msglogstr(consbuffer, pri, /*filter_cr*/ 1); } /* * The previous behavior in log_console() is preserved when * log_console_add_linefeed is non-zero. For that behavior, if an * individual console write came in that was not terminated with a * line feed, it would add a line feed. * * This results in different data in the message buffer than * appears on the system console (which doesn't add extra line feed * characters). * * A number of programs and rc scripts write a line feed, or a period * and a line feed when they have completed their operation. On * the console, this looks seamless, but when displayed with * 'dmesg -a', you wind up with output that looks like this: * * Updating motd: * . * * On the console, it looks like this: * Updating motd:. * * We could add logic to detect that situation, or just not insert * the extra newlines. Set the kern.log_console_add_linefeed * sysctl/tunable variable to get the old behavior. */ if (!nl && log_console_add_linefeed) { consbuffer[0] = '\n'; consbuffer[1] = '\0'; msglogstr(consbuffer, pri, /*filter_cr*/ 1); } msgbuftrigger = 1; free(uio, M_IOV); free(consbuffer, M_TEMP); return; } int printf(const char *fmt, ...) { va_list ap; int retval; va_start(ap, fmt); retval = vprintf(fmt, ap); va_end(ap); return (retval); } int vprintf(const char *fmt, va_list ap) { int retval; retval = _vprintf(-1, TOCONS | TOLOG, fmt, ap); if (!panicstr) msgbuftrigger = 1; return (retval); } static void putbuf(int c, struct putchar_arg *ap) { /* Check if no console output buffer was provided. */ if (ap->p_bufr == NULL) { /* Output direct to the console. */ if (ap->flags & TOCONS) cnputc(c); if (ap->flags & TOLOG) msglogchar(c, ap->pri); } else { /* Buffer the character: */ *ap->p_next++ = c; ap->remain--; /* Always leave the buffer zero terminated. */ *ap->p_next = '\0'; /* Check if the buffer needs to be flushed. */ if (ap->remain == 2 || c == '\n') { if (ap->flags & TOLOG) msglogstr(ap->p_bufr, ap->pri, /*filter_cr*/1); if (ap->flags & TOCONS) { if ((panicstr == NULL) && (constty != NULL)) msgbuf_addstr(&consmsgbuf, -1, ap->p_bufr, /*filter_cr*/ 0); if ((constty == NULL) ||(always_console_output)) cnputs(ap->p_bufr); } ap->p_next = ap->p_bufr; ap->remain = ap->n_bufr; *ap->p_next = '\0'; } /* * Since we fill the buffer up one character at a time, * this should not happen. We should always catch it when * ap->remain == 2 (if not sooner due to a newline), flush * the buffer and move on. One way this could happen is * if someone sets PRINTF_BUFR_SIZE to 1 or something * similarly silly. */ KASSERT(ap->remain > 2, ("Bad buffer logic, remain = %zd", ap->remain)); } } /* * Print a character on console or users terminal. If destination is * the console then the last bunch of characters are saved in msgbuf for * inspection later. */ static void putchar(int c, void *arg) { struct putchar_arg *ap = (struct putchar_arg*) arg; struct tty *tp = ap->tty; int flags = ap->flags; /* Don't use the tty code after a panic or while in ddb. */ if (kdb_active) { if (c != '\0') cnputc(c); return; } if ((flags & TOTTY) && tp != NULL && panicstr == NULL) tty_putchar(tp, c); if ((flags & (TOCONS | TOLOG)) && c != '\0') putbuf(c, ap); } /* * Scaled down version of sprintf(3). */ int sprintf(char *buf, const char *cfmt, ...) { int retval; va_list ap; va_start(ap, cfmt); retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); buf[retval] = '\0'; va_end(ap); return (retval); } /* * Scaled down version of vsprintf(3). */ int vsprintf(char *buf, const char *cfmt, va_list ap) { int retval; retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); buf[retval] = '\0'; return (retval); } /* * Scaled down version of snprintf(3). */ int snprintf(char *str, size_t size, const char *format, ...) { int retval; va_list ap; va_start(ap, format); retval = vsnprintf(str, size, format, ap); va_end(ap); return(retval); } /* * Scaled down version of vsnprintf(3). */ int vsnprintf(char *str, size_t size, const char *format, va_list ap) { struct snprintf_arg info; int retval; info.str = str; info.remain = size; retval = kvprintf(format, snprintf_func, &info, 10, ap); if (info.remain >= 1) *info.str++ = '\0'; return (retval); } /* * Kernel version which takes radix argument vsnprintf(3). */ int vsnrprintf(char *str, size_t size, int radix, const char *format, va_list ap) { struct snprintf_arg info; int retval; info.str = str; info.remain = size; retval = kvprintf(format, snprintf_func, &info, radix, ap); if (info.remain >= 1) *info.str++ = '\0'; return (retval); } static void snprintf_func(int ch, void *arg) { struct snprintf_arg *const info = arg; if (info->remain >= 2) { *info->str++ = ch; info->remain--; } } /* * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse * order; return an optional length and a pointer to the last character * written in the buffer (i.e., the first character of the string). * The buffer pointed to by `nbuf' must have length >= MAXNBUF. */ static char * ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper) { char *p, c; p = nbuf; *p = '\0'; do { c = hex2ascii(num % base); *++p = upper ? toupper(c) : c; } while (num /= base); if (lenp) *lenp = p - nbuf; return (p); } /* * Scaled down version of printf(3). * * Two additional formats: * * The format %b is supported to decode error registers. * Its usage is: * * printf("reg=%b\n", regval, "*"); * * where is the output base expressed as a control character, e.g. * \10 gives octal; \20 gives hex. Each arg is a sequence of characters, * the first of which gives the bit number to be inspected (origin 1), and * the next characters (up to a control character, i.e. a character <= 32), * give the name of the register. Thus: * * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE"); * * would produce output: * * reg=3 * * XXX: %D -- Hexdump, takes pointer and separator string: * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX * ("%*D", len, ptr, " " -> XX XX XX XX ... */ int kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap) { #define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; } char nbuf[MAXNBUF]; char *d; const char *p, *percent, *q; u_char *up; int ch, n; uintmax_t num; int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot; int cflag, hflag, jflag, tflag, zflag; int dwidth, upper; char padc; int stop = 0, retval = 0; num = 0; if (!func) d = (char *) arg; else d = NULL; if (fmt == NULL) fmt = "(fmt null)\n"; if (radix < 2 || radix > 36) radix = 10; for (;;) { padc = ' '; width = 0; while ((ch = (u_char)*fmt++) != '%' || stop) { if (ch == '\0') return (retval); PCHAR(ch); } percent = fmt - 1; qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; sign = 0; dot = 0; dwidth = 0; upper = 0; cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0; reswitch: switch (ch = (u_char)*fmt++) { case '.': dot = 1; goto reswitch; case '#': sharpflag = 1; goto reswitch; case '+': sign = 1; goto reswitch; case '-': ladjust = 1; goto reswitch; case '%': PCHAR(ch); break; case '*': if (!dot) { width = va_arg(ap, int); if (width < 0) { ladjust = !ladjust; width = -width; } } else { dwidth = va_arg(ap, int); } goto reswitch; case '0': if (!dot) { padc = '0'; goto reswitch; } case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': for (n = 0;; ++fmt) { n = n * 10 + ch - '0'; ch = *fmt; if (ch < '0' || ch > '9') break; } if (dot) dwidth = n; else width = n; goto reswitch; case 'b': num = (u_int)va_arg(ap, int); p = va_arg(ap, char *); for (q = ksprintn(nbuf, num, *p++, NULL, 0); *q;) PCHAR(*q--); if (num == 0) break; for (tmp = 0; *p;) { n = *p++; if (num & (1 << (n - 1))) { PCHAR(tmp ? ',' : '<'); for (; (n = *p) > ' '; ++p) PCHAR(n); tmp = 1; } else for (; *p > ' '; ++p) continue; } if (tmp) PCHAR('>'); break; case 'c': width -= 1; if (!ladjust && width > 0) while (width--) PCHAR(padc); PCHAR(va_arg(ap, int)); if (ladjust && width > 0) while (width--) PCHAR(padc); break; case 'D': up = va_arg(ap, u_char *); p = va_arg(ap, char *); if (!width) width = 16; while(width--) { PCHAR(hex2ascii(*up >> 4)); PCHAR(hex2ascii(*up & 0x0f)); up++; if (width) for (q=p;*q;q++) PCHAR(*q); } break; case 'd': case 'i': base = 10; sign = 1; goto handle_sign; case 'h': if (hflag) { hflag = 0; cflag = 1; } else hflag = 1; goto reswitch; case 'j': jflag = 1; goto reswitch; case 'l': if (lflag) { lflag = 0; qflag = 1; } else lflag = 1; goto reswitch; case 'n': if (jflag) *(va_arg(ap, intmax_t *)) = retval; else if (qflag) *(va_arg(ap, quad_t *)) = retval; else if (lflag) *(va_arg(ap, long *)) = retval; else if (zflag) *(va_arg(ap, size_t *)) = retval; else if (hflag) *(va_arg(ap, short *)) = retval; else if (cflag) *(va_arg(ap, char *)) = retval; else *(va_arg(ap, int *)) = retval; break; case 'o': base = 8; goto handle_nosign; case 'p': base = 16; sharpflag = (width == 0); sign = 0; num = (uintptr_t)va_arg(ap, void *); goto number; case 'q': qflag = 1; goto reswitch; case 'r': base = radix; if (sign) goto handle_sign; goto handle_nosign; case 's': p = va_arg(ap, char *); if (p == NULL) p = "(null)"; if (!dot) n = strlen (p); else for (n = 0; n < dwidth && p[n]; n++) continue; width -= n; if (!ladjust && width > 0) while (width--) PCHAR(padc); while (n--) PCHAR(*p++); if (ladjust && width > 0) while (width--) PCHAR(padc); break; case 't': tflag = 1; goto reswitch; case 'u': base = 10; goto handle_nosign; case 'X': upper = 1; case 'x': base = 16; goto handle_nosign; case 'y': base = 16; sign = 1; goto handle_sign; case 'z': zflag = 1; goto reswitch; handle_nosign: sign = 0; if (jflag) num = va_arg(ap, uintmax_t); else if (qflag) num = va_arg(ap, u_quad_t); else if (tflag) num = va_arg(ap, ptrdiff_t); else if (lflag) num = va_arg(ap, u_long); else if (zflag) num = va_arg(ap, size_t); else if (hflag) num = (u_short)va_arg(ap, int); else if (cflag) num = (u_char)va_arg(ap, int); else num = va_arg(ap, u_int); goto number; handle_sign: if (jflag) num = va_arg(ap, intmax_t); else if (qflag) num = va_arg(ap, quad_t); else if (tflag) num = va_arg(ap, ptrdiff_t); else if (lflag) num = va_arg(ap, long); else if (zflag) num = va_arg(ap, ssize_t); else if (hflag) num = (short)va_arg(ap, int); else if (cflag) num = (char)va_arg(ap, int); else num = va_arg(ap, int); number: if (sign && (intmax_t)num < 0) { neg = 1; num = -(intmax_t)num; } p = ksprintn(nbuf, num, base, &n, upper); tmp = 0; if (sharpflag && num != 0) { if (base == 8) tmp++; else if (base == 16) tmp += 2; } if (neg) tmp++; if (!ladjust && padc == '0') dwidth = width - tmp; width -= tmp + imax(dwidth, n); dwidth -= n; if (!ladjust) while (width-- > 0) PCHAR(' '); if (neg) PCHAR('-'); if (sharpflag && num != 0) { if (base == 8) { PCHAR('0'); } else if (base == 16) { PCHAR('0'); PCHAR('x'); } } while (dwidth-- > 0) PCHAR('0'); while (*p) PCHAR(*p--); if (ladjust) while (width-- > 0) PCHAR(' '); break; default: while (percent < fmt) PCHAR(*percent++); /* * Since we ignore a formatting argument it is no * longer safe to obey the remaining formatting * arguments as the arguments will no longer match * the format specs. */ stop = 1; break; } } #undef PCHAR } /* * Put character in log buffer with a particular priority. */ static void msglogchar(int c, int pri) { static int lastpri = -1; static int dangling; char nbuf[MAXNBUF]; char *p; if (!msgbufmapped) return; if (c == '\0' || c == '\r') return; if (pri != -1 && pri != lastpri) { if (dangling) { msgbuf_addchar(msgbufp, '\n'); dangling = 0; } msgbuf_addchar(msgbufp, '<'); for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL, 0); *p;) msgbuf_addchar(msgbufp, *p--); msgbuf_addchar(msgbufp, '>'); lastpri = pri; } msgbuf_addchar(msgbufp, c); if (c == '\n') { dangling = 0; lastpri = -1; } else { dangling = 1; } } static void msglogstr(char *str, int pri, int filter_cr) { if (!msgbufmapped) return; msgbuf_addstr(msgbufp, pri, str, filter_cr); } void msgbufinit(void *ptr, int size) { char *cp; static struct msgbuf *oldp = NULL; size -= sizeof(*msgbufp); cp = (char *)ptr; msgbufp = (struct msgbuf *)(cp + size); msgbuf_reinit(msgbufp, cp, size); if (msgbufmapped && oldp != msgbufp) msgbuf_copy(oldp, msgbufp); msgbufmapped = 1; oldp = msgbufp; } static int unprivileged_read_msgbuf = 1; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_read_msgbuf, CTLFLAG_RW, &unprivileged_read_msgbuf, 0, "Unprivileged processes may read the kernel message buffer"); /* Sysctls for accessing/clearing the msgbuf */ static int sysctl_kern_msgbuf(SYSCTL_HANDLER_ARGS) { char buf[128]; u_int seq; int error, len; if (!unprivileged_read_msgbuf) { error = priv_check(req->td, PRIV_MSGBUF); if (error) return (error); } /* Read the whole buffer, one chunk at a time. */ mtx_lock(&msgbuf_lock); msgbuf_peekbytes(msgbufp, NULL, 0, &seq); for (;;) { len = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq); mtx_unlock(&msgbuf_lock); if (len == 0) return (SYSCTL_OUT(req, "", 1)); /* add nulterm */ error = sysctl_handle_opaque(oidp, buf, len, req); if (error) return (error); mtx_lock(&msgbuf_lock); } } SYSCTL_PROC(_kern, OID_AUTO, msgbuf, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_msgbuf, "A", "Contents of kernel message buffer"); static int msgbuf_clearflag; static int sysctl_kern_msgbuf_clear(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) { mtx_lock(&msgbuf_lock); msgbuf_clear(msgbufp); mtx_unlock(&msgbuf_lock); msgbuf_clearflag = 0; } return (error); } SYSCTL_PROC(_kern, OID_AUTO, msgbuf_clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE, &msgbuf_clearflag, 0, sysctl_kern_msgbuf_clear, "I", "Clear kernel message buffer"); #ifdef DDB DB_SHOW_COMMAND(msgbuf, db_show_msgbuf) { int i, j; if (!msgbufmapped) { db_printf("msgbuf not mapped yet\n"); return; } db_printf("msgbufp = %p\n", msgbufp); db_printf("magic = %x, size = %d, r= %u, w = %u, ptr = %p, cksum= %u\n", msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_rseq, msgbufp->msg_wseq, msgbufp->msg_ptr, msgbufp->msg_cksum); for (i = 0; i < msgbufp->msg_size && !db_pager_quit; i++) { j = MSGBUF_SEQ_TO_POS(msgbufp, i + msgbufp->msg_rseq); db_printf("%c", msgbufp->msg_ptr[j]); } db_printf("\n"); } #endif /* DDB */ void hexdump(const void *ptr, int length, const char *hdr, int flags) { int i, j, k; int cols; const unsigned char *cp; char delim; if ((flags & HD_DELIM_MASK) != 0) delim = (flags & HD_DELIM_MASK) >> 8; else delim = ' '; if ((flags & HD_COLUMN_MASK) != 0) cols = flags & HD_COLUMN_MASK; else cols = 16; cp = ptr; for (i = 0; i < length; i+= cols) { if (hdr != NULL) printf("%s", hdr); if ((flags & HD_OMIT_COUNT) == 0) printf("%04x ", i); if ((flags & HD_OMIT_HEX) == 0) { for (j = 0; j < cols; j++) { k = i + j; if (k < length) printf("%c%02x", delim, cp[k]); else printf(" "); } } if ((flags & HD_OMIT_CHARS) == 0) { printf(" |"); for (j = 0; j < cols; j++) { k = i + j; if (k >= length) printf(" "); else if (cp[k] >= ' ' && cp[k] <= '~') printf("%c", cp[k]); else printf("."); } printf("|"); } printf("\n"); } } #endif /* _KERNEL */ void sbuf_hexdump(struct sbuf *sb, const void *ptr, int length, const char *hdr, int flags) { int i, j, k; int cols; const unsigned char *cp; char delim; if ((flags & HD_DELIM_MASK) != 0) delim = (flags & HD_DELIM_MASK) >> 8; else delim = ' '; if ((flags & HD_COLUMN_MASK) != 0) cols = flags & HD_COLUMN_MASK; else cols = 16; cp = ptr; for (i = 0; i < length; i+= cols) { if (hdr != NULL) sbuf_printf(sb, "%s", hdr); if ((flags & HD_OMIT_COUNT) == 0) sbuf_printf(sb, "%04x ", i); if ((flags & HD_OMIT_HEX) == 0) { for (j = 0; j < cols; j++) { k = i + j; if (k < length) sbuf_printf(sb, "%c%02x", delim, cp[k]); else sbuf_printf(sb, " "); } } if ((flags & HD_OMIT_CHARS) == 0) { sbuf_printf(sb, " |"); for (j = 0; j < cols; j++) { k = i + j; if (k >= length) sbuf_printf(sb, " "); else if (cp[k] >= ' ' && cp[k] <= '~') sbuf_printf(sb, "%c", cp[k]); else sbuf_printf(sb, "."); } sbuf_printf(sb, "|"); } sbuf_printf(sb, "\n"); } } #ifdef _KERNEL void counted_warning(unsigned *counter, const char *msg) { struct thread *td; unsigned c; for (;;) { c = *counter; if (c == 0) break; if (atomic_cmpset_int(counter, c, c - 1)) { td = curthread; log(LOG_INFO, "pid %d (%s) %s%s\n", td->td_proc->p_pid, td->td_name, msg, c > 1 ? "" : " - not logging anymore"); break; } } } #endif Index: user/alc/PQ_LAUNDRY/sys/netinet/ip_output.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/netinet/ip_output.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/netinet/ip_output.c (revision 307896) @@ -1,1414 +1,1417 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif #ifdef IPSEC #include #include #endif /* IPSEC*/ #include #include #ifdef MBUF_STRESS_TEST static int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback(struct ifnet *, const struct mbuf *, int); extern int in_mcast_loop; extern struct protosw inetsw[]; static inline int ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error) { struct m_tag *fwd_tag = NULL; struct mbuf *m; struct in_addr odst; struct ip *ip; m = *mp; ip = mtod(m, struct ip *); /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; *error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, PFIL_OUT, inp); m = *mp; if ((*error) != 0 || m == NULL) return 1; /* Finished */ ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; return -1; /* Reloop */ } /* See if fib was changed by packet filter. */ if ((*fibnum) != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; *fibnum = M_GETFIB(m); return -1; /* Reloop for FIB change */ } /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } /* Or forward to some other address? */ if ((m->m_flags & M_IP_NEXTHOP) && ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) { bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP_NEXTHOP; m_tag_delete(m, fwd_tag); return -1; /* Reloop for CHANGE of dst */ } return 0; } /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct rm_priotracker in_ifa_tracker; struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; int error = 0; struct sockaddr_in *dst; const struct sockaddr_in *gw; struct in_ifaddr *ia; int isbroadcast; uint16_t ip_len, ip_off; struct route iproute; struct rtentry *rte; /* cache for ro->ro_rt */ uint32_t fibnum; int have_ia_ref; #ifdef IPSEC int no_route_but_check_spd = 0; #endif M_ASSERTPKTHDR(m); if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } } if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } else ro->ro_flags |= RT_LLE_CACHE; #ifdef FLOWTABLE if (ro->ro_rt == NULL) (void )flowtable_lookup(AF_INET, m, ro); #endif if (opt) { int len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip_fillid(ip); IPSTAT_INC(ips_localout); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } /* * dst/gw handling: * * dst can be rewritten but always points to &ro->ro_dst. * gw is readonly but can point either to dst OR rt_gateway, * therefore we need restore gw if we're redoing lookup. */ gw = dst = (struct sockaddr_in *)&ro->ro_dst; fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); rte = ro->ro_rt; if (rte == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } again: /* * Validate route against routing table additions; * a better/more specific route might have been added. */ if (inp) RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. * Also check whether routing cache needs invalidation. */ rte = ro->ro_rt; if (rte && ((rte->rt_flags & RTF_UP) == 0 || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { RTFREE(rte); rte = ro->ro_rt = (struct rtentry *)NULL; if (ro->ro_lle) LLE_FREE(ro->ro_lle); /* zeros ro_lle */ ro->ro_lle = (struct llentry *)NULL; } ia = NULL; have_ia_ref = 0; /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } have_ia_ref = 1; ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = 1; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0, M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } have_ia_ref = 1; ifp = ia->ia_ifp; ip->ip_ttl = 1; - isbroadcast = in_ifaddr_broadcast(dst->sin_addr, ia); + isbroadcast = ifp->if_flags & IFF_BROADCAST ? + in_ifaddr_broadcast(dst->sin_addr, ia) : 0; } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia) have_ia_ref = 1; isbroadcast = 0; /* fool gcc */ } else { /* * We want to do any cloning requested by the link layer, * as this is probably required in all cases for correct * operation (as it is for ARP). */ if (rte == NULL) { #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), fibnum); #else in_rtalloc_ign(ro, 0, fibnum); #endif rte = ro->ro_rt; } if (rte == NULL || (rte->rt_flags & RTF_UP) == 0 || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) { #ifdef IPSEC /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; mtu = 0; /* Silence GCC warning. */ goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } ia = ifatoia(rte->rt_ifa); ifp = rte->rt_ifp; counter_u64_add(rte->rt_pksent, 1); rt_update_ro_flags(ro); if (rte->rt_flags & RTF_GATEWAY) gw = (struct sockaddr_in *)rte->rt_gateway; if (rte->rt_flags & RTF_HOST) isbroadcast = (rte->rt_flags & RTF_BROADCAST); - else + else if (ifp->if_flags & IFF_BROADCAST) isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia); + else + isbroadcast = 0; } /* * Calculate MTU. If we have a route that is up, use that, * otherwise use the interface's MTU. */ if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) mtu = rte->rt_mtu; else mtu = ifp->if_mtu; /* Catch a possible divide by zero later. */ KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p", __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "gw" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ gw = dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } if ((imo == NULL && in_mcast_loop) || (imo && imo->imo_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we are not a member * of the group; ip_input() will filter it later, * thus deferring a hash lookup and mutex acquisition * at the expense of a cheap copy using m_copym(). */ ip_mloopback(ifp, m, hlen); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy. ip_input() will drop the copy if * this host does not belong to the destination group on * the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; } } /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #ifdef IPSEC switch(ip_ipsec_output(&m, inp, &error)) { case 1: goto bad; case -1: goto done; case 0: default: break; /* Continue with packet processing. */ } /* * Check if there was a route for this packet; return error if not. */ if (no_route_but_check_spd) { IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (PFIL_HOOKED(&V_inet_pfil_hook)) { switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) { case 1: /* Finished */ goto done; case 0: /* Continue normally */ ip = mtod(m, struct ip *); break; case -1: /* Need to try again */ /* Reset everything for a new round */ RO_RTFREE(ro); if (have_ia_ref) ifa_free(&ia->ia_ifa); ro->ro_prepend = NULL; rte = NULL; gw = dst; ip = mtod(m, struct ip *); goto again; } } /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) counter_u64_add(ia->ia_ifa.ifa_opackets, m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_NOWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); } else m_freem(m); } if (error == 0) IPSTAT_INC(ips_fragmented); done: if (ro == &iproute) RO_RTFREE(ro); else if (rte == NULL) /* * If the caller supplied a route but somehow the reference * to it has been released need to prevent the caller * calling RTFREE on it again. */ ro->ro_rt = NULL; if (have_ia_ref) ifa_free(&ia->ia_ifa); return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; uint16_t ip_len, ip_off; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { /* Fragmentation not allowed */ IPSTAT_INC(ips_cantfrag); return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; off = MIN(mtu, m0->m_pkthdr.len); /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * Make sure the complete packet header gets copied * from the originating mbuf to the newly created * mbuf. This also ensures that existing firewall * classification(s), VLAN tags and so on get copied * to the resulting fragmented packet(s): */ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { m_free(m); error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copym(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ip_len) len = ip_len - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ IPSTAT_INC(ips_odropped); goto done; } m->m_pkthdr.len = mhlen + len; #ifdef MAC mac_netinet_fragment(m0, m); #endif mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { mhip->ip_sum = in_cksum(m, mhlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } *mnext = m; mnext = &m->m_nextpkt; } IPSTAT_ADD(ips_ofragments, nfrags); /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off = htons(ip_off | IP_MF); ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { ip->ip_sum = in_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; uint16_t csum, offset, ip_len; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; ip_len = ntohs(ip->ip_len); csum = in_cksum_skip(m, ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ /* find the mbuf in the chain where the checksum starts*/ while ((m != NULL) && (offset >= m->m_len)) { offset -= m->m_len; m = m->m_next; } KASSERT(m != NULL, ("in_delayed_cksum: checksum outside mbuf chain.")); KASSERT(offset + sizeof(u_short) <= m->m_len, ("in_delayed_cksum: checksum split between mbufs.")); *(u_short *)(m->m_data + offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; #ifdef RSS uint32_t rss_bucket; int retval; #endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) inp->inp_flags2 |= INP_REUSEADDR; else inp->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) inp->inp_flags2 |= INP_REUSEPORT; else inp->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); error = 0; break; default: break; } } return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_WUNLOCK(inp); return (error); } case IP_BINDANY: if (sopt->sopt_td != NULL) { error = priv_check(sopt->sopt_td, PRIV_NETINET_BINDANY); if (error) break; } /* FALLTHROUGH */ case IP_BINDMULTI: #ifdef RSS case IP_RSS_LISTEN_BUCKET: #endif case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: case IP_RECVFLOWID: #ifdef RSS case IP_RECVRSSBUCKETID: #endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval >= 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; case IP_BINDANY: OPTSET(INP_BINDANY); break; case IP_RECVTOS: OPTSET(INP_RECVTOS); break; case IP_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; case IP_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IP_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { inp->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; case IP_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif } break; #undef OPTSET #undef OPTSET2 /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #ifdef IPSEC case IP_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec_set_policy(inp, sopt->sopt_name, req, m->m_len, (sopt->sopt_td != NULL) ? sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_ONESBCAST: case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: case IP_RECVFLOWID: #ifdef RSS case IP_RSSBUCKETID: case IP_RECVRSSBUCKETID: #endif switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; case IP_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IP_RECVTOS: optval = OPTBIT(INP_RECVTOS); break; case IP_FLOWID: optval = inp->inp_flowid; break; case IP_FLOWTYPE: optval = inp->inp_flowtype; break; case IP_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IP_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IP_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IP_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: error = inp_getmoptions(inp, sopt); break; #ifdef IPSEC case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != NULL) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen) { struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the pack in order to generate checksums. */ copym = m_dup(m, M_NOWAIT); if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); if_simloop(ifp, copym, AF_INET, 0); } } Index: user/alc/PQ_LAUNDRY/sys/ufs/ffs/ffs_tables.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/ufs/ffs/ffs_tables.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/ufs/ffs/ffs_tables.c (revision 307896) @@ -1,137 +1,138 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_tables.c 8.1 (Berkeley) 6/11/93 */ #include __FBSDID("$FreeBSD$"); #include +#include #include #include /* * Bit patterns for identifying fragments in the block map * used as ((map & around) == inside) */ int around[9] = { 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff }; int inside[9] = { 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe }; /* * Given a block map bit pattern, the frag tables tell whether a * particular size fragment is available. * * used as: * if ((1 << (size - 1)) & fragtbl[fs->fs_frag][map] { * at least one fragment of the indicated size is available * } * * These tables are used by the scanc instruction on the VAX to * quickly find an appropriate fragment. */ static u_char fragtbl124[256] = { 0x00, 0x16, 0x16, 0x2a, 0x16, 0x16, 0x26, 0x4e, 0x16, 0x16, 0x16, 0x3e, 0x2a, 0x3e, 0x4e, 0x8a, 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, 0x26, 0x36, 0x36, 0x2e, 0x36, 0x36, 0x26, 0x6e, 0x36, 0x36, 0x36, 0x3e, 0x2e, 0x3e, 0x6e, 0xae, 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, 0x8a, 0x9e, 0x9e, 0xaa, 0x9e, 0x9e, 0xae, 0xce, 0x9e, 0x9e, 0x9e, 0xbe, 0xaa, 0xbe, 0xce, 0x8a, }; static u_char fragtbl8[256] = { 0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04, 0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08, 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10, 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20, 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, 0x08, 0x09, 0x09, 0x0a, 0x10, 0x11, 0x20, 0x40, 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, 0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21, 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, 0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0a, 0x12, 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0c, 0x08, 0x09, 0x09, 0x0a, 0x09, 0x09, 0x0a, 0x0c, 0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80, }; /* * The actual fragtbl array. */ u_char *fragtbl[MAXFRAG + 1] = { 0, fragtbl124, fragtbl124, 0, fragtbl124, 0, 0, 0, fragtbl8, }; Index: user/alc/PQ_LAUNDRY/sys/x86/include/x86_var.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/x86/include/x86_var.h (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/x86/include/x86_var.h (revision 307896) @@ -1,121 +1,120 @@ /*- * Copyright (c) 1995 Bruce D. Evans. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _X86_X86_VAR_H_ #define _X86_X86_VAR_H_ /* * Miscellaneous machine-dependent declarations. */ extern long Maxmem; extern u_int basemem; extern int busdma_swi_pending; extern u_int cpu_exthigh; extern u_int cpu_feature; extern u_int cpu_feature2; extern u_int amd_feature; extern u_int amd_feature2; extern u_int amd_pminfo; extern u_int via_feature_rng; extern u_int via_feature_xcrypt; extern u_int cpu_clflush_line_size; extern u_int cpu_stdext_feature; extern u_int cpu_stdext_feature2; extern u_int cpu_fxsr; extern u_int cpu_high; extern u_int cpu_id; extern u_int cpu_max_ext_state_size; extern u_int cpu_mxcsr_mask; extern u_int cpu_procinfo; extern u_int cpu_procinfo2; extern char cpu_vendor[]; extern u_int cpu_vendor_id; extern u_int cpu_mon_mwait_flags; extern u_int cpu_mon_min_size; extern u_int cpu_mon_max_size; extern u_int cpu_maxphyaddr; extern char ctx_switch_xsave[]; extern u_int hv_high; extern char hv_vendor[]; extern char kstack[]; extern char sigcode[]; extern int szsigcode; extern int vm_page_dump_size; extern int workaround_erratum383; extern int _udatasel; extern int _ucodesel; extern int _ucode32sel; extern int _ufssel; extern int _ugssel; extern int use_xsave; extern uint64_t xsave_mask; struct pcb; struct thread; struct reg; struct fpreg; struct dbreg; struct dumperinfo; struct trapframe; /* * The interface type of the interrupt handler entry point cannot be * expressed in C. Use simplest non-variadic function type as an * approximation. */ typedef void alias_for_inthand_t(void); void *alloc_fpusave(int flags); void busdma_swi(void); bool cpu_mwait_usable(void); void cpu_probe_amdc1e(void); void cpu_setregs(void); void dump_add_page(vm_paddr_t); void dump_drop_page(vm_paddr_t); void identify_cpu(void); void initializecpu(void); void initializecpucache(void); bool fix_cpuid(void); void fillw(int /*u_short*/ pat, void *base, size_t cnt); int is_physical_memory(vm_paddr_t addr); int isa_nmi(int cd); -bool nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame, - bool panic); -bool nmi_call_kdb_smp(u_int type, struct trapframe *frame, bool panic); -int nmi_handle_intr(u_int type, struct trapframe *frame, bool panic); +void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame); +void nmi_call_kdb_smp(u_int type, struct trapframe *frame); +void nmi_handle_intr(u_int type, struct trapframe *frame); void pagecopy(void *from, void *to); void printcpuinfo(void); int user_dbreg_trap(void); int minidumpsys(struct dumperinfo *); struct pcb *get_pcb_td(struct thread *td); #endif Index: user/alc/PQ_LAUNDRY/sys/x86/x86/cpu_machdep.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/x86/x86/cpu_machdep.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/x86/x86/cpu_machdep.c (revision 307896) @@ -1,574 +1,577 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_platform.h" #ifdef __i386__ #include "opt_npx.h" #include "opt_apic.h" #include "opt_xbox.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #define STATE_RUNNING 0x0 #define STATE_MWAIT 0x1 #define STATE_SLEEPING 0x2 /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* Not applicable */ } void acpi_cpu_c1(void) { __asm __volatile("sti; hlt"); } void acpi_cpu_idle_mwait(uint32_t mwait_hint) { int *state; /* * XXXKIB. Software coordination mode should be supported, * but all Intel CPUs provide hardware coordination. */ state = (int *)PCPU_PTR(monitorbuf); KASSERT(*state == STATE_SLEEPING, ("cpu_mwait_cx: wrong monitorbuf state")); *state = STATE_MWAIT; cpu_monitor(state, 0, 0); if (*state == STATE_MWAIT) cpu_mwait(MWAIT_INTRBREAK, mwait_hint); /* * We should exit on any event that interrupts mwait, because * that event might be a wanted interrupt. */ *state = STATE_RUNNING; } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { uint64_t tsc1, tsc2; uint64_t acnt, mcnt, perf; register_t reg; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); #ifdef __i386__ if ((cpu_feature & CPUID_TSC) == 0) return (EOPNOTSUPP); #endif /* * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, * DELAY(9) based logic fails. */ if (tsc_is_invariant && !tsc_perf_stat) return (EOPNOTSUPP); #ifdef SMP if (smp_cpus > 1) { /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); } #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); if (tsc_is_invariant) { wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); tsc1 = rdtsc(); DELAY(1000); mcnt = rdmsr(MSR_MPERF); acnt = rdmsr(MSR_APERF); tsc2 = rdtsc(); intr_restore(reg); perf = 1000 * acnt / mcnt; *rate = (tsc2 - tsc1) * perf; } else { tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); *rate = (tsc2 - tsc1) * 1000; } #ifdef SMP if (smp_cpus > 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) halt(); } bool cpu_mwait_usable(void) { return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags & (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) == (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK))); } void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait, 0, "Use MONITOR/MWAIT for short idle"); #ifndef PC98 static void cpu_idle_acpi(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) cpu_idle_hook(sbt); else acpi_cpu_c1(); *state = STATE_RUNNING; } #endif /* !PC98 */ static void cpu_idle_hlt(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; /* * Since we may be in a critical section from cpu_idle(), if * an interrupt fires during that critical section we may have * a pending preemption. If the CPU halts, then that thread * may not execute until a later interrupt awakens the CPU. * To handle this race, check for a runnable thread after * disabling interrupts and immediately return if one is * found. Also, we must absolutely guarentee that hlt is * the next instruction after sti. This ensures that any * interrupt that fires after the call to disable_intr() will * immediately awaken the CPU from hlt. Finally, please note * that on x86 this works fine because of interrupts enabled only * after the instruction following sti takes place, while IF is set * to 1 immediately, allowing hlt instruction to acknowledge the * interrupt. */ disable_intr(); if (sched_runnable()) enable_intr(); else acpi_cpu_c1(); *state = STATE_RUNNING; } static void cpu_idle_mwait(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_MWAIT; /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) { enable_intr(); *state = STATE_RUNNING; return; } cpu_monitor(state, 0, 0); if (*state == STATE_MWAIT) __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); else enable_intr(); *state = STATE_RUNNING; } static void cpu_idle_spin(sbintime_t sbt) { int *state; int i; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_RUNNING; /* * The sched_runnable() call is racy but as long as there is * a loop missing it one time will have just a little impact if any * (and it is much better than missing the check at all). */ for (i = 0; i < 1000; i++) { if (sched_runnable()) return; cpu_spinwait(); } } /* * C1E renders the local APIC timer dead, so we disable it by * reading the Interrupt Pending Message register and clearing * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). * * Reference: * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" * #32559 revision 3.00+ */ #define MSR_AMDK8_IPM 0xc0010055 #define AMDK8_SMIONCMPHALT (1ULL << 27) #define AMDK8_C1EONCMPHALT (1ULL << 28) #define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) void cpu_probe_amdc1e(void) { /* * Detect the presence of C1E capability mostly on latest * dual-cores (or future) k8 family. */ if (cpu_vendor_id == CPU_VENDOR_AMD && (cpu_id & 0x00000f00) == 0x00000f00 && (cpu_id & 0x0fff0000) >= 0x00040000) { cpu_ident_amdc1e = 1; } } #if defined(__i386__) && defined(PC98) void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt; #else void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; #endif void cpu_idle(int busy) { uint64_t msr; sbintime_t sbt = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); #ifdef MP_WATCHDOG ap_watchdog(PCPU_GET(cpuid)); #endif /* If we are busy - try to use fast methods. */ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { cpu_idle_mwait(busy); goto out; } } /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); sbt = cpu_idleclock(); } /* Apply AMD APIC timer C1E workaround. */ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) { msr = rdmsr(MSR_AMDK8_IPM); if (msr & AMDK8_CMPHALT) wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); } /* Call main idle method. */ cpu_idle_fn(sbt); /* Switch timers back into active mode. */ if (!busy) { cpu_activeclock(); critical_exit(); } out: CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } int cpu_idle_wakeup(int cpu) { struct pcpu *pcpu; int *state; pcpu = pcpu_find(cpu); state = (int *)pcpu->pc_monitorbuf; /* * This doesn't need to be atomic since missing the race will * simply result in unnecessary IPIs. */ if (*state == STATE_SLEEPING) return (0); if (*state == STATE_MWAIT) *state = STATE_RUNNING; return (1); } /* * Ordered by speed/power consumption. */ struct { void *id_fn; char *id_name; } idle_tbl[] = { { cpu_idle_spin, "spin" }, { cpu_idle_mwait, "mwait" }, { cpu_idle_hlt, "hlt" }, #if !defined(__i386__) || !defined(PC98) { cpu_idle_acpi, "acpi" }, #endif { NULL, NULL } }; static int idle_sysctl_available(SYSCTL_HANDLER_ARGS) { char *avail, *p; int error; int i; avail = malloc(256, M_TEMP, M_WAITOK); p = avail; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; #if !defined(__i386__) || !defined(PC98) if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; #endif p += sprintf(p, "%s%s", p != avail ? ", " : "", idle_tbl[i].id_name); } error = sysctl_handle_string(oidp, avail, 0, req); free(avail, M_TEMP); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, idle_sysctl_available, "A", "list of available idle functions"); static int idle_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; int error; char *p; int i; p = "unknown"; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (idle_tbl[i].id_fn == cpu_idle_fn) { p = idle_tbl[i].id_name; break; } } strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; #if !defined(__i386__) || !defined(PC98) if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; #endif if (strcmp(idle_tbl[i].id_name, buf)) continue; cpu_idle_fn = idle_tbl[i].id_fn; return (0); } return (EINVAL); } SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, idle_sysctl, "A", "currently selected idle function"); +static int panic_on_nmi = 1; +SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN, + &panic_on_nmi, 0, + "Panic on NMI"); int nmi_is_broadcast = 1; SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN, &nmi_is_broadcast, 0, "Chipset NMI is broadcast"); #ifdef KDB int kdb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN, &kdb_on_nmi, 0, "Go to KDB on NMI"); #endif #ifdef DEV_ISA -bool -nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame, bool do_panic) +void +nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame) { /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(frame->tf_err) == 0) { #ifdef KDB /* * NMI can be hooked up to a pushbutton for debugging. */ if (kdb_on_nmi) { printf ("NMI/cpu%d ... going to debugger\n", cpu); kdb_trap(type, 0, frame); - return (true); } - } else #endif /* KDB */ - if (do_panic) + } else if (panic_on_nmi) { panic("NMI indicates hardware failure"); - return (false); + } } #endif -int -nmi_handle_intr(u_int type, struct trapframe *frame, bool panic) +void +nmi_handle_intr(u_int type, struct trapframe *frame) { #ifdef DEV_ISA #ifdef SMP - if (nmi_is_broadcast) - return (nmi_call_kdb_smp(type, frame, panic)); - else + if (nmi_is_broadcast) { + nmi_call_kdb_smp(type, frame); + return; + } #endif - return (nmi_call_kdb(0, type, frame, panic)); + nmi_call_kdb(0, type, frame); #endif } Index: user/alc/PQ_LAUNDRY/sys/x86/x86/mp_x86.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/x86/x86/mp_x86.c (revision 307895) +++ user/alc/PQ_LAUNDRY/sys/x86/x86/mp_x86.c (revision 307896) @@ -1,1589 +1,1587 @@ /*- * Copyright (c) 1996, by Steve Passe * Copyright (c) 2003, by Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifdef __i386__ #include "opt_apic.h" #endif #include "opt_cpu.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) /* lock region used by kernel profiling */ int mcount_lock; int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ extern struct pcpu __pcpu[]; /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; int bootAP; /* Free these after use */ void *bootstacks[MAXCPU]; void *dpcpu; struct pcb stoppcbs[MAXCPU]; struct susppcb **susppcbs; #ifdef COUNT_IPIS /* Interrupt counts. */ static u_long *ipi_preempt_counts[MAXCPU]; static u_long *ipi_ast_counts[MAXCPU]; u_long *ipi_invltlb_counts[MAXCPU]; u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif /* Default cpu_ops implementation. */ struct cpu_ops cpu_ops; /* * Local data and functions. */ static volatile cpuset_t ipi_stop_nmi_pending; /* used to hold the AP's until we are ready to release them */ struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info cpu_info[MAX_APIC_ID + 1]; int apic_cpuids[MAX_APIC_ID + 1]; int cpu_apic_ids[MAXCPU]; /* Holds pending bitmap based IPIs per CPU */ volatile u_int cpu_ipi_pending[MAXCPU]; static void release_aps(void *dummy); static void cpustop_handler_post(u_int cpu); static int hyperthreading_allowed = 1; SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); static struct topo_node topo_root; static int pkg_id_shift; static int core_id_shift; static int disabled_cpus; struct cache_info { int id_shift; int present; } static caches[MAX_CACHE_LEVELS]; void mem_range_AP_init(void) { if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) mem_range_softc.mr_op->initAP(&mem_range_softc); } /* * Round up to the next power of two, if necessary, and then * take log2. * Returns -1 if argument is zero. */ static __inline int mask_width(u_int x) { return (fls(x << (1 - powerof2(x))) - 1); } /* * Add a cache level to the cache topology description. */ static int add_deterministic_cache(int type, int level, int share_count) { if (type == 0) return (0); if (type > 3) { printf("unexpected cache type %d\n", type); return (1); } if (type == 2) /* ignore instruction cache */ return (1); if (level == 0 || level > MAX_CACHE_LEVELS) { printf("unexpected cache level %d\n", type); return (1); } if (caches[level - 1].present) { printf("WARNING: multiple entries for L%u data cache\n", level); printf("%u => %u\n", caches[level - 1].id_shift, mask_width(share_count)); } caches[level - 1].id_shift = mask_width(share_count); caches[level - 1].present = 1; if (caches[level - 1].id_shift > pkg_id_shift) { printf("WARNING: L%u data cache covers more " "APIC IDs than a package\n", level); printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift); caches[level - 1].id_shift = pkg_id_shift; } if (caches[level - 1].id_shift < core_id_shift) { printf("WARNING: L%u data cache covers less " "APIC IDs than a core\n", level); printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift); caches[level - 1].id_shift = core_id_shift; } return (1); } /* * Determine topology of processing units and caches for AMD CPUs. * See: * - AMD CPUID Specification (Publication # 25481) * - BKDG For AMD Family 10h Processors (Publication # 31116), section 2.15 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) * XXX At the moment the code does not recognize grouping of AMD CMT threads, * if supported, into cores, so each thread is treated as being in its own * core. In other words, each logical CPU is considered to be a core. */ static void topo_probe_amd(void) { u_int p[4]; int level; int share_count; int type; int i; /* No multi-core capability. */ if ((amd_feature2 & AMDID2_CMP) == 0) return; /* For families 10h and newer. */ pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> AMDID_COREID_SIZE_SHIFT; /* For 0Fh family. */ if (pkg_id_shift == 0) pkg_id_shift = mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { for (i = 0; ; i++) { cpuid_count(0x8000001d, i, p); type = p[0] & 0x1f; level = (p[0] >> 5) & 0x7; share_count = 1 + ((p[0] >> 14) & 0xfff); if (!add_deterministic_cache(type, level, share_count)) break; } } else { if (cpu_exthigh >= 0x80000005) { cpuid_count(0x80000005, 0, p); if (((p[2] >> 24) & 0xff) != 0) { caches[0].id_shift = 0; caches[0].present = 1; } } if (cpu_exthigh >= 0x80000006) { cpuid_count(0x80000006, 0, p); if (((p[2] >> 16) & 0xffff) != 0) { caches[1].id_shift = 0; caches[1].present = 1; } if (((p[3] >> 18) & 0x3fff) != 0) { /* * TODO: Account for dual-node processors * where each node within a package has its own * L3 cache. */ caches[2].id_shift = pkg_id_shift; caches[2].present = 1; } } } } /* * Determine topology of processing units for Intel CPUs * using CPUID Leaf 1 and Leaf 4, if supported. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS */ static void topo_probe_intel_0x4(void) { u_int p[4]; int max_cores; int max_logical; /* Both zero and one here mean one logical processor per package. */ max_logical = (cpu_feature & CPUID_HTT) != 0 ? (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; if (max_logical <= 1) return; if (cpu_high >= 0x4) { cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; } else max_cores = 1; core_id_shift = mask_width(max_logical/max_cores); KASSERT(core_id_shift >= 0, ("intel topo: max_cores > max_logical\n")); pkg_id_shift = core_id_shift + mask_width(max_cores); } /* * Determine topology of processing units for Intel CPUs * using CPUID Leaf 11, if supported. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS */ static void topo_probe_intel_0xb(void) { u_int p[4]; int bits; int type; int i; /* Fall back if CPU leaf 11 doesn't really exist. */ cpuid_count(0x0b, 0, p); if (p[1] == 0) { topo_probe_intel_0x4(); return; } /* We only support three levels for now. */ for (i = 0; ; i++) { cpuid_count(0x0b, i, p); bits = p[0] & 0x1f; type = (p[2] >> 8) & 0xff; if (type == 0) break; /* TODO: check for duplicate (re-)assignment */ if (type == CPUID_TYPE_SMT) core_id_shift = bits; else if (type == CPUID_TYPE_CORE) pkg_id_shift = bits; else printf("unknown CPU level type %d\n", type); } if (pkg_id_shift < core_id_shift) { printf("WARNING: core covers more APIC IDs than a package\n"); core_id_shift = pkg_id_shift; } } /* * Determine topology of caches for Intel CPUs. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 Architectures Software Developer’s Manual * Volume 2A: Instruction Set Reference, A-M, * CPUID instruction */ static void topo_probe_intel_caches(void) { u_int p[4]; int level; int share_count; int type; int i; if (cpu_high < 0x4) { /* * Available cache level and sizes can be determined * via CPUID leaf 2, but that requires a huge table of hardcoded * values, so for now just assume L1 and L2 caches potentially * shared only by HTT processing units, if HTT is present. */ caches[0].id_shift = pkg_id_shift; caches[0].present = 1; caches[1].id_shift = pkg_id_shift; caches[1].present = 1; return; } for (i = 0; ; i++) { cpuid_count(0x4, i, p); type = p[0] & 0x1f; level = (p[0] >> 5) & 0x7; share_count = 1 + ((p[0] >> 14) & 0xfff); if (!add_deterministic_cache(type, level, share_count)) break; } } /* * Determine topology of processing units and caches for Intel CPUs. * See: * - Intel 64 Architecture Processor Topology Enumeration */ static void topo_probe_intel(void) { /* * Note that 0x1 <= cpu_high < 4 case should be * compatible with topo_probe_intel_0x4() logic when * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) * or it should trigger the fallback otherwise. */ if (cpu_high >= 0xb) topo_probe_intel_0xb(); else if (cpu_high >= 0x1) topo_probe_intel_0x4(); topo_probe_intel_caches(); } /* * Topology information is queried only on BSP, on which this * code runs and for which it can query CPUID information. * Then topology is extrapolated on all packages using an * assumption that APIC ID to hardware component ID mapping is * homogenious. * That doesn't necesserily imply that the topology is uniform. */ void topo_probe(void) { static int cpu_topo_probed = 0; struct x86_topo_layer { int type; int subtype; int id_shift; } topo_layers[MAX_CACHE_LEVELS + 3]; struct topo_node *parent; struct topo_node *node; int layer; int nlayers; int node_id; int i; if (cpu_topo_probed) return; CPU_ZERO(&logical_cpus_mask); if (mp_ncpus <= 1) ; /* nothing */ else if (cpu_vendor_id == CPU_VENDOR_AMD) topo_probe_amd(); else if (cpu_vendor_id == CPU_VENDOR_INTEL) topo_probe_intel(); KASSERT(pkg_id_shift >= core_id_shift, ("bug in APIC topology discovery")); nlayers = 0; bzero(topo_layers, sizeof(topo_layers)); topo_layers[nlayers].type = TOPO_TYPE_PKG; topo_layers[nlayers].id_shift = pkg_id_shift; if (bootverbose) printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; /* * Consider all caches to be within a package/chip * and "in front" of all sub-components like * cores and hardware threads. */ for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { if (caches[i].present) { KASSERT(caches[i].id_shift <= pkg_id_shift, ("bug in APIC topology discovery")); KASSERT(caches[i].id_shift >= core_id_shift, ("bug in APIC topology discovery")); topo_layers[nlayers].type = TOPO_TYPE_CACHE; topo_layers[nlayers].subtype = i + 1; topo_layers[nlayers].id_shift = caches[i].id_shift; if (bootverbose) printf("L%u cache ID shift: %u\n", topo_layers[nlayers].subtype, topo_layers[nlayers].id_shift); nlayers++; } } if (pkg_id_shift > core_id_shift) { topo_layers[nlayers].type = TOPO_TYPE_CORE; topo_layers[nlayers].id_shift = core_id_shift; if (bootverbose) printf("Core ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; } topo_layers[nlayers].type = TOPO_TYPE_PU; topo_layers[nlayers].id_shift = 0; nlayers++; topo_init_root(&topo_root); for (i = 0; i <= MAX_APIC_ID; ++i) { if (!cpu_info[i].cpu_present) continue; parent = &topo_root; for (layer = 0; layer < nlayers; ++layer) { node_id = i >> topo_layers[layer].id_shift; parent = topo_add_node_by_hwid(parent, node_id, topo_layers[layer].type, topo_layers[layer].subtype); } } parent = &topo_root; for (layer = 0; layer < nlayers; ++layer) { node_id = boot_cpu_id >> topo_layers[layer].id_shift; node = topo_find_node_by_hwid(parent, node_id, topo_layers[layer].type, topo_layers[layer].subtype); topo_promote_child(node); parent = node; } cpu_topo_probed = 1; } /* * Assign logical CPU IDs to local APICs. */ void assign_cpu_ids(void) { struct topo_node *node; u_int smt_mask; smt_mask = (1u << core_id_shift) - 1; /* * Assign CPU IDs to local APIC IDs and disable any CPUs * beyond MAXCPU. CPU 0 is always assigned to the BSP. */ mp_ncpus = 0; TOPO_FOREACH(node, &topo_root) { if (node->type != TOPO_TYPE_PU) continue; if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) cpu_info[node->hwid].cpu_hyperthread = 1; if (resource_disabled("lapic", node->hwid)) { if (node->hwid != boot_cpu_id) cpu_info[node->hwid].cpu_disabled = 1; else printf("Cannot disable BSP, APIC ID = %d\n", node->hwid); } if (!hyperthreading_allowed && cpu_info[node->hwid].cpu_hyperthread) cpu_info[node->hwid].cpu_disabled = 1; if (mp_ncpus >= MAXCPU) cpu_info[node->hwid].cpu_disabled = 1; if (cpu_info[node->hwid].cpu_disabled) { disabled_cpus++; continue; } cpu_apic_ids[mp_ncpus] = node->hwid; apic_cpuids[node->hwid] = mp_ncpus; topo_set_pu_id(node, mp_ncpus); mp_ncpus++; } KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } /* * Print various information about the SMP system hardware and setup. */ void cpu_mp_announce(void) { struct topo_node *node; const char *hyperthread; int pkg_count; int cores_per_pkg; int thrs_per_core; printf("FreeBSD/SMP: "); if (topo_analyze(&topo_root, 1, &pkg_count, &cores_per_pkg, &thrs_per_core)) { printf("%d package(s)", pkg_count); if (cores_per_pkg > 0) printf(" x %d core(s)", cores_per_pkg); if (thrs_per_core > 1) printf(" x %d hardware threads", thrs_per_core); } else { printf("Non-uniform topology"); } printf("\n"); if (disabled_cpus) { printf("FreeBSD/SMP Online: "); if (topo_analyze(&topo_root, 0, &pkg_count, &cores_per_pkg, &thrs_per_core)) { printf("%d package(s)", pkg_count); if (cores_per_pkg > 0) printf(" x %d core(s)", cores_per_pkg); if (thrs_per_core > 1) printf(" x %d hardware threads", thrs_per_core); } else { printf("Non-uniform topology"); } printf("\n"); } if (!bootverbose) return; TOPO_FOREACH(node, &topo_root) { switch (node->type) { case TOPO_TYPE_PKG: printf("Package HW ID = %u\n", node->hwid); break; case TOPO_TYPE_CORE: printf("\tCore HW ID = %u\n", node->hwid); break; case TOPO_TYPE_PU: if (cpu_info[node->hwid].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; if (node->subtype == 0) printf("\t\tCPU (AP%s): APIC ID: %u" "(disabled)\n", hyperthread, node->hwid); else if (node->id == 0) printf("\t\tCPU0 (BSP): APIC ID: %u\n", node->hwid); else printf("\t\tCPU%u (AP%s): APIC ID: %u\n", node->id, hyperthread, node->hwid); break; default: /* ignored */ break; } } } /* * Add a scheduling group, a group of logical processors sharing * a particular cache (and, thus having an affinity), to the scheduling * topology. * This function recursively works on lower level caches. */ static void x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) { struct topo_node *node; int nchildren; int ncores; int i; KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE, ("x86topo_add_sched_group: bad type: %u", root->type)); CPU_COPY(&root->cpuset, &cg_root->cg_mask); cg_root->cg_count = root->cpu_count; if (root->type == TOPO_TYPE_SYSTEM) cg_root->cg_level = CG_SHARE_NONE; else cg_root->cg_level = root->subtype; /* * Check how many core nodes we have under the given root node. * If we have multiple logical processors, but not multiple * cores, then those processors must be hardware threads. */ ncores = 0; node = root; while (node != NULL) { if (node->type != TOPO_TYPE_CORE) { node = topo_next_node(root, node); continue; } ncores++; node = topo_next_nonchild_node(root, node); } if (cg_root->cg_level != CG_SHARE_NONE && root->cpu_count > 1 && ncores < 2) cg_root->cg_flags = CG_FLAG_SMT; /* * Find out how many cache nodes we have under the given root node. * We ignore cache nodes that cover all the same processors as the * root node. Also, we do not descend below found cache nodes. * That is, we count top-level "non-redundant" caches under the root * node. */ nchildren = 0; node = root; while (node != NULL) { if (node->type != TOPO_TYPE_CACHE || (root->type != TOPO_TYPE_SYSTEM && CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { node = topo_next_node(root, node); continue; } nchildren++; node = topo_next_nonchild_node(root, node); } cg_root->cg_child = smp_topo_alloc(nchildren); cg_root->cg_children = nchildren; /* * Now find again the same cache nodes as above and recursively * build scheduling topologies for them. */ node = root; i = 0; while (node != NULL) { if (node->type != TOPO_TYPE_CACHE || (root->type != TOPO_TYPE_SYSTEM && CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { node = topo_next_node(root, node); continue; } cg_root->cg_child[i].cg_parent = cg_root; x86topo_add_sched_group(node, &cg_root->cg_child[i]); i++; node = topo_next_nonchild_node(root, node); } } /* * Build the MI scheduling topology from the discovered hardware topology. */ struct cpu_group * cpu_topo(void) { struct cpu_group *cg_root; if (mp_ncpus <= 1) return (smp_topo_none()); cg_root = smp_topo_alloc(1); x86topo_add_sched_group(&topo_root, cg_root); return (cg_root); } /* * Add a logical CPU to the topology. */ void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > MAX_APIC_ID) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %d claims to be BSP, but CPU %d already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (mp_ncpus < MAXCPU) { mp_ncpus++; mp_maxid = mp_ncpus - 1; } if (bootverbose) printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { /* * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). * If there were no calls to cpu_add() assume this is a UP system. */ if (mp_ncpus == 0) mp_ncpus = 1; } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ CPU_SETOF(0, &all_cpus); return (mp_ncpus > 1); } /* * AP CPU's call this to initialize themselves. */ void init_secondary_tail(void) { u_int cpuid; /* * On real hardware, switch to x2apic mode if possible. Do it * after aps_ready was signalled, to avoid manipulating the * mode while BSP might still want to send some IPI to us * (second startup IPI is ignored on modern hardware etc). */ lapic_xapic_mode(); /* Initialize the PAT MSR. */ pmap_init_pat(); /* set up CPU registers and state */ cpu_setregs(); /* set up SSE/NX */ initializecpu(); /* set up FPU state on the AP */ #ifdef __amd64__ fpuinit(); #else npxinit(false); #endif if (cpu_ops.cpu_init) cpu_ops.cpu_init(); /* A quick check from sanity claus */ cpuid = PCPU_GET(cpuid); if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", cpuid); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); mca_init(); mtx_lock_spin(&ap_boot_mtx); /* Init local apic for irq's */ lapic_setup(1); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); printf("SMP: AP CPU #%d Launched!\n", cpuid); /* Determine if we are a logical CPU. */ if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) CPU_SET(cpuid, &logical_cpus_mask); if (bootverbose) lapic_dump("AP"); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); } #ifdef __amd64__ /* * Enable global pages TLB extension * This also implicitly flushes the TLB */ load_cr4(rcr4() | CR4_PGE); if (pmap_pcid_enabled) load_cr4(rcr4() | CR4_PCIDE); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); #endif mtx_unlock_spin(&ap_boot_mtx); /* Wait until all the AP's are up. */ while (atomic_load_acq_int(&smp_started) == 0) ia32_pause(); #ifndef EARLY_AP_STARTUP /* Start per-CPU event timers. */ cpu_initclocks_ap(); #endif sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } /******************************************************************* * local functions and data */ /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. * We also do not tell it about the BSP since it tells itself about * the BSP internally to work with UP kernels and on UP machines. */ void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_bsp) continue; if (cpu_info[apic_id].cpu_disabled) continue; /* Don't let hyperthreads service interrupts. */ if (cpu_info[apic_id].cpu_hyperthread) continue; intr_add_cpu(i); } } #ifdef COUNT_XINVLTLB_HITS u_int xhits_gbl[MAXCPU]; u_int xhits_pg[MAXCPU]; u_int xhits_rng[MAXCPU]; static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, sizeof(xhits_gbl), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, sizeof(xhits_pg), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, sizeof(xhits_rng), "IU", ""); u_int ipi_global; u_int ipi_page; u_int ipi_range; u_int ipi_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 0, ""); #endif /* COUNT_XINVLTLB_HITS */ /* * Init and startup IPI. */ void ipi_startup(int apic_id, int vector) { /* * This attempts to follow the algorithm described in the * Intel Multiprocessor Specification v1.4 in section B.4. * For each IPI, we allow the local APIC ~20us to deliver the * IPI. If that times out, we panic. */ /* * first we do an INIT IPI: this INIT IPI might be run, resetting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); lapic_ipi_wait(100); /* Explicitly deassert the INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); DELAY(10000); /* wait ~10mS */ /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver first STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver second STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ } /* * Send an IPI to specified CPU handling the bitmap logic. */ void ipi_send_cpu(int cpu, u_int ipi) { u_int bitmap, old_pending, new_pending; KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (IPI_IS_BITMAPED(ipi)) { bitmap = 1 << ipi; ipi = IPI_BITMAP_VECTOR; do { old_pending = cpu_ipi_pending[cpu]; new_pending = old_pending | bitmap; } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending)); if (old_pending) return; } lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); } void ipi_bitmap_handler(struct trapframe frame) { struct trapframe *oldframe; struct thread *td; int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; critical_enter(); td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = &frame; ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(td); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { #ifdef COUNT_IPIS (*ipi_hardclock_counts[cpu])++; #endif hardclockintr(); } td->td_intr_frame = oldframe; td->td_intr_nesting_level--; critical_exit(); } /* * send an IPI to a set of cpus. */ void ipi_selected(cpuset_t cpus, u_int ipi) { int cpu; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); while ((cpu = CPU_FFS(&cpus)) != 0) { cpu--; CPU_CLR(cpu, &cpus); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { cpuset_t other_cpus; other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); if (IPI_IS_BITMAPED(ipi)) { ipi_selected(other_cpus, ipi); return; } /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } int ipi_nmi_handler(void) { u_int cpuid; /* * As long as there is not a simple way to know about a NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. */ cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) return (1); CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); cpustop_handler(); return (0); } #ifdef DEV_ISA int nmi_kdb_lock; -bool -nmi_call_kdb_smp(u_int type, struct trapframe *frame, bool do_panic) +void +nmi_call_kdb_smp(u_int type, struct trapframe *frame) { int cpu; - bool call_post, ret; + bool call_post; cpu = PCPU_GET(cpuid); if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) { - ret = nmi_call_kdb(cpu, type, frame, do_panic); + nmi_call_kdb(cpu, type, frame); call_post = false; } else { - ret = true; savectx(&stoppcbs[cpu]); CPU_SET_ATOMIC(cpu, &stopped_cpus); while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) ia32_pause(); call_post = true; } atomic_store_rel_int(&nmi_kdb_lock, 0); if (call_post) cpustop_handler_post(cpu); - return (ret); } #endif /* * Handle an IPI_STOP by saving our current context and spinning until we * are resumed. */ void cpustop_handler(void) { u_int cpu; cpu = PCPU_GET(cpuid); savectx(&stoppcbs[cpu]); /* Indicate that we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) ia32_pause(); cpustop_handler_post(cpu); } static void cpustop_handler_post(u_int cpu) { CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); #if defined(__amd64__) && defined(DDB) amd64_db_resume_dbreg(); #endif if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * Handle an IPI_SUSPEND by saving our current context and spinning until we * are resumed. */ void cpususpend_handler(void) { u_int cpu; mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); cpu = PCPU_GET(cpuid); if (savectx(&susppcbs[cpu]->sp_pcb)) { #ifdef __amd64__ fpususpend(susppcbs[cpu]->sp_fpususpend); #else npxsuspend(susppcbs[cpu]->sp_fpususpend); #endif wbinvd(); CPU_SET_ATOMIC(cpu, &suspended_cpus); } else { #ifdef __amd64__ fpuresume(susppcbs[cpu]->sp_fpususpend); #else npxresume(susppcbs[cpu]->sp_fpususpend); #endif pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); /* Indicate that we are resumed */ CPU_CLR_ATOMIC(cpu, &suspended_cpus); } /* Wait for resume */ while (!CPU_ISSET(cpu, &started_cpus)) ia32_pause(); if (cpu_ops.cpu_resume) cpu_ops.cpu_resume(); #ifdef __amd64__ if (vmm_resume_p) vmm_resume_p(); #endif /* Resume MCA and local APIC */ lapic_xapic_mode(); mca_resume(); lapic_setup(0); /* Indicate that we are resumed */ CPU_CLR_ATOMIC(cpu, &suspended_cpus); CPU_CLR_ATOMIC(cpu, &started_cpus); } void invlcache_handler(void) { uint32_t generation; #ifdef COUNT_IPIS (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ /* * Reading the generation here allows greater parallelism * since wbinvd is a serializing instruction. Without the * temporary, we'd wait for wbinvd to complete, then the read - * would execute, then the dependent write, whuch must then + * would execute, then the dependent write, which must then * complete before return from interrupt. */ generation = smp_tlb_generation; wbinvd(); PCPU_SET(smp_tlb_done, generation); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); #ifdef COUNT_IPIS /* * Setup interrupt counters for IPI handlers. */ static void mp_ipi_intrcnt(void *dummy) { char buf[64]; int i; CPU_FOREACH(i) { snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); intrcnt_add(buf, &ipi_invltlb_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); intrcnt_add(buf, &ipi_invlrng_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); intrcnt_add(buf, &ipi_invlpg_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); intrcnt_add(buf, &ipi_invlcache_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:preempt", i); intrcnt_add(buf, &ipi_preempt_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:ast", i); intrcnt_add(buf, &ipi_ast_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); intrcnt_add(buf, &ipi_rendezvous_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); intrcnt_add(buf, &ipi_hardclock_counts[i]); } } SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); #endif /* * Flush the TLB on other CPU's */ /* Variables needed for SMP tlb shootdown. */ static vm_offset_t smp_tlb_addr1, smp_tlb_addr2; pmap_t smp_tlb_pmap; volatile uint32_t smp_tlb_generation; #ifdef __amd64__ #define read_eflags() read_rflags() #endif static void smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2) { cpuset_t other_cpus; volatile uint32_t *p_cpudone; uint32_t generation; int cpu; /* * Check for other cpus. Return if none. */ if (CPU_ISFULLSET(&mask)) { if (mp_ncpus <= 1) return; } else { CPU_CLR(PCPU_GET(cpuid), &mask); if (CPU_EMPTY(&mask)) return; } if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; smp_tlb_pmap = pmap; generation = ++smp_tlb_generation; if (CPU_ISFULLSET(&mask)) { ipi_all_but_self(vector); other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); } else { other_cpus = mask; while ((cpu = CPU_FFS(&mask)) != 0) { cpu--; CPU_CLR(cpu, &mask); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, vector); ipi_send_cpu(cpu, vector); } } while ((cpu = CPU_FFS(&other_cpus)) != 0) { cpu--; CPU_CLR(cpu, &other_cpus); p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done; while (*p_cpudone != generation) ia32_pause(); } mtx_unlock_spin(&smp_ipi_mtx); } void smp_masked_invltlb(cpuset_t mask, pmap_t pmap) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_global++; #endif } } void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLPG, NULL, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_page++; #endif } } void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, NULL, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_range++; ipi_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void smp_cache_flush(void) { if (smp_started) { smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 0, 0); } } /* * Handlers for TLB related IPIs */ void invltlb_handler(void) { uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ /* * Reading the generation here allows greater parallelism * since invalidating the TLB is a serializing operation. */ generation = smp_tlb_generation; if (smp_tlb_pmap == kernel_pmap) invltlb_glob(); else invltlb(); PCPU_SET(smp_tlb_done, generation); } void invlpg_handler(void) { uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ generation = smp_tlb_generation; /* Overlap with serialization */ invlpg(smp_tlb_addr1); PCPU_SET(smp_tlb_done, generation); } void invlrng_handler(void) { vm_offset_t addr, addr2; uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_rng[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; addr2 = smp_tlb_addr2; generation = smp_tlb_generation; /* Overlap with serialization */ do { invlpg(addr); addr += PAGE_SIZE; } while (addr < addr2); PCPU_SET(smp_tlb_done, generation); } Index: user/alc/PQ_LAUNDRY/usr.sbin/mountd/mountd.8 =================================================================== --- user/alc/PQ_LAUNDRY/usr.sbin/mountd/mountd.8 (revision 307895) +++ user/alc/PQ_LAUNDRY/usr.sbin/mountd/mountd.8 (revision 307896) @@ -1,198 +1,198 @@ .\" Copyright (c) 1989, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 4. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)mountd.8 8.4 (Berkeley) 4/28/95 .\" $FreeBSD$ .\" -.Dd October 14, 2012 +.Dd October 24, 2016 .Dt MOUNTD 8 .Os .Sh NAME .Nm mountd .Nd service remote .Tn NFS mount requests .Sh SYNOPSIS .Nm .Op Fl 2delnrS .Op Fl h Ar bindip .Op Fl p Ar port .Op Ar exportsfile ... .Sh DESCRIPTION The .Nm utility is the server for .Tn NFS mount requests from other client machines. It listens for service requests at the port indicated in the .Tn NFS server specification; see .%T "Network File System Protocol Specification" , RFC1094, Appendix A and .%T "NFS: Network File System Version 3 Protocol Specification" , Appendix I. .Pp The following options are available: .Bl -tag -width indent .It Fl 2 Allow the administrator to force clients to use only the version 2 .Tn NFS protocol to mount file systems from this server. .It Fl d Output debugging information. .Nm will not detach from the controlling terminal and will print debugging messages to stderr. .It Fl e Ignored; included for backward compatibility. .It Fl h Ar bindip Specify specific IP addresses to bind to for TCP and UDP requests. This option may be specified multiple times. If no .Fl h option is specified, .Nm will bind to .Dv INADDR_ANY . Note that when specifying IP addresses with .Fl h , .Nm will automatically add .Li 127.0.0.1 and if IPv6 is enabled, .Li ::1 to the list. .It Fl l Cause all succeeded .Nm requests to be logged. .It Fl n Allow non-root mount requests to be served. This should only be specified if there are clients such as PC's, that require it. -It will automatically clear the vfs.nfsrv.nfs_privport sysctl flag, which +It will automatically clear the vfs.nfsd.nfs_privport sysctl flag, which controls if the kernel will accept NFS requests from reserved ports only. .It Fl p Ar port Force .Nm to bind to the specified port, for both .Dv AF_INET and .Dv AF_INET6 address families. This is typically done to ensure that the port which .Nm binds to is a known quantity which can be used in firewall rulesets. If .Nm cannot bind to this port, an appropriate error will be recorded in the system log, and the daemon will then exit. .It Fl r Allow mount RPCs requests for regular files to be served. Although this seems to violate the mount protocol specification, some diskless workstations do mount requests for their swapfiles and expect them to be regular files. Since a regular file cannot be specified in .Pa /etc/exports , the entire file system in which the swapfiles resides will have to be exported with the .Fl alldirs flag. .It Ar exportsfile Specify an alternate location for the exports file. More than one exports file can be specified. .It Fl S Tell mountd to suspend/resume execution of the nfsd threads whenever the exports list is being reloaded. This avoids intermittent access errors for clients that do NFS RPCs while the exports are being reloaded, but introduces a delay in RPC response while the reload is in progress. If .Nm crashes while an exports load is in progress, .Nm must be restarted to get the nfsd threads running again, if this option is used. .El .Pp When .Nm is started, it loads the export host addresses and options into the kernel using the .Xr mount 2 system call. After changing the exports file, a hangup signal should be sent to the .Nm daemon to get it to reload the export information. After sending the SIGHUP (kill \-s HUP `cat /var/run/mountd.pid`), check the syslog output to see if .Nm logged any parsing errors in the exports file. .Pp If .Nm detects that the running kernel does not include .Tn NFS support, it will attempt to load a loadable kernel module containing .Tn NFS code, using .Xr kldload 2 . If this fails, or no .Tn NFS KLD was available, .Nm exits with an error. .Sh FILES .Bl -tag -width /var/run/mountd.pid -compact .It Pa /etc/exports the list of exported file systems .It Pa /var/run/mountd.pid the pid of the currently running mountd .It Pa /var/db/mountdtab the current list of remote mounted file systems .El .Sh SEE ALSO .Xr nfsstat 1 , .Xr kldload 2 , .Xr nfsv4 4 , .Xr exports 5 , .Xr nfsd 8 , .Xr rpcbind 8 , .Xr showmount 8 .Sh HISTORY The .Nm utility first appeared in .Bx 4.4 . Index: user/alc/PQ_LAUNDRY/usr.sbin/mountd/mountd.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.sbin/mountd/mountd.c (revision 307895) +++ user/alc/PQ_LAUNDRY/usr.sbin/mountd/mountd.c (revision 307896) @@ -1,3290 +1,3290 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Herb Hasler and Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1989, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /*not lint*/ #if 0 #ifndef lint static char sccsid[] = "@(#)mountd.c 8.15 (Berkeley) 5/1/95"; #endif /*not lint*/ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pathnames.h" #include "mntopts.h" #ifdef DEBUG #include #endif /* * Structures for keeping the mount list and export list */ struct mountlist { struct mountlist *ml_next; char ml_host[MNTNAMLEN+1]; char ml_dirp[MNTPATHLEN+1]; }; struct dirlist { struct dirlist *dp_left; struct dirlist *dp_right; int dp_flag; struct hostlist *dp_hosts; /* List of hosts this dir exported to */ char dp_dirp[1]; /* Actually malloc'd to size of dir */ }; /* dp_flag bits */ #define DP_DEFSET 0x1 #define DP_HOSTSET 0x2 struct exportlist { struct exportlist *ex_next; struct dirlist *ex_dirl; struct dirlist *ex_defdir; int ex_flag; fsid_t ex_fs; char *ex_fsdir; char *ex_indexfile; int ex_numsecflavors; int ex_secflavors[MAXSECFLAVORS]; int ex_defnumsecflavors; int ex_defsecflavors[MAXSECFLAVORS]; }; /* ex_flag bits */ #define EX_LINKED 0x1 struct netmsk { struct sockaddr_storage nt_net; struct sockaddr_storage nt_mask; char *nt_name; }; union grouptypes { struct addrinfo *gt_addrinfo; struct netmsk gt_net; }; struct grouplist { int gr_type; union grouptypes gr_ptr; struct grouplist *gr_next; int gr_numsecflavors; int gr_secflavors[MAXSECFLAVORS]; }; /* Group types */ #define GT_NULL 0x0 #define GT_HOST 0x1 #define GT_NET 0x2 #define GT_DEFAULT 0x3 #define GT_IGNORE 0x5 struct hostlist { int ht_flag; /* Uses DP_xx bits */ struct grouplist *ht_grp; struct hostlist *ht_next; }; struct fhreturn { int fhr_flag; int fhr_vers; nfsfh_t fhr_fh; int fhr_numsecflavors; int *fhr_secflavors; }; #define GETPORT_MAXTRY 20 /* Max tries to get a port # */ /* Global defs */ static char *add_expdir(struct dirlist **, char *, int); static void add_dlist(struct dirlist **, struct dirlist *, struct grouplist *, int, struct exportlist *); static void add_mlist(char *, char *); static int check_dirpath(char *); static int check_options(struct dirlist *); static int checkmask(struct sockaddr *sa); static int chk_host(struct dirlist *, struct sockaddr *, int *, int *, int *, int **); static char *strsep_quote(char **stringp, const char *delim); static int create_service(struct netconfig *nconf); static void complete_service(struct netconfig *nconf, char *port_str); static void clearout_service(void); static void del_mlist(char *hostp, char *dirp); static struct dirlist *dirp_search(struct dirlist *, char *); static int do_mount(struct exportlist *, struct grouplist *, int, struct xucred *, char *, int, struct statfs *); static int do_opt(char **, char **, struct exportlist *, struct grouplist *, int *, int *, struct xucred *); static struct exportlist *ex_search(fsid_t *); static struct exportlist *get_exp(void); static void free_dir(struct dirlist *); static void free_exp(struct exportlist *); static void free_grp(struct grouplist *); static void free_host(struct hostlist *); static void get_exportlist(void); static int get_host(char *, struct grouplist *, struct grouplist *); static struct hostlist *get_ht(void); static int get_line(void); static void get_mountlist(void); static int get_net(char *, struct netmsk *, int); static void getexp_err(struct exportlist *, struct grouplist *); static struct grouplist *get_grp(void); static void hang_dirp(struct dirlist *, struct grouplist *, struct exportlist *, int); static void huphandler(int sig); static int makemask(struct sockaddr_storage *ssp, int bitlen); static void mntsrv(struct svc_req *, SVCXPRT *); static void nextfield(char **, char **); static void out_of_mem(void); static void parsecred(char *, struct xucred *); static int parsesec(char *, struct exportlist *); static int put_exlist(struct dirlist *, XDR *, struct dirlist *, int *, int); static void *sa_rawaddr(struct sockaddr *sa, int *nbytes); static int sacmp(struct sockaddr *sa1, struct sockaddr *sa2, struct sockaddr *samask); static int scan_tree(struct dirlist *, struct sockaddr *); static void usage(void); static int xdr_dir(XDR *, char *); static int xdr_explist(XDR *, caddr_t); static int xdr_explist_brief(XDR *, caddr_t); static int xdr_explist_common(XDR *, caddr_t, int); static int xdr_fhs(XDR *, caddr_t); static int xdr_mlist(XDR *, caddr_t); static void terminate(int); static struct exportlist *exphead; static struct mountlist *mlhead; static struct grouplist *grphead; static char *exnames_default[2] = { _PATH_EXPORTS, NULL }; static char **exnames; static char **hosts = NULL; static struct xucred def_anon = { XUCRED_VERSION, (uid_t)-2, 1, { (gid_t)-2 }, NULL }; static int force_v2 = 0; static int resvport_only = 1; static int nhosts = 0; static int dir_only = 1; static int dolog = 0; static int got_sighup = 0; static int xcreated = 0; static char *svcport_str = NULL; static int mallocd_svcport = 0; static int *sock_fd; static int sock_fdcnt; static int sock_fdpos; static int suspend_nfsd = 0; static int opt_flags; static int have_v6 = 1; static int v4root_phase = 0; static char v4root_dirpath[PATH_MAX + 1]; static int has_publicfh = 0; static struct pidfh *pfh = NULL; /* Bits for opt_flags above */ #define OP_MAPROOT 0x01 #define OP_MAPALL 0x02 /* 0x4 free */ #define OP_MASK 0x08 #define OP_NET 0x10 #define OP_ALLDIRS 0x40 #define OP_HAVEMASK 0x80 /* A mask was specified or inferred. */ #define OP_QUIET 0x100 #define OP_MASKLEN 0x200 #define OP_SEC 0x400 #ifdef DEBUG static int debug = 1; static void SYSLOG(int, const char *, ...) __printflike(2, 3); #define syslog SYSLOG #else static int debug = 0; #endif /* * Similar to strsep(), but it allows for quoted strings * and escaped characters. * * It returns the string (or NULL, if *stringp is NULL), * which is a de-quoted version of the string if necessary. * * It modifies *stringp in place. */ static char * strsep_quote(char **stringp, const char *delim) { char *srcptr, *dstptr, *retval; char quot = 0; if (stringp == NULL || *stringp == NULL) return (NULL); srcptr = dstptr = retval = *stringp; while (*srcptr) { /* * We're looking for several edge cases here. * First: if we're in quote state (quot != 0), * then we ignore the delim characters, but otherwise * process as normal, unless it is the quote character. * Second: if the current character is a backslash, * we take the next character as-is, without checking * for delim, quote, or backslash. Exception: if the * next character is a NUL, that's the end of the string. * Third: if the character is a quote character, we toggle * quote state. * Otherwise: check the current character for NUL, or * being in delim, and end the string if either is true. */ if (*srcptr == '\\') { srcptr++; /* * The edge case here is if the next character * is NUL, we want to stop processing. But if * it's not NUL, then we simply want to copy it. */ if (*srcptr) { *dstptr++ = *srcptr++; } continue; } if (quot == 0 && (*srcptr == '\'' || *srcptr == '"')) { quot = *srcptr++; continue; } if (quot && *srcptr == quot) { /* End of the quoted part */ quot = 0; srcptr++; continue; } if (!quot && strchr(delim, *srcptr)) break; *dstptr++ = *srcptr++; } *dstptr = 0; /* Terminate the string */ *stringp = (*srcptr == '\0') ? NULL : srcptr + 1; return (retval); } /* * Mountd server for NFS mount protocol as described in: * NFS: Network File System Protocol Specification, RFC1094, Appendix A * The optional arguments are the exports file name * default: _PATH_EXPORTS * and "-n" to allow nonroot mount. */ int main(int argc, char **argv) { fd_set readfds; struct netconfig *nconf; char *endptr, **hosts_bak; void *nc_handle; pid_t otherpid; in_port_t svcport; int c, k, s; int maxrec = RPC_MAXDATASIZE; int attempt_cnt, port_len, port_pos, ret; char **port_list; /* Check that another mountd isn't already running. */ pfh = pidfile_open(_PATH_MOUNTDPID, 0600, &otherpid); if (pfh == NULL) { if (errno == EEXIST) errx(1, "mountd already running, pid: %d.", otherpid); warn("cannot open or create pidfile"); } s = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); if (s < 0) have_v6 = 0; else close(s); while ((c = getopt(argc, argv, "2deh:lnp:rS")) != -1) switch (c) { case '2': force_v2 = 1; break; case 'e': /* now a no-op, since this is the default */ break; case 'n': resvport_only = 0; break; case 'r': dir_only = 0; break; case 'd': debug = debug ? 0 : 1; break; case 'l': dolog = 1; break; case 'p': endptr = NULL; svcport = (in_port_t)strtoul(optarg, &endptr, 10); if (endptr == NULL || *endptr != '\0' || svcport == 0 || svcport >= IPPORT_MAX) usage(); svcport_str = strdup(optarg); break; case 'h': ++nhosts; hosts_bak = hosts; hosts_bak = realloc(hosts, nhosts * sizeof(char *)); if (hosts_bak == NULL) { if (hosts != NULL) { for (k = 0; k < nhosts; k++) free(hosts[k]); free(hosts); out_of_mem(); } } hosts = hosts_bak; hosts[nhosts - 1] = strdup(optarg); if (hosts[nhosts - 1] == NULL) { for (k = 0; k < (nhosts - 1); k++) free(hosts[k]); free(hosts); out_of_mem(); } break; case 'S': suspend_nfsd = 1; break; default: usage(); } if (modfind("nfsd") < 0) { /* Not present in kernel, try loading it */ if (kldload("nfsd") < 0 || modfind("nfsd") < 0) errx(1, "NFS server is not available"); } argc -= optind; argv += optind; grphead = (struct grouplist *)NULL; exphead = (struct exportlist *)NULL; mlhead = (struct mountlist *)NULL; if (argc > 0) exnames = argv; else exnames = exnames_default; openlog("mountd", LOG_PID, LOG_DAEMON); if (debug) warnx("getting export list"); get_exportlist(); if (debug) warnx("getting mount list"); get_mountlist(); if (debug) warnx("here we go"); if (debug == 0) { daemon(0, 0); signal(SIGINT, SIG_IGN); signal(SIGQUIT, SIG_IGN); } signal(SIGHUP, huphandler); signal(SIGTERM, terminate); signal(SIGPIPE, SIG_IGN); pidfile_write(pfh); rpcb_unset(MOUNTPROG, MOUNTVERS, NULL); rpcb_unset(MOUNTPROG, MOUNTVERS3, NULL); rpc_control(RPC_SVC_CONNMAXREC_SET, &maxrec); if (!resvport_only) { - if (sysctlbyname("vfs.nfsrv.nfs_privport", NULL, NULL, + if (sysctlbyname("vfs.nfsd.nfs_privport", NULL, NULL, &resvport_only, sizeof(resvport_only)) != 0 && errno != ENOENT) { syslog(LOG_ERR, "sysctl: %m"); exit(1); } } /* * If no hosts were specified, add a wildcard entry to bind to * INADDR_ANY. Otherwise make sure 127.0.0.1 and ::1 are added to the * list. */ if (nhosts == 0) { hosts = malloc(sizeof(char *)); if (hosts == NULL) out_of_mem(); hosts[0] = "*"; nhosts = 1; } else { hosts_bak = hosts; if (have_v6) { hosts_bak = realloc(hosts, (nhosts + 2) * sizeof(char *)); if (hosts_bak == NULL) { for (k = 0; k < nhosts; k++) free(hosts[k]); free(hosts); out_of_mem(); } else hosts = hosts_bak; nhosts += 2; hosts[nhosts - 2] = "::1"; } else { hosts_bak = realloc(hosts, (nhosts + 1) * sizeof(char *)); if (hosts_bak == NULL) { for (k = 0; k < nhosts; k++) free(hosts[k]); free(hosts); out_of_mem(); } else { nhosts += 1; hosts = hosts_bak; } } hosts[nhosts - 1] = "127.0.0.1"; } attempt_cnt = 1; sock_fdcnt = 0; sock_fd = NULL; port_list = NULL; port_len = 0; nc_handle = setnetconfig(); while ((nconf = getnetconfig(nc_handle))) { if (nconf->nc_flag & NC_VISIBLE) { if (have_v6 == 0 && strcmp(nconf->nc_protofmly, "inet6") == 0) { /* DO NOTHING */ } else { ret = create_service(nconf); if (ret == 1) /* Ignore this call */ continue; if (ret < 0) { /* * Failed to bind port, so close off * all sockets created and try again * if the port# was dynamically * assigned via bind(2). */ clearout_service(); if (mallocd_svcport != 0 && attempt_cnt < GETPORT_MAXTRY) { free(svcport_str); svcport_str = NULL; mallocd_svcport = 0; } else { errno = EADDRINUSE; syslog(LOG_ERR, "bindresvport_sa: %m"); exit(1); } /* Start over at the first service. */ free(sock_fd); sock_fdcnt = 0; sock_fd = NULL; nc_handle = setnetconfig(); attempt_cnt++; } else if (mallocd_svcport != 0 && attempt_cnt == GETPORT_MAXTRY) { /* * For the last attempt, allow * different port #s for each nconf * by saving the svcport_str and * setting it back to NULL. */ port_list = realloc(port_list, (port_len + 1) * sizeof(char *)); if (port_list == NULL) out_of_mem(); port_list[port_len++] = svcport_str; svcport_str = NULL; mallocd_svcport = 0; } } } } /* * Successfully bound the ports, so call complete_service() to * do the rest of the setup on the service(s). */ sock_fdpos = 0; port_pos = 0; nc_handle = setnetconfig(); while ((nconf = getnetconfig(nc_handle))) { if (nconf->nc_flag & NC_VISIBLE) { if (have_v6 == 0 && strcmp(nconf->nc_protofmly, "inet6") == 0) { /* DO NOTHING */ } else if (port_list != NULL) { if (port_pos >= port_len) { syslog(LOG_ERR, "too many port#s"); exit(1); } complete_service(nconf, port_list[port_pos++]); } else complete_service(nconf, svcport_str); } } endnetconfig(nc_handle); free(sock_fd); if (port_list != NULL) { for (port_pos = 0; port_pos < port_len; port_pos++) free(port_list[port_pos]); free(port_list); } if (xcreated == 0) { syslog(LOG_ERR, "could not create any services"); exit(1); } /* Expand svc_run() here so that we can call get_exportlist(). */ for (;;) { if (got_sighup) { get_exportlist(); got_sighup = 0; } readfds = svc_fdset; switch (select(svc_maxfd + 1, &readfds, NULL, NULL, NULL)) { case -1: if (errno == EINTR) continue; syslog(LOG_ERR, "mountd died: select: %m"); exit(1); case 0: continue; default: svc_getreqset(&readfds); } } } /* * This routine creates and binds sockets on the appropriate * addresses. It gets called one time for each transport. * It returns 0 upon success, 1 for ingore the call and -1 to indicate * bind failed with EADDRINUSE. * Any file descriptors that have been created are stored in sock_fd and * the total count of them is maintained in sock_fdcnt. */ static int create_service(struct netconfig *nconf) { struct addrinfo hints, *res = NULL; struct sockaddr_in *sin; struct sockaddr_in6 *sin6; struct __rpc_sockinfo si; int aicode; int fd; int nhostsbak; int one = 1; int r; u_int32_t host_addr[4]; /* IPv4 or IPv6 */ int mallocd_res; if ((nconf->nc_semantics != NC_TPI_CLTS) && (nconf->nc_semantics != NC_TPI_COTS) && (nconf->nc_semantics != NC_TPI_COTS_ORD)) return (1); /* not my type */ /* * XXX - using RPC library internal functions. */ if (!__rpc_nconf2sockinfo(nconf, &si)) { syslog(LOG_ERR, "cannot get information for %s", nconf->nc_netid); return (1); } /* Get mountd's address on this transport */ memset(&hints, 0, sizeof hints); hints.ai_family = si.si_af; hints.ai_socktype = si.si_socktype; hints.ai_protocol = si.si_proto; /* * Bind to specific IPs if asked to */ nhostsbak = nhosts; while (nhostsbak > 0) { --nhostsbak; sock_fd = realloc(sock_fd, (sock_fdcnt + 1) * sizeof(int)); if (sock_fd == NULL) out_of_mem(); sock_fd[sock_fdcnt++] = -1; /* Set invalid for now. */ mallocd_res = 0; hints.ai_flags = AI_PASSIVE; /* * XXX - using RPC library internal functions. */ if ((fd = __rpc_nconf2fd(nconf)) < 0) { int non_fatal = 0; if (errno == EAFNOSUPPORT && nconf->nc_semantics != NC_TPI_CLTS) non_fatal = 1; syslog(non_fatal ? LOG_DEBUG : LOG_ERR, "cannot create socket for %s", nconf->nc_netid); if (non_fatal != 0) continue; exit(1); } switch (hints.ai_family) { case AF_INET: if (inet_pton(AF_INET, hosts[nhostsbak], host_addr) == 1) { hints.ai_flags |= AI_NUMERICHOST; } else { /* * Skip if we have an AF_INET6 address. */ if (inet_pton(AF_INET6, hosts[nhostsbak], host_addr) == 1) { close(fd); continue; } } break; case AF_INET6: if (inet_pton(AF_INET6, hosts[nhostsbak], host_addr) == 1) { hints.ai_flags |= AI_NUMERICHOST; } else { /* * Skip if we have an AF_INET address. */ if (inet_pton(AF_INET, hosts[nhostsbak], host_addr) == 1) { close(fd); continue; } } /* * We're doing host-based access checks here, so don't * allow v4-in-v6 to confuse things. The kernel will * disable it by default on NFS sockets too. */ if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof one) < 0) { syslog(LOG_ERR, "can't disable v4-in-v6 on IPv6 socket"); exit(1); } break; default: break; } /* * If no hosts were specified, just bind to INADDR_ANY */ if (strcmp("*", hosts[nhostsbak]) == 0) { if (svcport_str == NULL) { res = malloc(sizeof(struct addrinfo)); if (res == NULL) out_of_mem(); mallocd_res = 1; res->ai_flags = hints.ai_flags; res->ai_family = hints.ai_family; res->ai_protocol = hints.ai_protocol; switch (res->ai_family) { case AF_INET: sin = malloc(sizeof(struct sockaddr_in)); if (sin == NULL) out_of_mem(); sin->sin_family = AF_INET; sin->sin_port = htons(0); sin->sin_addr.s_addr = htonl(INADDR_ANY); res->ai_addr = (struct sockaddr*) sin; res->ai_addrlen = (socklen_t) sizeof(struct sockaddr_in); break; case AF_INET6: sin6 = malloc(sizeof(struct sockaddr_in6)); if (sin6 == NULL) out_of_mem(); sin6->sin6_family = AF_INET6; sin6->sin6_port = htons(0); sin6->sin6_addr = in6addr_any; res->ai_addr = (struct sockaddr*) sin6; res->ai_addrlen = (socklen_t) sizeof(struct sockaddr_in6); break; default: syslog(LOG_ERR, "bad addr fam %d", res->ai_family); exit(1); } } else { if ((aicode = getaddrinfo(NULL, svcport_str, &hints, &res)) != 0) { syslog(LOG_ERR, "cannot get local address for %s: %s", nconf->nc_netid, gai_strerror(aicode)); close(fd); continue; } } } else { if ((aicode = getaddrinfo(hosts[nhostsbak], svcport_str, &hints, &res)) != 0) { syslog(LOG_ERR, "cannot get local address for %s: %s", nconf->nc_netid, gai_strerror(aicode)); close(fd); continue; } } /* Store the fd. */ sock_fd[sock_fdcnt - 1] = fd; /* Now, attempt the bind. */ r = bindresvport_sa(fd, res->ai_addr); if (r != 0) { if (errno == EADDRINUSE && mallocd_svcport != 0) { if (mallocd_res != 0) { free(res->ai_addr); free(res); } else freeaddrinfo(res); return (-1); } syslog(LOG_ERR, "bindresvport_sa: %m"); exit(1); } if (svcport_str == NULL) { svcport_str = malloc(NI_MAXSERV * sizeof(char)); if (svcport_str == NULL) out_of_mem(); mallocd_svcport = 1; if (getnameinfo(res->ai_addr, res->ai_addr->sa_len, NULL, NI_MAXHOST, svcport_str, NI_MAXSERV * sizeof(char), NI_NUMERICHOST | NI_NUMERICSERV)) errx(1, "Cannot get port number"); } if (mallocd_res != 0) { free(res->ai_addr); free(res); } else freeaddrinfo(res); res = NULL; } return (0); } /* * Called after all the create_service() calls have succeeded, to complete * the setup and registration. */ static void complete_service(struct netconfig *nconf, char *port_str) { struct addrinfo hints, *res = NULL; struct __rpc_sockinfo si; struct netbuf servaddr; SVCXPRT *transp = NULL; int aicode, fd, nhostsbak; int registered = 0; if ((nconf->nc_semantics != NC_TPI_CLTS) && (nconf->nc_semantics != NC_TPI_COTS) && (nconf->nc_semantics != NC_TPI_COTS_ORD)) return; /* not my type */ /* * XXX - using RPC library internal functions. */ if (!__rpc_nconf2sockinfo(nconf, &si)) { syslog(LOG_ERR, "cannot get information for %s", nconf->nc_netid); return; } nhostsbak = nhosts; while (nhostsbak > 0) { --nhostsbak; if (sock_fdpos >= sock_fdcnt) { /* Should never happen. */ syslog(LOG_ERR, "Ran out of socket fd's"); return; } fd = sock_fd[sock_fdpos++]; if (fd < 0) continue; if (nconf->nc_semantics != NC_TPI_CLTS) listen(fd, SOMAXCONN); if (nconf->nc_semantics == NC_TPI_CLTS ) transp = svc_dg_create(fd, 0, 0); else transp = svc_vc_create(fd, RPC_MAXDATASIZE, RPC_MAXDATASIZE); if (transp != (SVCXPRT *) NULL) { if (!svc_reg(transp, MOUNTPROG, MOUNTVERS, mntsrv, NULL)) syslog(LOG_ERR, "can't register %s MOUNTVERS service", nconf->nc_netid); if (!force_v2) { if (!svc_reg(transp, MOUNTPROG, MOUNTVERS3, mntsrv, NULL)) syslog(LOG_ERR, "can't register %s MOUNTVERS3 service", nconf->nc_netid); } } else syslog(LOG_WARNING, "can't create %s services", nconf->nc_netid); if (registered == 0) { registered = 1; memset(&hints, 0, sizeof hints); hints.ai_flags = AI_PASSIVE; hints.ai_family = si.si_af; hints.ai_socktype = si.si_socktype; hints.ai_protocol = si.si_proto; if ((aicode = getaddrinfo(NULL, port_str, &hints, &res)) != 0) { syslog(LOG_ERR, "cannot get local address: %s", gai_strerror(aicode)); exit(1); } servaddr.buf = malloc(res->ai_addrlen); memcpy(servaddr.buf, res->ai_addr, res->ai_addrlen); servaddr.len = res->ai_addrlen; rpcb_set(MOUNTPROG, MOUNTVERS, nconf, &servaddr); rpcb_set(MOUNTPROG, MOUNTVERS3, nconf, &servaddr); xcreated++; freeaddrinfo(res); } } /* end while */ } /* * Clear out sockets after a failure to bind one of them, so that the * cycle of socket creation/binding can start anew. */ static void clearout_service(void) { int i; for (i = 0; i < sock_fdcnt; i++) { if (sock_fd[i] >= 0) { shutdown(sock_fd[i], SHUT_RDWR); close(sock_fd[i]); } } } static void usage(void) { fprintf(stderr, "usage: mountd [-2] [-d] [-e] [-l] [-n] [-p ] [-r] " "[-S] [-h ] [export_file ...]\n"); exit(1); } /* * The mount rpc service */ void mntsrv(struct svc_req *rqstp, SVCXPRT *transp) { struct exportlist *ep; struct dirlist *dp; struct fhreturn fhr; struct stat stb; struct statfs fsb; char host[NI_MAXHOST], numerichost[NI_MAXHOST]; int lookup_failed = 1; struct sockaddr *saddr; u_short sport; char rpcpath[MNTPATHLEN + 1], dirpath[MAXPATHLEN]; int bad = 0, defset, hostset; sigset_t sighup_mask; int numsecflavors, *secflavorsp; sigemptyset(&sighup_mask); sigaddset(&sighup_mask, SIGHUP); saddr = svc_getrpccaller(transp)->buf; switch (saddr->sa_family) { case AF_INET6: sport = ntohs(((struct sockaddr_in6 *)saddr)->sin6_port); break; case AF_INET: sport = ntohs(((struct sockaddr_in *)saddr)->sin_port); break; default: syslog(LOG_ERR, "request from unknown address family"); return; } lookup_failed = getnameinfo(saddr, saddr->sa_len, host, sizeof host, NULL, 0, 0); getnameinfo(saddr, saddr->sa_len, numerichost, sizeof numerichost, NULL, 0, NI_NUMERICHOST); switch (rqstp->rq_proc) { case NULLPROC: if (!svc_sendreply(transp, (xdrproc_t)xdr_void, NULL)) syslog(LOG_ERR, "can't send reply"); return; case MOUNTPROC_MNT: if (sport >= IPPORT_RESERVED && resvport_only) { syslog(LOG_NOTICE, "mount request from %s from unprivileged port", numerichost); svcerr_weakauth(transp); return; } if (!svc_getargs(transp, (xdrproc_t)xdr_dir, rpcpath)) { syslog(LOG_NOTICE, "undecodable mount request from %s", numerichost); svcerr_decode(transp); return; } /* * Get the real pathname and make sure it is a directory * or a regular file if the -r option was specified * and it exists. */ if (realpath(rpcpath, dirpath) == NULL || stat(dirpath, &stb) < 0 || (!S_ISDIR(stb.st_mode) && (dir_only || !S_ISREG(stb.st_mode))) || statfs(dirpath, &fsb) < 0) { chdir("/"); /* Just in case realpath doesn't */ syslog(LOG_NOTICE, "mount request from %s for non existent path %s", numerichost, dirpath); if (debug) warnx("stat failed on %s", dirpath); bad = ENOENT; /* We will send error reply later */ } /* Check in the exports list */ sigprocmask(SIG_BLOCK, &sighup_mask, NULL); ep = ex_search(&fsb.f_fsid); hostset = defset = 0; if (ep && (chk_host(ep->ex_defdir, saddr, &defset, &hostset, &numsecflavors, &secflavorsp) || ((dp = dirp_search(ep->ex_dirl, dirpath)) && chk_host(dp, saddr, &defset, &hostset, &numsecflavors, &secflavorsp)) || (defset && scan_tree(ep->ex_defdir, saddr) == 0 && scan_tree(ep->ex_dirl, saddr) == 0))) { if (bad) { if (!svc_sendreply(transp, (xdrproc_t)xdr_long, (caddr_t)&bad)) syslog(LOG_ERR, "can't send reply"); sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL); return; } if (hostset & DP_HOSTSET) { fhr.fhr_flag = hostset; fhr.fhr_numsecflavors = numsecflavors; fhr.fhr_secflavors = secflavorsp; } else { fhr.fhr_flag = defset; fhr.fhr_numsecflavors = ep->ex_defnumsecflavors; fhr.fhr_secflavors = ep->ex_defsecflavors; } fhr.fhr_vers = rqstp->rq_vers; /* Get the file handle */ memset(&fhr.fhr_fh, 0, sizeof(nfsfh_t)); if (getfh(dirpath, (fhandle_t *)&fhr.fhr_fh) < 0) { bad = errno; syslog(LOG_ERR, "can't get fh for %s", dirpath); if (!svc_sendreply(transp, (xdrproc_t)xdr_long, (caddr_t)&bad)) syslog(LOG_ERR, "can't send reply"); sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL); return; } if (!svc_sendreply(transp, (xdrproc_t)xdr_fhs, (caddr_t)&fhr)) syslog(LOG_ERR, "can't send reply"); if (!lookup_failed) add_mlist(host, dirpath); else add_mlist(numerichost, dirpath); if (debug) warnx("mount successful"); if (dolog) syslog(LOG_NOTICE, "mount request succeeded from %s for %s", numerichost, dirpath); } else { bad = EACCES; syslog(LOG_NOTICE, "mount request denied from %s for %s", numerichost, dirpath); } if (bad && !svc_sendreply(transp, (xdrproc_t)xdr_long, (caddr_t)&bad)) syslog(LOG_ERR, "can't send reply"); sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL); return; case MOUNTPROC_DUMP: if (!svc_sendreply(transp, (xdrproc_t)xdr_mlist, (caddr_t)NULL)) syslog(LOG_ERR, "can't send reply"); else if (dolog) syslog(LOG_NOTICE, "dump request succeeded from %s", numerichost); return; case MOUNTPROC_UMNT: if (sport >= IPPORT_RESERVED && resvport_only) { syslog(LOG_NOTICE, "umount request from %s from unprivileged port", numerichost); svcerr_weakauth(transp); return; } if (!svc_getargs(transp, (xdrproc_t)xdr_dir, rpcpath)) { syslog(LOG_NOTICE, "undecodable umount request from %s", numerichost); svcerr_decode(transp); return; } if (realpath(rpcpath, dirpath) == NULL) { syslog(LOG_NOTICE, "umount request from %s " "for non existent path %s", numerichost, dirpath); } if (!svc_sendreply(transp, (xdrproc_t)xdr_void, (caddr_t)NULL)) syslog(LOG_ERR, "can't send reply"); if (!lookup_failed) del_mlist(host, dirpath); del_mlist(numerichost, dirpath); if (dolog) syslog(LOG_NOTICE, "umount request succeeded from %s for %s", numerichost, dirpath); return; case MOUNTPROC_UMNTALL: if (sport >= IPPORT_RESERVED && resvport_only) { syslog(LOG_NOTICE, "umountall request from %s from unprivileged port", numerichost); svcerr_weakauth(transp); return; } if (!svc_sendreply(transp, (xdrproc_t)xdr_void, (caddr_t)NULL)) syslog(LOG_ERR, "can't send reply"); if (!lookup_failed) del_mlist(host, NULL); del_mlist(numerichost, NULL); if (dolog) syslog(LOG_NOTICE, "umountall request succeeded from %s", numerichost); return; case MOUNTPROC_EXPORT: if (!svc_sendreply(transp, (xdrproc_t)xdr_explist, (caddr_t)NULL)) if (!svc_sendreply(transp, (xdrproc_t)xdr_explist_brief, (caddr_t)NULL)) syslog(LOG_ERR, "can't send reply"); if (dolog) syslog(LOG_NOTICE, "export request succeeded from %s", numerichost); return; default: svcerr_noproc(transp); return; } } /* * Xdr conversion for a dirpath string */ static int xdr_dir(XDR *xdrsp, char *dirp) { return (xdr_string(xdrsp, &dirp, MNTPATHLEN)); } /* * Xdr routine to generate file handle reply */ static int xdr_fhs(XDR *xdrsp, caddr_t cp) { struct fhreturn *fhrp = (struct fhreturn *)cp; u_long ok = 0, len, auth; int i; if (!xdr_long(xdrsp, &ok)) return (0); switch (fhrp->fhr_vers) { case 1: return (xdr_opaque(xdrsp, (caddr_t)&fhrp->fhr_fh, NFSX_V2FH)); case 3: len = NFSX_V3FH; if (!xdr_long(xdrsp, &len)) return (0); if (!xdr_opaque(xdrsp, (caddr_t)&fhrp->fhr_fh, len)) return (0); if (fhrp->fhr_numsecflavors) { if (!xdr_int(xdrsp, &fhrp->fhr_numsecflavors)) return (0); for (i = 0; i < fhrp->fhr_numsecflavors; i++) if (!xdr_int(xdrsp, &fhrp->fhr_secflavors[i])) return (0); return (1); } else { auth = AUTH_SYS; len = 1; if (!xdr_long(xdrsp, &len)) return (0); return (xdr_long(xdrsp, &auth)); } } return (0); } static int xdr_mlist(XDR *xdrsp, caddr_t cp __unused) { struct mountlist *mlp; int true = 1; int false = 0; char *strp; mlp = mlhead; while (mlp) { if (!xdr_bool(xdrsp, &true)) return (0); strp = &mlp->ml_host[0]; if (!xdr_string(xdrsp, &strp, MNTNAMLEN)) return (0); strp = &mlp->ml_dirp[0]; if (!xdr_string(xdrsp, &strp, MNTPATHLEN)) return (0); mlp = mlp->ml_next; } if (!xdr_bool(xdrsp, &false)) return (0); return (1); } /* * Xdr conversion for export list */ static int xdr_explist_common(XDR *xdrsp, caddr_t cp __unused, int brief) { struct exportlist *ep; int false = 0; int putdef; sigset_t sighup_mask; sigemptyset(&sighup_mask); sigaddset(&sighup_mask, SIGHUP); sigprocmask(SIG_BLOCK, &sighup_mask, NULL); ep = exphead; while (ep) { putdef = 0; if (put_exlist(ep->ex_dirl, xdrsp, ep->ex_defdir, &putdef, brief)) goto errout; if (ep->ex_defdir && putdef == 0 && put_exlist(ep->ex_defdir, xdrsp, (struct dirlist *)NULL, &putdef, brief)) goto errout; ep = ep->ex_next; } sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL); if (!xdr_bool(xdrsp, &false)) return (0); return (1); errout: sigprocmask(SIG_UNBLOCK, &sighup_mask, NULL); return (0); } /* * Called from xdr_explist() to traverse the tree and export the * directory paths. */ static int put_exlist(struct dirlist *dp, XDR *xdrsp, struct dirlist *adp, int *putdefp, int brief) { struct grouplist *grp; struct hostlist *hp; int true = 1; int false = 0; int gotalldir = 0; char *strp; if (dp) { if (put_exlist(dp->dp_left, xdrsp, adp, putdefp, brief)) return (1); if (!xdr_bool(xdrsp, &true)) return (1); strp = dp->dp_dirp; if (!xdr_string(xdrsp, &strp, MNTPATHLEN)) return (1); if (adp && !strcmp(dp->dp_dirp, adp->dp_dirp)) { gotalldir = 1; *putdefp = 1; } if (brief) { if (!xdr_bool(xdrsp, &true)) return (1); strp = "(...)"; if (!xdr_string(xdrsp, &strp, MNTPATHLEN)) return (1); } else if ((dp->dp_flag & DP_DEFSET) == 0 && (gotalldir == 0 || (adp->dp_flag & DP_DEFSET) == 0)) { hp = dp->dp_hosts; while (hp) { grp = hp->ht_grp; if (grp->gr_type == GT_HOST) { if (!xdr_bool(xdrsp, &true)) return (1); strp = grp->gr_ptr.gt_addrinfo->ai_canonname; if (!xdr_string(xdrsp, &strp, MNTNAMLEN)) return (1); } else if (grp->gr_type == GT_NET) { if (!xdr_bool(xdrsp, &true)) return (1); strp = grp->gr_ptr.gt_net.nt_name; if (!xdr_string(xdrsp, &strp, MNTNAMLEN)) return (1); } hp = hp->ht_next; if (gotalldir && hp == (struct hostlist *)NULL) { hp = adp->dp_hosts; gotalldir = 0; } } } if (!xdr_bool(xdrsp, &false)) return (1); if (put_exlist(dp->dp_right, xdrsp, adp, putdefp, brief)) return (1); } return (0); } static int xdr_explist(XDR *xdrsp, caddr_t cp) { return xdr_explist_common(xdrsp, cp, 0); } static int xdr_explist_brief(XDR *xdrsp, caddr_t cp) { return xdr_explist_common(xdrsp, cp, 1); } static char *line; static size_t linesize; static FILE *exp_file; /* * Get the export list from one, currently open file */ static void get_exportlist_one(void) { struct exportlist *ep, *ep2; struct grouplist *grp, *tgrp; struct exportlist **epp; struct dirlist *dirhead; struct statfs fsb; struct xucred anon; char *cp, *endcp, *dirp, *hst, *usr, *dom, savedc; int len, has_host, exflags, got_nondir, dirplen, netgrp; v4root_phase = 0; dirhead = (struct dirlist *)NULL; while (get_line()) { if (debug) warnx("got line %s", line); cp = line; nextfield(&cp, &endcp); if (*cp == '#') goto nextline; /* * Set defaults. */ has_host = FALSE; anon = def_anon; exflags = MNT_EXPORTED; got_nondir = 0; opt_flags = 0; ep = (struct exportlist *)NULL; dirp = NULL; /* * Handle the V4 root dir. */ if (*cp == 'V' && *(cp + 1) == '4' && *(cp + 2) == ':') { /* * V4: just indicates that it is the v4 root point, * so skip over that and set v4root_phase. */ if (v4root_phase > 0) { syslog(LOG_ERR, "V4:duplicate line, ignored"); goto nextline; } v4root_phase = 1; cp += 3; nextfield(&cp, &endcp); } /* * Create new exports list entry */ len = endcp-cp; tgrp = grp = get_grp(); while (len > 0) { if (len > MNTNAMLEN) { getexp_err(ep, tgrp); goto nextline; } if (*cp == '-') { if (ep == (struct exportlist *)NULL) { getexp_err(ep, tgrp); goto nextline; } if (debug) warnx("doing opt %s", cp); got_nondir = 1; if (do_opt(&cp, &endcp, ep, grp, &has_host, &exflags, &anon)) { getexp_err(ep, tgrp); goto nextline; } } else if (*cp == '/') { savedc = *endcp; *endcp = '\0'; if (v4root_phase > 1) { if (dirp != NULL) { syslog(LOG_ERR, "Multiple V4 dirs"); getexp_err(ep, tgrp); goto nextline; } } if (check_dirpath(cp) && statfs(cp, &fsb) >= 0) { if ((fsb.f_flags & MNT_AUTOMOUNTED) != 0) syslog(LOG_ERR, "Warning: exporting of " "automounted fs %s not supported", cp); if (got_nondir) { syslog(LOG_ERR, "dirs must be first"); getexp_err(ep, tgrp); goto nextline; } if (v4root_phase == 1) { if (dirp != NULL) { syslog(LOG_ERR, "Multiple V4 dirs"); getexp_err(ep, tgrp); goto nextline; } if (strlen(v4root_dirpath) == 0) { strlcpy(v4root_dirpath, cp, sizeof (v4root_dirpath)); } else if (strcmp(v4root_dirpath, cp) != 0) { syslog(LOG_ERR, "different V4 dirpath %s", cp); getexp_err(ep, tgrp); goto nextline; } dirp = cp; v4root_phase = 2; got_nondir = 1; ep = get_exp(); } else { if (ep) { if (ep->ex_fs.val[0] != fsb.f_fsid.val[0] || ep->ex_fs.val[1] != fsb.f_fsid.val[1]) { getexp_err(ep, tgrp); goto nextline; } } else { /* * See if this directory is already * in the list. */ ep = ex_search(&fsb.f_fsid); if (ep == (struct exportlist *)NULL) { ep = get_exp(); ep->ex_fs = fsb.f_fsid; ep->ex_fsdir = (char *)malloc (strlen(fsb.f_mntonname) + 1); if (ep->ex_fsdir) strcpy(ep->ex_fsdir, fsb.f_mntonname); else out_of_mem(); if (debug) warnx( "making new ep fs=0x%x,0x%x", fsb.f_fsid.val[0], fsb.f_fsid.val[1]); } else if (debug) warnx("found ep fs=0x%x,0x%x", fsb.f_fsid.val[0], fsb.f_fsid.val[1]); } /* * Add dirpath to export mount point. */ dirp = add_expdir(&dirhead, cp, len); dirplen = len; } } else { getexp_err(ep, tgrp); goto nextline; } *endcp = savedc; } else { savedc = *endcp; *endcp = '\0'; got_nondir = 1; if (ep == (struct exportlist *)NULL) { getexp_err(ep, tgrp); goto nextline; } /* * Get the host or netgroup. */ setnetgrent(cp); netgrp = getnetgrent(&hst, &usr, &dom); do { if (has_host) { grp->gr_next = get_grp(); grp = grp->gr_next; } if (netgrp) { if (hst == 0) { syslog(LOG_ERR, "null hostname in netgroup %s, skipping", cp); grp->gr_type = GT_IGNORE; } else if (get_host(hst, grp, tgrp)) { syslog(LOG_ERR, "bad host %s in netgroup %s, skipping", hst, cp); grp->gr_type = GT_IGNORE; } } else if (get_host(cp, grp, tgrp)) { syslog(LOG_ERR, "bad host %s, skipping", cp); grp->gr_type = GT_IGNORE; } has_host = TRUE; } while (netgrp && getnetgrent(&hst, &usr, &dom)); endnetgrent(); *endcp = savedc; } cp = endcp; nextfield(&cp, &endcp); len = endcp - cp; } if (check_options(dirhead)) { getexp_err(ep, tgrp); goto nextline; } if (!has_host) { grp->gr_type = GT_DEFAULT; if (debug) warnx("adding a default entry"); /* * Don't allow a network export coincide with a list of * host(s) on the same line. */ } else if ((opt_flags & OP_NET) && tgrp->gr_next) { syslog(LOG_ERR, "network/host conflict"); getexp_err(ep, tgrp); goto nextline; /* * If an export list was specified on this line, make sure * that we have at least one valid entry, otherwise skip it. */ } else { grp = tgrp; while (grp && grp->gr_type == GT_IGNORE) grp = grp->gr_next; if (! grp) { getexp_err(ep, tgrp); goto nextline; } } if (v4root_phase == 1) { syslog(LOG_ERR, "V4:root, no dirp, ignored"); getexp_err(ep, tgrp); goto nextline; } /* * Loop through hosts, pushing the exports into the kernel. * After loop, tgrp points to the start of the list and * grp points to the last entry in the list. */ grp = tgrp; do { if (do_mount(ep, grp, exflags, &anon, dirp, dirplen, &fsb)) { getexp_err(ep, tgrp); goto nextline; } } while (grp->gr_next && (grp = grp->gr_next)); /* * For V4: don't enter in mount lists. */ if (v4root_phase > 0 && v4root_phase <= 2) { /* * Since these structures aren't used by mountd, * free them up now. */ if (ep != NULL) free_exp(ep); while (tgrp != NULL) { grp = tgrp; tgrp = tgrp->gr_next; free_grp(grp); } goto nextline; } /* * Success. Update the data structures. */ if (has_host) { hang_dirp(dirhead, tgrp, ep, opt_flags); grp->gr_next = grphead; grphead = tgrp; } else { hang_dirp(dirhead, (struct grouplist *)NULL, ep, opt_flags); free_grp(grp); } dirhead = (struct dirlist *)NULL; if ((ep->ex_flag & EX_LINKED) == 0) { ep2 = exphead; epp = &exphead; /* * Insert in the list in alphabetical order. */ while (ep2 && strcmp(ep2->ex_fsdir, ep->ex_fsdir) < 0) { epp = &ep2->ex_next; ep2 = ep2->ex_next; } if (ep2) ep->ex_next = ep2; *epp = ep; ep->ex_flag |= EX_LINKED; } nextline: v4root_phase = 0; if (dirhead) { free_dir(dirhead); dirhead = (struct dirlist *)NULL; } } } /* * Get the export list from all specified files */ static void get_exportlist(void) { struct exportlist *ep, *ep2; struct grouplist *grp, *tgrp; struct export_args export; struct iovec *iov; struct statfs *fsp, *mntbufp; struct xvfsconf vfc; char errmsg[255]; int num, i; int iovlen; int done; struct nfsex_args eargs; if (suspend_nfsd != 0) (void)nfssvc(NFSSVC_SUSPENDNFSD, NULL); v4root_dirpath[0] = '\0'; bzero(&export, sizeof(export)); export.ex_flags = MNT_DELEXPORT; iov = NULL; iovlen = 0; bzero(errmsg, sizeof(errmsg)); /* * First, get rid of the old list */ ep = exphead; while (ep) { ep2 = ep; ep = ep->ex_next; free_exp(ep2); } exphead = (struct exportlist *)NULL; grp = grphead; while (grp) { tgrp = grp; grp = grp->gr_next; free_grp(tgrp); } grphead = (struct grouplist *)NULL; /* * and the old V4 root dir. */ bzero(&eargs, sizeof (eargs)); eargs.export.ex_flags = MNT_DELEXPORT; if (nfssvc(NFSSVC_V4ROOTEXPORT, (caddr_t)&eargs) < 0 && errno != ENOENT) syslog(LOG_ERR, "Can't delete exports for V4:"); /* * and clear flag that notes if a public fh has been exported. */ has_publicfh = 0; /* * And delete exports that are in the kernel for all local * filesystems. * XXX: Should know how to handle all local exportable filesystems. */ num = getmntinfo(&mntbufp, MNT_NOWAIT); if (num > 0) { build_iovec(&iov, &iovlen, "fstype", NULL, 0); build_iovec(&iov, &iovlen, "fspath", NULL, 0); build_iovec(&iov, &iovlen, "from", NULL, 0); build_iovec(&iov, &iovlen, "update", NULL, 0); build_iovec(&iov, &iovlen, "export", &export, sizeof(export)); build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg)); } for (i = 0; i < num; i++) { fsp = &mntbufp[i]; if (getvfsbyname(fsp->f_fstypename, &vfc) != 0) { syslog(LOG_ERR, "getvfsbyname() failed for %s", fsp->f_fstypename); continue; } /* * We do not need to delete "export" flag from * filesystems that do not have it set. */ if (!(fsp->f_flags & MNT_EXPORTED)) continue; /* * Do not delete export for network filesystem by * passing "export" arg to nmount(). * It only makes sense to do this for local filesystems. */ if (vfc.vfc_flags & VFCF_NETWORK) continue; iov[1].iov_base = fsp->f_fstypename; iov[1].iov_len = strlen(fsp->f_fstypename) + 1; iov[3].iov_base = fsp->f_mntonname; iov[3].iov_len = strlen(fsp->f_mntonname) + 1; iov[5].iov_base = fsp->f_mntfromname; iov[5].iov_len = strlen(fsp->f_mntfromname) + 1; errmsg[0] = '\0'; /* * EXDEV is returned when path exists but is not a * mount point. May happens if raced with unmount. */ if (nmount(iov, iovlen, fsp->f_flags) < 0 && errno != ENOENT && errno != ENOTSUP && errno != EXDEV) { syslog(LOG_ERR, "can't delete exports for %s: %m %s", fsp->f_mntonname, errmsg); } } if (iov != NULL) { /* Free strings allocated by strdup() in getmntopts.c */ free(iov[0].iov_base); /* fstype */ free(iov[2].iov_base); /* fspath */ free(iov[4].iov_base); /* from */ free(iov[6].iov_base); /* update */ free(iov[8].iov_base); /* export */ free(iov[10].iov_base); /* errmsg */ /* free iov, allocated by realloc() */ free(iov); iovlen = 0; } /* * Read in the exports file and build the list, calling * nmount() as we go along to push the export rules into the kernel. */ done = 0; for (i = 0; exnames[i] != NULL; i++) { if (debug) warnx("reading exports from %s", exnames[i]); if ((exp_file = fopen(exnames[i], "r")) == NULL) { syslog(LOG_WARNING, "can't open %s", exnames[i]); continue; } get_exportlist_one(); fclose(exp_file); done++; } if (done == 0) { syslog(LOG_ERR, "can't open any exports file"); exit(2); } /* * If there was no public fh, clear any previous one set. */ if (has_publicfh == 0) (void) nfssvc(NFSSVC_NOPUBLICFH, NULL); /* Resume the nfsd. If they weren't suspended, this is harmless. */ (void)nfssvc(NFSSVC_RESUMENFSD, NULL); } /* * Allocate an export list element */ static struct exportlist * get_exp(void) { struct exportlist *ep; ep = (struct exportlist *)calloc(1, sizeof (struct exportlist)); if (ep == (struct exportlist *)NULL) out_of_mem(); return (ep); } /* * Allocate a group list element */ static struct grouplist * get_grp(void) { struct grouplist *gp; gp = (struct grouplist *)calloc(1, sizeof (struct grouplist)); if (gp == (struct grouplist *)NULL) out_of_mem(); return (gp); } /* * Clean up upon an error in get_exportlist(). */ static void getexp_err(struct exportlist *ep, struct grouplist *grp) { struct grouplist *tgrp; if (!(opt_flags & OP_QUIET)) syslog(LOG_ERR, "bad exports list line %s", line); if (ep && (ep->ex_flag & EX_LINKED) == 0) free_exp(ep); while (grp) { tgrp = grp; grp = grp->gr_next; free_grp(tgrp); } } /* * Search the export list for a matching fs. */ static struct exportlist * ex_search(fsid_t *fsid) { struct exportlist *ep; ep = exphead; while (ep) { if (ep->ex_fs.val[0] == fsid->val[0] && ep->ex_fs.val[1] == fsid->val[1]) return (ep); ep = ep->ex_next; } return (ep); } /* * Add a directory path to the list. */ static char * add_expdir(struct dirlist **dpp, char *cp, int len) { struct dirlist *dp; dp = (struct dirlist *)malloc(sizeof (struct dirlist) + len); if (dp == (struct dirlist *)NULL) out_of_mem(); dp->dp_left = *dpp; dp->dp_right = (struct dirlist *)NULL; dp->dp_flag = 0; dp->dp_hosts = (struct hostlist *)NULL; strcpy(dp->dp_dirp, cp); *dpp = dp; return (dp->dp_dirp); } /* * Hang the dir list element off the dirpath binary tree as required * and update the entry for host. */ static void hang_dirp(struct dirlist *dp, struct grouplist *grp, struct exportlist *ep, int flags) { struct hostlist *hp; struct dirlist *dp2; if (flags & OP_ALLDIRS) { if (ep->ex_defdir) free((caddr_t)dp); else ep->ex_defdir = dp; if (grp == (struct grouplist *)NULL) { ep->ex_defdir->dp_flag |= DP_DEFSET; /* Save the default security flavors list. */ ep->ex_defnumsecflavors = ep->ex_numsecflavors; if (ep->ex_numsecflavors > 0) memcpy(ep->ex_defsecflavors, ep->ex_secflavors, sizeof(ep->ex_secflavors)); } else while (grp) { hp = get_ht(); hp->ht_grp = grp; hp->ht_next = ep->ex_defdir->dp_hosts; ep->ex_defdir->dp_hosts = hp; /* Save the security flavors list for this host set. */ grp->gr_numsecflavors = ep->ex_numsecflavors; if (ep->ex_numsecflavors > 0) memcpy(grp->gr_secflavors, ep->ex_secflavors, sizeof(ep->ex_secflavors)); grp = grp->gr_next; } } else { /* * Loop through the directories adding them to the tree. */ while (dp) { dp2 = dp->dp_left; add_dlist(&ep->ex_dirl, dp, grp, flags, ep); dp = dp2; } } } /* * Traverse the binary tree either updating a node that is already there * for the new directory or adding the new node. */ static void add_dlist(struct dirlist **dpp, struct dirlist *newdp, struct grouplist *grp, int flags, struct exportlist *ep) { struct dirlist *dp; struct hostlist *hp; int cmp; dp = *dpp; if (dp) { cmp = strcmp(dp->dp_dirp, newdp->dp_dirp); if (cmp > 0) { add_dlist(&dp->dp_left, newdp, grp, flags, ep); return; } else if (cmp < 0) { add_dlist(&dp->dp_right, newdp, grp, flags, ep); return; } else free((caddr_t)newdp); } else { dp = newdp; dp->dp_left = (struct dirlist *)NULL; *dpp = dp; } if (grp) { /* * Hang all of the host(s) off of the directory point. */ do { hp = get_ht(); hp->ht_grp = grp; hp->ht_next = dp->dp_hosts; dp->dp_hosts = hp; /* Save the security flavors list for this host set. */ grp->gr_numsecflavors = ep->ex_numsecflavors; if (ep->ex_numsecflavors > 0) memcpy(grp->gr_secflavors, ep->ex_secflavors, sizeof(ep->ex_secflavors)); grp = grp->gr_next; } while (grp); } else { dp->dp_flag |= DP_DEFSET; /* Save the default security flavors list. */ ep->ex_defnumsecflavors = ep->ex_numsecflavors; if (ep->ex_numsecflavors > 0) memcpy(ep->ex_defsecflavors, ep->ex_secflavors, sizeof(ep->ex_secflavors)); } } /* * Search for a dirpath on the export point. */ static struct dirlist * dirp_search(struct dirlist *dp, char *dirp) { int cmp; if (dp) { cmp = strcmp(dp->dp_dirp, dirp); if (cmp > 0) return (dirp_search(dp->dp_left, dirp)); else if (cmp < 0) return (dirp_search(dp->dp_right, dirp)); else return (dp); } return (dp); } /* * Scan for a host match in a directory tree. */ static int chk_host(struct dirlist *dp, struct sockaddr *saddr, int *defsetp, int *hostsetp, int *numsecflavors, int **secflavorsp) { struct hostlist *hp; struct grouplist *grp; struct addrinfo *ai; if (dp) { if (dp->dp_flag & DP_DEFSET) *defsetp = dp->dp_flag; hp = dp->dp_hosts; while (hp) { grp = hp->ht_grp; switch (grp->gr_type) { case GT_HOST: ai = grp->gr_ptr.gt_addrinfo; for (; ai; ai = ai->ai_next) { if (!sacmp(ai->ai_addr, saddr, NULL)) { *hostsetp = (hp->ht_flag | DP_HOSTSET); if (numsecflavors != NULL) { *numsecflavors = grp->gr_numsecflavors; *secflavorsp = grp->gr_secflavors; } return (1); } } break; case GT_NET: if (!sacmp(saddr, (struct sockaddr *) &grp->gr_ptr.gt_net.nt_net, (struct sockaddr *) &grp->gr_ptr.gt_net.nt_mask)) { *hostsetp = (hp->ht_flag | DP_HOSTSET); if (numsecflavors != NULL) { *numsecflavors = grp->gr_numsecflavors; *secflavorsp = grp->gr_secflavors; } return (1); } break; } hp = hp->ht_next; } } return (0); } /* * Scan tree for a host that matches the address. */ static int scan_tree(struct dirlist *dp, struct sockaddr *saddr) { int defset, hostset; if (dp) { if (scan_tree(dp->dp_left, saddr)) return (1); if (chk_host(dp, saddr, &defset, &hostset, NULL, NULL)) return (1); if (scan_tree(dp->dp_right, saddr)) return (1); } return (0); } /* * Traverse the dirlist tree and free it up. */ static void free_dir(struct dirlist *dp) { if (dp) { free_dir(dp->dp_left); free_dir(dp->dp_right); free_host(dp->dp_hosts); free((caddr_t)dp); } } /* * Parse a colon separated list of security flavors */ static int parsesec(char *seclist, struct exportlist *ep) { char *cp, savedc; int flavor; ep->ex_numsecflavors = 0; for (;;) { cp = strchr(seclist, ':'); if (cp) { savedc = *cp; *cp = '\0'; } if (!strcmp(seclist, "sys")) flavor = AUTH_SYS; else if (!strcmp(seclist, "krb5")) flavor = RPCSEC_GSS_KRB5; else if (!strcmp(seclist, "krb5i")) flavor = RPCSEC_GSS_KRB5I; else if (!strcmp(seclist, "krb5p")) flavor = RPCSEC_GSS_KRB5P; else { if (cp) *cp = savedc; syslog(LOG_ERR, "bad sec flavor: %s", seclist); return (1); } if (ep->ex_numsecflavors == MAXSECFLAVORS) { if (cp) *cp = savedc; syslog(LOG_ERR, "too many sec flavors: %s", seclist); return (1); } ep->ex_secflavors[ep->ex_numsecflavors] = flavor; ep->ex_numsecflavors++; if (cp) { *cp = savedc; seclist = cp + 1; } else { break; } } return (0); } /* * Parse the option string and update fields. * Option arguments may either be -