Index: user/ngie/bsnmp_cleanup/contrib/bsnmp/snmpd/main.c
===================================================================
--- user/ngie/bsnmp_cleanup/contrib/bsnmp/snmpd/main.c	(revision 298467)
+++ user/ngie/bsnmp_cleanup/contrib/bsnmp/snmpd/main.c	(revision 298468)
@@ -1,3281 +1,3281 @@
/*
 * Copyright (c) 2001-2003
 *	Fraunhofer Institute for Open Communication Systems (FhG Fokus).
 * All rights reserved.
 *
 * Author: Harti Brandt
 *
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by Shteryana Sotirova Shopova
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $Begemot: bsnmp/snmpd/main.c,v 1.100 2006/02/14 09:04:20 brandt_h Exp $
 *
 * SNMPd main stuff.
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef USE_TCPWRAPPERS #include #include #endif #include "support.h" #include "snmpmod.h" #include "snmpd.h" #include "tree.h" #include "oid.h" #define PATH_PID "/var/run/%s.pid" #define PATH_CONFIG "/etc/%s.config" #define PATH_ENGINE "/var/%s.engine" uint64_t this_tick; /* start of processing of current packet (absolute) */ uint64_t start_tick; /* start of processing */ struct systemg systemg = { NULL, { 8, { 1, 3, 6, 1, 4, 1, 1115, 7352 }}, NULL, NULL, NULL, 64 + 8 + 4, 0 }; struct debug debug = { 0, /* dump_pdus */ LOG_DEBUG, /* log_pri */ 0, /* evdebug */ }; struct snmpd snmpd = { 2048, /* txbuf */ 2048, /* rxbuf */ 0, /* comm_dis */ 0, /* auth_traps */ {0, 0, 0, 0}, /* trap1addr */ VERS_ENABLE_ALL,/* version_enable */ }; struct snmpd_stats snmpd_stats; struct snmpd_usmstat snmpd_usmstats; /* snmpEngine */ struct snmp_engine snmpd_engine; /* snmpSerialNo */ int32_t snmp_serial_no; struct snmpd_target_stats snmpd_target_stats; /* search path for config files */ const char *syspath = PATH_SYSCONFIG; /* list of all loaded modules */ struct lmodules lmodules = TAILQ_HEAD_INITIALIZER(lmodules); /* list of loaded modules during start-up in the order they were loaded */ static struct lmodules modules_start = TAILQ_HEAD_INITIALIZER(modules_start); /* list of all known communities */ struct community_list community_list = TAILQ_HEAD_INITIALIZER(community_list); /* list of all known USM users */ static struct usm_userlist usm_userlist = SLIST_HEAD_INITIALIZER(usm_userlist); /* A list of all VACM users configured, including v1, v2c and v3 */ static struct vacm_userlist vacm_userlist = SLIST_HEAD_INITIALIZER(vacm_userlist); /* A list of all VACM groups */ static struct vacm_grouplist vacm_grouplist = SLIST_HEAD_INITIALIZER(vacm_grouplist); static struct vacm_group vacm_default_group = { .groupname = "", }; /* The list of configured access entries */ static struct vacm_accesslist vacm_accesslist = TAILQ_HEAD_INITIALIZER(vacm_accesslist); /* The list of configured views */ static struct vacm_viewlist vacm_viewlist = SLIST_HEAD_INITIALIZER(vacm_viewlist); /* The list of configured contexts */ static struct vacm_contextlist vacm_contextlist = SLIST_HEAD_INITIALIZER(vacm_contextlist); /* list of all installed object resources */ struct objres_list objres_list = TAILQ_HEAD_INITIALIZER(objres_list); /* community value generator */ static u_int next_community_index = 1; /* list of all known ranges */ struct idrange_list idrange_list = TAILQ_HEAD_INITIALIZER(idrange_list); /* identifier generator */ u_int next_idrange = 1; /* list of all current timers */ struct timer_list timer_list = LIST_HEAD_INITIALIZER(timer_list); /* list of file descriptors */ struct fdesc_list fdesc_list = LIST_HEAD_INITIALIZER(fdesc_list); /* program arguments */ static char **progargs; static int nprogargs; /* current community */ u_int community; static struct community *comm; /* current USM user */ struct usm_user *usm_user; /* file names */ static char config_file[MAXPATHLEN + 1]; static char pid_file[MAXPATHLEN + 1]; char engine_file[MAXPATHLEN + 1]; #ifndef USE_LIBBEGEMOT /* event context */ static evContext evctx; #endif /* signal mask */ static sigset_t blocked_sigs; /* signal handling */ static int work; #define WORK_DOINFO 0x0001 #define WORK_RECONFIG 0x0002 /* oids */ static const struct asn_oid oid_snmpMIB = OIDX_snmpMIB, oid_begemotSnmpd = OIDX_begemotSnmpd, 
oid_coldStart = OIDX_coldStart, oid_authenticationFailure = OIDX_authenticationFailure; const struct asn_oid oid_zeroDotZero = { 2, { 0, 0 }}; const struct asn_oid oid_usmUnknownEngineIDs = { 11, { 1, 3, 6, 1, 6, 3, 15, 1, 1, 4, 0}}; const struct asn_oid oid_usmNotInTimeWindows = { 11, { 1, 3, 6, 1, 6, 3, 15, 1, 1, 2, 0}}; /* request id generator for traps */ u_int trap_reqid; /* help text */ static const char usgtxt[] = "\ Begemot simple SNMP daemon. Copyright (c) 2001-2002 Fraunhofer Institute for\n\ Open Communication Systems (FhG Fokus). All rights reserved.\n\ Copyright (c) 2010 The FreeBSD Foundation. All rights reserved.\n\ usage: snmpd [-dh] [-c file] [-D options] [-e file] [-I path]\n\ [-l prefix] [-m variable=value] [-p file]\n\ options:\n\ -d don't daemonize\n\ -h print this info\n\ -c file specify configuration file\n\ -D options debugging options\n\ -e file specify engine id file\n\ -I path system include path\n\ -l prefix default basename for pid and config file\n\ -m var=val define variable\n\ -p file specify pid file\n\ "; /* hosts_access(3) request */ #ifdef USE_TCPWRAPPERS static struct request_info req; #endif /* transports */ extern const struct transport_def udp_trans; extern const struct transport_def lsock_trans; struct transport_list transport_list = TAILQ_HEAD_INITIALIZER(transport_list); /* forward declarations */ static void snmp_printf_func(const char *fmt, ...); static void snmp_error_func(const char *err, ...); static void snmp_debug_func(const char *err, ...); static void asn_error_func(const struct asn_buf *b, const char *err, ...); /* * Allocate rx/tx buffer. We allocate one byte more for rx. */ void * buf_alloc(int tx) { void *buf; if ((buf = malloc(tx ? snmpd.txbuf : snmpd.rxbuf)) == NULL) { syslog(LOG_CRIT, "cannot allocate buffer"); if (tx) snmpd_stats.noTxbuf++; else snmpd_stats.noRxbuf++; return (NULL); } return (buf); } /* * Return the buffer size. */ size_t buf_size(int tx) { return (tx ? snmpd.txbuf : snmpd.rxbuf); } /* * Prepare a PDU for output */ void snmp_output(struct snmp_pdu *pdu, u_char *sndbuf, size_t *sndlen, const char *dest) { struct asn_buf resp_b; resp_b.asn_ptr = sndbuf; resp_b.asn_len = snmpd.txbuf; if (snmp_pdu_encode(pdu, &resp_b) != 0) { syslog(LOG_ERR, "cannot encode message"); abort(); } if (debug.dump_pdus) { snmp_printf("%s <- ", dest); snmp_pdu_dump(pdu); } *sndlen = (size_t)(resp_b.asn_ptr - sndbuf); } /* * Check USM PDU header credentials against local SNMP Engine & users. 
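 *
 * Two RFC 3414 discovery exchanges are recognized here and flagged with
 * SNMP_MSG_AUTODISCOVER so that the access checks are skipped for them:
 * an unauthenticated probe (empty engine id and empty security name),
 * which is answered with the local snmpEngineID, and an authenticated
 * probe (engineBoots == 0 and engineTime == 0), which is answered with
 * the local boots/time pair for clock synchronization.  Any other
 * authenticated request must fall inside the time window, i.e. (a
 * sketch of the rule applied below):
 *
 *	pdu->engine.engine_boots == snmpd_engine.engine_boots &&
 *	abs(pdu->engine.engine_time - snmpd_engine.engine_time) <=
 *	    SNMP_TIME_WINDOW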
*/ static enum snmp_code snmp_pdu_auth_user(struct snmp_pdu *pdu) { uint64_t etime; usm_user = NULL; /* un-authenticated snmpEngineId discovery */ if (pdu->engine.engine_len == 0 && strlen(pdu->user.sec_name) == 0) { pdu->engine.engine_len = snmpd_engine.engine_len; memcpy(pdu->engine.engine_id, snmpd_engine.engine_id, snmpd_engine.engine_len); pdu->engine.engine_boots = snmpd_engine.engine_boots; pdu->engine.engine_time = snmpd_engine.engine_time; pdu->flags |= SNMP_MSG_AUTODISCOVER; return (SNMP_CODE_OK); } if ((usm_user = usm_find_user(pdu->engine.engine_id, pdu->engine.engine_len, pdu->user.sec_name)) == NULL || usm_user->status != 1 /* active */) return (SNMP_CODE_BADUSER); if (usm_user->user_engine_len != snmpd_engine.engine_len || memcmp(usm_user->user_engine_id, snmpd_engine.engine_id, snmpd_engine.engine_len) != 0) return (SNMP_CODE_BADENGINE); pdu->user.priv_proto = usm_user->suser.priv_proto; memcpy(pdu->user.priv_key, usm_user->suser.priv_key, sizeof(pdu->user.priv_key)); /* authenticated snmpEngineId discovery */ if ((pdu->flags & SNMP_MSG_AUTH_FLAG) != 0) { etime = (get_ticks() - start_tick) / 100ULL; if (etime < INT32_MAX) snmpd_engine.engine_time = etime; else { start_tick = get_ticks(); set_snmpd_engine(); snmpd_engine.engine_time = start_tick; } pdu->user.auth_proto = usm_user->suser.auth_proto; memcpy(pdu->user.auth_key, usm_user->suser.auth_key, sizeof(pdu->user.auth_key)); if (pdu->engine.engine_boots == 0 && pdu->engine.engine_time == 0) { pdu->flags |= SNMP_MSG_AUTODISCOVER; return (SNMP_CODE_OK); } if (pdu->engine.engine_boots != snmpd_engine.engine_boots || abs(pdu->engine.engine_time - snmpd_engine.engine_time) > SNMP_TIME_WINDOW) return (SNMP_CODE_NOTINTIME); } if (((pdu->flags & SNMP_MSG_PRIV_FLAG) != 0 && (pdu->flags & SNMP_MSG_AUTH_FLAG) == 0) || ((pdu->flags & SNMP_MSG_AUTH_FLAG) == 0 && usm_user->suser.auth_proto != SNMP_AUTH_NOAUTH) || ((pdu->flags & SNMP_MSG_PRIV_FLAG) == 0 && usm_user->suser.priv_proto != SNMP_PRIV_NOPRIV)) return (SNMP_CODE_BADSECLEVEL); return (SNMP_CODE_OK); } /* * Check whether access to each of var bindings in the PDU is allowed based * on the user credentials against the configured User groups & VACM views. */ enum snmp_code snmp_pdu_auth_access(struct snmp_pdu *pdu, int32_t *ip) { const char *uname; int32_t suboid, smodel; uint32_t i; struct vacm_user *vuser; struct vacm_access *acl; struct vacm_context *vacmctx; struct vacm_view *view; /* * At least a default context exists if the snmpd_vacm(3) module is * running. 
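 *
 * The per-binding check at the bottom of this function reduces to the
 * following predicate (an illustrative helper, not defined in this
 * file): an OID passes an included view iff it lies under the view's
 * subtree, and passes an excluded view iff it does not:
 *
 *	static int
 *	view_allows(const struct vacm_view *view, const struct asn_oid *oid)
 *	{
 *		int under = asn_is_suboid(&view->subtree, oid);
 *
 *		return (view->exclude ? !under : under);
 *	}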
*/ if (SLIST_EMPTY(&vacm_contextlist) || (pdu->flags & SNMP_MSG_AUTODISCOVER) != 0) return (SNMP_CODE_OK); switch (pdu->version) { case SNMP_V1: if ((uname = comm_string(community)) == NULL) return (SNMP_CODE_FAILED); smodel = SNMP_SECMODEL_SNMPv1; break; case SNMP_V2c: if ((uname = comm_string(community)) == NULL) return (SNMP_CODE_FAILED); smodel = SNMP_SECMODEL_SNMPv2c; break; case SNMP_V3: uname = pdu->user.sec_name; if ((smodel = pdu->security_model) != SNMP_SECMODEL_USM) return (SNMP_CODE_FAILED); /* Compare the PDU context engine id against the agent's */ if (pdu->context_engine_len != snmpd_engine.engine_len || memcmp(pdu->context_engine, snmpd_engine.engine_id, snmpd_engine.engine_len) != 0) return (SNMP_CODE_FAILED); break; default: abort(); } SLIST_FOREACH(vuser, &vacm_userlist, vvu) if (strcmp(uname, vuser->secname) == 0 && vuser->sec_model == smodel) break; if (vuser == NULL || vuser->group == NULL) return (SNMP_CODE_FAILED); /* XXX: shteryana - recheck */ TAILQ_FOREACH_REVERSE(acl, &vacm_accesslist, vacm_accesslist, vva) { if (acl->group != vuser->group) continue; SLIST_FOREACH(vacmctx, &vacm_contextlist, vcl) if (memcmp(vacmctx->ctxname, acl->ctx_prefix, acl->ctx_match) == 0) goto match; } return (SNMP_CODE_FAILED); match: switch (pdu->type) { case SNMP_PDU_GET: case SNMP_PDU_GETNEXT: case SNMP_PDU_GETBULK: if ((view = acl->read_view) == NULL) return (SNMP_CODE_FAILED); break; case SNMP_PDU_SET: if ((view = acl->write_view) == NULL) return (SNMP_CODE_FAILED); break; case SNMP_PDU_TRAP: case SNMP_PDU_INFORM: case SNMP_PDU_TRAP2: case SNMP_PDU_REPORT: if ((view = acl->notify_view) == NULL) return (SNMP_CODE_FAILED); break; case SNMP_PDU_RESPONSE: /* NOTREACHED */ return (SNMP_CODE_FAILED); default: abort(); } for (i = 0; i < pdu->nbindings; i++) { /* XXX - view->mask*/ suboid = asn_is_suboid(&view->subtree, &pdu->bindings[i].var); if ((!suboid && !view->exclude) || (suboid && view->exclude)) { *ip = i + 1; return (SNMP_CODE_FAILED); } } return (SNMP_CODE_OK); } /* * SNMP input. Start: decode the PDU, find the user or community. */ enum snmpd_input_err snmp_input_start(const u_char *buf, size_t len, const char *source, struct snmp_pdu *pdu, int32_t *ip, size_t *pdulen) { struct asn_buf b; enum snmp_code code; enum snmpd_input_err ret; int sret; /* update uptime */ this_tick = get_ticks(); b.asn_cptr = buf; b.asn_len = len; /* look whether we have enough bytes for the entire PDU. 
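 *
 * snmp_pdu_snoop() reports how the buffer relates to one complete PDU:
 * a positive return is the encoded length of the PDU, 0 means more
 * bytes are needed (meaningful only on stream transports, which keep
 * the partial input and retry after the next read) and -1 is a hard
 * ASN.1 parse error.  The stream case is what makes SNMPD_INPUT_TRUNC
 * a non-error further down in snmpd_input(): the daemon just waits for
 * more data, up to at most one full receive buffer.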
*/ switch (sret = snmp_pdu_snoop(&b)) { case 0: return (SNMPD_INPUT_TRUNC); case -1: snmpd_stats.inASNParseErrs++; return (SNMPD_INPUT_FAILED); } b.asn_len = *pdulen = (size_t)sret; memset(pdu, 0, sizeof(*pdu)); if ((code = snmp_pdu_decode_header(&b, pdu)) != SNMP_CODE_OK) goto decoded; if (pdu->version == SNMP_V3) { if (pdu->security_model != SNMP_SECMODEL_USM) { code = SNMP_CODE_FAILED; goto decoded; } if ((code = snmp_pdu_auth_user(pdu)) != SNMP_CODE_OK) goto decoded; if ((code = snmp_pdu_decode_secmode(&b, pdu)) != SNMP_CODE_OK) goto decoded; } code = snmp_pdu_decode_scoped(&b, pdu, ip); ret = SNMPD_INPUT_OK; decoded: snmpd_stats.inPkts++; switch (code) { case SNMP_CODE_FAILED: snmpd_stats.inASNParseErrs++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_BADVERS: bad_vers: snmpd_stats.inBadVersions++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_BADLEN: if (pdu->type == SNMP_OP_SET) ret = SNMPD_INPUT_VALBADLEN; break; case SNMP_CODE_OORANGE: if (pdu->type == SNMP_OP_SET) ret = SNMPD_INPUT_VALRANGE; break; case SNMP_CODE_BADENC: if (pdu->type == SNMP_OP_SET) ret = SNMPD_INPUT_VALBADENC; break; case SNMP_CODE_BADSECLEVEL: snmpd_usmstats.unsupported_seclevels++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_NOTINTIME: snmpd_usmstats.not_in_time_windows++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_BADUSER: snmpd_usmstats.unknown_users++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_BADENGINE: snmpd_usmstats.unknown_engine_ids++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_BADDIGEST: snmpd_usmstats.wrong_digests++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_EDECRYPT: snmpd_usmstats.decrypt_errors++; return (SNMPD_INPUT_FAILED); case SNMP_CODE_OK: switch (pdu->version) { case SNMP_V1: if (!(snmpd.version_enable & VERS_ENABLE_V1)) goto bad_vers; break; case SNMP_V2c: if (!(snmpd.version_enable & VERS_ENABLE_V2C)) goto bad_vers; break; case SNMP_V3: if (!(snmpd.version_enable & VERS_ENABLE_V3)) goto bad_vers; break; case SNMP_Verr: goto bad_vers; } break; } if (debug.dump_pdus) { snmp_printf("%s -> ", source); snmp_pdu_dump(pdu); } /* * Look, whether we know the community or user */ if (pdu->version != SNMP_V3) { TAILQ_FOREACH(comm, &community_list, link) if (comm->string != NULL && strcmp(comm->string, pdu->community) == 0) break; if (comm == NULL) { snmpd_stats.inBadCommunityNames++; snmp_pdu_free(pdu); if (snmpd.auth_traps) snmp_send_trap(&oid_authenticationFailure, (struct snmp_value *)NULL); ret = SNMPD_INPUT_BAD_COMM; } else community = comm->value; } else if (pdu->nbindings == 0) { /* RFC 3414 - snmpEngineID Discovery */ if (strlen(pdu->user.sec_name) == 0) { asn_append_oid(&(pdu->bindings[pdu->nbindings++].var), &oid_usmUnknownEngineIDs); pdu->context_engine_len = snmpd_engine.engine_len; memcpy(pdu->context_engine, snmpd_engine.engine_id, snmpd_engine.engine_len); } else if (pdu->engine.engine_boots == 0 && pdu->engine.engine_time == 0) { asn_append_oid(&(pdu->bindings[pdu->nbindings++].var), &oid_usmNotInTimeWindows); pdu->engine.engine_boots = snmpd_engine.engine_boots; pdu->engine.engine_time = snmpd_engine.engine_time; } } else if (usm_user->suser.auth_proto != SNMP_AUTH_NOAUTH && (pdu->engine.engine_boots == 0 || pdu->engine.engine_time == 0)) { snmpd_usmstats.not_in_time_windows++; ret = SNMPD_INPUT_FAILED; } if ((code = snmp_pdu_auth_access(pdu, ip)) != SNMP_CODE_OK) ret = SNMPD_INPUT_FAILED; return (ret); } /* * Will return only _OK or _FAILED */ enum snmpd_input_err snmp_input_finish(struct snmp_pdu *pdu, const u_char *rcvbuf, size_t rcvlen, u_char *sndbuf, size_t *sndlen, const 
char *source, enum snmpd_input_err ierr, int32_t ivar, void *data) { struct snmp_pdu resp; struct asn_buf resp_b, pdu_b; enum snmp_ret ret; resp_b.asn_ptr = sndbuf; resp_b.asn_len = snmpd.txbuf; pdu_b.asn_cptr = rcvbuf; pdu_b.asn_len = rcvlen; if (ierr != SNMPD_INPUT_OK) { /* error decoding the input of a SET */ if (pdu->version == SNMP_V1) pdu->error_status = SNMP_ERR_BADVALUE; else if (ierr == SNMPD_INPUT_VALBADLEN) pdu->error_status = SNMP_ERR_WRONG_LENGTH; else if (ierr == SNMPD_INPUT_VALRANGE) pdu->error_status = SNMP_ERR_WRONG_VALUE; else pdu->error_status = SNMP_ERR_WRONG_ENCODING; pdu->error_index = ivar; if (snmp_make_errresp(pdu, &pdu_b, &resp_b) == SNMP_RET_IGN) { syslog(LOG_WARNING, "could not encode error response"); snmpd_stats.silentDrops++; return (SNMPD_INPUT_FAILED); } if (debug.dump_pdus) { snmp_printf("%s <- ", source); snmp_pdu_dump(pdu); } *sndlen = (size_t)(resp_b.asn_ptr - sndbuf); return (SNMPD_INPUT_OK); } switch (pdu->type) { case SNMP_PDU_GET: ret = snmp_get(pdu, &resp_b, &resp, data); break; case SNMP_PDU_GETNEXT: ret = snmp_getnext(pdu, &resp_b, &resp, data); break; case SNMP_PDU_SET: ret = snmp_set(pdu, &resp_b, &resp, data); break; case SNMP_PDU_GETBULK: ret = snmp_getbulk(pdu, &resp_b, &resp, data); break; default: ret = SNMP_RET_IGN; break; } switch (ret) { case SNMP_RET_OK: /* normal return - send a response */ if (debug.dump_pdus) { snmp_printf("%s <- ", source); snmp_pdu_dump(&resp); } *sndlen = (size_t)(resp_b.asn_ptr - sndbuf); snmp_pdu_free(&resp); return (SNMPD_INPUT_OK); case SNMP_RET_IGN: /* error - send nothing */ snmpd_stats.silentDrops++; return (SNMPD_INPUT_FAILED); case SNMP_RET_ERR: /* error - send error response. The snmp routine has * changed the error fields in the original message. */ resp_b.asn_ptr = sndbuf; resp_b.asn_len = snmpd.txbuf; if (snmp_make_errresp(pdu, &pdu_b, &resp_b) == SNMP_RET_IGN) { syslog(LOG_WARNING, "could not encode error response"); snmpd_stats.silentDrops++; return (SNMPD_INPUT_FAILED); } else { if (debug.dump_pdus) { snmp_printf("%s <- ", source); snmp_pdu_dump(pdu); } *sndlen = (size_t)(resp_b.asn_ptr - sndbuf); return (SNMPD_INPUT_OK); } } abort(); } /* * Insert a port into the right place in the transport's table of ports */ void trans_insert_port(struct transport *t, struct tport *port) { struct tport *p; TAILQ_FOREACH(p, &t->table, link) { if (asn_compare_oid(&p->index, &port->index) > 0) { TAILQ_INSERT_BEFORE(p, port, link); return; } } port->transport = t; TAILQ_INSERT_TAIL(&t->table, port, link); } /* * Remove a port from a transport's list */ void trans_remove_port(struct tport *port) { TAILQ_REMOVE(&port->transport->table, port, link); } /* * Find a port on a transport's list */ struct tport * trans_find_port(struct transport *t, const struct asn_oid *idx, u_int sub) { return (FIND_OBJECT_OID(&t->table, idx, sub)); } /* * Find next port on a transport's list */ struct tport * trans_next_port(struct transport *t, const struct asn_oid *idx, u_int sub) { return (NEXT_OBJECT_OID(&t->table, idx, sub)); } /* * Return first port */ struct tport * trans_first_port(struct transport *t) { return (TAILQ_FIRST(&t->table)); } /* * Iterate through all ports until a function returns a 0. 
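 *
 * A minimal usage sketch (count_cb is hypothetical, not part of this
 * file): the callback keeps the iteration going by returning non-zero;
 * returning 0 stops it and makes trans_iter_port() hand back the
 * current port, so a full sweep ends with NULL:
 *
 *	static int
 *	count_cb(struct tport *p __unused, intptr_t arg)
 *	{
 *		(*(u_int *)arg)++;
 *		return (1);
 *	}
 *
 *	u_int n = 0;
 *	(void)trans_iter_port(t, count_cb, (intptr_t)&n);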
*/ struct tport * trans_iter_port(struct transport *t, int (*func)(struct tport *, intptr_t), intptr_t arg) { struct tport *p; TAILQ_FOREACH(p, &t->table, link) if (func(p, arg) == 0) return (p); return (NULL); } /* * Register a transport */ int trans_register(const struct transport_def *def, struct transport **pp) { u_int i; char or_descr[256]; if ((*pp = malloc(sizeof(**pp))) == NULL) return (SNMP_ERR_GENERR); /* construct index */ (*pp)->index.len = strlen(def->name) + 1; (*pp)->index.subs[0] = strlen(def->name); for (i = 0; i < (*pp)->index.subs[0]; i++) (*pp)->index.subs[i + 1] = def->name[i]; (*pp)->vtab = def; if (FIND_OBJECT_OID(&transport_list, &(*pp)->index, 0) != NULL) { free(*pp); return (SNMP_ERR_INCONS_VALUE); } /* register module */ snprintf(or_descr, sizeof(or_descr), "%s transport mapping", def->name); if (((*pp)->or_index = or_register(&def->id, or_descr, NULL)) == 0) { free(*pp); return (SNMP_ERR_GENERR); } INSERT_OBJECT_OID((*pp), &transport_list); TAILQ_INIT(&(*pp)->table); return (SNMP_ERR_NOERROR); } /* * Unregister transport */ int trans_unregister(struct transport *t) { if (!TAILQ_EMPTY(&t->table)) return (SNMP_ERR_INCONS_VALUE); or_unregister(t->or_index); TAILQ_REMOVE(&transport_list, t, link); return (SNMP_ERR_NOERROR); } /* * File descriptor support */ #ifdef USE_LIBBEGEMOT static void input(int fd, int mask __unused, void *uap) #else static void input(evContext ctx __unused, void *uap, int fd, int mask __unused) #endif { struct fdesc *f = uap; (*f->func)(fd, f->udata); } void fd_suspend(void *p) { struct fdesc *f = p; #ifdef USE_LIBBEGEMOT if (f->id >= 0) { poll_unregister(f->id); f->id = -1; } #else if (evTestID(f->id)) { (void)evDeselectFD(evctx, f->id); evInitID(&f->id); } #endif } int fd_resume(void *p) { struct fdesc *f = p; int err; #ifdef USE_LIBBEGEMOT if (f->id >= 0) return (0); if ((f->id = poll_register(f->fd, input, f, POLL_IN)) < 0) { err = errno; syslog(LOG_ERR, "select fd %d: %m", f->fd); errno = err; return (-1); } #else if (evTestID(f->id)) return (0); if (evSelectFD(evctx, f->fd, EV_READ, input, f, &f->id)) { err = errno; syslog(LOG_ERR, "select fd %d: %m", f->fd); errno = err; return (-1); } #endif return (0); } void * fd_select(int fd, void (*func)(int, void *), void *udata, struct lmodule *mod) { struct fdesc *f; int err; if ((f = malloc(sizeof(struct fdesc))) == NULL) { err = errno; syslog(LOG_ERR, "fd_select: %m"); errno = err; return (NULL); } f->fd = fd; f->func = func; f->udata = udata; f->owner = mod; #ifdef USE_LIBBEGEMOT f->id = -1; #else evInitID(&f->id); #endif if (fd_resume(f)) { err = errno; free(f); errno = err; return (NULL); } LIST_INSERT_HEAD(&fdesc_list, f, link); return (f); } void fd_deselect(void *p) { struct fdesc *f = p; LIST_REMOVE(f, link); fd_suspend(f); free(f); } static void fd_flush(struct lmodule *mod) { struct fdesc *t, *t1; t = LIST_FIRST(&fdesc_list); while (t != NULL) { t1 = LIST_NEXT(t, link); if (t->owner == mod) fd_deselect(t); t = t1; } } /* * Consume a message from the input buffer */ static void snmp_input_consume(struct port_input *pi) { if (!pi->stream) { /* always consume everything */ pi->length = 0; return; } if (pi->consumed >= pi->length) { /* all bytes consumed */ pi->length = 0; return; } memmove(pi->buf, pi->buf + pi->consumed, pi->length - pi->consumed); pi->length -= pi->consumed; } static void check_priv_dgram(struct port_input *pi, struct sockcred *cred) { /* process explicitly sends credentials */ if (cred) pi->priv = (cred->sc_euid == 0); else pi->priv = 0; } static void 
check_priv_stream(struct port_input *pi) { struct xucred ucred; socklen_t ucredlen; /* obtain the accept time credentials */ ucredlen = sizeof(ucred); if (getsockopt(pi->fd, 0, LOCAL_PEERCRED, &ucred, &ucredlen) == 0 && ucredlen >= sizeof(ucred) && ucred.cr_version == XUCRED_VERSION) pi->priv = (ucred.cr_uid == 0); else pi->priv = 0; } /* * Input from a stream socket. */ static int recv_stream(struct port_input *pi) { struct msghdr msg; struct iovec iov[1]; ssize_t len; if (pi->buf == NULL) { /* no buffer yet - allocate one */ if ((pi->buf = buf_alloc(0)) == NULL) { /* ups - could not get buffer. Return an error * the caller must close the transport. */ return (-1); } pi->buflen = buf_size(0); pi->consumed = 0; pi->length = 0; } /* try to get a message */ msg.msg_name = pi->peer; msg.msg_namelen = pi->peerlen; msg.msg_iov = iov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; iov[0].iov_base = pi->buf + pi->length; iov[0].iov_len = pi->buflen - pi->length; len = recvmsg(pi->fd, &msg, 0); if (len == -1 || len == 0) /* receive error */ return (-1); pi->length += len; if (pi->cred) check_priv_stream(pi); return (0); } /* * Input from a datagram socket. * Each receive should return one datagram. */ static int recv_dgram(struct port_input *pi, struct in_addr *laddr) { u_char embuf[1000]; char cbuf[CMSG_SPACE(SOCKCREDSIZE(CMGROUP_MAX)) + CMSG_SPACE(sizeof(struct in_addr))]; struct msghdr msg; struct iovec iov[1]; ssize_t len; struct cmsghdr *cmsg; struct sockcred *cred = NULL; if (pi->buf == NULL) { /* no buffer yet - allocate one */ if ((pi->buf = buf_alloc(0)) == NULL) { /* ups - could not get buffer. Read away input * and drop it */ (void)recvfrom(pi->fd, embuf, sizeof(embuf), 0, NULL, NULL); /* return error */ return (-1); } pi->buflen = buf_size(0); } /* try to get a message */ msg.msg_name = pi->peer; msg.msg_namelen = pi->peerlen; msg.msg_iov = iov; msg.msg_iovlen = 1; memset(cbuf, 0, sizeof(cbuf)); msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); msg.msg_flags = 0; iov[0].iov_base = pi->buf; iov[0].iov_len = pi->buflen; len = recvmsg(pi->fd, &msg, 0); if (len == -1 || len == 0) /* receive error */ return (-1); if (msg.msg_flags & MSG_TRUNC) { /* truncated - drop */ snmpd_stats.silentDrops++; snmpd_stats.inTooLong++; return (-1); } pi->length = (size_t)len; for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) { if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_RECVDSTADDR) memcpy(laddr, CMSG_DATA(cmsg), sizeof(struct in_addr)); if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDS) memcpy(cred, CMSG_DATA(cmsg), sizeof(struct sockcred)); } if (pi->cred) check_priv_dgram(pi, cred); return (0); } /* * Input from a socket */ int snmpd_input(struct port_input *pi, struct tport *tport) { u_char *sndbuf; size_t sndlen; struct snmp_pdu pdu; enum snmpd_input_err ierr, ferr; enum snmpd_proxy_err perr; int32_t vi; int ret; ssize_t slen; #ifdef USE_TCPWRAPPERS char client[16]; #endif struct msghdr msg; struct iovec iov[1]; char cbuf[CMSG_SPACE(sizeof(struct in_addr))]; struct cmsghdr *cmsgp; /* get input depending on the transport */ if (pi->stream) { msg.msg_control = NULL; msg.msg_controllen = 0; ret = recv_stream(pi); } else { struct in_addr laddr; memset(cbuf, 0, CMSG_SPACE(sizeof(struct in_addr))); msg.msg_control = cbuf; msg.msg_controllen = CMSG_SPACE(sizeof(struct in_addr)); cmsgp = CMSG_FIRSTHDR(&msg); cmsgp->cmsg_len = CMSG_LEN(sizeof(struct in_addr)); cmsgp->cmsg_level = IPPROTO_IP; 
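	/*
	 * Note: this control buffer does double duty.  recv_dgram()
	 * recovers the datagram's destination address from the
	 * IP_RECVDSTADDR control message it receives, and the header
	 * prepared here is meant to carry that address back as
	 * IP_SENDSRCADDR on the reply, so the response leaves from the
	 * address the request was sent to (unless no address was
	 * recovered, in which case the control data is dropped below).
	 */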
cmsgp->cmsg_type = IP_SENDSRCADDR; memcpy(&laddr, CMSG_DATA(cmsgp), sizeof(struct in_addr)); ret = recv_dgram(pi, &laddr); if (laddr.s_addr == 0) { msg.msg_control = NULL; msg.msg_controllen = 0; } } if (ret == -1) return (-1); #ifdef USE_TCPWRAPPERS /* * In case of AF_INET{6} peer, do hosts_access(5) check. */ if (pi->peer->sa_family != AF_LOCAL && inet_ntop(pi->peer->sa_family, &((const struct sockaddr_in *)(const void *)pi->peer)->sin_addr, client, sizeof(client)) != NULL) { request_set(&req, RQ_CLIENT_ADDR, client, 0); if (hosts_access(&req) == 0) { syslog(LOG_ERR, "refused connection from %.500s", eval_client(&req)); return (-1); } } else if (pi->peer->sa_family != AF_LOCAL) syslog(LOG_ERR, "inet_ntop(): %m"); #endif /* * Handle input */ ierr = snmp_input_start(pi->buf, pi->length, "SNMP", &pdu, &vi, &pi->consumed); if (ierr == SNMPD_INPUT_TRUNC) { /* need more bytes. This is ok only for streaming transports. * but only if we have not reached bufsiz yet. */ if (pi->stream) { if (pi->length == buf_size(0)) { snmpd_stats.silentDrops++; return (-1); } return (0); } snmpd_stats.silentDrops++; return (-1); } /* can't check for bad SET pdus here, because a proxy may have to * check the access first. We don't want to return an error response * to a proxy PDU with a wrong community */ if (ierr == SNMPD_INPUT_FAILED) { /* for streaming transports this is fatal */ if (pi->stream) return (-1); snmp_input_consume(pi); return (0); } if (ierr == SNMPD_INPUT_BAD_COMM) { snmp_input_consume(pi); return (0); } /* * If that is a module community and the module has a proxy function, * the hand it over to the module. */ if (comm != NULL && comm->owner != NULL && comm->owner->config->proxy != NULL) { perr = (*comm->owner->config->proxy)(&pdu, tport->transport, &tport->index, pi->peer, pi->peerlen, ierr, vi, !pi->cred || pi->priv); switch (perr) { case SNMPD_PROXY_OK: snmp_input_consume(pi); return (0); case SNMPD_PROXY_REJ: break; case SNMPD_PROXY_DROP: snmp_input_consume(pi); snmp_pdu_free(&pdu); snmpd_stats.proxyDrops++; return (0); case SNMPD_PROXY_BADCOMM: snmp_input_consume(pi); snmp_pdu_free(&pdu); snmpd_stats.inBadCommunityNames++; if (snmpd.auth_traps) snmp_send_trap(&oid_authenticationFailure, (struct snmp_value *)NULL); return (0); case SNMPD_PROXY_BADCOMMUSE: snmp_input_consume(pi); snmp_pdu_free(&pdu); snmpd_stats.inBadCommunityUses++; if (snmpd.auth_traps) snmp_send_trap(&oid_authenticationFailure, (struct snmp_value *)NULL); return (0); } } /* * Check type */ if (pdu.type == SNMP_PDU_RESPONSE || pdu.type == SNMP_PDU_TRAP || pdu.type == SNMP_PDU_TRAP2) { snmpd_stats.silentDrops++; snmpd_stats.inBadPduTypes++; snmp_pdu_free(&pdu); snmp_input_consume(pi); return (0); } /* * Check community */ if (pdu.version < SNMP_V3 && ((pi->cred && !pi->priv && pdu.type == SNMP_PDU_SET) || (community != COMM_WRITE && (pdu.type == SNMP_PDU_SET || community != COMM_READ)))) { snmpd_stats.inBadCommunityUses++; snmp_pdu_free(&pdu); snmp_input_consume(pi); if (snmpd.auth_traps) snmp_send_trap(&oid_authenticationFailure, (struct snmp_value *)NULL); return (0); } /* * Execute it. 
*/ if ((sndbuf = buf_alloc(1)) == NULL) { snmpd_stats.silentDrops++; snmp_pdu_free(&pdu); snmp_input_consume(pi); return (0); } ferr = snmp_input_finish(&pdu, pi->buf, pi->length, sndbuf, &sndlen, "SNMP", ierr, vi, NULL); if (ferr == SNMPD_INPUT_OK) { msg.msg_name = pi->peer; msg.msg_namelen = pi->peerlen; msg.msg_iov = iov; msg.msg_iovlen = 1; msg.msg_flags = 0; iov[0].iov_base = sndbuf; iov[0].iov_len = sndlen; slen = sendmsg(pi->fd, &msg, 0); if (slen == -1) syslog(LOG_ERR, "sendmsg: %m"); else if ((size_t)slen != sndlen) syslog(LOG_ERR, "sendmsg: short write %zu/%zu", sndlen, (size_t)slen); } snmp_pdu_free(&pdu); free(sndbuf); snmp_input_consume(pi); return (0); } /* * Send a PDU to a given port */ void snmp_send_port(void *targ, const struct asn_oid *port, struct snmp_pdu *pdu, const struct sockaddr *addr, socklen_t addrlen) { struct transport *trans = targ; struct tport *tp; u_char *sndbuf; size_t sndlen; ssize_t len; TAILQ_FOREACH(tp, &trans->table, link) if (asn_compare_oid(port, &tp->index) == 0) break; if (tp == 0) return; if ((sndbuf = buf_alloc(1)) == NULL) return; snmp_output(pdu, sndbuf, &sndlen, "SNMP PROXY"); len = trans->vtab->send(tp, sndbuf, sndlen, addr, addrlen); if (len == -1) syslog(LOG_ERR, "sendto: %m"); else if ((size_t)len != sndlen) syslog(LOG_ERR, "sendto: short write %zu/%zu", sndlen, (size_t)len); free(sndbuf); } /* * Close an input source */ void snmpd_input_close(struct port_input *pi) { if (pi->id != NULL) fd_deselect(pi->id); if (pi->fd >= 0) (void)close(pi->fd); if (pi->buf != NULL) free(pi->buf); } /* * Dump internal state. */ #ifdef USE_LIBBEGEMOT static void info_func(void) #else static void info_func(evContext ctx __unused, void *uap __unused, const void *tag __unused) #endif { struct lmodule *m; u_int i; char buf[10000]; syslog(LOG_DEBUG, "Dump of SNMPd %lu\n", (u_long)getpid()); for (i = 0; i < tree_size; i++) { switch (tree[i].type) { case SNMP_NODE_LEAF: sprintf(buf, "LEAF: %s %s", tree[i].name, asn_oid2str(&tree[i].oid)); break; case SNMP_NODE_COLUMN: sprintf(buf, "COL: %s %s", tree[i].name, asn_oid2str(&tree[i].oid)); break; } syslog(LOG_DEBUG, "%s", buf); } TAILQ_FOREACH(m, &lmodules, link) if (m->config->dump) (*m->config->dump)(); } /* * Re-read configuration */ #ifdef USE_LIBBEGEMOT static void config_func(void) #else static void config_func(evContext ctx __unused, void *uap __unused, const void *tag __unused) #endif { struct lmodule *m; if (read_config(config_file, NULL)) { syslog(LOG_ERR, "error reading config file '%s'", config_file); return; } TAILQ_FOREACH(m, &lmodules, link) if (m->config->config) (*m->config->config)(); } /* * On USR1 dump actual configuration. 
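 *
 * The signal handlers below only set bits in the global `work' mask;
 * the main loop picks the flags up outside of signal context (via
 * evWaitFor()/evDo() in the libisc-event build) and runs info_func()
 * or config_func() from there, which keeps the handlers
 * async-signal-safe.  A running daemon is thus reconfigured with a
 * plain kill -HUP, and dumped with kill -USR1, against the pid
 * recorded in the pid file.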
*/ static void onusr1(int s __unused) { work |= WORK_DOINFO; } static void onhup(int s __unused) { work |= WORK_RECONFIG; } static void onterm(int s __unused) { /* allow clean-up */ exit(0); } static void init_sigs(void) { struct sigaction sa; sa.sa_handler = onusr1; sa.sa_flags = SA_RESTART; sigemptyset(&sa.sa_mask); if (sigaction(SIGUSR1, &sa, NULL)) { syslog(LOG_ERR, "sigaction: %m"); exit(1); } sa.sa_handler = onhup; if (sigaction(SIGHUP, &sa, NULL)) { syslog(LOG_ERR, "sigaction: %m"); exit(1); } sa.sa_handler = onterm; sa.sa_flags = 0; sigemptyset(&sa.sa_mask); if (sigaction(SIGTERM, &sa, NULL)) { syslog(LOG_ERR, "sigaction: %m"); exit(1); } if (sigaction(SIGINT, &sa, NULL)) { syslog(LOG_ERR, "sigaction: %m"); exit(1); } } static void block_sigs(void) { sigset_t set; sigfillset(&set); if (sigprocmask(SIG_BLOCK, &set, &blocked_sigs) == -1) { syslog(LOG_ERR, "SIG_BLOCK: %m"); exit(1); } } static void unblock_sigs(void) { if (sigprocmask(SIG_SETMASK, &blocked_sigs, NULL) == -1) { syslog(LOG_ERR, "SIG_SETMASK: %m"); exit(1); } } /* * Shut down */ static void term(void) { (void)unlink(pid_file); } static void trans_stop(void) { struct transport *t; TAILQ_FOREACH(t, &transport_list, link) (void)t->vtab->stop(1); } /* * Define a macro from the command line */ static void do_macro(char *arg) { char *eq; int err; if ((eq = strchr(arg, '=')) == NULL) err = define_macro(arg, ""); else { *eq++ = '\0'; err = define_macro(arg, eq); } if (err == -1) { syslog(LOG_ERR, "cannot save macro: %m"); exit(1); } } /* * Re-implement getsubopt from scratch, because the second argument is broken * and will not compile with WARNS=5. */ static int getsubopt1(char **arg, const char *const *options, char **valp, char **optp) { static const char *const delim = ",\t "; u_int i; char *ptr; *optp = NULL; /* skip leading junk */ for (ptr = *arg; *ptr != '\0'; ptr++) if (strchr(delim, *ptr) == NULL) break; if (*ptr == '\0') { *arg = ptr; return (-1); } *optp = ptr; /* find the end of the option */ while (*++ptr != '\0') if (strchr(delim, *ptr) != NULL || *ptr == '=') break; if (*ptr != '\0') { if (*ptr == '=') { *ptr++ = '\0'; *valp = ptr; while (*ptr != '\0' && strchr(delim, *ptr) == NULL) ptr++; if (*ptr != '\0') *ptr++ = '\0'; } else *ptr++ = '\0'; } *arg = ptr; for (i = 0; *options != NULL; options++, i++) if (strcmp(*optp, *options) == 0) return (i); return (-1); } int main(int argc, char *argv[]) { int opt; FILE *fp; int background = 1; struct tport *p; const char *prefix = "snmpd"; struct lmodule *m; char *value = NULL, *option; /* XXX */ struct transport *t; #define DBG_DUMP 0 #define DBG_EVENTS 1 #define DBG_TRACE 2 static const char *const debug_opts[] = { "dump", "events", "trace", NULL }; snmp_printf = snmp_printf_func; snmp_error = snmp_error_func; snmp_debug = snmp_debug_func; asn_error = asn_error_func; while ((opt = getopt(argc, argv, "c:dD:e:hI:l:m:p:")) != EOF) switch (opt) { case 'c': strlcpy(config_file, optarg, sizeof(config_file)); break; case 'd': background = 0; break; case 'D': while (*optarg) { switch (getsubopt1(&optarg, debug_opts, &value, &option)) { case DBG_DUMP: debug.dump_pdus = 1; break; case DBG_EVENTS: debug.evdebug++; break; case DBG_TRACE: if (value == NULL) syslog(LOG_ERR, "no value for 'trace'"); else snmp_trace = strtoul(value, NULL, 0); break; case -1: if (suboptarg) syslog(LOG_ERR, "unknown debug flag '%s'", option); else syslog(LOG_ERR, "missing debug flag"); break; } } break; case 'e': strlcpy(engine_file, optarg, sizeof(engine_file)); break; case 'h': fprintf(stderr, "%s", 
usgtxt); exit(0); case 'I': syspath = optarg; break; case 'l': prefix = optarg; break; case 'm': do_macro(optarg); break; case 'p': strlcpy(pid_file, optarg, sizeof(pid_file)); break; } openlog(prefix, LOG_PID | (background ? 0 : LOG_PERROR), LOG_USER); setlogmask(LOG_UPTO(debug.logpri - 1)); if (background && daemon(0, 0) < 0) { syslog(LOG_ERR, "daemon: %m"); exit(1); } argc -= optind; argv += optind; progargs = argv; nprogargs = argc; srandomdev(); snmp_serial_no = random(); #ifdef USE_TCPWRAPPERS /* * Initialize hosts_access(3) handler. */ request_init(&req, RQ_DAEMON, "snmpd", 0); sock_methods(&req); #endif /* * Initialize the tree. */ if ((tree = malloc(sizeof(struct snmp_node) * CTREE_SIZE)) == NULL) { syslog(LOG_ERR, "%m"); exit(1); } memcpy(tree, ctree, sizeof(struct snmp_node) * CTREE_SIZE); tree_size = CTREE_SIZE; /* * Get standard communities */ (void)comm_define(1, "SNMP read", NULL, NULL); (void)comm_define(2, "SNMP write", NULL, NULL); community = COMM_INITIALIZE; trap_reqid = reqid_allocate(512, NULL); if (config_file[0] == '\0') snprintf(config_file, sizeof(config_file), PATH_CONFIG, prefix); init_actvals(); init_snmpd_engine(); this_tick = get_ticks(); start_tick = this_tick; /* start transports */ if (atexit(trans_stop) == -1) { syslog(LOG_ERR, "atexit failed: %m"); exit(1); } if (udp_trans.start() != SNMP_ERR_NOERROR) syslog(LOG_WARNING, "cannot start UDP transport"); if (lsock_trans.start() != SNMP_ERR_NOERROR) syslog(LOG_WARNING, "cannot start LSOCK transport"); #ifdef USE_LIBBEGEMOT if (debug.evdebug > 0) rpoll_trace = 1; #else if (evCreate(&evctx)) { syslog(LOG_ERR, "evCreate: %m"); exit(1); } if (debug.evdebug > 0) evSetDebug(evctx, 10, stderr); #endif if (engine_file[0] == '\0') snprintf(engine_file, sizeof(engine_file), PATH_ENGINE, prefix); if (read_config(config_file, NULL)) { syslog(LOG_ERR, "error in config file"); exit(1); } TAILQ_FOREACH(t, &transport_list, link) TAILQ_FOREACH(p, &t->table, link) t->vtab->init_port(p); init_sigs(); if (pid_file[0] == '\0') snprintf(pid_file, sizeof(pid_file), PATH_PID, prefix); if ((fp = fopen(pid_file, "w")) != NULL) { fprintf(fp, "%u", getpid()); fclose(fp); if (atexit(term) == -1) { syslog(LOG_ERR, "atexit failed: %m"); (void)remove(pid_file); exit(0); } } if (or_register(&oid_snmpMIB, "The MIB module for SNMPv2 entities.", NULL) == 0) { syslog(LOG_ERR, "cannot register SNMPv2 MIB"); exit(1); } if (or_register(&oid_begemotSnmpd, "The MIB module for the Begemot SNMPd.", NULL) == 0) { syslog(LOG_ERR, "cannot register begemotSnmpd MIB"); exit(1); } while ((m = TAILQ_FIRST(&modules_start)) != NULL) { m->flags &= ~LM_ONSTARTLIST; TAILQ_REMOVE(&modules_start, m, start); lm_start(m); } snmp_send_trap(&oid_coldStart, (struct snmp_value *)NULL); for (;;) { #ifndef USE_LIBBEGEMOT evEvent event; #endif struct lmodule *mod; TAILQ_FOREACH(mod, &lmodules, link) if (mod->config->idle != NULL) (*mod->config->idle)(); #ifndef USE_LIBBEGEMOT if (evGetNext(evctx, &event, EV_WAIT) == 0) { if (evDispatch(evctx, event)) syslog(LOG_ERR, "evDispatch: %m"); } else if (errno != EINTR) { syslog(LOG_ERR, "evGetNext: %m"); exit(1); } #else poll_dispatch(1); #endif if (work != 0) { block_sigs(); if (work & WORK_DOINFO) { #ifdef USE_LIBBEGEMOT info_func(); #else if (evWaitFor(evctx, &work, info_func, NULL, NULL) == -1) { syslog(LOG_ERR, "evWaitFor: %m"); exit(1); } #endif } if (work & WORK_RECONFIG) { #ifdef USE_LIBBEGEMOT config_func(); #else if (evWaitFor(evctx, &work, config_func, NULL, NULL) == -1) { syslog(LOG_ERR, "evWaitFor: %m"); exit(1); } 
#endif } work = 0; unblock_sigs(); #ifndef USE_LIBBEGEMOT if (evDo(evctx, &work) == -1) { syslog(LOG_ERR, "evDo: %m"); exit(1); } #endif } } return (0); } uint64_t get_ticks(void) { struct timeval tv; uint64_t ret; if (gettimeofday(&tv, NULL)) abort(); ret = tv.tv_sec * 100ULL + tv.tv_usec / 10000ULL; return (ret); } /* * Timer support */ /* * Trampoline for the non-repeatable timers. */ #ifdef USE_LIBBEGEMOT static void tfunc(int tid __unused, void *uap) #else static void tfunc(evContext ctx __unused, void *uap, struct timespec due __unused, struct timespec inter __unused) #endif { struct timer *tp = uap; LIST_REMOVE(tp, link); tp->func(tp->udata); free(tp); } /* * Trampoline for the repeatable timers. */ #ifdef USE_LIBBEGEMOT static void trfunc(int tid __unused, void *uap) #else static void trfunc(evContext ctx __unused, void *uap, struct timespec due __unused, struct timespec inter __unused) #endif { struct timer *tp = uap; tp->func(tp->udata); } /* * Start a one-shot timer */ void * timer_start(u_int ticks, void (*func)(void *), void *udata, struct lmodule *mod) { struct timer *tp; #ifndef USE_LIBBEGEMOT struct timespec due; #endif if ((tp = malloc(sizeof(struct timer))) == NULL) { syslog(LOG_CRIT, "out of memory for timer"); exit(1); } #ifndef USE_LIBBEGEMOT due = evAddTime(evNowTime(), evConsTime(ticks / 100, (ticks % 100) * 10000)); #endif tp->udata = udata; tp->owner = mod; tp->func = func; LIST_INSERT_HEAD(&timer_list, tp, link); #ifdef USE_LIBBEGEMOT if ((tp->id = poll_start_timer(ticks * 10, 0, tfunc, tp)) < 0) { syslog(LOG_ERR, "cannot set timer: %m"); exit(1); } #else if (evSetTimer(evctx, tfunc, tp, due, evConsTime(0, 0), &tp->id) == -1) { syslog(LOG_ERR, "cannot set timer: %m"); exit(1); } #endif return (tp); } /* * Start a repeatable timer. When used with USE_LIBBEGEMOT the first argument * is currently ignored and the initial number of ticks is set to the * repeat number of ticks. */ void * timer_start_repeat(u_int ticks __unused, u_int repeat_ticks, void (*func)(void *), void *udata, struct lmodule *mod) { struct timer *tp; #ifndef USE_LIBBEGEMOT struct timespec due; struct timespec inter; #endif if ((tp = malloc(sizeof(struct timer))) == NULL) { syslog(LOG_CRIT, "out of memory for timer"); exit(1); } #ifndef USE_LIBBEGEMOT due = evAddTime(evNowTime(), evConsTime(ticks / 100, (ticks % 100) * 10000)); inter = evConsTime(repeat_ticks / 100, (repeat_ticks % 100) * 10000); #endif tp->udata = udata; tp->owner = mod; tp->func = func; LIST_INSERT_HEAD(&timer_list, tp, link); #ifdef USE_LIBBEGEMOT if ((tp->id = poll_start_timer(repeat_ticks * 10, 1, trfunc, tp)) < 0) { syslog(LOG_ERR, "cannot set timer: %m"); exit(1); } #else if (evSetTimer(evctx, trfunc, tp, due, inter, &tp->id) == -1) { syslog(LOG_ERR, "cannot set timer: %m"); exit(1); } #endif return (tp); } /* * Stop a timer. */ void timer_stop(void *p) { struct timer *tp = p; LIST_REMOVE(tp, link); #ifdef USE_LIBBEGEMOT poll_stop_timer(tp->id); #else if (evClearTimer(evctx, tp->id) == -1) { syslog(LOG_ERR, "cannot stop timer: %m"); exit(1); } #endif free(p); } static void timer_flush(struct lmodule *mod) { struct timer *t, *t1; t = LIST_FIRST(&timer_list); while (t != NULL) { t1 = LIST_NEXT(t, link); if (t->owner == mod) timer_stop(t); t = t1; } } static void snmp_printf_func(const char *fmt, ...) 
{ va_list ap; static char *pend = NULL; char *ret, *new; va_start(ap, fmt); vasprintf(&ret, fmt, ap); va_end(ap); if (ret == NULL) return; if (pend != NULL) { if ((new = realloc(pend, strlen(pend) + strlen(ret) + 1)) == NULL) { free(ret); return; } pend = new; strcat(pend, ret); free(ret); } else pend = ret; while ((ret = strchr(pend, '\n')) != NULL) { *ret = '\0'; syslog(LOG_DEBUG, "%s", pend); if (strlen(ret + 1) == 0) { free(pend); pend = NULL; break; } strcpy(pend, ret + 1); } } static void snmp_error_func(const char *err, ...) { char errbuf[1000]; va_list ap; if (!(snmp_trace & LOG_SNMP_ERRORS)) return; va_start(ap, err); snprintf(errbuf, sizeof(errbuf), "SNMP: "); vsnprintf(errbuf + strlen(errbuf), sizeof(errbuf) - strlen(errbuf), err, ap); va_end(ap); syslog(LOG_ERR, "%s", errbuf); } static void snmp_debug_func(const char *err, ...) { char errbuf[1000]; va_list ap; va_start(ap, err); snprintf(errbuf, sizeof(errbuf), "SNMP: "); vsnprintf(errbuf+strlen(errbuf), sizeof(errbuf)-strlen(errbuf), err, ap); va_end(ap); syslog(LOG_DEBUG, "%s", errbuf); } static void asn_error_func(const struct asn_buf *b, const char *err, ...) { char errbuf[1000]; va_list ap; u_int i; if (!(snmp_trace & LOG_ASN1_ERRORS)) return; va_start(ap, err); snprintf(errbuf, sizeof(errbuf), "ASN.1: "); vsnprintf(errbuf + strlen(errbuf), sizeof(errbuf) - strlen(errbuf), err, ap); va_end(ap); if (b != NULL) { snprintf(errbuf + strlen(errbuf), sizeof(errbuf) - strlen(errbuf), " at"); for (i = 0; b->asn_len > i; i++) snprintf(errbuf + strlen(errbuf), sizeof(errbuf) - strlen(errbuf), " %02x", b->asn_cptr[i]); } syslog(LOG_ERR, "%s", errbuf); } /* * Create a new community */ u_int comm_define(u_int priv, const char *descr, struct lmodule *owner, const char *str) { struct community *c, *p; u_int ncomm; /* generate an identifier */ do { if ((ncomm = next_community_index++) == UINT_MAX) next_community_index = 1; TAILQ_FOREACH(c, &community_list, link) if (c->value == ncomm) break; } while (c != NULL); if ((c = malloc(sizeof(struct community))) == NULL) { syslog(LOG_ERR, "comm_define: %m"); return (0); } c->owner = owner; c->value = ncomm; c->descr = descr; c->string = NULL; c->private = priv; if (str != NULL) { if((c->string = malloc(strlen(str)+1)) == NULL) { free(c); return (0); } strcpy(c->string, str); } /* make index */ if (c->owner == NULL) { c->index.len = 1; c->index.subs[0] = 0; } else { c->index = c->owner->index; } c->index.subs[c->index.len++] = c->private; /* * Insert ordered */ TAILQ_FOREACH(p, &community_list, link) { if (asn_compare_oid(&p->index, &c->index) > 0) { TAILQ_INSERT_BEFORE(p, c, link); break; } } if (p == NULL) TAILQ_INSERT_TAIL(&community_list, c, link); return (c->value); } const char * comm_string(u_int ncomm) { struct community *p; TAILQ_FOREACH(p, &community_list, link) if (p->value == ncomm) return (p->string); return (NULL); } /* * Delete all communities allocated by a module */ static void comm_flush(struct lmodule *mod) { struct community *p, *p1; p = TAILQ_FIRST(&community_list); while (p != NULL) { p1 = TAILQ_NEXT(p, link); if (p->owner == mod) { free(p->string); TAILQ_REMOVE(&community_list, p, link); free(p); } p = p1; } } /* * Request ID handling. * * Allocate a new range of request ids. Use a first fit algorithm. 
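 *
 * Typical module usage (a sketch; MYMOD_NREQIDS is a hypothetical
 * constant): allocate a range once at initialization, draw request
 * ids from it for outgoing PDUs, and recognize matching responses by
 * mapping the id back to the range:
 *
 *	u_int type = reqid_allocate(MYMOD_NREQIDS, mod);
 *	...
 *	pdu->request_id = reqid_next(type);
 *	...
 *	if (reqid_istype(resp->request_id, type))
 *		the response belongs to this module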
 */
u_int
reqid_allocate(int size, struct lmodule *mod)
{
	u_int type;
	struct idrange *r, *r1;

	if (size <= 0 || size > INT32_MAX) {
		syslog(LOG_CRIT, "%s: size out of range: %d", __func__, size);
		return (0);
	}
	/* allocate a type id */
	do {
		if ((type = next_idrange++) == UINT_MAX)
			next_idrange = 1;
		TAILQ_FOREACH(r, &idrange_list, link)
			if (r->type == type)
				break;
	} while (r != NULL);

	/* find a range */
	if (TAILQ_EMPTY(&idrange_list))
		r = NULL;
	else {
		r = TAILQ_FIRST(&idrange_list);
		if (r->base < size) {
			while ((r1 = TAILQ_NEXT(r, link)) != NULL) {
				if (r1->base - (r->base + r->size) >= size)
					break;
				r = r1;
			}
			r = r1;
		}
		if (r == NULL) {
			r1 = TAILQ_LAST(&idrange_list, idrange_list);
			if (INT32_MAX - size + 1 < r1->base + r1->size) {
				syslog(LOG_ERR, "out of id ranges (%u)",
				    size);
				return (0);
			}
		}
	}

	/* allocate structure */
	if ((r1 = malloc(sizeof(struct idrange))) == NULL) {
		syslog(LOG_ERR, "%s: %m", __FUNCTION__);
		return (0);
	}

	r1->type = type;
	r1->size = size;
	r1->owner = mod;
	if (TAILQ_EMPTY(&idrange_list) || r == TAILQ_FIRST(&idrange_list)) {
		r1->base = 0;
		TAILQ_INSERT_HEAD(&idrange_list, r1, link);
	} else if (r == NULL) {
		r = TAILQ_LAST(&idrange_list, idrange_list);
		r1->base = r->base + r->size;
		TAILQ_INSERT_TAIL(&idrange_list, r1, link);
	} else {
		r = TAILQ_PREV(r, idrange_list, link);
		r1->base = r->base + r->size;
		TAILQ_INSERT_AFTER(&idrange_list, r, r1, link);
	}
	r1->next = r1->base;

	return (type);
}

int32_t
reqid_next(u_int type)
{
	struct idrange *r;
	int32_t id;

	TAILQ_FOREACH(r, &idrange_list, link)
		if (r->type == type)
			break;
	if (r == NULL) {
		syslog(LOG_CRIT, "wrong idrange type");
		abort();
	}
	if ((id = r->next++) == r->base + (r->size - 1))
		r->next = r->base;
	return (id);
}

int32_t
reqid_base(u_int type)
{
	struct idrange *r;

	TAILQ_FOREACH(r, &idrange_list, link)
		if (r->type == type)
			return (r->base);
	syslog(LOG_CRIT, "wrong idrange type");
	abort();
}

u_int
reqid_type(int32_t reqid)
{
	struct idrange *r;

	TAILQ_FOREACH(r, &idrange_list, link)
		if (reqid >= r->base && reqid <= r->base + (r->size - 1))
			return (r->type);
	return (0);
}

int
reqid_istype(int32_t reqid, u_int type)
{
	return (reqid_type(reqid) == type);
}

/*
 * Delete all request id ranges allocated by a module.
 */
static void
reqid_flush(struct lmodule *mod)
{
	struct idrange *p, *p1;

	p = TAILQ_FIRST(&idrange_list);
	while (p != NULL) {
		p1 = TAILQ_NEXT(p, link);
		if (p->owner == mod) {
			TAILQ_REMOVE(&idrange_list, p, link);
			free(p);
		}
		p = p1;
	}
}

/*
 * Merge the given tree for the given module into the main tree.
 */
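/*
 * The combined tree is re-sorted by OID with compare_node() below after
 * every merge, so node lookups can rely on OID order.  lm_start() performs
 * the merge for a freshly loaded module:
 *
 *	tree_merge(mod->config->tree, mod->config->tree_size, mod);
 */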
*/ static int compare_node(const void *v1, const void *v2) { const struct snmp_node *n1 = v1; const struct snmp_node *n2 = v2; return (asn_compare_oid(&n1->oid, &n2->oid)); } static int tree_merge(const struct snmp_node *ntree, u_int nsize, struct lmodule *mod) { struct snmp_node *xtree; u_int i; xtree = realloc(tree, sizeof(*tree) * (tree_size + nsize)); if (xtree == NULL) { syslog(LOG_ERR, "tree_merge: %m"); return (-1); } tree = xtree; memcpy(&tree[tree_size], ntree, sizeof(*tree) * nsize); for (i = 0; i < nsize; i++) tree[tree_size + i].tree_data = mod; tree_size += nsize; qsort(tree, tree_size, sizeof(tree[0]), compare_node); return (0); } /* * Remove all nodes belonging to the loadable module */ static void tree_unmerge(struct lmodule *mod) { u_int s, d; for(s = d = 0; s < tree_size; s++) if (tree[s].tree_data != mod) { if (s != d) tree[d] = tree[s]; d++; } tree_size = d; } /* * Loadable modules */ struct lmodule * lm_load(const char *path, const char *section) { struct lmodule *m; int err; int i; char *av[MAX_MOD_ARGS + 1]; int ac; u_int u; if ((m = malloc(sizeof(*m))) == NULL) { syslog(LOG_ERR, "lm_load: %m"); return (NULL); } m->handle = NULL; m->flags = 0; strcpy(m->section, section); if ((m->path = malloc(strlen(path) + 1)) == NULL) { syslog(LOG_ERR, "lm_load: %m"); goto err; } strcpy(m->path, path); /* * Make index */ m->index.subs[0] = strlen(section); m->index.len = m->index.subs[0] + 1; for (u = 0; u < m->index.subs[0]; u++) m->index.subs[u + 1] = section[u]; /* * Load the object file and locate the config structure */ if ((m->handle = dlopen(m->path, RTLD_NOW|RTLD_GLOBAL)) == NULL) { syslog(LOG_ERR, "lm_load: open %s", dlerror()); goto err; } if ((m->config = dlsym(m->handle, "config")) == NULL) { syslog(LOG_ERR, "lm_load: no 'config' symbol %s", dlerror()); goto err; } /* * Insert it into the right place */ INSERT_OBJECT_OID(m, &lmodules); /* preserve order */ if (community == COMM_INITIALIZE) { m->flags |= LM_ONSTARTLIST; TAILQ_INSERT_TAIL(&modules_start, m, start); } /* * make the argument vector. */ ac = 0; for (i = 0; i < nprogargs; i++) { if (strlen(progargs[i]) >= strlen(section) + 1 && strncmp(progargs[i], section, strlen(section)) == 0 && progargs[i][strlen(section)] == ':') { if (ac == MAX_MOD_ARGS) { syslog(LOG_WARNING, "too many arguments for " "module '%s", section); break; } av[ac++] = &progargs[i][strlen(section)+1]; } } av[ac] = NULL; /* * Run the initialization function */ if ((err = (*m->config->init)(m, ac, av)) != 0) { syslog(LOG_ERR, "lm_load: init failed: %d", err); TAILQ_REMOVE(&lmodules, m, link); goto err; } return (m); err: if ((m->flags & LM_ONSTARTLIST) != 0) TAILQ_REMOVE(&modules_start, m, start); if (m->handle) dlclose(m->handle); free(m->path); free(m); return (NULL); } /* * Start a module */ void lm_start(struct lmodule *mod) { const struct lmodule *m; /* * Merge tree. If this fails, unload the module. */ if (tree_merge(mod->config->tree, mod->config->tree_size, mod)) { lm_unload(mod); return; } /* * Read configuration */ if (read_config(config_file, mod)) { syslog(LOG_ERR, "error in config file"); lm_unload(mod); return; } if (mod->config->start) (*mod->config->start)(); mod->flags |= LM_STARTED; /* * Inform other modules */ TAILQ_FOREACH(m, &lmodules, link) if (m->config->loading) (*m->config->loading)(mod, 1); } /* * Unload a module. 
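 *
 * Unloading mirrors lm_load()/lm_start(): the module's subtree is
 * unmerged, its fini routine runs if the module was started, and any
 * resources still registered on its behalf (communities, request-id
 * ranges, timers, selected file descriptors) are released before the
 * object file is dlclose()d and the remaining modules are notified.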
*/ void lm_unload(struct lmodule *m) { int err; const struct lmodule *mod; TAILQ_REMOVE(&lmodules, m, link); if (m->flags & LM_ONSTARTLIST) TAILQ_REMOVE(&modules_start, m, start); tree_unmerge(m); if ((m->flags & LM_STARTED) && m->config->fini && (err = (*m->config->fini)()) != 0) syslog(LOG_WARNING, "lm_unload(%s): fini %d", m->section, err); comm_flush(m); reqid_flush(m); timer_flush(m); fd_flush(m); dlclose(m->handle); free(m->path); /* * Inform other modules */ TAILQ_FOREACH(mod, &lmodules, link) if (mod->config->loading) (*mod->config->loading)(m, 0); free(m); } /* * Register an object resource and return the index (or 0 on failures) */ u_int or_register(const struct asn_oid *or, const char *descr, struct lmodule *mod) { struct objres *objres, *or1; u_int idx; /* find a free index */ idx = 1; for (objres = TAILQ_FIRST(&objres_list); objres != NULL; objres = TAILQ_NEXT(objres, link)) { if ((or1 = TAILQ_NEXT(objres, link)) == NULL || or1->index > objres->index + 1) { idx = objres->index + 1; break; } } if ((objres = malloc(sizeof(*objres))) == NULL) return (0); objres->index = idx; objres->oid = *or; strlcpy(objres->descr, descr, sizeof(objres->descr)); objres->uptime = (uint32_t)(get_ticks() - start_tick); objres->module = mod; INSERT_OBJECT_INT(objres, &objres_list); systemg.or_last_change = objres->uptime; return (idx); } void or_unregister(u_int idx) { struct objres *objres; TAILQ_FOREACH(objres, &objres_list, link) if (objres->index == idx) { TAILQ_REMOVE(&objres_list, objres, link); free(objres); return; } } /* * RFC 3414 User-based Security Model support */ struct snmpd_usmstat * bsnmpd_get_usm_stats(void) { return (&snmpd_usmstats); } void bsnmpd_reset_usm_stats(void) { memset(&snmpd_usmstats, 0, sizeof(snmpd_usmstats)); } struct usm_user * usm_first_user(void) { return (SLIST_FIRST(&usm_userlist)); } struct usm_user * usm_next_user(struct usm_user *uuser) { if (uuser == NULL) return (NULL); return (SLIST_NEXT(uuser, up)); } struct usm_user * usm_find_user(uint8_t *engine, uint32_t elen, char *uname) { struct usm_user *uuser; SLIST_FOREACH(uuser, &usm_userlist, up) if (uuser->user_engine_len == elen && memcmp(uuser->user_engine_id, engine, elen) == 0 && strlen(uuser->suser.sec_name) == strlen(uname) && strcmp(uuser->suser.sec_name, uname) == 0) break; return (uuser); } static int usm_compare_user(struct usm_user *u1, struct usm_user *u2) { uint32_t i; if (u1->user_engine_len < u2->user_engine_len) return (-1); if (u1->user_engine_len > u2->user_engine_len) return (1); for (i = 0; i < u1->user_engine_len; i++) { if (u1->user_engine_id[i] < u2->user_engine_id[i]) return (-1); if (u1->user_engine_id[i] > u2->user_engine_id[i]) return (1); } if (strlen(u1->suser.sec_name) < strlen(u2->suser.sec_name)) return (-1); if (strlen(u1->suser.sec_name) > strlen(u2->suser.sec_name)) return (1); for (i = 0; i < strlen(u1->suser.sec_name); i++) { if (u1->suser.sec_name[i] < u2->suser.sec_name[i]) return (-1); if (u1->suser.sec_name[i] > u2->suser.sec_name[i]) return (1); } return (0); } struct usm_user * usm_new_user(uint8_t *eid, uint32_t elen, char *uname) { int cmp; struct usm_user *uuser, *temp, *prev; for (uuser = usm_first_user(); uuser != NULL; (uuser = usm_next_user(uuser))) { if (uuser->user_engine_len == elen && strlen(uname) == strlen(uuser->suser.sec_name) && strcmp(uname, uuser->suser.sec_name) == 0 && memcmp(eid, uuser->user_engine_id, elen) == 0) return (NULL); } if ((uuser = (struct usm_user *)malloc(sizeof(*uuser))) == NULL) return (NULL); - memset(uuser, 0, sizeof(struct 
usm_user)); + memset(uuser, 0, sizeof(*uuser)); strlcpy(uuser->suser.sec_name, uname, SNMP_ADM_STR32_SIZ); memcpy(uuser->user_engine_id, eid, elen); uuser->user_engine_len = elen; if ((prev = SLIST_FIRST(&usm_userlist)) == NULL || usm_compare_user(uuser, prev) < 0) { SLIST_INSERT_HEAD(&usm_userlist, uuser, up); return (uuser); } SLIST_FOREACH(temp, &usm_userlist, up) { if ((cmp = usm_compare_user(uuser, temp)) <= 0) break; prev = temp; } if (temp == NULL || cmp < 0) SLIST_INSERT_AFTER(prev, uuser, up); else if (cmp > 0) SLIST_INSERT_AFTER(temp, uuser, up); else { syslog(LOG_ERR, "User %s exists", uuser->suser.sec_name); free(uuser); return (NULL); } return (uuser); } void usm_delete_user(struct usm_user *uuser) { SLIST_REMOVE(&usm_userlist, uuser, usm_user, up); free(uuser); } void usm_flush_users(void) { struct usm_user *uuser; while ((uuser = SLIST_FIRST(&usm_userlist)) != NULL) { SLIST_REMOVE_HEAD(&usm_userlist, up); free(uuser); } SLIST_INIT(&usm_userlist); } /* * RFC 3415 View-based Access Control Model support */ struct vacm_user * vacm_first_user(void) { return (SLIST_FIRST(&vacm_userlist)); } struct vacm_user * vacm_next_user(struct vacm_user *vuser) { if (vuser == NULL) return (NULL); return (SLIST_NEXT(vuser, vvu)); } static int vacm_compare_user(struct vacm_user *v1, struct vacm_user *v2) { uint32_t i; if (v1->sec_model < v2->sec_model) return (-1); if (v1->sec_model > v2->sec_model) return (1); if (strlen(v1->secname) < strlen(v2->secname)) return (-1); if (strlen(v1->secname) > strlen(v2->secname)) return (1); for (i = 0; i < strlen(v1->secname); i++) { if (v1->secname[i] < v2->secname[i]) return (-1); if (v1->secname[i] > v2->secname[i]) return (1); } return (0); } struct vacm_user * vacm_new_user(int32_t smodel, char *uname) { int cmp; struct vacm_user *user, *temp, *prev; SLIST_FOREACH(user, &vacm_userlist, vvu) if (strcmp(uname, user->secname) == 0 && smodel == user->sec_model) return (NULL); if ((user = (struct vacm_user *)malloc(sizeof(*user))) == NULL) return (NULL); memset(user, 0, sizeof(*user)); user->group = &vacm_default_group; SLIST_INSERT_HEAD(&vacm_default_group.group_users, user, vvg); user->sec_model = smodel; strlcpy(user->secname, uname, sizeof(user->secname)); if ((prev = SLIST_FIRST(&vacm_userlist)) == NULL || vacm_compare_user(user, prev) < 0) { SLIST_INSERT_HEAD(&vacm_userlist, user, vvu); return (user); } SLIST_FOREACH(temp, &vacm_userlist, vvu) { if ((cmp = vacm_compare_user(user, temp)) <= 0) break; prev = temp; } if (temp == NULL || cmp < 0) SLIST_INSERT_AFTER(prev, user, vvu); else if (cmp > 0) SLIST_INSERT_AFTER(temp, user, vvu); else { syslog(LOG_ERR, "User %s exists", user->secname); free(user); return (NULL); } return (user); } int vacm_delete_user(struct vacm_user *user) { if (user->group != NULL && user->group != &vacm_default_group) { SLIST_REMOVE(&user->group->group_users, user, vacm_user, vvg); if (SLIST_EMPTY(&user->group->group_users)) { SLIST_REMOVE(&vacm_grouplist, user->group, vacm_group, vge); free(user->group); } } SLIST_REMOVE(&vacm_userlist, user, vacm_user, vvu); free(user); return (0); } int vacm_user_set_group(struct vacm_user *user, u_char *octets, u_int len) { struct vacm_group *group; if (len >= SNMP_ADM_STR32_SIZ) return (-1); SLIST_FOREACH(group, &vacm_grouplist, vge) if (strlen(group->groupname) == len && memcmp(octets, group->groupname, len) == 0) break; if (group == NULL) { if ((group = (struct vacm_group *)malloc(sizeof(*group))) == NULL) return (-1); memset(group, 0, sizeof(*group)); memcpy(group->groupname, octets, 
len); group->groupname[len] = '\0'; SLIST_INSERT_HEAD(&vacm_grouplist, group, vge); } SLIST_REMOVE(&user->group->group_users, user, vacm_user, vvg); SLIST_INSERT_HEAD(&group->group_users, user, vvg); user->group = group; return (0); } void vacm_groups_init(void) { SLIST_INSERT_HEAD(&vacm_grouplist, &vacm_default_group, vge); } struct vacm_access * vacm_first_access_rule(void) { return (TAILQ_FIRST(&vacm_accesslist)); } struct vacm_access * vacm_next_access_rule(struct vacm_access *acl) { if (acl == NULL) return (NULL); return (TAILQ_NEXT(acl, vva)); } static int vacm_compare_access_rule(struct vacm_access *v1, struct vacm_access *v2) { uint32_t i; if (strlen(v1->group->groupname) < strlen(v2->group->groupname)) return (-1); if (strlen(v1->group->groupname) > strlen(v2->group->groupname)) return (1); for (i = 0; i < strlen(v1->group->groupname); i++) { if (v1->group->groupname[i] < v2->group->groupname[i]) return (-1); if (v1->group->groupname[i] > v2->group->groupname[i]) return (1); } if (strlen(v1->ctx_prefix) < strlen(v2->ctx_prefix)) return (-1); if (strlen(v1->ctx_prefix) > strlen(v2->ctx_prefix)) return (1); for (i = 0; i < strlen(v1->ctx_prefix); i++) { if (v1->ctx_prefix[i] < v2->ctx_prefix[i]) return (-1); if (v1->ctx_prefix[i] > v2->ctx_prefix[i]) return (1); } if (v1->sec_model < v2->sec_model) return (-1); if (v1->sec_model > v2->sec_model) return (1); if (v1->sec_level < v2->sec_level) return (-1); if (v1->sec_level > v2->sec_level) return (1); return (0); } struct vacm_access * vacm_new_access_rule(char *gname, char *cprefix, int32_t smodel, int32_t slevel) { struct vacm_group *group; struct vacm_access *acl, *temp; TAILQ_FOREACH(acl, &vacm_accesslist, vva) { if (acl->group == NULL) continue; if (strcmp(gname, acl->group->groupname) == 0 && strcmp(cprefix, acl->ctx_prefix) == 0 && acl->sec_model == smodel && acl->sec_level == slevel) return (NULL); } /* Make sure the group exists */ SLIST_FOREACH(group, &vacm_grouplist, vge) if (strcmp(gname, group->groupname) == 0) break; if (group == NULL) return (NULL); if ((acl = (struct vacm_access *)malloc(sizeof(*acl))) == NULL) return (NULL); memset(acl, 0, sizeof(*acl)); acl->group = group; strlcpy(acl->ctx_prefix, cprefix, sizeof(acl->ctx_prefix)); acl->sec_model = smodel; acl->sec_level = slevel; if ((temp = TAILQ_FIRST(&vacm_accesslist)) == NULL || vacm_compare_access_rule(acl, temp) < 0) { TAILQ_INSERT_HEAD(&vacm_accesslist, acl, vva); return (acl); } TAILQ_FOREACH(temp, &vacm_accesslist, vva) if (vacm_compare_access_rule(acl, temp) < 0) { TAILQ_INSERT_BEFORE(temp, acl, vva); return (acl); } TAILQ_INSERT_TAIL(&vacm_accesslist, acl, vva); return (acl); } int vacm_delete_access_rule(struct vacm_access *acl) { TAILQ_REMOVE(&vacm_accesslist, acl, vva); free(acl); return (0); } struct vacm_view * vacm_first_view(void) { return (SLIST_FIRST(&vacm_viewlist)); } struct vacm_view * vacm_next_view(struct vacm_view *view) { if (view == NULL) return (NULL); return (SLIST_NEXT(view, vvl)); } static int vacm_compare_view(struct vacm_view *v1, struct vacm_view *v2) { uint32_t i; if (strlen(v1->viewname) < strlen(v2->viewname)) return (-1); if (strlen(v1->viewname) > strlen(v2->viewname)) return (1); for (i = 0; i < strlen(v1->viewname); i++) { if (v1->viewname[i] < v2->viewname[i]) return (-1); if (v1->viewname[i] > v2->viewname[i]) return (1); } return (asn_compare_oid(&v1->subtree, &v2->subtree)); } struct vacm_view * vacm_new_view(char *vname, struct asn_oid *oid) { int cmp; struct vacm_view *view, *temp, *prev; SLIST_FOREACH(view, 
&vacm_viewlist, vvl) if (strcmp(vname, view->viewname) == 0) return (NULL); if ((view = (struct vacm_view *)malloc(sizeof(*view))) == NULL) return (NULL); memset(view, 0, sizeof(*view)); strlcpy(view->viewname, vname, sizeof(view->viewname)); asn_append_oid(&view->subtree, oid); if ((prev = SLIST_FIRST(&vacm_viewlist)) == NULL || vacm_compare_view(view, prev) < 0) { SLIST_INSERT_HEAD(&vacm_viewlist, view, vvl); return (view); } SLIST_FOREACH(temp, &vacm_viewlist, vvl) { if ((cmp = vacm_compare_view(view, temp)) <= 0) break; prev = temp; } if (temp == NULL || cmp < 0) SLIST_INSERT_AFTER(prev, view, vvl); else if (cmp > 0) SLIST_INSERT_AFTER(temp, view, vvl); else { syslog(LOG_ERR, "View %s exists", view->viewname); free(view); return (NULL); } return (view); } int vacm_delete_view(struct vacm_view *view) { SLIST_REMOVE(&vacm_viewlist, view, vacm_view, vvl); free(view); return (0); } struct vacm_context * vacm_first_context(void) { return (SLIST_FIRST(&vacm_contextlist)); } struct vacm_context * vacm_next_context(struct vacm_context *vacmctx) { if (vacmctx == NULL) return (NULL); return (SLIST_NEXT(vacmctx, vcl)); } struct vacm_context * vacm_add_context(char *ctxname, int regid) { int cmp; struct vacm_context *ctx, *temp, *prev; SLIST_FOREACH(ctx, &vacm_contextlist, vcl) if (strcmp(ctxname, ctx->ctxname) == 0) { syslog(LOG_ERR, "Context %s exists", ctx->ctxname); return (NULL); } if ((ctx = (struct vacm_context *)malloc(sizeof(*ctx))) == NULL) return (NULL); memset(ctx, 0, sizeof(*ctx)); strlcpy(ctx->ctxname, ctxname, sizeof(ctx->ctxname)); ctx->regid = regid; if ((prev = SLIST_FIRST(&vacm_contextlist)) == NULL || strlen(ctx->ctxname) < strlen(prev->ctxname) || strcmp(ctx->ctxname, prev->ctxname) < 0) { SLIST_INSERT_HEAD(&vacm_contextlist, ctx, vcl); return (ctx); } SLIST_FOREACH(temp, &vacm_contextlist, vcl) { if (strlen(ctx->ctxname) < strlen(temp->ctxname) || strcmp(ctx->ctxname, temp->ctxname) < 0) { cmp = -1; break; } prev = temp; } if (temp == NULL || cmp < 0) SLIST_INSERT_AFTER(prev, ctx, vcl); else if (cmp > 0) SLIST_INSERT_AFTER(temp, ctx, vcl); else { syslog(LOG_ERR, "Context %s exists", ctx->ctxname); free(ctx); return (NULL); } return (ctx); } void vacm_flush_contexts(int regid) { struct vacm_context *ctx, *temp; SLIST_FOREACH_SAFE(ctx, &vacm_contextlist, vcl, temp) if (ctx->regid == regid) { SLIST_REMOVE(&vacm_contextlist, ctx, vacm_context, vcl); free(ctx); } } Index: user/ngie/bsnmp_cleanup/sys/arm/arm/pmap-v6.c =================================================================== --- user/ngie/bsnmp_cleanup/sys/arm/arm/pmap-v6.c (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/arm/arm/pmap-v6.c (revision 298468) @@ -1,6653 +1,6805 @@ /*- * Copyright (c) 1991 Regents of the University of California. * Copyright (c) 1994 John S. Dyson * Copyright (c) 1994 David Greenman * Copyright (c) 2005-2010 Alan L. Cox * Copyright (c) 2014-2016 Svatopluk Kraus * Copyright (c) 2014-2016 Michal Meloun * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. 
This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_vm.h" #include "opt_pmap.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #else #include #endif #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #ifndef DIAGNOSTIC #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #ifdef PMAP_DEBUG static void pmap_zero_page_check(vm_page_t m); void pmap_debug(int level); int pmap_pid_dump(int pid); #define PDEBUG(_lev_,_stat_) \ if (pmap_debug_level >= (_lev_)) \ ((_stat_)) #define dprintf printf int pmap_debug_level = 1; #else /* PMAP_DEBUG */ #define PDEBUG(_lev_,_stat_) /* Nothing */ #define dprintf(x, arg...) #endif /* PMAP_DEBUG */ /* * Level 2 page tables map definion ('max' is excluded). */ #define PT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) #define PT2V_MAX_ADDRESS ((vm_offset_t)PT2MAP + PT2MAP_SIZE) #define UPT2V_MIN_ADDRESS ((vm_offset_t)PT2MAP) #define UPT2V_MAX_ADDRESS \ ((vm_offset_t)(PT2MAP + (KERNBASE >> PT2MAP_SHIFT))) /* * Promotion to a 1MB (PTE1) page mapping requires that the corresponding * 4KB (PTE2) page mappings have identical settings for the following fields: */ #define PTE2_PROMOTE (PTE2_V | PTE2_A | PTE2_NM | PTE2_S | PTE2_NG | \ PTE2_NX | PTE2_RO | PTE2_U | PTE2_W | \ PTE2_ATTR_MASK) #define PTE1_PROMOTE (PTE1_V | PTE1_A | PTE1_NM | PTE1_S | PTE1_NG | \ PTE1_NX | PTE1_RO | PTE1_U | PTE1_W | \ PTE1_ATTR_MASK) #define ATTR_TO_L1(l2_attr) ((((l2_attr) & L2_TEX0) ? L1_S_TEX0 : 0) | \ (((l2_attr) & L2_C) ? L1_S_C : 0) | \ (((l2_attr) & L2_B) ? L1_S_B : 0) | \ (((l2_attr) & PTE2_A) ? PTE1_A : 0) | \ (((l2_attr) & PTE2_NM) ? PTE1_NM : 0) | \ (((l2_attr) & PTE2_S) ? PTE1_S : 0) | \ (((l2_attr) & PTE2_NG) ? PTE1_NG : 0) | \ (((l2_attr) & PTE2_NX) ? PTE1_NX : 0) | \ (((l2_attr) & PTE2_RO) ? PTE1_RO : 0) | \ (((l2_attr) & PTE2_U) ? PTE1_U : 0) | \ (((l2_attr) & PTE2_W) ? PTE1_W : 0)) #define ATTR_TO_L2(l1_attr) ((((l1_attr) & L1_S_TEX0) ? L2_TEX0 : 0) | \ (((l1_attr) & L1_S_C) ? L2_C : 0) | \ (((l1_attr) & L1_S_B) ? L2_B : 0) | \ (((l1_attr) & PTE1_A) ? PTE2_A : 0) | \ (((l1_attr) & PTE1_NM) ? PTE2_NM : 0) | \ (((l1_attr) & PTE1_S) ? PTE2_S : 0) | \ (((l1_attr) & PTE1_NG) ? PTE2_NG : 0) | \ (((l1_attr) & PTE1_NX) ? PTE2_NX : 0) | \ (((l1_attr) & PTE1_RO) ? PTE2_RO : 0) | \ (((l1_attr) & PTE1_U) ? PTE2_U : 0) | \ (((l1_attr) & PTE1_W) ? PTE2_W : 0)) /* * PTE2 descriptors creation macros. */ #define PTE2_ATTR_DEFAULT vm_memattr_to_pte2(VM_MEMATTR_DEFAULT) #define PTE2_ATTR_PT vm_memattr_to_pte2(pt_memattr) #define PTE2_KPT(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_PT) #define PTE2_KPT_NG(pa) PTE2_KERN_NG(pa, PTE2_AP_KRW, PTE2_ATTR_PT) #define PTE2_KRW(pa) PTE2_KERN(pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT) #define PTE2_KRO(pa) PTE2_KERN(pa, PTE2_AP_KR, PTE2_ATTR_DEFAULT) #define PV_STATS #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif /* * The boot_pt1 is used temporary in very early boot stage as L1 page table. 
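
[Ed. note: the ATTR_TO_L1()/ATTR_TO_L2() macros above are pure bit-for-bit translations, so an attribute converted L2 -> L1 -> L2 must come back unchanged. A minimal user-space sketch of that invariant; every T_* name and bit value below is invented for illustration and is not the kernel's encoding.]

#include <assert.h>
#include <stdint.h>

#define T_L2_RO	0x01	/* toy encodings only */
#define T_L2_NX	0x02
#define T_L1_RO	0x10
#define T_L1_NX	0x20

#define T_TO_L1(a)	((((a) & T_L2_RO) ? T_L1_RO : 0) | \
			 (((a) & T_L2_NX) ? T_L1_NX : 0))
#define T_TO_L2(a)	((((a) & T_L1_RO) ? T_L2_RO : 0) | \
			 (((a) & T_L1_NX) ? T_L2_NX : 0))

int
main(void)
{
	uint32_t a;

	/* Every combination of the translated fields must round-trip. */
	for (a = 0; a < 4; a++)
		assert(T_TO_L2(T_TO_L1(a)) == a);
	return (0);
}
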
* We can init many things with no memory allocation thanks to its static * allocation and this brings two main advantages: * (1) other cores can be started very simply, * (2) various boot loaders can be supported as its arguments can be processed * in virtual address space and can be moved to safe location before * first allocation happened. * Only disadvantage is that boot_pt1 is used only in very early boot stage. * However, the table is uninitialized and so lays in bss. Therefore kernel * image size is not influenced. * * QQQ: In the future, maybe, boot_pt1 can be used for soft reset and * CPU suspend/resume game. */ extern pt1_entry_t boot_pt1[]; vm_paddr_t base_pt1; pt1_entry_t *kern_pt1; pt2_entry_t *kern_pt2tab; pt2_entry_t *PT2MAP; static uint32_t ttb_flags; static vm_memattr_t pt_memattr; ttb_entry_t pmap_kern_ttb; struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static vm_offset_t kernel_vm_end_new; vm_offset_t kernel_vm_end = KERNBASE + NKPT2PG * NPT2_IN_PG * PTE1_SIZE; vm_offset_t vm_max_kernel_address; vm_paddr_t kernel_l1pa; static struct rwlock __aligned(CACHE_LINE_SIZE) pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; /* XXX: Is it used only the list in md_page? */ static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ vm_paddr_t first_managed_pa; #define pa_to_pvh(pa) (&pv_table[pte1_index(pa - first_managed_pa)]) /* * All those kernel PT submaps that BSD is so fond of */ struct sysmaps { struct mtx lock; pt2_entry_t *CMAP1; pt2_entry_t *CMAP2; pt2_entry_t *CMAP3; caddr_t CADDR1; caddr_t CADDR2; caddr_t CADDR3; }; static struct sysmaps sysmaps_pcpu[MAXCPU]; static pt2_entry_t *CMAP3; static caddr_t CADDR3; caddr_t _tmppt = 0; struct msgbuf *msgbufp = NULL; /* XXX move it to machdep.c */ /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt2_entry_t *PMAP1 = NULL, *PMAP2; static pt2_entry_t *PADDR1 = NULL, *PADDR2; #ifdef DDB static pt2_entry_t *PMAP3; static pt2_entry_t *PADDR3; static int PMAP3cpu __unused; /* for SMP only */ #endif #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte2_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte2_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte2_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static __inline void pt2_wirecount_init(vm_page_t m); static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va); void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size); /* * Function to set the debug level of the pmap code. 
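
[Ed. note: pa_to_pvh() above keeps one md_page per 1MB section, indexed from first_managed_pa. A toy model of the index arithmetic, with PTE1_SHIFT assumed to be 20 (1MB sections); all names are hypothetical.]

#include <assert.h>
#include <stdint.h>

#define T_PTE1_SHIFT	20	/* assumed: 1MB sections */

static unsigned
toy_pvh_index(uint64_t pa, uint64_t first_managed_pa)
{

	/* One pv head (md_page) per 1MB section of managed memory. */
	return ((unsigned)((pa - first_managed_pa) >> T_PTE1_SHIFT));
}

int
main(void)
{

	/* 0x20180000 lies in the second section above 0x20000000. */
	assert(toy_pvh_index(0x20180000, 0x20000000) == 1);
	return (0);
}
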
*/ #ifdef PMAP_DEBUG void pmap_debug(int level) { pmap_debug_level = level; dprintf("pmap_debug: level=%d\n", pmap_debug_level); } #endif /* PMAP_DEBUG */ /* * This table must correspond with the memory attribute configuration in vm.h. * The first entry is used for normal system mapping. * * Device memory is always marked as shared. * Normal memory is shared only in the SMP case. * Not-outer-shareable (NOS) bits are not used yet. * Class 6 cannot be used on ARM11. */ #define TEXDEF_TYPE_SHIFT 0 #define TEXDEF_TYPE_MASK 0x3 #define TEXDEF_INNER_SHIFT 2 #define TEXDEF_INNER_MASK 0x3 #define TEXDEF_OUTER_SHIFT 4 #define TEXDEF_OUTER_MASK 0x3 #define TEXDEF_NOS_SHIFT 6 #define TEXDEF_NOS_MASK 0x1 #define TEX(t, i, o, s) \ ((t) << TEXDEF_TYPE_SHIFT) | \ ((i) << TEXDEF_INNER_SHIFT) | \ ((o) << TEXDEF_OUTER_SHIFT | \ ((s) << TEXDEF_NOS_SHIFT)) static uint32_t tex_class[8] = { /* type inner cache outer cache */ TEX(PRRR_MEM, NMRR_WB_WA, NMRR_WB_WA, 0), /* 0 - ATTR_WB_WA */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 1 - ATTR_NOCACHE */ TEX(PRRR_DEV, NMRR_NC, NMRR_NC, 0), /* 2 - ATTR_DEVICE */ TEX(PRRR_SO, NMRR_NC, NMRR_NC, 0), /* 3 - ATTR_SO */ TEX(PRRR_MEM, NMRR_WT, NMRR_WT, 0), /* 4 - ATTR_WT */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 5 - NOT USED YET */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 6 - NOT USED YET */ TEX(PRRR_MEM, NMRR_NC, NMRR_NC, 0), /* 7 - NOT USED YET */ }; #undef TEX static uint32_t pte2_attr_tab[8] = { PTE2_ATTR_WB_WA, /* 0 - VM_MEMATTR_WB_WA */ PTE2_ATTR_NOCACHE, /* 1 - VM_MEMATTR_NOCACHE */ PTE2_ATTR_DEVICE, /* 2 - VM_MEMATTR_DEVICE */ PTE2_ATTR_SO, /* 3 - VM_MEMATTR_SO */ PTE2_ATTR_WT, /* 4 - VM_MEMATTR_WRITE_THROUGH */ 0, /* 5 - NOT USED YET */ 0, /* 6 - NOT USED YET */ 0 /* 7 - NOT USED YET */ }; CTASSERT(VM_MEMATTR_WB_WA == 0); CTASSERT(VM_MEMATTR_NOCACHE == 1); CTASSERT(VM_MEMATTR_DEVICE == 2); CTASSERT(VM_MEMATTR_SO == 3); CTASSERT(VM_MEMATTR_WRITE_THROUGH == 4); static inline uint32_t vm_memattr_to_pte2(vm_memattr_t ma) { KASSERT((u_int)ma < 5, ("%s: bad vm_memattr_t %d", __func__, ma)); return (pte2_attr_tab[(u_int)ma]); } static inline uint32_t vm_page_pte2_attr(vm_page_t m) { return (vm_memattr_to_pte2(m->md.pat_mode)); } /* * Convert TEX definition entry to TTB flags. */ static uint32_t encode_ttb_flags(int idx) { uint32_t inner, outer, nos, reg; inner = (tex_class[idx] >> TEXDEF_INNER_SHIFT) & TEXDEF_INNER_MASK; outer = (tex_class[idx] >> TEXDEF_OUTER_SHIFT) & TEXDEF_OUTER_MASK; nos = (tex_class[idx] >> TEXDEF_NOS_SHIFT) & TEXDEF_NOS_MASK; reg = nos << 5; reg |= outer << 3; if (cpuinfo.coherent_walk) reg |= (inner & 0x1) << 6; reg |= (inner & 0x2) >> 1; #ifdef SMP reg |= 1 << 1; #endif return reg; } /* * Set TEX remapping registers on the current CPU. */ void pmap_set_tex(void) { uint32_t prrr, nmrr; uint32_t type, inner, outer, nos; int i; #ifdef PMAP_PTE_NOCACHE /* XXX fixme */ if (cpuinfo.coherent_walk) { pt_memattr = VM_MEMATTR_WB_WA; ttb_flags = encode_ttb_flags(0); } else { pt_memattr = VM_MEMATTR_NOCACHE; ttb_flags = encode_ttb_flags(1); } #else pt_memattr = VM_MEMATTR_WB_WA; ttb_flags = encode_ttb_flags(0); #endif prrr = 0; nmrr = 0; /* Build remapping registers from TEX classes.
*/ for (i = 0; i < 8; i++) { type = (tex_class[i] >> TEXDEF_TYPE_SHIFT) & TEXDEF_TYPE_MASK; inner = (tex_class[i] >> TEXDEF_INNER_SHIFT) & TEXDEF_INNER_MASK; outer = (tex_class[i] >> TEXDEF_OUTER_SHIFT) & TEXDEF_OUTER_MASK; nos = (tex_class[i] >> TEXDEF_NOS_SHIFT) & TEXDEF_NOS_MASK; prrr |= type << (i * 2); prrr |= nos << (i + 24); nmrr |= inner << (i * 2); nmrr |= outer << (i * 2 + 16); } /* Add shareable bits for device memory. */ prrr |= PRRR_DS0 | PRRR_DS1; /* Add shareable bits for normal memory in SMP case. */ #ifdef SMP prrr |= PRRR_NS1; #endif cp15_prrr_set(prrr); cp15_nmrr_set(nmrr); /* Caches are disabled, so full TLB flush should be enough. */ tlb_flush_all_local(); } /* * KERNBASE must be a multiple of NPT2_IN_PG * PTE1_SIZE. In other words, * KERNBASE is mapped by the first L2 page table in an L2 page table page. It * meets the same constraint due to PT2MAP being placed just under KERNBASE. */ CTASSERT((KERNBASE & (NPT2_IN_PG * PTE1_SIZE - 1)) == 0); CTASSERT((KERNBASE - VM_MAXUSER_ADDRESS) >= PT2MAP_SIZE); /* * In crazy dreams, PAGE_SIZE could be a multiple of PTE2_SIZE in general. * For now, anyhow, the following check must be fulfilled. */ CTASSERT(PAGE_SIZE == PTE2_SIZE); /* * We don't want to mess up MI code with all MMU and PMAP definitions, * so some things, which depend on other ones, are defined independently. * Now, it is time to check that we don't screw up something. */ CTASSERT(PDRSHIFT == PTE1_SHIFT); /* * Check L1 and L2 page table entries definitions consistency. */ CTASSERT(NB_IN_PT1 == (sizeof(pt1_entry_t) * NPTE1_IN_PT1)); CTASSERT(NB_IN_PT2 == (sizeof(pt2_entry_t) * NPTE2_IN_PT2)); /* * Check L2 page tables page consistency. */ CTASSERT(PAGE_SIZE == (NPT2_IN_PG * NB_IN_PT2)); CTASSERT((1 << PT2PG_SHIFT) == NPT2_IN_PG); /* * Check PT2TAB consistency. * PT2TAB_ENTRIES is defined as a division of NPTE1_IN_PT1 by NPT2_IN_PG. * The division must leave no remainder. */ CTASSERT(NPTE1_IN_PT1 == (PT2TAB_ENTRIES * NPT2_IN_PG)); /* * A PT2MAP magic. * * All level 2 page tables (PT2s) are mapped continuously and accordingly * into PT2MAP address space. As PT2 size is less than PAGE_SIZE, this can * be done only if PAGE_SIZE is a multiple of PT2 size. All PT2s in one page * must be used together, but not necessarily at once. The first PT2 in a page * must map things on a correctly aligned address and the others must follow * in the right order. */ #define NB_IN_PT2TAB (PT2TAB_ENTRIES * sizeof(pt2_entry_t)) #define NPT2_IN_PT2TAB (NB_IN_PT2TAB / NB_IN_PT2) #define NPG_IN_PT2TAB (NB_IN_PT2TAB / PAGE_SIZE) /* * Check PT2TAB consistency. * NPT2_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by NB_IN_PT2. * NPG_IN_PT2TAB is defined as a division of NB_IN_PT2TAB by PAGE_SIZE. * Both divisions must leave no remainder. */ CTASSERT(NB_IN_PT2TAB == (NPT2_IN_PT2TAB * NB_IN_PT2)); CTASSERT(NB_IN_PT2TAB == (NPG_IN_PT2TAB * PAGE_SIZE)); /* * The implementation was made general, however, with the assumption * below in mind. In case of another value of NPG_IN_PT2TAB, * the code should be rechecked once more. */ CTASSERT(NPG_IN_PT2TAB == 1); /* * Get offset of PT2 in a page * associated with given PT1 index. */ static __inline u_int page_pt2off(u_int pt1_idx) { return ((pt1_idx & PT2PG_MASK) * NB_IN_PT2); } /* * Get physical address of PT2 * associated with given PT2s page and PT1 index. */ static __inline vm_paddr_t page_pt2pa(vm_paddr_t pgpa, u_int pt1_idx) { return (pgpa + page_pt2off(pt1_idx)); } /* * Get first entry of PT2 * associated with given PT2s page and PT1 index.
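
[Ed. note: a worked example of the page_pt2off()/page_pt2pa() arithmetic just defined, under the assumption of 4KB pages and 1KB PT2s (four PT2s per page, so PT2PG_MASK == 0x3); T_* names are stand-ins.]

#include <assert.h>
#include <stdint.h>

#define T_NB_IN_PT2	1024u	/* assumed PT2 size */
#define T_PT2PG_MASK	0x3u	/* four PT2s per 4KB page */

static uint32_t
toy_page_pt2off(uint32_t pt1_idx)
{

	/* PT2s of four consecutive PT1 slots share one page, in order. */
	return ((pt1_idx & T_PT2PG_MASK) * T_NB_IN_PT2);
}

int
main(void)
{

	assert(toy_page_pt2off(0) == 0);	/* first PT2 in its page */
	assert(toy_page_pt2off(5) == 1024);	/* 5 & 3 == 1: second PT2 */
	assert(toy_page_pt2off(7) == 3072);	/* 7 & 3 == 3: last PT2 */
	return (0);
}
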
*/ static __inline pt2_entry_t * page_pt2(vm_offset_t pgva, u_int pt1_idx) { return ((pt2_entry_t *)(pgva + page_pt2off(pt1_idx))); } /* * Get virtual address of PT2s page (mapped in PT2MAP) * which holds PT2 which holds entry which maps given virtual address. */ static __inline vm_offset_t pt2map_pt2pg(vm_offset_t va) { va &= ~(NPT2_IN_PG * PTE1_SIZE - 1); return ((vm_offset_t)pt2map_entry(va)); } /***************************************************************************** * * THREE pmap initialization milestones exist: * * locore.S * -> fundamental init (including MMU) in ASM * * initarm() * -> fundamental init continues in C * -> first available physical address is known * * pmap_bootstrap_prepare() -> FIRST PMAP MILESTONE (first epoch begins) * -> basic (safe) interface for physical address allocation is made * -> basic (safe) interface for virtual mapping is made * -> limited not SMP coherent work is possible * * -> more fundamental init continues in C * -> locks and some more things are available * -> all fundamental allocations and mappings are done * * pmap_bootstrap() -> SECOND PMAP MILESTONE (second epoch begins) * -> phys_avail[] and virtual_avail is set * -> control is passed to vm subsystem * -> physical and virtual address allocation are off limit * -> low level mapping functions, some SMP coherent, * are available, which cannot be used before vm subsystem * is being inited * * mi_startup() * -> vm subsystem is being inited * * pmap_init() -> THIRD PMAP MILESTONE (third epoch begins) * -> pmap is fully inited * *****************************************************************************/ /***************************************************************************** * * PMAP first stage initialization and utility functions * for pre-bootstrap epoch. * * After pmap_bootstrap_prepare() is called, the following functions * can be used: * * (1) strictly only for this stage functions for physical page allocations, * virtual space allocations, and mappings: * * vm_paddr_t pmap_preboot_get_pages(u_int num); * void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num); * vm_offset_t pmap_preboot_reserve_pages(u_int num); * vm_offset_t pmap_preboot_get_vpages(u_int num); * void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, * vm_prot_t prot, vm_memattr_t attr); * * (2) for all stages: * * vm_paddr_t pmap_kextract(vm_offset_t va); * * NOTE: This is not SMP coherent stage. * *****************************************************************************/ #define KERNEL_P2V(pa) \ ((vm_offset_t)((pa) - arm_physmem_kernaddr + KERNVIRTADDR)) #define KERNEL_V2P(va) \ ((vm_paddr_t)((va) - KERNVIRTADDR + arm_physmem_kernaddr)) static vm_paddr_t last_paddr; /* * Pre-bootstrap epoch page allocator. */ vm_paddr_t pmap_preboot_get_pages(u_int num) { vm_paddr_t ret; ret = last_paddr; last_paddr += num * PAGE_SIZE; return (ret); } /* * The fundamental initalization of PMAP stuff. * * Some things already happened in locore.S and some things could happen * before pmap_bootstrap_prepare() is called, so let's recall what is done: * 1. Caches are disabled. * 2. We are running on virtual addresses already with 'boot_pt1' * as L1 page table. * 3. So far, all virtual addresses can be converted to physical ones and * vice versa by the following macros: * KERNEL_P2V(pa) .... physical to virtual ones, * KERNEL_V2P(va) .... virtual to physical ones. * * What is done herein: * 1. The 'boot_pt1' is replaced by real kernel L1 page table 'kern_pt1'. * 2. 
PT2MAP magic is brought to live. * 3. Basic preboot functions for page allocations and mappings can be used. * 4. Everything is prepared for L1 cache enabling. * * Variations: * 1. To use second TTB register, so kernel and users page tables will be * separated. This way process forking - pmap_pinit() - could be faster, * it saves physical pages and KVA per a process, and it's simple change. * However, it will lead, due to hardware matter, to the following: * (a) 2G space for kernel and 2G space for users. * (b) 1G space for kernel in low addresses and 3G for users above it. * A question is: Is the case (b) really an option? Note that case (b) * does save neither physical memory and KVA. */ void pmap_bootstrap_prepare(vm_paddr_t last) { vm_paddr_t pt2pg_pa, pt2tab_pa, pa, size; vm_offset_t pt2pg_va; pt1_entry_t *pte1p; pt2_entry_t *pte2p; u_int i; uint32_t actlr_mask, actlr_set, l1_attr; /* * Now, we are going to make real kernel mapping. Note that we are * already running on some mapping made in locore.S and we expect * that it's large enough to ensure nofault access to physical memory * allocated herein before switch. * * As kernel image and everything needed before are and will be mapped * by section mappings, we align last physical address to PTE1_SIZE. */ last_paddr = pte1_roundup(last); /* * Allocate and zero page(s) for kernel L1 page table. * * Note that it's first allocation on space which was PTE1_SIZE * aligned and as such base_pt1 is aligned to NB_IN_PT1 too. */ base_pt1 = pmap_preboot_get_pages(NPG_IN_PT1); kern_pt1 = (pt1_entry_t *)KERNEL_P2V(base_pt1); bzero((void*)kern_pt1, NB_IN_PT1); pte1_sync_range(kern_pt1, NB_IN_PT1); /* Allocate and zero page(s) for kernel PT2TAB. */ pt2tab_pa = pmap_preboot_get_pages(NPG_IN_PT2TAB); kern_pt2tab = (pt2_entry_t *)KERNEL_P2V(pt2tab_pa); bzero(kern_pt2tab, NB_IN_PT2TAB); pte2_sync_range(kern_pt2tab, NB_IN_PT2TAB); /* Allocate and zero page(s) for kernel L2 page tables. */ pt2pg_pa = pmap_preboot_get_pages(NKPT2PG); pt2pg_va = KERNEL_P2V(pt2pg_pa); size = NKPT2PG * PAGE_SIZE; bzero((void*)pt2pg_va, size); pte2_sync_range((pt2_entry_t *)pt2pg_va, size); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated pages for kernel L2 page tables so that vm_page * structures representing these pages will be created. The vm_page * structures are required for promotion of the corresponding kernel * virtual addresses to section mappings. */ vm_phys_add_seg(pt2tab_pa, pmap_preboot_get_pages(0)); /* * Insert allocated L2 page table pages to PT2TAB and make * link to all PT2s in L1 page table. See how kernel_vm_end * is initialized. * * We play simple and safe. So every KVA will have underlaying * L2 page table, even kernel image mapped by sections. */ pte2p = kern_pt2tab_entry(KERNBASE); for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += PTE2_SIZE) pt2tab_store(pte2p++, PTE2_KPT(pa)); pte1p = kern_pte1(KERNBASE); for (pa = pt2pg_pa; pa < pt2pg_pa + size; pa += NB_IN_PT2) pte1_store(pte1p++, PTE1_LINK(pa)); /* Make section mappings for kernel. */ l1_attr = ATTR_TO_L1(PTE2_ATTR_DEFAULT); pte1p = kern_pte1(KERNBASE); for (pa = KERNEL_V2P(KERNBASE); pa < last; pa += PTE1_SIZE) pte1_store(pte1p++, PTE1_KERN(pa, PTE1_AP_KRW, l1_attr)); /* * Get free and aligned space for PT2MAP and make L1 page table links * to L2 page tables held in PT2TAB. * * Note that pages holding PT2s are stored in PT2TAB as pt2_entry_t * descriptors and PT2TAB page(s) itself is(are) used as PT2s. Thus * each entry in PT2TAB maps all PT2s in a page. 
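
[Ed. note: because every PT2 is mapped contiguously at PT2MAP, the address of the PTE2 that maps a given VA is a fixed linear function of that VA. A toy sketch of that lookup; the base address and the index formula (virtual page number, 4KB pages) are assumptions for illustration, not the kernel's exact pt2map_entry().]

#include <assert.h>
#include <stdint.h>

#define T_PAGE_SHIFT	12		/* assumed 4KB pages */
#define T_PT2MAP_BASE	0xbf800000u	/* assumed placement */

static uint32_t
toy_pt2map_entry_addr(uint32_t va)
{

	/* One 4-byte pte2 per virtual page, laid out linearly. */
	return (T_PT2MAP_BASE + (va >> T_PAGE_SHIFT) * 4);
}

int
main(void)
{

	/* The pte2 for the next virtual page sits 4 bytes further on. */
	assert(toy_pt2map_entry_addr(0xc0001000) ==
	    toy_pt2map_entry_addr(0xc0000000) + 4);
	return (0);
}
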
This implies that * virtual address of PT2MAP must be aligned to NPT2_IN_PG * PTE1_SIZE. */ PT2MAP = (pt2_entry_t *)(KERNBASE - PT2MAP_SIZE); pte1p = kern_pte1((vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { pte1_store(pte1p++, PTE1_LINK(pa)); } /* * Store PT2TAB in PT2TAB itself, i.e. self reference mapping. * Each pmap will hold own PT2TAB, so the mapping should be not global. */ pte2p = kern_pt2tab_entry((vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); } /* * Choose correct L2 page table and make mappings for allocations * made herein which replaces temporary locore.S mappings after a while. * Note that PT2MAP cannot be used until we switch to kern_pt1. * * Note, that these allocations started aligned on 1M section and * kernel PT1 was allocated first. Making of mappings must follow * order of physical allocations as we've used KERNEL_P2V() macro * for virtual addresses resolution. */ pte2p = kern_pt2tab_entry((vm_offset_t)kern_pt1); pt2pg_va = KERNEL_P2V(pte2_pa(pte2_load(pte2p))); pte2p = page_pt2(pt2pg_va, pte1_index((vm_offset_t)kern_pt1)); /* Make mapping for kernel L1 page table. */ for (pa = base_pt1, i = 0; i < NPG_IN_PT1; i++, pa += PTE2_SIZE) pte2_store(pte2p++, PTE2_KPT(pa)); /* Make mapping for kernel PT2TAB. */ for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) pte2_store(pte2p++, PTE2_KPT(pa)); /* Finally, switch from 'boot_pt1' to 'kern_pt1'. */ pmap_kern_ttb = base_pt1 | ttb_flags; cpuinfo_get_actlr_modifier(&actlr_mask, &actlr_set); reinit_mmu(pmap_kern_ttb, actlr_mask, actlr_set); /* * Initialize the first available KVA. As kernel image is mapped by * sections, we are leaving some gap behind. */ virtual_avail = (vm_offset_t)kern_pt2tab + NPG_IN_PT2TAB * PAGE_SIZE; } /* * Setup L2 page table page for given KVA. * Used in pre-bootstrap epoch. * * Note that we have allocated NKPT2PG pages for L2 page tables in advance * and used them for mapping KVA starting from KERNBASE. However, this is not * enough. Vectors and devices need L2 page tables too. Note that they are * even above VM_MAX_KERNEL_ADDRESS. */ static __inline vm_paddr_t pmap_preboot_pt2pg_setup(vm_offset_t va) { pt2_entry_t *pte2p, pte2; vm_paddr_t pt2pg_pa; /* Get associated entry in PT2TAB. */ pte2p = kern_pt2tab_entry(va); /* Just return, if PT2s page exists already. */ pte2 = pt2tab_load(pte2p); if (pte2_is_valid(pte2)) return (pte2_pa(pte2)); KASSERT(va >= VM_MAX_KERNEL_ADDRESS, ("%s: NKPT2PG too small", __func__)); /* * Allocate page for PT2s and insert it to PT2TAB. * In other words, map it into PT2MAP space. */ pt2pg_pa = pmap_preboot_get_pages(1); pt2tab_store(pte2p, PTE2_KPT(pt2pg_pa)); /* Zero all PT2s in allocated page. */ bzero((void*)pt2map_pt2pg(va), PAGE_SIZE); pte2_sync_range((pt2_entry_t *)pt2map_pt2pg(va), PAGE_SIZE); return (pt2pg_pa); } /* * Setup L2 page table for given KVA. * Used in pre-bootstrap epoch. */ static void pmap_preboot_pt2_setup(vm_offset_t va) { pt1_entry_t *pte1p; vm_paddr_t pt2pg_pa, pt2_pa; /* Setup PT2's page. */ pt2pg_pa = pmap_preboot_pt2pg_setup(va); pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(va)); /* Insert PT2 to PT1. */ pte1p = kern_pte1(va); pte1_store(pte1p, PTE1_LINK(pt2_pa)); } /* * Get L2 page entry associated with given KVA. * Used in pre-bootstrap epoch. */ static __inline pt2_entry_t* pmap_preboot_vtopte2(vm_offset_t va) { pt1_entry_t *pte1p; /* Setup PT2 if needed. 
*/ pte1p = kern_pte1(va); if (!pte1_is_valid(pte1_load(pte1p))) /* XXX - sections ?! */ pmap_preboot_pt2_setup(va); return (pt2map_entry(va)); } /* * Pre-bootstrap epoch page(s) mapping(s). */ void pmap_preboot_map_pages(vm_paddr_t pa, vm_offset_t va, u_int num) { u_int i; pt2_entry_t *pte2p; /* Map all the pages. */ for (i = 0; i < num; i++) { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, PTE2_KRW(pa)); va += PAGE_SIZE; pa += PAGE_SIZE; } } /* * Pre-bootstrap epoch virtual space alocator. */ vm_offset_t pmap_preboot_reserve_pages(u_int num) { u_int i; vm_offset_t start, va; pt2_entry_t *pte2p; /* Allocate virtual space. */ start = va = virtual_avail; virtual_avail += num * PAGE_SIZE; /* Zero the mapping. */ for (i = 0; i < num; i++) { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, 0); va += PAGE_SIZE; } return (start); } /* * Pre-bootstrap epoch page(s) allocation and mapping(s). */ vm_offset_t pmap_preboot_get_vpages(u_int num) { vm_paddr_t pa; vm_offset_t va; /* Allocate physical page(s). */ pa = pmap_preboot_get_pages(num); /* Allocate virtual space. */ va = virtual_avail; virtual_avail += num * PAGE_SIZE; /* Map and zero all. */ pmap_preboot_map_pages(pa, va, num); bzero((void *)va, num * PAGE_SIZE); return (va); } /* * Pre-bootstrap epoch page mapping(s) with attributes. */ void pmap_preboot_map_attr(vm_paddr_t pa, vm_offset_t va, vm_size_t size, vm_prot_t prot, vm_memattr_t attr) { u_int num; u_int l1_attr, l1_prot, l2_prot, l2_attr; pt1_entry_t *pte1p; pt2_entry_t *pte2p; l2_prot = prot & VM_PROT_WRITE ? PTE2_AP_KRW : PTE2_AP_KR; l2_prot |= (prot & VM_PROT_EXECUTE) ? PTE2_X : PTE2_NX; l2_attr = vm_memattr_to_pte2(attr); l1_prot = ATTR_TO_L1(l2_prot); l1_attr = ATTR_TO_L1(l2_attr); /* Map all the pages. */ num = round_page(size); while (num > 0) { if ((((va | pa) & PTE1_OFFSET) == 0) && (num >= PTE1_SIZE)) { pte1p = kern_pte1(va); pte1_store(pte1p, PTE1_KERN(pa, l1_prot, l1_attr)); va += PTE1_SIZE; pa += PTE1_SIZE; num -= PTE1_SIZE; } else { pte2p = pmap_preboot_vtopte2(va); pte2_store(pte2p, PTE2_KERN(pa, l2_prot, l2_attr)); va += PAGE_SIZE; pa += PAGE_SIZE; num -= PAGE_SIZE; } } } /* * Extract from the kernel page table the physical address * that is mapped by the given virtual address "va". */ vm_paddr_t pmap_kextract(vm_offset_t va) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t pte2; pte1 = pte1_load(kern_pte1(va)); if (pte1_is_section(pte1)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); } else if (pte1_is_link(pte1)) { /* * We should beware of concurrent promotion that changes * pte1 at this point. However, it's not a problem as PT2 * page is preserved by promotion in PT2TAB. So even if * it happens, using of PT2MAP is still safe. * * QQQ: However, concurrent removing is a problem which * ends in abort on PT2MAP space. Locking must be used * to deal with this. */ pte2 = pte2_load(pt2map_entry(va)); pa = pte2_pa(pte2) | (va & PTE2_OFFSET); } else { panic("%s: va %#x pte1 %#x", __func__, va, pte1); } return (pa); } /* * Extract from the kernel page table the physical address * that is mapped by the given virtual address "va". Also * return L2 page table entry which maps the address. * * This is only intended to be used for panic dumps. 
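
[Ed. note: the size/alignment decision inside pmap_preboot_map_attr() above amounts to a greedy loop: take a 1MB section step whenever both addresses are section-aligned and at least 1MB remains, otherwise take a 4KB page step. A standalone sketch with assumed sizes; toy_map() only prints what it would map.]

#include <stdint.h>
#include <stdio.h>

#define T_PTE1_SIZE	0x100000u	/* assumed 1MB sections */
#define T_PAGE_SIZE	0x1000u		/* assumed 4KB pages */

static void
toy_map(uint32_t pa, uint32_t va, uint32_t size)
{

	/* size is assumed to be a multiple of T_PAGE_SIZE already. */
	while (size > 0) {
		if (((va | pa) & (T_PTE1_SIZE - 1)) == 0 &&
		    size >= T_PTE1_SIZE) {
			printf("section %#x -> %#x\n", va, pa);
			va += T_PTE1_SIZE; pa += T_PTE1_SIZE;
			size -= T_PTE1_SIZE;
		} else {
			printf("page    %#x -> %#x\n", va, pa);
			va += T_PAGE_SIZE; pa += T_PAGE_SIZE;
			size -= T_PAGE_SIZE;
		}
	}
}

int
main(void)
{

	/* Two full sections, then one trailing page. */
	toy_map(0x20000000, 0xc0000000, 2 * T_PTE1_SIZE + T_PAGE_SIZE);
	return (0);
}
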
*/ vm_paddr_t pmap_dump_kextract(vm_offset_t va, pt2_entry_t *pte2p) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t pte2; pte1 = pte1_load(kern_pte1(va)); if (pte1_is_section(pte1)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); pte2 = pa | ATTR_TO_L2(pte1) | PTE2_V; } else if (pte1_is_link(pte1)) { pte2 = pte2_load(pt2map_entry(va)); pa = pte2_pa(pte2); } else { pte2 = 0; pa = 0; } if (pte2p != NULL) *pte2p = pte2; return (pa); } /***************************************************************************** * * PMAP second stage initialization and utility functions * for bootstrap epoch. * * After pmap_bootstrap() is called, the following functions for * mappings can be used: * * void pmap_kenter(vm_offset_t va, vm_paddr_t pa); * void pmap_kremove(vm_offset_t va); * vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, * int prot); * * NOTE: This is not SMP coherent stage. And physical page allocation is not * allowed during this stage. * *****************************************************************************/ /* * Initialize kernel PMAP locks and lists, kernel_pmap itself, and * reserve various virtual spaces for temporary mappings. */ void pmap_bootstrap(vm_offset_t firstaddr) { pt2_entry_t *unused __unused; struct sysmaps *sysmaps; u_int i; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_l1pa = (vm_paddr_t)kern_pt1; /* for libkvm */ kernel_pmap->pm_pt1 = kern_pt1; kernel_pmap->pm_pt2tab = kern_pt2tab; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* * Request a spin mutex so that changes to allpmaps cannot be * preempted by smp_rendezvous_cpus(). */ mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) do { \ v = (c)pmap_preboot_reserve_pages(n); \ p = pt2map_entry((vm_offset_t)v); \ } while (0) /* * Local CMAP1/CMAP2 are used for zeroing and copying pages. * Local CMAP3 is used for data cache cleaning. * Global CMAP3 is used for the idle process page zeroing. */ for (i = 0; i < MAXCPU; i++) { sysmaps = &sysmaps_pcpu[i]; mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1); SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1); SYSMAP(caddr_t, sysmaps->CMAP3, sysmaps->CADDR3, 1); } SYSMAP(caddr_t, CMAP3, CADDR3, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS); /* * _tmppt is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, _tmppt, 1); /* * PADDR1 and PADDR2 are used by pmap_pte2_quick() and pmap_pte2(), * respectively. PADDR3 is used by pmap_pte2_ddb(). */ SYSMAP(pt2_entry_t *, PMAP1, PADDR1, 1); SYSMAP(pt2_entry_t *, PMAP2, PADDR2, 1); #ifdef DDB SYSMAP(pt2_entry_t *, PMAP3, PADDR3, 1); #endif mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); /* * Note that in very short time in initarm(), we are going to * initialize phys_avail[] array and no futher page allocation * can happen after that until vm subsystem will be initialized. 
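
[Ed. note: the SYSMAP() macro above captures a reserved VA together with a pointer to its page-table slot, so the page behind that VA can later be switched with a single store. A toy model of the idiom; all toy_* names and constants are stand-ins.]

#include <assert.h>
#include <stdint.h>

typedef uint32_t toy_pte2_t;

static toy_pte2_t toy_pt2map[1024];		/* fake linear PTE array */
static uintptr_t toy_next_va = 0xc0400000;	/* fake reservation cursor */

static void
toy_sysmap(void **vap, toy_pte2_t **ptep)
{

	/* Hand out one page of VA and remember its PTE slot. */
	*vap = (void *)toy_next_va;
	*ptep = &toy_pt2map[(toy_next_va >> 12) % 1024];
	toy_next_va += 4096;
}

int
main(void)
{
	void *caddr;
	toy_pte2_t *cmap;

	toy_sysmap(&caddr, &cmap);
	assert(*cmap == 0);		/* reserved but not yet mapped */
	*cmap = 0x20000000u | 1;	/* later: aim the window at a page */
	(void)caddr;
	return (0);
}
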
*/ kernel_vm_end_new = kernel_vm_end; virtual_end = vm_max_kernel_address; } static void pmap_init_qpages(void) { struct pcpu *pc; int i; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_qmap_addr = kva_alloc(PAGE_SIZE); if (pc->pc_qmap_addr == 0) panic("%s: unable to allocate KVA", __func__); } } SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_qpages, NULL); /* * The function can already be use in second initialization stage. * As such, the function DOES NOT call pmap_growkernel() where PT2 * allocation can happen. So if used, be sure that PT2 for given * virtual address is allocated already! * * Add a wired page to the kva. * Note: not SMP coherent. */ static __inline void pmap_kenter_prot_attr(vm_offset_t va, vm_paddr_t pa, uint32_t prot, uint32_t attr) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pte1p = kern_pte1(va); if (!pte1_is_valid(pte1_load(pte1p))) { /* XXX - sections ?! */ /* * This is a very low level function, so PT2 and particularly * PT2PG associated with given virtual address must be already * allocated. It's a pain mainly during pmap initialization * stage. However, called after pmap initialization with * virtual address not under kernel_vm_end will lead to * the same misery. */ if (!pte2_is_valid(pte2_load(kern_pt2tab_entry(va)))) panic("%s: kernel PT2 not allocated!", __func__); } pte2p = pt2map_entry(va); pte2_store(pte2p, PTE2_KERN(pa, prot, attr)); } PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, PTE2_ATTR_DEFAULT); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt2_entry_t *pte2p; pte2p = pt2map_entry(va); pte2_clear(pte2p); } /* * Share new kernel PT2PG with all pmaps. * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pt2tab(vm_offset_t va, pt2_entry_t npte2) { pmap_t pmap; pt2_entry_t *pte2p; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pte2p = pmap_pt2tab_entry(pmap, va); pt2tab_store(pte2p, npte2); } mtx_unlock_spin(&allpmaps_lock); } /* * Share new kernel PTE1 with all pmaps. * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pte1(vm_offset_t va, pt1_entry_t npte1) { pmap_t pmap; pt1_entry_t *pte1p; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pte1p = pmap_pte1(pmap, va); pte1_store(pte1p, npte1); } mtx_unlock_spin(&allpmaps_lock); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. * * NOTE: Read the comments above pmap_kenter_prot_attr() as * the function is used herein! */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t pte1_offset; pt1_entry_t npte1; uint32_t l1prot, l2prot; uint32_t l1attr, l2attr; PDEBUG(1, printf("%s: virt = %#x, start = %#x, end = %#x (size = %#x)," " prot = %d\n", __func__, *virt, start, end, end - start, prot)); l2prot = (prot & VM_PROT_WRITE) ? PTE2_AP_KRW : PTE2_AP_KR; l2prot |= (prot & VM_PROT_EXECUTE) ? 
PTE2_X : PTE2_NX; l1prot = ATTR_TO_L1(l2prot); l2attr = PTE2_ATTR_DEFAULT; l1attr = ATTR_TO_L1(l2attr); va = *virt; /* * Does the physical address range's size and alignment permit at * least one section mapping to be created? */ pte1_offset = start & PTE1_OFFSET; if ((end - start) - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) >= PTE1_SIZE) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of section mappings. */ if ((va & PTE1_OFFSET) < pte1_offset) va = pte1_trunc(va) + pte1_offset; else if ((va & PTE1_OFFSET) > pte1_offset) va = pte1_roundup(va) + pte1_offset; } sva = va; while (start < end) { if ((start & PTE1_OFFSET) == 0 && end - start >= PTE1_SIZE) { KASSERT((va & PTE1_OFFSET) == 0, ("%s: misaligned va %#x", __func__, va)); npte1 = PTE1_KERN(start, l1prot, l1attr); pmap_kenter_pte1(va, npte1); va += PTE1_SIZE; start += PTE1_SIZE; } else { pmap_kenter_prot_attr(va, start, l2prot, l2attr); va += PAGE_SIZE; start += PAGE_SIZE; } } tlb_flush_range(sva, va - sva); *virt = va; return (sva); } /* * Make a temporary mapping for a physical address. * This is only intended to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; /* QQQ: 'i' should be less or equal to MAXDUMPPGS. */ va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); tlb_flush_local(va); return ((void *)crashdumpmap); } /************************************* * * TLB & cache maintenance routines. * *************************************/ /* * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_tlb_flush(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) tlb_flush(va); } PMAP_INLINE void pmap_tlb_flush_range(pmap_t pmap, vm_offset_t sva, vm_size_t size) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) tlb_flush_range(sva, size); } /* * Abuse the pte2 nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PTE2_* bits * are ever set, PTE2_V in particular. * - Assumes we can write to pte2s without pte2_store() atomic ops. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PTE2_V. * - Assumes a vm_offset_t will fit in a pte2 (true for arm). * Because PTE2_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_pte2list_alloc(vm_offset_t *head) { pt2_entry_t *pte2p; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte2p = pt2map_entry(va); *head = *pte2p; if (*head & PTE2_V) panic("%s: va with PTE2_V set!", __func__); *pte2p = 0; return (va); } static void pmap_pte2list_free(vm_offset_t *head, vm_offset_t va) { pt2_entry_t *pte2p; if (va & PTE2_V) panic("%s: freeing va with PTE2_V set!", __func__); pte2p = pt2map_entry(va); *pte2p = *head; /* virtual! PTE2_V is 0 though */ *head = va; } static void pmap_pte2list_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_pte2list_free(head, va); } } /***************************************************************************** * * PMAP third and final stage initialization. * * After pmap_init() is called, PMAP subsystem is fully initialized. 
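
[Ed. note: pmap_pte2list_alloc()/free()/init() above thread a LIFO freelist of unused KVA pages through their own (invalid) PTE slots, so the list costs no extra storage. A single-threaded toy version of the same trick; names and constants are invented. As in the original, init pushes the highest VA first, so the lowest VA is handed out first.]

#include <assert.h>
#include <stdint.h>

#define T_NPAGES	8
#define T_BASE		0xd0000000u

static uintptr_t toy_pte[T_NPAGES];	/* one PTE slot per page; 0 ends list */

static int
toy_idx(uintptr_t va)
{

	return ((int)((va - T_BASE) / 4096));
}

static void
toy_free(uintptr_t *head, uintptr_t va)
{

	toy_pte[toy_idx(va)] = *head;	/* thread old head through the slot */
	*head = va;
}

static uintptr_t
toy_alloc(uintptr_t *head)
{
	uintptr_t va;

	va = *head;
	assert(va != 0);		/* upstream panics when exhausted */
	*head = toy_pte[toy_idx(va)];
	toy_pte[toy_idx(va)] = 0;
	return (va);
}

int
main(void)
{
	uintptr_t head;
	int i;

	head = 0;
	for (i = T_NPAGES - 1; i >= 0; i--)	/* pmap_pte2list_init() order */
		toy_free(&head, T_BASE + (uintptr_t)i * 4096);
	assert(toy_alloc(&head) == T_BASE);	/* lowest VA comes out first */
	return (0);
}
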
* *****************************************************************************/ SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static u_long nkpt2pg = NKPT2PG; SYSCTL_ULONG(_vm_pmap, OID_AUTO, nkpt2pg, CTLFLAG_RD, &nkpt2pg, 0, "Pre-allocated pages for kernel PT2s"); static int sp_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, sp_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &sp_enabled, 0, "Are large page mappings enabled?"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pte1, CTLFLAG_RD, 0, "1MB page mapping counters"); static u_long pmap_pte1_demotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pte1_demotions, 0, "1MB page demotions"); static u_long pmap_pte1_mappings; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pte1_mappings, 0, "1MB page mappings"); static u_long pmap_pte1_p_failures; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pte1_p_failures, 0, "1MB page promotion failures"); static u_long pmap_pte1_promotions; SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pte1_promotions, 0, "1MB page promotions"); +static u_long pmap_pte1_kern_demotions; +SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_demotions, CTLFLAG_RD, + &pmap_pte1_kern_demotions, 0, "1MB page kernel demotions"); + +static u_long pmap_pte1_kern_promotions; +SYSCTL_ULONG(_vm_pmap_pte1, OID_AUTO, kern_promotions, CTLFLAG_RD, + &pmap_pte1_kern_promotions, 0, "1MB page kernel promotions"); + static __inline ttb_entry_t pmap_ttb_get(pmap_t pmap) { return (vtophys(pmap->pm_pt1) | ttb_flags); } /* * Initialize a vm_page's machine-dependent fields. * * Variations: * 1. Pages for L2 page tables are always not managed. So, pv_list and * pt2_wirecount can share same physical space. However, proper * initialization on a page alloc for page tables and reinitialization * on the page free must be ensured. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); pt2_wirecount_init(m); m->md.pat_mode = VM_MEMATTR_DEFAULT; } /* * Virtualization for faster way how to zero whole page. */ static __inline void pagezero(void *page) { bzero(page, PAGE_SIZE); } /* * Zero L2 page table page. * Use same KVA as in pmap_zero_page(). */ static __inline vm_paddr_t pmap_pt2pg_zero(vm_page_t m) { vm_paddr_t pa; struct sysmaps *sysmaps; pa = VM_PAGE_TO_PHYS(m); /* * XXX: For now, we map whole page even if it's already zero, * to sync it even if the sync is only DSB. */ sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (pte2_load(sysmaps->CMAP2) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(pa, PTE2_AP_KRW, vm_page_pte2_attr(m))); /* Even VM_ALLOC_ZERO request is only advisory. */ if ((m->flags & PG_ZERO) == 0) pagezero(sysmaps->CADDR2); pte2_sync_range((pt2_entry_t *)sysmaps->CADDR2, PAGE_SIZE); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); return (pa); } /* * Init just allocated page as L2 page table(s) holder * and return its physical address. */ static __inline vm_paddr_t pmap_pt2pg_init(pmap_t pmap, vm_offset_t va, vm_page_t m) { vm_paddr_t pa; pt2_entry_t *pte2p; /* Check page attributes. */ if (m->md.pat_mode != pt_memattr) pmap_page_set_memattr(m, pt_memattr); /* Zero page and init wire counts. 
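
[Ed. note: pmap_pt2pg_zero() above follows the classic per-CPU temporary-mapping discipline: pin to a CPU, lock that CPU's slot, install the mapping, do the work, then unmap and flush. A compilable toy with one slot and a pthread mutex standing in for the kernel primitives; all toy_* names are assumptions.]

#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <string.h>

/* One temporary-mapping slot; the kernel keeps one per CPU. */
static pthread_mutex_t toy_lock = PTHREAD_MUTEX_INITIALIZER;
static uintptr_t toy_cmap;		/* stand-in for the PTE slot */
static uint8_t toy_caddr[4096];		/* stand-in for the mapped window */

static void
toy_zero_phys_page(uintptr_t pa)
{

	/* Upstream also sched_pin()s so the per-CPU slot stays local. */
	pthread_mutex_lock(&toy_lock);
	assert(toy_cmap == 0);			/* slot must be free */
	toy_cmap = pa | 1;			/* "map" pa at the window */
	memset(toy_caddr, 0, sizeof(toy_caddr));
	toy_cmap = 0;				/* unmap; upstream flushes TLB */
	pthread_mutex_unlock(&toy_lock);
}

int
main(void)
{

	toy_zero_phys_page(0x20000000);
	return (0);
}
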
*/ pa = pmap_pt2pg_zero(m); pt2_wirecount_init(m); /* * Map page to PT2MAP address space for given pmap. * Note that PT2MAP space is shared with all pmaps. */ if (pmap == kernel_pmap) pmap_kenter_pt2tab(va, PTE2_KPT(pa)); else { pte2p = pmap_pt2tab_entry(pmap, va); pt2tab_store(pte2p, PTE2_KPT_NG(pa)); } return (pa); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; pt2_entry_t *pte2p, pte2; u_int i, pte1_idx, pv_npg; PDEBUG(1, printf("%s: phys_start = %#x\n", __func__, PHYSADDR)); /* * Initialize the vm page array entries for kernel pmap's * L2 page table pages allocated in advance. */ pte1_idx = pte1_index(KERNBASE - PT2MAP_SIZE); pte2p = kern_pt2tab_entry(KERNBASE - PT2MAP_SIZE); for (i = 0; i < nkpt2pg + NPG_IN_PT2TAB; i++, pte2p++) { vm_paddr_t pa; vm_page_t m; pte2 = pte2_load(pte2p); KASSERT(pte2_is_valid(pte2), ("%s: no valid entry", __func__)); pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], ("%s: L2 page table page is out of range", __func__)); m->pindex = pte1_idx; m->phys_addr = pa; pte1_idx += NPT2_IN_PG; } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.sp_enabled", &sp_enabled); if (sp_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("%s: can't assign to pagesizes[1]", __func__)); pagesizes[1] = PTE1_SIZE; } /* * Calculate the size of the pv head table for sections. * Handle the possibility that "vm_phys_segs[...].end" is zero. * Note that the table is only for sections which could be promoted. */ first_managed_pa = pte1_trunc(vm_phys_segs[0].start); pv_npg = (pte1_trunc(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) - first_managed_pa) / PTE1_SIZE + 1; /* * Allocate memory for the pv head table for sections. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("%s: not enough kvm for pv chunks", __func__); pmap_pte2list_init(&pv_vafree, pv_chunkbase, pv_maxchunks); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. 
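 */

[Ed. note: a worked instance of the pv_npg sizing computed in pmap_init() above, with assumed segment addresses (512MB of managed RAM starting at 0x20000000); the end address is trimmed by one page and truncated, then counted inclusively.]

#include <assert.h>
#include <stdint.h>

#define T_PTE1_SIZE	0x100000u
#define T_PAGE_SIZE	0x1000u
#define t_pte1_trunc(x)	((x) & ~(T_PTE1_SIZE - 1))

int
main(void)
{
	uint32_t first_managed_pa, seg_end, pv_npg;

	first_managed_pa = t_pte1_trunc(0x20000000u);	/* first seg start */
	seg_end = 0x40000000u;				/* last seg end */
	pv_npg = (t_pte1_trunc(seg_end - T_PAGE_SIZE) - first_managed_pa) /
	    T_PTE1_SIZE + 1;
	assert(pv_npg == 512);		/* 512MB managed: 512 section heads */
	return (0);
}
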
*/ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { u_int anychanged; pt2_entry_t *epte2p, *pte2p, pte2; vm_page_t m; vm_paddr_t pa; anychanged = 0; pte2p = pt2map_entry(sva); epte2p = pte2p + count; while (pte2p < epte2p) { m = *ma++; pa = VM_PAGE_TO_PHYS(m); pte2 = pte2_load(pte2p); if ((pte2_pa(pte2) != pa) || (pte2_attr(pte2) != vm_page_pte2_attr(m))) { anychanged++; pte2_store(pte2p, PTE2_KERN(pa, PTE2_AP_KRW, vm_page_pte2_attr(m))); } pte2p++; } if (__predict_false(anychanged)) tlb_flush_range(sva, count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } tlb_flush_range(sva, va - sva); } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap == vmspace_pmap(curthread->td_proc->p_vmspace))); } /* * If the given pmap is not the current or kernel pmap, the returned * pte2 must be released by passing it to pmap_pte2_release(). */ static pt2_entry_t * pmap_pte2(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) panic("%s: attempt to map PTE1", __func__); if (pte1_is_link(pte1)) { /* Are we current address space or kernel? */ if (pmap_is_current(pmap)) return (pt2map_entry(va)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); mtx_lock(&PMAP2mutex); if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); tlb_flush((vm_offset_t)PADDR2); } return (PADDR2 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } return (NULL); } /* * Releases a pte2 that was obtained from pmap_pte2(). * Be prepared for the pte2p being NULL. */ static __inline void pmap_pte2_release(pt2_entry_t *pte2p) { if ((pt2_entry_t *)(trunc_page((vm_offset_t)pte2p)) == PADDR2) { mtx_unlock(&PMAP2mutex); } } /* * Super fast pmap_pte2 routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire tlb flush for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt2_entry_t * pmap_pte2_quick(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) panic("%s: attempt to map PTE1", __func__); if (pte1_is_link(pte1)) { /* Are we current address space or kernel? */ if (pmap_is_current(pmap)) return (pt2map_entry(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("%s: curthread not pinned", __func__)); /* Note that L2 page table size is not equal to PAGE_SIZE. 
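
[Ed. note: pmap_pte2() above re-points the borrowed PMAP2 window, and pays for a TLB flush, only when the window does not already map the wanted PT2 page. The caching test in isolation, with invented names and a counter standing in for the flush.]

#include <assert.h>
#include <stdint.h>

static uint32_t toy_window_pa;	/* PT2 page the window currently maps */
static int toy_flushes;		/* simulated TLB flushes */

static void
toy_window_point(uint32_t pt2pg_pa)
{

	if (toy_window_pa != pt2pg_pa) {	/* miss: retarget and flush */
		toy_window_pa = pt2pg_pa;
		toy_flushes++;
	}					/* hit: reuse as-is */
}

int
main(void)
{

	toy_window_point(0x1000);
	toy_window_point(0x1000);	/* hit, no flush */
	toy_window_point(0x2000);
	assert(toy_flushes == 2);
	return (0);
}
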
*/ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } return (NULL); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; pt1_entry_t pte1; pt2_entry_t *pte2p; PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) pa = pte1_pa(pte1) | (va & PTE1_OFFSET); else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, va); pa = pte2_pa(pte2_load(pte2p)) | (va & PTE2_OFFSET); pmap_pte2_release(pte2p); } else pa = 0; PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { vm_paddr_t pa, lockpa; pt1_entry_t pte1; pt2_entry_t pte2, *pte2p; vm_page_t m; lockpa = 0; m = NULL; PMAP_LOCK(pmap); retry: pte1 = pte1_load(pmap_pte1(pmap, va)); if (pte1_is_section(pte1)) { if (!(pte1 & PTE1_RO) || !(prot & VM_PROT_WRITE)) { pa = pte1_pa(pte1) | (va & PTE1_OFFSET); if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) goto retry; m = PHYS_TO_VM_PAGE(pa); vm_page_hold(m); } } else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, va); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); if (pte2_is_valid(pte2) && (!(pte2 & PTE2_RO) || !(prot & VM_PROT_WRITE))) { pa = pte2_pa(pte2); if (vm_page_pa_tryrelock(pmap, pa, &lockpa)) goto retry; m = PHYS_TO_VM_PAGE(pa); vm_page_hold(m); } } PA_UNLOCK_COND(lockpa); PMAP_UNLOCK(pmap); return (m); } /* * Grow the number of kernel L2 page table entries, if needed. */ void pmap_growkernel(vm_offset_t addr) { vm_page_t m; vm_paddr_t pt2pg_pa, pt2_pa; pt1_entry_t pte1; pt2_entry_t pte2; PDEBUG(1, printf("%s: addr = %#x\n", __func__, addr)); /* * At all times, kernel_vm_end is the first KVA for which the underlying * L2 page table is either not allocated or not linked from the L1 page * table (not considering sections), except for two possible cases: * * (1) in the very beginning, as long as pmap_growkernel() has not * been called, it could be the first unused KVA (which is not * rounded up to PTE1_SIZE), * * (2) when all KVA space is mapped and kernel_map->max_offset * address is not rounded up to PTE1_SIZE. (For example, * it could be 0xFFFFFFFF.) */ kernel_vm_end = pte1_roundup(kernel_vm_end); mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, PTE1_SIZE); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { pte1 = pte1_load(kern_pte1(kernel_vm_end)); if (pte1_is_valid(pte1)) { kernel_vm_end += PTE1_SIZE; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } /* * kernel_vm_end_new is used in pmap_pinit() when kernel * mappings are entered to the new pmap all at once to avoid a race * between pmap_kenter_pte1() and the kernel_vm_end increase. * The same applies to pmap_kenter_pt2tab().
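/*
 * [Editorial sketch] How a caller might use pmap_extract_and_hold() from
 * above: the returned page is held, so it cannot be reused while the caller
 * works with it, and the hold must be dropped afterwards. Illustrative only.
 *
 *	vm_page_t m;
 *
 *	m = pmap_extract_and_hold(pmap, va, VM_PROT_READ);
 *	if (m != NULL) {
 *		// ... the page is safe to examine here ...
 *		vm_page_lock(m);
 *		vm_page_unhold(m);
 *		vm_page_unlock(m);
 *	}
 */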
*/ kernel_vm_end_new = kernel_vm_end + PTE1_SIZE; pte2 = pt2tab_load(kern_pt2tab_entry(kernel_vm_end)); if (!pte2_is_valid(pte2)) { /* * Install new PT2s page into kernel PT2TAB. */ m = vm_page_alloc(NULL, pte1_index(kernel_vm_end) & ~PT2PG_MASK, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) panic("%s: no memory to grow kernel", __func__); /* * QQQ: To link all new L2 page tables from L1 page * table now and so pmap_kenter_pte1() them * at once together with pmap_kenter_pt2tab() * could be nice speed up. However, * pmap_growkernel() does not happen so often... * QQQ: The other TTBR is another option. */ pt2pg_pa = pmap_pt2pg_init(kernel_pmap, kernel_vm_end, m); } else pt2pg_pa = pte2_pa(pte2); pt2_pa = page_pt2pa(pt2pg_pa, pte1_index(kernel_vm_end)); pmap_kenter_pte1(kernel_vm_end, PTE1_LINK(pt2_pa)); kernel_vm_end = kernel_vm_end_new; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = vm_max_kernel_address - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = vm_max_kernel_address - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /*********************************************** * * Pmap allocation/deallocation routines. * ***********************************************/ /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PDEBUG(1, printf("%s: pmap = %p\n", __func__, pmap)); PMAP_LOCK_INIT(pmap); /* * Kernel page table directory and pmap stuff around is already * initialized, we are using it right now and here. So, finish * only PMAP structures initialization for process0 ... * * Since the L1 page table and PT2TAB is shared with the kernel pmap, * which is already included in the list "allpmaps", this pmap does * not need to be inserted into that list. */ pmap->pm_pt1 = kern_pt1; pmap->pm_pt2tab = kern_pt2tab; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); CPU_SET(0, &pmap->pm_active); } static __inline void pte1_copy_nosync(pt1_entry_t *spte1p, pt1_entry_t *dpte1p, vm_offset_t sva, vm_offset_t eva) { u_int idx, count; idx = pte1_index(sva); count = (pte1_index(eva) - idx + 1) * sizeof(pt1_entry_t); bcopy(spte1p + idx, dpte1p + idx, count); } static __inline void pt2tab_copy_nosync(pt2_entry_t *spte2p, pt2_entry_t *dpte2p, vm_offset_t sva, vm_offset_t eva) { u_int idx, count; idx = pt2tab_index(sva); count = (pt2tab_index(eva) - idx + 1) * sizeof(pt2_entry_t); bcopy(spte2p + idx, dpte2p + idx, count); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; vm_paddr_t pa, pt2tab_pa; u_int i; PDEBUG(6, printf("%s: pmap = %p, pm_pt1 = %p\n", __func__, pmap, pmap->pm_pt1)); /* * No need to allocate L2 page table space yet but we do need * a valid L1 page table and PT2TAB table. * * Install shared kernel mappings to these tables. It's a little * tricky as some parts of KVA are reserved for vectors, devices, * and whatever else. These parts are supposed to be above * vm_max_kernel_address. 
Thus two regions should be installed: * * (1) <KERNBASE, kernel_vm_end), * (2) <vm_max_kernel_address, 0xFFFFFFFF>. * * QQQ: The second region should be stable enough to be installed * only once, at the time the tables are allocated. * QQQ: Maybe copy of both regions at once could be faster ... * QQQ: Maybe the other TTBR is an option. * * Finally, install own PT2TAB table to these tables. */ if (pmap->pm_pt1 == NULL) { pmap->pm_pt1 = (pt1_entry_t *)kmem_alloc_contig(kernel_arena, NB_IN_PT1, M_NOWAIT | M_ZERO, 0, -1UL, NB_IN_PT1, 0, pt_memattr); if (pmap->pm_pt1 == NULL) return (0); } if (pmap->pm_pt2tab == NULL) { /* * QQQ: (1) PT2TAB must be contiguous. If PT2TAB is one page * only, which should be the only size for 32 bit systems, * then we could allocate it with vm_page_alloc() and all * the stuff needed as other L2 page table pages. * (2) Note that a process PT2TAB is special L2 page table * page. Its mapping in kernel_arena is permanent and can * be used no matter which process is current. Its mapping * in PT2MAP can be used only for current process. */ pmap->pm_pt2tab = (pt2_entry_t *)kmem_alloc_attr(kernel_arena, NB_IN_PT2TAB, M_NOWAIT | M_ZERO, 0, -1UL, pt_memattr); if (pmap->pm_pt2tab == NULL) { /* * QQQ: As struct pmap is allocated from UMA with * UMA_ZONE_NOFREE flag, it's important to leave * no allocation in pmap if initialization failed. */ kmem_free(kernel_arena, (vm_offset_t)pmap->pm_pt1, NB_IN_PT1); pmap->pm_pt1 = NULL; return (0); } /* * QQQ: Each L2 page table page vm_page_t has pindex set to * pte1 index of virtual address mapped by this page. * It's not valid for non kernel PT2TABs themselves. * The pindex of these pages cannot be altered because * of the way they are allocated now. However, it * should not be a problem. */ } mtx_lock_spin(&allpmaps_lock); /* * To avoid race with pmap_kenter_pte1() and pmap_kenter_pt2tab(), * kernel_vm_end_new is used here instead of kernel_vm_end. */ pte1_copy_nosync(kern_pt1, pmap->pm_pt1, KERNBASE, kernel_vm_end_new - 1); pte1_copy_nosync(kern_pt1, pmap->pm_pt1, vm_max_kernel_address, 0xFFFFFFFF); pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, KERNBASE, kernel_vm_end_new - 1); pt2tab_copy_nosync(kern_pt2tab, pmap->pm_pt2tab, vm_max_kernel_address, 0xFFFFFFFF); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Store PT2MAP PT2 pages (a.k.a. PT2TAB) in PT2TAB itself. * I.e. self reference mapping. The PT2TAB is private, however mapped * into shared PT2MAP space, so the mapping should not be global. */ pt2tab_pa = vtophys(pmap->pm_pt2tab); pte2p = pmap_pt2tab_entry(pmap, (vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPG_IN_PT2TAB; i++, pa += PTE2_SIZE) { pt2tab_store(pte2p++, PTE2_KPT_NG(pa)); } /* Insert PT2MAP PT2s into pmap PT1. */ pte1p = pmap_pte1(pmap, (vm_offset_t)PT2MAP); for (pa = pt2tab_pa, i = 0; i < NPT2_IN_PT2TAB; i++, pa += NB_IN_PT2) { pte1_store(pte1p++, PTE1_LINK(pa)); } /* * Now synchronize the new mappings which were made above. */ pte1_sync_range(pmap->pm_pt1, NB_IN_PT1); pte2_sync_range(pmap->pm_pt2tab, NB_IN_PT2TAB); CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } #ifdef INVARIANTS static boolean_t pt2tab_user_is_empty(pt2_entry_t *tab) { u_int i, end; end = pt2tab_index(VM_MAXUSER_ADDRESS); for (i = 0; i < end; i++) if (tab[i] != 0) return (FALSE); return (TRUE); } #endif /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings.
*/ void pmap_release(pmap_t pmap) { #ifdef INVARIANTS vm_offset_t start, end; #endif KASSERT(pmap->pm_stats.resident_count == 0, ("%s: pmap resident count %ld != 0", __func__, pmap->pm_stats.resident_count)); KASSERT(pt2tab_user_is_empty(pmap->pm_pt2tab), ("%s: has allocated user PT2(s)", __func__)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("%s: pmap %p is active on some CPU(s)", __func__, pmap)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); #ifdef INVARIANTS start = pte1_index(KERNBASE) * sizeof(pt1_entry_t); end = (pte1_index(0xFFFFFFFF) + 1) * sizeof(pt1_entry_t); bzero((char *)pmap->pm_pt1 + start, end - start); start = pt2tab_index(KERNBASE) * sizeof(pt2_entry_t); end = (pt2tab_index(0xFFFFFFFF) + 1) * sizeof(pt2_entry_t); bzero((char *)pmap->pm_pt2tab + start, end - start); #endif /* * We are leaving PT1 and PT2TAB allocated on released pmap, * so hopefully UMA vmspace_zone will always be initialized with * the UMA_ZONE_NOFREE flag. */ } /********************************************************* * * L2 table pages and their page management routines. * *********************************************************/ /* * Virtual interface for L2 page table wire counting. * * Each L2 page table in a page has its own counter which counts the number * of valid mappings in the table. The global page counter counts mappings * in all tables in the page, plus a single self-mapping in PT2TAB. * * During a promotion we leave the associated L2 page table counter * untouched, so the table (strictly speaking a page which holds it) * is never freed if promoted. * * If a page m->wire_count == 1 then no valid mappings exist in any L2 page * table in the page and the page itself is only mapped in PT2TAB. */ static __inline void pt2_wirecount_init(vm_page_t m) { u_int i; /* * Note: A page m is allocated with VM_ALLOC_WIRED flag and * m->wire_count should be already set correctly. * So, there is no need to set it again herein. */ for (i = 0; i < NPT2_IN_PG; i++) m->md.pt2_wirecount[i] = 0; } static __inline void pt2_wirecount_inc(vm_page_t m, uint32_t pte1_idx) { /* * Note: A just modified pte2 (i.e. already allocated) * acquires one extra reference which must be * explicitly cleared. It influences the KASSERTs herein. * All L2 page tables in a page always belong to the same * pmap, so we allow only one extra reference for the page.
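/*
 * [Editorial sketch] The wire counting contract described above implies an
 * invariant that a debugging build could check: a PT2 page's wire_count
 * equals one (its own PT2TAB mapping) plus the sum of the per-table
 * counters. The checker below is hypothetical, not part of this change.
 *
 *	static void
 *	pt2pg_check_wirecounts(vm_page_t m)
 *	{
 *		u_int i, sum;
 *
 *		sum = 0;
 *		for (i = 0; i < NPT2_IN_PG; i++)
 *			sum += m->md.pt2_wirecount[i];
 *		KASSERT(m->wire_count == 1 + sum,
 *		    ("PT2PG %p: wire_count %u != 1 + %u", m,
 *		    m->wire_count, sum));
 *	}
 */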
*/ KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] < (NPTE2_IN_PT2 + 1), ("%s: PT2 is overflowing ...", __func__)); KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), ("%s: PT2PG is overflowing ...", __func__)); m->wire_count++; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]++; } static __inline void pt2_wirecount_dec(vm_page_t m, uint32_t pte1_idx) { KASSERT(m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] != 0, ("%s: PT2 is underflowing ...", __func__)); KASSERT(m->wire_count > 1, ("%s: PT2PG is underflowing ...", __func__)); m->wire_count--; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]--; } static __inline void pt2_wirecount_set(vm_page_t m, uint32_t pte1_idx, uint16_t count) { KASSERT(count <= NPTE2_IN_PT2, ("%s: invalid count %u", __func__, count)); KASSERT(m->wire_count > m->md.pt2_wirecount[pte1_idx & PT2PG_MASK], ("%s: PT2PG corrupting (%u, %u) ...", __func__, m->wire_count, m->md.pt2_wirecount[pte1_idx & PT2PG_MASK])); m->wire_count -= m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]; m->wire_count += count; m->md.pt2_wirecount[pte1_idx & PT2PG_MASK] = count; KASSERT(m->wire_count <= (NPTE2_IN_PG + 1), ("%s: PT2PG is overflowed (%u) ...", __func__, m->wire_count)); } static __inline uint32_t pt2_wirecount_get(vm_page_t m, uint32_t pte1_idx) { return (m->md.pt2_wirecount[pte1_idx & PT2PG_MASK]); } static __inline boolean_t pt2_is_empty(vm_page_t m, vm_offset_t va) { return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == 0); } static __inline boolean_t pt2_is_full(vm_page_t m, vm_offset_t va) { return (m->md.pt2_wirecount[pte1_index(va) & PT2PG_MASK] == NPTE2_IN_PT2); } static __inline boolean_t pt2pg_is_empty(vm_page_t m) { return (m->wire_count == 1); } /* * This routine is called if the L2 page table * is not mapped correctly. */ static vm_page_t _pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) { uint32_t pte1_idx; pt1_entry_t *pte1p; pt2_entry_t pte2; vm_page_t m; vm_paddr_t pt2pg_pa, pt2_pa; pte1_idx = pte1_index(va); pte1p = pmap->pm_pt1 + pte1_idx; KASSERT(pte1_load(pte1p) == 0, ("%s: pm_pt1[%#x] is not zero: %#x", __func__, pte1_idx, pte1_load(pte1p))); pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, va)); if (!pte2_is_valid(pte2)) { /* * Install new PT2s page into pmap PT2TAB. */ m = vm_page_alloc(NULL, pte1_idx & ~PT2PG_MASK, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); VM_WAIT; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, * the L2 page table page may have been allocated. */ return (NULL); } pmap->pm_stats.resident_count++; pt2pg_pa = pmap_pt2pg_init(pmap, va, m); } else { pt2pg_pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pt2pg_pa); } pt2_wirecount_inc(m, pte1_idx); pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); pte1_store(pte1p, PTE1_LINK(pt2_pa)); return (m); } static vm_page_t pmap_allocpte2(pmap_t pmap, vm_offset_t va, u_int flags) { u_int pte1_idx; pt1_entry_t *pte1p, pte1; vm_page_t m; pte1_idx = pte1_index(va); retry: pte1p = pmap->pm_pt1 + pte1_idx; pte1 = pte1_load(pte1p); /* * This supports switching from a 1MB page to a * normal 4K page. */ if (pte1_is_section(pte1)) { (void)pmap_demote_pte1(pmap, pte1p, va); /* * Reload pte1 after demotion. * * Note: Demotion can even fail, as either the PT2 is not found * for the virtual address or the PT2PG cannot be allocated. */ pte1 = pte1_load(pte1p); } /* * If the L2 page table page is mapped, we just increment the * hold count, and activate it.
*/ if (pte1_is_link(pte1)) { m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); pt2_wirecount_inc(m, pte1_idx); } else { /* * Here if the PT2 isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte2(pmap, va, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused L2 page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free) { /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ #ifdef PMAP_DEBUG pmap_zero_page_check(m); #endif m->flags |= PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Unwire L2 page tables page. */ static void pmap_unwire_pt2pg(pmap_t pmap, vm_offset_t va, vm_page_t m) { pt1_entry_t *pte1p, opte1 __unused; pt2_entry_t *pte2p; uint32_t i; KASSERT(pt2pg_is_empty(m), ("%s: pmap %p PT2PG %p wired", __func__, pmap, m)); /* * Unmap all L2 page tables in the page from L1 page table. * * QQQ: Individual L2 page tables (except the last one) can be unmapped * earlier. However, we are doing that this way. */ KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), ("%s: pmap %p va %#x PT2PG %p bad index", __func__, pmap, va, m)); pte1p = pmap->pm_pt1 + m->pindex; for (i = 0; i < NPT2_IN_PG; i++, pte1p++) { KASSERT(m->md.pt2_wirecount[i] == 0, ("%s: pmap %p PT2 %u (PG %p) wired", __func__, pmap, i, m)); opte1 = pte1_load(pte1p); if (pte1_is_link(opte1)) { pte1_clear(pte1p); /* * Flush intermediate TLB cache. */ pmap_tlb_flush(pmap, (m->pindex + i) << PTE1_SHIFT); } #ifdef INVARIANTS else KASSERT((opte1 == 0) || pte1_is_section(opte1), ("%s: pmap %p va %#x bad pte1 %x at %u", __func__, pmap, va, opte1, i)); #endif } /* * Unmap the page from PT2TAB. */ pte2p = pmap_pt2tab_entry(pmap, va); (void)pt2tab_load_clear(pte2p); pmap_tlb_flush(pmap, pt2map_pt2pg(va)); m->wire_count = 0; pmap->pm_stats.resident_count--; /* * This is a release store so that the ordinary store unmapping * the L2 page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); } /* * Decrements a L2 page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static __inline boolean_t pmap_unwire_pt2(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { pt2_wirecount_dec(m, pte1_index(va)); if (pt2pg_is_empty(m)) { /* * QQQ: Wire count is zero, so whole page should be zero and * we can set PG_ZERO flag to it. * Note that when promotion is enabled, it takes some * more efforts. See pmap_unwire_pt2_all() below. */ pmap_unwire_pt2pg(pmap, va, m); pmap_add_delayed_free_list(m, free); return (TRUE); } else return (FALSE); } /* * Drop a L2 page table page's wire count at once, which is used to record * the number of valid L2 page table entries within the page. If the wire * count drops to zero, then the L2 page table page is unmapped. 
*/ static __inline void pmap_unwire_pt2_all(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { u_int pte1_idx = pte1_index(va); KASSERT(m->pindex == (pte1_idx & ~PT2PG_MASK), ("%s: PT2 page's pindex is wrong", __func__)); KASSERT(m->wire_count > pt2_wirecount_get(m, pte1_idx), ("%s: bad pt2 wire count %u > %u", __func__, m->wire_count, pt2_wirecount_get(m, pte1_idx))); /* * It's possible that the L2 page table was never used. * This happens when a section is created without promotion. */ if (pt2_is_full(m, va)) { pt2_wirecount_set(m, pte1_idx, 0); /* * QQQ: We clear L2 page table now, so when L2 page table page * is going to be freed, we can set its PG_ZERO flag ... * This function is called only on section mappings, so * hopefully it's not too big an overhead. * * XXX: If pmap is current, existing PT2MAP mapping could be * used for zeroing. */ pmap_zero_page_area(m, page_pt2off(pte1_idx), NB_IN_PT2); } #ifdef INVARIANTS else KASSERT(pt2_is_empty(m, va), ("%s: PT2 is not empty (%u)", __func__, pt2_wirecount_get(m, pte1_idx))); #endif if (pt2pg_is_empty(m)) { pmap_unwire_pt2pg(pmap, va, m); pmap_add_delayed_free_list(m, free); } } /* * After removing an L2 page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static boolean_t pmap_unuse_pt2(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt1_entry_t pte1; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (FALSE); pte1 = pte1_load(pmap_pte1(pmap, va)); mpte = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); return (pmap_unwire_pt2(pmap, va, mpte, free)); } /************************************* * * Page management routines. * *************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * Is the given page managed?
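/*
 * [Editorial note] The pv chunk geometry asserted above works out as
 * follows: one chunk is exactly one page and carries _NPCPV = 336 pv
 * entries tracked by _NPCM = 11 32-bit freemask words. Since
 * 336 = 10 * 32 + 16, words 0-9 use the full mask PC_FREE0_9 = 0xffffffff
 * and word 10 uses PC_FREE10 = 0x0000ffff (only its low 16 bits are valid).
 * A pv entry index maps to its bitmap position as in free_pv_entry():
 *
 *	field = idx / 32;	(which pc_map[] word)
 *	bit   = idx % 32;	(which bit within that word)
 *
 * e.g. idx 100 -> field 3, bit 4; idx 335 -> field 10, bit 15.
 */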
*/ static __inline boolean_t is_managed(vm_paddr_t pa) { vm_offset_t pgnum; vm_page_t m; pgnum = atop(pa); if (pgnum >= first_page) { m = PHYS_TO_VM_PAGE(pa); if (m == NULL) return (FALSE); if ((m->oflags & VPO_UNMANAGED) == 0) return (TRUE); } return (FALSE); } static __inline boolean_t pte1_is_managed(pt1_entry_t pte1) { return (is_managed(pte1_pa(pte1))); } static __inline boolean_t pte2_is_managed(pt2_entry_t pte2) { return (is_managed(pte2_pa(pte2))); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pt1_entry_t *pte1p; pmap_t pmap; pt2_entry_t *pte2p, tpte2; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffs(inuse) - 1; pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pte1p = pmap_pte1(pmap, va); if (pte1_is_section(pte1_load(pte1p))) continue; pte2p = pmap_pte2(pmap, va); tpte2 = pte2_load(pte2p); if ((tpte2 & PTE2_W) == 0) tpte2 = pte2_load_clear(pte2p); pmap_pte2_release(pte2p); if ((tpte2 & PTE2_W) != 0) continue; KASSERT(tpte2 != 0, ("pmap_pv_reclaim: pmap %p va %#x zero pte", pmap, va)); pmap_tlb_flush(pmap, va); m = PHYS_TO_VM_PAGE(pte2_pa(tpte2)); if (pte2_is_dirty(tpte2)) vm_page_dirty(m); if ((tpte2 & PTE2_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt2(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. 
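/*
 * [Editorial sketch] pmap_pv_reclaim() above walks the allocated entries of
 * a chunk by inverting the free bitmap and peeling off set bits with ffs().
 * The idiom in isolation (names match the surrounding code):
 *
 *	inuse = ~pc->pc_map[field] & pc_freemask[field];
 *	while (inuse != 0) {
 *		bit = ffs(inuse) - 1;		// lowest in-use entry
 *		pv = &pc->pc_pventry[field * 32 + bit];
 *		// ... tear down the mapping described by pv ...
 *		inuse &= ~(1UL << bit);		// clear and continue
 *	}
 */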
*/ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&vm_cnt.v_wire_count, 1); } pmap_free_zero_pages(&free); return (m_pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, PQ_NONE); vm_page_free(m); pmap_pte2list_free(&pv_vafree, (vm_offset_t)pc); } /* * Free the pv_entry back to the free list. */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } /* * Get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffs(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the pte2list "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_pte2list_alloc() completes. 
*/ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_pte2list_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } static void pmap_pv_demote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PTE1_OFFSET) == 0, ("pmap_pv_demote_pte1: pa is not 1mpage aligned")); /* * Transfer the 1mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = pte1_trunc(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pte1: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTE2_IN_PT2 - 1 pv entries. */ va_last = va + PTE1_SIZE - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pte1: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } static void pmap_pv_promote_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PTE1_OFFSET) == 0, ("pmap_pv_promote_pte1: pa is not 1mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 1mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_pv_reclaim() and that pmap_pv_reclaim() * removes one of the mappings that is being promoted. 
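/*
 * [Editorial note] Scale of the pv transfers above: a 1MB section covers
 * PTE1_SIZE / PAGE_SIZE = 256 small pages, so pmap_pv_demote_pte1() turns
 * one section pv entry into 256 per-page entries (1 moved + 255 created),
 * while pmap_pv_promote_pte1() below does the reverse (1 moved back + 255
 * freed). Both loops run from va up to va_last = va + PTE1_SIZE - PAGE_SIZE
 * inclusive.
 */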
*/ m = PHYS_TO_VM_PAGE(pa); va = pte1_trunc(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pte1: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTE2_IN_PT2 - 1 pv entries. */ va_last = va + PTE1_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a section. */ static boolean_t pmap_pv_insert_pte1(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } +static inline void +pmap_tlb_flush_pte1(pmap_t pmap, vm_offset_t va, pt1_entry_t npte1) +{ + + /* Kill all the small mappings or the big one only. */ + if (pte1_is_section(npte1)) + pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE) + else + pmap_tlb_flush(pmap, pte1_trunc(va)); +} + /* + * Update kernel pte1 on all pmaps. + * + * The following function is called only on one cpu with interrupts disabled. + * In SMP case, smp_rendezvous_cpus() is used to stop other cpus. This way + * nobody can invoke explicit hardware table walk during the update of pte1. + * Unsolicited hardware table walk can still happen, invoked by speculative + * data or instruction prefetch or even by speculative hardware table walk. + * + * The break-before-make approach should be implemented here. However, it's + * not so easy to do that for kernel mappings, as the kernel would then be + * unmapping, however voluntarily, mappings it may itself be using. + */ +static void +pmap_update_pte1_kernel(vm_offset_t va, pt1_entry_t npte1) +{ + pmap_t pmap; + pt1_entry_t *pte1p; + + /* + * Get current pmap. Interrupts should be disabled here + * so PCPU_GET() is done atomically. + */ + pmap = PCPU_GET(curpmap); + if (pmap == NULL) + pmap = kernel_pmap; + + /* + * (1) Change pte1 on current pmap. + * (2) Flush all obsolete TLB entries on current CPU. + * (3) Change pte1 on all pmaps. + * (4) Flush all obsolete TLB entries on all CPUs in SMP case. + */ + + pte1p = pmap_pte1(pmap, va); + pte1_store(pte1p, npte1); + + /* Kill all the small mappings or the big one only. */ + if (pte1_is_section(npte1)) { + pmap_pte1_kern_promotions++; + tlb_flush_range_local(pte1_trunc(va), PTE1_SIZE); + } else { + pmap_pte1_kern_demotions++; + tlb_flush_local(pte1_trunc(va)); + } + + /* + * In SMP case, this function is called when all cpus are at smp + * rendezvous, so there is no need to use 'allpmaps_lock' lock here. + * In UP case, the function is called with this lock locked. + */ + LIST_FOREACH(pmap, &allpmaps, pm_list) { + pte1p = pmap_pte1(pmap, va); + pte1_store(pte1p, npte1); + } + +#ifdef SMP + /* Kill all the small mappings or the big one only.
*/ + if (pte1_is_section(npte1)) + tlb_flush_range(pte1_trunc(va), PTE1_SIZE); + else + tlb_flush(pte1_trunc(va)); +#endif +} + +#ifdef SMP +struct pte1_action { + vm_offset_t va; + pt1_entry_t npte1; + u_int update; /* CPU that updates the PTE1 */ +}; + +static void +pmap_update_pte1_action(void *arg) +{ + struct pte1_action *act = arg; + + if (act->update == PCPU_GET(cpuid)) + pmap_update_pte1_kernel(act->va, act->npte1); +} + +/* + * Change pte1 on current pmap. + * Note that kernel pte1 must be changed on all pmaps. + * + * By the ARM ARM manual, the behaviour is UNPREDICTABLE when two or more TLB + * entries map the same VA. It's a problem when either promotion or demotion + * is being done. The pte1 update and appropriate TLB flush must be done + * atomically in general. + */ +static void +pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, + pt1_entry_t npte1) +{ + + if (pmap == kernel_pmap) { + struct pte1_action act; + + sched_pin(); + act.va = va; + act.npte1 = npte1; + act.update = PCPU_GET(cpuid); + smp_rendezvous_cpus(all_cpus, smp_no_rendevous_barrier, + pmap_update_pte1_action, NULL, &act); + sched_unpin(); + } else { + register_t cspr; + + /* + * Use break-before-make approach for changing userland + * mappings. It can cause L1 translation aborts on other + * cores in SMP case. So, special treatment is implemented + * in pmap_fault(). Interrupts are disabled here to make the + * change as quick as possible and without interruption. + */ + cspr = disable_interrupts(PSR_I | PSR_F); + pte1_clear(pte1p); + pmap_tlb_flush_pte1(pmap, va, npte1); + pte1_store(pte1p, npte1); + restore_interrupts(cspr); + } +} +#else +static void +pmap_change_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va, + pt1_entry_t npte1) +{ + + if (pmap == kernel_pmap) { + mtx_lock_spin(&allpmaps_lock); + pmap_update_pte1_kernel(va, npte1); + mtx_unlock_spin(&allpmaps_lock); + } else { + register_t cspr; + + /* + * Use break-before-make approach for changing userland + * mappings. It's absolutely safe in UP case when interrupts + * are disabled. + */ + cspr = disable_interrupts(PSR_I | PSR_F); + pte1_clear(pte1p); + pmap_tlb_flush_pte1(pmap, va, npte1); + pte1_store(pte1p, npte1); + restore_interrupts(cspr); + } +} +#endif + +/* * Tries to promote the NPTE2_IN_PT2, contiguous 4KB page mappings that are * within a single page table page (PT2) to a single 1MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PTE1s are replicated in each pmap but * pmap_remove_write(), pmap_clear_modify(), and pmap_clear_reference() only * read the PTE1 from the kernel pmap. */ static void pmap_promote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { pt1_entry_t npte1; pt2_entry_t *fpte2p, fpte2, fpte2_fav; pt2_entry_t *pte2p, pte2; vm_offset_t pteva __unused; vm_page_t m __unused; PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, pmap, va, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE2 in the specified PT2. Abort if this PTE2 is * either invalid, unused, or does not map the first 4KB physical page * within a 1MB page.
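/*
 * [Editorial sketch] The promotion scan that follows boils down to a
 * per-entry predicate: every following PTE2 must be valid, accessed, map
 * the next expected physical frame, and carry the same PTE2_PROMOTE
 * attributes as the first entry. A hypothetical distillation:
 *
 *	static __inline boolean_t
 *	pte2_promotable(pt2_entry_t pte2, pt2_entry_t fpte2,
 *	    pt2_entry_t fpte2_fav)
 *	{
 *
 *		if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav)
 *			return (FALSE);		// wrong frame, or not A/V
 *		if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE))
 *			return (FALSE);		// attribute mismatch
 *		return (TRUE);
 *	}
 */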
*/ fpte2p = pmap_pte2_quick(pmap, pte1_trunc(va)); -setpte1: fpte2 = pte2_load(fpte2p); if ((fpte2 & ((PTE2_FRAME & PTE1_OFFSET) | PTE2_A | PTE2_V)) != (PTE2_A | PTE2_V)) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(1) for va %#x in pmap %p", __func__, va, pmap); return; } if (pte2_is_managed(fpte2) && pmap == kernel_pmap) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(2) for va %#x in pmap %p", __func__, va, pmap); return; } if ((fpte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { /* * When page is not modified, PTE2_RO can be set without * a TLB invalidation. - * - * Note: When modified bit is being set, then in hardware case, - * the TLB entry is re-read (updated) from PT2, and in - * software case (abort), the PTE2 is read from PT2 and - * TLB flushed if changed. The following cmpset() solves - * any race with setting this bit in both cases. */ - if (!pte2_cmpset(fpte2p, fpte2, fpte2 | PTE2_RO)) - goto setpte1; fpte2 |= PTE2_RO; + pte2_store(fpte2p, fpte2); } /* * Examine each of the other PTE2s in the specified PT2. Abort if this * PTE2 maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE2. */ fpte2_fav = (fpte2 & (PTE2_FRAME | PTE2_A | PTE2_V)); fpte2_fav += PTE1_SIZE - PTE2_SIZE; /* examine from the end */ for (pte2p = fpte2p + NPTE2_IN_PT2 - 1; pte2p > fpte2p; pte2p--) { -setpte2: pte2 = pte2_load(pte2p); if ((pte2 & (PTE2_FRAME | PTE2_A | PTE2_V)) != fpte2_fav) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(3) for va %#x in pmap %p", __func__, va, pmap); return; } if ((pte2 & (PTE2_NM | PTE2_RO)) == PTE2_NM) { /* * When page is not modified, PTE2_RO can be set * without a TLB invalidation. See note above. */ - if (!pte2_cmpset(pte2p, pte2, pte2 | PTE2_RO)) - goto setpte2; pte2 |= PTE2_RO; + pte2_store(pte2p, pte2); pteva = pte1_trunc(va) | (pte2 & PTE1_OFFSET & PTE2_FRAME); CTR3(KTR_PMAP, "%s: protect for va %#x in pmap %p", __func__, pteva, pmap); } if ((pte2 & PTE2_PROMOTE) != (fpte2 & PTE2_PROMOTE)) { pmap_pte1_p_failures++; CTR3(KTR_PMAP, "%s: failure(4) for va %#x in pmap %p", __func__, va, pmap); return; } fpte2_fav -= PTE2_SIZE; } /* * The page table page in its current state will stay in PT2TAB * until the PTE1 mapping the section is demoted by pmap_demote_pte1() * or destroyed by pmap_remove_pte1(). * * Note that L2 page table size is not equal to PAGE_SIZE. */ m = PHYS_TO_VM_PAGE(trunc_page(pte1_link_pa(pte1_load(pte1p)))); KASSERT(m >= vm_page_array && m < &vm_page_array[vm_page_array_size], ("%s: PT2 page is out of range", __func__)); KASSERT(m->pindex == (pte1_index(va) & ~PT2PG_MASK), ("%s: PT2 page's pindex is wrong", __func__)); /* - * Get pte1 from pte2 format. - */ + * Get pte1 from pte2 format. + */ npte1 = (fpte2 & PTE1_FRAME) | ATTR_TO_L1(fpte2) | PTE1_V; /* * Promote the pv entries. */ if (pte2_is_managed(fpte2)) pmap_pv_promote_pte1(pmap, va, pte1_pa(npte1)); /* - * Map the section. + * Promote the mappings. */ - if (pmap == kernel_pmap) - pmap_kenter_pte1(va, npte1); - else - pte1_store(pte1p, npte1); - /* - * Flush old small mappings. We call single pmap_tlb_flush() in - * pmap_demote_pte1() and pmap_remove_pte1(), so we must be sure that - * no small mappings survive. We assume that given pmap is current and - * don't play game with PTE2_NG. 
- */ - pmap_tlb_flush_range(pmap, pte1_trunc(va), PTE1_SIZE); + pmap_change_pte1(pmap, pte1p, va, npte1); pmap_pte1_promotions++; CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", __func__, va, pmap); PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); } /* * Zero L2 page table page. */ static __inline void pmap_clear_pt2(pt2_entry_t *fpte2p) { pt2_entry_t *pte2p; for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) pte2_clear(pte2p); } /* * Removes a 1MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { vm_page_t m; uint32_t pte1_idx; pt2_entry_t *fpte2p; vm_paddr_t pt2_pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); m = pmap_pt2_page(pmap, va); if (m == NULL) /* * QQQ: Is this function called only on promoted pte1? * We certainly do section mappings directly * (without promotion) in kernel !!! */ panic("%s: missing pt2 page", __func__); pte1_idx = pte1_index(va); /* * Initialize the L2 page table. */ fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); pmap_clear_pt2(fpte2p); /* * Remove the mapping. */ pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(m), pte1_idx); pmap_kenter_pte1(va, PTE1_LINK(pt2_pa)); /* * QQQ: We do not need to invalidate PT2MAP mapping * as we did not change it. I.e. the L2 page table page * was and still is mapped the same way. */ } /* * Do the things to unmap a section in a process. */ static void pmap_remove_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, struct spglist *free) { pt1_entry_t opte1; struct md_page *pvh; vm_offset_t eva, va; vm_page_t m; PDEBUG(6, printf("%s(%p): va %#x pte1 %#x at %p\n", __func__, pmap, sva, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PTE1_OFFSET) == 0, ("%s: sva is not 1mpage aligned", __func__)); /* * Clear and invalidate the mapping. It should occupy one and only one * TLB entry. So, pmap_tlb_flush() called with an aligned address should * be sufficient. */ opte1 = pte1_load_clear(pte1p); pmap_tlb_flush(pmap, sva); if (pte1_is_wired(opte1)) pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; if (pte1_is_managed(opte1)) { pvh = pa_to_pvh(pte1_pa(opte1)); pmap_pvh_free(pvh, pmap, sva); eva = sva + PTE1_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); va < eva; va += PAGE_SIZE, m++) { if (pte1_is_dirty(opte1)) vm_page_dirty(m); if (opte1 & PTE1_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { /* * L2 page table(s) can't be removed from kernel map as * kernel counts on it (stuff around pmap_growkernel()). */ pmap_remove_kernel_pte1(pmap, pte1p, sva); } else { /* * Get associated L2 page table page. * It's possible that the page was never allocated. */ m = pmap_pt2_page(pmap, sva); if (m != NULL) pmap_unwire_pt2_all(pmap, sva, m, free); } } /* * Fills L2 page table page with mappings to consecutive physical pages. */ static __inline void pmap_fill_pt2(pt2_entry_t *fpte2p, pt2_entry_t npte2) { pt2_entry_t *pte2p; for (pte2p = fpte2p; pte2p < fpte2p + NPTE2_IN_PT2; pte2p++) { pte2_store(pte2p, npte2); npte2 += PTE2_SIZE; } } /* * Tries to demote a 1MB page mapping. If demotion fails, the * 1MB page mapping is invalidated.
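/*
 * [Editorial note] pmap_fill_pt2() above stores NPTE2_IN_PT2 = 256 entries,
 * advancing the mapped physical address by PTE2_SIZE (4KB) each step, so a
 * single call recreates the full range one section covered:
 * 256 * 4KB = 1MB = PTE1_SIZE. pmap_demote_pte1() below relies on this to
 * replace a section mapping with an equivalent L2 page table.
 */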
*/ static boolean_t pmap_demote_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t va) { pt1_entry_t opte1, npte1; pt2_entry_t *fpte2p, npte2; vm_paddr_t pt2pg_pa, pt2_pa; vm_page_t m; struct spglist free; uint32_t pte1_idx, isnew = 0; PDEBUG(6, printf("%s(%p): try for va %#x pte1 %#x at %p\n", __func__, pmap, va, pte1_load(pte1p), pte1p)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); opte1 = pte1_load(pte1p); KASSERT(pte1_is_section(opte1), ("%s: opte1 not a section", __func__)); if ((opte1 & PTE1_A) == 0 || (m = pmap_pt2_page(pmap, va)) == NULL) { KASSERT(!pte1_is_wired(opte1), ("%s: PT2 page for a wired mapping is missing", __func__)); /* * Invalidate the 1MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((opte1 & PTE1_A) == 0 || (m = vm_page_alloc(NULL, pte1_index(va) & ~PT2PG_MASK, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); pmap_remove_pte1(pmap, pte1p, pte1_trunc(va), &free); pmap_free_zero_pages(&free); CTR3(KTR_PMAP, "%s: failure for va %#x in pmap %p", __func__, va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap->pm_stats.resident_count++; isnew = 1; /* * We init all L2 page tables in the page even if * we are going to change everything for one L2 page * table in a while. */ pt2pg_pa = pmap_pt2pg_init(pmap, va, m); } else { if (va < VM_MAXUSER_ADDRESS) { if (pt2_is_empty(m, va)) isnew = 1; /* Demoting section w/o promotion. */ #ifdef INVARIANTS else KASSERT(pt2_is_full(m, va), ("%s: bad PT2 wire" " count %u", __func__, pt2_wirecount_get(m, pte1_index(va)))); #endif } } pt2pg_pa = VM_PAGE_TO_PHYS(m); pte1_idx = pte1_index(va); /* * If the pmap is current, then the PT2MAP can provide access to * the page table page (promoted L2 page tables are not unmapped). * Otherwise, temporarily map the L2 page table page (m) into * the kernel's address space at either PADDR1 or PADDR2. * * Note that L2 page table size is not equal to PAGE_SIZE. */ if (pmap_is_current(pmap)) fpte2p = page_pt2(pt2map_pt2pg(va), pte1_idx); else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if (pte2_pa(pte2_load(PMAP1)) != pt2pg_pa) { pte2_store(PMAP1, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; fpte2p = page_pt2((vm_offset_t)PADDR1, pte1_idx); } else { mtx_lock(&PMAP2mutex); if (pte2_pa(pte2_load(PMAP2)) != pt2pg_pa) { pte2_store(PMAP2, PTE2_KPT(pt2pg_pa)); tlb_flush((vm_offset_t)PADDR2); } fpte2p = page_pt2((vm_offset_t)PADDR2, pte1_idx); } pt2_pa = page_pt2pa(pt2pg_pa, pte1_idx); npte1 = PTE1_LINK(pt2_pa); KASSERT((opte1 & PTE1_A) != 0, ("%s: opte1 is missing PTE1_A", __func__)); KASSERT((opte1 & (PTE1_NM | PTE1_RO)) != PTE1_NM, ("%s: opte1 has PTE1_NM", __func__)); /* * Get pte2 from pte1 format. */ npte2 = pte1_pa(opte1) | ATTR_TO_L2(opte1) | PTE2_V; /* * If the L2 page table page is new, initialize it. If the mapping * has changed attributes, update the page table entries. 
*/ if (isnew != 0) { pt2_wirecount_set(m, pte1_idx, NPTE2_IN_PT2); pmap_fill_pt2(fpte2p, npte2); } else if ((pte2_load(fpte2p) & PTE2_PROMOTE) != (npte2 & PTE2_PROMOTE)) pmap_fill_pt2(fpte2p, npte2); KASSERT(pte2_pa(pte2_load(fpte2p)) == pte2_pa(npte2), ("%s: fpte2p and npte2 map different physical addresses", __func__)); if (fpte2p == PADDR2) mtx_unlock(&PMAP2mutex); /* * Demote the mapping. This pmap is locked. The old PTE1 has * PTE1_A set. If the old PTE1 has not PTE1_RO set, it also * has not PTE1_NM set. Thus, there is no danger of a race with * another processor changing the setting of PTE1_A and/or PTE1_NM * between the read above and the store below. */ - if (pmap == kernel_pmap) - pmap_kenter_pte1(va, npte1); - else - pte1_store(pte1p, npte1); + pmap_change_pte1(pmap, pte1p, va, npte1); /* - * Flush old big mapping. The mapping should occupy one and only - * TLB entry. So, pmap_tlb_flush() called with aligned address - * should be sufficient. - */ - pmap_tlb_flush(pmap, pte1_trunc(va)); - - /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_pv_reclaim(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PTE1 must have already changed from mapping * the 1mpage to referencing the page table page. */ if (pte1_is_managed(opte1)) pmap_pv_demote_pte1(pmap, va, pte1_pa(opte1)); pmap_pte1_demotions++; CTR3(KTR_PMAP, "%s: success for va %#x in pmap %p", __func__, va, pmap); PDEBUG(6, printf("%s(%p): success for va %#x pte1 %#x(%#x) at %p\n", __func__, pmap, va, npte1, pte1_load(pte1p), pte1p)); return (TRUE); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pt1_entry_t *pte1p; pt2_entry_t *pte2p; pt2_entry_t npte2, opte2; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte2, om; boolean_t wired; va = trunc_page(va); mpte2 = NULL; wired = (flags & PMAP_ENTER_WIRED) != 0; KASSERT(va <= vm_max_kernel_address, ("%s: toobig", __func__)); KASSERT(va < UPT2V_MIN_ADDRESS || va >= UPT2V_MAX_ADDRESS, ("%s: invalid to pmap_enter page table pages (va: 0x%x)", __func__, va)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); /* * In the case that a page table page is not * resident, we are creating it here. 
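/*
 * [Editorial sketch] The "validate:" part of pmap_enter() below assembles
 * the new PTE2 from the requested protection. A hypothetical distillation
 * of that bit logic (simplified: the real code keys PTE2_U off the VA being
 * below VM_MAXUSER_ADDRESS and PTE2_NG off the pmap not being kernel_pmap):
 *
 *	static __inline pt2_entry_t
 *	pte2_prot_bits(vm_prot_t prot, boolean_t wired, boolean_t user)
 *	{
 *		pt2_entry_t bits = 0;
 *
 *		if ((prot & VM_PROT_WRITE) == 0)
 *			bits |= PTE2_RO;	// read-only
 *		if ((prot & VM_PROT_EXECUTE) == 0)
 *			bits |= PTE2_NX;	// no execute
 *		if (wired)
 *			bits |= PTE2_W;		// wired mapping
 *		if (user)
 *			bits |= PTE2_U | PTE2_NG; // user + not global
 *		return (bits);
 *	}
 */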
*/ if (va < VM_MAXUSER_ADDRESS) { mpte2 = pmap_allocpte2(pmap, va, flags); if (mpte2 == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte2 failed with sleep allowed")); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } pte1p = pmap_pte1(pmap, va); if (pte1_is_section(pte1_load(pte1p))) panic("%s: attempted on 1MB page", __func__); pte2p = pmap_pte2_quick(pmap, va); if (pte2p == NULL) panic("%s: invalid L1 page table entry va=%#x", __func__, va); om = NULL; pa = VM_PAGE_TO_PHYS(m); opte2 = pte2_load(pte2p); opa = pte2_pa(opte2); /* * Mapping has not changed, must be protection or wiring change. */ if (pte2_is_valid(opte2) && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT2 pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT2 page will be also. */ if (wired && !pte2_is_wired(opte2)) pmap->pm_stats.wired_count++; else if (!wired && pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; /* * Remove extra pte2 reference */ if (mpte2) pt2_wirecount_dec(mpte2, pte1_index(va)); if (pte2_is_managed(opte2)) om = m; goto validate; } /* * QQQ: We think that changing physical address on writeable mapping * is not safe. Well, maybe on kernel address space with correct * locking, it can make a sense. However, we have no idea why * anyone should do that on user address space. Are we wrong? */ KASSERT((opa == 0) || (opa == pa) || !pte2_is_valid(opte2) || ((opte2 & PTE2_RO) != 0), ("%s: pmap %p va %#x(%#x) opa %#x pa %#x - gotcha %#x %#x!", __func__, pmap, va, opte2, opa, pa, flags, prot)); pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; if (pte2_is_managed(opte2)) { om = PHYS_TO_VM_PAGE(opa); pv = pmap_pvh_remove(&om->md, pmap, va); } /* * Remove extra pte2 reference */ if (mpte2 != NULL) pt2_wirecount_dec(mpte2, va >> PTE1_SHIFT); } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("%s: managed mapping within the clean submap", __func__)); if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ npte2 = PTE2(pa, PTE2_NM, vm_page_pte2_attr(m)); if (prot & VM_PROT_WRITE) { if (pte2_is_managed(npte2)) vm_page_aflag_set(m, PGA_WRITEABLE); } else npte2 |= PTE2_RO; if ((prot & VM_PROT_EXECUTE) == 0) npte2 |= PTE2_NX; if (wired) npte2 |= PTE2_W; if (va < VM_MAXUSER_ADDRESS) npte2 |= PTE2_U; if (pmap != kernel_pmap) npte2 |= PTE2_NG; /* * If the mapping or permission bits are different, we need * to update the pte2. * * QQQ: Think again and again what to do * if the mapping is going to be changed! */ if ((opte2 & ~(PTE2_NM | PTE2_A)) != (npte2 & ~(PTE2_NM | PTE2_A))) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. Do it now, before the mapping is stored and made * valid for hardware table walk. If done later, there is a race * for other threads of current process in lazy loading case. * Don't do it for kernel memory which is mapped with exec * permission even if the memory isn't going to hold executable * code. 
The only time icache sync is needed is after a * kernel module is loaded and the relocation info is processed; * that is done in elf_cpu_load_file(). * * QQQ: (1) Is there any better way where * or how to sync the icache? * (2) Now, we do it on a page basis. */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pat_mode == VM_MEMATTR_WB_WA && (opa != pa || (opte2 & PTE2_NX))) cache_icache_sync_fresh(va, pa, PAGE_SIZE); npte2 |= PTE2_A; if (flags & VM_PROT_WRITE) npte2 &= ~PTE2_NM; if (opte2 & PTE2_V) { /* Change mapping with break-before-make approach. */ opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, va); pte2_store(pte2p, npte2); if (opte2 & PTE2_A) { if (pte2_is_managed(opte2)) vm_page_aflag_set(om, PGA_REFERENCED); } if (pte2_is_dirty(opte2)) { if (pte2_is_managed(opte2)) vm_page_dirty(om); } if (pte2_is_managed(opte2) && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } else pte2_store(pte2p, npte2); } #if 0 else { /* * QQQ: At a time when both the access and the not-modified bits * are emulated by software, this should not happen. Some * analysis is needed if this really happens. A missing * TLB flush somewhere could be the reason. */ panic("%s: pmap %p va %#x opte2 %x npte2 %x !!", __func__, pmap, va, opte2, npte2); } #endif /* * If both the L2 page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte2 == NULL || pt2_is_full(mpte2, va)) && sp_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pte1(pmap, pte1p, va); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Do the things to unmap a page in a process. */ static int pmap_remove_pte2(pmap_t pmap, pt2_entry_t *pte2p, vm_offset_t va, struct spglist *free) { pt2_entry_t opte2; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Clear and invalidate the mapping. */ opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, va); KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %#x not link pte2 %#x", __func__, pmap, va, opte2)); if (opte2 & PTE2_W) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (pte2_is_managed(opte2)) { m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); if (pte2_is_dirty(opte2)) vm_page_dirty(m); if (opte2 & PTE2_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt2(pmap, va, free)); } /* * Remove a single page from a process address space. */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt2_entry_t *pte2p; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("%s: curthread not pinned", __func__)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte2p = pmap_pte2_quick(pmap, va)) == NULL || !pte2_is_valid(pte2_load(pte2p))) return; pmap_remove_pte2(pmap, pte2p, va, free); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; struct spglist free; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * Special handling of removing one page.
A very common * operation and easy to short circuit some code. */ if (sva + PAGE_SIZE == eva) { pte1 = pte1_load(pmap_pte1(pmap, sva)); if (pte1_is_link(pte1)) { pmap_remove_page(pmap, sva, &free); goto out; } } for (; sva < eva; sva = nextva) { /* * Calculate address for next L2 page table. */ nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; if (pmap->pm_stats.resident_count == 0) break; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that the L1 page * table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pmap_remove_pte1(pmap, pte1p, sva, &free); continue; } else if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* The large page mapping was destroyed. */ continue; } #ifdef INVARIANTS else { /* Update pte1 after demotion. */ pte1 = pte1_load(pte1p); } #endif } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being removed. */ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) continue; if (pmap_remove_pte2(pmap, pte2p, sva, &free)) break; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt2_entry_t *pte2p, opte2; pt1_entry_t *pte1p; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); (void)pmap_demote_pte1(pmap, pte1p, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found " "a 1mpage in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); opte2 = pte2_load_clear(pte2p); pmap_tlb_flush(pmap, pv->pv_va); KASSERT(pte2_is_valid(opte2), ("%s: pmap %p va %x zero pte2", __func__, pmap, pv->pv_va)); if (pte2_is_wired(opte2)) pmap->pm_stats.wired_count--; if (opte2 & PTE2_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. 
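* Both bits are emulated in software herein: PTE2_A stands for * "referenced", and pte2_is_dirty() infers "modified" from the access * permissions (roughly, a writeable mapping whose PTE2_NM bit was * cleared by a write abort). This is a reading of the helpers used * below, not a hardware guarantee.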
*/ if (pte2_is_dirty(opte2)) vm_page_dirty(m); pmap_unuse_pt2(pmap, pv->pv_va, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); pmap_free_zero_pages(&free); } /* * Just a subroutine for pmap_remove_pages() to reasonably satisfy * good coding style, a.k.a. 80 character line width limit hell. */ static __inline void pmap_remove_pte1_quick(pmap_t pmap, pt1_entry_t pte1, pv_entry_t pv, struct spglist *free) { vm_paddr_t pa; vm_page_t m, mt, mpt2pg; struct md_page *pvh; pa = pte1_pa(pte1); m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", __func__, m, m->phys_addr, pa)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("%s: bad pte1 %#x", __func__, pte1)); if (pte1_is_dirty(pte1)) { for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } pmap->pm_stats.resident_count -= PTE1_SIZE / PAGE_SIZE; pvh = pa_to_pvh(pa); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[PTE1_SIZE / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpt2pg = pmap_pt2_page(pmap, pv->pv_va); if (mpt2pg != NULL) pmap_unwire_pt2_all(pmap, pv->pv_va, mpt2pg, free); } /* * Just a subroutine for pmap_remove_pages() to reasonably satisfy * good coding style, a.k.a. 80 character line width limit hell. */ static __inline void pmap_remove_pte2_quick(pmap_t pmap, pt2_entry_t pte2, pv_entry_t pv, struct spglist *free) { vm_paddr_t pa; vm_page_t m; struct md_page *pvh; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("%s: vm_page_t %p addr mismatch %#x %#x", __func__, m, m->phys_addr, pa)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("%s: bad pte2 %#x", __func__, pte2)); if (pte2_is_dirty(pte2)) vm_page_dirty(m); pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(pa); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt2(pmap, pv->pv_va, free); } /* * Remove all pages from the specified address space. This aids process * exit speeds. Also, this code is special-cased for the current process * only, but can have the more generic (and slightly slower) mode enabled. * This is much faster than pmap_remove in the case of running down * an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; pv_entry_t pv; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; boolean_t allfree; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing.
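* In other words, we rely on the caller (process exit, vmspace * teardown) to guarantee single-threaded access; the KASSERT and the * INVARIANTS block below only verify that assumption, they do not * enforce it.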
*/ KASSERT(pmap == vmspace_pmap(curthread->td_proc->p_vmspace), ("%s: non-current pmap %p", __func__, pmap)); #if defined(SMP) && defined(INVARIANTS) { cpuset_t other_cpus; sched_pin(); other_cpus = pmap->pm_active; CPU_CLR(PCPU_GET(cpuid), &other_cpus); sched_unpin(); KASSERT(CPU_EMPTY(&other_cpus), ("%s: pmap %p active on other cpus", __func__, pmap)); } #endif SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("%s: wrong pmap %p %p", __func__, pmap, pc->pc_pmap)); allfree = TRUE; for (field = 0; field < _NPCM; field++) { inuse = (~(pc->pc_map[field])) & pc_freemask[field]; while (inuse != 0) { bit = ffs(inuse) - 1; bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; /* * Note that we cannot remove wired pages * from a process' mapping at this time */ pte1p = pmap_pte1(pmap, pv->pv_va); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { if (pte1_is_wired(pte1)) { allfree = FALSE; continue; } pte1_clear(pte1p); pmap_remove_pte1_quick(pmap, pte1, pv, &free); } else if (pte1_is_link(pte1)) { pte2p = pt2map_entry(pv->pv_va); pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) { printf("%s: pmap %p va %#x " "pte2 %#x\n", __func__, pmap, pv->pv_va, pte2); panic("bad pte2"); } if (pte2_is_wired(pte2)) { allfree = FALSE; continue; } pte2_clear(pte2p); pmap_remove_pte2_quick(pmap, pte2, pv, &free); } else { printf("%s: pmap %p va %#x pte1 %#x\n", __func__, pmap, pv->pv_va, pte1); panic("bad pte1"); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } tlb_flush_all_ng_local(); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * This code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No L2 page table pages. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpt2pg) { pt2_entry_t *pte2p, pte2; vm_paddr_t pa; struct spglist free; uint32_t l2prot; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("%s: managed mapping within the clean submap", __func__)); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a L2 page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { u_int pte1_idx; pt1_entry_t pte1, *pte1p; vm_paddr_t pt2_pa; /* * Get L1 page table things. */ pte1_idx = pte1_index(va); pte1p = pmap_pte1(pmap, va); pte1 = pte1_load(pte1p); if (mpt2pg && (mpt2pg->pindex == (pte1_idx & ~PT2PG_MASK))) { /* * Each of NPT2_IN_PG L2 page tables on the page can * come here. Make sure that associated L1 page table * link is established. * * QQQ: It comes that we don't establish all links to * L2 page tables for newly allocated L2 page * tables page. */ KASSERT(!pte1_is_section(pte1), ("%s: pte1 %#x is section", __func__, pte1)); if (!pte1_is_link(pte1)) { pt2_pa = page_pt2pa(VM_PAGE_TO_PHYS(mpt2pg), pte1_idx); pte1_store(pte1p, PTE1_LINK(pt2_pa)); } pt2_wirecount_inc(mpt2pg, pte1_idx); } else { /* * If the L2 page table page is mapped, we just * increment the hold count, and activate it. 
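* (The hold count here is the pt2 wire count: every valid pte2 in an * L2 page table contributes one reference, so the page backing the * table is freed only once the table becomes empty again.)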
*/ if (pte1_is_section(pte1)) { return (NULL); } else if (pte1_is_link(pte1)) { mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); pt2_wirecount_inc(mpt2pg, pte1_idx); } else { mpt2pg = _pmap_allocpte2(pmap, va, PMAP_ENTER_NOSLEEP); if (mpt2pg == NULL) return (NULL); } } } else { mpt2pg = NULL; } /* * This call to pt2map_entry() makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte2_quick(). * But that isn't as quick as pt2map_entry(). */ pte2p = pt2map_entry(va); pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2)) { if (mpt2pg != NULL) { /* * Remove extra pte2 reference */ pt2_wirecount_dec(mpt2pg, pte1_index(va)); mpt2pg = NULL; } return (NULL); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpt2pg != NULL) { SLIST_INIT(&free); if (pmap_unwire_pt2(pmap, va, mpt2pg, &free)) { pmap_tlb_flush(pmap, va); pmap_free_zero_pages(&free); } mpt2pg = NULL; } return (NULL); } /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with RO protection */ pa = VM_PAGE_TO_PHYS(m); l2prot = PTE2_RO | PTE2_NM; if (va < VM_MAXUSER_ADDRESS) l2prot |= PTE2_U | PTE2_NG; if ((prot & VM_PROT_EXECUTE) == 0) l2prot |= PTE2_NX; else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. QQQ: For more info, see comments in pmap_enter(). */ cache_icache_sync_fresh(va, pa, PAGE_SIZE); } pte2_store(pte2p, PTE2(pa, l2prot, vm_page_pte2_attr(m))); return (mpt2pg); } void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Tries to create 1MB page mapping. Returns TRUE if successful and * FALSE otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pte1(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pt1_entry_t *pte1p; vm_paddr_t pa; uint32_t l1prot; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pte1p = pmap_pte1(pmap, va); if (pte1_is_valid(pte1_load(pte1p))) { CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, va, pmap); return (FALSE); } if ((m->oflags & VPO_UNMANAGED) == 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pte1(pmap, va, VM_PAGE_TO_PHYS(m))) { CTR3(KTR_PMAP, "%s: failure for va %#lx in pmap %p", __func__, va, pmap); return (FALSE); } } /* * Increment counters. */ pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; /* * Map the section. * * QQQ: Why VM_PROT_WRITE is not evaluated and the mapping is * made readonly? */ pa = VM_PAGE_TO_PHYS(m); l1prot = PTE1_RO | PTE1_NM; if (va < VM_MAXUSER_ADDRESS) l1prot |= PTE1_U | PTE1_NG; if ((prot & VM_PROT_EXECUTE) == 0) l1prot |= PTE1_NX; else if (m->md.pat_mode == VM_MEMATTR_WB_WA && pmap != kernel_pmap) { /* * Sync icache if exec permission and attribute VM_MEMATTR_WB_WA * is set. QQQ: For more info, see comments in pmap_enter(). 
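* For a section, the whole PTE1_SIZE range is synced in one go, * which is presumably cheaper than 256 later per-page syncs.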
*/ cache_icache_sync_fresh(va, pa, PTE1_SIZE); } pte1_store(pte1p, PTE1(pa, l1prot, ATTR_TO_L1(vm_page_pte2_attr(m)))); pmap_pte1_mappings++; CTR3(KTR_PMAP, "%s: success for va %#lx in pmap %p", __func__, va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpt2pg; vm_pindex_t diff, psize; PDEBUG(6, printf("%s: pmap %p start %#x end %#x m %p prot %#x\n", __func__, pmap, start, end, m_start, prot)); VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpt2pg = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PTE1_OFFSET) == 0 && va + PTE1_SIZE <= end && m->psind == 1 && sp_enabled && pmap_enter_pte1(pmap, va, m, prot)) m = &m[PTE1_SIZE / PAGE_SIZE - 1]; else mpt2pg = pmap_enter_quick_locked(pmap, va, m, prot, mpt2pg); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pt1_entry_t *pte1p; vm_paddr_t pa, pte2_pa; vm_page_t p; vm_memattr_t pat_mode; u_int l1attr, l1prot; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("%s: non-device object", __func__)); if ((addr & PTE1_OFFSET) == 0 && (size & PTE1_OFFSET) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("%s: invalid page %p", __func__, p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 1MB page boundary. */ pte2_pa = VM_PAGE_TO_PHYS(p); if (pte2_pa & PTE1_OFFSET) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = pte2_pa + PAGE_SIZE; pa < pte2_pa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("%s: invalid page %p", __func__, p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 1MB pages. * * QQQ: Well, we are mapping a section, so same condition must * be hold like during promotion. It looks that only RW mapping * is done here, so readonly mapping must be done elsewhere. 
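* Note also that PTE1_A is preset below, so these mappings should * never take the FAULT_ACCESS_L1 emulation path in pmap_fault(); * presumably that is deliberate for such large device mappings.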
*/ l1prot = PTE1_U | PTE1_NG | PTE1_RW | PTE1_M | PTE1_A; l1attr = ATTR_TO_L1(vm_memattr_to_pte2(pat_mode)); PMAP_LOCK(pmap); for (pa = pte2_pa; pa < pte2_pa + size; pa += PTE1_SIZE) { pte1p = pmap_pte1(pmap, addr); if (!pte1_is_valid(pte1_load(pte1p))) { pte1_store(pte1p, PTE1(pa, l1prot, l1attr)); pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; pmap_pte1_mappings++; } /* Else continue on if the PTE1 is already valid. */ addr += PTE1_SIZE; } PMAP_UNLOCK(pmap); } } /* * Do the things to protect a 1mpage in a process. */ static void pmap_protect_pte1(pmap_t pmap, pt1_entry_t *pte1p, vm_offset_t sva, vm_prot_t prot) { pt1_entry_t npte1, opte1; vm_offset_t eva, va; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PTE1_OFFSET) == 0, ("%s: sva is not 1mpage aligned", __func__)); -retry: + opte1 = npte1 = pte1_load(pte1p); if (pte1_is_managed(opte1)) { eva = sva + PTE1_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(pte1_pa(opte1)); va < eva; va += PAGE_SIZE, m++) if (pte1_is_dirty(opte1)) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) npte1 |= PTE1_RO | PTE1_NM; if ((prot & VM_PROT_EXECUTE) == 0) npte1 |= PTE1_NX; /* * QQQ: Herein, execute permission is never set. * It only can be cleared. So, no icache * syncing is needed. */ if (npte1 != opte1) { - if (!pte1_cmpset(pte1p, opte1, npte1)) - goto retry; + pte1_store(pte1p, npte1); pmap_tlb_flush(pmap, sva); } } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { boolean_t pv_lists_locked; vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, opte2, npte2; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = nextva) { /* * Calculate address for next L2 page table. */ nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that L1 page * page table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pmap_protect_pte1(pmap, pte1p, sva, prot); continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* * The large page mapping * was destroyed. */ continue; } #ifdef INVARIANTS else { /* Update pte1 after demotion */ pte1 = pte1_load(pte1p); } #endif } } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being protected. 
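* For example, with 1MB sections, sva == 0x001ff000 yields * nextva == 0x00200000; the "nextva < sva" test above catches the * wrap-around when sva lies in the last section of the address space.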
*/ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { vm_page_t m; -retry: + opte2 = npte2 = pte2_load(pte2p); if (!pte2_is_valid(opte2)) continue; if ((prot & VM_PROT_WRITE) == 0) { if (pte2_is_managed(opte2) && pte2_is_dirty(opte2)) { m = PHYS_TO_VM_PAGE(pte2_pa(opte2)); vm_page_dirty(m); } npte2 |= PTE2_RO | PTE2_NM; } if ((prot & VM_PROT_EXECUTE) == 0) npte2 |= PTE2_NX; /* * QQQ: Herein, execute permission is never set. * It only can be cleared. So, no icache * syncing is needed. */ if (npte2 != opte2) { - - if (!pte2_cmpset(pte2p, opte2, npte2)) - goto retry; + pte2_store(pte2p, npte2); pmap_tlb_flush(pmap, sva); } } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt1_entry_t pte1; pt2_entry_t pte2; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { if (pte1_is_wired(pte1)) count++; } else { KASSERT(pte1_is_link(pte1), ("%s: pte1 %#x is not link", __func__, pte1)); pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); if (pte2_is_wired(pte2)) count++; } PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 1mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt1_entry_t pte1; pt2_entry_t pte2; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { rv = pte1_is_dirty(pte1); } else { KASSERT(pte1_is_link(pte1), ("%s: pte1 %#x is not link", __func__, pte1)); pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); rv = pte2_is_dirty(pte2); } PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTE2s can have PG_M set. 
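* (PG_M is a leftover from the x86 wording; on this pmap it means the * software-emulated modify state, i.e. a writeable pte2 whose PTE2_NM * bit has been cleared.)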
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt1_entry_t pte1; pt2_entry_t pte2; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, addr)); if (pte1_is_link(pte1)) { pte2 = pte2_load(pt2map_entry(addr)); rv = !pte2_is_valid(pte2) ; } PMAP_UNLOCK(pmap); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 1mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt1_entry_t pte1; pt2_entry_t pte2; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1 = pte1_load(pmap_pte1(pmap, pv->pv_va)); if (pte1_is_section(pte1)) { rv = (pte1 & (PTE1_A | PTE1_V)) == (PTE1_A | PTE1_V); } else { pte2 = pte2_load(pmap_pte2_quick(pmap, pv->pv_va)); rv = (pte2 & (PTE2_A | PTE2_V)) == (PTE2_A | PTE2_V); } PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } #define PMAP_TS_REFERENCED_MAX 5 /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); opte1 = pte1_load(pte1p); if ((opte1 & PTE1_A) != 0) { /* * Since this reference bit is shared by 256 4KB pages, * it should not be cleared every time it is tested. * Apply a simple "hash" function on the physical page * number, the virtual section number, and the pmap * address to select one 4KB page out of the 256 * on which testing the reference bit will result * in clearing that bit. This function is designed * to avoid the selection of the same 4KB page * for every 1MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. 
To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the section is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PTE1_SHIFT) ^ (uintptr_t)pmap) & (NPTE2_IN_PG - 1)) == 0 && !pte1_is_wired(opte1)) { pte1_clear_bit(pte1p, PTE1_A); pmap_tlb_flush(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(pte1_is_link(pte1_load(pte1p)), ("%s: not found a link in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); if ((pte2_load(pte2p) & PTE2_A) != 0) { pte2_clear_bit(pte2p, PTE2_A); pmap_tlb_flush(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t nextva; pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = nextva) { nextva = pte1_trunc(sva + PTE1_SIZE); if (nextva < sva) nextva = eva; pte1p = pmap_pte1(pmap, sva); pte1 = pte1_load(pte1p); /* * Weed out invalid mappings. Note: we assume that L1 page * page table is always allocated, and in kernel virtual. */ if (pte1 == 0) continue; if (pte1_is_section(pte1)) { if (!pte1_is_wired(pte1)) panic("%s: pte1 %#x not wired", __func__, pte1); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + PTE1_SIZE == nextva && eva >= nextva) { pte1_clear_bit(pte1p, PTE1_W); pmap->pm_stats.wired_count -= PTE1_SIZE / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) panic("%s: demotion failed", __func__); #ifdef INVARIANTS else { /* Update pte1 after demotion */ pte1 = pte1_load(pte1p); } #endif } } KASSERT(pte1_is_link(pte1), ("%s: pmap %p va %#x pte1 %#x at %p" " is not link", __func__, pmap, sva, pte1, pte1p)); /* * Limit our scan to either the end of the va represented * by the current L2 page table page, or to the end of the * range being protected. 
*/ if (nextva > eva) nextva = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != nextva; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2)) continue; if (!pte2_is_wired(pte2)) panic("%s: pte2 %#x is missing PTE2_W", __func__, pte2); /* * PTE2_W must be cleared atomically. Although the pmap * lock synchronizes access to PTE2_W, another processor * could be changing PTE2_NM and/or PTE2_A concurrently. */ pte2_clear_bit(pte2p, PTE2_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pt1_entry_t *pte1p; pt2_entry_t *pte2p, opte2; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); if (!(pte1_load(pte1p) & PTE1_RO)) (void)pmap_demote_pte1(pmap, pte1p, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" " a section in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); -retry: opte2 = pte2_load(pte2p); if (!(opte2 & PTE2_RO)) { - if (!pte2_cmpset(pte2p, opte2, - opte2 | (PTE2_RO | PTE2_NM))) - goto retry; + pte2_store(pte2p, opte2 | PTE2_RO | PTE2_NM); if (pte2_is_dirty(opte2)) vm_page_dirty(m); pmap_tlb_flush(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, pte2; vm_offset_t pdnxt; vm_page_t m; boolean_t pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = pte1_trunc(sva + PTE1_SIZE); if (pdnxt < sva) pdnxt = eva; pte1p = pmap_pte1(pmap, sva); opte1 = pte1_load(pte1p); if (!pte1_is_valid(opte1)) /* XXX */ continue; else if (pte1_is_section(opte1)) { if (!pte1_is_managed(opte1)) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pte1(pmap, pte1p, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. 
Since the underlying L2 page * table is fully populated, this removal never * frees a L2 page table page. */ if (!pte1_is_wired(opte1)) { pte2p = pmap_pte2_quick(pmap, sva); KASSERT(pte2_is_valid(pte2_load(pte2p)), ("%s: invalid PTE2", __func__)); pmap_remove_pte2(pmap, pte2p, sva, NULL); } } if (pdnxt > eva) pdnxt = eva; for (pte2p = pmap_pte2_quick(pmap, sva); sva != pdnxt; pte2p++, sva += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (!pte2_is_valid(pte2) || !pte2_is_managed(pte2)) continue; else if (pte2_is_dirty(pte2)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); vm_page_dirty(m); } pte2_set_bit(pte2p, PTE2_NM); pte2_clear_bit(pte2p, PTE2_A); } else if ((pte2 & PTE2_A) != 0) pte2_clear_bit(pte2p, PTE2_A); else continue; pmap_tlb_flush(pmap, sva); } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pt1_entry_t *pte1p, opte1; pt2_entry_t *pte2p, opte2; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("%s: page %p is exclusive busy", __func__, m)); /* * If the page is not PGA_WRITEABLE, then no PTE2s can have PTE2_NM * cleared. If the object containing the page is locked and the page * is not exclusive busied, then PGA_WRITEABLE cannot be concurrently * set. */ if ((m->flags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, va); opte1 = pte1_load(pte1p); if (!(opte1 & PTE1_RO)) { if (pmap_demote_pte1(pmap, pte1p, va) && !pte1_is_wired(opte1)) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - pte1_pa(opte1); pte2p = pmap_pte2_quick(pmap, va); opte2 = pte2_load(pte2p); if ((opte2 & PTE2_V)) { pte2_set_bit(pte2p, PTE2_NM | PTE2_RO); vm_page_dirty(m); pmap_tlb_flush(pmap, va); } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte1p = pmap_pte1(pmap, pv->pv_va); KASSERT(!pte1_is_section(pte1_load(pte1p)), ("%s: found" " a section in page %p's pv list", __func__, m)); pte2p = pmap_pte2_quick(pmap, pv->pv_va); if (pte2_is_dirty(pte2_load(pte2p))) { pte2_set_bit(pte2p, PTE2_NM); pmap_tlb_flush(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { struct sysmaps *sysmaps; vm_memattr_t oma; vm_paddr_t pa; oma = m->md.pat_mode; m->md.pat_mode = ma; CTR5(KTR_PMAP, "%s: page %p - 0x%08X oma: %d, ma: %d", __func__, m, VM_PAGE_TO_PHYS(m), oma, ma); if ((m->flags & PG_FICTITIOUS) != 0) return; #if 0 /* * If "m" is a normal page, flush it from the cache. * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. 
*/ if (sf_buf_invalidate_cache(m, oma)) return; #endif /* * If page is not mapped by sf buffer, map the page * transient and do invalidation. */ if (ma != oma) { pa = VM_PAGE_TO_PHYS(m); sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("%s: CMAP2 busy", __func__); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(pa, PTE2_AP_KRW, vm_memattr_to_pte2(ma))); dcache_wbinv_poc((vm_offset_t)sysmaps->CADDR2, pa, PAGE_SIZE); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); } } /* * Miscellaneous support routines follow */ /* * Returns TRUE if the given page is mapped individually or as part of * a 1mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("%s: page %p is not managed", __func__, m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { struct sysmaps *sysmaps; sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (pte2_load(sysmaps->CMAP2) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); pagezero(sysmaps->CADDR2); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { struct sysmaps *sysmaps; sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (pte2_load(sysmaps->CMAP2) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); if (off == 0 && size == PAGE_SIZE) pagezero(sysmaps->CADDR2); else bzero(sysmaps->CADDR2 + off, size); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. 
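* Like the other zero/copy helpers above, it installs a transient * kernel mapping (CMAP3 -> CADDR3), touches the page through it, then * clears the pte2 and flushes the TLB entry. CMAP3 is reserved for * this caller, which is presumably why only sched_pin() and no sysmaps * mutex is needed here.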
*/ void pmap_zero_page_idle(vm_page_t m) { if (pte2_load(CMAP3) != 0) panic("%s: CMAP3 busy", __func__); sched_pin(); pte2_store(CMAP3, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); pagezero(CADDR3); pte2_clear(CMAP3); tlb_flush((vm_offset_t)CADDR3); sched_unpin(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { struct sysmaps *sysmaps; sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (pte2_load(sysmaps->CMAP1) != 0) panic("%s: CMAP1 busy", __func__); if (pte2_load(sysmaps->CMAP2) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(sysmaps->CMAP1, PTE2_KERN_NG(VM_PAGE_TO_PHYS(src), PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(src))); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(dst), PTE2_AP_KRW, vm_page_pte2_attr(dst))); bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); pte2_clear(sysmaps->CMAP1); tlb_flush((vm_offset_t)sysmaps->CADDR1); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { struct sysmaps *sysmaps; vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; int cnt; sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*sysmaps->CMAP2 != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); pte2_store(sysmaps->CMAP1, PTE2_KERN_NG(VM_PAGE_TO_PHYS(a_pg), PTE2_AP_KR | PTE2_NM, vm_page_pte2_attr(a_pg))); tlb_flush_local((vm_offset_t)sysmaps->CADDR1); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(b_pg), PTE2_AP_KRW, vm_page_pte2_attr(b_pg))); tlb_flush_local((vm_offset_t)sysmaps->CADDR2); a_cp = sysmaps->CADDR1 + a_pg_offset; b_cp = sysmaps->CADDR2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } pte2_clear(sysmaps->CMAP1); tlb_flush((vm_offset_t)sysmaps->CADDR1); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); } vm_offset_t pmap_quick_enter_page(vm_page_t m) { pt2_entry_t *pte2p; vm_offset_t qmap_addr; critical_enter(); qmap_addr = PCPU_GET(qmap_addr); pte2p = pt2map_entry(qmap_addr); KASSERT(pte2_load(pte2p) == 0, ("%s: PTE2 busy", __func__)); pte2_store(pte2p, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); return (qmap_addr); } void pmap_quick_remove_page(vm_offset_t addr) { pt2_entry_t *pte2p; vm_offset_t qmap_addr; qmap_addr = PCPU_GET(qmap_addr); pte2p = pt2map_entry(qmap_addr); KASSERT(addr == qmap_addr, ("%s: invalid address", __func__)); KASSERT(pte2_load(pte2p) != 0, ("%s: PTE2 not in use", __func__)); pte2_clear(pte2p); tlb_flush(qmap_addr); critical_exit(); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
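* Here it copies valid pte2s (and, where an entire section fits into * the range, whole pte1 sections) from src_pmap to dst_pmap, typically * at fork time, so that the child can avoid demand faults; the wired * and accessed bits are cleared and PTE2_NM is set on the copies.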
*/ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t nextva; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = nextva) { pt2_entry_t *src_pte2p, *dst_pte2p; vm_page_t dst_mpt2pg, src_mpt2pg; pt1_entry_t src_pte1; u_int pte1_idx; KASSERT(addr < VM_MAXUSER_ADDRESS, ("%s: invalid to pmap_copy page tables", __func__)); nextva = pte1_trunc(addr + PTE1_SIZE); if (nextva < addr) nextva = end_addr; pte1_idx = pte1_index(addr); src_pte1 = src_pmap->pm_pt1[pte1_idx]; if (pte1_is_section(src_pte1)) { if ((addr & PTE1_OFFSET) != 0 || (addr + PTE1_SIZE) > end_addr) continue; if (dst_pmap->pm_pt1[pte1_idx] == 0 && (!pte1_is_managed(src_pte1) || pmap_pv_insert_pte1(dst_pmap, addr, pte1_pa(src_pte1)))) { dst_pmap->pm_pt1[pte1_idx] = src_pte1 & ~PTE1_W; dst_pmap->pm_stats.resident_count += PTE1_SIZE / PAGE_SIZE; pmap_pte1_mappings++; } continue; } else if (!pte1_is_link(src_pte1)) continue; src_mpt2pg = PHYS_TO_VM_PAGE(pte1_link_pa(src_pte1)); /* * We leave PT2s to be linked from PT1 even if they are not * referenced until all PT2s in a page are without reference. * * QQQ: It could be changed ... */ #if 0 /* single_pt2_link_is_cleared */ KASSERT(pt2_wirecount_get(src_mpt2pg, pte1_idx) > 0, ("%s: source page table page is unused", __func__)); #else if (pt2_wirecount_get(src_mpt2pg, pte1_idx) == 0) continue; #endif if (nextva > end_addr) nextva = end_addr; src_pte2p = pt2map_entry(addr); while (addr < nextva) { pt2_entry_t temp_pte2; temp_pte2 = pte2_load(src_pte2p); /* * we only virtual copy managed pages */ if (pte2_is_managed(temp_pte2)) { dst_mpt2pg = pmap_allocpte2(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dst_mpt2pg == NULL) goto out; dst_pte2p = pmap_pte2_quick(dst_pmap, addr); if (!pte2_is_valid(pte2_load(dst_pte2p)) && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(pte2_pa(temp_pte2)))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ temp_pte2 &= ~(PTE2_W | PTE2_A); temp_pte2 |= PTE2_NM; pte2_store(dst_pte2p, temp_pte2); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_pt2(dst_pmap, addr, dst_mpt2pg, &free)) { pmap_tlb_flush(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (pt2_wirecount_get(dst_mpt2pg, pte1_idx) >= pt2_wirecount_get(src_mpt2pg, pte1_idx)) break; } addr += PAGE_SIZE; src_pte2p++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more section mappings. 
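* I.e. if "offset" lies 0x12000 bytes past a 1MB boundary, *addr is * advanced to the next address whose low 20 bits are also 0x12000, so * that the bulk of the mapping can be made (or later promoted) with * PTE1 sections.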
*/ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t pte1_offset; if (size < PTE1_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); pte1_offset = offset & PTE1_OFFSET; if (size - ((PTE1_SIZE - pte1_offset) & PTE1_OFFSET) < PTE1_SIZE || (*addr & PTE1_OFFSET) == pte1_offset) return; if ((*addr & PTE1_OFFSET) < pte1_offset) *addr = pte1_trunc(*addr) + pte1_offset; else *addr = pte1_roundup(*addr) + pte1_offset; } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid, ttb; PDEBUG(9, printf("%s: td = %08x\n", __func__, (uint32_t)td)); critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif ttb = pmap_ttb_get(pmap); /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_pagedir = ttb; cp15_ttbr_set(ttb); PCPU_SET(curpmap, pmap); critical_exit(); } /* * Perform the pmap work for mincore. */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; vm_paddr_t pa; boolean_t managed; int val; PMAP_LOCK(pmap); retry: pte1p = pmap_pte1(pmap, addr); pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { pa = trunc_page(pte1_pa(pte1) | (addr & PTE1_OFFSET)); managed = pte1_is_managed(pte1); val = MINCORE_SUPER | MINCORE_INCORE; if (pte1_is_dirty(pte1)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if (pte1 & PTE1_A) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } else if (pte1_is_link(pte1)) { pte2p = pmap_pte2(pmap, addr); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); pa = pte2_pa(pte2); managed = pte2_is_managed(pte2); val = MINCORE_INCORE; if (pte2_is_dirty(pte2)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if (pte2 & PTE2_A) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } else { managed = FALSE; val = 0; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_kenter_device(vm_offset_t va, vm_size_t size, vm_paddr_t pa) { vm_offset_t sva; uint32_t l2attr; KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); sva = va; l2attr = vm_memattr_to_pte2(VM_MEMATTR_DEVICE); while (size != 0) { pmap_kenter_prot_attr(va, pa, PTE2_AP_KRW, l2attr); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } tlb_flush_range(sva, va - sva); } void pmap_kremove_device(vm_offset_t va, vm_size_t size) { vm_offset_t sva; KASSERT((size & PAGE_MASK) == 0, ("%s: device mapping not page-sized", __func__)); sva = va; while (size != 0) { pmap_kremove(va); va += PAGE_SIZE; size -= PAGE_SIZE; } tlb_flush_range(sva, va - sva); } void pmap_set_pcb_pagedir(pmap_t pmap, struct pcb *pcb) { pcb->pcb_pagedir = pmap_ttb_get(pmap); } /* * Clean L1 data cache range by physical address. * The range must be within a single page. 
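* ("PoU" is the ARM Point of Unification: cleaning the d-cache to PoU * is what makes freshly written instructions visible to a subsequent * i-cache invalidate on the same core.)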
*/ static void pmap_dcache_wb_pou(vm_paddr_t pa, vm_size_t size, uint32_t attr) { struct sysmaps *sysmaps; KASSERT(((pa & PAGE_MASK) + size) <= PAGE_SIZE, ("%s: not on single page", __func__)); sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP3) panic("%s: CMAP3 busy", __func__); pte2_store(sysmaps->CMAP3, PTE2_KERN_NG(pa, PTE2_AP_KRW, attr)); dcache_wb_pou((vm_offset_t)sysmaps->CADDR3 + (pa & PAGE_MASK), size); pte2_clear(sysmaps->CMAP3); tlb_flush((vm_offset_t)sysmaps->CADDR3); sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * Sync an instruction cache range which is not mapped yet. */ void cache_icache_sync_fresh(vm_offset_t va, vm_paddr_t pa, vm_size_t size) { uint32_t len, offset; vm_page_t m; /* Write back d-cache on given address range. */ offset = pa & PAGE_MASK; for ( ; size != 0; size -= len, pa += len, offset = 0) { len = min(PAGE_SIZE - offset, size); m = PHYS_TO_VM_PAGE(pa); KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", __func__, pa)); pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); } /* * I-cache is VIPT. The only way to flush all virtual mappings * of a given physical address is to invalidate the whole i-cache. */ icache_inv_all(); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t size) { /* Write back d-cache on given address range. */ if (va >= VM_MIN_KERNEL_ADDRESS) { dcache_wb_pou(va, size); } else { uint32_t len, offset; vm_paddr_t pa; vm_page_t m; offset = va & PAGE_MASK; for ( ; size != 0; size -= len, va += len, offset = 0) { pa = pmap_extract(pmap, va); /* offset is preserved */ len = min(PAGE_SIZE - offset, size); m = PHYS_TO_VM_PAGE(pa); KASSERT(m != NULL, ("%s: vm_page_t is null for %#x", __func__, pa)); pmap_dcache_wb_pou(pa, len, vm_page_pte2_attr(m)); } } /* * I-cache is VIPT. The only way to flush all virtual mappings * of a given physical address is to invalidate the whole i-cache. */ icache_inv_all(); } /* * The implementation of pmap_fault() uses the IN_RANGE2() macro, which * depends on the fact that the given range size is a power of 2. */ CTASSERT(powerof2(NB_IN_PT1)); CTASSERT(powerof2(PT2MAP_SIZE)); #define IN_RANGE2(addr, start, size) \ ((vm_offset_t)(start) == ((vm_offset_t)(addr) & ~((size) - 1))) /* * Handle access and R/W emulation faults. */ int pmap_fault(pmap_t pmap, vm_offset_t far, uint32_t fsr, int idx, bool usermode) { pt1_entry_t *pte1p, pte1; pt2_entry_t *pte2p, pte2; if (pmap == NULL) pmap = kernel_pmap; /* * In the kernel, we should never get an abort with a FAR which is in the range of * the pmap->pm_pt1 or PT2MAP address spaces. If it happens, stop here, * print out a useful abort message and even get to the debugger; * otherwise it likely ends in a never-ending loop of aborts. */ if (__predict_false(IN_RANGE2(far, pmap->pm_pt1, NB_IN_PT1))) { /* * All L1 tables should always be mapped and present. * However, we check only the current one herein. For user mode, * only a permission abort from a malicious user is not fatal, * and neither is an alignment abort, as it may have higher priority. */ if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_PERM_L2)) { CTR4(KTR_PMAP, "%s: pmap %#x pm_pt1 %#x far %#x", __func__, pmap, pmap->pm_pt1, far); panic("%s: pm_pt1 abort", __func__); } return (KERN_INVALID_ADDRESS); } if (__predict_false(IN_RANGE2(far, PT2MAP, PT2MAP_SIZE))) { /* * PT2MAP should always be mapped and present in the current * L1 table. However, only existing L2 tables are mapped * in PT2MAP. For user mode, only an L2 translation abort and * a permission abort from a malicious user are not fatal,
* and neither is an alignment abort, as it may have higher priority. */ if (!usermode || (idx != FAULT_ALIGN && idx != FAULT_TRAN_L2 && idx != FAULT_PERM_L2)) { CTR4(KTR_PMAP, "%s: pmap %#x PT2MAP %#x far %#x", __func__, pmap, PT2MAP, far); panic("%s: PT2MAP abort", __func__); } return (KERN_INVALID_ADDRESS); } /* + * A pmap lock is used below for handling of access and R/W emulation + * aborts. They were handled by atomic operations before, so some + * analysis of the new situation is needed to answer the following question: + * Is it safe to use the lock even for these aborts? + * + * Two cases may happen in general: + * + * (1) Aborts while the pmap lock is locked already - this should not + * happen, as the pmap lock is not recursive. However, under pmap lock only + * internal kernel data should be accessed and such data should be + * mapped with the A bit set and the NM bit cleared. If a double abort happens, + * then the mapping of the data which caused it must be fixed. Further, + * all new mappings are always made with the A bit set and the bit can be + * cleared only on managed mappings. + * + * (2) Aborts while another lock (or locks) is held - this can already + * happen. However, there is no difference here if it's either access or + * R/W emulation abort, or if it's some other abort. + */ + + PMAP_LOCK(pmap); +#ifdef SMP + /* + * Special treatment is needed due to the break-before-make approach used when + * a pte1 is updated for a userland mapping during section promotion or + * demotion. If not caught here, pmap_enter() can find a section + * mapping on the faulting address. That is not allowed. + */ + if (idx == FAULT_TRAN_L1 && usermode && cp15_ats1cur_check(far) == 0) { + PMAP_UNLOCK(pmap); + return (KERN_SUCCESS); + } +#endif + /* * Access bits for page and section. Note that the entry * is not in TLB yet, so TLB flush is not necessary. * * QQQ: This is hardware emulation, we do not call userret() * for aborts from user mode. */ if (idx == FAULT_ACCESS_L2) { pte2p = pt2map_entry(far); -pte2_seta: pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2)) { - if (!pte2_cmpset(pte2p, pte2, pte2 | PTE2_A)) { - goto pte2_seta; - } + pte2_store(pte2p, pte2 | PTE2_A); + PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } if (idx == FAULT_ACCESS_L1) { pte1p = pmap_pte1(pmap, far); -pte1_seta: pte1 = pte1_load(pte1p); if (pte1_is_section(pte1)) { - if (!pte1_cmpset(pte1p, pte1, pte1 | PTE1_A)) { - goto pte1_seta; - } + pte1_store(pte1p, pte1 | PTE1_A); + PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } /* * Handle modify bits for page and section. Note that the modify * bit is emulated by software. So PTEx_RO is software read only * bit and PTEx_NM flag is real hardware read only bit. * * QQQ: This is hardware emulation, we do not call userret() * for aborts from user mode. - * We do not lock PMAP, so cmpset() is a need. Hopefully, - * no one removes the mapping when we are here.
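+ * With the pmap lock held, the plain pte1_store()/pte2_store() updates + * in these emulation paths are sufficient: concurrent A/NM changes to + * this pmap are serialized by the lock, which is why the old cmpset() + * retry loops could be dropped.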
*/ if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L2)) { pte2p = pt2map_entry(far); -pte2_setrw: pte2 = pte2_load(pte2p); if (pte2_is_valid(pte2) && !(pte2 & PTE2_RO) && (pte2 & PTE2_NM)) { - if (!pte2_cmpset(pte2p, pte2, pte2 & ~PTE2_NM)) { - goto pte2_setrw; - } + pte2_store(pte2p, pte2 & ~PTE2_NM); tlb_flush(trunc_page(far)); + PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } if ((fsr & FSR_WNR) && (idx == FAULT_PERM_L1)) { pte1p = pmap_pte1(pmap, far); -pte1_setrw: pte1 = pte1_load(pte1p); if (pte1_is_section(pte1) && !(pte1 & PTE1_RO) && (pte1 & PTE1_NM)) { - if (!pte1_cmpset(pte1p, pte1, pte1 & ~PTE1_NM)) { - goto pte1_setrw; - } + pte1_store(pte1p, pte1 & ~PTE1_NM); tlb_flush(pte1_trunc(far)); + PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } } /* * QQQ: The previous code, mainly fast handling of access and * modify bits aborts, could be moved to ASM. Now we are * starting to deal with not fast aborts. */ #ifdef INVARIANTS /* * Read an entry in PT2TAB associated with both pmap and far. * It's safe because PT2TAB is always mapped. - * - * QQQ: We do not lock PMAP, so false positives could happen if - * the mapping is removed concurrently. */ pte2 = pt2tab_load(pmap_pt2tab_entry(pmap, far)); if (pte2_is_valid(pte2)) { /* * Now, when we know that L2 page table is allocated, * we can use PT2MAP to get L2 page table entry. */ pte2 = pte2_load(pt2map_entry(far)); if (pte2_is_valid(pte2)) { /* * If L2 page table entry is valid, make sure that * L1 page table entry is valid too. Note that we * leave L2 page entries untouched when promoted. */ pte1 = pte1_load(pmap_pte1(pmap, far)); if (!pte1_is_valid(pte1)) { panic("%s: missing L1 page entry (%p, %#x)", __func__, pmap, far); } } } #endif + PMAP_UNLOCK(pmap); return (KERN_FAILURE); } #if defined(PMAP_DEBUG) /* * Reusing of KVA used in pmap_zero_page function !!! */ static void pmap_zero_page_check(vm_page_t m) { uint32_t *p, *end; struct sysmaps *sysmaps; sched_pin(); sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (pte2_load(sysmaps->CMAP2) != 0) panic("%s: CMAP2 busy", __func__); pte2_store(sysmaps->CMAP2, PTE2_KERN_NG(VM_PAGE_TO_PHYS(m), PTE2_AP_KRW, vm_page_pte2_attr(m))); end = (uint32_t*)(sysmaps->CADDR2 + PAGE_SIZE); for (p = (uint32_t*)sysmaps->CADDR2; p < end; p++) if (*p != 0) panic("%s: page %p not zero, va: %p", __func__, m, sysmaps->CADDR2); pte2_clear(sysmaps->CMAP2); tlb_flush((vm_offset_t)sysmaps->CADDR2); sched_unpin(); mtx_unlock(&sysmaps->lock); } int pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte2 = 0; int i, j, index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid || p->p_vmspace == NULL) continue; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPTE1_IN_PT1; i++) { pt1_entry_t pte1; pt2_entry_t *pte2p, pte2; vm_offset_t base, va; vm_paddr_t pa; vm_page_t m; base = i << PTE1_SHIFT; pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1_is_section(pte1)) { /* * QQQ: Do something here! 
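* (presumably: print the section's physical address and flags, * analogously to what the link case below prints per pte2)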
*/ } else if (pte1_is_link(pte1)) { for (j = 0; j < NPTE2_IN_PT2; j++) { va = base + (j << PAGE_SHIFT); if (va >= VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte2); } pte2p = pmap_pte2(pmap, va); pte2 = pte2_load(pte2p); pmap_pte2_release(pte2p); if (!pte2_is_valid(pte2)) continue; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pa: 0x%x, h: %d, w:" " %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte2++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } sx_sunlock(&allproc_lock); return (npte2); } #endif #ifdef DDB static pt2_entry_t * pmap_pte2_ddb(pmap_t pmap, vm_offset_t va) { pt1_entry_t pte1; vm_paddr_t pt2pg_pa; pte1 = pte1_load(pmap_pte1(pmap, va)); if (!pte1_is_link(pte1)) return (NULL); if (pmap_is_current(pmap)) return (pt2map_entry(va)); /* Note that L2 page table size is not equal to PAGE_SIZE. */ pt2pg_pa = trunc_page(pte1_link_pa(pte1)); if (pte2_pa(pte2_load(PMAP3)) != pt2pg_pa) { pte2_store(PMAP3, PTE2_KPT(pt2pg_pa)); #ifdef SMP PMAP3cpu = PCPU_GET(cpuid); #endif tlb_flush_local((vm_offset_t)PADDR3); } #ifdef SMP else if (PMAP3cpu != PCPU_GET(cpuid)) { PMAP3cpu = PCPU_GET(cpuid); tlb_flush_local((vm_offset_t)PADDR3); } #endif return (PADDR3 + (arm32_btop(va) & (NPTE2_IN_PG - 1))); } static void dump_pmap(pmap_t pmap) { printf("pmap %p\n", pmap); printf(" pm_pt1: %p\n", pmap->pm_pt1); printf(" pm_pt2tab: %p\n", pmap->pm_pt2tab); printf(" pm_active: 0x%08lX\n", pmap->pm_active.__bits[0]); } DB_SHOW_COMMAND(pmaps, pmap_list_pmaps) { pmap_t pmap; LIST_FOREACH(pmap, &allpmaps, pm_list) { dump_pmap(pmap); } } static int pte2_class(pt2_entry_t pte2) { int cls; cls = (pte2 >> 2) & 0x03; cls |= (pte2 >> 4) & 0x04; return (cls); } static void dump_section(pmap_t pmap, uint32_t pte1_idx) { } static void dump_link(pmap_t pmap, uint32_t pte1_idx, boolean_t invalid_ok) { uint32_t i; vm_offset_t va; pt2_entry_t *pte2p, pte2; vm_page_t m; va = pte1_idx << PTE1_SHIFT; pte2p = pmap_pte2_ddb(pmap, va); for (i = 0; i < NPTE2_IN_PT2; i++, pte2p++, va += PAGE_SIZE) { pte2 = pte2_load(pte2p); if (pte2 == 0) continue; if (!pte2_is_valid(pte2)) { printf(" 0x%08X: 0x%08X", va, pte2); if (!invalid_ok) printf(" - not valid !!!"); printf("\n"); continue; } m = PHYS_TO_VM_PAGE(pte2_pa(pte2)); printf(" 0x%08X: 0x%08X, TEX%d, s:%d, g:%d, m:%p", va , pte2, pte2_class(pte2), !!(pte2 & PTE2_S), !(pte2 & PTE2_NG), m); if (m != NULL) { printf(" v:%d h:%d w:%d f:0x%04X\n", m->valid, m->hold_count, m->wire_count, m->flags); } else { printf("\n"); } } } static __inline boolean_t is_pv_chunk_space(vm_offset_t va) { if ((((vm_offset_t)pv_chunkbase) <= va) && (va < ((vm_offset_t)pv_chunkbase + PAGE_SIZE * pv_maxchunks))) return (TRUE); return (FALSE); } DB_SHOW_COMMAND(pmap, pmap_pmap_print) { /* XXX convert args. */ pmap_t pmap = (pmap_t)addr; pt1_entry_t pte1; pt2_entry_t pte2; vm_offset_t va, eva; vm_page_t m; uint32_t i; boolean_t invalid_ok, dump_link_ok, dump_pv_chunk; if (have_addr) { pmap_t pm; LIST_FOREACH(pm, &allpmaps, pm_list) if (pm == pmap) break; if (pm == NULL) { printf("given pmap %p is not in allpmaps list\n", pmap); return; } } else pmap = PCPU_GET(curpmap); eva = (modif[0] == 'u') ? 
VM_MAXUSER_ADDRESS : 0xFFFFFFFF; dump_pv_chunk = FALSE; /* XXX evaluate from modif[] */ printf("pmap: 0x%08X\n", (uint32_t)pmap); printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); for(i = 0; i < NPTE1_IN_PT1; i++) { pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1 == 0) continue; va = i << PTE1_SHIFT; if (va >= eva) break; if (pte1_is_section(pte1)) { printf("0x%08X: Section 0x%08X, s:%d g:%d\n", va, pte1, !!(pte1 & PTE1_S), !(pte1 & PTE1_NG)); dump_section(pmap, i); } else if (pte1_is_link(pte1)) { dump_link_ok = TRUE; invalid_ok = FALSE; pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); m = PHYS_TO_VM_PAGE(pte1_link_pa(pte1)); printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X m: %p", va, pte1, pte2, m); if (is_pv_chunk_space(va)) { printf(" - pv_chunk space"); if (dump_pv_chunk) invalid_ok = TRUE; else dump_link_ok = FALSE; } else if (m != NULL) printf(" w:%d w2:%u", m->wire_count, pt2_wirecount_get(m, pte1_index(va))); if (pte2 == 0) printf(" !!! pt2tab entry is ZERO"); else if (pte2_pa(pte1) != pte2_pa(pte2)) printf(" !!! pt2tab entry is DIFFERENT - m: %p", PHYS_TO_VM_PAGE(pte2_pa(pte2))); printf("\n"); if (dump_link_ok) dump_link(pmap, i, invalid_ok); } else printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); } } static void dump_pt2tab(pmap_t pmap) { uint32_t i; pt2_entry_t pte2; vm_offset_t va; vm_paddr_t pa; vm_page_t m; printf("PT2TAB:\n"); for (i = 0; i < PT2TAB_ENTRIES; i++) { pte2 = pte2_load(&pmap->pm_pt2tab[i]); if (!pte2_is_valid(pte2)) continue; va = i << PT2TAB_SHIFT; pa = pte2_pa(pte2); m = PHYS_TO_VM_PAGE(pa); printf(" 0x%08X: 0x%08X, TEX%d, s:%d, m:%p", va, pte2, pte2_class(pte2), !!(pte2 & PTE2_S), m); if (m != NULL) printf(" , h: %d, w: %d, f: 0x%04X pidx: %lld", m->hold_count, m->wire_count, m->flags, m->pindex); printf("\n"); } } DB_SHOW_COMMAND(pmap_pt2tab, pmap_pt2tab_print) { /* XXX convert args. */ pmap_t pmap = (pmap_t)addr; pt1_entry_t pte1; pt2_entry_t pte2; vm_offset_t va; uint32_t i, start; if (have_addr) { printf("supported only on current pmap\n"); return; } pmap = PCPU_GET(curpmap); printf("curpmap: 0x%08X\n", (uint32_t)pmap); printf("PT2MAP: 0x%08X\n", (uint32_t)PT2MAP); printf("pt2tab: 0x%08X\n", (uint32_t)pmap->pm_pt2tab); start = pte1_index((vm_offset_t)PT2MAP); for (i = start; i < (start + NPT2_IN_PT2TAB); i++) { pte1 = pte1_load(&pmap->pm_pt1[i]); if (pte1 == 0) continue; va = i << PTE1_SHIFT; if (pte1_is_section(pte1)) { printf("0x%08X: Section 0x%08X, s:%d\n", va, pte1, !!(pte1 & PTE1_S)); dump_section(pmap, i); } else if (pte1_is_link(pte1)) { pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); printf("0x%08X: Link 0x%08X, pt2tab: 0x%08X\n", va, pte1, pte2); if (pte2 == 0) printf(" !!! pt2tab entry is ZERO\n"); } else printf("0x%08X: Invalid entry 0x%08X\n", va, pte1); } dump_pt2tab(pmap); } #endif Index: user/ngie/bsnmp_cleanup/sys/arm/include/cpu-v6.h =================================================================== --- user/ngie/bsnmp_cleanup/sys/arm/include/cpu-v6.h (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/arm/include/cpu-v6.h (revision 298468) @@ -1,586 +1,634 @@ /*- * Copyright 2014 Svatopluk Kraus * Copyright 2014 Michal Meloun * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef MACHINE_CPU_V6_H #define MACHINE_CPU_V6_H /* There are no user serviceable parts here, they may change without notice */ #ifndef _KERNEL #error Only include this file in the kernel #endif #include #include #include #include #include #if __ARM_ARCH < 6 #error Only include this file for ARMv6 #else #define CPU_ASID_KERNEL 0 void dcache_wbinv_poc_all(void); /* !!! NOT SMP coherent function !!! */ vm_offset_t dcache_wb_pou_checked(vm_offset_t, vm_size_t); vm_offset_t icache_inv_pou_checked(vm_offset_t, vm_size_t); #ifdef DEV_PMU #include #define PMU_OVSR_C 0x80000000 /* Cycle Counter */ extern uint32_t ccnt_hi[MAXCPU]; extern int pmu_attched; #endif /* DEV_PMU */ /* * Macros to generate CP15 (system control processor) read/write functions. */ #define _FX(s...) #s #define _RF0(fname, aname...) \ static __inline register_t \ fname(void) \ { \ register_t reg; \ __asm __volatile("mrc\t" _FX(aname): "=r" (reg)); \ return(reg); \ } #define _R64F0(fname, aname) \ static __inline uint64_t \ fname(void) \ { \ uint64_t reg; \ __asm __volatile("mrrc\t" _FX(aname): "=r" (reg)); \ return(reg); \ } #define _WF0(fname, aname...) \ static __inline void \ fname(void) \ { \ __asm __volatile("mcr\t" _FX(aname)); \ } #define _WF1(fname, aname...) \ static __inline void \ fname(register_t reg) \ { \ __asm __volatile("mcr\t" _FX(aname):: "r" (reg)); \ } #define _W64F1(fname, aname...) \ static __inline void \ fname(uint64_t reg) \ { \ __asm __volatile("mcrr\t" _FX(aname):: "r" (reg)); \ } /* * Raw CP15 maintenance operations * !!! not for external use !!! 
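 * * (For reference, each _RF0/_WF1 use expands to a tiny inline function; * e.g. _RF0(cp15_dfsr_get, CP15_DFSR(%0)) becomes roughly: * static __inline register_t cp15_dfsr_get(void) { register_t reg; * __asm __volatile("mrc p15, 0, %0, c5, c0, 0" : "=r" (reg)); return (reg); } * This is a sketch only, assuming CP15_DFSR carries the usual * c5, c0, 0 operand encoding.)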
*/ /* TLB */ _WF0(_CP15_TLBIALL, CP15_TLBIALL) /* Invalidate entire unified TLB */ #if __ARM_ARCH >= 7 && defined SMP _WF0(_CP15_TLBIALLIS, CP15_TLBIALLIS) /* Invalidate entire unified TLB IS */ #endif _WF1(_CP15_TLBIASID, CP15_TLBIASID(%0)) /* Invalidate unified TLB by ASID */ #if __ARM_ARCH >= 7 && defined SMP _WF1(_CP15_TLBIASIDIS, CP15_TLBIASIDIS(%0)) /* Invalidate unified TLB by ASID IS */ #endif _WF1(_CP15_TLBIMVAA, CP15_TLBIMVAA(%0)) /* Invalidate unified TLB by MVA, all ASID */ #if __ARM_ARCH >= 7 && defined SMP _WF1(_CP15_TLBIMVAAIS, CP15_TLBIMVAAIS(%0)) /* Invalidate unified TLB by MVA, all ASID IS */ #endif _WF1(_CP15_TLBIMVA, CP15_TLBIMVA(%0)) /* Invalidate unified TLB by MVA */ _WF1(_CP15_TTB_SET, CP15_TTBR0(%0)) /* Cache and Branch predictor */ _WF0(_CP15_BPIALL, CP15_BPIALL) /* Branch predictor invalidate all */ #if __ARM_ARCH >= 7 && defined SMP _WF0(_CP15_BPIALLIS, CP15_BPIALLIS) /* Branch predictor invalidate all IS */ #endif _WF1(_CP15_BPIMVA, CP15_BPIMVA(%0)) /* Branch predictor invalidate by MVA */ _WF1(_CP15_DCCIMVAC, CP15_DCCIMVAC(%0)) /* Data cache clean and invalidate by MVA PoC */ _WF1(_CP15_DCCISW, CP15_DCCISW(%0)) /* Data cache clean and invalidate by set/way */ _WF1(_CP15_DCCMVAC, CP15_DCCMVAC(%0)) /* Data cache clean by MVA PoC */ #if __ARM_ARCH >= 7 _WF1(_CP15_DCCMVAU, CP15_DCCMVAU(%0)) /* Data cache clean by MVA PoU */ #endif _WF1(_CP15_DCCSW, CP15_DCCSW(%0)) /* Data cache clean by set/way */ _WF1(_CP15_DCIMVAC, CP15_DCIMVAC(%0)) /* Data cache invalidate by MVA PoC */ _WF1(_CP15_DCISW, CP15_DCISW(%0)) /* Data cache invalidate by set/way */ _WF0(_CP15_ICIALLU, CP15_ICIALLU) /* Instruction cache invalidate all PoU */ #if __ARM_ARCH >= 7 && defined SMP _WF0(_CP15_ICIALLUIS, CP15_ICIALLUIS) /* Instruction cache invalidate all PoU IS */ #endif _WF1(_CP15_ICIMVAU, CP15_ICIMVAU(%0)) /* Instruction cache invalidate */ /* * Publicly accessible functions */ /* CP14 Debug Registers */ _RF0(cp14_dbgdidr_get, CP14_DBGDIDR(%0)) _RF0(cp14_dbgprsr_get, CP14_DBGPRSR(%0)) _RF0(cp14_dbgoslsr_get, CP14_DBGOSLSR(%0)) _RF0(cp14_dbgosdlr_get, CP14_DBGOSDLR(%0)) _RF0(cp14_dbgdscrint_get, CP14_DBGDSCRint(%0)) _WF1(cp14_dbgdscr_v6_set, CP14_DBGDSCRext_V6(%0)) _WF1(cp14_dbgdscr_v7_set, CP14_DBGDSCRext_V7(%0)) _WF1(cp14_dbgvcr_set, CP14_DBGVCR(%0)) _WF1(cp14_dbgoslar_set, CP14_DBGOSLAR(%0)) /* Various control registers */ _RF0(cp15_cpacr_get, CP15_CPACR(%0)) _WF1(cp15_cpacr_set, CP15_CPACR(%0)) _RF0(cp15_dfsr_get, CP15_DFSR(%0)) _RF0(cp15_ifsr_get, CP15_IFSR(%0)) _WF1(cp15_prrr_set, CP15_PRRR(%0)) _WF1(cp15_nmrr_set, CP15_NMRR(%0)) _RF0(cp15_ttbr_get, CP15_TTBR0(%0)) _RF0(cp15_dfar_get, CP15_DFAR(%0)) #if __ARM_ARCH >= 7 _RF0(cp15_ifar_get, CP15_IFAR(%0)) _RF0(cp15_l2ctlr_get, CP15_L2CTLR(%0)) #endif _RF0(cp15_actlr_get, CP15_ACTLR(%0)) _WF1(cp15_actlr_set, CP15_ACTLR(%0)) _WF1(cp15_ats1cpr_set, CP15_ATS1CPR(%0)) _WF1(cp15_ats1cpw_set, CP15_ATS1CPW(%0)) +_WF1(cp15_ats1cur_set, CP15_ATS1CUR(%0)) +_WF1(cp15_ats1cuw_set, CP15_ATS1CUW(%0)) _RF0(cp15_par_get, CP15_PAR(%0)) _RF0(cp15_sctlr_get, CP15_SCTLR(%0)) /*CPU id registers */ _RF0(cp15_midr_get, CP15_MIDR(%0)) _RF0(cp15_ctr_get, CP15_CTR(%0)) _RF0(cp15_tcmtr_get, CP15_TCMTR(%0)) _RF0(cp15_tlbtr_get, CP15_TLBTR(%0)) _RF0(cp15_mpidr_get, CP15_MPIDR(%0)) _RF0(cp15_revidr_get, CP15_REVIDR(%0)) _RF0(cp15_ccsidr_get, CP15_CCSIDR(%0)) _RF0(cp15_clidr_get, CP15_CLIDR(%0)) _RF0(cp15_aidr_get, CP15_AIDR(%0)) _WF1(cp15_csselr_set, CP15_CSSELR(%0)) _RF0(cp15_id_pfr0_get, CP15_ID_PFR0(%0)) _RF0(cp15_id_pfr1_get, CP15_ID_PFR1(%0)) 
_RF0(cp15_id_dfr0_get, CP15_ID_DFR0(%0)) _RF0(cp15_id_afr0_get, CP15_ID_AFR0(%0)) _RF0(cp15_id_mmfr0_get, CP15_ID_MMFR0(%0)) _RF0(cp15_id_mmfr1_get, CP15_ID_MMFR1(%0)) _RF0(cp15_id_mmfr2_get, CP15_ID_MMFR2(%0)) _RF0(cp15_id_mmfr3_get, CP15_ID_MMFR3(%0)) _RF0(cp15_id_isar0_get, CP15_ID_ISAR0(%0)) _RF0(cp15_id_isar1_get, CP15_ID_ISAR1(%0)) _RF0(cp15_id_isar2_get, CP15_ID_ISAR2(%0)) _RF0(cp15_id_isar3_get, CP15_ID_ISAR3(%0)) _RF0(cp15_id_isar4_get, CP15_ID_ISAR4(%0)) _RF0(cp15_id_isar5_get, CP15_ID_ISAR5(%0)) _RF0(cp15_cbar_get, CP15_CBAR(%0)) /* Performance Monitor registers */ #if __ARM_ARCH == 6 && defined(CPU_ARM1176) _RF0(cp15_pmuserenr_get, CP15_PMUSERENR(%0)) _WF1(cp15_pmuserenr_set, CP15_PMUSERENR(%0)) _RF0(cp15_pmcr_get, CP15_PMCR(%0)) _WF1(cp15_pmcr_set, CP15_PMCR(%0)) _RF0(cp15_pmccntr_get, CP15_PMCCNTR(%0)) _WF1(cp15_pmccntr_set, CP15_PMCCNTR(%0)) #elif __ARM_ARCH > 6 _RF0(cp15_pmcr_get, CP15_PMCR(%0)) _WF1(cp15_pmcr_set, CP15_PMCR(%0)) _RF0(cp15_pmcnten_get, CP15_PMCNTENSET(%0)) _WF1(cp15_pmcnten_set, CP15_PMCNTENSET(%0)) _WF1(cp15_pmcnten_clr, CP15_PMCNTENCLR(%0)) _RF0(cp15_pmovsr_get, CP15_PMOVSR(%0)) _WF1(cp15_pmovsr_set, CP15_PMOVSR(%0)) _WF1(cp15_pmswinc_set, CP15_PMSWINC(%0)) _RF0(cp15_pmselr_get, CP15_PMSELR(%0)) _WF1(cp15_pmselr_set, CP15_PMSELR(%0)) _RF0(cp15_pmccntr_get, CP15_PMCCNTR(%0)) _WF1(cp15_pmccntr_set, CP15_PMCCNTR(%0)) _RF0(cp15_pmxevtyper_get, CP15_PMXEVTYPER(%0)) _WF1(cp15_pmxevtyper_set, CP15_PMXEVTYPER(%0)) _RF0(cp15_pmxevcntr_get, CP15_PMXEVCNTRR(%0)) _WF1(cp15_pmxevcntr_set, CP15_PMXEVCNTRR(%0)) _RF0(cp15_pmuserenr_get, CP15_PMUSERENR(%0)) _WF1(cp15_pmuserenr_set, CP15_PMUSERENR(%0)) _RF0(cp15_pminten_get, CP15_PMINTENSET(%0)) _WF1(cp15_pminten_set, CP15_PMINTENSET(%0)) _WF1(cp15_pminten_clr, CP15_PMINTENCLR(%0)) #endif _RF0(cp15_tpidrurw_get, CP15_TPIDRURW(%0)) _WF1(cp15_tpidrurw_set, CP15_TPIDRURW(%0)) _RF0(cp15_tpidruro_get, CP15_TPIDRURO(%0)) _WF1(cp15_tpidruro_set, CP15_TPIDRURO(%0)) _RF0(cp15_tpidrpwr_get, CP15_TPIDRPRW(%0)) _WF1(cp15_tpidrpwr_set, CP15_TPIDRPRW(%0)) /* Generic Timer registers - only use when you know the hardware is available */ _RF0(cp15_cntfrq_get, CP15_CNTFRQ(%0)) _WF1(cp15_cntfrq_set, CP15_CNTFRQ(%0)) _RF0(cp15_cntkctl_get, CP15_CNTKCTL(%0)) _WF1(cp15_cntkctl_set, CP15_CNTKCTL(%0)) _RF0(cp15_cntp_tval_get, CP15_CNTP_TVAL(%0)) _WF1(cp15_cntp_tval_set, CP15_CNTP_TVAL(%0)) _RF0(cp15_cntp_ctl_get, CP15_CNTP_CTL(%0)) _WF1(cp15_cntp_ctl_set, CP15_CNTP_CTL(%0)) _RF0(cp15_cntv_tval_get, CP15_CNTV_TVAL(%0)) _WF1(cp15_cntv_tval_set, CP15_CNTV_TVAL(%0)) _RF0(cp15_cntv_ctl_get, CP15_CNTV_CTL(%0)) _WF1(cp15_cntv_ctl_set, CP15_CNTV_CTL(%0)) _RF0(cp15_cnthctl_get, CP15_CNTHCTL(%0)) _WF1(cp15_cnthctl_set, CP15_CNTHCTL(%0)) _RF0(cp15_cnthp_tval_get, CP15_CNTHP_TVAL(%0)) _WF1(cp15_cnthp_tval_set, CP15_CNTHP_TVAL(%0)) _RF0(cp15_cnthp_ctl_get, CP15_CNTHP_CTL(%0)) _WF1(cp15_cnthp_ctl_set, CP15_CNTHP_CTL(%0)) _R64F0(cp15_cntpct_get, CP15_CNTPCT(%Q0, %R0)) _R64F0(cp15_cntvct_get, CP15_CNTVCT(%Q0, %R0)) _R64F0(cp15_cntp_cval_get, CP15_CNTP_CVAL(%Q0, %R0)) _W64F1(cp15_cntp_cval_set, CP15_CNTP_CVAL(%Q0, %R0)) _R64F0(cp15_cntv_cval_get, CP15_CNTV_CVAL(%Q0, %R0)) _W64F1(cp15_cntv_cval_set, CP15_CNTV_CVAL(%Q0, %R0)) _R64F0(cp15_cntvoff_get, CP15_CNTVOFF(%Q0, %R0)) _W64F1(cp15_cntvoff_set, CP15_CNTVOFF(%Q0, %R0)) _R64F0(cp15_cnthp_cval_get, CP15_CNTHP_CVAL(%Q0, %R0)) _W64F1(cp15_cnthp_cval_set, CP15_CNTHP_CVAL(%Q0, %R0)) #undef _FX #undef _RF0 #undef _WF0 #undef _WF1 /* * TLB maintenance operations. */ /* Local (i.e. not broadcasting ) operations. 
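 * These operate only on the executing core's TLB; the *_IS (inner * shareable) variants used by the SMP versions further below broadcast * the operation to all cores in the inner shareable domain.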
*/ /* Flush all TLB entries (even global). */ static __inline void tlb_flush_all_local(void) { dsb(); _CP15_TLBIALL(); dsb(); } /* Flush all not global TLB entries. */ static __inline void tlb_flush_all_ng_local(void) { dsb(); _CP15_TLBIASID(CPU_ASID_KERNEL); dsb(); } /* Flush single TLB entry (even global). */ static __inline void tlb_flush_local(vm_offset_t va) { KASSERT((va & PAGE_MASK) == 0, ("%s: va %#x not aligned", __func__, va)); dsb(); _CP15_TLBIMVA(va | CPU_ASID_KERNEL); dsb(); } /* Flush range of TLB entries (even global). */ static __inline void tlb_flush_range_local(vm_offset_t va, vm_size_t size) { vm_offset_t eva = va + size; KASSERT((va & PAGE_MASK) == 0, ("%s: va %#x not aligned", __func__, va)); KASSERT((size & PAGE_MASK) == 0, ("%s: size %#x not aligned", __func__, size)); dsb(); for (; va < eva; va += PAGE_SIZE) _CP15_TLBIMVA(va | CPU_ASID_KERNEL); dsb(); } /* Broadcasting operations. */ #if __ARM_ARCH >= 7 && defined SMP static __inline void tlb_flush_all(void) { dsb(); _CP15_TLBIALLIS(); dsb(); } static __inline void tlb_flush_all_ng(void) { dsb(); _CP15_TLBIASIDIS(CPU_ASID_KERNEL); dsb(); } static __inline void tlb_flush(vm_offset_t va) { KASSERT((va & PAGE_MASK) == 0, ("%s: va %#x not aligned", __func__, va)); dsb(); _CP15_TLBIMVAAIS(va); dsb(); } static __inline void tlb_flush_range(vm_offset_t va, vm_size_t size) { vm_offset_t eva = va + size; KASSERT((va & PAGE_MASK) == 0, ("%s: va %#x not aligned", __func__, va)); KASSERT((size & PAGE_MASK) == 0, ("%s: size %#x not aligned", __func__, size)); dsb(); for (; va < eva; va += PAGE_SIZE) _CP15_TLBIMVAAIS(va); dsb(); } #else /* SMP */ #define tlb_flush_all() tlb_flush_all_local() #define tlb_flush_all_ng() tlb_flush_all_ng_local() #define tlb_flush(va) tlb_flush_local(va) #define tlb_flush_range(va, size) tlb_flush_range_local(va, size) #endif /* SMP */ /* * Cache maintenance operations. */ /* Sync I and D caches to PoU */ static __inline void icache_sync(vm_offset_t va, vm_size_t size) { vm_offset_t eva = va + size; dsb(); va &= ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { #if __ARM_ARCH >= 7 && defined SMP _CP15_DCCMVAU(va); #else _CP15_DCCMVAC(va); #endif } dsb(); #if __ARM_ARCH >= 7 && defined SMP _CP15_ICIALLUIS(); #else _CP15_ICIALLU(); #endif dsb(); isb(); } /* Invalidate I cache */ static __inline void icache_inv_all(void) { #if __ARM_ARCH >= 7 && defined SMP _CP15_ICIALLUIS(); #else _CP15_ICIALLU(); #endif dsb(); isb(); } /* Invalidate branch predictor buffer */ static __inline void bpb_inv_all(void) { #if __ARM_ARCH >= 7 && defined SMP _CP15_BPIALLIS(); #else _CP15_BPIALL(); #endif dsb(); isb(); } /* Write back D-cache to PoU */ static __inline void dcache_wb_pou(vm_offset_t va, vm_size_t size) { vm_offset_t eva = va + size; dsb(); va &= ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { #if __ARM_ARCH >= 7 && defined SMP _CP15_DCCMVAU(va); #else _CP15_DCCMVAC(va); #endif } dsb(); } /* * Invalidate D-cache to PoC * * Caches are invalidated from outermost to innermost as fresh cachelines * flow in this direction. In given range, if there was no dirty cacheline * in any cache before, no stale cacheline should remain in them after this * operation finishes. 
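 * If the order were reversed, a stale line still present in L2 could be * fetched back into L1 after the L1 invalidation and before the L2 * invalidation completes.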
*/ static __inline void dcache_inv_poc(vm_offset_t va, vm_paddr_t pa, vm_size_t size) { vm_offset_t eva = va + size; dsb(); /* invalidate L2 first */ cpu_l2cache_inv_range(pa, size); /* then L1 */ va &= ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { _CP15_DCIMVAC(va); } dsb(); } /* * Discard D-cache lines to PoC, prior to overwrite by DMA engine. * * Normal invalidation does L2 then L1 to ensure that stale data from L2 doesn't * flow into L1 while invalidating. This routine is intended to be used only * when invalidating a buffer before a DMA operation loads new data into memory. * The concern in this case is that a dirty line could be evicted to main * memory, overwriting the DMA data. For that reason, the L1 is done first to * ensure that an evicted L1 line doesn't flow to L2 after the L2 has been * cleaned. */ static __inline void dcache_inv_poc_dma(vm_offset_t va, vm_paddr_t pa, vm_size_t size) { vm_offset_t eva = va + size; /* invalidate L1 first */ dsb(); va &= ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { _CP15_DCIMVAC(va); } dsb(); /* then L2 */ cpu_l2cache_inv_range(pa, size); } /* * Write back D-cache to PoC * * Caches are written back from innermost to outermost as dirty cachelines * flow in this direction. In the given range, no dirty cacheline should * remain in any cache after this operation finishes. */ static __inline void dcache_wb_poc(vm_offset_t va, vm_paddr_t pa, vm_size_t size) { vm_offset_t eva = va + size; dsb(); va &= ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { _CP15_DCCMVAC(va); } dsb(); cpu_l2cache_wb_range(pa, size); } /* Write back and invalidate D-cache to PoC */ static __inline void dcache_wbinv_poc(vm_offset_t sva, vm_paddr_t pa, vm_size_t size) { vm_offset_t va; vm_offset_t eva = sva + size; dsb(); /* write back L1 first */ va = sva & ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { _CP15_DCCMVAC(va); } dsb(); /* then write back and invalidate L2 */ cpu_l2cache_wbinv_range(pa, size); /* then invalidate L1 */ va = sva & ~cpuinfo.dcache_line_mask; for ( ; va < eva; va += cpuinfo.dcache_line_size) { _CP15_DCIMVAC(va); } dsb(); } /* Set TTB0 register */ static __inline void cp15_ttbr_set(uint32_t reg) { dsb(); _CP15_TTB_SET(reg); dsb(); _CP15_BPIALL(); dsb(); isb(); tlb_flush_all_ng_local(); } -#endif /* _KERNEL */ + +/* + * Functions for address checking: + * + * cp15_ats1cpr_check() ... check stage 1 privileged (PL1) read access + * cp15_ats1cpw_check() ... check stage 1 privileged (PL1) write access + * cp15_ats1cur_check() ... check stage 1 unprivileged (PL0) read access + * cp15_ats1cuw_check() ... check stage 1 unprivileged (PL0) write access + * + * They must be called while interrupts are disabled to get a consistent + * result. + */ +static __inline int +cp15_ats1cpr_check(vm_offset_t addr) +{ + + cp15_ats1cpr_set(addr); + isb(); + return (cp15_par_get() & 0x01 ? EFAULT : 0); +} + +static __inline int +cp15_ats1cpw_check(vm_offset_t addr) +{ + + cp15_ats1cpw_set(addr); + isb(); + return (cp15_par_get() & 0x01 ? EFAULT : 0); +} + +static __inline int +cp15_ats1cur_check(vm_offset_t addr) +{ + + cp15_ats1cur_set(addr); + isb(); + return (cp15_par_get() & 0x01 ? EFAULT : 0); +} + +static __inline int +cp15_ats1cuw_check(vm_offset_t addr) +{ + + cp15_ats1cuw_set(addr); + isb(); + return (cp15_par_get() & 0x01 ?
EFAULT : 0); +} +#endif /* !__ARM_ARCH < 6 */ #endif /* !MACHINE_CPU_V6_H */ Index: user/ngie/bsnmp_cleanup/sys/arm/include/pmap_var.h =================================================================== --- user/ngie/bsnmp_cleanup/sys/arm/include/pmap_var.h (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/arm/include/pmap_var.h (revision 298468) @@ -1,512 +1,494 @@ /*- * Copyright 2014 Svatopluk Kraus * Copyright 2014 Michal Meloun * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_PMAP_VAR_H_ #define _MACHINE_PMAP_VAR_H_ #include #include /* * Various PMAP defines, exports, and inline function definitions, * also usable in other MD code. */ /* A number of pages in L1 page table. */ #define NPG_IN_PT1 (NB_IN_PT1 / PAGE_SIZE) /* A number of L2 page tables in a page. */ #define NPT2_IN_PG (PAGE_SIZE / NB_IN_PT2) /* A number of L2 page table entries in a page. */ #define NPTE2_IN_PG (NPT2_IN_PG * NPTE2_IN_PT2) #ifdef _KERNEL /* * An L2 page tables page contains NPT2_IN_PG L2 page tables. Masking of * pte1_idx by PT2PG_MASK gives us an index to the associated L2 page table * in a page. The PT2PG_SHIFT definition depends on NPT2_IN_PG strictly. * I.e., (1 << PT2PG_SHIFT) == NPT2_IN_PG must be fulfilled. */ #define PT2PG_SHIFT 2 #define PT2PG_MASK ((1 << PT2PG_SHIFT) - 1) /* * A PT2TAB holds all allocated L2 page table pages in a pmap. * Right shifting of a virtual address by PT2TAB_SHIFT gives us an index * to the L2 page table page in PT2TAB which holds the address mapping. */ #define PT2TAB_ENTRIES (NPTE1_IN_PT1 / NPT2_IN_PG) #define PT2TAB_SHIFT (PTE1_SHIFT + PT2PG_SHIFT) /* * All allocated L2 page table pages in a pmap are mapped into PT2MAP space. * Right shifting of a virtual address by PT2MAP_SHIFT gives us an index to * the PTE2 which maps the address. */ #define PT2MAP_SIZE (NPTE1_IN_PT1 * NB_IN_PT2) #define PT2MAP_SHIFT PTE2_SHIFT extern pt1_entry_t *kern_pt1; extern pt2_entry_t *kern_pt2tab; extern pt2_entry_t *PT2MAP; /* * Virtual interface for L1 page table management.
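 * For example, assuming 4 KB pages and 1 MB L1 sections (PTE1_SHIFT == 20, * hence PT2TAB_SHIFT == 22 and PT2MAP_SHIFT == 12), va == 0xc0876543 gives * pte1_index(va) == 0xc08, pt2tab_index(va) == 0x302 and * pt2map_index(va) == 0xc0876 in the helpers below.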
*/ static __inline u_int pte1_index(vm_offset_t va) { return (va >> PTE1_SHIFT); } static __inline pt1_entry_t * pte1_ptr(pt1_entry_t *pt1, vm_offset_t va) { return (pt1 + pte1_index(va)); } static __inline vm_offset_t pte1_trunc(vm_offset_t va) { return (va & PTE1_FRAME); } static __inline vm_offset_t pte1_roundup(vm_offset_t va) { return ((va + PTE1_OFFSET) & PTE1_FRAME); } /* * Virtual interface for L1 page table entry management. * * XXX: Some of the following functions, which now include a synchronization * barrier, are called in a loop, so it could be useful to have two versions * of them: one with the barrier and one without it. In that case, a pure * barrier pte1_sync() should be implemented as well. */ static __inline void pte1_sync(pt1_entry_t *pte1p) { dsb(); #ifndef PMAP_PTE_NOCACHE if (!cpuinfo.coherent_walk) dcache_wb_pou((vm_offset_t)pte1p, sizeof(*pte1p)); #endif } static __inline void pte1_sync_range(pt1_entry_t *pte1p, vm_size_t size) { dsb(); #ifndef PMAP_PTE_NOCACHE if (!cpuinfo.coherent_walk) dcache_wb_pou((vm_offset_t)pte1p, size); #endif } static __inline void pte1_store(pt1_entry_t *pte1p, pt1_entry_t pte1) { - atomic_store_rel_int(pte1p, pte1); + dmb(); + *pte1p = pte1; pte1_sync(pte1p); } static __inline void pte1_clear(pt1_entry_t *pte1p) { pte1_store(pte1p, 0); } static __inline void pte1_clear_bit(pt1_entry_t *pte1p, uint32_t bit) { - atomic_clear_int(pte1p, bit); + *pte1p &= ~bit; pte1_sync(pte1p); } static __inline boolean_t -pte1_cmpset(pt1_entry_t *pte1p, pt1_entry_t opte1, pt1_entry_t npte1) -{ - boolean_t ret; - - ret = atomic_cmpset_int(pte1p, opte1, npte1); - if (ret) pte1_sync(pte1p); - - return (ret); -} - -static __inline boolean_t pte1_is_link(pt1_entry_t pte1) { return ((pte1 & L1_TYPE_MASK) == L1_TYPE_C); } static __inline int pte1_is_section(pt1_entry_t pte1) { return ((pte1 & L1_TYPE_MASK) == L1_TYPE_S); } static __inline boolean_t pte1_is_dirty(pt1_entry_t pte1) { return ((pte1 & (PTE1_NM | PTE1_RO)) == 0); } static __inline boolean_t pte1_is_global(pt1_entry_t pte1) { return ((pte1 & PTE1_NG) == 0); } static __inline boolean_t pte1_is_valid(pt1_entry_t pte1) { int l1_type; l1_type = pte1 & L1_TYPE_MASK; return ((l1_type == L1_TYPE_C) || (l1_type == L1_TYPE_S)); } static __inline boolean_t pte1_is_wired(pt1_entry_t pte1) { return (pte1 & PTE1_W); } static __inline pt1_entry_t pte1_load(pt1_entry_t *pte1p) { pt1_entry_t pte1; pte1 = *pte1p; return (pte1); } static __inline pt1_entry_t pte1_load_clear(pt1_entry_t *pte1p) { pt1_entry_t opte1; - opte1 = atomic_readandclear_int(pte1p); + opte1 = *pte1p; + *pte1p = 0; pte1_sync(pte1p); return (opte1); } static __inline void pte1_set_bit(pt1_entry_t *pte1p, uint32_t bit) { - atomic_set_int(pte1p, bit); + *pte1p |= bit; pte1_sync(pte1p); } static __inline vm_paddr_t pte1_pa(pt1_entry_t pte1) { return ((vm_paddr_t)(pte1 & PTE1_FRAME)); } static __inline vm_paddr_t pte1_link_pa(pt1_entry_t pte1) { return ((vm_paddr_t)(pte1 & L1_C_ADDR_MASK)); } /* * Virtual interface for L2 page table entry management. * * XXX: Some of the following functions, which now include a synchronization * barrier, are called in a loop, so it could be useful to have two versions * of them: one with the barrier and one without it.
*/ static __inline void pte2_sync(pt2_entry_t *pte2p) { dsb(); #ifndef PMAP_PTE_NOCACHE if (!cpuinfo.coherent_walk) dcache_wb_pou((vm_offset_t)pte2p, sizeof(*pte2p)); #endif } static __inline void pte2_sync_range(pt2_entry_t *pte2p, vm_size_t size) { dsb(); #ifndef PMAP_PTE_NOCACHE if (!cpuinfo.coherent_walk) dcache_wb_pou((vm_offset_t)pte2p, size); #endif } static __inline void pte2_store(pt2_entry_t *pte2p, pt2_entry_t pte2) { - atomic_store_rel_int(pte2p, pte2); + dmb(); + *pte2p = pte2; pte2_sync(pte2p); } static __inline void pte2_clear(pt2_entry_t *pte2p) { pte2_store(pte2p, 0); } static __inline void pte2_clear_bit(pt2_entry_t *pte2p, uint32_t bit) { - atomic_clear_int(pte2p, bit); + *pte2p &= ~bit; pte2_sync(pte2p); } static __inline boolean_t -pte2_cmpset(pt2_entry_t *pte2p, pt2_entry_t opte2, pt2_entry_t npte2) -{ - boolean_t ret; - - ret = atomic_cmpset_int(pte2p, opte2, npte2); - if (ret) pte2_sync(pte2p); - - return (ret); -} - -static __inline boolean_t pte2_is_dirty(pt2_entry_t pte2) { return ((pte2 & (PTE2_NM | PTE2_RO)) == 0); } static __inline boolean_t pte2_is_global(pt2_entry_t pte2) { return ((pte2 & PTE2_NG) == 0); } static __inline boolean_t pte2_is_valid(pt2_entry_t pte2) { return (pte2 & PTE2_V); } static __inline boolean_t pte2_is_wired(pt2_entry_t pte2) { return (pte2 & PTE2_W); } static __inline pt2_entry_t pte2_load(pt2_entry_t *pte2p) { pt2_entry_t pte2; pte2 = *pte2p; return (pte2); } static __inline pt2_entry_t pte2_load_clear(pt2_entry_t *pte2p) { pt2_entry_t opte2; - opte2 = atomic_readandclear_int(pte2p); + opte2 = *pte2p; + *pte2p = 0; pte2_sync(pte2p); return (opte2); } static __inline void pte2_set_bit(pt2_entry_t *pte2p, uint32_t bit) { - atomic_set_int(pte2p, bit); + *pte2p |= bit; pte2_sync(pte2p); } static __inline void pte2_set_wired(pt2_entry_t *pte2p, boolean_t wired) { /* * Wired bit is transparent for page table walk, * so pte2_sync() is not needed. */ if (wired) - atomic_set_int(pte2p, PTE2_W); + *pte2p |= PTE2_W; else - atomic_clear_int(pte2p, PTE2_W); + *pte2p &= ~PTE2_W; } static __inline vm_paddr_t pte2_pa(pt2_entry_t pte2) { return ((vm_paddr_t)(pte2 & PTE2_FRAME)); } static __inline u_int pte2_attr(pt2_entry_t pte2) { return ((u_int)(pte2 & PTE2_ATTR_MASK)); } /* * Virtual interface for L2 page tables mapping management. */ static __inline u_int pt2tab_index(vm_offset_t va) { return (va >> PT2TAB_SHIFT); } static __inline pt2_entry_t * pt2tab_entry(pt2_entry_t *pt2tab, vm_offset_t va) { return (pt2tab + pt2tab_index(va)); } static __inline void pt2tab_store(pt2_entry_t *pte2p, pt2_entry_t pte2) { pte2_store(pte2p,pte2); } static __inline pt2_entry_t pt2tab_load(pt2_entry_t *pte2p) { return (pte2_load(pte2p)); } static __inline pt2_entry_t pt2tab_load_clear(pt2_entry_t *pte2p) { return (pte2_load_clear(pte2p)); } static __inline u_int pt2map_index(vm_offset_t va) { return (va >> PT2MAP_SHIFT); } static __inline pt2_entry_t * pt2map_entry(vm_offset_t va) { return (PT2MAP + pt2map_index(va)); } /* * Virtual interface for pmap structure & kernel shortcuts. 
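 * (These helpers just compose the pte1/pte2 primitives above with the * pmap's own page tables and the kernel's page tables, respectively.)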
*/ static __inline pt1_entry_t * pmap_pte1(pmap_t pmap, vm_offset_t va) { return (pte1_ptr(pmap->pm_pt1, va)); } static __inline pt1_entry_t * kern_pte1(vm_offset_t va) { return (pte1_ptr(kern_pt1, va)); } static __inline pt2_entry_t * pmap_pt2tab_entry(pmap_t pmap, vm_offset_t va) { return (pt2tab_entry(pmap->pm_pt2tab, va)); } static __inline pt2_entry_t * kern_pt2tab_entry(vm_offset_t va) { return (pt2tab_entry(kern_pt2tab, va)); } static __inline vm_page_t pmap_pt2_page(pmap_t pmap, vm_offset_t va) { pt2_entry_t pte2; pte2 = pte2_load(pmap_pt2tab_entry(pmap, va)); return (PHYS_TO_VM_PAGE(pte2 & PTE2_FRAME)); } static __inline vm_page_t kern_pt2_page(vm_offset_t va) { pt2_entry_t pte2; pte2 = pte2_load(kern_pt2tab_entry(va)); return (PHYS_TO_VM_PAGE(pte2 & PTE2_FRAME)); } #endif /* _KERNEL */ #endif /* !_MACHINE_PMAP_VAR_H_ */ Index: user/ngie/bsnmp_cleanup/sys/contrib/rdma/krping/krping.c =================================================================== --- user/ngie/bsnmp_cleanup/sys/contrib/rdma/krping/krping.c (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/contrib/rdma/krping/krping.c (revision 298468) @@ -1,3347 +1,3347 @@ /* * Copyright (c) 2005 Ammasso, Inc. All rights reserved. * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "krping.h" #include "getopt.h" extern int krping_debug; #define DEBUG_LOG(cb, x...) if (krping_debug) log(LOG_INFO, x) #define PRINTF(cb, x...) 
log(LOG_INFO, x) #define BIND_INFO 1 MODULE_AUTHOR("Steve Wise"); MODULE_DESCRIPTION("RDMA ping client/server"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(krping, 1); MODULE_DEPEND(krping, linuxkpi, 1, 1, 1); static __inline uint64_t get_cycles(void) { uint32_t low, high; __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); return (low | ((u_int64_t)high << 32)); } typedef uint64_t cycles_t; enum mem_type { DMA = 1, FASTREG = 2, MW = 3, MR = 4 }; static const struct krping_option krping_opts[] = { {"count", OPT_INT, 'C'}, {"size", OPT_INT, 'S'}, {"addr", OPT_STRING, 'a'}, {"port", OPT_INT, 'p'}, {"verbose", OPT_NOPARAM, 'v'}, {"validate", OPT_NOPARAM, 'V'}, {"server", OPT_NOPARAM, 's'}, {"client", OPT_NOPARAM, 'c'}, {"mem_mode", OPT_STRING, 'm'}, {"server_inv", OPT_NOPARAM, 'I'}, {"wlat", OPT_NOPARAM, 'l'}, {"rlat", OPT_NOPARAM, 'L'}, {"bw", OPT_NOPARAM, 'B'}, {"duplex", OPT_NOPARAM, 'd'}, {"txdepth", OPT_INT, 'T'}, {"poll", OPT_NOPARAM, 'P'}, {"local_dma_lkey", OPT_NOPARAM, 'Z'}, {"read_inv", OPT_NOPARAM, 'R'}, {"fr", OPT_INT, 'f'}, {NULL, 0, 0} }; #define htonll(x) cpu_to_be64((x)) #define ntohll(x) cpu_to_be64((x)) static struct mutex krping_mutex; /* * List of running krping threads. */ static LIST_HEAD(krping_cbs); /* * krping "ping/pong" loop: * client sends source rkey/addr/len * server receives source rkey/addr/len * server rdma reads "ping" data from source * server sends "go ahead" on rdma read completion * client sends sink rkey/addr/len * server receives sink rkey/addr/len * server rdma writes "pong" data to sink * server sends "go ahead" on rdma write completion * */ /* * These states are used to signal events between the completion handler * and the main client or server thread. * * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV, * and RDMA_WRITE_COMPLETE for each ping. */ enum test_state { IDLE = 1, CONNECT_REQUEST, ADDR_RESOLVED, ROUTE_RESOLVED, CONNECTED, RDMA_READ_ADV, RDMA_READ_COMPLETE, RDMA_WRITE_ADV, RDMA_WRITE_COMPLETE, ERROR }; struct krping_rdma_info { uint64_t buf; uint32_t rkey; uint32_t size; }; /* * Default max buffer size for IO... */ #define RPING_BUFSIZE 128*1024 #define RPING_SQ_DEPTH 64 /* * Control block struct.
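 * One control block is allocated per running krping thread (linked on * the krping_cbs list above); it carries the per-connection verbs * resources, the parsed test options and the test state.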
*/ struct krping_cb { void *cookie; int server; /* 0 iff client */ struct ib_cq *cq; struct ib_pd *pd; struct ib_qp *qp; enum mem_type mem; struct ib_mr *dma_mr; struct ib_fast_reg_page_list *page_list; int page_list_len; struct ib_send_wr fastreg_wr; struct ib_send_wr invalidate_wr; struct ib_mr *fastreg_mr; int server_invalidate; int read_inv; u8 key; struct ib_mw *mw; struct ib_mw_bind bind_attr; struct ib_recv_wr rq_wr; /* recv work request record */ struct ib_sge recv_sgl; /* recv single SGE */ struct krping_rdma_info recv_buf;/* malloc'd buffer */ u64 recv_dma_addr; DECLARE_PCI_UNMAP_ADDR(recv_mapping) struct ib_mr *recv_mr; struct ib_send_wr sq_wr; /* send work request record */ struct ib_sge send_sgl; struct krping_rdma_info send_buf;/* single send buf */ u64 send_dma_addr; DECLARE_PCI_UNMAP_ADDR(send_mapping) struct ib_mr *send_mr; struct ib_send_wr rdma_sq_wr; /* rdma work request record */ struct ib_sge rdma_sgl; /* rdma single SGE */ char *rdma_buf; /* used as rdma sink */ u64 rdma_dma_addr; DECLARE_PCI_UNMAP_ADDR(rdma_mapping) struct ib_mr *rdma_mr; uint32_t remote_rkey; /* remote guy's RKEY */ uint64_t remote_addr; /* remote guy's TO */ uint32_t remote_len; /* remote guy's LEN */ char *start_buf; /* rdma read src */ u64 start_dma_addr; DECLARE_PCI_UNMAP_ADDR(start_mapping) struct ib_mr *start_mr; enum test_state state; /* used for cond/signalling */ wait_queue_head_t sem; struct krping_stats stats; uint16_t port; /* dst port in NBO */ struct in_addr addr; /* dst addr in NBO */ char *addr_str; /* dst addr string */ int verbose; /* verbose logging */ int count; /* ping count */ int size; /* ping data size */ int validate; /* validate ping data */ int wlat; /* run wlat test */ int rlat; /* run rlat test */ int bw; /* run bw test */ int duplex; /* run bw full duplex test */ int poll; /* poll or block for rlat test */ int txdepth; /* SQ depth */ int local_dma_lkey; /* use 0 for lkey */ int frtest; /* fastreg test */ int testnum; /* CM stuff */ struct rdma_cm_id *cm_id; /* connection on client side,*/ /* listener on server side. */ struct rdma_cm_id *child_cm_id; /* connection on server side */ struct list_head list; }; static int krping_cma_event_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { int ret; struct krping_cb *cb = cma_id->context; DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event, cma_id, (cma_id == cb->cm_id) ?
"parent" : "child"); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: cb->state = ADDR_RESOLVED; ret = rdma_resolve_route(cma_id, 2000); if (ret) { PRINTF(cb, "rdma_resolve_route error %d\n", ret); wake_up_interruptible(&cb->sem); } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: cb->state = ROUTE_RESOLVED; cb->child_cm_id = cma_id; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_CONNECT_REQUEST: if (cb->state == IDLE) { cb->state = CONNECT_REQUEST; cb->child_cm_id = cma_id; } else { PRINTF(cb, "Received connection request in wrong state" " (%d)\n", cb->state); } DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id); wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ESTABLISHED: DEBUG_LOG(cb, "ESTABLISHED\n"); if (!cb->server) { cb->state = CONNECTED; } wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: PRINTF(cb, "cma event %d, error %d\n", event->event, event->status); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DISCONNECTED: PRINTF(cb, "DISCONNECT EVENT...\n"); cb->state = ERROR; wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: PRINTF(cb, "cma detected device removal!!!!\n"); break; default: PRINTF(cb, "oof bad type!\n"); wake_up_interruptible(&cb->sem); break; } return 0; } static int server_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len); return -1; } cb->remote_rkey = ntohl(cb->recv_buf.rkey); cb->remote_addr = ntohll(cb->recv_buf.buf); cb->remote_len = ntohl(cb->recv_buf.size); DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n", cb->remote_rkey, (unsigned long long)cb->remote_addr, cb->remote_len); if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE) cb->state = RDMA_READ_ADV; else cb->state = RDMA_WRITE_ADV; return 0; } static int client_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len); return -1; } if (cb->state == RDMA_READ_ADV) cb->state = RDMA_WRITE_ADV; else cb->state = RDMA_WRITE_COMPLETE; return 0; } static void krping_cq_event_handler(struct ib_cq *cq, void *ctx) { struct krping_cb *cb = ctx; struct ib_wc wc; struct ib_recv_wr *bad_wr; int ret; BUG_ON(cb->cq != cq); if (cb->state == ERROR) { PRINTF(cb, "cq completion in ERROR state\n"); return; } if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { if (wc.status) { if (wc.status == IB_WC_WR_FLUSH_ERR) { DEBUG_LOG(cb, "cq flushed\n"); continue; } else { PRINTF(cb, "cq completion failed with " "wr_id %jx status %d opcode %d vender_err %x\n", (uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err); goto error; } } switch (wc.opcode) { case IB_WC_SEND: DEBUG_LOG(cb, "send completion\n"); cb->stats.send_bytes += cb->send_sgl.length; cb->stats.send_msgs++; break; case IB_WC_RDMA_WRITE: DEBUG_LOG(cb, "rdma write completion\n"); cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length; cb->stats.write_msgs++; cb->state = RDMA_WRITE_COMPLETE; wake_up_interruptible(&cb->sem); break; case IB_WC_RDMA_READ: DEBUG_LOG(cb, "rdma read completion\n"); cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length; cb->stats.read_msgs++; cb->state = RDMA_READ_COMPLETE; wake_up_interruptible(&cb->sem); break; case 
IB_WC_RECV: DEBUG_LOG(cb, "recv completion\n"); cb->stats.recv_bytes += sizeof(cb->recv_buf); cb->stats.recv_msgs++; if (cb->wlat || cb->rlat || cb->bw || cb->frtest) ret = server_recv(cb, &wc); else ret = cb->server ? server_recv(cb, &wc) : client_recv(cb, &wc); if (ret) { PRINTF(cb, "recv wc error: %d\n", ret); goto error; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "post recv error: %d\n", ret); goto error; } wake_up_interruptible(&cb->sem); break; default: PRINTF(cb, "%s:%d Unexpected opcode %d, Shutting down\n", __func__, __LINE__, wc.opcode); goto error; } } if (ret) { PRINTF(cb, "poll error %d\n", ret); goto error; } return; error: cb->state = ERROR; wake_up_interruptible(&cb->sem); } static int krping_accept(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; DEBUG_LOG(cb, "accepting client connection request\n"); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; ret = rdma_accept(cb->child_cm_id, &conn_param); if (ret) { PRINTF(cb, "rdma_accept error: %d\n", ret); return ret; } if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { PRINTF(cb, "wait for CONNECTED state %d\n", cb->state); return -1; } } return 0; } static void krping_setup_wr(struct krping_cb *cb) { cb->recv_sgl.addr = cb->recv_dma_addr; cb->recv_sgl.length = sizeof cb->recv_buf; if (cb->local_dma_lkey) cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey; else if (cb->mem == DMA) cb->recv_sgl.lkey = cb->dma_mr->lkey; else cb->recv_sgl.lkey = cb->recv_mr->lkey; cb->rq_wr.sg_list = &cb->recv_sgl; cb->rq_wr.num_sge = 1; cb->send_sgl.addr = cb->send_dma_addr; cb->send_sgl.length = sizeof cb->send_buf; if (cb->local_dma_lkey) cb->send_sgl.lkey = cb->qp->device->local_dma_lkey; else if (cb->mem == DMA) cb->send_sgl.lkey = cb->dma_mr->lkey; else cb->send_sgl.lkey = cb->send_mr->lkey; cb->sq_wr.opcode = IB_WR_SEND; cb->sq_wr.send_flags = IB_SEND_SIGNALED; cb->sq_wr.sg_list = &cb->send_sgl; cb->sq_wr.num_sge = 1; if (cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { cb->rdma_sgl.addr = cb->rdma_dma_addr; if (cb->mem == MR) cb->rdma_sgl.lkey = cb->rdma_mr->lkey; cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; cb->rdma_sq_wr.num_sge = 1; } switch(cb->mem) { case FASTREG: /* * A chain of 2 WRs, INVALIDATE_MR + FAST_REG_MR. * Both are unsignaled. The client uses them to reregister * the rdma buffers with a new key each iteration.
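 * (krping_rdma_rkey() below refreshes the key byte with * ib_update_fast_reg_key() before this chain is posted.)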
*/ cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR; cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; cb->fastreg_wr.wr.fast_reg.length = cb->size; cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list; cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len; cb->invalidate_wr.next = &cb->fastreg_wr; cb->invalidate_wr.opcode = IB_WR_LOCAL_INV; break; case MW: cb->bind_attr.wr_id = 0xabbaabba; cb->bind_attr.send_flags = 0; /* unsignaled */ #ifdef BIND_INFO cb->bind_attr.bind_info.length = cb->size; #else cb->bind_attr.length = cb->size; #endif break; default: break; } } static int krping_setup_buffers(struct krping_cb *cb) { int ret; struct ib_phys_buf buf; u64 iovbase; DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb); cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device, &cb->recv_buf, sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr); cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device, &cb->send_buf, sizeof(cb->send_buf), DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr); if (cb->mem == DMA) { cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE| IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE); if (IS_ERR(cb->dma_mr)) { DEBUG_LOG(cb, "reg_dmamr failed\n"); ret = PTR_ERR(cb->dma_mr); goto bail; } } else { if (!cb->local_dma_lkey) { buf.addr = cb->recv_dma_addr; buf.size = sizeof cb->recv_buf; DEBUG_LOG(cb, "recv buf dma_addr %jx size %d\n", (uintmax_t)buf.addr, (int)buf.size); iovbase = cb->recv_dma_addr; cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, IB_ACCESS_LOCAL_WRITE, &iovbase); if (IS_ERR(cb->recv_mr)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->recv_mr); goto bail; } buf.addr = cb->send_dma_addr; buf.size = sizeof cb->send_buf; DEBUG_LOG(cb, "send buf dma_addr %jx size %d\n", (uintmax_t)buf.addr, (int)buf.size); iovbase = cb->send_dma_addr; cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, 0, &iovbase); if (IS_ERR(cb->send_mr)) { DEBUG_LOG(cb, "send_buf reg_mr failed\n"); ret = PTR_ERR(cb->send_mr); goto bail; } } } cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL); if (!cb->rdma_buf) { DEBUG_LOG(cb, "rdma_buf malloc failed\n"); ret = -ENOMEM; goto bail; } cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device, cb->rdma_buf, cb->size, DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr); if (cb->mem != DMA) { switch (cb->mem) { case FASTREG: cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; cb->page_list = ib_alloc_fast_reg_page_list( cb->pd->device, cb->page_list_len); if (IS_ERR(cb->page_list)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->page_list); goto bail; } cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd, cb->page_list->max_page_list_len); if (IS_ERR(cb->fastreg_mr)) { DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); ret = PTR_ERR(cb->fastreg_mr); goto bail; } DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p" " page_list_len %u\n", cb->fastreg_mr->rkey, cb->page_list, cb->page_list_len); break; case MW: cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1); if (IS_ERR(cb->mw)) { DEBUG_LOG(cb, "recv_buf alloc_mw failed\n"); ret = PTR_ERR(cb->mw); goto bail; } DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey); /*FALLTHROUGH*/ case MR: buf.addr = cb->rdma_dma_addr; buf.size = cb->size; iovbase = cb->rdma_dma_addr; cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, IB_ACCESS_LOCAL_WRITE| IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE, &iovbase); if (IS_ERR(cb->rdma_mr)) { DEBUG_LOG(cb, "rdma_buf reg_mr failed\n"); ret = 
PTR_ERR(cb->rdma_mr); goto bail; } DEBUG_LOG(cb, "rdma buf dma_addr %jx size %d mr rkey 0x%x\n", (uintmax_t)buf.addr, (int)buf.size, cb->rdma_mr->rkey); break; default: ret = -EINVAL; goto bail; break; } } if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { cb->start_buf = kmalloc(cb->size, GFP_KERNEL); if (!cb->start_buf) { DEBUG_LOG(cb, "start_buf malloc failed\n"); ret = -ENOMEM; goto bail; } cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device, cb->start_buf, cb->size, DMA_BIDIRECTIONAL); pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr); if (cb->mem == MR || cb->mem == MW) { unsigned flags = IB_ACCESS_REMOTE_READ; if (cb->wlat || cb->rlat || cb->bw || cb->frtest) { flags |= IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; } buf.addr = cb->start_dma_addr; buf.size = cb->size; DEBUG_LOG(cb, "start buf dma_addr %jx size %d\n", (uintmax_t)buf.addr, (int)buf.size); iovbase = cb->start_dma_addr; cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, flags, &iovbase); if (IS_ERR(cb->start_mr)) { DEBUG_LOG(cb, "start_buf reg_mr failed\n"); ret = PTR_ERR(cb->start_mr); goto bail; } } } krping_setup_wr(cb); DEBUG_LOG(cb, "allocated & registered buffers...\n"); return 0; bail: if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr)) ib_dereg_mr(cb->fastreg_mr); if (cb->mw && !IS_ERR(cb->mw)) ib_dealloc_mw(cb->mw); if (cb->rdma_mr && !IS_ERR(cb->rdma_mr)) ib_dereg_mr(cb->rdma_mr); if (cb->page_list && !IS_ERR(cb->page_list)) ib_free_fast_reg_page_list(cb->page_list); if (cb->dma_mr && !IS_ERR(cb->dma_mr)) ib_dereg_mr(cb->dma_mr); if (cb->recv_mr && !IS_ERR(cb->recv_mr)) ib_dereg_mr(cb->recv_mr); if (cb->send_mr && !IS_ERR(cb->send_mr)) ib_dereg_mr(cb->send_mr); if (cb->rdma_buf) kfree(cb->rdma_buf); if (cb->start_buf) kfree(cb->start_buf); return ret; } static void krping_free_buffers(struct krping_cb *cb) { DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb); if (cb->dma_mr) ib_dereg_mr(cb->dma_mr); if (cb->send_mr) ib_dereg_mr(cb->send_mr); if (cb->recv_mr) ib_dereg_mr(cb->recv_mr); if (cb->rdma_mr) ib_dereg_mr(cb->rdma_mr); if (cb->start_mr) ib_dereg_mr(cb->start_mr); if (cb->fastreg_mr) ib_dereg_mr(cb->fastreg_mr); if (cb->mw) ib_dealloc_mw(cb->mw); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, recv_mapping), sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, send_mapping), sizeof(cb->send_buf), DMA_BIDIRECTIONAL); dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, rdma_mapping), cb->size, DMA_BIDIRECTIONAL); kfree(cb->rdma_buf); if (cb->start_buf) { dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, start_mapping), cb->size, DMA_BIDIRECTIONAL); kfree(cb->start_buf); } } static int krping_create_qp(struct krping_cb *cb) { struct ib_qp_init_attr init_attr; int ret; memset(&init_attr, 0, sizeof(init_attr)); init_attr.cap.max_send_wr = cb->txdepth; init_attr.cap.max_recv_wr = 2; init_attr.cap.max_recv_sge = 1; init_attr.cap.max_send_sge = 1; init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = cb->cq; init_attr.recv_cq = cb->cq; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; if (cb->server) { ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->child_cm_id->qp; } else { ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr); if (!ret) cb->qp = cb->cm_id->qp; } return ret; } static void krping_free_qp(struct krping_cb *cb) { ib_destroy_qp(cb->qp); ib_destroy_cq(cb->cq); ib_dealloc_pd(cb->pd); } static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id 
*cm_id) { int ret; cb->pd = ib_alloc_pd(cm_id->device); if (IS_ERR(cb->pd)) { PRINTF(cb, "ib_alloc_pd failed\n"); return PTR_ERR(cb->pd); } DEBUG_LOG(cb, "created pd %p\n", cb->pd); strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name)); cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, cb, cb->txdepth * 2, 0); if (IS_ERR(cb->cq)) { PRINTF(cb, "ib_create_cq failed\n"); ret = PTR_ERR(cb->cq); goto err1; } DEBUG_LOG(cb, "created cq %p\n", cb->cq); if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) { ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); if (ret) { PRINTF(cb, "ib_req_notify_cq failed\n"); goto err2; } } ret = krping_create_qp(cb); if (ret) { PRINTF(cb, "krping_create_qp failed: %d\n", ret); goto err2; } DEBUG_LOG(cb, "created qp %p\n", cb->qp); return 0; err2: ib_destroy_cq(cb->cq); err1: ib_dealloc_pd(cb->pd); return ret; } /* * return the (possibly rebound) rkey for the rdma buffer. * FASTREG mode: invalidate and rebind via fastreg wr. * MW mode: rebind the MW. * other modes: just return the mr rkey. */ static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv) { u32 rkey = 0xffffffff; u64 p; struct ib_send_wr *bad_wr; int i; int ret; switch (cb->mem) { case FASTREG: cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey; /* * Update the fastreg key. */ ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key); cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey; /* * Update the fastreg WR with new buf info. */ if (buf == (u64)cb->start_dma_addr) cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ; else cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; cb->fastreg_wr.wr.fast_reg.iova_start = buf; p = (u64)(buf & PAGE_MASK); for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len; i++, p += PAGE_SIZE) { cb->page_list->page_list[i] = p; DEBUG_LOG(cb, "page_list[%d] 0x%jx\n", i, (uintmax_t)p); } DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u" " iova_start %jx page_list_len %u\n", post_inv, cb->fastreg_wr.wr.fast_reg.rkey, cb->fastreg_wr.wr.fast_reg.page_shift, - cb->fastreg_wr.wr.fast_reg.length, + (unsigned)cb->fastreg_wr.wr.fast_reg.length, (uintmax_t)cb->fastreg_wr.wr.fast_reg.iova_start, cb->fastreg_wr.wr.fast_reg.page_list_len); if (post_inv) ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr); else ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); cb->state = ERROR; } rkey = cb->fastreg_mr->rkey; break; case MW: /* * Update the MW with new buf info.
*/ if (buf == (u64)cb->start_dma_addr) { #ifdef BIND_INFO cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ; cb->bind_attr.bind_info.mr = cb->start_mr; #else cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ; cb->bind_attr.mr = cb->start_mr; #endif } else { #ifdef BIND_INFO cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE; cb->bind_attr.bind_info.mr = cb->rdma_mr; #else cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE; cb->bind_attr.mr = cb->rdma_mr; #endif } #ifdef BIND_INFO cb->bind_attr.bind_info.addr = buf; #else cb->bind_attr.addr = buf; #endif DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %jx mr rkey 0x%x\n", #ifdef BIND_INFO cb->mw->rkey, (uintmax_t)buf, cb->bind_attr.bind_info.mr->rkey); #else cb->mw->rkey, buf, cb->bind_attr.mr->rkey); #endif ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr); if (ret) { PRINTF(cb, "bind mw error %d\n", ret); cb->state = ERROR; } else rkey = cb->mw->rkey; break; case MR: if (buf == (u64)cb->start_dma_addr) rkey = cb->start_mr->rkey; else rkey = cb->rdma_mr->rkey; break; case DMA: rkey = cb->dma_mr->rkey; break; default: PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__); cb->state = ERROR; break; } return rkey; } static void krping_format_send(struct krping_cb *cb, u64 buf) { struct krping_rdma_info *info = &cb->send_buf; u32 rkey; /* * Client side will do fastreg or mw bind before * advertising the rdma buffer. Server side * sends have no data. */ if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) { rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate); info->buf = htonll(buf); info->rkey = htonl(rkey); info->size = htonl(cb->size); DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n", (unsigned long long)buf, rkey, cb->size); } } static void krping_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr, inv; int ret; while (1) { /* Wait for client's Start STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV); if (cb->state != RDMA_READ_ADV) { PRINTF(cb, "wait for RDMA_READ_ADV state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received sink adv\n"); cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->remote_len; cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1); /* Issue RDMA Read. */ if (cb->read_inv) cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV; else { cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; if (cb->mem == FASTREG) { /* * Immediately follow the read with a * fenced LOCAL_INV. 
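 * The IB_SEND_FENCE flag makes the invalidate wait for the preceding * RDMA read to complete, so the rkey is not invalidated while the read * still uses it.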
*/ cb->rdma_sq_wr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.ex.invalidate_rkey = cb->fastreg_mr->rkey; inv.send_flags = IB_SEND_FENCE; } } ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } cb->rdma_sq_wr.next = NULL; DEBUG_LOG(cb, "server posted rdma read req \n"); /* Wait for read completion */ wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_COMPLETE); if (cb->state != RDMA_READ_COMPLETE) { PRINTF(cb, "wait for RDMA_READ_COMPLETE state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received read complete\n"); /* Display data in recv buf */ if (cb->verbose) { if (strlen(cb->rdma_buf) > 128) { char msgbuf[128]; strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); PRINTF(cb, "server ping data stripped: %s\n", msgbuf); } else PRINTF(cb, "server ping data: %s\n", cb->rdma_buf); } /* Tell client to continue */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } DEBUG_LOG(cb, "server posted go ahead\n"); /* Wait for client's RDMA STAG/TO/Len */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { PRINTF(cb, "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } DEBUG_LOG(cb, "server received sink adv\n"); /* RDMA Write echo data */ cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1; if (cb->local_dma_lkey) cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey; else cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0); DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n", cb->rdma_sq_wr.sg_list->lkey, (unsigned long long)cb->rdma_sq_wr.sg_list->addr, cb->rdma_sq_wr.sg_list->length); ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for completion */ ret = wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { PRINTF(cb, "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } DEBUG_LOG(cb, "server rdma write complete \n"); cb->state = CONNECTED; /* Tell client to begin again */ if (cb->server && cb->server_invalidate) { cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey; cb->sq_wr.opcode = IB_WR_SEND_WITH_INV; DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey); } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } DEBUG_LOG(cb, "server posted go ahead\n"); } } static void rlat_test(struct krping_cb *cb) { int scnt; int iters = cb->count; struct timeval start_tv, stop_tv; int ret; struct ib_wc wc; struct ib_send_wr *bad_wr; int ne; scnt = 0; cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; microtime(&start_tv); if (!cb->poll) { cb->state = RDMA_READ_ADV; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } while (scnt < iters) { cb->state = RDMA_READ_ADV; ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); if (ret) { PRINTF(cb, "Couldn't post send: ret=%d scnt %d\n", ret, scnt); return; } do { if (!cb->poll) { 
wait_event_interruptible(cb->sem, cb->state != RDMA_READ_ADV); if (cb->state == RDMA_READ_COMPLETE) { ne = 1; ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); } else { ne = -1; } } else ne = ib_poll_cq(cb->cq, 1, &wc); if (cb->state == ERROR) { PRINTF(cb, "state == ERROR...bailing scnt %d\n", scnt); return; } } while (ne == 0); if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (cb->poll && wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion with error at %s:\n", cb->server ? "server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } ++scnt; } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size); } static void wlat_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters = cb->count; volatile char *poll_buf = (char *) cb->start_buf; char *buf = (char *)cb->rdma_buf; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start, *post_cycles_stop; cycles_t *poll_cycles_start, *poll_cycles_stop; cycles_t *last_poll_cycles_start; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters || rcnt < iters) { /* Wait till buffer changes. */ if (rcnt < iters && !(scnt < 1 && !cb->server)) { ++rcnt; while (*poll_buf != (char)rcnt) { if (cb->state == ERROR) { PRINTF(cb, "state = ERROR, bailing\n"); return; } } } if (scnt < iters) { struct ib_send_wr *bad_wr; *buf = (char)scnt+1; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send: scnt=%d\n", scnt); return; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); scnt++; } if (ccnt < iters) { struct ib_wc wc; int ne; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ++ccnt; if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion with error at %s:\n", cb->server ?
"server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); PRINTF(cb, "scnt=%d, rcnt=%d, ccnt=%d\n", scnt, rcnt, ccnt); return; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void bw_test(struct krping_cb *cb) { int ccnt, scnt, rcnt; int iters=cb->count; struct timeval start_tv, stop_tv; cycles_t *post_cycles_start, *post_cycles_stop; cycles_t *poll_cycles_start, *poll_cycles_stop; cycles_t *last_poll_cycles_start; cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; int i; int cycle_iters = 1000; ccnt = 0; scnt = 0; rcnt = 0; post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!post_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!poll_cycles_stop) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL); if (!last_poll_cycles_start) { PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__); return; } cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = cb->size; if (cycle_iters > iters) cycle_iters = iters; microtime(&start_tv); while (scnt < iters || ccnt < iters) { while (scnt < iters && scnt - ccnt < cb->txdepth) { struct ib_send_wr *bad_wr; if (scnt < cycle_iters) post_cycles_start[scnt] = get_cycles(); if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send: scnt=%d\n", scnt); return; } if (scnt < cycle_iters) post_cycles_stop[scnt] = get_cycles(); ++scnt; } if (ccnt < iters) { int ne; struct ib_wc wc; if (ccnt < cycle_iters) poll_cycles_start[ccnt] = get_cycles(); do { if (ccnt < cycle_iters) last_poll_cycles_start[ccnt] = get_cycles(); ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ccnt < cycle_iters) poll_cycles_stop[ccnt] = get_cycles(); ccnt += 1; if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion wth error at %s:\n", cb->server ? 
"server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } } microtime(&stop_tv); if (stop_tv.tv_usec < start_tv.tv_usec) { stop_tv.tv_usec += 1000000; stop_tv.tv_sec -= 1; } for (i=0; i < cycle_iters; i++) { sum_post += post_cycles_stop[i] - post_cycles_start[i]; sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i]; } PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d" " sum_post %llu sum_poll %llu sum_last_poll %llu\n", (unsigned long)(stop_tv.tv_sec - start_tv.tv_sec), (unsigned long)(stop_tv.tv_usec - start_tv.tv_usec), scnt, cb->size, cycle_iters, (unsigned long long)sum_post, (unsigned long long)sum_poll, (unsigned long long)sum_last_poll); kfree(post_cycles_start); kfree(post_cycles_stop); kfree(poll_cycles_start); kfree(poll_cycles_stop); kfree(last_poll_cycles_start); } static void krping_rlat_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_wlat_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } wlat_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_bw_test_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completiong error %d\n", wc.status); return; } if (cb->duplex) bw_test(cb); wait_event_interruptible(cb->sem, cb->state == ERROR); } static int fastreg_supported(struct krping_cb *cb, int server) { struct ib_device *dev = server?cb->child_cm_id->device: cb->cm_id->device; struct ib_device_attr attr; int ret; ret = ib_query_device(dev, &attr); if (ret) { PRINTF(cb, "ib_query_device failed ret %d\n", ret); return 0; } if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%llx\n", (unsigned long 
long)attr.device_cap_flags); return 0; } DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%jx\n", (uintmax_t)attr.device_cap_flags); return 1; } static int krping_bind_server(struct krping_cb *cb) { struct sockaddr_in sin; int ret; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = cb->addr.s_addr; sin.sin_port = cb->port; ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin); if (ret) { PRINTF(cb, "rdma_bind_addr error %d\n", ret); return ret; } DEBUG_LOG(cb, "rdma_bind_addr successful\n"); DEBUG_LOG(cb, "rdma_listen\n"); ret = rdma_listen(cb->cm_id, 3); if (ret) { PRINTF(cb, "rdma_listen failed: %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST); if (cb->state != CONNECT_REQUEST) { PRINTF(cb, "wait for CONNECT_REQUEST state %d\n", cb->state); return -1; } if (cb->mem == FASTREG && !fastreg_supported(cb, 1)) return -EINVAL; return 0; } /* * sq-depth worth of fastreg + 0B read-inv pairs, reposting them as the reads * complete. * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. */ static void krping_fr_test5(struct krping_cb *cb) { struct ib_fast_reg_page_list **pl; struct ib_send_wr *fr, *read, *bad; struct ib_wc wc; struct ib_sge *sgl; u8 key = 0; struct ib_mr **mr; u8 **buf; dma_addr_t *dma_addr; int i; int ret; int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; time_t start; int count = 0; int scnt; int depth = cb->txdepth >> 1; if (!depth) { PRINTF(cb, "txdepth must be > 1 for this test!\n"); return; } pl = kzalloc(sizeof *pl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth); mr = kzalloc(sizeof *mr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth); fr = kzalloc(sizeof *fr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth); sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth); read = kzalloc(sizeof *read * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, read, sizeof *read * depth); buf = kzalloc(sizeof *buf * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth); dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth); if (!pl || !mr || !fr || !read || !sgl || !buf || !dma_addr) { PRINTF(cb, "kzalloc failed\n"); goto err1; } for (scnt = 0; scnt < depth; scnt++) { pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl[scnt])) { PRINTF(cb, "alloc_fr_page_list failed %ld\n", PTR_ERR(pl[scnt])); goto err2; } DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]); mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr[scnt])) { PRINTF(cb, "alloc_fr failed %ld\n", PTR_ERR(mr[scnt])); goto err2; } DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]); ib_update_fast_reg_key(mr[scnt], ++key); buf[scnt] = kmalloc(cb->size, GFP_KERNEL); if (!buf[scnt]) { PRINTF(cb, "kmalloc failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]); dma_addr[scnt] = dma_map_single(cb->pd->device->dma_device, buf[scnt], cb->size, DMA_BIDIRECTIONAL); if (dma_mapping_error(cb->pd->device->dma_device, dma_addr[scnt])) { PRINTF(cb, "dma_map failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]); for 
(i=0; i < plen; i++) { pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE); DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n", __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]); } sgl[scnt].lkey = mr[scnt]->rkey; sgl[scnt].length = cb->size; sgl[scnt].addr = (u64)buf[scnt]; DEBUG_LOG(cb, "%s sgl[%u].lkey 0x%x length %u addr 0x%jx\n", __func__, scnt, sgl[scnt].lkey, sgl[scnt].length, (uintmax_t)sgl[scnt].addr); fr[scnt].opcode = IB_WR_FAST_REG_MR; fr[scnt].wr_id = scnt; fr[scnt].send_flags = 0; fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT; fr[scnt].wr.fast_reg.length = cb->size; fr[scnt].wr.fast_reg.page_list = pl[scnt]; fr[scnt].wr.fast_reg.page_list_len = plen; fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt]; fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey; fr[scnt].next = &read[scnt]; read[scnt].opcode = IB_WR_RDMA_READ_WITH_INV; read[scnt].wr_id = scnt; read[scnt].send_flags = IB_SEND_SIGNALED; read[scnt].wr.rdma.rkey = cb->remote_rkey; read[scnt].wr.rdma.remote_addr = cb->remote_addr; read[scnt].num_sge = 1; read[scnt].sg_list = &sgl[scnt]; ret = ib_post_send(cb->qp, &fr[scnt], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } start = time_uptime; DEBUG_LOG(cb, "%s starting IO.\n", __func__); while (!cb->count || cb->server || count < cb->count) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__, count); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, 1); if (cb->state == ERROR) break; start = time_uptime; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u wr_id %ju " "opcode %d\n", wc.status, (uintmax_t)wc.wr_id, wc.opcode); goto err2; } count++; if (count == cb->count) break; ib_update_fast_reg_key(mr[wc.wr_id], ++key); fr[wc.wr_id].wr.fast_reg.rkey = mr[wc.wr_id]->rkey; sgl[wc.wr_id].lkey = mr[wc.wr_id]->rkey; ret = ib_post_send(cb->qp, &fr[wc.wr_id], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } DEBUG_LOG(cb, "%s done!\n", __func__); err2: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u " "opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mrs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (mr[scnt]) { ib_dereg_mr(mr[scnt]); DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]); } } DEBUG_LOG(cb, "unmapping/freeing bufs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (buf[scnt]) { dma_unmap_single(cb->pd->device->dma_device, dma_addr[scnt], cb->size, DMA_BIDIRECTIONAL); kfree(buf[scnt]); DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]); } } DEBUG_LOG(cb, "destroying fr page lists!\n"); for (scnt = 0; scnt < depth; scnt++) { if (pl[scnt]) { DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]); ib_free_fast_reg_page_list(pl[scnt]); } } err1: if (pl) kfree(pl); if (mr) kfree(mr); if (fr) kfree(fr); if (read) kfree(read); if (sgl) kfree(sgl); if (buf) kfree(buf); if (dma_addr) kfree(dma_addr); }
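/*
 * Illustrative sketch only, not part of this change: the per-slot work
 * krping_fr_test5() posts reduces to a two-WR chain, an unsignaled
 * fastreg registration followed by a signaled RDMA read-with-invalidate.
 * The helper name example_post_fr_read() is hypothetical.
 */
static int
example_post_fr_read(struct ib_qp *qp, struct ib_mr *mr,
    struct ib_fast_reg_page_list *pl, int plen, struct ib_sge *sgl,
    u32 peer_rkey, u64 peer_addr)
{
	struct ib_send_wr fr, rd, *bad;

	memset(&fr, 0, sizeof(fr));
	fr.opcode = IB_WR_FAST_REG_MR;
	fr.wr.fast_reg.page_shift = PAGE_SHIFT;
	fr.wr.fast_reg.length = sgl->length;
	fr.wr.fast_reg.page_list = pl;
	fr.wr.fast_reg.page_list_len = plen;
	fr.wr.fast_reg.iova_start = sgl->addr;
	fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE |
	    IB_ACCESS_LOCAL_WRITE;
	fr.wr.fast_reg.rkey = mr->rkey;
	fr.next = &rd;				/* chain: fastreg, then read */

	memset(&rd, 0, sizeof(rd));
	rd.opcode = IB_WR_RDMA_READ_WITH_INV;
	rd.send_flags = IB_SEND_SIGNALED;	/* only the read is signaled */
	rd.wr.rdma.rkey = peer_rkey;
	rd.wr.rdma.remote_addr = peer_addr;
	rd.num_sge = 1;
	rd.sg_list = sgl;

	/* One verb call posts the whole chain in order. */
	return (ib_post_send(qp, &fr, &bad));
}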
static void krping_fr_test_server(struct krping_cb *cb) { DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_fr_test5_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } if (cb->duplex) krping_fr_test5(cb); DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_fr_test5_client(struct krping_cb *cb) { struct ib_send_wr *bad; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to server */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); krping_fr_test5(cb); }
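/*
 * Every *_test_server()/_client() above spins for a single send
 * completion with the same poll loop.  Factored out as a minimal sketch
 * (the helper name example_poll_one_send is hypothetical, not in the
 * driver), the idiom is:
 */
static int
example_poll_one_send(struct krping_cb *cb)
{
	struct ib_wc wc;
	int ret;

	/* Busy-poll until ib_poll_cq() returns one CQE or an error. */
	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0)
		;
	if (ret < 0) {
		PRINTF(cb, "poll error %d\n", ret);
		return (ret);
	}
	if (wc.status != IB_WC_SUCCESS) {
		PRINTF(cb, "send completion error %d\n", wc.status);
		return (-EIO);
	}
	return (0);
}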
/* * sq-depth worth of write + fastreg + inv, reposting them as the invs * complete. * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. * If a count is given, then the last IO will have a bogus lkey in the * write work request. This reproduces a fw bug where the connection * will get stuck if a fastreg is processed while the ulptx is failing * the bad write. */ static void krping_fr_test6(struct krping_cb *cb) { struct ib_fast_reg_page_list **pl; struct ib_send_wr *fr, *write, *inv, *bad; struct ib_wc wc; struct ib_sge *sgl; u8 key = 0; struct ib_mr **mr; u8 **buf; dma_addr_t *dma_addr; int i; int ret; int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; unsigned long start; int count = 0; int scnt; int depth = cb->txdepth / 3; if (!depth) { PRINTF(cb, "txdepth must be >= 3 for this test!\n"); return; } pl = kzalloc(sizeof *pl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth); mr = kzalloc(sizeof *mr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth); fr = kzalloc(sizeof *fr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth); sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth); write = kzalloc(sizeof *write * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s write %p size %zu\n", __func__, write, sizeof *write * depth); inv = kzalloc(sizeof *inv * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s inv %p size %zu\n", __func__, inv, sizeof *inv * depth); buf = kzalloc(sizeof *buf * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth); dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL); DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth); if (!pl || !mr || !fr || !write || !sgl || !buf || !dma_addr) { PRINTF(cb, "kzalloc failed\n"); goto err1; } for (scnt = 0; scnt < depth; scnt++) { pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl[scnt])) { PRINTF(cb, "alloc_fr_page_list failed %ld\n", PTR_ERR(pl[scnt])); goto err2; } DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]); mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr[scnt])) { PRINTF(cb, "alloc_fr failed %ld\n", PTR_ERR(mr[scnt])); goto err2; } DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]); ib_update_fast_reg_key(mr[scnt], ++key); buf[scnt] = kmalloc(cb->size, GFP_KERNEL); if (!buf[scnt]) { PRINTF(cb, "kmalloc failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]); dma_addr[scnt] = dma_map_single(cb->pd->device->dma_device, buf[scnt], cb->size, DMA_BIDIRECTIONAL); if (dma_mapping_error(cb->pd->device->dma_device, dma_addr[scnt])) { PRINTF(cb, "dma_map failed\n"); ret = -ENOMEM; goto err2; } DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]); for (i=0; i < plen; i++) { pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE); DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n", __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]); } write[scnt].opcode = IB_WR_RDMA_WRITE; write[scnt].wr_id = scnt; write[scnt].wr.rdma.rkey = cb->remote_rkey; write[scnt].wr.rdma.remote_addr = cb->remote_addr; write[scnt].num_sge = 1; write[scnt].sg_list = &cb->rdma_sgl; write[scnt].sg_list->length = cb->size; write[scnt].next = &fr[scnt]; fr[scnt].opcode = IB_WR_FAST_REG_MR; fr[scnt].wr_id = scnt; fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT; fr[scnt].wr.fast_reg.length = cb->size; fr[scnt].wr.fast_reg.page_list = pl[scnt]; fr[scnt].wr.fast_reg.page_list_len = plen; fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt]; fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey; fr[scnt].next = &inv[scnt]; inv[scnt].opcode = IB_WR_LOCAL_INV;
inv[scnt].send_flags = IB_SEND_SIGNALED; inv[scnt].ex.invalidate_rkey = mr[scnt]->rkey; ret = ib_post_send(cb->qp, &write[scnt], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } start = time_uptime; DEBUG_LOG(cb, "%s starting IO.\n", __func__); while (!cb->count || cb->server || count < cb->count) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__, count); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, 1); if (cb->state == ERROR) break; start = time_uptime; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u wr_id %ju " "opcode %d\n", wc.status, (uintmax_t)wc.wr_id, wc.opcode); goto err2; } count++; if (count == (cb->count - 1)) cb->rdma_sgl.lkey = 0x00dead; if (count == cb->count) break; ib_update_fast_reg_key(mr[wc.wr_id], ++key); fr[wc.wr_id].wr.fast_reg.rkey = mr[wc.wr_id]->rkey; inv[wc.wr_id].ex.invalidate_rkey = mr[wc.wr_id]->rkey; ret = ib_post_send(cb->qp, &write[wc.wr_id], &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } DEBUG_LOG(cb, "%s done!\n", __func__); err2: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u " "opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mrs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (mr[scnt]) { ib_dereg_mr(mr[scnt]); DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]); } } DEBUG_LOG(cb, "unmapping/freeing bufs!\n"); for (scnt = 0; scnt < depth; scnt++) { if (buf[scnt]) { dma_unmap_single(cb->pd->device->dma_device, dma_addr[scnt], cb->size, DMA_BIDIRECTIONAL); kfree(buf[scnt]); DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]); } } DEBUG_LOG(cb, "destroying fr page lists!\n"); for (scnt = 0; scnt < depth; scnt++) { if (pl[scnt]) { DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]); ib_free_fast_reg_page_list(pl[scnt]); } } err1: if (pl) kfree(pl); if (mr) kfree(mr); if (fr) kfree(fr); if (write) kfree(write); if (inv) kfree(inv); if (sgl) kfree(sgl); if (buf) kfree(buf); if (dma_addr) kfree(dma_addr); } static void krping_fr_test6_server(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; /* Spin waiting for client's Start STAG/TO/Len */ while (cb->state < RDMA_READ_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } if (cb->duplex) krping_fr_test6(cb); DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__); wait_event_interruptible(cb->sem, cb->state == ERROR); } static void krping_fr_test6_client(struct krping_cb *cb) { struct ib_send_wr
*bad; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to server */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey, (uintmax_t)cb->remote_addr); krping_fr_test6(cb); } static void krping_run_server(struct krping_cb *cb) { struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_server(cb); if (ret) return; ret = krping_setup_qp(cb, cb->child_cm_id); if (ret) { PRINTF(cb, "setup_qp failed: %d\n", ret); goto err0; } ret = krping_setup_buffers(cb); if (ret) { PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_accept(cb); if (ret) { PRINTF(cb, "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_server(cb); else if (cb->rlat) krping_rlat_test_server(cb); else if (cb->bw) krping_bw_test_server(cb); else if (cb->frtest) { switch (cb->testnum) { case 1: case 2: case 3: case 4: krping_fr_test_server(cb); break; case 5: krping_fr_test5_server(cb); break; case 6: krping_fr_test6_server(cb); break; default: PRINTF(cb, "unknown fr test %d\n", cb->testnum); goto err2; break; } } else krping_test_server(cb); rdma_disconnect(cb->child_cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); err0: rdma_destroy_id(cb->child_cm_id); } static void krping_test_client(struct krping_cb *cb) { int ping, start, cc, i, ret; struct ib_send_wr *bad_wr; unsigned char c; start = 65; for (ping = 0; !cb->count || ping < cb->count; ping++) { cb->state = RDMA_READ_ADV; /* Put some ascii text in the buffer. */ cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); for (i = cc, c = start; i < cb->size; i++) { cb->start_buf[i] = c; c++; if (c > 122) c = 65; } start++; if (start > 122) start = 65; cb->start_buf[cb->size - 1] = 0; krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); break; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for server to ACK */ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV); if (cb->state != RDMA_WRITE_ADV) { PRINTF(cb, "wait for RDMA_WRITE_ADV state %d\n", cb->state); break; } krping_format_send(cb, cb->rdma_dma_addr); ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); break; } /* Wait for the server to say the RDMA Write is complete.
*/ wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_COMPLETE); if (cb->state != RDMA_WRITE_COMPLETE) { PRINTF(cb, "wait for RDMA_WRITE_COMPLETE state %d\n", cb->state); break; } if (cb->validate) if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { PRINTF(cb, "data mismatch!\n"); break; } if (cb->verbose) { if (strlen(cb->rdma_buf) > 128) { char msgbuf[128]; strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf)); PRINTF(cb, "ping data stripped: %s\n", msgbuf); } else PRINTF(cb, "ping data: %s\n", cb->rdma_buf); } #ifdef SLOW_KRPING wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); #endif } } static void krping_rlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to server */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } #if 0 { int i; struct timeval start, stop; time_t sec; suseconds_t usec; unsigned long long elapsed; struct ib_wc wc; struct ib_send_wr *bad_wr; int ne; cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; cb->rdma_sq_wr.sg_list->length = 0; cb->rdma_sq_wr.num_sge = 0; microtime(&start); for (i=0; i < 100000; i++) { if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { PRINTF(cb, "Couldn't post send\n"); return; } do { ne = ib_poll_cq(cb->cq, 1, &wc); } while (ne == 0); if (ne < 0) { PRINTF(cb, "poll CQ failed %d\n", ne); return; } if (wc.status != IB_WC_SUCCESS) { PRINTF(cb, "Completion with error at %s:\n", cb->server ?
"server" : "client"); PRINTF(cb, "Failed status %d: wr_id %d\n", wc.status, (int) wc.wr_id); return; } } microtime(&stop); if (stop.tv_usec < start.tv_usec) { stop.tv_usec += 1000000; stop.tv_sec -= 1; } sec = stop.tv_sec - start.tv_sec; usec = stop.tv_usec - start.tv_usec; elapsed = sec * 1000000 + usec; PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed); } #endif rlat_test(cb); } static void krping_wlat_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } wlat_test(cb); } static void krping_bw_test_client(struct krping_cb *cb) { struct ib_send_wr *bad_wr; struct ib_wc wc; int ret; cb->state = RDMA_READ_ADV; /* Send STAG/TO/Len to client */ krping_format_send(cb, cb->start_dma_addr); if (cb->state == ERROR) { PRINTF(cb, "krping_format_send failed\n"); return; } ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); if (ret) { PRINTF(cb, "post send error %d\n", ret); return; } /* Spin waiting for send completion */ while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); if (ret < 0) { PRINTF(cb, "poll error %d\n", ret); return; } if (wc.status) { PRINTF(cb, "send completion error %d\n", wc.status); return; } /* Spin waiting for server's Start STAG/TO/Len */ while (cb->state < RDMA_WRITE_ADV) { krping_cq_event_handler(cb->cq, cb); } bw_test(cb); } /* * fastreg 2 valid different mrs and verify the completions. 
/* * fastreg 2 valid different mrs and verify the completions. */ static void krping_fr_test1(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, *bad; struct ib_wc wc; struct ib_mr *mr1, *mr2; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; int count = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr1)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr1)); goto err1; } mr2 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr2)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr2)); goto err2; } for (i=0; i < plen; i++) pl->page_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr_id = 1; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.rkey = mr1->rkey; DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } fr.wr.fast_reg.rkey = mr2->rkey; DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err3; } if (ret == 1) { DEBUG_LOG(cb, "completion status %u wr %s\n", wc.status, wc.wr_id == 1 ? "fr" : "inv"); count++; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err3; } wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); } while (count != 2); err3: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mr2!\n"); ib_dereg_mr(mr2); err2: DEBUG_LOG(cb, "destroying fr mr1!\n"); ib_dereg_mr(mr1); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); }
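/*
 * Sketch of the rkey-cycling idiom the fr tests rely on (the helper
 * name example_next_rkey is hypothetical): the low 8 bits of an
 * rkey/stag are the consumer-owned key portion -- the driver itself
 * prints mr->rkey >> 8 as the "stag index" -- so bumping them with
 * ib_update_fast_reg_key() yields a fresh rkey for the same MR before
 * each re-registration.
 */
static inline u32
example_next_rkey(struct ib_mr *mr, u8 *key)
{
	ib_update_fast_reg_key(mr, ++(*key));
	return (mr->rkey);	/* use in the next IB_WR_FAST_REG_MR WR */
}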
/* * fastreg the same mr twice, 2nd one should produce error cqe. */ static void krping_fr_test2(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, *bad; struct ib_wc wc; struct ib_mr *mr1; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; int count = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr1)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr1)); goto err1; } for (i=0; i < plen; i++) pl->page_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr_id = 1; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.rkey = mr1->rkey; DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err3; } if (ret == 1) { DEBUG_LOG(cb, "completion status %u wr %s\n", wc.status, wc.wr_id == 1 ? "fr" : "inv"); count++; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err3; } wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); } while (count != 2); err3: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mr1!\n"); ib_dereg_mr(mr1); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); }
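/*
 * krping_fr_test2() above expects its second fastreg to fail precisely
 * because the MR is still in the valid state.  A LOCAL_INV work request
 * between the two registrations would return the MR to the free state
 * and make the second fastreg legal -- a minimal sketch (the helper
 * example_local_inv is hypothetical):
 */
static int
example_local_inv(struct ib_qp *qp, u32 rkey)
{
	struct ib_send_wr inv, *bad;

	memset(&inv, 0, sizeof(inv));
	inv.opcode = IB_WR_LOCAL_INV;
	inv.send_flags = IB_SEND_SIGNALED;
	inv.ex.invalidate_rkey = rkey;	/* stag to invalidate */
	return (ib_post_send(qp, &inv, &bad));
}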
/* * fastreg pipelined in a loop as fast as we can until the user interrupts. * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy. */ static void krping_fr_test3(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, inv, *bad; struct ib_wc wc; u8 key = 0; struct ib_mr *mr; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; unsigned long start; int count = 0; int scnt = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr)); goto err1; } for (i=0; i < plen; i++) pl->page_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.send_flags = IB_SEND_SIGNALED; DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth); start = time_uptime; while (1) { if ((time_uptime - start) >= 9) { DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); if (cb->state == ERROR) break; start = time_uptime; } while (scnt < (cb->txdepth >> 1)) { ib_update_fast_reg_key(mr, ++key); fr.wr.fast_reg.rkey = mr->rkey; inv.ex.invalidate_rkey = mr->rkey; size = arc4random() % cb->size; if (size == 0) size = cb->size; plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list_len = plen; ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err2; } scnt += 2; } do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err2; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u\n", wc.status); goto err2; } count++; scnt--; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err2; } } while (ret == 1); } err2: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { if (wc.status) { PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode); } } } while (ret == 1); DEBUG_LOG(cb, "fr_test: done!\n"); ib_dereg_mr(mr); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); }
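/*
 * krping_fr_test3() above re-registers with a random length on every
 * iteration.  In isolation (example_random_fr_size is a hypothetical
 * helper), the size draw and the matching page-list recomputation are:
 */
static int
example_random_fr_size(int max_size, int *plen)
{
	int size;

	size = arc4random() % max_size;
	if (size == 0)		/* keep the length in [1, max_size] */
		size = max_size;
	*plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
	return (size);
}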
/* * fastreg 1 and invalidate 1 mr and verify completion. */ static void krping_fr_test4(struct krping_cb *cb) { struct ib_fast_reg_page_list *pl; struct ib_send_wr fr, inv, *bad; struct ib_wc wc; struct ib_mr *mr1; int i; int ret; int size = cb->size; int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT; int count = 0; pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen); if (IS_ERR(pl)) { PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl)); return; } mr1 = ib_alloc_fast_reg_mr(cb->pd, plen); if (IS_ERR(mr1)) { PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(mr1)); goto err1; } for (i=0; i < plen; i++) pl->page_list[i] = i * PAGE_SIZE; memset(&fr, 0, sizeof fr); fr.opcode = IB_WR_FAST_REG_MR; fr.wr_id = 1; fr.wr.fast_reg.page_shift = PAGE_SHIFT; fr.wr.fast_reg.length = size; fr.wr.fast_reg.page_list = pl; fr.wr.fast_reg.page_list_len = plen; fr.wr.fast_reg.iova_start = 0; fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; fr.send_flags = IB_SEND_SIGNALED; fr.wr.fast_reg.rkey = mr1->rkey; fr.next = &inv; memset(&inv, 0, sizeof inv); inv.opcode = IB_WR_LOCAL_INV; inv.ex.invalidate_rkey = mr1->rkey; DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth); ret = ib_post_send(cb->qp, &fr, &bad); if (ret) { PRINTF(cb, "ib_post_send failed %d\n", ret); goto err3; } DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); goto err3; } if (ret == 1) { DEBUG_LOG(cb, "completion status %u wr %s\n", wc.status, wc.wr_id == 1 ? "fr" : "inv"); count++; } else if (krping_sigpending()) { PRINTF(cb, "signal!\n"); goto err3; } wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); } while (count != 1); err3: DEBUG_LOG(cb, "sleeping 1 second\n"); wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ); DEBUG_LOG(cb, "draining the cq...\n"); do { ret = ib_poll_cq(cb->cq, 1, &wc); if (ret < 0) { PRINTF(cb, "ib_poll_cq failed %d\n", ret); break; } if (ret == 1) { PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode); } } while (ret == 1); DEBUG_LOG(cb, "destroying fr mr1!\n"); ib_dereg_mr(mr1); err1: DEBUG_LOG(cb, "destroying fr page list!\n"); ib_free_fast_reg_page_list(pl); DEBUG_LOG(cb, "%s done!\n", __func__); } static void krping_fr_test(struct krping_cb *cb) { switch (cb->testnum) { case 1: krping_fr_test1(cb); break; case 2: krping_fr_test2(cb); break; case 3: krping_fr_test3(cb); break; case 4: krping_fr_test4(cb); break; case 5: krping_fr_test5_client(cb); break; case 6: krping_fr_test6_client(cb); break; default: PRINTF(cb, "Unknown frtest num %u\n", cb->testnum); break; } } static int krping_connect_client(struct krping_cb *cb) { struct rdma_conn_param conn_param; int ret; memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; conn_param.retry_count = 10; ret = rdma_connect(cb->cm_id, &conn_param); if (ret) { PRINTF(cb, "rdma_connect error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { PRINTF(cb, "wait for CONNECTED state %d\n", cb->state); return -1; } DEBUG_LOG(cb, "rdma_connect successful\n"); return 0; } static int krping_bind_client(struct krping_cb *cb) { struct sockaddr_in sin; int ret; memset(&sin, 0, sizeof(sin)); sin.sin_len = sizeof sin; sin.sin_family = AF_INET; sin.sin_addr.s_addr = cb->addr.s_addr; sin.sin_port = cb->port; ret =
rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin, 2000); if (ret) { PRINTF(cb, "rdma_resolve_addr error %d\n", ret); return ret; } wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED); if (cb->state != ROUTE_RESOLVED) { PRINTF(cb, "addr/route resolution did not resolve: state %d\n", cb->state); return -EINTR; } if (cb->mem == FASTREG && !fastreg_supported(cb, 0)) return -EINVAL; DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n"); return 0; } static void krping_run_client(struct krping_cb *cb) { struct ib_recv_wr *bad_wr; int ret; ret = krping_bind_client(cb); if (ret) return; ret = krping_setup_qp(cb, cb->cm_id); if (ret) { PRINTF(cb, "setup_qp failed: %d\n", ret); return; } ret = krping_setup_buffers(cb); if (ret) { PRINTF(cb, "krping_setup_buffers failed: %d\n", ret); goto err1; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { PRINTF(cb, "ib_post_recv failed: %d\n", ret); goto err2; } ret = krping_connect_client(cb); if (ret) { PRINTF(cb, "connect error %d\n", ret); goto err2; } if (cb->wlat) krping_wlat_test_client(cb); else if (cb->rlat) krping_rlat_test_client(cb); else if (cb->bw) krping_bw_test_client(cb); else if (cb->frtest) krping_fr_test(cb); else krping_test_client(cb); rdma_disconnect(cb->cm_id); err2: krping_free_buffers(cb); err1: krping_free_qp(cb); } int krping_doit(char *cmd, void *cookie) { struct krping_cb *cb; int op; int ret = 0; char *optarg; unsigned long optint; cb = kzalloc(sizeof(*cb), GFP_KERNEL); if (!cb) return -ENOMEM; mutex_lock(&krping_mutex); list_add_tail(&cb->list, &krping_cbs); mutex_unlock(&krping_mutex); cb->cookie = cookie; cb->server = -1; cb->state = IDLE; cb->size = 64; cb->txdepth = RPING_SQ_DEPTH; cb->mem = DMA; init_waitqueue_head(&cb->sem); while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, &optint)) != 0) { switch (op) { case 'a': cb->addr_str = optarg; DEBUG_LOG(cb, "ipaddr (%s)\n", optarg); if (!inet_aton(optarg, &cb->addr)) { PRINTF(cb, "bad addr string %s\n", optarg); ret = EINVAL; } break; case 'p': cb->port = htons(optint); DEBUG_LOG(cb, "port %d\n", (int)optint); break; case 'P': cb->poll = 1; DEBUG_LOG(cb, "poll\n"); break; case 's': cb->server = 1; DEBUG_LOG(cb, "server\n"); break; case 'c': cb->server = 0; DEBUG_LOG(cb, "client\n"); break; case 'S': cb->size = optint; if ((cb->size < 1) || (cb->size > RPING_BUFSIZE)) { PRINTF(cb, "Invalid size %d " "(valid range is 1 to %d)\n", cb->size, RPING_BUFSIZE); ret = EINVAL; } else DEBUG_LOG(cb, "size %d\n", (int)optint); break; case 'C': cb->count = optint; if (cb->count < 0) { PRINTF(cb, "Invalid count %d\n", cb->count); ret = EINVAL; } else DEBUG_LOG(cb, "count %d\n", (int) cb->count); break; case 'v': cb->verbose++; DEBUG_LOG(cb, "verbose\n"); break; case 'V': cb->validate++; DEBUG_LOG(cb, "validate data\n"); break; case 'l': cb->wlat++; break; case 'L': cb->rlat++; break; case 'B': cb->bw++; break; case 'd': cb->duplex++; break; case 'm': if (!strncmp(optarg, "dma", 3)) cb->mem = DMA; else if (!strncmp(optarg, "fastreg", 7)) cb->mem = FASTREG; else if (!strncmp(optarg, "mw", 2)) cb->mem = MW; else if (!strncmp(optarg, "mr", 2)) cb->mem = MR; else { PRINTF(cb, "unknown mem mode %s. 
" "Must be dma, fastreg, mw, or mr\n", optarg); ret = -EINVAL; break; } break; case 'I': cb->server_invalidate = 1; break; case 'T': cb->txdepth = optint; DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth); break; case 'Z': cb->local_dma_lkey = 1; DEBUG_LOG(cb, "using local dma lkey\n"); break; case 'R': cb->read_inv = 1; DEBUG_LOG(cb, "using read-with-inv\n"); break; case 'f': cb->frtest = 1; cb->testnum = optint; DEBUG_LOG(cb, "fast-reg test!\n"); break; default: PRINTF(cb, "unknown opt %s\n", optarg); ret = -EINVAL; break; } } if (ret) goto out; if (cb->server == -1) { PRINTF(cb, "must be either client or server\n"); ret = -EINVAL; goto out; } if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) { PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n"); ret = -EINVAL; goto out; } if (cb->server_invalidate && cb->mem != FASTREG) { PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n"); ret = -EINVAL; goto out; } if (cb->read_inv && cb->mem != FASTREG) { PRINTF(cb, "read_inv only valid with fastreg mem_mode\n"); ret = -EINVAL; goto out; } if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw || cb->frtest)) { PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n"); ret = -EINVAL; goto out; } cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cb->cm_id)) { ret = PTR_ERR(cb->cm_id); PRINTF(cb, "rdma_create_id error %d\n", ret); goto out; } DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id); if (cb->server) krping_run_server(cb); else krping_run_client(cb); DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id); rdma_destroy_id(cb->cm_id); out: mutex_lock(&krping_mutex); list_del(&cb->list); mutex_unlock(&krping_mutex); kfree(cb); return ret; } void krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg) { struct krping_cb *cb; mutex_lock(&krping_mutex); list_for_each_entry(cb, &krping_cbs, list) (*f)(cb->pd ? &cb->stats : NULL, arg); mutex_unlock(&krping_mutex); } void krping_init(void) { mutex_init(&krping_mutex); } Index: user/ngie/bsnmp_cleanup/sys/ofed/drivers/infiniband/core/addr.c =================================================================== --- user/ngie/bsnmp_cleanup/sys/ofed/drivers/infiniband/core/addr.c (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/ofed/drivers/infiniband/core/addr.c (revision 298468) @@ -1,656 +1,658 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("IB Address Translation"); MODULE_LICENSE("Dual BSD/GPL"); struct addr_req { struct list_head list; struct sockaddr_storage src_addr; struct sockaddr_storage dst_addr; struct rdma_dev_addr *addr; struct rdma_addr_client *client; void *context; void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context); unsigned long timeout; int status; }; static void process_req(struct work_struct *work); static DEFINE_MUTEX(lock); static LIST_HEAD(req_list); static struct delayed_work work; static struct workqueue_struct *addr_wq; static struct rdma_addr_client self; void rdma_addr_register_client(struct rdma_addr_client *client) { atomic_set(&client->refcount, 1); init_completion(&client->comp); } EXPORT_SYMBOL(rdma_addr_register_client); static inline void put_client(struct rdma_addr_client *client) { if (atomic_dec_and_test(&client->refcount)) complete(&client->comp); } void rdma_addr_unregister_client(struct rdma_addr_client *client) { put_client(client); wait_for_completion(&client->comp); } EXPORT_SYMBOL(rdma_addr_unregister_client); int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, const unsigned char *dst_dev_addr) { if (dev->if_type == IFT_INFINIBAND) dev_addr->dev_type = ARPHRD_INFINIBAND; else if (dev->if_type == IFT_ETHER) dev_addr->dev_type = ARPHRD_ETHER; else dev_addr->dev_type = 0; memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen); memcpy(dev_addr->broadcast, __DECONST(char *, dev->if_broadcastaddr), dev->if_addrlen); if (dst_dev_addr) memcpy(dev_addr->dst_dev_addr, dst_dev_addr, dev->if_addrlen); dev_addr->bound_dev_if = dev->if_index; return 0; } EXPORT_SYMBOL(rdma_copy_addr); #define SCOPE_ID_CACHE(_scope_id, _addr6) do { \ (_addr6)->sin6_addr.s6_addr[3] = (_scope_id); \ (_addr6)->sin6_scope_id = 0; } while (0) #define SCOPE_ID_RESTORE(_scope_id, _addr6) do { \ (_addr6)->sin6_scope_id = (_scope_id); \ (_addr6)->sin6_addr.s6_addr[3] = 0; } while (0) int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, u16 *vlan_id) { struct net_device *dev; int ret = -EADDRNOTAVAIL; if (dev_addr->bound_dev_if) { dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); if (!dev) return -ENODEV; ret = rdma_copy_addr(dev_addr, dev, NULL); dev_put(dev); return ret; } switch (addr->sa_family) { case AF_INET: dev = ip_dev_find(&init_net, ((struct sockaddr_in *) addr)->sin_addr.s_addr); if (!dev) return ret; ret = rdma_copy_addr(dev_addr, dev, NULL); if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); break; #if defined(INET6) case AF_INET6: { struct sockaddr_in6 *sin6; struct ifaddr *ifa; in_port_t port; uint32_t scope_id; sin6 = (struct sockaddr_in6 *)addr; port = sin6->sin6_port; sin6->sin6_port = 0; scope_id = sin6->sin6_scope_id; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) SCOPE_ID_CACHE(scope_id, sin6); ifa = ifa_ifwithaddr(addr); sin6->sin6_port = port; if 
(IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) SCOPE_ID_RESTORE(scope_id, sin6); if (ifa == NULL) { ret = -ENODEV; break; } ret = rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL); if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(ifa->ifa_ifp); ifa_free(ifa); break; } #endif default: break; } return ret; } EXPORT_SYMBOL(rdma_translate_ip); static void set_timeout(unsigned long time) { unsigned long delay; delay = time - jiffies; if ((long)delay <= 0) delay = 1; mod_delayed_work(addr_wq, &work, delay); } static void queue_req(struct addr_req *req) { struct addr_req *temp_req; mutex_lock(&lock); list_for_each_entry_reverse(temp_req, &req_list, list) { if (time_after_eq(req->timeout, temp_req->timeout)) break; } list_add(&req->list, &temp_req->list); if (req_list.next == &req->list) set_timeout(req->timeout); mutex_unlock(&lock); } static int addr_resolve(struct sockaddr *src_in, struct sockaddr *dst_in, struct rdma_dev_addr *addr) { struct sockaddr_in *sin; struct sockaddr_in6 *sin6; struct ifaddr *ifa; struct ifnet *ifp; struct rtentry *rte; #if defined(INET) || defined(INET6) in_port_t port; #endif #ifdef INET6 uint32_t scope_id; #endif u_char edst[MAX_ADDR_LEN]; int multi; int bcast; int is_gw = 0; int error = 0; /* * Determine whether the address is unicast, multicast, or broadcast * and whether the source interface is valid. */ multi = 0; bcast = 0; sin = NULL; sin6 = NULL; ifp = NULL; rte = NULL; ifa = NULL; ifp = NULL; memset(edst, 0, sizeof(edst)); #ifdef INET6 scope_id = -1U; #endif switch (dst_in->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)dst_in; if (sin->sin_addr.s_addr == INADDR_BROADCAST) bcast = 1; if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) multi = 1; sin = (struct sockaddr_in *)src_in; if (sin->sin_addr.s_addr != INADDR_ANY) { /* * Address comparison fails if the port is set * cache it here to be restored later. */ port = sin->sin_port; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); /* * If we have a source address to use look it * up first and verify that it is a local * interface: */ ifa = ifa_ifwithaddr(src_in); sin->sin_port = port; if (ifa == NULL) { error = ENETUNREACH; goto done; } ifp = ifa->ifa_ifp; ifa_free(ifa); if (bcast || multi) goto mcast; } break; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)dst_in; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) multi = 1; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { /* * The IB address comparison fails if the * scope ID is set and not part of the addr: */ scope_id = sin6->sin6_scope_id; if (scope_id < 256) SCOPE_ID_CACHE(scope_id, sin6); } sin6 = (struct sockaddr_in6 *)src_in; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { port = sin6->sin6_port; sin6->sin6_port = 0; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { if (scope_id < 256) SCOPE_ID_CACHE(scope_id, sin6); } /* * If we have a source address to use look it * up first and verify that it is a local * interface: */ ifa = ifa_ifwithaddr(src_in); sin6->sin6_port = port; if (ifa == NULL) { error = ENETUNREACH; goto done; } ifp = ifa->ifa_ifp; ifa_free(ifa); if (bcast || multi) goto mcast; } break; #endif default: error = EINVAL; goto done; } /* * Make sure the route exists and has a valid link. 
 */
	rte = rtalloc1(dst_in, 1, 0);
	if (rte == NULL || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) {
		if (rte)
			RTFREE_LOCKED(rte);
		error = EHOSTUNREACH;
		goto done;
	}
	if (rte->rt_flags & RTF_GATEWAY)
		is_gw = 1;

	/*
	 * If it's not multicast or broadcast and the route doesn't match the
	 * requested interface return unreachable.  Otherwise fetch the
	 * correct interface pointer and unlock the route.
	 */
	if (multi || bcast) {
		if (ifp == NULL) {
			ifp = rte->rt_ifp;
			/* rt_ifa holds the route answer source address */
			ifa = rte->rt_ifa;
		}
		RTFREE_LOCKED(rte);
	} else if (ifp && ifp != rte->rt_ifp) {
		RTFREE_LOCKED(rte);
		error = ENETUNREACH;
		goto done;
	} else {
		if (ifp == NULL) {
			ifp = rte->rt_ifp;
			ifa = rte->rt_ifa;
		}
		RT_UNLOCK(rte);
	}
#if defined(INET) || defined(INET6)
mcast:
#endif
	if (bcast) {
		memcpy(edst, ifp->if_broadcastaddr, ifp->if_addrlen);
		goto done;
	} else if (multi) {
		struct sockaddr *llsa;
		struct sockaddr_dl sdl;

		sdl.sdl_len = sizeof(sdl);
		llsa = (struct sockaddr *)&sdl;

		if (ifp->if_resolvemulti == NULL) {
			error = EOPNOTSUPP;
			goto done;
		}
		error = ifp->if_resolvemulti(ifp, &llsa, dst_in);
		if (error == 0) {
			memcpy(edst, LLADDR((struct sockaddr_dl *)llsa),
			    ifp->if_addrlen);
		}
		goto done;
	}

	/*
	 * Resolve the link local address.
	 */
	switch (dst_in->sa_family) {
#ifdef INET
	case AF_INET:
		error = arpresolve(ifp, is_gw, NULL,
		    is_gw ? rte->rt_gateway : dst_in, edst, NULL);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		error = nd6_resolve(ifp, is_gw, NULL,
		    is_gw ? rte->rt_gateway : dst_in, edst, NULL);
		break;
#endif
	default:
+		KASSERT(0, ("rdma_addr_resolve: Unreachable"));
+		error = EINVAL;
		break;
	}
	RTFREE(rte);
done:
	if (error == 0)
		error = -rdma_copy_addr(addr, ifp, edst);
	if (error == 0)
		memcpy(src_in, ifa->ifa_addr, ip_addr_size(ifa->ifa_addr));
#ifdef INET6
	if (scope_id < 256) {
		sin6 = (struct sockaddr_in6 *)src_in;
		SCOPE_ID_RESTORE(scope_id, sin6);
		sin6 = (struct sockaddr_in6 *)dst_in;
		SCOPE_ID_RESTORE(scope_id, sin6);
	}
#endif
	if (error == EWOULDBLOCK)
		error = ENODATA;
	return -error;
}

static void process_req(struct work_struct *work)
{
	struct addr_req *req, *temp_req;
	struct sockaddr *src_in, *dst_in;
	struct list_head done_list;

	INIT_LIST_HEAD(&done_list);

	mutex_lock(&lock);
	list_for_each_entry_safe(req, temp_req, &req_list, list) {
		if (req->status == -ENODATA) {
			src_in = (struct sockaddr *) &req->src_addr;
			dst_in = (struct sockaddr *) &req->dst_addr;
			req->status = addr_resolve(src_in, dst_in, req->addr);
			if (req->status && time_after_eq(jiffies, req->timeout))
				req->status = -ETIMEDOUT;
			else if (req->status == -ENODATA)
				continue;
		}
		list_move_tail(&req->list, &done_list);
	}

	if (!list_empty(&req_list)) {
		req = list_entry(req_list.next, struct addr_req, list);
		set_timeout(req->timeout);
	}
	mutex_unlock(&lock);

	list_for_each_entry_safe(req, temp_req, &done_list, list) {
		list_del(&req->list);
		req->callback(req->status, (struct sockaddr *) &req->src_addr,
			req->addr, req->context);
		put_client(req->client);
		kfree(req);
	}
}

int rdma_resolve_ip(struct rdma_addr_client *client,
		    struct sockaddr *src_addr, struct sockaddr *dst_addr,
		    struct rdma_dev_addr *addr, int timeout_ms,
		    void (*callback)(int status, struct sockaddr *src_addr,
				     struct rdma_dev_addr *addr, void *context),
		    void *context)
{
	struct sockaddr *src_in, *dst_in;
	struct addr_req *req;
	int ret = 0;

	req = kzalloc(sizeof *req, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	src_in = (struct sockaddr *) &req->src_addr;
	dst_in = (struct sockaddr *) &req->dst_addr;

	if (src_addr) {
		if (src_addr->sa_family != dst_addr->sa_family) {
			ret = -EINVAL;
			goto err;
		}
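The SCOPE_ID_RESTORE() calls in the done: path above undo a trick set up earlier in addr_resolve(): for link-local destinations the scope id breaks address comparison, so the code stashes it in s6_addr[3] (unused for a link-local address when the id fits in a byte) and clears sin6_scope_id, then restores both fields once lookup is finished. A minimal user-space sketch of that round trip, assuming scope_id < 256 as the driver checks before caching:

	/* Sketch only -- mirrors the SCOPE_ID_CACHE/SCOPE_ID_RESTORE macros. */
	#include <stdint.h>
	#include <assert.h>
	#include <netinet/in.h>

	static void
	scope_id_round_trip(struct sockaddr_in6 *sin6)
	{
		uint32_t scope_id = sin6->sin6_scope_id;

		assert(scope_id < 256);
		/* cache: fold the scope id into the address bytes */
		sin6->sin6_addr.s6_addr[3] = scope_id;
		sin6->sin6_scope_id = 0;

		/* ... ifa_ifwithaddr()/address comparison happens here ... */

		/* restore: put the scope id back, clear the address byte */
		sin6->sin6_scope_id = scope_id;
		sin6->sin6_addr.s6_addr[3] = 0;
	}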
memcpy(src_in, src_addr, ip_addr_size(src_addr)); } else { src_in->sa_family = dst_addr->sa_family; } memcpy(dst_in, dst_addr, ip_addr_size(dst_addr)); req->addr = addr; req->callback = callback; req->context = context; req->client = client; atomic_inc(&client->refcount); req->status = addr_resolve(src_in, dst_in, addr); switch (req->status) { case 0: req->timeout = jiffies; queue_req(req); break; case -ENODATA: req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; queue_req(req); break; default: ret = req->status; atomic_dec(&client->refcount); goto err; } return ret; err: kfree(req); return ret; } EXPORT_SYMBOL(rdma_resolve_ip); void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; mutex_lock(&lock); list_for_each_entry_safe(req, temp_req, &req_list, list) { if (req->addr == addr) { req->status = -ECANCELED; req->timeout = jiffies; list_move(&req->list, &req_list); set_timeout(req->timeout); break; } } mutex_unlock(&lock); } EXPORT_SYMBOL(rdma_addr_cancel); struct resolve_cb_context { struct rdma_dev_addr *addr; struct completion comp; }; static void resolve_cb(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context) { memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct rdma_dev_addr)); complete(&((struct resolve_cb_context *)context)->comp); } int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac, u16 *vlan_id) { int ret = 0; struct rdma_dev_addr dev_addr; struct resolve_cb_context ctx; struct net_device *dev; union { struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; } sgid_addr, dgid_addr; ret = rdma_gid2ip(&sgid_addr._sockaddr, sgid); if (ret) return ret; ret = rdma_gid2ip(&dgid_addr._sockaddr, dgid); if (ret) return ret; memset(&dev_addr, 0, sizeof(dev_addr)); ctx.addr = &dev_addr; init_completion(&ctx.comp); ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, &dev_addr, 1000, resolve_cb, &ctx); if (ret) return ret; wait_for_completion(&ctx.comp); memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); if (!dev) return -ENODEV; if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); return ret; } EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh); int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) { int ret = 0; struct rdma_dev_addr dev_addr; union { struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; } gid_addr; ret = rdma_gid2ip(&gid_addr._sockaddr, sgid); if (ret) return ret; memset(&dev_addr, 0, sizeof(dev_addr)); ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); if (ret) return ret; memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); return ret; } EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); static int netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { if (event == NETEVENT_NEIGH_UPDATE) { set_timeout(jiffies); } return 0; } static struct notifier_block nb = { .notifier_call = netevent_callback }; static int __init addr_init(void) { INIT_DELAYED_WORK(&work, process_req); addr_wq = create_singlethread_workqueue("ib_addr"); if (!addr_wq) return -ENOMEM; register_netevent_notifier(&nb); rdma_addr_register_client(&self); return 0; } static void __exit addr_cleanup(void) { rdma_addr_unregister_client(&self); unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); } module_init(addr_init); module_exit(addr_cleanup); Index: 
user/ngie/bsnmp_cleanup/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c =================================================================== --- user/ngie/bsnmp_cleanup/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c (revision 298468) @@ -1,1452 +1,1457 @@ /* * Copyright (c) 2006 Mellanox Technologies. All rights reserved * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "ipoib.h" #ifdef CONFIG_INFINIBAND_IPOIB_CM #include #include #include #include #include #include int ipoib_max_conn_qp = 128; module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444); MODULE_PARM_DESC(max_nonsrq_conn_qp, "Max number of connected-mode QPs per interface " "(applied only if shared receive queue is not available)"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA static int data_debug_level; module_param_named(cm_data_debug_level, data_debug_level, int, 0644); MODULE_PARM_DESC(cm_data_debug_level, "Enable data path debug tracing for connected mode if > 0"); #endif #define IPOIB_CM_IETF_ID 0x1000000000000000ULL #define IPOIB_CM_RX_UPDATE_TIME (256 * HZ) #define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ) #define IPOIB_CM_RX_DELAY (3 * 256 * HZ) #define IPOIB_CM_RX_UPDATE_MASK (0x3) static struct ib_qp_attr ipoib_cm_err_attr = { .qp_state = IB_QPS_ERR }; #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff static struct ib_send_wr ipoib_cm_rx_drain_wr = { .wr_id = IPOIB_CM_RX_DRAIN_WRID, .opcode = IB_WR_SEND, }; static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req) { ipoib_dma_unmap_rx(priv, (struct ipoib_rx_buf *)rx_req); } static int ipoib_cm_post_receive_srq(struct ipoib_dev_priv *priv, int id) { struct ib_recv_wr *bad_wr; struct ipoib_rx_buf *rx_req; struct mbuf *m; int ret; int i; rx_req = (struct ipoib_rx_buf *)&priv->cm.srq_ring[id]; for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) { priv->cm.rx_sge[i].addr = rx_req->mapping[i]; priv->cm.rx_sge[i].length = m->m_len; } priv->cm.rx_wr.num_sge = i; priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "post srq failed for buf %d 
(%d)\n", id, ret); ipoib_dma_unmap_rx(priv, rx_req); m_freem(priv->cm.srq_ring[id].mb); priv->cm.srq_ring[id].mb = NULL; } return ret; } static int ipoib_cm_post_receive_nonsrq(struct ipoib_dev_priv *priv, struct ipoib_cm_rx *rx, struct ib_recv_wr *wr, struct ib_sge *sge, int id) { struct ipoib_rx_buf *rx_req; struct ib_recv_wr *bad_wr; struct mbuf *m; int ret; int i; rx_req = (struct ipoib_rx_buf *)&rx->rx_ring[id]; for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) { sge[i].addr = rx_req->mapping[i]; sge[i].length = m->m_len; } wr->num_sge = i; wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; ret = ib_post_recv(rx->qp, wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); ipoib_dma_unmap_rx(priv, rx_req); m_freem(rx->rx_ring[id].mb); rx->rx_ring[id].mb = NULL; } return ret; } static struct mbuf * ipoib_cm_alloc_rx_mb(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req) { return ipoib_alloc_map_mb(priv, (struct ipoib_rx_buf *)rx_req, priv->cm.max_cm_mtu); } static void ipoib_cm_free_rx_ring(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_ring) { int i; for (i = 0; i < ipoib_recvq_size; ++i) if (rx_ring[i].mb) { ipoib_cm_dma_unmap_rx(priv, &rx_ring[i]); m_freem(rx_ring[i].mb); } kfree(rx_ring); } static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) { struct ib_send_wr *bad_wr; struct ipoib_cm_rx *p; /* We only reserved 1 extra slot in CQ for drain WRs, so * make sure we have at most 1 outstanding WR. */ if (list_empty(&priv->cm.rx_flush_list) || !list_empty(&priv->cm.rx_drain_list)) return; /* * QPs on flush list are error state. This way, a "flush * error" WC will be immediately generated for each WR we post. */ p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) ipoib_warn(priv, "failed to post drain wr\n"); list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); } static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx) { struct ipoib_cm_rx *p = ctx; struct ipoib_dev_priv *priv = p->priv; unsigned long flags; if (event->event != IB_EVENT_QP_LAST_WQE_REACHED) return; spin_lock_irqsave(&priv->lock, flags); list_move(&p->list, &priv->cm.rx_flush_list); p->state = IPOIB_CM_RX_FLUSH; ipoib_cm_start_rx_drain(priv); spin_unlock_irqrestore(&priv->lock, flags); } static struct ib_qp *ipoib_cm_create_rx_qp(struct ipoib_dev_priv *priv, struct ipoib_cm_rx *p) { struct ib_qp_init_attr attr = { .event_handler = ipoib_cm_rx_event_handler, .send_cq = priv->recv_cq, /* For drain WR */ .recv_cq = priv->recv_cq, .srq = priv->cm.srq, .cap.max_send_wr = 1, /* For drain WR */ .cap.max_send_sge = 1, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, .qp_context = p, }; if (!ipoib_cm_has_srq(priv)) { attr.cap.max_recv_wr = ipoib_recvq_size; attr.cap.max_recv_sge = priv->cm.num_frags; } return ib_create_qp(priv->pd, &attr); } static int ipoib_cm_modify_rx_qp(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id, struct ib_qp *qp, unsigned psn) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret); return ret; } ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret); return ret; } qp_attr.qp_state = IB_QPS_RTR; ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to init QP 
attr for RTR: %d\n", ret); return ret; } qp_attr.rq_psn = psn; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); return ret; } /* * Current Mellanox HCA firmware won't generate completions * with error for drain WRs unless the QP has been moved to * RTS first. This work-around leaves a window where a QP has * moved to error asynchronously, but this will eventually get * fixed in firmware, so let's not error out if modify QP * fails. */ qp_attr.qp_state = IB_QPS_RTS; ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); return 0; } ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); return 0; } return 0; } static void ipoib_cm_init_rx_wr(struct ipoib_dev_priv *priv, struct ib_recv_wr *wr, struct ib_sge *sge) { int i; for (i = 0; i < IPOIB_CM_RX_SG; i++) sge[i].lkey = priv->mr->lkey; wr->next = NULL; wr->sg_list = sge; wr->num_sge = 1; } static int ipoib_cm_nonsrq_init_rx(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id, struct ipoib_cm_rx *rx) { struct { struct ib_recv_wr wr; struct ib_sge sge[IPOIB_CM_RX_SG]; } *t; int ret; int i; rx->rx_ring = kzalloc(ipoib_recvq_size * sizeof *rx->rx_ring, GFP_KERNEL); if (!rx->rx_ring) { printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n", priv->ca->name, ipoib_recvq_size); return -ENOMEM; } memset(rx->rx_ring, 0, ipoib_recvq_size * sizeof *rx->rx_ring); t = kmalloc(sizeof *t, GFP_KERNEL); if (!t) { ret = -ENOMEM; goto err_free; } ipoib_cm_init_rx_wr(priv, &t->wr, t->sge); spin_lock_irq(&priv->lock); if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) { spin_unlock_irq(&priv->lock); ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0); ret = -EINVAL; goto err_free; } else ++priv->cm.nonsrq_conn_qp; spin_unlock_irq(&priv->lock); for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_mb(priv, &rx->rx_ring[i])) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); ret = -ENOMEM; goto err_count; } ret = ipoib_cm_post_receive_nonsrq(priv, rx, &t->wr, t->sge, i); if (ret) { ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq " "failed for buf %d\n", i); ret = -EIO; goto err_count; } } rx->recv_count = ipoib_recvq_size; kfree(t); return 0; err_count: spin_lock_irq(&priv->lock); --priv->cm.nonsrq_conn_qp; spin_unlock_irq(&priv->lock); err_free: kfree(t); ipoib_cm_free_rx_ring(priv, rx->rx_ring); return ret; } static int ipoib_cm_send_rep(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id, struct ib_qp *qp, struct ib_cm_req_event_param *req, unsigned psn) { struct ipoib_cm_data data = {}; struct ib_cm_rep_param rep = {}; data.qpn = cpu_to_be32(priv->qp->qp_num); data.mtu = cpu_to_be32(priv->cm.max_cm_mtu); rep.private_data = &data; rep.private_data_len = sizeof data; rep.flow_control = 0; rep.rnr_retry_count = req->rnr_retry_count; rep.srq = ipoib_cm_has_srq(priv); rep.qp_num = qp->qp_num; rep.starting_psn = psn; return ib_send_cm_rep(cm_id, &rep); } static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct ipoib_dev_priv *priv = cm_id->context; struct ipoib_cm_rx *p; unsigned psn; int ret; ipoib_dbg(priv, "REQ arrived\n"); p = kzalloc(sizeof *p, GFP_KERNEL); if (!p) return -ENOMEM; p->priv = priv; p->id = cm_id; cm_id->context = p; p->state = IPOIB_CM_RX_LIVE; p->jiffies = jiffies; INIT_LIST_HEAD(&p->list); p->qp = ipoib_cm_create_rx_qp(priv, p); if (IS_ERR(p->qp)) { ret = 
PTR_ERR(p->qp); goto err_qp; } psn = random() & 0xffffff; ret = ipoib_cm_modify_rx_qp(priv, cm_id, p->qp, psn); if (ret) goto err_modify; if (!ipoib_cm_has_srq(priv)) { ret = ipoib_cm_nonsrq_init_rx(priv, cm_id, p); if (ret) goto err_modify; } spin_lock_irq(&priv->lock); queue_delayed_work(ipoib_workqueue, &priv->cm.stale_task, IPOIB_CM_RX_DELAY); /* Add this entry to passive ids list head, but do not re-add it * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ p->jiffies = jiffies; if (p->state == IPOIB_CM_RX_LIVE) list_move(&p->list, &priv->cm.passive_ids); spin_unlock_irq(&priv->lock); ret = ipoib_cm_send_rep(priv, cm_id, p->qp, &event->param.req_rcvd, psn); if (ret) { ipoib_warn(priv, "failed to send REP: %d\n", ret); if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) ipoib_warn(priv, "unable to move qp to error state\n"); } return 0; err_modify: ib_destroy_qp(p->qp); err_qp: kfree(p); return ret; } static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct ipoib_cm_rx *p; struct ipoib_dev_priv *priv; switch (event->event) { case IB_CM_REQ_RECEIVED: return ipoib_cm_req_handler(cm_id, event); case IB_CM_DREQ_RECEIVED: p = cm_id->context; ib_send_cm_drep(cm_id, NULL, 0); /* Fall through */ case IB_CM_REJ_RECEIVED: p = cm_id->context; priv = p->priv; if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) ipoib_warn(priv, "unable to move qp to error state\n"); /* Fall through */ default: return 0; } } void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { struct ipoib_cm_rx_buf saverx; struct ipoib_cm_rx_buf *rx_ring; unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV); struct ifnet *dev = priv->dev; struct mbuf *mb, *newmb; struct ipoib_cm_rx *p; int has_srq; u_short proto; + CURVNET_SET_QUIET(dev->if_vnet); + ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n", wr_id, wc->status); if (unlikely(wr_id >= ipoib_recvq_size)) { if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) { spin_lock(&priv->lock); list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); ipoib_cm_start_rx_drain(priv); if (priv->cm.id != NULL) queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); spin_unlock(&priv->lock); } else ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", wr_id, ipoib_recvq_size); - return; + goto done; } p = wc->qp->qp_context; has_srq = ipoib_cm_has_srq(priv); rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring; mb = rx_ring[wr_id].mb; if (unlikely(wc->status != IB_WC_SUCCESS)) { ipoib_dbg(priv, "cm recv error " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); if_inc_counter(dev, IFCOUNTER_IERRORS, 1); if (has_srq) goto repost; else { if (!--p->recv_count) { spin_lock(&priv->lock); list_move(&p->list, &priv->cm.rx_reap_list); queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); spin_unlock(&priv->lock); } - return; + goto done; } } if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) { if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { p->jiffies = jiffies; /* Move this entry to list head, but do not re-add it * if it has been moved out of list. */ if (p->state == IPOIB_CM_RX_LIVE) list_move(&p->list, &priv->cm.passive_ids); } } memcpy(&saverx, &rx_ring[wr_id], sizeof(saverx)); newmb = ipoib_cm_alloc_rx_mb(priv, &rx_ring[wr_id]); if (unlikely(!newmb)) { /* * If we can't allocate a new RX buffer, dump * this packet and reuse the old buffer. 
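The failure path described in this comment depends on the saverx snapshot taken just before ipoib_cm_alloc_rx_mb(): on allocation failure the old descriptor (mbuf pointer plus DMA mapping) is copied back so the slot can simply be reposted and the receive ring never loses a buffer. A reduced sketch of the pattern; rx_buf, alloc_rx_buf, unmap_rx_buf, deliver and repost are hypothetical names, only the snapshot-restore shape matters:

	struct rx_buf { void *mb; unsigned long long mapping[2]; };
	int  alloc_rx_buf(struct rx_buf *);		/* hypothetical */
	void unmap_rx_buf(struct rx_buf *);		/* hypothetical */
	void deliver(void *mb);				/* hypothetical */
	void repost(struct rx_buf *, unsigned);		/* hypothetical */

	static void
	refill_or_reuse(struct rx_buf *ring, unsigned idx)
	{
		struct rx_buf save = ring[idx];		/* snapshot */

		if (!alloc_rx_buf(&ring[idx])) {
			/* drop this packet, put the old buffer back */
			ring[idx] = save;
			repost(ring, idx);
			return;
		}
		/* the saved descriptor now owns the received data */
		unmap_rx_buf(&save);
		deliver(save.mb);
		repost(ring, idx);
	}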
 */
		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
		if_inc_counter(dev, IFCOUNTER_IERRORS, 1);
		memcpy(&rx_ring[wr_id], &saverx, sizeof(saverx));
		goto repost;
	}

	ipoib_cm_dma_unmap_rx(priv, &saverx);

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	ipoib_dma_mb(priv, mb, wc->byte_len);

	if_inc_counter(dev, IFCOUNTER_IPACKETS, 1);
	if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len);

	mb->m_pkthdr.rcvif = dev;
	proto = *mtod(mb, uint16_t *);
	m_adj(mb, IPOIB_ENCAP_LEN);

	IPOIB_MTAP_PROTO(dev, mb, proto);
	ipoib_demux(dev, mb, ntohs(proto));

repost:
	if (has_srq) {
		if (unlikely(ipoib_cm_post_receive_srq(priv, wr_id)))
			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
				   "for buf %d\n", wr_id);
	} else {
		if (unlikely(ipoib_cm_post_receive_nonsrq(priv, p,
							  &priv->cm.rx_wr,
							  priv->cm.rx_sge,
							  wr_id))) {
			--p->recv_count;
			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
				   "for buf %d\n", wr_id);
		}
	}
+done:
+	CURVNET_RESTORE();
+	return;
}

static inline int post_send(struct ipoib_dev_priv *priv,
			    struct ipoib_cm_tx *tx,
			    struct ipoib_cm_tx_buf *tx_req,
			    unsigned int wr_id)
{
	struct ib_send_wr *bad_wr;
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m;
	int i;

	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
		priv->tx_sge[i].addr = mapping[i];
		priv->tx_sge[i].length = m->m_len;
	}
	priv->tx_wr.num_sge = i;
	priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
	priv->tx_wr.opcode = IB_WR_SEND;

	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
}

void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb,
    struct ipoib_cm_tx *tx)
{
	struct ipoib_cm_tx_buf *tx_req;
	struct ifnet *dev = priv->dev;

	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
		while (ipoib_poll_tx(priv)); /* nothing */

	m_adj(mb, sizeof(struct ipoib_pseudoheader));
	if (unlikely(mb->m_pkthdr.len > tx->mtu)) {
		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
			   mb->m_pkthdr.len, tx->mtu);
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		ipoib_cm_mb_too_long(priv, mb, IPOIB_CM_MTU(tx->mtu));
		return;
	}

	ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
		       tx->tx_head, mb->m_pkthdr.len, tx->qp->qp_num);

	/*
	 * We put the mb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
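The done: label and CURVNET_RESTORE() added in the hunk above, together with the CURVNET_SET_QUIET() at the top of ipoib_cm_handle_rx_wc() and the return-to-goto conversions, are the substance of r298468's change to this file: on a VIMAGE kernel the receive handler must establish the current vnet from the receiving interface and undo it on every exit path. The shape of the pattern as a sketch; unexpected_wrid(), completion_has_error() and process_packet() are hypothetical stand-ins for the real checks in the handler:

	static void
	rx_completion_handler(struct ifnet *ifp)
	{
		CURVNET_SET_QUIET(ifp->if_vnet);

		if (unexpected_wrid())		/* was a bare "return" */
			goto done;
		if (completion_has_error())	/* ditto */
			goto done;
		process_packet();
	done:
		CURVNET_RESTORE();		/* runs on every path */
	}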
*/ tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; tx_req->mb = mb; if (unlikely(ipoib_dma_map_tx(priv->ca, (struct ipoib_tx_buf *)tx_req, priv->cm.num_frags))) { if_inc_counter(dev, IFCOUNTER_OERRORS, 1); if (tx_req->mb) m_freem(tx_req->mb); return; } if (unlikely(post_send(priv, tx, tx_req, tx->tx_head & (ipoib_sendq_size - 1)))) { ipoib_warn(priv, "post_send failed\n"); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req); m_freem(mb); } else { ++tx->tx_head; if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", tx->qp->qp_num); if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP)) ipoib_warn(priv, "request notify on send CQ failed\n"); dev->if_drv_flags |= IFF_DRV_OACTIVE; } } } void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc) { struct ipoib_cm_tx *tx = wc->qp->qp_context; unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; struct ifnet *dev = priv->dev; struct ipoib_cm_tx_buf *tx_req; ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", wr_id, wc->status); if (unlikely(wr_id >= ipoib_sendq_size)) { ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n", wr_id, ipoib_sendq_size); return; } tx_req = &tx->tx_ring[wr_id]; ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req); /* FIXME: is this right? Shouldn't we only increment on success? */ if_inc_counter(dev, IFCOUNTER_OPACKETS, 1); m_freem(tx_req->mb); ++tx->tx_tail; if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) dev->if_drv_flags &= ~IFF_DRV_OACTIVE; if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) { struct ipoib_path *path; ipoib_dbg(priv, "failed cm send event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); path = tx->path; if (path) { path->cm = NULL; rb_erase(&path->rb_node, &priv->path_tree); list_del(&path->list); } if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); queue_work(ipoib_workqueue, &priv->cm.reap_task); } clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); } } int ipoib_cm_dev_open(struct ipoib_dev_priv *priv) { int ret; if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev))) return 0; priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, priv); if (IS_ERR(priv->cm.id)) { printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name); ret = PTR_ERR(priv->cm.id); goto err_cm; } ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), 0, NULL); if (ret) { printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, IPOIB_CM_IETF_ID | priv->qp->qp_num); goto err_listen; } return 0; err_listen: ib_destroy_cm_id(priv->cm.id); err_cm: priv->cm.id = NULL; return ret; } static void ipoib_cm_free_rx_reap_list(struct ipoib_dev_priv *priv) { struct ipoib_cm_rx *rx, *n; LIST_HEAD(list); spin_lock_irq(&priv->lock); list_splice_init(&priv->cm.rx_reap_list, &list); spin_unlock_irq(&priv->lock); list_for_each_entry_safe(rx, n, &list, list) { ib_destroy_cm_id(rx->id); ib_destroy_qp(rx->qp); if (!ipoib_cm_has_srq(priv)) { ipoib_cm_free_rx_ring(priv, rx->rx_ring); spin_lock_irq(&priv->lock); --priv->cm.nonsrq_conn_qp; spin_unlock_irq(&priv->lock); } kfree(rx); } } void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv) { struct ipoib_cm_rx *p; unsigned long begin; int ret; if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)) || 
!priv->cm.id) return; ib_destroy_cm_id(priv->cm.id); priv->cm.id = NULL; cancel_work_sync(&priv->cm.rx_reap_task); spin_lock_irq(&priv->lock); while (!list_empty(&priv->cm.passive_ids)) { p = list_entry(priv->cm.passive_ids.next, typeof(*p), list); list_move(&p->list, &priv->cm.rx_error_list); p->state = IPOIB_CM_RX_ERROR; spin_unlock_irq(&priv->lock); ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); if (ret) ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); spin_lock_irq(&priv->lock); } /* Wait for all RX to be drained */ begin = jiffies; while (!list_empty(&priv->cm.rx_error_list) || !list_empty(&priv->cm.rx_flush_list) || !list_empty(&priv->cm.rx_drain_list)) { if (time_after(jiffies, begin + 5 * HZ)) { ipoib_warn(priv, "RX drain timing out\n"); /* * assume the HW is wedged and just free up everything. */ list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_reap_list); list_splice_init(&priv->cm.rx_error_list, &priv->cm.rx_reap_list); list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); break; } spin_unlock_irq(&priv->lock); msleep(1); ipoib_drain_cq(priv); spin_lock_irq(&priv->lock); } spin_unlock_irq(&priv->lock); ipoib_cm_free_rx_reap_list(priv); cancel_delayed_work(&priv->cm.stale_task); } static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct ipoib_cm_tx *p = cm_id->context; struct ipoib_dev_priv *priv = p->priv; struct ipoib_cm_data *data = event->private_data; struct ifqueue mbqueue; struct ib_qp_attr qp_attr; int qp_attr_mask, ret; struct mbuf *mb; ipoib_dbg(priv, "cm rep handler\n"); p->mtu = be32_to_cpu(data->mtu); if (p->mtu <= IPOIB_ENCAP_LEN) { ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n", p->mtu, IPOIB_ENCAP_LEN); return -EINVAL; } qp_attr.qp_state = IB_QPS_RTR; ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); return ret; } qp_attr.rq_psn = 0 /* FIXME */; ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); return ret; } qp_attr.qp_state = IB_QPS_RTS; ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); return ret; } ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); return ret; } bzero(&mbqueue, sizeof(mbqueue)); spin_lock_irq(&priv->lock); set_bit(IPOIB_FLAG_OPER_UP, &p->flags); if (p->path) for (;;) { _IF_DEQUEUE(&p->path->queue, mb); if (mb == NULL) break; _IF_ENQUEUE(&mbqueue, mb); } spin_unlock_irq(&priv->lock); for (;;) { struct ifnet *dev = p->priv->dev; _IF_DEQUEUE(&mbqueue, mb); if (mb == NULL) break; mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed " "to requeue packet\n"); } ret = ib_send_cm_rtu(cm_id, NULL, 0); if (ret) { ipoib_warn(priv, "failed to send RTU: %d\n", ret); return ret; } return 0; } static struct ib_qp *ipoib_cm_create_tx_qp(struct ipoib_dev_priv *priv, struct ipoib_cm_tx *tx) { struct ib_qp_init_attr attr = { .send_cq = priv->send_cq, .recv_cq = priv->recv_cq, .srq = priv->cm.srq, .cap.max_send_wr = ipoib_sendq_size, .cap.max_send_sge = priv->cm.num_frags, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, .qp_context = tx }; return ib_create_qp(priv->pd, &attr); } static int ipoib_cm_send_req(struct ipoib_dev_priv *priv, struct ib_cm_id *id, struct ib_qp *qp, u32 qpn, struct ib_sa_path_rec *pathrec) { struct 
ipoib_cm_data data = {}; struct ib_cm_req_param req = {}; ipoib_dbg(priv, "cm send req\n"); data.qpn = cpu_to_be32(priv->qp->qp_num); data.mtu = cpu_to_be32(priv->cm.max_cm_mtu); req.primary_path = pathrec; req.alternate_path = NULL; req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn); req.qp_num = qp->qp_num; req.qp_type = qp->qp_type; req.private_data = &data; req.private_data_len = sizeof data; req.flow_control = 0; req.starting_psn = 0; /* FIXME */ /* * Pick some arbitrary defaults here; we could make these * module parameters if anyone cared about setting them. */ req.responder_resources = 4; req.remote_cm_response_timeout = 20; req.local_cm_response_timeout = 20; req.retry_count = 0; /* RFC draft warns against retries */ req.rnr_retry_count = 0; /* RFC draft warns against retries */ req.max_cm_retries = 15; req.srq = ipoib_cm_has_srq(priv); return ib_send_cm_req(id, &req); } static int ipoib_cm_modify_tx_init(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index); if (ret) { ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret); return ret; } qp_attr.qp_state = IB_QPS_INIT; qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE; qp_attr.port_num = priv->port; qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) { ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret); return ret; } return 0; } static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, struct ib_sa_path_rec *pathrec) { struct ipoib_dev_priv *priv = p->priv; int ret; p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring, GFP_KERNEL); if (!p->tx_ring) { ipoib_warn(priv, "failed to allocate tx ring\n"); ret = -ENOMEM; goto err_tx; } memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); p->qp = ipoib_cm_create_tx_qp(p->priv, p); if (IS_ERR(p->qp)) { ret = PTR_ERR(p->qp); ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret); goto err_qp; } p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p); if (IS_ERR(p->id)) { ret = PTR_ERR(p->id); ipoib_warn(priv, "failed to create tx cm id: %d\n", ret); goto err_id; } ret = ipoib_cm_modify_tx_init(p->priv, p->id, p->qp); if (ret) { ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret); goto err_modify; } ret = ipoib_cm_send_req(p->priv, p->id, p->qp, qpn, pathrec); if (ret) { ipoib_warn(priv, "failed to send cm req: %d\n", ret); goto err_send_cm; } ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n", p->qp->qp_num, pathrec->dgid.raw, qpn); return 0; err_send_cm: err_modify: ib_destroy_cm_id(p->id); err_id: p->id = NULL; ib_destroy_qp(p->qp); err_qp: p->qp = NULL; kfree(p->tx_ring); err_tx: return ret; } static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) { struct ipoib_dev_priv *priv = p->priv; struct ifnet *dev = priv->dev; struct ipoib_cm_tx_buf *tx_req; unsigned long begin; ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", p->qp ? 
p->qp->qp_num : 0, p->tx_head, p->tx_tail); if (p->path) ipoib_path_free(priv, p->path); if (p->id) ib_destroy_cm_id(p->id); if (p->tx_ring) { /* Wait for all sends to complete */ begin = jiffies; while ((int) p->tx_tail - (int) p->tx_head < 0) { if (time_after(jiffies, begin + 5 * HZ)) { ipoib_warn(priv, "timing out; %d sends not completed\n", p->tx_head - p->tx_tail); goto timeout; } msleep(1); } } timeout: while ((int) p->tx_tail - (int) p->tx_head < 0) { tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req); m_freem(tx_req->mb); ++p->tx_tail; if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) dev->if_drv_flags &= ~IFF_DRV_OACTIVE; } if (p->qp) ib_destroy_qp(p->qp); kfree(p->tx_ring); kfree(p); } static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct ipoib_cm_tx *tx = cm_id->context; struct ipoib_dev_priv *priv = tx->priv; struct ipoib_path *path; unsigned long flags; int ret; switch (event->event) { case IB_CM_DREQ_RECEIVED: ipoib_dbg(priv, "DREQ received.\n"); ib_send_cm_drep(cm_id, NULL, 0); break; case IB_CM_REP_RECEIVED: ipoib_dbg(priv, "REP received.\n"); ret = ipoib_cm_rep_handler(cm_id, event); if (ret) ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); break; case IB_CM_REQ_ERROR: case IB_CM_REJ_RECEIVED: case IB_CM_TIMEWAIT_EXIT: ipoib_dbg(priv, "CM error %d.\n", event->event); spin_lock_irqsave(&priv->lock, flags); path = tx->path; if (path) { path->cm = NULL; tx->path = NULL; rb_erase(&path->rb_node, &priv->path_tree); list_del(&path->list); } if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); queue_work(ipoib_workqueue, &priv->cm.reap_task); } spin_unlock_irqrestore(&priv->lock, flags); if (path) ipoib_path_free(tx->priv, path); break; default: break; } return 0; } struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct ipoib_cm_tx *tx; tx = kzalloc(sizeof *tx, GFP_ATOMIC); if (!tx) return NULL; ipoib_dbg(priv, "Creating cm tx\n"); path->cm = tx; tx->path = path; tx->priv = priv; list_add(&tx->list, &priv->cm.start_list); set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); queue_work(ipoib_workqueue, &priv->cm.start_task); return tx; } void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) { struct ipoib_dev_priv *priv = tx->priv; if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { spin_lock(&priv->lock); list_move(&tx->list, &priv->cm.reap_list); spin_unlock(&priv->lock); queue_work(ipoib_workqueue, &priv->cm.reap_task); ipoib_dbg(priv, "Reap connection for gid %pI6\n", tx->path->pathrec.dgid.raw); tx->path = NULL; } } static void ipoib_cm_tx_start(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, cm.start_task); struct ipoib_path *path; struct ipoib_cm_tx *p; unsigned long flags; int ret; struct ib_sa_path_rec pathrec; u32 qpn; ipoib_dbg(priv, "cm start task\n"); spin_lock_irqsave(&priv->lock, flags); while (!list_empty(&priv->cm.start_list)) { p = list_entry(priv->cm.start_list.next, typeof(*p), list); list_del_init(&p->list); path = p->path; qpn = IPOIB_QPN(path->hwaddr); memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); spin_unlock_irqrestore(&priv->lock, flags); ret = ipoib_cm_tx_init(p, qpn, &pathrec); spin_lock_irqsave(&priv->lock, flags); if (ret) { path = p->path; if (path) { path->cm = NULL; 
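This error leg and ipoib_cm_tx_reap() just below show the teardown discipline used throughout the file: CM callbacks and completion handlers never destroy a connection directly; they move it onto cm.reap_list and queue reap_task, and the work function drains the list with the spinlock dropped around each destroy, since destruction sleeps. A reduced sketch with hypothetical conn/destroy_one/lock/reap_list names:

	static void
	reap_work(struct work_struct *work)
	{
		struct conn *c;

		spin_lock_irq(&lock);
		while (!list_empty(&reap_list)) {
			c = list_entry(reap_list.next, struct conn, list);
			list_del(&c->list);
			spin_unlock_irq(&lock);	/* destroy_one() may sleep */
			destroy_one(c);
			spin_lock_irq(&lock);
		}
		spin_unlock_irq(&lock);
	}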
rb_erase(&path->rb_node, &priv->path_tree); list_del(&path->list); ipoib_path_free(priv, path); } list_del(&p->list); kfree(p); } } spin_unlock_irqrestore(&priv->lock, flags); } static void ipoib_cm_tx_reap(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, cm.reap_task); struct ipoib_cm_tx *p; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); while (!list_empty(&priv->cm.reap_list)) { p = list_entry(priv->cm.reap_list.next, typeof(*p), list); list_del(&p->list); spin_unlock_irqrestore(&priv->lock, flags); ipoib_cm_tx_destroy(p); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); } static void ipoib_cm_mb_reap(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, cm.mb_task); struct mbuf *mb; unsigned long flags; #if defined(INET) || defined(INET6) unsigned mtu = priv->mcast_mtu; #endif uint16_t proto; spin_lock_irqsave(&priv->lock, flags); for (;;) { IF_DEQUEUE(&priv->cm.mb_queue, mb); if (mb == NULL) break; spin_unlock_irqrestore(&priv->lock, flags); proto = htons(*mtod(mb, uint16_t *)); m_adj(mb, IPOIB_ENCAP_LEN); switch (proto) { #if defined(INET) case ETHERTYPE_IP: icmp_error(mb, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu); break; #endif #if defined(INET6) case ETHERTYPE_IPV6: icmp6_error(mb, ICMP6_PACKET_TOO_BIG, 0, mtu); break; #endif default: m_freem(mb); } spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); } void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu) { int e = priv->cm.mb_queue.ifq_len; IF_ENQUEUE(&priv->cm.mb_queue, mb); if (e == 0) queue_work(ipoib_workqueue, &priv->cm.mb_task); } static void ipoib_cm_rx_reap(struct work_struct *work) { ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv, cm.rx_reap_task)); } static void ipoib_cm_stale_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, cm.stale_task.work); struct ipoib_cm_rx *p; int ret; spin_lock_irq(&priv->lock); while (!list_empty(&priv->cm.passive_ids)) { /* List is sorted by LRU, start from tail, * stop when we see a recently used entry */ p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list); if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT)) break; list_move(&p->list, &priv->cm.rx_error_list); p->state = IPOIB_CM_RX_ERROR; spin_unlock_irq(&priv->lock); ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE); if (ret) ipoib_warn(priv, "unable to move qp to error state: %d\n", ret); spin_lock_irq(&priv->lock); } if (!list_empty(&priv->cm.passive_ids)) queue_delayed_work(ipoib_workqueue, &priv->cm.stale_task, IPOIB_CM_RX_DELAY); spin_unlock_irq(&priv->lock); } static void ipoib_cm_create_srq(struct ipoib_dev_priv *priv, int max_sge) { struct ib_srq_init_attr srq_init_attr = { .attr = { .max_wr = ipoib_recvq_size, .max_sge = max_sge } }; priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); if (IS_ERR(priv->cm.srq)) { if (PTR_ERR(priv->cm.srq) != -ENOSYS) printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n", priv->ca->name, PTR_ERR(priv->cm.srq)); priv->cm.srq = NULL; return; } priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring, GFP_KERNEL); if (!priv->cm.srq_ring) { printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n", priv->ca->name, ipoib_recvq_size); ib_destroy_srq(priv->cm.srq); priv->cm.srq = NULL; return; } memset(priv->cm.srq_ring, 0, ipoib_recvq_size * sizeof 
*priv->cm.srq_ring); } int ipoib_cm_dev_init(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; int i, ret; struct ib_device_attr attr; INIT_LIST_HEAD(&priv->cm.passive_ids); INIT_LIST_HEAD(&priv->cm.reap_list); INIT_LIST_HEAD(&priv->cm.start_list); INIT_LIST_HEAD(&priv->cm.rx_error_list); INIT_LIST_HEAD(&priv->cm.rx_flush_list); INIT_LIST_HEAD(&priv->cm.rx_drain_list); INIT_LIST_HEAD(&priv->cm.rx_reap_list); INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start); INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap); INIT_WORK(&priv->cm.mb_task, ipoib_cm_mb_reap); INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap); INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task); bzero(&priv->cm.mb_queue, sizeof(priv->cm.mb_queue)); mtx_init(&priv->cm.mb_queue.ifq_mtx, dev->if_xname, "if send queue", MTX_DEF); ret = ib_query_device(priv->ca, &attr); if (ret) { printk(KERN_WARNING "ib_query_device() failed with %d\n", ret); return ret; } ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge); attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge); ipoib_cm_create_srq(priv, attr.max_srq_sge); if (ipoib_cm_has_srq(priv)) { priv->cm.max_cm_mtu = attr.max_srq_sge * MJUMPAGESIZE; priv->cm.num_frags = attr.max_srq_sge; ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n", priv->cm.max_cm_mtu, priv->cm.num_frags); } else { priv->cm.max_cm_mtu = IPOIB_CM_MAX_MTU; priv->cm.num_frags = IPOIB_CM_RX_SG; } ipoib_cm_init_rx_wr(priv, &priv->cm.rx_wr, priv->cm.rx_sge); if (ipoib_cm_has_srq(priv)) { for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_mb(priv, &priv->cm.srq_ring[i])) { ipoib_warn(priv, "failed to allocate " "receive buffer %d\n", i); ipoib_cm_dev_cleanup(priv); return -ENOMEM; } if (ipoib_cm_post_receive_srq(priv, i)) { ipoib_warn(priv, "ipoib_cm_post_receive_srq " "failed for buf %d\n", i); ipoib_cm_dev_cleanup(priv); return -EIO; } } } IF_LLADDR(priv->dev)[0] = IPOIB_FLAGS_RC; return 0; } void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv) { int ret; if (!priv->cm.srq) return; ipoib_dbg(priv, "Cleanup ipoib connected mode.\n"); ret = ib_destroy_srq(priv->cm.srq); if (ret) ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret); priv->cm.srq = NULL; if (!priv->cm.srq_ring) return; ipoib_cm_free_rx_ring(priv, priv->cm.srq_ring); priv->cm.srq_ring = NULL; mtx_destroy(&priv->cm.mb_queue.ifq_mtx); } #endif /* CONFIG_INFINIBAND_IPOIB_CM */ Index: user/ngie/bsnmp_cleanup/sys/ufs/ufs/ufs_extattr.c =================================================================== --- user/ngie/bsnmp_cleanup/sys/ufs/ufs/ufs_extattr.c (revision 298467) +++ user/ngie/bsnmp_cleanup/sys/ufs/ufs/ufs_extattr.c (revision 298468) @@ -1,1298 +1,1300 @@ /*- * Copyright (c) 1999-2002 Robert N. M. Watson * Copyright (c) 2002-2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Support for filesystem extended attribute: UFS-specific support functions. */ #include __FBSDID("$FreeBSD$"); #include "opt_ufs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UFS_EXTATTR +FEATURE(ufs_extattr, "ufs extended attribute support"); + static MALLOC_DEFINE(M_UFS_EXTATTR, "ufs_extattr", "ufs extended attribute"); static int ufs_extattr_sync = 0; SYSCTL_INT(_debug, OID_AUTO, ufs_extattr_sync, CTLFLAG_RW, &ufs_extattr_sync, 0, ""); static int ufs_extattr_valid_attrname(int attrnamespace, const char *attrname); static int ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td); static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct vnode *backing_vnode, struct thread *td); static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct thread *td); static int ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, size_t *size, struct ucred *cred, struct thread *td); static int ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, struct ucred *cred, struct thread *td); static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, struct ucred *cred, struct thread *td); #ifdef UFS_EXTATTR_AUTOSTART static int ufs_extattr_autostart_locked(struct mount *mp, struct thread *td); #endif static int ufs_extattr_start_locked(struct ufsmount *ump, struct thread *td); /* * Per-FS attribute lock protecting attribute operations. * * XXXRW: Perhaps something more fine-grained would be appropriate, but at * the end of the day we're going to contend on the vnode lock for the * backing file anyway. */ static void ufs_extattr_uepm_lock(struct ufsmount *ump) { sx_xlock(&ump->um_extattr.uepm_lock); } static void ufs_extattr_uepm_unlock(struct ufsmount *ump) { sx_xunlock(&ump->um_extattr.uepm_lock); } /*- * Determine whether the name passed is a valid name for an actual * attribute. * * Invalid currently consists of: * NULL pointer for attrname * zero-length attrname (used to retrieve application attribute list) */ static int ufs_extattr_valid_attrname(int attrnamespace, const char *attrname) { if (attrname == NULL) return (0); if (strlen(attrname) == 0) return (0); return (1); } /* * Locate an attribute given a name and mountpoint. 
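The FEATURE(ufs_extattr, ...) line added above is this revision's change to ufs_extattr.c: it publishes a read-only kern.features.ufs_extattr sysctl so userland can detect at run time that the kernel was built with options UFS_EXTATTR. From C the probe is a single libc call, feature_present(3):

	/* Userland probe for the sysctl registered by FEATURE() above. */
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		if (feature_present("ufs_extattr"))
			printf("UFS_EXTATTR is compiled in\n");
		else
			printf("kernel lacks UFS_EXTATTR\n");
		return (0);
	}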
* Must be holding uepm lock for the mount point. */ static struct ufs_extattr_list_entry * ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace, const char *attrname) { struct ufs_extattr_list_entry *search_attribute; sx_assert(&ump->um_extattr.uepm_lock, SA_XLOCKED); for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list); search_attribute != NULL; search_attribute = LIST_NEXT(search_attribute, uele_entries)) { if (!(strncmp(attrname, search_attribute->uele_attrname, UFS_EXTATTR_MAXEXTATTRNAME)) && (attrnamespace == search_attribute->uele_attrnamespace)) { return (search_attribute); } } return (0); } /* * Initialize per-FS structures supporting extended attributes. Do not * start extended attributes yet. */ void ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm) { uepm->uepm_flags = 0; LIST_INIT(&uepm->uepm_list); sx_init(&uepm->uepm_lock, "ufs_extattr_sx"); uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED; } /* * Destroy per-FS structures supporting extended attributes. Assumes * that EAs have already been stopped, and will panic if not. */ void ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm) { if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) panic("ufs_extattr_uepm_destroy: not initialized"); if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED)) panic("ufs_extattr_uepm_destroy: called while still started"); /* * It's not clear that either order for the next two lines is * ideal, and it should never be a problem if this is only called * during unmount, and with vfs_busy(). */ uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED; sx_destroy(&uepm->uepm_lock); } /* * Start extended attribute support on an FS. */ int ufs_extattr_start(struct mount *mp, struct thread *td) { struct ufsmount *ump; int error = 0; ump = VFSTOUFS(mp); ufs_extattr_uepm_lock(ump); error = ufs_extattr_start_locked(ump, td); ufs_extattr_uepm_unlock(ump); return (error); } static int ufs_extattr_start_locked(struct ufsmount *ump, struct thread *td) { if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) return (EOPNOTSUPP); if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) return (EBUSY); ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED; ump->um_extattr.uepm_ucred = crhold(td->td_ucred); return (0); } #ifdef UFS_EXTATTR_AUTOSTART /* * Helper routine: given a locked parent directory and filename, return * the locked vnode of the inode associated with the name. Will not * follow symlinks, may return any type of vnode. Lock on parent will * be released even in the event of a failure. In the event that the * target is the parent (i.e., "."), there will be two references and * one lock, requiring the caller to possibly special-case. 
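That contract makes disposal at the call sites the tricky part: a caller has to tell the target == parent case (second reference, single lock, so vrele() only) apart from the ordinary case (locked and referenced, so vput() unless the vnode is handed off unlocked). Condensed from the directory-iteration code later in this file:

	error = ufs_extattr_lookup(dvp, UE_GETDIR_LOCKPARENT, name, &vp, td);
	if (error) {
		/* lookup already cleaned up its locks */
	} else if (vp == dvp) {
		vrele(vp);	/* "." case: extra ref, one lock */
	} else if (vp->v_type != VREG) {
		vput(vp);	/* locked + referenced: drop both */
	} else {
		/* enable_with_open() returns vp unlocked, hence vrele() */
		error = ufs_extattr_enable_with_open(ump, vp, nspace, name, td);
		vrele(vp);
	}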
*/ #define UE_GETDIR_LOCKPARENT 1 #define UE_GETDIR_LOCKPARENT_DONT 2 static int ufs_extattr_lookup(struct vnode *start_dvp, int lockparent, char *dirname, struct vnode **vp, struct thread *td) { struct vop_cachedlookup_args vargs; struct componentname cnp; struct vnode *target_vp; int error; bzero(&cnp, sizeof(cnp)); cnp.cn_nameiop = LOOKUP; cnp.cn_flags = ISLASTCN; if (lockparent == UE_GETDIR_LOCKPARENT) cnp.cn_flags |= LOCKPARENT; cnp.cn_lkflags = LK_EXCLUSIVE; cnp.cn_thread = td; cnp.cn_cred = td->td_ucred; cnp.cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); cnp.cn_nameptr = cnp.cn_pnbuf; error = copystr(dirname, cnp.cn_pnbuf, MAXPATHLEN, (size_t *) &cnp.cn_namelen); if (error) { if (lockparent == UE_GETDIR_LOCKPARENT_DONT) { VOP_UNLOCK(start_dvp, 0); } uma_zfree(namei_zone, cnp.cn_pnbuf); printf("ufs_extattr_lookup: copystr failed\n"); return (error); } cnp.cn_namelen--; /* trim nul termination */ vargs.a_gen.a_desc = NULL; vargs.a_dvp = start_dvp; vargs.a_vpp = &target_vp; vargs.a_cnp = &cnp; error = ufs_lookup(&vargs); uma_zfree(namei_zone, cnp.cn_pnbuf); if (error) { /* * Error condition, may have to release the lock on the parent * if ufs_lookup() didn't. */ if (lockparent == UE_GETDIR_LOCKPARENT_DONT) VOP_UNLOCK(start_dvp, 0); /* * Check that ufs_lookup() didn't release the lock when we * didn't want it to. */ if (lockparent == UE_GETDIR_LOCKPARENT) ASSERT_VOP_LOCKED(start_dvp, "ufs_extattr_lookup"); return (error); } /* if (target_vp == start_dvp) panic("ufs_extattr_lookup: target_vp == start_dvp"); */ if (target_vp != start_dvp && lockparent == UE_GETDIR_LOCKPARENT_DONT) VOP_UNLOCK(start_dvp, 0); if (lockparent == UE_GETDIR_LOCKPARENT) ASSERT_VOP_LOCKED(start_dvp, "ufs_extattr_lookup"); /* printf("ufs_extattr_lookup: success\n"); */ *vp = target_vp; return (0); } #endif /* !UFS_EXTATTR_AUTOSTART */ /* * Enable an EA using the passed filesystem, backing vnode, attribute name, * namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp * to be locked when passed in. The vnode will be returned unlocked, * regardless of success/failure of the function. As a result, the caller * will always need to vrele(), but not vput(). */ static int ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { int error; error = VOP_OPEN(vp, FREAD|FWRITE, td->td_ucred, td, NULL); if (error) { printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed " "with %d\n", error); VOP_UNLOCK(vp, 0); return (error); } VOP_ADD_WRITECOUNT(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); vref(vp); VOP_UNLOCK(vp, 0); error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, td); if (error != 0) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); return (error); } #ifdef UFS_EXTATTR_AUTOSTART /* * Given a locked directory vnode, iterate over the names in the directory * and use ufs_extattr_lookup() to retrieve locked vnodes of potential * attribute files. Then invoke ufs_extattr_enable_with_open() on each * to attempt to start the attribute. Leaves the directory locked on * exit. 
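The scan below reads raw directory blocks via ufs_readdir() and then steps through variable-length struct dirent records by hand, chaining on d_reclen and treating a zero d_reclen as end-of-block. The traversal idiom in isolation (resid is whatever the uio left unconsumed):

	#include <sys/types.h>
	#include <sys/dirent.h>

	static void
	walk_dirents(char *dirbuf, size_t blksize, size_t resid,
	    void (*visit)(struct dirent *))
	{
		struct dirent *dp, *edp;

		/* only blksize - resid bytes were actually filled in */
		edp = (struct dirent *)&dirbuf[blksize - resid];
		for (dp = (struct dirent *)dirbuf; dp < edp;
		    dp = (struct dirent *)((char *)dp + dp->d_reclen)) {
			if (dp->d_reclen == 0)
				break;	/* short record ends the block */
			visit(dp);
		}
	}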
*/ static int ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp, int attrnamespace, struct thread *td) { struct vop_readdir_args vargs; struct dirent *dp, *edp; struct vnode *attr_vp; struct uio auio; struct iovec aiov; char *dirbuf; int error, eofflag = 0; if (dvp->v_type != VDIR) return (ENOTDIR); dirbuf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; vargs.a_gen.a_desc = NULL; vargs.a_vp = dvp; vargs.a_uio = &auio; vargs.a_cred = td->td_ucred; vargs.a_eofflag = &eofflag; vargs.a_ncookies = NULL; vargs.a_cookies = NULL; while (!eofflag) { auio.uio_resid = DIRBLKSIZ; aiov.iov_base = dirbuf; aiov.iov_len = DIRBLKSIZ; error = ufs_readdir(&vargs); if (error) { printf("ufs_extattr_iterate_directory: ufs_readdir " "%d\n", error); return (error); } edp = (struct dirent *)&dirbuf[DIRBLKSIZ - auio.uio_resid]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { if (dp->d_reclen == 0) break; error = ufs_extattr_lookup(dvp, UE_GETDIR_LOCKPARENT, dp->d_name, &attr_vp, td); if (error) { printf("ufs_extattr_iterate_directory: lookup " "%s %d\n", dp->d_name, error); } else if (attr_vp == dvp) { vrele(attr_vp); } else if (attr_vp->v_type != VREG) { vput(attr_vp); } else { error = ufs_extattr_enable_with_open(ump, attr_vp, attrnamespace, dp->d_name, td); vrele(attr_vp); if (error) { printf("ufs_extattr_iterate_directory: " "enable %s %d\n", dp->d_name, error); } else if (bootverbose) { printf("UFS autostarted EA %s\n", dp->d_name); } } dp = (struct dirent *) ((char *)dp + dp->d_reclen); if (dp >= edp) break; } } free(dirbuf, M_TEMP); return (0); } /* * Auto-start of extended attributes, to be executed (optionally) at * mount-time. */ int ufs_extattr_autostart(struct mount *mp, struct thread *td) { struct ufsmount *ump; int error; ump = VFSTOUFS(mp); ufs_extattr_uepm_lock(ump); error = ufs_extattr_autostart_locked(mp, td); ufs_extattr_uepm_unlock(ump); return (error); } static int ufs_extattr_autostart_locked(struct mount *mp, struct thread *td) { struct vnode *rvp, *attr_dvp, *attr_system_dvp, *attr_user_dvp; struct ufsmount *ump = VFSTOUFS(mp); int error; /* * UFS_EXTATTR applies only to UFS1, as UFS2 uses native extended * attributes, so don't autostart. */ if (ump->um_fstype != UFS1) return (0); /* * Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root? * If so, automatically start EA's. */ error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp); if (error) { printf("ufs_extattr_autostart.VFS_ROOT() returned %d\n", error); return (error); } error = ufs_extattr_lookup(rvp, UE_GETDIR_LOCKPARENT_DONT, UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, td); if (error) { /* rvp ref'd but now unlocked */ vrele(rvp); return (error); } if (rvp == attr_dvp) { /* Should never happen. */ vput(rvp); vrele(attr_dvp); return (EINVAL); } vrele(rvp); if (attr_dvp->v_type != VDIR) { printf("ufs_extattr_autostart: %s != VDIR\n", UFS_EXTATTR_FSROOTSUBDIR); goto return_vput_attr_dvp; } error = ufs_extattr_start_locked(ump, td); if (error) { printf("ufs_extattr_autostart: ufs_extattr_start failed (%d)\n", error); goto return_vput_attr_dvp; } /* * Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM, * UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory, * and start with appropriate type. Failures in either don't * result in an over-all failure. attr_dvp is left locked to * be cleaned up on exit. 
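 *
 * With the default macro values this amounts to an on-disk layout of
 * roughly (editor's illustration; the actual names come from the
 * UFS_EXTATTR_* macros in ufs/ufs/extattr.h):
 *
 *	/.attribute/system/<attrname>	-> EXTATTR_NAMESPACE_SYSTEM
 *	/.attribute/user/<attrname>	-> EXTATTR_NAMESPACE_USER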
*/ error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT, UFS_EXTATTR_SUBDIR_SYSTEM, &attr_system_dvp, td); if (!error) { error = ufs_extattr_iterate_directory(VFSTOUFS(mp), attr_system_dvp, EXTATTR_NAMESPACE_SYSTEM, td); if (error) printf("ufs_extattr_iterate_directory returned %d\n", error); vput(attr_system_dvp); } error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT, UFS_EXTATTR_SUBDIR_USER, &attr_user_dvp, td); if (!error) { error = ufs_extattr_iterate_directory(VFSTOUFS(mp), attr_user_dvp, EXTATTR_NAMESPACE_USER, td); if (error) printf("ufs_extattr_iterate_directory returned %d\n", error); vput(attr_user_dvp); } /* Mask startup failures in sub-directories. */ error = 0; return_vput_attr_dvp: vput(attr_dvp); return (error); } #endif /* !UFS_EXTATTR_AUTOSTART */ /* * Stop extended attribute support on an FS. */ int ufs_extattr_stop(struct mount *mp, struct thread *td) { struct ufs_extattr_list_entry *uele; struct ufsmount *ump = VFSTOUFS(mp); int error = 0; ufs_extattr_uepm_lock(ump); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { error = EOPNOTSUPP; goto unlock; } while ((uele = LIST_FIRST(&ump->um_extattr.uepm_list)) != NULL) { ufs_extattr_disable(ump, uele->uele_attrnamespace, uele->uele_attrname, td); } ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED; crfree(ump->um_extattr.uepm_ucred); ump->um_extattr.uepm_ucred = NULL; unlock: ufs_extattr_uepm_unlock(ump); return (error); } /* * Enable a named attribute on the specified filesystem; provide an * unlocked backing vnode to hold the attribute data. */ static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct vnode *backing_vnode, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct iovec aiov; struct uio auio; int error = 0; if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) return (EINVAL); if (backing_vnode->v_type != VREG) return (EINVAL); attribute = malloc(sizeof(struct ufs_extattr_list_entry), M_UFS_EXTATTR, M_WAITOK); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { error = EOPNOTSUPP; goto free_exit; } if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) { error = EEXIST; goto free_exit; } strncpy(attribute->uele_attrname, attrname, UFS_EXTATTR_MAXEXTATTRNAME); attribute->uele_attrnamespace = attrnamespace; bzero(&attribute->uele_fileheader, sizeof(struct ufs_extattr_fileheader)); attribute->uele_backing_vnode = backing_vnode; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (caddr_t) &attribute->uele_fileheader; aiov.iov_len = sizeof(struct ufs_extattr_fileheader); auio.uio_resid = sizeof(struct ufs_extattr_fileheader); auio.uio_offset = (off_t) 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; vn_lock(backing_vnode, LK_SHARED | LK_RETRY); error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto unlock_free_exit; if (auio.uio_resid != 0) { printf("ufs_extattr_enable: malformed attribute header\n"); error = EINVAL; goto unlock_free_exit; } if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) { printf("ufs_extattr_enable: invalid attribute header magic\n"); error = EINVAL; goto unlock_free_exit; } if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) { printf("ufs_extattr_enable: incorrect attribute header " "version\n"); error = EINVAL; goto unlock_free_exit; } ASSERT_VOP_LOCKED(backing_vnode, "ufs_extattr_enable"); LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute, uele_entries); 
VOP_UNLOCK(backing_vnode, 0); return (0); unlock_free_exit: VOP_UNLOCK(backing_vnode, 0); free_exit: free(attribute, M_UFS_EXTATTR); return (error); } /* * Disable extended attribute support on an FS. */ static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, const char *attrname, struct thread *td) { struct ufs_extattr_list_entry *uele; int error = 0; if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) return (EINVAL); uele = ufs_extattr_find_attr(ump, attrnamespace, attrname); if (!uele) return (ENOATTR); LIST_REMOVE(uele, uele_entries); vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY); ASSERT_VOP_LOCKED(uele->uele_backing_vnode, "ufs_extattr_disable"); VOP_UNLOCK(uele->uele_backing_vnode, 0); error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE, td->td_ucred, td); free(uele, M_UFS_EXTATTR); return (error); } /* * VFS call to manage extended attributes in UFS. If filename_vp is * non-NULL, it must be passed in locked, and regardless of errors in * processing, will be unlocked. */ int ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { struct ufsmount *ump = VFSTOUFS(mp); struct thread *td = curthread; int error; /* * Processes with privilege, but in jail, are not allowed to * configure extended attributes. */ error = priv_check(td, PRIV_UFS_EXTATTRCTL); if (error) { if (filename_vp != NULL) VOP_UNLOCK(filename_vp, 0); return (error); } /* * We only allow extattrctl(2) on UFS1 file systems, as UFS2 uses * native extended attributes. */ if (ump->um_fstype != UFS1) { if (filename_vp != NULL) VOP_UNLOCK(filename_vp, 0); return (EOPNOTSUPP); } switch(cmd) { case UFS_EXTATTR_CMD_START: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp, 0); return (EINVAL); } if (attrname != NULL) return (EINVAL); error = ufs_extattr_start(mp, td); return (error); case UFS_EXTATTR_CMD_STOP: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp, 0); return (EINVAL); } if (attrname != NULL) return (EINVAL); error = ufs_extattr_stop(mp, td); return (error); case UFS_EXTATTR_CMD_ENABLE: if (filename_vp == NULL) return (EINVAL); if (attrname == NULL) { VOP_UNLOCK(filename_vp, 0); return (EINVAL); } /* * ufs_extattr_enable_with_open() will always unlock the * vnode, regardless of failure. */ ufs_extattr_uepm_lock(ump); error = ufs_extattr_enable_with_open(ump, filename_vp, attrnamespace, attrname, td); ufs_extattr_uepm_unlock(ump); return (error); case UFS_EXTATTR_CMD_DISABLE: if (filename_vp != NULL) { VOP_UNLOCK(filename_vp, 0); return (EINVAL); } if (attrname == NULL) return (EINVAL); ufs_extattr_uepm_lock(ump); error = ufs_extattr_disable(ump, attrnamespace, attrname, td); ufs_extattr_uepm_unlock(ump); return (error); default: return (EINVAL); } } /* * Vnode operating to retrieve a named extended attribute. */ int ufs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; ufs_extattr_uepm_lock(ump); error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size, ap->a_cred, ap->a_td); ufs_extattr_uepm_unlock(ump); return (error); } /* * Real work associated with retrieving a named attribute--assumes that * the attribute lock has already been grabbed. 
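 *
 * The backing file is laid out as one fixed-size slot per inode, which
 * is what the offset computation below relies on (editor's summary):
 *
 *	base_offset = sizeof(struct ufs_extattr_fileheader) +
 *	    ip->i_number * (sizeof(struct ufs_extattr_header) +
 *	    attribute->uele_fileheader.uef_size);
 *
 * For illustration only, with hypothetical sizes of 16 and 8 bytes and
 * uef_size = 64, inode 10's data header would start at byte
 * 16 + 10 * (8 + 64) = 736.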
*/ static int ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, size_t *size, struct ucred *cred, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; size_t len, old_len; int error = 0; if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return (EOPNOTSUPP); if (strlen(name) == 0) return (EINVAL); error = extattr_check_cred(vp, attrnamespace, cred, td, VREAD); if (error) return (error); attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return (ENOATTR); /* * Allow only offsets of zero to encourage the read/replace * extended attribute semantic. Otherwise we can't guarantee * atomicity, as we don't provide locks for extended attributes. */ if (uio != NULL && uio->uio_offset != 0) return (ENXIO); /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Read in the data header to see if the data is defined, and if so * how much. */ bzero(&ueh, sizeof(struct ufs_extattr_header)); local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_READ; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); /* * Acquire locks. * * Don't need to get a lock on the backing file if the getattr is * being applied to the backing file, as the lock is already held. */ if (attribute->uele_backing_vnode != vp) vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY); error = VOP_READ(attribute->uele_backing_vnode, &local_aio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; /* Defined? */ if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) { error = ENOATTR; goto vopunlock_exit; } /* Valid for the current inode generation? */ if (ueh.ueh_i_gen != ip->i_gen) { /* * The inode itself has a different generation number * than the attribute data. For now, the best solution * is to coerce this to undefined, and let it get cleaned * up by the next write or extattrctl clean. */ printf("ufs_extattr_get (%s): inode number inconsistency (%d, %ju)\n", mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (uintmax_t)ip->i_gen); error = ENOATTR; goto vopunlock_exit; } /* Local size consistency check. */ if (ueh.ueh_len > attribute->uele_fileheader.uef_size) { error = ENXIO; goto vopunlock_exit; } /* Return full data size if caller requested it. */ if (size != NULL) *size = ueh.ueh_len; /* Return data if the caller requested it. */ if (uio != NULL) { /* Allow for offset into the attribute data. */ uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); /* * Figure out maximum to transfer -- use buffer size and * local data limit. 
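 *
 * Editor's worked example: for a 100-byte attribute (ueh.ueh_len ==
 * 100) and a 40-byte caller buffer (uio->uio_resid == 40), len =
 * MIN(40, 100) = 40 bytes are read; afterwards uio_resid is rewritten
 * as old_len - (len - uio->uio_resid), so a short read is reflected
 * back to the caller.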
*/ len = MIN(uio->uio_resid, ueh.ueh_len); old_len = uio->uio_resid; uio->uio_resid = len; error = VOP_READ(attribute->uele_backing_vnode, uio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; uio->uio_resid = old_len - (len - uio->uio_resid); } vopunlock_exit: if (uio != NULL) uio->uio_offset = 0; if (attribute->uele_backing_vnode != vp) VOP_UNLOCK(attribute->uele_backing_vnode, 0); return (error); } /* * Vnode operation to remove a named attribute. */ int ufs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; ufs_extattr_uepm_lock(ump); error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_cred, ap->a_td); ufs_extattr_uepm_unlock(ump); return (error); } /* * Vnode operation to set a named attribute. */ int ufs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct mount *mp = ap->a_vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); int error; /* * XXX: No longer a supported way to delete extended attributes. */ if (ap->a_uio == NULL) return (EINVAL); ufs_extattr_uepm_lock(ump); error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_cred, ap->a_td); ufs_extattr_uepm_unlock(ump); return (error); } /* * Real work associated with setting a vnode's extended attributes; * assumes that the attribute lock has already been grabbed. */ static int ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, struct uio *uio, struct ucred *cred, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; int error = 0, ioflag; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return (EOPNOTSUPP); if (!ufs_extattr_valid_attrname(attrnamespace, name)) return (EINVAL); error = extattr_check_cred(vp, attrnamespace, cred, td, VWRITE); if (error) return (error); attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return (ENOATTR); /* * Early rejection of invalid offsets/length. * Reject: any offset but 0 (replace) * Any size greater than attribute size limit */ if (uio->uio_offset != 0 || uio->uio_resid > attribute->uele_fileheader.uef_size) return (ENXIO); /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Write out a data header for the data. */ ueh.ueh_len = uio->uio_resid; ueh.ueh_flags = UFS_EXTATTR_ATTR_FLAG_INUSE; ueh.ueh_i_gen = ip->i_gen; local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_WRITE; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); /* * Acquire locks. 
* * Don't need to get a lock on the backing file if the setattr is * being applied to the backing file, as the lock is already held. */ if (attribute->uele_backing_vnode != vp) vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; if (local_aio.uio_resid != 0) { error = ENXIO; goto vopunlock_exit; } /* * Write out user data. */ uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag, ump->um_extattr.uepm_ucred); vopunlock_exit: uio->uio_offset = 0; if (attribute->uele_backing_vnode != vp) VOP_UNLOCK(attribute->uele_backing_vnode, 0); return (error); } /* * Real work associated with removing an extended attribute from a vnode. * Assumes the attribute lock has already been grabbed. */ static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, struct ucred *cred, struct thread *td) { struct ufs_extattr_list_entry *attribute; struct ufs_extattr_header ueh; struct iovec local_aiov; struct uio local_aio; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); struct inode *ip = VTOI(vp); off_t base_offset; int error = 0, ioflag; if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) return (EOPNOTSUPP); if (!ufs_extattr_valid_attrname(attrnamespace, name)) return (EINVAL); error = extattr_check_cred(vp, attrnamespace, cred, td, VWRITE); if (error) return (error); attribute = ufs_extattr_find_attr(ump, attrnamespace, name); if (!attribute) return (ENOATTR); /* * Find base offset of header in file based on file header size, and * data header size + maximum data size, indexed by inode number. */ base_offset = sizeof(struct ufs_extattr_fileheader) + ip->i_number * (sizeof(struct ufs_extattr_header) + attribute->uele_fileheader.uef_size); /* * Check to see if currently defined. */ bzero(&ueh, sizeof(struct ufs_extattr_header)); local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_READ; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); /* * Don't need to get the lock on the backing vnode if the vnode we're * modifying is it, as we already hold the lock. */ if (attribute->uele_backing_vnode != vp) vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(attribute->uele_backing_vnode, &local_aio, IO_NODELOCKED, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; /* Defined? */ if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) { error = ENOATTR; goto vopunlock_exit; } /* Valid for the current inode generation? */ if (ueh.ueh_i_gen != ip->i_gen) { /* * The inode itself has a different generation number than * the attribute data. For now, the best solution is to * coerce this to undefined, and let it get cleaned up by * the next write or extattrctl clean. */ printf("ufs_extattr_rm (%s): inode number inconsistency (%d, %jd)\n", mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (intmax_t)ip->i_gen); error = ENOATTR; goto vopunlock_exit; } /* Flag it as not in use. 
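 * (Note that removal is a soft delete: only this header is rewritten
 * with the in-use flag and length cleared; the stale attribute bytes
 * stay in the backing file until the slot is reused by a later
 * ufs_extattr_set().)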
*/ ueh.ueh_flags = 0; ueh.ueh_len = 0; local_aiov.iov_base = (caddr_t) &ueh; local_aiov.iov_len = sizeof(struct ufs_extattr_header); local_aio.uio_iov = &local_aiov; local_aio.uio_iovcnt = 1; local_aio.uio_rw = UIO_WRITE; local_aio.uio_segflg = UIO_SYSSPACE; local_aio.uio_td = td; local_aio.uio_offset = base_offset; local_aio.uio_resid = sizeof(struct ufs_extattr_header); ioflag = IO_NODELOCKED; if (ufs_extattr_sync) ioflag |= IO_SYNC; error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, ump->um_extattr.uepm_ucred); if (error) goto vopunlock_exit; if (local_aio.uio_resid != 0) error = ENXIO; vopunlock_exit: VOP_UNLOCK(attribute->uele_backing_vnode, 0); return (error); } /* * Called by UFS when an inode is no longer active and should have its * attributes stripped. */ void ufs_extattr_vnode_inactive(struct vnode *vp, struct thread *td) { struct ufs_extattr_list_entry *uele; struct mount *mp = vp->v_mount; struct ufsmount *ump = VFSTOUFS(mp); /* * In that case, we cannot lock. We should not have any active vnodes * on the fs if this is not yet initialized but is going to be, so * this can go unlocked. */ if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) return; ufs_extattr_uepm_lock(ump); if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { ufs_extattr_uepm_unlock(ump); return; } LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) ufs_extattr_rm(vp, uele->uele_attrnamespace, uele->uele_attrname, NULL, td); ufs_extattr_uepm_unlock(ump); } #endif /* !UFS_EXTATTR */ Index: user/ngie/bsnmp_cleanup/usr.bin/xlint/lint1/decl.c =================================================================== --- user/ngie/bsnmp_cleanup/usr.bin/xlint/lint1/decl.c (revision 298467) +++ user/ngie/bsnmp_cleanup/usr.bin/xlint/lint1/decl.c (revision 298468) @@ -1,3052 +1,3052 @@ /* $NetBSD: decl.c,v 1.33 2004/06/20 22:20:16 jmc Exp $ */ /* * Copyright (c) 1996 Christopher G. Demetriou. All Rights Reserved. * Copyright (c) 1994, 1995 Jochen Pohl * All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Jochen Pohl for * The NetBSD Project. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include

#if defined(__RCSID) && !defined(lint)
__RCSID("$NetBSD: decl.c,v 1.33 2004/06/20 22:20:16 jmc Exp $");
#endif
__FBSDID("$FreeBSD$");

#include
#include
#include

#include "lint1.h"

const	char *unnamed = "";

/* shared type structures for arithmetic types and void */
static	type_t	*typetab;

/* value of next enumerator during declaration of enum types */
int	enumval;

/*
 * pointer to top element of a stack which contains information local
 * to nested declarations
 */
dinfo_t	*dcs;

static	type_t	*tdeferr(type_t *, tspec_t);
static	void	settdsym(type_t *, sym_t *);
static	tspec_t	mrgtspec(tspec_t, tspec_t);
static	void	align(int, int);
static	sym_t	*newtag(sym_t *, scl_t, int, int);
static	int	eqargs(type_t *, type_t *, int *);
static	int	mnoarg(type_t *, int *);
static	int	chkosdef(sym_t *, sym_t *);
static	int	chkptdecl(sym_t *, sym_t *);
static	sym_t	*nsfunc(sym_t *, sym_t *);
static	void	osfunc(sym_t *, sym_t *);
static	void	ledecl(sym_t *);
static	int	chkinit(sym_t *);
static	void	chkausg(int, sym_t *);
static	void	chkvusg(int, sym_t *);
static	void	chklusg(sym_t *);
static	void	chktusg(sym_t *);
static	void	chkglvar(sym_t *);
static	void	glchksz(sym_t *);

/*
 * initializes all global vars used in declarations
 */
void
initdecl(void)
{
	int i;

	/* declaration stack */
	if ((dcs = calloc(1, sizeof (dinfo_t))) == NULL)
		nomem();
	dcs->d_ctx = EXTERN;
	dcs->d_ldlsym = &dcs->d_dlsyms;

	/* type information and classification */
	inittyp();

	/* shared type structures */
	if ((typetab = calloc(NTSPEC, sizeof (type_t))) == NULL)
		nomem();
	for (i = 0; i < NTSPEC; i++)
		typetab[i].t_tspec = NOTSPEC;
	typetab[CHAR].t_tspec = CHAR;
	typetab[SCHAR].t_tspec = SCHAR;
	typetab[UCHAR].t_tspec = UCHAR;
	typetab[SHORT].t_tspec = SHORT;
	typetab[USHORT].t_tspec = USHORT;
	typetab[INT].t_tspec = INT;
	typetab[UINT].t_tspec = UINT;
	typetab[LONG].t_tspec = LONG;
	typetab[ULONG].t_tspec = ULONG;
	typetab[QUAD].t_tspec = QUAD;
	typetab[UQUAD].t_tspec = UQUAD;
	typetab[FLOAT].t_tspec = FLOAT;
	typetab[DOUBLE].t_tspec = DOUBLE;
	typetab[LDOUBLE].t_tspec = LDOUBLE;
	typetab[VOID].t_tspec = VOID;
	/*
	 * The next two are not real types. They are only used by the
	 * parser to return the keywords "signed" and "unsigned".
	 */
	typetab[SIGNED].t_tspec = SIGNED;
	typetab[UNSIGN].t_tspec = UNSIGN;
}

/*
 * Returns a shared type structure for arithmetic types and void.
 *
 * It's important to duplicate this structure (using duptyp() or tduptyp())
 * if it is to be modified (adding qualifiers or anything else).
 */
type_t *
gettyp(tspec_t t)
{

	return (&typetab[t]);
}

type_t *
duptyp(const type_t *tp)
{
	type_t	*ntp;

	ntp = getblk(sizeof (type_t));
	STRUCT_ASSIGN(*ntp, *tp);
	return (ntp);
}

/*
 * Use tduptyp() instead of duptyp() inside expressions (if the
 * allocated memory should be freed after the expr).
 */
type_t *
tduptyp(const type_t *tp)
{
	type_t	*ntp;

	ntp = tgetblk(sizeof (type_t));
	STRUCT_ASSIGN(*ntp, *tp);
	return (ntp);
}

/*
 * Returns 1 if the argument is void or an incomplete array,
 * struct, union or enum type.
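 *
 * Editor's examples: after "struct s;" the struct type is incomplete
 * until a matching definition with a member list is seen, and
 * "extern int a[];" yields an incomplete array type; both make
 * incompl() return 1.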
*/ int incompl(type_t *tp) { tspec_t t; if ((t = tp->t_tspec) == VOID) { return (1); } else if (t == ARRAY) { return (tp->t_aincompl); } else if (t == STRUCT || t == UNION) { return (tp->t_str->sincompl); } else if (t == ENUM) { return (tp->t_enum->eincompl); } return (0); } /* * Set the flag for (in)complete array, struct, union or enum * types. */ void setcompl(type_t *tp, int ic) { tspec_t t; if ((t = tp->t_tspec) == ARRAY) { tp->t_aincompl = ic; } else if (t == STRUCT || t == UNION) { tp->t_str->sincompl = ic; } else { if (t != ENUM) LERROR("setcompl()"); tp->t_enum->eincompl = ic; } } /* * Remember the storage class of the current declaration in dcs->d_scl * (the top element of the declaration stack) and detect multiple * storage classes. */ void addscl(scl_t sc) { if (sc == INLINE) { if (dcs->d_inline) /* duplicate '%s' */ warning(10, "inline"); dcs->d_inline = 1; return; } if (dcs->d_type != NULL || dcs->d_atyp != NOTSPEC || dcs->d_smod != NOTSPEC || dcs->d_lmod != NOTSPEC) { /* storage class after type is obsolescent */ warning(83); } if (dcs->d_scl == NOSCL) { dcs->d_scl = sc; } else { /* * multiple storage classes. An error will be reported in * deftyp(). */ dcs->d_mscl = 1; } } /* * Remember the type, modifier or typedef name returned by the parser * in *dcs (top element of decl stack). This information is used in * deftyp() to build the type used for all declarators in this * declaration. * * Is tp->t_typedef 1, the type comes from a previously defined typename. * Otherwise it comes from a type specifier (int, long, ...) or a * struct/union/enum tag. */ void addtype(type_t *tp) { tspec_t t; if (tp->t_typedef) { if (dcs->d_type != NULL || dcs->d_atyp != NOTSPEC || dcs->d_lmod != NOTSPEC || dcs->d_smod != NOTSPEC) { /* * something like "typedef int a; int a b;" * This should not happen with current grammar. */ LERROR("addtype()"); } dcs->d_type = tp; return; } t = tp->t_tspec; if (t == STRUCT || t == UNION || t == ENUM) { /* * something like "int struct a ..." * struct/union/enum with anything else is not allowed */ if (dcs->d_type != NULL || dcs->d_atyp != NOTSPEC || dcs->d_lmod != NOTSPEC || dcs->d_smod != NOTSPEC) { /* * remember that an error must be reported in * deftyp(). */ dcs->d_terr = 1; dcs->d_atyp = dcs->d_lmod = dcs->d_smod = NOTSPEC; } dcs->d_type = tp; return; } if (dcs->d_type != NULL && !dcs->d_type->t_typedef) { /* * something like "struct a int" * struct/union/enum with anything else is not allowed */ dcs->d_terr = 1; return; } if (t == LONG && dcs->d_lmod == LONG) { /* "long long" or "long ... long" */ t = QUAD; dcs->d_lmod = NOTSPEC; if (!quadflg) /* %s C does not support 'long long' */ (void)c99ism(265, tflag ? "traditional" : "c89"); } if (dcs->d_type != NULL && dcs->d_type->t_typedef) { /* something like "typedef int a; a long ..." 
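 * For instance (editor's illustration):
 *	typedef int t;
 *	t long x;
 * ends up here with t == LONG; tdeferr() then rewrites the typedef'ed
 * int to long and issues warning 5 ("modifying typedef with ...").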
*/ dcs->d_type = tdeferr(dcs->d_type, t); return; } /* now it can be only a combination of arithmetic types and void */ if (t == SIGNED || t == UNSIGN) { /* remember specifiers "signed" and "unsigned" in dcs->d_smod */ if (dcs->d_smod != NOTSPEC) /* * more than one "signed" and/or "unsigned"; print * an error in deftyp() */ dcs->d_terr = 1; dcs->d_smod = t; } else if (t == SHORT || t == LONG || t == QUAD) { /* * remember specifiers "short", "long" and "long long" in * dcs->d_lmod */ if (dcs->d_lmod != NOTSPEC) /* more than one, print error in deftyp() */ dcs->d_terr = 1; dcs->d_lmod = t; } else { /* * remember specifiers "void", "char", "int", "float" or * "double" int dcs->d_atyp */ if (dcs->d_atyp != NOTSPEC) /* more than one, print error in deftyp() */ dcs->d_terr = 1; dcs->d_atyp = t; } } /* * called if a list of declaration specifiers contains a typedef name * and other specifiers (except struct, union, enum, typedef name) */ static type_t * tdeferr(type_t *td, tspec_t t) { tspec_t t2; t2 = td->t_tspec; switch (t) { case SIGNED: case UNSIGN: if (t2 == CHAR || t2 == SHORT || t2 == INT || t2 == LONG || t2 == QUAD) { if (!tflag) /* modifying typedef with ... */ warning(5, ttab[t].tt_name); td = duptyp(gettyp(mrgtspec(t2, t))); td->t_typedef = 1; return (td); } break; case SHORT: if (t2 == INT || t2 == UINT) { /* modifying typedef with ... */ warning(5, "short"); td = duptyp(gettyp(t2 == INT ? SHORT : USHORT)); td->t_typedef = 1; return (td); } break; case LONG: if (t2 == INT || t2 == UINT || t2 == LONG || t2 == ULONG || t2 == FLOAT || t2 == DOUBLE) { /* modifying typedef with ... */ warning(5, "long"); if (t2 == INT) { td = gettyp(LONG); } else if (t2 == UINT) { td = gettyp(ULONG); } else if (t2 == LONG) { td = gettyp(QUAD); } else if (t2 == ULONG) { td = gettyp(UQUAD); } else if (t2 == FLOAT) { td = gettyp(DOUBLE); } else if (t2 == DOUBLE) { td = gettyp(LDOUBLE); } td = duptyp(td); td->t_typedef = 1; return (td); } break; /* LINTED (enumeration values not handled in switch) */ case NOTSPEC: case USHORT: case UCHAR: case SCHAR: case CHAR: case FUNC: case ARRAY: case PTR: case ENUM: case UNION: case STRUCT: case VOID: case LDOUBLE: case DOUBLE: case FLOAT: case UQUAD: case QUAD: case ULONG: case UINT: case INT: break; } /* Anything other is not accepted. */ dcs->d_terr = 1; return (td); } /* * Remember the symbol of a typedef name (2nd arg) in a struct, union * or enum tag if the typedef name is the first defined for this tag. * * If the tag is unnamed, the typdef name is used for identification * of this tag in lint2. Although its possible that more than one typedef * name is defined for one tag, the first name defined should be unique * if the tag is unnamed. */ static void settdsym(type_t *tp, sym_t *sym) { tspec_t t; if ((t = tp->t_tspec) == STRUCT || t == UNION) { if (tp->t_str->stdef == NULL) tp->t_str->stdef = sym; } else if (t == ENUM) { if (tp->t_enum->etdef == NULL) tp->t_enum->etdef = sym; } } /* * Remember a qualifier which is part of the declaration specifiers * (and not the declarator) in the top element of the declaration stack. * Also detect multiple qualifiers of the same kind. * The remembered qualifier is used by deftyp() to construct the type * for all declarators. 
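 *
 * Editor's examples: "const const int c;" draws warning 10
 * (duplicate "%s"), while in "const int i, *p;" the remembered
 * qualifier ends up in dcs->d_type and thus applies to i as well as
 * to the type p points to.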
 */
void
addqual(tqual_t q)
{

	if (q == CONST) {
		if (dcs->d_const) {
			/* duplicate "%s" */
			warning(10, "const");
		}
		dcs->d_const = 1;
	} else {
		if (q != VOLATILE)
			LERROR("addqual()");
		if (dcs->d_volatile) {
			/* duplicate "%s" */
			warning(10, "volatile");
		}
		dcs->d_volatile = 1;
	}
}

/*
 * Go to the next declaration level (structs, nested structs, blocks,
 * argument declaration lists ...)
 */
void
pushdecl(scl_t sc)
{
	dinfo_t	*di;

	if (dflag)
		(void)printf("pushdecl(%d)\n", (int)sc);

	/* put a new element on the declaration stack */
	if ((di = calloc(1, sizeof (dinfo_t))) == NULL)
		nomem();
	di->d_nxt = dcs;
	dcs = di;
	di->d_ctx = sc;
	di->d_ldlsym = &di->d_dlsyms;
}

/*
 * Go back to previous declaration level
 */
void
popdecl(void)
{
	dinfo_t	*di;

	if (dflag)
		(void)printf("popdecl(%d)\n", (int)dcs->d_ctx);

	if (dcs->d_nxt == NULL)
		LERROR("popdecl()");
	di = dcs;
	dcs = di->d_nxt;
	switch (di->d_ctx) {
	case EXTERN:
		/* there is nothing after external declarations */
		LERROR("popdecl()");
		/* NOTREACHED */
	case MOS:
	case MOU:
	case ENUMCON:
		/*
		 * Symbols declared in (nested) structs or enums are
		 * part of the next level (they are removed from the
		 * symbol table if the symbols of the outer level are
		 * removed)
		 */
		if ((*dcs->d_ldlsym = di->d_dlsyms) != NULL)
			dcs->d_ldlsym = di->d_ldlsym;
		break;
	case ARG:
		/*
		 * All symbols in dcs->d_dlsyms are introduced in old style
		 * argument declarations (it's not clean, but possible).
		 * They are appended to the list of symbols declared in
		 * an old style argument identifier list or a new style
		 * parameter type list.
		 */
		if (di->d_dlsyms != NULL) {
			*di->d_ldlsym = dcs->d_fpsyms;
			dcs->d_fpsyms = di->d_dlsyms;
		}
		break;
	case ABSTRACT:
		/*
		 * casts and sizeof
		 * Append all symbols declared in the abstract declaration
		 * to the list of symbols declared in the surrounding decl.
		 * or block.
		 * XXX I'm not sure whether they should be removed from the
		 * symbol table now or later.
		 */
		if ((*dcs->d_ldlsym = di->d_dlsyms) != NULL)
			dcs->d_ldlsym = di->d_ldlsym;
		break;
	case AUTO:
		/* check usage of local vars */
		chkusage(di);
		/* FALLTHROUGH */
	case PARG:
		/* usage of arguments will be checked by funcend() */
		rmsyms(di->d_dlsyms);
		break;
	default:
		LERROR("popdecl()");
	}
	free(di);
}

/*
 * Set flag d_asm in all declaration stack elements up to the
 * outermost one.
 *
 * This is used to mark compound statements which have, possibly in
 * nested compound statements, asm statements. For these compound
 * statements no warnings about unused or uninitialized variables are
 * printed.
 *
 * There is no need to clear d_asm in dinfo structs with context AUTO,
 * because these structs are freed at the end of the compound statement.
 * But it must be cleared in the outermost dinfo struct, which has
 * context EXTERN. This could be done in clrtyp() and would work for
 * C, but not for C++ (due to mixed statements and declarations). Thus
 * we clear it in glclup(), which is used to do some cleanup after
 * global declarations/definitions.
 */
void
setasm(void)
{
	dinfo_t	*di;

	for (di = dcs; di != NULL; di = di->d_nxt)
		di->d_asm = 1;
}

/*
 * Clean all elements of the top element of declaration stack which
 * will be used by the next declaration
 */
void
clrtyp(void)
{

	dcs->d_atyp = dcs->d_smod = dcs->d_lmod = NOTSPEC;
	dcs->d_scl = NOSCL;
	dcs->d_type = NULL;
	dcs->d_const = dcs->d_volatile = 0;
	dcs->d_inline = 0;
	dcs->d_mscl = dcs->d_terr = 0;
	dcs->d_nedecl = 0;
	dcs->d_notyp = 0;
}

/*
 * Create a type structure from the information gathered in
 * the declaration stack.
 * Complain about storage classes which are not possible in current
 * context.
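 *
 * Editor's examples of the merging done here:
 *
 *	(no specifier)	-> int			(d_notyp is set)
 *	unsigned	-> unsigned int
 *	long float	-> double		(warning 6, unless -t)
 *	long double	-> long double		(warning 266 under -t)
 *	short char	-> illegal type combination (error 4)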
*/ void deftyp(void) { tspec_t t, s, l; type_t *tp; scl_t scl; t = dcs->d_atyp; /* CHAR, INT, FLOAT, DOUBLE, VOID */ s = dcs->d_smod; /* SIGNED, UNSIGNED */ l = dcs->d_lmod; /* SHORT, LONG, QUAD */ tp = dcs->d_type; scl = dcs->d_scl; if (t == NOTSPEC && s == NOTSPEC && l == NOTSPEC && tp == NULL) dcs->d_notyp = 1; if (tp != NULL && (t != NOTSPEC || s != NOTSPEC || l != NOTSPEC)) { /* should never happen */ LERROR("deftyp()"); } if (tp == NULL) { switch (t) { case NOTSPEC: t = INT; /* FALLTHROUGH */ case INT: if (s == NOTSPEC) s = SIGNED; break; case CHAR: if (l != NOTSPEC) { dcs->d_terr = 1; l = NOTSPEC; } break; case FLOAT: if (l == LONG) { l = NOTSPEC; t = DOUBLE; if (!tflag) /* use 'double' instead of ... */ warning(6); } break; case DOUBLE: if (l == LONG) { l = NOTSPEC; t = LDOUBLE; if (tflag) /* 'long double' is illegal in ... */ warning(266); } break; case VOID: break; default: LERROR("deftyp()"); } if (t != INT && t != CHAR && (s != NOTSPEC || l != NOTSPEC)) { dcs->d_terr = 1; l = s = NOTSPEC; } if (l != NOTSPEC) t = l; dcs->d_type = gettyp(mrgtspec(t, s)); } if (dcs->d_mscl) { /* only one storage class allowed */ error(7); } if (dcs->d_terr) { /* illegal type combination */ error(4); } if (dcs->d_ctx == EXTERN) { if (scl == REG || scl == AUTO) { /* illegal storage class */ error(8); scl = NOSCL; } } else if (dcs->d_ctx == ARG || dcs->d_ctx == PARG) { if (scl != NOSCL && scl != REG) { /* only "register" valid ... */ error(9); scl = NOSCL; } } dcs->d_scl = scl; if (dcs->d_const && dcs->d_type->t_const) { if (!dcs->d_type->t_typedef) LERROR("deftyp()"); /* typedef already qualified with "%s" */ warning(68, "const"); } if (dcs->d_volatile && dcs->d_type->t_volatile) { if (!dcs->d_type->t_typedef) LERROR("deftyp()"); /* typedef already qualified with "%s" */ warning(68, "volatile"); } if (dcs->d_const || dcs->d_volatile) { dcs->d_type = duptyp(dcs->d_type); dcs->d_type->t_const |= dcs->d_const; dcs->d_type->t_volatile |= dcs->d_volatile; } } /* * Merge type specifiers (char, ..., long long, signed, unsigned). */ static tspec_t mrgtspec(tspec_t t, tspec_t s) { if (s == SIGNED || s == UNSIGN) { if (t == CHAR) { t = s == SIGNED ? SCHAR : UCHAR; } else if (t == SHORT) { t = s == SIGNED ? SHORT : USHORT; } else if (t == INT) { t = s == SIGNED ? INT : UINT; } else if (t == LONG) { t = s == SIGNED ? LONG : ULONG; } else if (t == QUAD) { t = s == SIGNED ? QUAD : UQUAD; } } return (t); } /* * Return the length of a type in bit. * * Printing a message if the outhermost dimension of an array is 0 must * be done by the caller. All other problems are reported by length() * if name is not NULL. */ int length(type_t *tp, const char *name) { int elem, elsz; elem = 1; while (tp && tp->t_tspec == ARRAY) { elem *= tp->t_dim; tp = tp->t_subt; } if (tp == NULL) return -1; switch (tp->t_tspec) { case FUNC: /* compiler takes size of function */ LERROR("%s", msgs[12]); /* NOTREACHED */ case STRUCT: case UNION: if (incompl(tp) && name != NULL) { /* incomplete structure or union %s: %s */ error(31, tp->t_str->stag->s_name, name); } elsz = tp->t_str->size; break; case ENUM: if (incompl(tp) && name != NULL) { /* incomplete enum type: %s */ warning(13, name); } /* FALLTHROUGH */ default: elsz = size(tp->t_tspec); if (elsz <= 0) LERROR("length()"); break; } return (elem * elsz); } /* * Get the alignment of the given type in bits. 
*/ int getbound(type_t *tp) { int a; tspec_t t; while (tp && tp->t_tspec == ARRAY) tp = tp->t_subt; if (tp == NULL) return -1; if ((t = tp->t_tspec) == STRUCT || t == UNION) { a = tp->t_str->align; } else if (t == FUNC) { /* compiler takes alignment of function */ error(14); a = LINT_ALIGN(1) * CHAR_BIT; } else { if ((a = size(t)) == 0) { a = CHAR_BIT; } else if (a > LINT_ALIGN(1) * CHAR_BIT) { a = LINT_ALIGN(1) * CHAR_BIT; } } if (a < CHAR_BIT || a > LINT_ALIGN(1) * CHAR_BIT) LERROR("getbound()"); return (a); } /* * Concatenate two lists of symbols by s_nxt. Used by declarations of * struct/union/enum elements and parameters. */ sym_t * lnklst(sym_t *l1, sym_t *l2) { sym_t *l; if ((l = l1) == NULL) return (l2); while (l1->s_nxt != NULL) l1 = l1->s_nxt; l1->s_nxt = l2; return (l); } /* * Check if the type of the given symbol is valid and print an error * message if it is not. * * Invalid types are: * - arrays of incomlete types or functions * - functions returning arrays or functions * - void types other than type of function or pointer */ void chktyp(sym_t *sym) { tspec_t to, t; type_t **tpp, *tp; tpp = &sym->s_type; to = NOTSPEC; while ((tp = *tpp) != NULL) { t = tp->t_tspec; /* * If this is the type of an old style function definition, * a better warning is printed in funcdef(). */ if (t == FUNC && !tp->t_proto && !(to == NOTSPEC && sym->s_osdef)) { if (sflag && hflag) /* function declaration is not a prototype */ warning(287); } if (to == FUNC) { if (t == FUNC || t == ARRAY) { /* function returns illegal type */ error(15); if (t == FUNC) { *tpp = incref(*tpp, PTR); } else { *tpp = incref((*tpp)->t_subt, PTR); } return; } else if (tp->t_const || tp->t_volatile) { if (sflag) { /* XXX oder better !tflag ? */ /* function cannot return const... */ warning(228); } } } if (to == ARRAY) { if (t == FUNC) { /* array of function is illegal */ error(16); *tpp = gettyp(INT); return; } else if (t == ARRAY && tp->t_dim == 0) { /* null dimension */ error(17); return; } else if (t == VOID) { /* illegal use of void */ error(18); *tpp = gettyp(INT); #if 0 /* errors are produced by length() */ } else if (incompl(tp)) { /* array of incomplete type */ if (sflag) { error(301); } else { warning(301); } #endif } } else if (to == NOTSPEC && t == VOID) { if (dcs->d_ctx == PARG) { if (sym->s_scl != ABSTRACT) { if (sym->s_name == unnamed) LERROR("chktyp()"); /* void param cannot have name: %s */ error(61, sym->s_name); *tpp = gettyp(INT); } } else if (dcs->d_ctx == ABSTRACT) { /* ok */ } else if (sym->s_scl != TYPEDEF) { /* void type for %s */ error(19, sym->s_name); *tpp = gettyp(INT); } } if (t == VOID && to != PTR) { if (tp->t_const || tp->t_volatile) { /* inappropriate qualifiers with "void" */ warning(69); tp->t_const = tp->t_volatile = 0; } } tpp = &tp->t_subt; to = t; } } /* * Process the declarator of a struct/union element. 
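 *
 * Editor's note on the bit-field checks below: only "int" and
 * "unsigned int" fields are portable, so e.g.
 *	struct s { char c:3; };
 * draws warning 273 (with -s) or 34 (with -p) unless BITFIELDTYPE is
 * in effect, and a non-integer type such as "float f:4;" is always
 * error 35.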
*/ sym_t * decl1str(sym_t *dsym) { type_t *tp; tspec_t t; int sz, len; int o = 0; /* Appease gcc */ scl_t sc; if ((sc = dsym->s_scl) != MOS && sc != MOU) LERROR("decl1str()"); if (dcs->d_rdcsym != NULL) { if ((sc = dcs->d_rdcsym->s_scl) != MOS && sc != MOU) /* should be ensured by storesym() */ LERROR("decl1str()"); if (dsym->s_styp == dcs->d_rdcsym->s_styp) { /* duplicate member name: %s */ error(33, dsym->s_name); rmsym(dcs->d_rdcsym); } } chktyp(dsym); t = (tp = dsym->s_type)->t_tspec; if (dsym->s_field) { /* * bit field * * only unsigned and signed int are portable bit-field types * (at least in ANSI C, in traditional C only unsigned int) */ if (t == CHAR || t == UCHAR || t == SCHAR || t == SHORT || t == USHORT || t == ENUM) { if (bitfieldtype_ok == 0) { if (sflag) { char buf[64]; /* * bit-field type '%s' invalid in * ANSI C */ warning(273, tyname(buf, sizeof(buf), tp)); } else if (pflag) { /* nonportable bit-field type */ warning(34); } } } else if (t == INT && dcs->d_smod == NOTSPEC) { if (pflag && bitfieldtype_ok == 0) { /* nonportable bit-field type */ warning(34); } } else if (t != INT && t != UINT) { /* * Non-integer types are always illegal for * bitfields, regardless of BITFIELDTYPE. * Integer types not dealt with above are * okay only if BITFIELDTYPE is in effect. */ if (bitfieldtype_ok == 0 || isityp(t) == 0) { /* illegal bit-field type */ error(35); sz = tp->t_flen; dsym->s_type = tp = duptyp(gettyp(t = INT)); if ((tp->t_flen = sz) > size(t)) tp->t_flen = size(t); } } if ((len = tp->t_flen) < 0 || len > size(t)) { /* illegal bit-field size */ error(36); tp->t_flen = size(t); } else if (len == 0 && dsym->s_name != unnamed) { /* zero size bit-field */ error(37); tp->t_flen = size(t); } if (dsym->s_scl == MOU) { /* illegal use of bit-field */ error(41); dsym->s_type->t_isfield = 0; dsym->s_field = 0; } } else if (t == FUNC) { /* function illegal in structure or union */ error(38); dsym->s_type = tp = incref(tp, t = PTR); } /* * bit-fields of length 0 are not warned about because length() * does not return the length of the bit-field but the length * of the type the bit-field is packed in (its ok) */ if ((sz = length(dsym->s_type, dsym->s_name)) == 0) { if (t == ARRAY && dsym->s_type->t_dim == 0) { /* illegal zero sized structure member: %s */ c99ism(39, dsym->s_name); } } if (dcs->d_ctx == MOU) { o = dcs->d_offset; dcs->d_offset = 0; } if (dsym->s_field) { align(getbound(tp), tp->t_flen); dsym->s_value.v_quad = (dcs->d_offset / size(t)) * size(t); tp->t_foffs = dcs->d_offset - (int)dsym->s_value.v_quad; dcs->d_offset += tp->t_flen; } else { align(getbound(tp), 0); dsym->s_value.v_quad = dcs->d_offset; dcs->d_offset += sz; } if (dcs->d_ctx == MOU) { if (o > dcs->d_offset) dcs->d_offset = o; } chkfdef(dsym, 0); /* * Clear the BITFIELDTYPE indicator after processing each * structure element. */ bitfieldtype_ok = 0; return (dsym); } /* * Aligns next structure element as required. * * al contains the required alignment, len the length of a bit-field. */ static void align(int al, int len) { int no; /* * The alignment of the current element becomes the alignment of * the struct/union if it is larger than the current alignment * of the struct/union. */ if (al > dcs->d_stralign) dcs->d_stralign = al; - no = (dcs->d_offset + (al - 1)) & ~(al - 1); + no = roundup2(dcs->d_offset, al); if (len == 0 || dcs->d_offset + len > no) dcs->d_offset = no; } /* * Remember the width of the field in its type structure. 
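 * (Editor's note: align() above now uses roundup2(dcs->d_offset, al),
 * which for the power-of-two alignments used here equals the old
 * (offset + (al - 1)) & ~(al - 1) expression; e.g. roundup2(13, 8) ==
 * 16 and roundup2(16, 8) == 16.)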
 */
sym_t *
bitfield(sym_t *dsym, int len)
{

	if (dsym == NULL) {
		dsym = getblk(sizeof (sym_t));
		dsym->s_name = unnamed;
		dsym->s_kind = FMOS;
		dsym->s_scl = MOS;
		dsym->s_type = gettyp(UINT);
		dsym->s_blklev = -1;
	}
	dsym->s_type = duptyp(dsym->s_type);
	dsym->s_type->t_isfield = 1;
	dsym->s_type->t_flen = len;
	dsym->s_field = 1;
	return (dsym);
}

/*
 * Collect information about a sequence of asterisks and qualifiers
 * in a list of type pqinf_t.
 * Qualifiers always refer to the left asterisk. The rightmost asterisk
 * will be at the top of the list.
 */
pqinf_t *
mergepq(pqinf_t *p1, pqinf_t *p2)
{
	pqinf_t	*p;

	if (p2->p_pcnt != 0) {
		/* left '*' at the end of the list */
		for (p = p2; p->p_nxt != NULL; p = p->p_nxt)
			continue;
		p->p_nxt = p1;
		return (p2);
	} else {
		if (p2->p_const) {
			if (p1->p_const) {
				/* duplicate %s */
				warning(10, "const");
			}
			p1->p_const = 1;
		}
		if (p2->p_volatile) {
			if (p1->p_volatile) {
				/* duplicate %s */
				warning(10, "volatile");
			}
			p1->p_volatile = 1;
		}
		free(p2);
		return (p1);
	}
}

/*
 * The following three functions extend the type of a declarator with
 * pointer, function and array types.
 *
 * The current type is the type built by deftyp() (dcs->d_type) and
 * pointer, function and array types already added for this
 * declarator. The new type extension is inserted between both.
 */
sym_t *
addptr(sym_t *decl, pqinf_t *pi)
{
	type_t	**tpp, *tp;
	pqinf_t	*npi;

	tpp = &decl->s_type;
	while (*tpp && *tpp != dcs->d_type)
		tpp = &(*tpp)->t_subt;
	if (*tpp == NULL)
		return decl;

	while (pi != NULL) {
		*tpp = tp = getblk(sizeof (type_t));
		tp->t_tspec = PTR;
		tp->t_const = pi->p_const;
		tp->t_volatile = pi->p_volatile;
		*(tpp = &tp->t_subt) = dcs->d_type;
		npi = pi->p_nxt;
		free(pi);
		pi = npi;
	}
	return (decl);
}

/*
 * If a dimension was specified, dim is 1, otherwise 0.
 * n is the specified dimension.
 */
sym_t *
addarray(sym_t *decl, int dim, int n)
{
	type_t	**tpp, *tp;

	tpp = &decl->s_type;
	while (*tpp && *tpp != dcs->d_type)
		tpp = &(*tpp)->t_subt;
	if (*tpp == NULL)
		return decl;

	*tpp = tp = getblk(sizeof (type_t));
	tp->t_tspec = ARRAY;
	tp->t_subt = dcs->d_type;
	tp->t_dim = n;

	if (n < 0) {
		/* negative array dimension */
		error(20, n);
		n = 0;
	} else if (n == 0 && dim) {
		/* zero array dimension */
		c99ism(322, dim);
	} else if (n == 0 && !dim) {
		/* is incomplete type */
		setcompl(tp, 1);
	}

	return (decl);
}

sym_t *
addfunc(sym_t *decl, sym_t *args)
{
	type_t	**tpp, *tp;

	if (dcs->d_proto) {
		if (tflag)
			/* function prototypes are illegal in traditional C */
			warning(270);
		args = nsfunc(decl, args);
	} else {
		osfunc(decl, args);
	}

	/*
	 * The symbols are removed from the symbol table by popdecl() after
	 * addfunc(). To be able to restore them if this is a function
	 * definition, a pointer to the list of all symbols is stored in
	 * dcs->d_nxt->d_fpsyms. Also a list of the arguments (concatenated
	 * by s_nxt) is stored in dcs->d_nxt->d_fargs.
	 * (dcs->d_nxt must be used because *dcs is the declaration stack
	 * element created for the list of params and is removed after
	 * addfunc())
	 */
	if (dcs->d_nxt->d_ctx == EXTERN &&
	    decl->s_type == dcs->d_nxt->d_type) {
		dcs->d_nxt->d_fpsyms = dcs->d_dlsyms;
		dcs->d_nxt->d_fargs = args;
	}

	tpp = &decl->s_type;
	while (*tpp && *tpp != dcs->d_nxt->d_type)
		tpp = &(*tpp)->t_subt;
	if (*tpp == NULL)
		return decl;

	*tpp = tp = getblk(sizeof (type_t));
	tp->t_tspec = FUNC;
	tp->t_subt = dcs->d_nxt->d_type;
	if ((tp->t_proto = dcs->d_proto) != 0)
		tp->t_args = args;
	tp->t_vararg = dcs->d_vararg;

	return (decl);
}

/*
 * Called for new style function declarations.
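 *
 * Editor's examples: "int f(void)" reaches nsfunc() with a single
 * VOID argument and the list is reduced to NULL; "int f(void, int)"
 * or "int f(int, void)" draws error 60 ('"void" must be sole
 * parameter').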
 */
/* ARGSUSED */
static sym_t *
nsfunc(sym_t *decl, sym_t *args)
{
	sym_t	*arg, *sym;
	scl_t	sc;
	int	n;

	/*
	 * Declarations of structs/unions/enums in param lists are legal,
	 * but senseless.
	 */
	for (sym = dcs->d_dlsyms; sym != NULL; sym = sym->s_dlnxt) {
		sc = sym->s_scl;
		if (sc == STRTAG || sc == UNIONTAG || sc == ENUMTAG) {
			/* dubious tag declaration: %s %s */
			warning(85, scltoa(sc), sym->s_name);
		}
	}

	n = 1;
	for (arg = args; arg != NULL; arg = arg->s_nxt) {
		if (arg->s_type->t_tspec == VOID) {
			if (n > 1 || arg->s_nxt != NULL) {
				/* "void" must be sole parameter */
				error(60);
				arg->s_type = gettyp(INT);
			}
		}
		n++;
	}

	/* return NULL if first param is VOID */
	return (args != NULL && args->s_type->t_tspec != VOID ? args : NULL);
}

/*
 * Called for old style function declarations.
 */
static void
osfunc(sym_t *decl, sym_t *args)
{

	/*
	 * Remember the list of params only if this really seems to be
	 * a function definition.
	 */
	if (dcs->d_nxt->d_ctx == EXTERN &&
	    decl->s_type == dcs->d_nxt->d_type) {
		/*
		 * We assume that this becomes a function definition. If
		 * we are wrong, it's corrected in chkfdef().
		 */
		if (args != NULL) {
			decl->s_osdef = 1;
			decl->s_args = args;
		}
	} else {
		if (args != NULL)
			/* function prototype parameters must have types */
			warning(62);
	}
}

/*
 * Lists of identifiers in function declarations are allowed only if
 * it is also a function definition. If this is not the case, print an
 * error message.
 */
void
chkfdef(sym_t *sym, int msg)
{

	if (sym->s_osdef) {
		if (msg) {
			/* incomplete or misplaced function definition */
			error(22);
		}
		sym->s_osdef = 0;
		sym->s_args = NULL;
	}
}

/*
 * Process the name in a declarator.
 * If the symbol already exists, a new one is created.
 * The symbol becomes one of the storage classes EXTERN, STATIC, AUTO or
 * TYPEDEF.
 * s_def and s_reg are valid after dname().
 */
sym_t *
dname(sym_t *sym)
{
	scl_t	sc = NOSCL;

	if (sym->s_scl == NOSCL) {
		dcs->d_rdcsym = NULL;
	} else if (sym->s_defarg) {
		sym->s_defarg = 0;
		dcs->d_rdcsym = NULL;
	} else {
		dcs->d_rdcsym = sym;
		sym = pushdown(sym);
	}

	switch (dcs->d_ctx) {
	case MOS:
	case MOU:
		/* set the parent */
		sym->s_styp = dcs->d_tagtyp->t_str;
		sym->s_def = DEF;
		sym->s_value.v_tspec = INT;
		sc = dcs->d_ctx;
		break;
	case EXTERN:
		/*
		 * static and external symbols without "extern" are
		 * considered to be tentatively defined, external
		 * symbols with "extern" are declared, and typedef names
		 * are defined. Tentatively defined and declared symbols
		 * may become defined if an initializer is present or
		 * this is a function definition.
		 */
		if ((sc = dcs->d_scl) == NOSCL) {
			sc = EXTERN;
			sym->s_def = TDEF;
		} else if (sc == STATIC) {
			sym->s_def = TDEF;
		} else if (sc == TYPEDEF) {
			sym->s_def = DEF;
		} else if (sc == EXTERN) {
			sym->s_def = DECL;
		} else {
			LERROR("dname()");
		}
		break;
	case PARG:
		sym->s_arg = 1;
		/* FALLTHROUGH */
	case ARG:
		if ((sc = dcs->d_scl) == NOSCL) {
			sc = AUTO;
		} else if (sc == REG) {
			sym->s_reg = 1;
			sc = AUTO;
		} else {
			LERROR("dname()");
		}
		sym->s_def = DEF;
		break;
	case AUTO:
		if ((sc = dcs->d_scl) == NOSCL) {
			/*
			 * XXX somewhat ugly because we don't know whether
			 * this is AUTO or EXTERN (functions). If we are
			 * wrong it must be corrected in decl1loc(), where
			 * we have the necessary type information.
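			 * (Editor's example: inside a block, "int i;"
			 * is AUTO but "int f();" declares a function
			 * and is effectively EXTERN; only the type,
			 * known later in decl1loc(), tells them
			 * apart.)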
*/ sc = AUTO; sym->s_def = DEF; } else if (sc == AUTO || sc == STATIC || sc == TYPEDEF) { sym->s_def = DEF; } else if (sc == REG) { sym->s_reg = 1; sc = AUTO; sym->s_def = DEF; } else if (sc == EXTERN) { sym->s_def = DECL; } else { LERROR("dname()"); } break; default: LERROR("dname()"); } sym->s_scl = sc; sym->s_type = dcs->d_type; dcs->d_fpsyms = NULL; return (sym); } /* * Process a name in the list of formal params in an old style function * definition. */ sym_t * iname(sym_t *sym) { if (sym->s_scl != NOSCL) { if (blklev == sym->s_blklev) { /* redeclaration of formal parameter %s */ error(21, sym->s_name); if (!sym->s_defarg) LERROR("iname()"); } sym = pushdown(sym); } sym->s_type = gettyp(INT); sym->s_scl = AUTO; sym->s_def = DEF; sym->s_defarg = sym->s_arg = 1; return (sym); } /* * Create the type of a tag. * * tag points to the symbol table entry of the tag * kind is the kind of the tag (STRUCT/UNION/ENUM) * decl is 1 if the type of the tag will be completed in this declaration * (the following token is T_LBRACE) * semi is 1 if the following token is T_SEMI */ type_t * mktag(sym_t *tag, tspec_t kind, int decl, int semi) { scl_t scl = NOSCL; type_t *tp; if (kind == STRUCT) { scl = STRTAG; } else if (kind == UNION) { scl = UNIONTAG; } else if (kind == ENUM) { scl = ENUMTAG; } else { LERROR("mktag()"); } if (tag != NULL) { if (tag->s_scl != NOSCL) { tag = newtag(tag, scl, decl, semi); } else { /* a new tag, no empty declaration */ dcs->d_nxt->d_nedecl = 1; if (scl == ENUMTAG && !decl) { if (!tflag && (sflag || pflag)) /* forward reference to enum type */ warning(42); } } if (tag->s_scl == NOSCL) { tag->s_scl = scl; tag->s_type = tp = getblk(sizeof (type_t)); } else { tp = tag->s_type; } } else { tag = getblk(sizeof (sym_t)); tag->s_name = unnamed; UNIQUE_CURR_POS(tag->s_dpos); tag->s_kind = FTAG; tag->s_scl = scl; tag->s_blklev = -1; tag->s_type = tp = getblk(sizeof (type_t)); dcs->d_nxt->d_nedecl = 1; } if (tp->t_tspec == NOTSPEC) { tp->t_tspec = kind; if (kind != ENUM) { tp->t_str = getblk(sizeof (str_t)); tp->t_str->align = CHAR_BIT; tp->t_str->stag = tag; } else { tp->t_isenum = 1; tp->t_enum = getblk(sizeof (enum_t)); tp->t_enum->etag = tag; } /* is incomplete type */ setcompl(tp, 1); } return (tp); } /* * Checks all possible cases of tag redeclarations. * decl is 1 if T_LBRACE follows * semi is 1 if T_SEMI follows */ static sym_t * newtag(sym_t *tag, scl_t scl, int decl, int semi) { if (tag->s_blklev < blklev) { if (semi) { /* "struct a;" */ if (!tflag) { if (!sflag) /* decl. introduces new type ... */ warning(44, scltoa(scl), tag->s_name); tag = pushdown(tag); } else if (tag->s_scl != scl) { /* base type is really "%s %s" */ warning(45, scltoa(tag->s_scl), tag->s_name); } dcs->d_nxt->d_nedecl = 1; } else if (decl) { /* "struct a { ... 
} " */ if (hflag) /* redefinition hides earlier one: %s */ warning(43, tag->s_name); tag = pushdown(tag); dcs->d_nxt->d_nedecl = 1; } else if (tag->s_scl != scl) { /* base type is really "%s %s" */ warning(45, scltoa(tag->s_scl), tag->s_name); /* declaration introduces new type in ANSI C: %s %s */ if (!sflag) warning(44, scltoa(scl), tag->s_name); tag = pushdown(tag); dcs->d_nxt->d_nedecl = 1; } } else { if (tag->s_scl != scl) { /* (%s) tag redeclared */ error(46, scltoa(tag->s_scl)); prevdecl(-1, tag); tag = pushdown(tag); dcs->d_nxt->d_nedecl = 1; } else if (decl && !incompl(tag->s_type)) { /* (%s) tag redeclared */ error(46, scltoa(tag->s_scl)); prevdecl(-1, tag); tag = pushdown(tag); dcs->d_nxt->d_nedecl = 1; } else if (semi || decl) { dcs->d_nxt->d_nedecl = 1; } } return (tag); } const char * scltoa(scl_t sc) { const char *s; switch (sc) { case EXTERN: s = "extern"; break; case STATIC: s = "static"; break; case AUTO: s = "auto"; break; case REG: s = "register"; break; case TYPEDEF: s = "typedef"; break; case STRTAG: s = "struct"; break; case UNIONTAG: s = "union"; break; case ENUMTAG: s = "enum"; break; default: LERROR("tagttoa()"); } return (s); } /* * Completes the type of a tag in a struct/union/enum declaration. * tp points to the type of the, tag, fmem to the list of members/enums. */ type_t * compltag(type_t *tp, sym_t *fmem) { tspec_t t; str_t *sp; int n; sym_t *mem; /* from now a complete type */ setcompl(tp, 0); if ((t = tp->t_tspec) != ENUM) { align(dcs->d_stralign, 0); sp = tp->t_str; sp->align = dcs->d_stralign; sp->size = dcs->d_offset; sp->memb = fmem; if (sp->size == 0) { /* zero sized %s */ (void)c99ism(47, ttab[t].tt_name); } else { n = 0; for (mem = fmem; mem != NULL; mem = mem->s_nxt) { if (mem->s_name != unnamed) n++; } if (n == 0) { /* %s has no named members */ warning(65, t == STRUCT ? "structure" : "union"); } } } else { tp->t_enum->elem = fmem; } return (tp); } /* * Processes the name of an enumerator in en enum declaration. * * sym points to the enumerator * val is the value of the enumerator * impl is 1 if the value of the enumerator was not explicit specified. */ sym_t * ename(sym_t *sym, int val, int impl) { if (sym->s_scl) { if (sym->s_blklev == blklev) { /* no hflag, because this is illegal!!! */ if (sym->s_arg) { /* enumeration constant hides parameter: %s */ warning(57, sym->s_name); } else { /* redeclaration of %s */ error(27, sym->s_name); /* * inside blocks it should not too complicated * to find the position of the previous * declaration */ if (blklev == 0) prevdecl(-1, sym); } } else { if (hflag) /* redefinition hides earlier one: %s */ warning(43, sym->s_name); } sym = pushdown(sym); } sym->s_scl = ENUMCON; sym->s_type = dcs->d_tagtyp; sym->s_value.v_tspec = INT; sym->s_value.v_quad = val; if (impl && val - 1 == INT_MAX) { /* overflow in enumeration values: %s */ warning(48, sym->s_name); } enumval = val + 1; return (sym); } /* * Process a single external declarator. */ void decl1ext(sym_t *dsym, int initflg) { int warn, rval, redec; sym_t *rdsym; chkfdef(dsym, 1); chktyp(dsym); if (initflg && !(initerr = chkinit(dsym))) dsym->s_def = DEF; /* * Declarations of functions are marked as "tentative" in dname(). * This is wrong because there are no tentative function * definitions. 
*/ if (dsym->s_type->t_tspec == FUNC && dsym->s_def == TDEF) dsym->s_def = DECL; if (dcs->d_inline) { if (dsym->s_type->t_tspec == FUNC) { dsym->s_inline = 1; } else { /* variable declared inline: %s */ warning(268, dsym->s_name); } } /* Write the declaration into the output file */ if (plibflg && llibflg && dsym->s_type->t_tspec == FUNC && dsym->s_type->t_proto) { /* * With both LINTLIBRARY and PROTOLIB the prototype is * written as a function definition to the output file. */ rval = dsym->s_type->t_subt->t_tspec != VOID; outfdef(dsym, &dsym->s_dpos, rval, 0, NULL); } else { outsym(dsym, dsym->s_scl, dsym->s_def); } if ((rdsym = dcs->d_rdcsym) != NULL) { /* * If the old symbol stems from an old style function definition * we have remembered the params in rdsym->s_args and compare * them with the params of the prototype. */ if (rdsym->s_osdef && dsym->s_type->t_proto) { redec = chkosdef(rdsym, dsym); } else { redec = 0; } if (!redec && !isredec(dsym, (warn = 0, &warn))) { if (warn) { /* redeclaration of %s */ (*(sflag ? error : warning))(27, dsym->s_name); prevdecl(-1, rdsym); } /* * Take over the remembered params if the new symbol * is not a prototype. */ if (rdsym->s_osdef && !dsym->s_type->t_proto) { dsym->s_osdef = rdsym->s_osdef; dsym->s_args = rdsym->s_args; STRUCT_ASSIGN(dsym->s_dpos, rdsym->s_dpos); } /* * Remember the position of the declaration if the * old symbol was a prototype and the new is not. * Also remember the position if the old symbol * was defined and the new is not. */ if (rdsym->s_type->t_proto && !dsym->s_type->t_proto) { STRUCT_ASSIGN(dsym->s_dpos, rdsym->s_dpos); } else if (rdsym->s_def == DEF && dsym->s_def != DEF) { STRUCT_ASSIGN(dsym->s_dpos, rdsym->s_dpos); } /* * Copy information about usage of the name into * the new symbol. */ cpuinfo(dsym, rdsym); /* Once a name is defined, it remains defined. */ if (rdsym->s_def == DEF) dsym->s_def = DEF; /* once a function is inline, it remains inline */ if (rdsym->s_inline) dsym->s_inline = 1; compltyp(dsym, rdsym); } rmsym(rdsym); } if (dsym->s_scl == TYPEDEF) { dsym->s_type = duptyp(dsym->s_type); dsym->s_type->t_typedef = 1; settdsym(dsym->s_type, dsym); } } /* * Copies information about usage into a new symbol table entry of * the same symbol. */ void cpuinfo(sym_t *sym, sym_t *rdsym) { sym->s_spos = rdsym->s_spos; sym->s_upos = rdsym->s_upos; sym->s_set = rdsym->s_set; sym->s_used = rdsym->s_used; } /* * Prints an error and returns 1 if a symbol is redeclared/redefined. * Otherwise returns 0 and, in some cases of minor problems, prints * a warning.
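 *
 * For example:
 *
 *	int n = 1; int n = 2;		redefinition of n (28)
 *	double d; int d;		redeclaration of d (27)
 *	extern int m; static int m;	previously declared extern,
 *					becomes static (29)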
*/ int isredec(sym_t *dsym, int *warn) { sym_t *rsym; if ((rsym = dcs->d_rdcsym)->s_scl == ENUMCON) { /* redeclaration of %s */ error(27, dsym->s_name); prevdecl(-1, rsym); return (1); } if (rsym->s_scl == TYPEDEF) { /* typedef redeclared: %s */ error(89, dsym->s_name); prevdecl(-1, rsym); return (1); } if (dsym->s_scl == TYPEDEF) { /* redeclaration of %s */ error(27, dsym->s_name); prevdecl(-1, rsym); return (1); } if (rsym->s_def == DEF && dsym->s_def == DEF) { /* redefinition of %s */ error(28, dsym->s_name); prevdecl(-1, rsym); return(1); } if (!eqtype(rsym->s_type, dsym->s_type, 0, 0, warn)) { /* redeclaration of %s */ error(27, dsym->s_name); prevdecl(-1, rsym); return(1); } if (rsym->s_scl == EXTERN && dsym->s_scl == EXTERN) return(0); if (rsym->s_scl == STATIC && dsym->s_scl == STATIC) return(0); if (rsym->s_scl == STATIC && dsym->s_def == DECL) return(0); if (rsym->s_scl == EXTERN && rsym->s_def == DEF) { /* * All cases except "int a = 1; static int a;" are caught * above with or without a warning */ /* redeclaration of %s */ error(27, dsym->s_name); prevdecl(-1, rsym); return(1); } if (rsym->s_scl == EXTERN) { /* previously declared extern, becomes static: %s */ warning(29, dsym->s_name); prevdecl(-1, rsym); return(0); } /* * Now it's one of: * "static a; int a;", "static a; int a = 1;", "static a = 1; int a;" */ /* redeclaration of %s; ANSI C requires "static" */ if (sflag) { warning(30, dsym->s_name); prevdecl(-1, rsym); } dsym->s_scl = STATIC; return (0); } /* * Checks if two types are compatible. Returns 0 if not, otherwise 1. * * ignqual ignore qualifiers of type; used for function params * promot promote left type; used for comparison of params of * old style function definitions with params of prototypes. * *warn set to 1 if an old style function declaration is not * compatible with a prototype */ int eqtype(type_t *tp1, type_t *tp2, int ignqual, int promot, int *warn) { tspec_t t; while (tp1 != NULL && tp2 != NULL) { t = tp1->t_tspec; if (promot) { if (t == FLOAT) { t = DOUBLE; } else if (t == CHAR || t == SCHAR) { t = INT; } else if (t == UCHAR) { t = tflag ? UINT : INT; } else if (t == SHORT) { t = INT; } else if (t == USHORT) { /* CONSTCOND */ t = INT_MAX < USHRT_MAX || tflag ? UINT : INT; } } if (t != tp2->t_tspec) return (0); if (tp1->t_const != tp2->t_const && !ignqual && !tflag) return (0); if (tp1->t_volatile != tp2->t_volatile && !ignqual && !tflag) return (0); if (t == STRUCT || t == UNION) return (tp1->t_str == tp2->t_str); if (t == ARRAY && tp1->t_dim != tp2->t_dim) { if (tp1->t_dim != 0 && tp2->t_dim != 0) return (0); } /* don't check prototypes in traditional C */ if (t == FUNC && !tflag) { if (tp1->t_proto && tp2->t_proto) { if (!eqargs(tp1, tp2, warn)) return (0); } else if (tp1->t_proto) { if (!mnoarg(tp1, warn)) return (0); } else if (tp2->t_proto) { if (!mnoarg(tp2, warn)) return (0); } } tp1 = tp1->t_subt; tp2 = tp2->t_subt; ignqual = promot = 0; } return (tp1 == tp2); } /* * Compares the parameter types of two prototypes. */ static int eqargs(type_t *tp1, type_t *tp2, int *warn) { sym_t *a1, *a2; if (tp1->t_vararg != tp2->t_vararg) return (0); a1 = tp1->t_args; a2 = tp2->t_args; while (a1 != NULL && a2 != NULL) { if (eqtype(a1->s_type, a2->s_type, 1, 0, warn) == 0) return (0); a1 = a1->s_nxt; a2 = a2->s_nxt; } return (a1 == a2); } /* * mnoarg() (matches functions with no argument type information) * returns 1 if all parameters of a prototype are compatible with * an old style function declaration. * This is the case if the following conditions are met: * 1.
the prototype must have a fixed number of parameters * 2. no parameter is of type float * 3. no parameter is converted to another type if integer promotion * is applied to it */ static int mnoarg(type_t *tp, int *warn) { sym_t *arg; tspec_t t; if (tp->t_vararg) { if (warn != NULL) *warn = 1; } for (arg = tp->t_args; arg != NULL; arg = arg->s_nxt) { if ((t = arg->s_type->t_tspec) == FLOAT || t == CHAR || t == SCHAR || t == UCHAR || t == SHORT || t == USHORT) { if (warn != NULL) *warn = 1; } } return (1); } /* * Compares a prototype declaration with the remembered arguments of * a previous old style function definition. */ static int chkosdef(sym_t *rdsym, sym_t *dsym) { sym_t *args, *pargs, *arg, *parg; int narg, nparg, n; int warn, msg; args = rdsym->s_args; pargs = dsym->s_type->t_args; msg = 0; narg = nparg = 0; for (arg = args; arg != NULL; arg = arg->s_nxt) narg++; for (parg = pargs; parg != NULL; parg = parg->s_nxt) nparg++; if (narg != nparg) { /* prototype does not match old-style definition */ error(63); msg = 1; goto end; } arg = args; parg = pargs; n = 1; while (narg--) { warn = 0; /* * If it does not match due to promotion and sflag is * not set we print only a warning. */ if (!eqtype(arg->s_type, parg->s_type, 1, 1, &warn) || warn) { /* prototype does not match old-style def., arg #%d */ error(299, n); msg = 1; } arg = arg->s_nxt; parg = parg->s_nxt; n++; } end: if (msg) /* old style definition */ prevdecl(300, rdsym); return (msg); } /* * Completes a type by copying the dimension and prototype information * from a second compatible type. * * The following lines are legal: * "typedef a[]; a b; a b[10]; a c; a c[20];" * "typedef ft(); ft f; f(int); ft g; g(long);" * This means that, if a type is completed, the type structure must * be duplicated. */ void compltyp(sym_t *dsym, sym_t *ssym) { type_t **dstp, *src; type_t *dst; dstp = &dsym->s_type; src = ssym->s_type; while ((dst = *dstp) != NULL) { if (src == NULL || dst->t_tspec != src->t_tspec) LERROR("compltyp()"); if (dst->t_tspec == ARRAY) { if (dst->t_dim == 0 && src->t_dim != 0) { *dstp = dst = duptyp(dst); dst->t_dim = src->t_dim; /* now a complete type */ setcompl(dst, 0); } } else if (dst->t_tspec == FUNC) { if (!dst->t_proto && src->t_proto) { *dstp = dst = duptyp(dst); dst->t_proto = 1; dst->t_args = src->t_args; } } dstp = &dst->t_subt; src = src->t_subt; } } /* * Completes the declaration of a single argument. */ sym_t * decl1arg(sym_t *sym, int initflg) { tspec_t t; chkfdef(sym, 1); chktyp(sym); if (dcs->d_rdcsym != NULL && dcs->d_rdcsym->s_blklev == blklev) { /* redeclaration of formal parameter %s */ error(237, sym->s_name); rmsym(dcs->d_rdcsym); sym->s_arg = 1; } if (!sym->s_arg) { /* declared argument %s is missing */ error(53, sym->s_name); sym->s_arg = 1; } if (initflg) { /* cannot initialize parameter: %s */ error(52, sym->s_name); initerr = 1; } if ((t = sym->s_type->t_tspec) == ARRAY) { sym->s_type = incref(sym->s_type->t_subt, PTR); } else if (t == FUNC) { if (tflag) /* a function is declared as an argument: %s */ warning(50, sym->s_name); sym->s_type = incref(sym->s_type, PTR); } else if (t == FLOAT) { if (tflag) sym->s_type = gettyp(DOUBLE); } if (dcs->d_inline) /* argument declared inline: %s */ warning(269, sym->s_name); /* * Arguments must have complete types. length() prints the needed * error messages (null dimension is impossible because arrays are * converted to pointers).
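 *
 * For example, in the old style definition
 *
 *	void f(a, g, x) int a[10]; int g(); float x; { }
 *
 * a has been adjusted above to int *, g to int (*)(), and, in
 * traditional C, x to double.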
*/ if (sym->s_type->t_tspec != VOID) (void)length(sym->s_type, sym->s_name); setsflg(sym); return (sym); } /* * Does some checks for lint directives which apply to functions. * Processes arguments in old style function definitions which default * to int. * Checks compatibility of old style function definition with previous * prototype. */ void cluparg(void) { sym_t *args, *arg, *pargs, *parg; int narg, nparg, n, msg; tspec_t t; args = funcsym->s_args; pargs = funcsym->s_type->t_args; /* check for illegal combinations of lint directives */ if (prflstrg != -1 && scflstrg != -1) { /* can't be used together: ** PRINTFLIKE ** ** SCANFLIKE ** */ warning(289); prflstrg = scflstrg = -1; } if (nvararg != -1 && (prflstrg != -1 || scflstrg != -1)) { /* dubious use of ** VARARGS ** with ** %s ** */ warning(288, prflstrg != -1 ? "PRINTFLIKE" : "SCANFLIKE"); nvararg = -1; } /* * check if the argument of a lint directive is compatible with the * number of arguments. */ narg = 0; for (arg = dcs->d_fargs; arg != NULL; arg = arg->s_nxt) narg++; if (nargusg > narg) { /* argument number mismatch with directive: ** %s ** */ warning(283, "ARGSUSED"); nargusg = 0; } if (nvararg > narg) { /* argument number mismatch with directive: ** %s ** */ warning(283, "VARARGS"); nvararg = 0; } if (prflstrg > narg) { /* argument number mismatch with directive: ** %s ** */ warning(283, "PRINTFLIKE"); prflstrg = -1; } else if (prflstrg == 0) { prflstrg = -1; } if (scflstrg > narg) { /* argument number mismatch with directive: ** %s ** */ warning(283, "SCANFLIKE"); scflstrg = -1; } else if (scflstrg == 0) { scflstrg = -1; } if (prflstrg != -1 || scflstrg != -1) { narg = prflstrg != -1 ? prflstrg : scflstrg; arg = dcs->d_fargs; for (n = 1; n < narg; n++) arg = arg->s_nxt; if (arg->s_type->t_tspec != PTR || ((t = arg->s_type->t_subt->t_tspec) != CHAR && t != UCHAR && t != SCHAR)) { /* arg. %d must be 'char *' for PRINTFLIKE/SCANFLIKE */ warning(293, narg); prflstrg = scflstrg = -1; } } /* * print a warning for each argument of an old style function * definition which defaults to int */ for (arg = args; arg != NULL; arg = arg->s_nxt) { if (arg->s_defarg) { /* argument type defaults to int: %s */ warning(32, arg->s_name); arg->s_defarg = 0; setsflg(arg); } } /* * If this is an old style function definition and a prototype * exists, compare the types of arguments. */ if (funcsym->s_osdef && funcsym->s_type->t_proto) { /* * If the number of arguments does not match, we need not * continue. */ narg = nparg = 0; msg = 0; for (parg = pargs; parg != NULL; parg = parg->s_nxt) nparg++; for (arg = args; arg != NULL; arg = arg->s_nxt) narg++; if (narg != nparg) { /* parameter mismatch: %d declared, %d defined */ error(51, nparg, narg); msg = 1; } else { parg = pargs; arg = args; while (narg--) { msg |= chkptdecl(arg, parg); parg = parg->s_nxt; arg = arg->s_nxt; } } if (msg) /* prototype declaration */ prevdecl(285, dcs->d_rdcsym); /* from now on the prototype is valid */ funcsym->s_osdef = 0; funcsym->s_args = NULL; } } /* * Checks compatibility of an old style function definition with a previous * prototype declaration. * Returns 1 if the position of the previous declaration should be reported.
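 *
 * For example:
 *
 *	int f(short n);
 *	int f(n) short n; { return n; }
 *
 * is a mismatch caused purely by argument promotion: in the old style
 * definition n arrives as int, so a matching prototype would have to
 * say int as well.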
*/ static int chkptdecl(sym_t *arg, sym_t *parg) { type_t *tp, *ptp; int warn, msg; tp = arg->s_type; ptp = parg->s_type; msg = 0; warn = 0; if (!eqtype(tp, ptp, 1, 1, &warn)) { if (eqtype(tp, ptp, 1, 0, &warn)) { /* type does not match prototype: %s */ msg = gnuism(58, arg->s_name); } else { /* type does not match prototype: %s */ error(58, arg->s_name); msg = 1; } } else if (warn) { /* type does not match prototype: %s */ (*(sflag ? error : warning))(58, arg->s_name); msg = 1; } return (msg); } /* * Completes a single local declaration/definition. */ void decl1loc(sym_t *dsym, int initflg) { /* Correct a mistake made in dname(). */ if (dsym->s_type->t_tspec == FUNC) { dsym->s_def = DECL; if (dcs->d_scl == NOSCL) dsym->s_scl = EXTERN; } if (dsym->s_type->t_tspec == FUNC) { if (dsym->s_scl == STATIC) { /* dubious static function at block level: %s */ warning(93, dsym->s_name); dsym->s_scl = EXTERN; } else if (dsym->s_scl != EXTERN && dsym->s_scl != TYPEDEF) { /* function has illegal storage class: %s */ error(94, dsym->s_name); dsym->s_scl = EXTERN; } } /* * functions may be declared inline at local scope, although * this has no effect for a later definition of the same * function. * XXX it should have an effect if tflag is set. This would * also be the way gcc behaves. */ if (dcs->d_inline) { if (dsym->s_type->t_tspec == FUNC) { dsym->s_inline = 1; } else { /* variable declared inline: %s */ warning(268, dsym->s_name); } } chkfdef(dsym, 1); chktyp(dsym); if (dcs->d_rdcsym != NULL && dsym->s_scl == EXTERN) ledecl(dsym); if (dsym->s_scl == EXTERN) { /* * XXX if the static variable at level 0 is only * defined later on, we are fooled here. */ if (dsym->s_xsym == NULL) { outsym(dsym, EXTERN, dsym->s_def); } else { outsym(dsym, dsym->s_xsym->s_scl, dsym->s_def); } } if (dcs->d_rdcsym != NULL) { if (dcs->d_rdcsym->s_blklev == 0) { switch (dsym->s_scl) { case AUTO: /* automatic hides external declaration: %s */ if (hflag) warning(86, dsym->s_name); break; case STATIC: /* static hides external declaration: %s */ if (hflag) warning(87, dsym->s_name); break; case TYPEDEF: /* typedef hides external declaration: %s */ if (hflag) warning(88, dsym->s_name); break; case EXTERN: /* * Warnings and errors are printed in ledecl() */ break; default: LERROR("decl1loc()"); } } else if (dcs->d_rdcsym->s_blklev == blklev) { /* no hflag, because it's illegal! */ if (dcs->d_rdcsym->s_arg) { /* * if !tflag, a "redeclaration of %s" error * is produced below */ if (tflag) { if (hflag) /* decl. hides parameter: %s */ warning(91, dsym->s_name); rmsym(dcs->d_rdcsym); } } } else if (dcs->d_rdcsym->s_blklev < blklev) { if (hflag) /* declaration hides earlier one: %s */ warning(95, dsym->s_name); } if (dcs->d_rdcsym->s_blklev == blklev) { /* redeclaration of %s */ error(27, dsym->s_name); rmsym(dcs->d_rdcsym); } } if (initflg && !(initerr = chkinit(dsym))) { dsym->s_def = DEF; setsflg(dsym); } if (dsym->s_scl == TYPEDEF) { dsym->s_type = duptyp(dsym->s_type); dsym->s_type->t_typedef = 1; settdsym(dsym->s_type, dsym); } /* * Before we can check the size we must wait for an initialisation * which may follow. */ } /* * Processes (re)declarations of external symbols inside blocks.
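 *
 * For example, in
 *
 *	static int counter;
 *	void f(void) { extern int counter; counter++; }
 *
 * the block level "extern" declaration must be checked against the
 * file scope symbol it refers to.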
*/ static void ledecl(sym_t *dsym) { int eqt, warn; sym_t *esym; /* look for a symbol with the same name */ esym = dcs->d_rdcsym; while (esym != NULL && esym->s_blklev != 0) { while ((esym = esym->s_link) != NULL) { if (esym->s_kind != FVFT) continue; if (strcmp(dsym->s_name, esym->s_name) == 0) break; } } if (esym == NULL) return; if (esym->s_scl != EXTERN && esym->s_scl != STATIC) { /* gcc accepts this without a warning, pcc prints an error. */ /* redeclaration of %s */ warning(27, dsym->s_name); prevdecl(-1, esym); return; } warn = 0; eqt = eqtype(esym->s_type, dsym->s_type, 0, 0, &warn); if (!eqt || warn) { if (esym->s_scl == EXTERN) { /* inconsistent redeclaration of extern: %s */ warning(90, dsym->s_name); prevdecl(-1, esym); } else { /* inconsistent redeclaration of static: %s */ warning(92, dsym->s_name); prevdecl(-1, esym); } } if (eqt) { /* * Remember the external symbol so we can update usage * information at the end of the block. */ dsym->s_xsym = esym; } } /* * Print an error or a warning if the symbol can't be initialized due * to type/storage class. Return value is 1 if an error has been * detected. */ static int chkinit(sym_t *sym) { int err; err = 0; if (sym->s_type->t_tspec == FUNC) { /* cannot initialize function: %s */ error(24, sym->s_name); err = 1; } else if (sym->s_scl == TYPEDEF) { /* cannot initialize typedef: %s */ error(25, sym->s_name); err = 1; } else if (sym->s_scl == EXTERN && sym->s_def == DECL) { /* cannot initialize "extern" declaration: %s */ if (dcs->d_ctx == EXTERN) { warning(26, sym->s_name); } else { error(26, sym->s_name); err = 1; } } return (err); } /* * Create a symbol for an abstract declaration. */ sym_t * aname(void) { sym_t *sym; if (dcs->d_ctx != ABSTRACT && dcs->d_ctx != PARG) LERROR("aname()"); sym = getblk(sizeof (sym_t)); sym->s_name = unnamed; sym->s_def = DEF; sym->s_scl = ABSTRACT; sym->s_blklev = -1; if (dcs->d_ctx == PARG) sym->s_arg = 1; sym->s_type = dcs->d_type; dcs->d_rdcsym = NULL; dcs->d_vararg = 0; return (sym); } /* * Removes anything that does not belong at global level. */ void globclup(void) { while (dcs->d_nxt != NULL) popdecl(); cleanup(); blklev = 0; mblklev = 0; /* * remove all information about pending lint directives without * warnings. */ glclup(1); } /* * Process an abstract type declaration. */ sym_t * decl1abs(sym_t *sym) { chkfdef(sym, 1); chktyp(sym); return (sym); } /* * Checks size after declarations of variables and their initialisation. */ void chksz(sym_t *dsym) { /* * check the size only for symbols which are defined, are not * functions and are not typedef names */ if (dsym->s_def != DEF) return; if (dsym->s_scl == TYPEDEF) return; if (dsym->s_type->t_tspec == FUNC) return; if (length(dsym->s_type, dsym->s_name) == 0 && dsym->s_type->t_tspec == ARRAY && dsym->s_type->t_dim == 0) { /* empty array declaration: %s */ if (tflag) { warning(190, dsym->s_name); } else { error(190, dsym->s_name); } } } /* * Mark an object as set if it is not already */ void setsflg(sym_t *sym) { if (!sym->s_set) { sym->s_set = 1; UNIQUE_CURR_POS(sym->s_spos); } } /* * Mark an object as used if it is not already */ void setuflg(sym_t *sym, int fcall, int szof) { if (!sym->s_used) { sym->s_used = 1; UNIQUE_CURR_POS(sym->s_upos); } /* * for function calls another record is written * * XXX Should symbols used in sizeof() be treated as used or not? * Probably not, because there is no sense in declaring an * external variable only to get its size.
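 *
 * For example, in
 *
 *	extern int obj;
 *	size_t nbytes = sizeof(obj);
 *
 * the sizeof() operand is the only reference to obj.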
*/ if (!fcall && !szof && sym->s_kind == FVFT && sym->s_scl == EXTERN) outusg(sym); } /* * Prints warnings for a list of variables and labels (concatenated * with s_dlnxt) if these are not used or only set. */ void chkusage(dinfo_t *di) { sym_t *sym; int mknowarn; /* for these warnings LINTED has no effect */ mknowarn = nowarn; nowarn = 0; for (sym = di->d_dlsyms; sym != NULL; sym = sym->s_dlnxt) chkusg1(di->d_asm, sym); nowarn = mknowarn; } /* * Prints a warning for a single variable or label if it is not used or * only set. */ void chkusg1(int novar, sym_t *sym) { pos_t cpos; if (sym->s_blklev == -1) return; STRUCT_ASSIGN(cpos, curr_pos); if (sym->s_kind == FVFT) { if (sym->s_arg) { chkausg(novar, sym); } else { chkvusg(novar, sym); } } else if (sym->s_kind == FLAB) { chklusg(sym); } else if (sym->s_kind == FTAG) { chktusg(sym); } STRUCT_ASSIGN(curr_pos, cpos); } static void chkausg(int novar, sym_t *arg) { if (!arg->s_set) LERROR("chkausg()"); if (novar) return; if (!arg->s_used && vflag) { STRUCT_ASSIGN(curr_pos, arg->s_dpos); /* argument %s unused in function %s */ warning(231, arg->s_name, funcsym->s_name); } } static void chkvusg(int novar, sym_t *sym) { scl_t sc; sym_t *xsym; if (blklev == 0 || sym->s_blklev == 0) LERROR("chkvusg()"); /* errors in expressions easily cause lots of these warnings */ if (nerr != 0) return; /* * XXX Only variables are checked, although types should * probably also be checked */ if ((sc = sym->s_scl) != EXTERN && sc != STATIC && sc != AUTO && sc != REG) { return; } if (novar) return; if (sc == EXTERN) { if (!sym->s_used && !sym->s_set) { STRUCT_ASSIGN(curr_pos, sym->s_dpos); /* %s unused in function %s */ warning(192, sym->s_name, funcsym->s_name); } } else { if (sym->s_set && !sym->s_used) { STRUCT_ASSIGN(curr_pos, sym->s_spos); /* %s set but not used in function %s */ warning(191, sym->s_name, funcsym->s_name); } else if (!sym->s_used) { STRUCT_ASSIGN(curr_pos, sym->s_dpos); /* %s unused in function %s */ warning(192, sym->s_name, funcsym->s_name); } } if (sc == EXTERN) { /* * information about usage is taken over into the symbol * table entry at level 0 if the symbol was locally declared * as an external symbol. * * XXX This is wrong for symbols declared static at level 0 * if the usage information stems from sizeof(). This is * because symbols at level 0 only used in sizeof() are * considered to not be used.
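 *
 * For example,
 *
 *	void f(void) { int a = 1; int b; }
 *
 * triggers both warnings above: "a set but not used in function f"
 * (191) and "b unused in function f" (192).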
*/ if ((xsym = sym->s_xsym) != NULL) { if (sym->s_used && !xsym->s_used) { xsym->s_used = 1; STRUCT_ASSIGN(xsym->s_upos, sym->s_upos); } if (sym->s_set && !xsym->s_set) { xsym->s_set = 1; STRUCT_ASSIGN(xsym->s_spos, sym->s_spos); } } } } static void chklusg(sym_t *lab) { if (blklev != 1 || lab->s_blklev != 1) LERROR("chklusg()"); if (lab->s_set && !lab->s_used) { STRUCT_ASSIGN(curr_pos, lab->s_spos); /* label %s unused in function %s */ warning(192, lab->s_name, funcsym->s_name); } else if (!lab->s_set) { STRUCT_ASSIGN(curr_pos, lab->s_upos); /* undefined label %s */ warning(23, lab->s_name); } } static void chktusg(sym_t *sym) { if (!incompl(sym->s_type)) return; /* always complain about incomplete tags declared inside blocks */ if (!zflag || dcs->d_ctx != EXTERN) return; STRUCT_ASSIGN(curr_pos, sym->s_dpos); switch (sym->s_type->t_tspec) { case STRUCT: /* struct %s never defined */ warning(233, sym->s_name); break; case UNION: /* union %s never defined */ warning(234, sym->s_name); break; case ENUM: /* enum %s never defined */ warning(235, sym->s_name); break; default: LERROR("chktusg()"); } } /* * Called after the entire translation unit has been parsed. * Changes tentative definitions into definitions. * Performs some tests on global symbols. Detected problems are: * - defined variables of incomplete type * - constant variables which are not initialized * - static symbols which are never used */ void chkglsyms(void) { sym_t *sym; pos_t cpos; if (blklev != 0 || dcs->d_nxt != NULL) norecover(); STRUCT_ASSIGN(cpos, curr_pos); for (sym = dcs->d_dlsyms; sym != NULL; sym = sym->s_dlnxt) { if (sym->s_blklev == -1) continue; if (sym->s_kind == FVFT) { chkglvar(sym); } else if (sym->s_kind == FTAG) { chktusg(sym); } else { if (sym->s_kind != FMOS) LERROR("chkglsyms()"); } } STRUCT_ASSIGN(curr_pos, cpos); } static void chkglvar(sym_t *sym) { if (sym->s_scl == TYPEDEF || sym->s_scl == ENUMCON) return; if (sym->s_scl != EXTERN && sym->s_scl != STATIC) LERROR("chkglvar()"); glchksz(sym); if (sym->s_scl == STATIC) { if (sym->s_type->t_tspec == FUNC) { if (sym->s_used && sym->s_def != DEF) { STRUCT_ASSIGN(curr_pos, sym->s_upos); /* static func. called but not def.. */ error(225, sym->s_name); } } if (!sym->s_used) { STRUCT_ASSIGN(curr_pos, sym->s_dpos); if (sym->s_type->t_tspec == FUNC) { if (sym->s_def == DEF) { if (!sym->s_inline) /* static function %s unused */ warning(236, sym->s_name); } else { /* static function %s decl. but ... */ warning(290, sym->s_name); } } else if (!sym->s_set) { /* static variable %s unused */ warning(226, sym->s_name); } else { /* static variable %s set but not used */ warning(307, sym->s_name); } } if (!tflag && sym->s_def == TDEF && sym->s_type->t_const) { STRUCT_ASSIGN(curr_pos, sym->s_dpos); /* const object %s should have initializer */ warning(227, sym->s_name); } } } static void glchksz(sym_t *sym) { if (sym->s_def == TDEF) { if (sym->s_type->t_tspec == FUNC) /* * this can happen if a syntax error occurred * after a function declaration */ return; STRUCT_ASSIGN(curr_pos, sym->s_dpos); if (length(sym->s_type, sym->s_name) == 0 && sym->s_type->t_tspec == ARRAY && sym->s_type->t_dim == 0) { /* empty array declaration: %s */ if (tflag || (sym->s_scl == EXTERN && !sflag)) { warning(190, sym->s_name); } else { error(190, sym->s_name); } } } } /* * Prints information about the location of the previous definition/declaration.
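 *
 * For example, after "redeclaration of x" this prints either
 * "previous definition of x" (261) or "previous declaration of x"
 * (260), depending on whether the earlier symbol was defined. The
 * output is suppressed unless rflag is set.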
*/ void prevdecl(int msg, sym_t *psym) { pos_t cpos; if (!rflag) return; STRUCT_ASSIGN(cpos, curr_pos); STRUCT_ASSIGN(curr_pos, psym->s_dpos); if (msg != -1) { message(msg, psym->s_name); } else if (psym->s_def == DEF || psym->s_def == TDEF) { /* previous definition of %s */ message(261, psym->s_name); } else { /* previous declaration of %s */ message(260, psym->s_name); } STRUCT_ASSIGN(curr_pos, cpos); } Index: user/ngie/bsnmp_cleanup/usr.sbin/bhyve/fwctl.c =================================================================== --- user/ngie/bsnmp_cleanup/usr.sbin/bhyve/fwctl.c (revision 298467) +++ user/ngie/bsnmp_cleanup/usr.sbin/bhyve/fwctl.c (revision 298468) @@ -1,549 +1,549 @@ /*- * Copyright (c) 2015 Peter Grehan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Guest firmware interface. Uses i/o ports x510/x511 as Qemu does, * but with a request/response messaging protocol. 
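 */

/*
 * A guest-side probe of this interface might look like the following
 * sketch (illustrative only; outw() and inb() stand in for whatever
 * port i/o primitives the guest firmware provides):
 */
#if 0
static int
fwctl_probe(void)
{
	char sig[4];
	int i;

	outw(0x510, 0);			/* reset: IDENT_WAIT -> IDENT_SEND */
	for (i = 0; i < 4; i++)
		sig[i] = inb(0x511);	/* expect 'B' 'H' 'Y' 'V' */
	return (memcmp(sig, "BHYV", 4) == 0);
}
#endif

/*
 * The implementation below registers in/out handlers for both ports.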
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "inout.h" #include "fwctl.h" /* * Messaging protocol base operations */ #define OP_NULL 1 #define OP_ECHO 2 #define OP_GET 3 #define OP_GET_LEN 4 #define OP_SET 5 #define OP_MAX OP_SET /* I/O ports */ #define FWCTL_OUT 0x510 #define FWCTL_IN 0x511 /* * Back-end state-machine */ enum state { DORMANT, IDENT_WAIT, IDENT_SEND, REQ, RESP } be_state = DORMANT; static uint8_t sig[] = { 'B', 'H', 'Y', 'V' }; static u_int ident_idx; struct op_info { int op; int (*op_start)(int len); void (*op_data)(uint32_t data, int len); int (*op_result)(struct iovec **data); void (*op_done)(struct iovec *data); }; static struct op_info *ops[OP_MAX+1]; /* Return 0-padded uint32_t */ static uint32_t fwctl_send_rest(uint32_t *data, size_t len) { union { uint8_t c[4]; uint32_t w; } u; uint8_t *cdata; int i; cdata = (uint8_t *) data; u.w = 0; for (i = 0, u.w = 0; i < len; i++) u.c[i] = *cdata++; return (u.w); } /* * error op dummy proto - drop all data sent and return an error */ static int errop_code; static void errop_set(int err) { errop_code = err; } static int errop_start(int len) { errop_code = ENOENT; /* accept any length */ return (errop_code); } static void errop_data(uint32_t data, int len) { /* ignore */ } static int errop_result(struct iovec **data) { /* no data to send back; always successful */ *data = NULL; return (errop_code); } static void errop_done(struct iovec *data) { /* assert data is NULL */ } static struct op_info errop_info = { .op_start = errop_start, .op_data = errop_data, .op_result = errop_result, .op_done = errop_done }; /* OID search */ SET_DECLARE(ctl_set, struct ctl); CTL_NODE("hw.ncpu", &guest_ncpus, sizeof(guest_ncpus)); static struct ctl * ctl_locate(const char *str, int maxlen) { struct ctl *cp, **cpp; SET_FOREACH(cpp, ctl_set) { cp = *cpp; if (!strncmp(str, cp->c_oid, maxlen)) return (cp); } return (NULL); } /* uefi-sysctl get-len */ #define FGET_STRSZ 80 static struct iovec fget_biov[2]; static char fget_str[FGET_STRSZ]; static struct { size_t f_sz; uint32_t f_data[1024]; } fget_buf; static int fget_cnt; static size_t fget_size; static int fget_start(int len) { if (len > FGET_STRSZ) return(E2BIG); fget_cnt = 0; return (0); } static void fget_data(uint32_t data, int len) { *((uint32_t *) &fget_str[fget_cnt]) = data; fget_cnt += sizeof(uint32_t); } static int fget_result(struct iovec **data, int val) { struct ctl *cp; int err; err = 0; /* Locate the OID */ cp = ctl_locate(fget_str, fget_cnt); if (cp == NULL) { *data = NULL; err = ENOENT; } else { if (val) { /* For now, copy the len/data into a buffer */ memset(&fget_buf, 0, sizeof(fget_buf)); fget_buf.f_sz = cp->c_len; memcpy(fget_buf.f_data, cp->c_data, cp->c_len); fget_biov[0].iov_base = (char *)&fget_buf; fget_biov[0].iov_len = sizeof(fget_buf.f_sz) + cp->c_len; } else { fget_size = cp->c_len; fget_biov[0].iov_base = (char *)&fget_size; fget_biov[0].iov_len = sizeof(fget_size); } fget_biov[1].iov_base = NULL; fget_biov[1].iov_len = 0; *data = fget_biov; } return (err); } static void fget_done(struct iovec *data) { /* nothing needs to be freed */ } static int fget_len_result(struct iovec **data) { return (fget_result(data, 0)); } static int fget_val_result(struct iovec **data) { return (fget_result(data, 1)); } static struct op_info fgetlen_info = { .op_start = fget_start, .op_data = fget_data, .op_result = fget_len_result, .op_done = fget_done }; static struct op_info fgetval_info = { 
.op_start = fget_start, .op_data = fget_data, .op_result = fget_val_result, .op_done = fget_done }; static struct req_info { int req_error; u_int req_count; uint32_t req_size; uint32_t req_type; uint32_t req_txid; struct op_info *req_op; int resp_error; int resp_count; int resp_size; int resp_off; struct iovec *resp_biov; } rinfo; static void fwctl_response_done(void) { (*rinfo.req_op->op_done)(rinfo.resp_biov); /* reinit the req data struct */ memset(&rinfo, 0, sizeof(rinfo)); } static void fwctl_request_done(void) { rinfo.resp_error = (*rinfo.req_op->op_result)(&rinfo.resp_biov); /* XXX only a single vector supported at the moment */ rinfo.resp_off = 0; if (rinfo.resp_biov == NULL) { rinfo.resp_size = 0; } else { rinfo.resp_size = rinfo.resp_biov[0].iov_len; } } static int fwctl_request_start(void) { int err; /* Data size doesn't include header */ rinfo.req_size -= 12; rinfo.req_op = &errop_info; if (rinfo.req_type <= OP_MAX && ops[rinfo.req_type] != NULL) rinfo.req_op = ops[rinfo.req_type]; err = (*rinfo.req_op->op_start)(rinfo.req_size); if (err) { errop_set(err); rinfo.req_op = &errop_info; } /* Catch case of zero-length message here */ if (rinfo.req_size == 0) { fwctl_request_done(); return (1); } return (0); } static int fwctl_request_data(uint32_t value) { int remlen; /* Make sure remaining size is >= 0 */ rinfo.req_size -= sizeof(uint32_t); - remlen = (rinfo.req_size > 0) ? rinfo.req_size: 0; + remlen = MAX(rinfo.req_size, 0); (*rinfo.req_op->op_data)(value, remlen); if (rinfo.req_size < sizeof(uint32_t)) { fwctl_request_done(); return (1); } return (0); } static int fwctl_request(uint32_t value) { int ret; ret = 0; switch (rinfo.req_count) { case 0: /* Verify size */ if (value < 12) { printf("msg size error"); exit(1); } rinfo.req_size = value; rinfo.req_count = 1; break; case 1: rinfo.req_type = value; rinfo.req_count++; break; case 2: rinfo.req_txid = value; rinfo.req_count++; ret = fwctl_request_start(); break; default: ret = fwctl_request_data(value); break; } return (ret); } static int fwctl_response(uint32_t *retval) { uint32_t *dp; int remlen; switch(rinfo.resp_count) { case 0: /* 4 x u32 header len + data */ *retval = 4*sizeof(uint32_t) + roundup(rinfo.resp_size, sizeof(uint32_t)); rinfo.resp_count++; break; case 1: *retval = rinfo.req_type; rinfo.resp_count++; break; case 2: *retval = rinfo.req_txid; rinfo.resp_count++; break; case 3: *retval = rinfo.resp_error; rinfo.resp_count++; break; default: remlen = rinfo.resp_size - rinfo.resp_off; dp = (uint32_t *) ((uint8_t *)rinfo.resp_biov->iov_base + rinfo.resp_off); if (remlen >= sizeof(uint32_t)) { *retval = *dp; } else if (remlen > 0) { *retval = fwctl_send_rest(dp, remlen); } rinfo.resp_off += sizeof(uint32_t); break; } if (rinfo.resp_count > 3 && rinfo.resp_size - rinfo.resp_off <= 0) { fwctl_response_done(); return (1); } return (0); } /* * i/o port handling. 
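 *
 * The guest drives the protocol with 32-bit writes to port 0x510
 * (request words) and 32-bit reads from port 0x511 (response words);
 * a 16-bit write of 0 to 0x510 resets the state machine, after which
 * 8-bit reads of 0x511 return the 'BHYV' signature one byte at a
 * time. Each request starts with three words -- total size including
 * the 12-byte header, operation type, transaction id -- followed by
 * the payload; each response carries four words (length, type, txid,
 * error) followed by the result data.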
*/ static uint8_t fwctl_inb(void) { uint8_t retval; retval = 0xff; switch (be_state) { case IDENT_SEND: retval = sig[ident_idx++]; if (ident_idx >= sizeof(sig)) be_state = REQ; break; default: break; } return (retval); } static void fwctl_outw(uint16_t val) { switch (be_state) { case IDENT_WAIT: if (val == 0) { be_state = IDENT_SEND; ident_idx = 0; } break; default: /* ignore */ break; } } static uint32_t fwctl_inl(void) { uint32_t retval; switch (be_state) { case RESP: if (fwctl_response(&retval)) be_state = REQ; break; default: retval = 0xffffffff; break; } return (retval); } static void fwctl_outl(uint32_t val) { switch (be_state) { case REQ: if (fwctl_request(val)) be_state = RESP; default: break; } } static int fwctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { if (in) { if (bytes == 1) *eax = fwctl_inb(); else if (bytes == 4) *eax = fwctl_inl(); else *eax = 0xffff; } else { if (bytes == 2) fwctl_outw(*eax); else if (bytes == 4) fwctl_outl(*eax); } return (0); } INOUT_PORT(fwctl_wreg, FWCTL_OUT, IOPORT_F_INOUT, fwctl_handler); INOUT_PORT(fwctl_rreg, FWCTL_IN, IOPORT_F_IN, fwctl_handler); void fwctl_init(void) { ops[OP_GET_LEN] = &fgetlen_info; ops[OP_GET] = &fgetval_info; be_state = IDENT_WAIT; } Index: user/ngie/bsnmp_cleanup/usr.sbin/bhyve/pci_ahci.c =================================================================== --- user/ngie/bsnmp_cleanup/usr.sbin/bhyve/pci_ahci.c (revision 298467) +++ user/ngie/bsnmp_cleanup/usr.sbin/bhyve/pci_ahci.c (revision 298468) @@ -1,2347 +1,2347 @@ /*- * Copyright (c) 2013 Zhixiang Yu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "pci_emul.h" #include "ahci.h" #include "block_if.h" #define MAX_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ enum sata_fis_type { FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ }; /* * SCSI opcodes */ #define TEST_UNIT_READY 0x00 #define REQUEST_SENSE 0x03 #define INQUIRY 0x12 #define START_STOP_UNIT 0x1B #define PREVENT_ALLOW 0x1E #define READ_CAPACITY 0x25 #define READ_10 0x28 #define POSITION_TO_ELEMENT 0x2B #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A #define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE /* * SCSI mode page codes */ #define MODEPAGE_RW_ERROR_RECOVERY 0x01 #define MODEPAGE_CD_CAPABILITIES 0x2A /* * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 #define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* * Debug printf */ #ifdef AHCI_DEBUG static FILE *dbg; #define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) #else #define DPRINTF(format, arg...) #endif #define WPRINTF(format, arg...) 
printf(format, ##arg) struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; STAILQ_ENTRY(ahci_ioreq) io_flist; TAILQ_ENTRY(ahci_ioreq) io_blist; uint8_t *cfis; uint32_t len; uint32_t done; int slot; int more; }; struct ahci_port { struct blockif_ctxt *bctx; struct pci_ahci_softc *pr_sc; uint8_t *cmd_lst; uint8_t *rfis; char ident[20 + 1]; int atapi; int reset; int waitforclear; int mult_sectors; uint8_t xfermode; uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; u_int ccs; uint32_t pending; uint32_t clb; uint32_t clbu; uint32_t fb; uint32_t fbu; uint32_t is; uint32_t ie; uint32_t cmd; uint32_t unused0; uint32_t tfd; uint32_t sig; uint32_t ssts; uint32_t sctl; uint32_t serr; uint32_t sact; uint32_t ci; uint32_t sntf; uint32_t fbs; /* * i/o request info */ struct ahci_ioreq *ioreq; int ioqsz; STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; }; struct ahci_cmd_hdr { uint16_t flags; uint16_t prdtl; uint32_t prdbc; uint64_t ctba; uint32_t reserved[4]; }; struct ahci_prdt_entry { uint64_t dba; uint32_t reserved; #define DBCMASK 0x3fffff uint32_t dbc; }; struct pci_ahci_softc { struct pci_devinst *asc_pi; pthread_mutex_t mtx; int ports; uint32_t cap; uint32_t ghc; uint32_t is; uint32_t pi; uint32_t vs; uint32_t ccc_ctl; uint32_t ccc_pts; uint32_t em_loc; uint32_t em_ctl; uint32_t cap2; uint32_t bohc; uint32_t lintr; struct ahci_port port[MAX_PORTS]; }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) static void ahci_handle_port(struct ahci_port *p); static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; buf[0] = (lba / 75) / 60; buf[1] = (lba / 75) % 60; buf[2] = lba % 75; } /* * generate HBA intr depending on whether or not ports within * the controller have an interrupt pending. */ static void ahci_generate_intr(struct pci_ahci_softc *sc) { struct pci_devinst *pi; int i; pi = sc->asc_pi; for (i = 0; i < sc->ports; i++) { struct ahci_port *pr; pr = &sc->port[i]; if (pr->is & pr->ie) sc->is |= (1 << i); } DPRINTF("%s %x\n", __func__, sc->is); if (sc->is && (sc->ghc & AHCI_GHC_IE)) { if (pci_msi_enabled(pi)) { /* * Generate an MSI interrupt on every edge */ pci_generate_msi(pi, 0); } else if (!sc->lintr) { /* * Only generate a pin-based interrupt if one wasn't * in progress */ sc->lintr = 1; pci_lintr_assert(pi); } } else if (sc->lintr) { /* * No interrupts: deassert pin-based signal if it had * been asserted */ pci_lintr_deassert(pi); sc->lintr = 0; } } static void ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) { int offset, len, irq; if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) return; switch (ft) { case FIS_TYPE_REGD2H: offset = 0x40; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; irq = (fis[1] & (1 << 6)) ? 
AHCI_P_IX_PS : 0; break; default: WPRINTF("unsupported fis type %d\n", ft); return; } if (fis[2] & ATA_S_ERROR) { p->waitforclear = 1; irq |= AHCI_P_IX_TFE; } memcpy(p->rfis + offset, fis, len); if (irq) { p->is |= irq; ahci_generate_intr(p->pr_sc); } } static void ahci_write_fis_piosetup(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_PIOSETUP; ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); } static void ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; tfd &= 0x77; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_SETDEVBITS; fis[1] = (1 << 6); fis[2] = tfd; fis[3] = error; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = slot; p->err_cfis[2] = tfd; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else { *(uint32_t *)(fis + 4) = (1 << slot); p->sact &= ~(1 << slot); } p->tfd &= ~0x77; p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } static void ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[20]; uint8_t error; error = (tfd >> 8) & 0xff; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = (1 << 6); fis[2] = tfd & 0xff; fis[3] = error; fis[4] = cfis[4]; fis[5] = cfis[5]; fis[6] = cfis[6]; fis[7] = cfis[7]; fis[8] = cfis[8]; fis[9] = cfis[9]; fis[10] = cfis[10]; fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = 0x80; p->err_cfis[2] = tfd & 0xff; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) { uint8_t fis[20]; p->tfd = ATA_S_READY | ATA_S_DSC; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = 0; /* No interrupt */ fis[2] = p->tfd; /* Status */ fis[3] = 0; /* No error */ p->ci &= ~(1 << slot); ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_reset_fis_d2h(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[3] = 1; fis[4] = 1; if (p->atapi) { fis[5] = 0x14; fis[6] = 0xeb; } fis[12] = 1; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_check_stopped(struct ahci_port *p) { /* * If we are no longer processing the command list and nothing * is in-flight, clear the running bit, the current command * slot, the command issue and active bits. */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; p->waitforclear = 0; } } } static void ahci_port_stop(struct ahci_port *p) { struct ahci_ioreq *aior; uint8_t *cfis; int slot; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); TAILQ_FOREACH(aior, &p->iobhd, io_blist) { /* * Try to cancel the outstanding blockif request. */ error = blockif_cancel(p->bctx, &aior->io_req); if (error != 0) continue; slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); /* * This command is now done. 
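 *
 * Its slot bit was cleared in either p->sact (NCQ) or p->ci
 * above; clearing p->pending below is what finally lets
 * ahci_check_stopped() clear the running state.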
*/ p->pending &= ~(1 << slot); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); } ahci_check_stopped(p); } static void ahci_port_reset(struct ahci_port *pr) { pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; pr->mult_sectors = 128; if (!pr->bctx) { pr->ssts = ATA_SS_DET_NO_DEVICE; pr->sig = 0xFFFFFFFF; pr->tfd = 0x7F; return; } pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; if (pr->sctl & ATA_SC_SPD_MASK) pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); else pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; pr->tfd |= ATA_S_READY; } else pr->sig = PxSIG_ATAPI; ahci_write_reset_fis_d2h(pr); } static void ahci_reset(struct pci_ahci_softc *sc) { int i; sc->ghc = AHCI_GHC_AE; sc->is = 0; if (sc->lintr) { pci_lintr_deassert(sc->asc_pi); sc->lintr = 0; } for (i = 0; i < sc->ports; i++) { sc->port[i].ie = 0; sc->port[i].is = 0; sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); if (sc->port[i].bctx) sc->port[i].cmd |= AHCI_P_CMD_CPS; sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } static void ata_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i ^ 1] = *src++; else dest[i ^ 1] = ' '; } } static void atapi_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i] = *src++; else dest[i] = ' '; } } /* * Build up the iovec based on the PRDT, 'done' and 'len'. */ static void ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, struct ahci_prdt_entry *prdt, uint16_t prdtl) { struct blockif_req *breq = &aior->io_req; int i, j, skip, todo, left, extra; uint32_t dbcsz; /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ skip = aior->done; left = aior->len - aior->done; todo = 0; for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; i++, prdt++) { dbcsz = (prdt->dbc & DBCMASK) + 1; /* Skip already done part of the PRDT */ if (dbcsz <= skip) { skip -= dbcsz; continue; } dbcsz -= skip; if (dbcsz > left) dbcsz = left; breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba + skip, dbcsz); breq->br_iov[j].iov_len = dbcsz; todo += dbcsz; left -= dbcsz; skip = 0; j++; } /* If we got limited by IOV length, round I/O down to sector size. 
*/ if (j == BLOCKIF_IOV_MAX) { extra = todo % blockif_sectsz(p->bctx); todo -= extra; assert(todo > 0); while (extra > 0) { if (breq->br_iov[j - 1].iov_len > extra) { breq->br_iov[j - 1].iov_len -= extra; break; } extra -= breq->br_iov[j - 1].iov_len; j--; } } breq->br_iovcnt = j; breq->br_resid = todo; aior->done += todo; aior->more = (aior->done < aior->len && i < prdtl); } static void ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; int err, first, ncq, readop; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; first = (done == 0); if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[11] << 8 | cfis[3]; if (!len) len = 65536; ncq = 1; } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[13] << 8 | cfis[12]; if (!len) len = 65536; } else { lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | (cfis[5] << 8) | cfis[4]; len = cfis[12]; if (!len) len = 256; } lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); /* Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); } static void ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_ioreq *aior; struct blockif_req *breq; int err; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = 0; aior->done = 0; aior->more = 0; breq = &aior->io_req; /* * Mark this command in-flight. 
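 *
 * The slot stays marked in p->pending until the request
 * completes or is cancelled in ahci_port_stop() above.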
*/ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_flush(p->bctx, breq); assert(err == 0); } static inline void read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; void *to; int i, len; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; to = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); - sublen = len < dbcsz ? len : dbcsz; + sublen = MIN(len, dbcsz); memcpy(to, ptr, sublen); len -= sublen; to += sublen; prdt++; } } static void ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; uint8_t *entry; uint64_t elba; uint32_t len, elen; int err, first, ncq; uint8_t buf[512]; first = (done == 0); if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { len = (uint16_t)cfis[13] << 8 | cfis[12]; len *= 512; ncq = 0; } else { /* ATA_SEND_FPDMA_QUEUED */ len = (uint16_t)cfis[11] << 8 | cfis[3]; len *= 512; ncq = 1; } read_prdt(p, slot, cfis, buf, sizeof(buf)); next: entry = &buf[done]; elba = ((uint64_t)entry[5] << 40) | ((uint64_t)entry[4] << 32) | ((uint64_t)entry[3] << 24) | ((uint64_t)entry[2] << 16) | ((uint64_t)entry[1] << 8) | entry[0]; elen = (uint16_t)entry[7] << 8 | entry[6]; done += 8; if (elen == 0) { if (done >= len) { ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); p->pending &= ~(1 << slot); ahci_check_stopped(p); if (!first) ahci_handle_port(p); return; } goto next; } /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->more = (len != done); breq = &aior->io_req; breq->br_offset = elba * blockif_sectsz(p->bctx); breq->br_resid = elen * blockif_sectsz(p->bctx); /* * Mark this command in-flight. */ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); err = blockif_delete(p->bctx, breq); assert(err == 0); } static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; void *from; int i, len; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; from = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); - sublen = len < dbcsz ? 
len : dbcsz; + sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; prdt++; } hdr->prdbc = size - len; } static void ahci_checksum(uint8_t *buf, int size) { int i; uint8_t sum = 0; for (i = 0; i < size - 1; i++) sum += buf[i]; buf[size - 1] = 0x100 - sum; } static void ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; uint8_t buf[512]; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0 || cfis[4] != 0x10 || cfis[5] != 0 || cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } memset(buf, 0, sizeof(buf)); memcpy(buf, p->err_cfis, sizeof(p->err_cfis)); ahci_checksum(buf, sizeof(buf)); if (cfis[2] == ATA_READ_LOG_EXT) ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; uint64_t sectors; int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; ro = blockif_is_ro(p->bctx); candelete = blockif_candelete(p->bctx); sectsz = blockif_sectsz(p->bctx); sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); blockif_psectsz(p->bctx, &psectsz, &psectoff); memset(buf, 0, sizeof(buf)); buf[0] = 0x0040; buf[1] = cyl; buf[3] = heads; buf[6] = sech; ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40); buf[47] = (0x8000 | 128); buf[48] = 0; buf[49] = (1 << 8 | 1 << 9 | 1 << 11); buf[50] = (1 << 14); buf[53] = (1 << 1 | 1 << 2); if (p->mult_sectors) buf[59] = (0x100 | p->mult_sectors); if (sectors <= 0x0fffffff) { buf[60] = sectors; buf[61] = (sectors >> 16); } else { buf[60] = 0xffff; buf[61] = 0x0fff; } buf[63] = 0x7; if (p->xfermode & ATA_WDMA0) buf[63] |= (1 << ((p->xfermode & 7) + 8)); buf[64] = 0x3; buf[65] = 120; buf[66] = 120; buf[67] = 120; buf[68] = 120; buf[69] = 0; buf[75] = 31; buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | ATA_SUPPORT_NCQ); buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | (p->ssts & ATA_SS_SPD_MASK) >> 3); buf[80] = 0x3f0; buf[81] = 0x28; buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); buf[84] = (1 << 14); buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); buf[87] = (1 << 14); buf[88] = 0x7f; if (p->xfermode & ATA_UDMA0) buf[88] |= (1 << ((p->xfermode & 7) + 8)); buf[100] = sectors; buf[101] = (sectors >> 16); buf[102] = (sectors >> 32); buf[103] = (sectors >> 48); if (candelete && !ro) { buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; buf[105] = 1; buf[169] = ATA_SUPPORT_DSM_TRIM; } buf[106] = 0x4000; buf[209] = 0x4000; if (psectsz > sectsz) { buf[106] |= 0x2000; buf[106] |= ffsl(psectsz / sectsz) - 1; buf[209] |= (psectoff / sectsz); } if (sectsz > 512) { buf[106] |= 0x1000; buf[117] = sectsz / 2; buf[118] = ((sectsz / 2) >> 16); } 
buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); buf[222] = 0x1020; buf[255] = 0x00a5; ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; memset(buf, 0, sizeof(buf)); buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5); ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40); buf[49] = (1 << 9 | 1 << 8); buf[50] = (1 << 14 | 1); buf[53] = (1 << 2 | 1 << 1); buf[62] = 0x3f; buf[63] = 7; if (p->xfermode & ATA_WDMA0) buf[63] |= (1 << ((p->xfermode & 7) + 8)); buf[64] = 3; buf[65] = 120; buf[66] = 120; buf[67] = 120; buf[68] = 120; buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3); buf[78] = (1 << 5); buf[80] = 0x3f0; buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[83] = (1 << 14); buf[84] = (1 << 14); buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[87] = (1 << 14); buf[88] = 0x7f; if (p->xfermode & ATA_UDMA0) buf[88] |= (1 << ((p->xfermode & 7) + 8)); buf[222] = 0x1020; buf[255] = 0x00a5; ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[36]; uint8_t *acmd; int len; uint32_t tfd; acmd = cfis + 0x40; if (acmd[1] & 1) { /* VPD */ if (acmd[2] == 0) { /* Supported VPD pages */ buf[0] = 0x05; buf[1] = 0; buf[2] = 0; buf[3] = 1; buf[4] = 0; len = 4 + buf[3]; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } } else { buf[0] = 0x05; buf[1] = 0x80; buf[2] = 0x00; buf[3] = 0x21; buf[4] = 31; buf[5] = 0; buf[6] = 0; buf[7] = 0; atapi_string(buf + 8, "BHYVE", 8); atapi_string(buf + 16, "BHYVE DVD-ROM", 16); atapi_string(buf + 32, "001", 4); len = sizeof(buf); } if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, len); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[8]; uint64_t sectors; sectors = blockif_size(p->bctx) / 2048; be32enc(buf, sectors - 1); be32enc(buf + 4, 2048); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint8_t format; int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); format = acmd[9] >> 6; switch (format) { case 0: { int msf, size; uint64_t sectors; uint8_t start_track, buf[20], *bp; msf = (acmd[1] >> 1) & 1; start_track = acmd[6]; if (start_track > 1 && start_track != 0xaa) { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = 
static void
atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t *acmd;
	uint8_t format;
	int len;

	acmd = cfis + 0x40;

	len = be16dec(acmd + 7);
	format = acmd[9] >> 6;
	switch (format) {
	case 0:
	{
		int msf, size;
		uint64_t sectors;
		uint8_t start_track, buf[20], *bp;

		msf = (acmd[1] >> 1) & 1;
		start_track = acmd[6];
		if (start_track > 1 && start_track != 0xaa) {
			uint32_t tfd;

			p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
			p->asc = 0x24;
			tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
			cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
			ahci_write_fis_d2h(p, slot, cfis, tfd);
			return;
		}
		bp = buf + 2;
		*bp++ = 1;
		*bp++ = 1;
		if (start_track <= 1) {
			*bp++ = 0;
			*bp++ = 0x14;
			*bp++ = 1;
			*bp++ = 0;
			if (msf) {
				*bp++ = 0;
				lba_to_msf(bp, 0);
				bp += 3;
			} else {
				*bp++ = 0;
				*bp++ = 0;
				*bp++ = 0;
				*bp++ = 0;
			}
		}
		*bp++ = 0;
		*bp++ = 0x14;
		*bp++ = 0xaa;
		*bp++ = 0;
		sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
		sectors >>= 2;
		if (msf) {
			*bp++ = 0;
			lba_to_msf(bp, sectors);
			bp += 3;
		} else {
			be32enc(bp, sectors);
			bp += 4;
		}
		size = bp - buf;
		be16enc(buf, size - 2);
		if (len > size)
			len = size;
		write_prdt(p, slot, cfis, buf, len);
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	}
	case 1:
	{
		uint8_t buf[12];

		memset(buf, 0, sizeof(buf));
		buf[1] = 0xa;
		buf[2] = 0x1;
		buf[3] = 0x1;
		if (len > sizeof(buf))
			len = sizeof(buf);
		write_prdt(p, slot, cfis, buf, len);
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	}
	case 2:
	{
		int msf, size;
		uint64_t sectors;
		uint8_t *bp, buf[50];

		msf = (acmd[1] >> 1) & 1;
		bp = buf + 2;
		*bp++ = 1;
		*bp++ = 1;
		*bp++ = 1;
		*bp++ = 0x14;
		*bp++ = 0;
		*bp++ = 0xa0;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 1;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 1;
		*bp++ = 0x14;
		*bp++ = 0;
		*bp++ = 0xa1;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 1;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 1;
		*bp++ = 0x14;
		*bp++ = 0;
		*bp++ = 0xa2;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 0;
		sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
		sectors >>= 2;
		if (msf) {
			*bp++ = 0;
			lba_to_msf(bp, sectors);
			bp += 3;
		} else {
			be32enc(bp, sectors);
			bp += 4;
		}
		*bp++ = 1;
		*bp++ = 0x14;
		*bp++ = 0;
		*bp++ = 1;
		*bp++ = 0;
		*bp++ = 0;
		*bp++ = 0;
		if (msf) {
			*bp++ = 0;
			lba_to_msf(bp, 0);
			bp += 3;
		} else {
			*bp++ = 0;
			*bp++ = 0;
			*bp++ = 0;
			*bp++ = 0;
		}
		size = bp - buf;
		be16enc(buf, size - 2);
		if (len > size)
			len = size;
		write_prdt(p, slot, cfis, buf, len);
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	}
	default:
	{
		uint32_t tfd;

		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x24;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, tfd);
		break;
	}
	}
}

static void
atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t buf[16];

	memset(buf, 0, sizeof(buf));
	buf[3] = 8;

	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
	write_prdt(p, slot, cfis, buf, sizeof(buf));
	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
}
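/*
 * READ(10)/READ(12): convert the CDB LBA and transfer length (both in
 * 2048-byte CD-ROM sectors) into byte units and queue the transfer to
 * the blockif backend.  'done' is the byte count already transferred;
 * the completion callback passes it back in so a request can be
 * continued where the previous chunk left off.
 */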
static void
atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done)
{
	struct ahci_ioreq *aior;
	struct ahci_cmd_hdr *hdr;
	struct ahci_prdt_entry *prdt;
	struct blockif_req *breq;
	uint8_t *acmd;
	uint64_t lba;
	uint32_t len;
	int err;

	acmd = cfis + 0x40;
	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);

	lba = be32dec(acmd + 2);
	if (acmd[0] == READ_10)
		len = be16dec(acmd + 7);
	else
		len = be32dec(acmd + 6);
	if (len == 0) {
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
	}
	lba *= 2048;
	len *= 2048;

	/*
	 * Pull request off free list
	 */
	aior = STAILQ_FIRST(&p->iofhd);
	assert(aior != NULL);
	STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
	aior->cfis = cfis;
	aior->slot = slot;
	aior->len = len;
	aior->done = done;
	breq = &aior->io_req;
	breq->br_offset = lba + done;
	ahci_build_iov(p, aior, prdt, hdr->prdtl);

	/* Mark this command in-flight. */
	p->pending |= 1 << slot;

	/* Stuff request onto busy list. */
	TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);

	err = blockif_read(p->bctx, breq);
	assert(err == 0);
}

static void
atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t buf[64];
	uint8_t *acmd;
	int len;

	acmd = cfis + 0x40;
	len = acmd[4];
	if (len > sizeof(buf))
		len = sizeof(buf);
	memset(buf, 0, len);
	buf[0] = 0x70 | (1 << 7);
	buf[2] = p->sense_key;
	buf[7] = 10;
	buf[12] = p->asc;
	write_prdt(p, slot, cfis, buf, len);
	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
	ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
}

static void
atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t *acmd = cfis + 0x40;
	uint32_t tfd;

	switch (acmd[4] & 3) {
	case 0:
	case 1:
	case 3:
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		tfd = ATA_S_READY | ATA_S_DSC;
		break;
	case 2:
		/* TODO eject media */
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x53;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
		break;
	}
	ahci_write_fis_d2h(p, slot, cfis, tfd);
}

static void
atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t *acmd;
	uint32_t tfd;
	uint8_t pc, code;
	int len;

	acmd = cfis + 0x40;
	len = be16dec(acmd + 7);
	pc = acmd[2] >> 6;
	code = acmd[2] & 0x3f;

	switch (pc) {
	case 0:
		switch (code) {
		case MODEPAGE_RW_ERROR_RECOVERY:
		{
			uint8_t buf[16];

			if (len > sizeof(buf))
				len = sizeof(buf);

			memset(buf, 0, sizeof(buf));
			be16enc(buf, 16 - 2);
			buf[2] = 0x70;
			buf[8] = 0x01;
			buf[9] = 16 - 10;
			buf[11] = 0x05;
			write_prdt(p, slot, cfis, buf, len);
			tfd = ATA_S_READY | ATA_S_DSC;
			break;
		}
		case MODEPAGE_CD_CAPABILITIES:
		{
			uint8_t buf[30];

			if (len > sizeof(buf))
				len = sizeof(buf);

			memset(buf, 0, sizeof(buf));
			be16enc(buf, 30 - 2);
			buf[2] = 0x70;
			buf[8] = 0x2A;
			buf[9] = 30 - 10;
			buf[10] = 0x08;
			buf[12] = 0x71;
			be16enc(&buf[18], 2);
			be16enc(&buf[20], 512);
			write_prdt(p, slot, cfis, buf, len);
			tfd = ATA_S_READY | ATA_S_DSC;
			break;
		}
		default:
			goto error;
			break;
		}
		break;
	case 3:
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x39;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
		break;
error:
	case 1:
	case 2:
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x24;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
		break;
	}
	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
	ahci_write_fis_d2h(p, slot, cfis, tfd);
}

static void
atapi_get_event_status_notification(struct ahci_port *p, int slot,
    uint8_t *cfis)
{
	uint8_t *acmd;
	uint32_t tfd;

	acmd = cfis + 0x40;

	/* we don't support asynchronous operation */
	if (!(acmd[1] & 1)) {
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x24;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
	} else {
		uint8_t buf[8];
		int len;

		len = be16dec(acmd + 7);
		if (len > sizeof(buf))
			len = sizeof(buf);

		memset(buf, 0, sizeof(buf));
		be16enc(buf, 8 - 2);
		buf[2] = 0x04;
		buf[3] = 0x10;
		buf[5] = 0x02;
		write_prdt(p, slot, cfis, buf, len);
		tfd = ATA_S_READY | ATA_S_DSC;
	}
	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
	ahci_write_fis_d2h(p, slot, cfis, tfd);
}
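/*
 * Dispatch an ATAPI PACKET command.  The SCSI CDB is located at offset
 * 0x40 within the command FIS.  Unsupported opcodes are failed with
 * ILLEGAL REQUEST sense and asc 0x20 (invalid command operation code).
 */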
static void
handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
{
	uint8_t *acmd;

	acmd = cfis + 0x40;

#ifdef AHCI_DEBUG
	{
		int i;

		DPRINTF("ACMD:");
		for (i = 0; i < 16; i++)
			DPRINTF("%02x ", acmd[i]);
		DPRINTF("\n");
	}
#endif

	switch (acmd[0]) {
	case TEST_UNIT_READY:
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	case INQUIRY:
		atapi_inquiry(p, slot, cfis);
		break;
	case READ_CAPACITY:
		atapi_read_capacity(p, slot, cfis);
		break;
	case PREVENT_ALLOW:
		/* TODO */
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	case READ_TOC:
		atapi_read_toc(p, slot, cfis);
		break;
	case REPORT_LUNS:
		atapi_report_luns(p, slot, cfis);
		break;
	case READ_10:
	case READ_12:
		atapi_read(p, slot, cfis, 0);
		break;
	case REQUEST_SENSE:
		atapi_request_sense(p, slot, cfis);
		break;
	case START_STOP_UNIT:
		atapi_start_stop_unit(p, slot, cfis);
		break;
	case MODE_SENSE_10:
		atapi_mode_sense(p, slot, cfis);
		break;
	case GET_EVENT_STATUS_NOTIFICATION:
		atapi_get_event_status_notification(p, slot, cfis);
		break;
	default:
		cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x20;
		ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) |
		    ATA_S_READY | ATA_S_ERROR);
		break;
	}
}

static void
ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
{
	p->tfd |= ATA_S_BUSY;
	switch (cfis[2]) {
	case ATA_ATA_IDENTIFY:
		handle_identify(p, slot, cfis);
		break;
	case ATA_SETFEATURES:
	{
		switch (cfis[3]) {
		case ATA_SF_ENAB_SATA_SF:
			switch (cfis[12]) {
			case ATA_SATA_SF_AN:
				p->tfd = ATA_S_DSC | ATA_S_READY;
				break;
			default:
				p->tfd = ATA_S_ERROR | ATA_S_READY;
				p->tfd |= (ATA_ERROR_ABORT << 8);
				break;
			}
			break;
		case ATA_SF_ENAB_WCACHE:
		case ATA_SF_DIS_WCACHE:
		case ATA_SF_ENAB_RCACHE:
		case ATA_SF_DIS_RCACHE:
			p->tfd = ATA_S_DSC | ATA_S_READY;
			break;
		case ATA_SF_SETXFER:
		{
			switch (cfis[12] & 0xf8) {
			case ATA_PIO:
			case ATA_PIO0:
				break;
			case ATA_WDMA0:
			case ATA_UDMA0:
				p->xfermode = (cfis[12] & 0x7);
				break;
			}
			p->tfd = ATA_S_DSC | ATA_S_READY;
			break;
		}
		default:
			p->tfd = ATA_S_ERROR | ATA_S_READY;
			p->tfd |= (ATA_ERROR_ABORT << 8);
			break;
		}
		ahci_write_fis_d2h(p, slot, cfis, p->tfd);
		break;
	}
	case ATA_SET_MULTI:
		if (cfis[12] != 0 &&
		    (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) {
			p->tfd = ATA_S_ERROR | ATA_S_READY;
			p->tfd |= (ATA_ERROR_ABORT << 8);
		} else {
			p->mult_sectors = cfis[12];
			p->tfd = ATA_S_DSC | ATA_S_READY;
		}
		ahci_write_fis_d2h(p, slot, cfis, p->tfd);
		break;
	case ATA_READ:
	case ATA_WRITE:
	case ATA_READ48:
	case ATA_WRITE48:
	case ATA_READ_MUL:
	case ATA_WRITE_MUL:
	case ATA_READ_MUL48:
	case ATA_WRITE_MUL48:
	case ATA_READ_DMA:
	case ATA_WRITE_DMA:
	case ATA_READ_DMA48:
	case ATA_WRITE_DMA48:
	case ATA_READ_FPDMA_QUEUED:
	case ATA_WRITE_FPDMA_QUEUED:
		ahci_handle_rw(p, slot, cfis, 0);
		break;
	case ATA_FLUSHCACHE:
	case ATA_FLUSHCACHE48:
		ahci_handle_flush(p, slot, cfis);
		break;
	case ATA_DATA_SET_MANAGEMENT:
		if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM &&
		    cfis[13] == 0 && cfis[12] == 1) {
			ahci_handle_dsm_trim(p, slot, cfis, 0);
			break;
		}
		ahci_write_fis_d2h(p, slot, cfis,
		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
		break;
	case ATA_SEND_FPDMA_QUEUED:
		if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM &&
		    cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM &&
		    cfis[11] == 0 && cfis[13] == 1) {
			ahci_handle_dsm_trim(p, slot, cfis, 0);
			break;
		}
		ahci_write_fis_d2h(p, slot, cfis,
		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
		break;
	case ATA_READ_LOG_EXT:
	case ATA_READ_LOG_DMA_EXT:
		ahci_handle_read_log(p, slot, cfis);
		break;
	case ATA_SECURITY_FREEZE_LOCK:
	case ATA_SMART_CMD:
	case ATA_NOP:
		ahci_write_fis_d2h(p, slot, cfis,
		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
		break;
	case ATA_CHECK_POWER_MODE:
		cfis[12] = 0xff;	/* always on */
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	case ATA_STANDBY_CMD:
	case ATA_STANDBY_IMMEDIATE:
	case ATA_IDLE_CMD:
	case ATA_IDLE_IMMEDIATE:
	case ATA_SLEEP:
	case ATA_READ_VERIFY:
	case ATA_READ_VERIFY48:
		ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
		break;
	case ATA_ATAPI_IDENTIFY:
		handle_atapi_identify(p, slot, cfis);
		break;
	case ATA_PACKET_CMD:
		if (!p->atapi) {
			ahci_write_fis_d2h(p, slot, cfis,
			    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
		} else
			handle_packet_cmd(p, slot, cfis);
		break;
	default:
		WPRINTF("Unsupported cmd:%02x\n", cfis[2]);
		ahci_write_fis_d2h(p, slot, cfis,
		    (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR);
		break;
	}
}
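/*
 * Process a single command slot: map the command table from guest
 * memory, check that it contains a host-to-device register FIS, and
 * either dispatch it as an ATA/ATAPI command or handle it as a
 * control FIS used for the port reset handshake.
 */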
static void
ahci_handle_slot(struct ahci_port *p, int slot)
{
	struct ahci_cmd_hdr *hdr;
	struct ahci_prdt_entry *prdt;
	struct pci_ahci_softc *sc;
	uint8_t *cfis;
	int cfl;
#ifdef AHCI_DEBUG
	int i;	/* used only by the debug dump below */
#endif

	sc = p->pr_sc;
	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
	cfl = (hdr->flags & 0x1f) * 4;
	cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba,
	    0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry));
	prdt = (struct ahci_prdt_entry *)(cfis + 0x80);

#ifdef AHCI_DEBUG
	DPRINTF("\ncfis:");
	for (i = 0; i < cfl; i++) {
		if (i % 10 == 0)
			DPRINTF("\n");
		DPRINTF("%02x ", cfis[i]);
	}
	DPRINTF("\n");

	for (i = 0; i < hdr->prdtl; i++) {
		DPRINTF("%d@%08"PRIx64"\n",
		    prdt->dbc & 0x3fffff, prdt->dba);
		prdt++;
	}
#endif

	if (cfis[0] != FIS_TYPE_REGH2D) {
		WPRINTF("Not a H2D FIS:%02x\n", cfis[0]);
		return;
	}

	if (cfis[1] & 0x80) {
		ahci_handle_cmd(p, slot, cfis);
	} else {
		if (cfis[15] & (1 << 2))
			p->reset = 1;
		else if (p->reset) {
			p->reset = 0;
			ahci_port_reset(p);
		}
		p->ci &= ~(1 << slot);
	}
}

static void
ahci_handle_port(struct ahci_port *p)
{

	if (!(p->cmd & AHCI_P_CMD_ST))
		return;

	/*
	 * Search for any new commands to issue ignoring those that
	 * are already in-flight.  Stop if device is busy or in error.
	 */
	for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) {
		if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0)
			break;
		if (p->waitforclear)
			break;
		if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) {
			p->cmd &= ~AHCI_P_CMD_CCS_MASK;
			p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT;
			ahci_handle_slot(p, p->ccs);
		}
	}
}

/*
 * blockif callback routine - this runs in the context of the blockif
 * i/o thread, so the mutex needs to be acquired.
 */
static void
ata_ioreq_cb(struct blockif_req *br, int err)
{
	struct ahci_cmd_hdr *hdr;
	struct ahci_ioreq *aior;
	struct ahci_port *p;
	struct pci_ahci_softc *sc;
	uint32_t tfd;
	uint8_t *cfis;
	int slot, ncq, dsm;

	DPRINTF("%s %d\n", __func__, err);

	ncq = dsm = 0;
	aior = br->br_param;
	p = aior->io_pr;
	cfis = aior->cfis;
	slot = aior->slot;
	sc = p->pr_sc;
	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);

	if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
	    cfis[2] == ATA_READ_FPDMA_QUEUED ||
	    cfis[2] == ATA_SEND_FPDMA_QUEUED)
		ncq = 1;
	if (cfis[2] == ATA_DATA_SET_MANAGEMENT ||
	    (cfis[2] == ATA_SEND_FPDMA_QUEUED &&
	     (cfis[13] & 0x1f) == ATA_SFPDMA_DSM))
		dsm = 1;

	pthread_mutex_lock(&sc->mtx);

	/*
	 * Delete the blockif request from the busy list
	 */
	TAILQ_REMOVE(&p->iobhd, aior, io_blist);

	/*
	 * Move the blockif request back to the free list
	 */
	STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);

	if (!err)
		hdr->prdbc = aior->done;

	if (!err && aior->more) {
		if (dsm)
			ahci_handle_dsm_trim(p, slot, cfis, aior->done);
		else
			ahci_handle_rw(p, slot, cfis, aior->done);
		goto out;
	}

	if (!err)
		tfd = ATA_S_READY | ATA_S_DSC;
	else
		tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
	if (ncq)
		ahci_write_fis_sdb(p, slot, cfis, tfd);
	else
		ahci_write_fis_d2h(p, slot, cfis, tfd);

	/*
	 * This command is now complete.
	 */
	p->pending &= ~(1 << slot);

	ahci_check_stopped(p);
	ahci_handle_port(p);
out:
	pthread_mutex_unlock(&sc->mtx);
	DPRINTF("%s exit\n", __func__);
}
static void
atapi_ioreq_cb(struct blockif_req *br, int err)
{
	struct ahci_cmd_hdr *hdr;
	struct ahci_ioreq *aior;
	struct ahci_port *p;
	struct pci_ahci_softc *sc;
	uint8_t *cfis;
	uint32_t tfd;
	int slot;

	DPRINTF("%s %d\n", __func__, err);

	aior = br->br_param;
	p = aior->io_pr;
	cfis = aior->cfis;
	slot = aior->slot;
	sc = p->pr_sc;
	hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE);

	pthread_mutex_lock(&sc->mtx);

	/*
	 * Delete the blockif request from the busy list
	 */
	TAILQ_REMOVE(&p->iobhd, aior, io_blist);

	/*
	 * Move the blockif request back to the free list
	 */
	STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);

	if (!err)
		hdr->prdbc = aior->done;

	if (!err && aior->more) {
		atapi_read(p, slot, cfis, aior->done);
		goto out;
	}

	if (!err) {
		tfd = ATA_S_READY | ATA_S_DSC;
	} else {
		p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
		p->asc = 0x21;
		tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
	}
	cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
	ahci_write_fis_d2h(p, slot, cfis, tfd);

	/*
	 * This command is now complete.
	 */
	p->pending &= ~(1 << slot);

	ahci_check_stopped(p);
	ahci_handle_port(p);
out:
	pthread_mutex_unlock(&sc->mtx);
	DPRINTF("%s exit\n", __func__);
}

static void
pci_ahci_ioreq_init(struct ahci_port *pr)
{
	struct ahci_ioreq *vr;
	int i;

	pr->ioqsz = blockif_queuesz(pr->bctx);
	pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq));
	STAILQ_INIT(&pr->iofhd);

	/*
	 * Add all i/o request entries to the free queue
	 */
	for (i = 0; i < pr->ioqsz; i++) {
		vr = &pr->ioreq[i];
		vr->io_pr = pr;
		if (!pr->atapi)
			vr->io_req.br_callback = ata_ioreq_cb;
		else
			vr->io_req.br_callback = atapi_ioreq_cb;
		vr->io_req.br_param = vr;
		STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist);
	}

	TAILQ_INIT(&pr->iobhd);
}

static void
pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
{
	int port = (offset - AHCI_OFFSET) / AHCI_STEP;
	offset = (offset - AHCI_OFFSET) % AHCI_STEP;
	struct ahci_port *p = &sc->port[port];

	DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"\n",
	    port, offset, value);

	switch (offset) {
	case AHCI_P_CLB:
		p->clb = value;
		break;
	case AHCI_P_CLBU:
		p->clbu = value;
		break;
	case AHCI_P_FB:
		p->fb = value;
		break;
	case AHCI_P_FBU:
		p->fbu = value;
		break;
	case AHCI_P_IS:
		p->is &= ~value;
		break;
	case AHCI_P_IE:
		p->ie = value & 0xFDC000FF;
		ahci_generate_intr(sc);
		break;
	case AHCI_P_CMD:
	{
		p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD |
		    AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE |
		    AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE |
		    AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK);
		p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD |
		    AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE |
		    AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE |
		    AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value;

		if (!(value & AHCI_P_CMD_ST)) {
			ahci_port_stop(p);
		} else {
			uint64_t clb;

			p->cmd |= AHCI_P_CMD_CR;
			clb = (uint64_t)p->clbu << 32 | p->clb;
			p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb,
			    AHCI_CL_SIZE * AHCI_MAX_SLOTS);
		}

		if (value & AHCI_P_CMD_FRE) {
			uint64_t fb;

			p->cmd |= AHCI_P_CMD_FR;
			fb = (uint64_t)p->fbu << 32 | p->fb;
			/* we don't support FBSCP, so rfis size is 256Bytes */
			p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256);
		} else {
			p->cmd &= ~AHCI_P_CMD_FR;
		}

		if (value & AHCI_P_CMD_CLO) {
			p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ);
			p->cmd &= ~AHCI_P_CMD_CLO;
		}

		if (value & AHCI_P_CMD_ICC_MASK) {
			p->cmd &= ~AHCI_P_CMD_ICC_MASK;
		}

		ahci_handle_port(p);
		break;
	}
	case AHCI_P_TFD:
	case AHCI_P_SIG:
	case AHCI_P_SSTS:
		WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n",
		    offset);
		break;
	case AHCI_P_SCTL:
		p->sctl = value;
		if (!(p->cmd & AHCI_P_CMD_ST)) {
			if (value & ATA_SC_DET_RESET)
				ahci_port_reset(p);
		}
		break;
	case AHCI_P_SERR:
		p->serr &= ~value;
		break;
	case AHCI_P_SACT:
		p->sact |= value;
		break;
	case AHCI_P_CI:
		p->ci |= value;
		ahci_handle_port(p);
		break;
	case AHCI_P_SNTF:
	case AHCI_P_FBS:
	default:
		break;
	}
}
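/*
 * Writes to the HBA global registers.  Only GHC (controller reset,
 * interrupt enable) and IS (write-1-to-clear interrupt status) have
 * side effects here; CAP, PI, VS and CAP2 are read only.
 */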
static void
pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
{
	DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"\n",
	    offset, value);

	switch (offset) {
	case AHCI_CAP:
	case AHCI_PI:
	case AHCI_VS:
	case AHCI_CAP2:
		DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n",
		    offset);
		break;
	case AHCI_GHC:
		if (value & AHCI_GHC_HR)
			ahci_reset(sc);
		else if (value & AHCI_GHC_IE) {
			sc->ghc |= AHCI_GHC_IE;
			ahci_generate_intr(sc);
		}
		break;
	case AHCI_IS:
		sc->is &= ~value;
		ahci_generate_intr(sc);
		break;
	default:
		break;
	}
}

static void
pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_ahci_softc *sc = pi->pi_arg;

	assert(baridx == 5);
	assert((offset % 4) == 0 && size == 4);

	pthread_mutex_lock(&sc->mtx);

	if (offset < AHCI_OFFSET)
		pci_ahci_host_write(sc, offset, value);
	else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP)
		pci_ahci_port_write(sc, offset, value);
	else
		WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"\n",
		    offset);

	pthread_mutex_unlock(&sc->mtx);
}

static uint64_t
pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset)
{
	uint32_t value;

	switch (offset) {
	case AHCI_CAP:
	case AHCI_GHC:
	case AHCI_IS:
	case AHCI_PI:
	case AHCI_VS:
	case AHCI_CCCC:
	case AHCI_CCCP:
	case AHCI_EM_LOC:
	case AHCI_EM_CTL:
	case AHCI_CAP2:
	{
		uint32_t *p = &sc->cap;
		p += (offset - AHCI_CAP) / sizeof(uint32_t);
		value = *p;
		break;
	}
	default:
		value = 0;
		break;
	}
	DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x\n",
	    offset, value);

	return (value);
}

static uint64_t
pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset)
{
	uint32_t value;
	int port = (offset - AHCI_OFFSET) / AHCI_STEP;
	offset = (offset - AHCI_OFFSET) % AHCI_STEP;

	switch (offset) {
	case AHCI_P_CLB:
	case AHCI_P_CLBU:
	case AHCI_P_FB:
	case AHCI_P_FBU:
	case AHCI_P_IS:
	case AHCI_P_IE:
	case AHCI_P_CMD:
	case AHCI_P_TFD:
	case AHCI_P_SIG:
	case AHCI_P_SSTS:
	case AHCI_P_SCTL:
	case AHCI_P_SERR:
	case AHCI_P_SACT:
	case AHCI_P_CI:
	case AHCI_P_SNTF:
	case AHCI_P_FBS:
	{
		uint32_t *p = &sc->port[port].clb;
		p += (offset - AHCI_P_CLB) / sizeof(uint32_t);
		value = *p;
		break;
	}
	default:
		value = 0;
		break;
	}

	DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x\n",
	    port, offset, value);

	return (value);
}

static uint64_t
pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t regoff, int size)
{
	struct pci_ahci_softc *sc = pi->pi_arg;
	uint64_t offset;
	uint32_t value;

	assert(baridx == 5);
	assert(size == 1 || size == 2 || size == 4);
	assert((regoff & (size - 1)) == 0);

	pthread_mutex_lock(&sc->mtx);

	offset = regoff & ~0x3;	    /* round down to a multiple of 4 bytes */
	if (offset < AHCI_OFFSET)
		value = pci_ahci_host_read(sc, offset);
	else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP)
		value = pci_ahci_port_read(sc, offset);
	else {
		value = 0;
		WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n",
		    regoff);
	}
	value >>= 8 * (regoff & 0x3);

	pthread_mutex_unlock(&sc->mtx);

	return (value);
}
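/*
 * Common initialization for the ahci-hd and ahci-cd device models.
 * The controller advertises itself as an Intel ICH8 AHCI HBA
 * (vendor 0x8086, device 0x2821) with only port 0 implemented and
 * backed by the image file named in 'opts'.
 */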
static int
pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
{
	char bident[sizeof("XX:X:X")];
	struct blockif_ctxt *bctxt;
	struct pci_ahci_softc *sc;
	int ret, slots;
	MD5_CTX mdctx;
	u_char digest[16];

	ret = 0;

	if (opts == NULL) {
		fprintf(stderr, "pci_ahci: backing device required\n");
		return (1);
	}

#ifdef AHCI_DEBUG
	dbg = fopen("/tmp/log", "w+");
#endif

	sc = calloc(1, sizeof(struct pci_ahci_softc));
	pi->pi_arg = sc;
	sc->asc_pi = pi;
	sc->ports = MAX_PORTS;

	/*
	 * Only use port 0 for a backing device. All other ports will be
	 * marked as unused
	 */
	sc->port[0].atapi = atapi;

	/*
	 * Attempt to open the backing image. Use the PCI
	 * slot/func for the identifier string.
	 */
	snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func);
	bctxt = blockif_open(opts, bident);
	if (bctxt == NULL) {
		ret = 1;
		goto open_fail;
	}
	sc->port[0].bctx = bctxt;
	sc->port[0].pr_sc = sc;

	/*
	 * Create an identifier for the backing file. Use parts of the
	 * md5 sum of the filename
	 */
	MD5Init(&mdctx);
	MD5Update(&mdctx, opts, strlen(opts));
	MD5Final(digest, &mdctx);
	sprintf(sc->port[0].ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X",
	    digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);

	/*
	 * Allocate blockif request structures and add them
	 * to the free list
	 */
	pci_ahci_ioreq_init(&sc->port[0]);

	pthread_mutex_init(&sc->mtx, NULL);

	/* Intel ICH8 AHCI */
	slots = sc->port[0].ioqsz;
	if (slots > 32)
		slots = 32;
	--slots;
	sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF |
	    AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP |
	    AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT) |
	    AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC |
	    (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1);

	/* Only port 0 implemented */
	sc->pi = 1;
	sc->vs = 0x10300;
	sc->cap2 = AHCI_CAP2_APST;
	ahci_reset(sc);

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA);
	pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0);
	pci_emul_add_msicap(pi, 1);
	pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32,
	    AHCI_OFFSET + sc->ports * AHCI_STEP);

	pci_lintr_request(pi);

open_fail:
	if (ret) {
		if (sc->port[0].bctx != NULL)
			blockif_close(sc->port[0].bctx);
		free(sc);
	}

	return (ret);
}

static int
pci_ahci_hd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{

	return (pci_ahci_init(ctx, pi, opts, 0));
}

static int
pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{

	return (pci_ahci_init(ctx, pi, opts, 1));
}

/*
 * Use separate emulation names to distinguish drive and atapi devices
 */
struct pci_devemu pci_de_ahci_hd = {
	.pe_emu = "ahci-hd",
	.pe_init = pci_ahci_hd_init,
	.pe_barwrite = pci_ahci_write,
	.pe_barread = pci_ahci_read
};
PCI_EMUL_SET(pci_de_ahci_hd);

struct pci_devemu pci_de_ahci_cd = {
	.pe_emu = "ahci-cd",
	.pe_init = pci_ahci_atapi_init,
	.pe_barwrite = pci_ahci_write,
	.pe_barread = pci_ahci_read
};
PCI_EMUL_SET(pci_de_ahci_cd);

Index: user/ngie/bsnmp_cleanup
===================================================================
--- user/ngie/bsnmp_cleanup	(revision 298467)
+++ user/ngie/bsnmp_cleanup	(revision 298468)

Property changes on: user/ngie/bsnmp_cleanup
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r298453-298467