Index: projects/building-blocks/sbin/ipfw/nat.c =================================================================== --- projects/building-blocks/sbin/ipfw/nat.c (revision 277723) +++ projects/building-blocks/sbin/ipfw/nat.c (revision 277724) @@ -1,1115 +1,1115 @@ /* * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * NEW command line interface for IP firewall facility * * $FreeBSD$ * * In-kernel nat support */ #include #include #include #include "ipfw2.h" #include #include #include #include #include #include #include #include #include #include #include /* def. of struct route */ #include #include #include #include typedef int (nat_cb_t)(struct nat44_cfg_nat *cfg, void *arg); static void nat_show_cfg(struct nat44_cfg_nat *n, void *arg); static void nat_show_log(struct nat44_cfg_nat *n, void *arg); static int nat_show_data(struct nat44_cfg_nat *cfg, void *arg); static int natname_cmp(const void *a, const void *b); static int nat_foreach(nat_cb_t *f, void *arg, int sort); static int nat_get_cmd(char *name, uint16_t cmd, ipfw_obj_header **ooh); static struct _s_x nat_params[] = { { "ip", TOK_IP }, { "if", TOK_IF }, { "log", TOK_ALOG }, { "deny_in", TOK_DENY_INC }, { "same_ports", TOK_SAME_PORTS }, { "unreg_only", TOK_UNREG_ONLY }, { "skip_global", TOK_SKIP_GLOBAL }, { "reset", TOK_RESET_ADDR }, { "reverse", TOK_ALIAS_REV }, { "proxy_only", TOK_PROXY_ONLY }, { "redirect_addr", TOK_REDIR_ADDR }, { "redirect_port", TOK_REDIR_PORT }, { "redirect_proto", TOK_REDIR_PROTO }, { NULL, 0 } /* terminator */ }; /* * Search for interface with name "ifn", and fill n accordingly: * * n->ip ip address of interface "ifn" * n->if_name copy of interface name "ifn" */ static void set_addr_dynamic(const char *ifn, struct nat44_cfg_nat *n) { size_t needed; int mib[6]; char *buf, *lim, *next; struct if_msghdr *ifm; struct ifa_msghdr *ifam; struct sockaddr_dl *sdl; struct sockaddr_in *sin; int ifIndex, ifMTU; mib[0] = CTL_NET; mib[1] = PF_ROUTE; mib[2] = 0; mib[3] = AF_INET; mib[4] = NET_RT_IFLIST; mib[5] = 0; /* * Get interface data. */ if (sysctl(mib, 6, NULL, &needed, NULL, 0) == -1) err(1, "iflist-sysctl-estimate"); buf = safe_calloc(1, needed); if (sysctl(mib, 6, buf, &needed, NULL, 0) == -1) err(1, "iflist-sysctl-get"); lim = buf + needed; /* * Loop through interfaces until one with * given name is found. This is done to * find correct interface index for routing * message processing. */ ifIndex = 0; next = buf; while (next < lim) { ifm = (struct if_msghdr *)next; next += ifm->ifm_msglen; if (ifm->ifm_version != RTM_VERSION) { if (co.verbose) warnx("routing message version %d " "not understood", ifm->ifm_version); continue; } if (ifm->ifm_type == RTM_IFINFO) { sdl = (struct sockaddr_dl *)(ifm + 1); if (strlen(ifn) == sdl->sdl_nlen && strncmp(ifn, sdl->sdl_data, sdl->sdl_nlen) == 0) { ifIndex = ifm->ifm_index; ifMTU = ifm->ifm_data.ifi_mtu; break; } } } if (!ifIndex) errx(1, "unknown interface name %s", ifn); /* * Get interface address. 
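This is not what set_addr_dynamic() does (it walks the NET_RT_IFLIST routing sysctl shown above so it can also learn the interface index and MTU), but a minimal userland sketch of the same address lookup, including the INADDR_ANY fallback this revision introduces when the interface has no IPv4 address yet. The interface name "em0" is only an example.

#include <sys/types.h>
#include <sys/socket.h>
#include <ifaddrs.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>

/* Return the first IPv4 address bound to "ifn", or INADDR_ANY if none. */
static struct in_addr
first_inet_addr(const char *ifn)
{
    struct ifaddrs *ifap, *ifa;
    struct in_addr ip;

    ip.s_addr = htonl(INADDR_ANY);
    if (getifaddrs(&ifap) != 0)
        return (ip);
    for (ifa = ifap; ifa != NULL; ifa = ifa->ifa_next) {
        if (ifa->ifa_addr == NULL ||
            ifa->ifa_addr->sa_family != AF_INET ||
            strcmp(ifa->ifa_name, ifn) != 0)
            continue;
        ip = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr;
        break;
    }
    freeifaddrs(ifap);
    return (ip);
}

int
main(void)
{
    struct in_addr a = first_inet_addr("em0");  /* example interface name */

    printf("%s\n", inet_ntoa(a));
    return (0);
}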
*/ sin = NULL; while (next < lim) { ifam = (struct ifa_msghdr *)next; next += ifam->ifam_msglen; if (ifam->ifam_version != RTM_VERSION) { if (co.verbose) warnx("routing message version %d " "not understood", ifam->ifam_version); continue; } if (ifam->ifam_type != RTM_NEWADDR) break; if (ifam->ifam_addrs & RTA_IFA) { int i; char *cp = (char *)(ifam + 1); for (i = 1; i < RTA_IFA; i <<= 1) { if (ifam->ifam_addrs & i) cp += SA_SIZE((struct sockaddr *)cp); } if (((struct sockaddr *)cp)->sa_family == AF_INET) { sin = (struct sockaddr_in *)cp; break; } } } if (sin == NULL) - errx(1, "%s: cannot get interface address", ifn); - - n->ip = sin->sin_addr; + n->ip.s_addr = htonl(INADDR_ANY); + else + n->ip = sin->sin_addr; strncpy(n->if_name, ifn, IF_NAMESIZE); free(buf); } /* * XXX - The following functions, macros and definitions come from natd.c: * it would be better to move them outside natd.c, in a file * (redirect_support.[ch]?) shared by ipfw and natd, but for now i can live * with it. */ /* * Definition of a port range, and macros to deal with values. * FORMAT: HI 16-bits == first port in range, 0 == all ports. * LO 16-bits == number of ports in range * NOTES: - Port values are not stored in network byte order. */ #define port_range u_long #define GETLOPORT(x) ((x) >> 0x10) #define GETNUMPORTS(x) ((x) & 0x0000ffff) #define GETHIPORT(x) (GETLOPORT((x)) + GETNUMPORTS((x))) /* Set y to be the low-port value in port_range variable x. */ #define SETLOPORT(x,y) ((x) = ((x) & 0x0000ffff) | ((y) << 0x10)) /* Set y to be the number of ports in port_range variable x. */ #define SETNUMPORTS(x,y) ((x) = ((x) & 0xffff0000) | (y)) static void StrToAddr (const char* str, struct in_addr* addr) { struct hostent* hp; if (inet_aton (str, addr)) return; hp = gethostbyname (str); if (!hp) errx (1, "unknown host %s", str); memcpy (addr, hp->h_addr, sizeof (struct in_addr)); } static int StrToPortRange (const char* str, const char* proto, port_range *portRange) { char* sep; struct servent* sp; char* end; u_short loPort; u_short hiPort; /* First see if this is a service, return corresponding port if so. */ sp = getservbyname (str,proto); if (sp) { SETLOPORT(*portRange, ntohs(sp->s_port)); SETNUMPORTS(*portRange, 1); return 0; } /* Not a service, see if it's a single port or port range. */ sep = strchr (str, '-'); if (sep == NULL) { SETLOPORT(*portRange, strtol(str, &end, 10)); if (end != str) { /* Single port. */ SETNUMPORTS(*portRange, 1); return 0; } /* Error in port range field. */ errx (EX_DATAERR, "%s/%s: unknown service", str, proto); } /* Port range, get the values and sanity check. */ sscanf (str, "%hu-%hu", &loPort, &hiPort); SETLOPORT(*portRange, loPort); SETNUMPORTS(*portRange, 0); /* Error by default */ if (loPort <= hiPort) SETNUMPORTS(*portRange, hiPort - loPort + 1); if (GETNUMPORTS(*portRange) == 0) errx (EX_DATAERR, "invalid port range %s", str); return 0; } static int StrToProto (const char* str) { if (!strcmp (str, "tcp")) return IPPROTO_TCP; if (!strcmp (str, "udp")) return IPPROTO_UDP; if (!strcmp (str, "sctp")) return IPPROTO_SCTP; errx (EX_DATAERR, "unknown protocol %s. Expected sctp, tcp or udp", str); } static int StrToAddrAndPortRange (const char* str, struct in_addr* addr, char* proto, port_range *portRange) { char* ptr; ptr = strchr (str, ':'); if (!ptr) errx (EX_DATAERR, "%s is missing port number", str); *ptr = '\0'; ++ptr; StrToAddr (str, addr); return StrToPortRange (ptr, proto, portRange); } /* End of stuff taken from natd.c. 
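The port_range encoding defined above keeps the first port in the high 16 bits and the port count in the low 16 bits, in host byte order. The standalone program below repeats those macro definitions so the arithmetic can be checked in isolation; the driver is purely illustrative.

#include <stdio.h>

typedef unsigned long port_range;       /* same layout as the u_long above */

#define GETLOPORT(x)     ((x) >> 0x10)
#define GETNUMPORTS(x)   ((x) & 0x0000ffff)
#define GETHIPORT(x)     (GETLOPORT((x)) + GETNUMPORTS((x)))
#define SETLOPORT(x,y)   ((x) = ((x) & 0x0000ffff) | ((y) << 0x10))
#define SETNUMPORTS(x,y) ((x) = ((x) & 0xffff0000) | (y))

int
main(void)
{
    port_range pr = 0;

    /* Encode the range 8000-8099: first port 8000, 100 ports. */
    SETLOPORT(pr, 8000);
    SETNUMPORTS(pr, 100);
    printf("lo=%lu count=%lu hi=%lu\n",
        GETLOPORT(pr), GETNUMPORTS(pr), GETHIPORT(pr));
    /* Prints: lo=8000 count=100 hi=8100 (hi is one past the last port). */
    return (0);
}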
*/ /* * The next 3 functions add support for the addr, port and proto redirect and * their logic is loosely based on SetupAddressRedirect(), SetupPortRedirect() * and SetupProtoRedirect() from natd.c. * * Every setup_* function fills at least one redirect entry * (struct nat44_cfg_redir) and zero or more server pool entry * (struct nat44_cfg_spool) in buf. * * The format of data in buf is: * * nat44_cfg_nat nat44_cfg_redir nat44_cfg_spool ...... nat44_cfg_spool * * ------------------------------------- ------------ * | | .....X ..... | | | | ..... * ------------------------------------- ...... ------------ * ^ * spool_cnt n=0 ...... n=(X-1) * * len points to the amount of available space in buf * space counts the memory consumed by every function * * XXX - Every function get all the argv params so it * has to check, in optional parameters, that the next * args is a valid option for the redir entry and not * another token. Only redir_port and redir_proto are * affected by this. */ static int estimate_redir_addr(int *ac, char ***av) { size_t space = sizeof(struct nat44_cfg_redir); char *sep = **av; u_int c = 0; (void)ac; /* UNUSED */ while ((sep = strchr(sep, ',')) != NULL) { c++; sep++; } if (c > 0) c++; space += c * sizeof(struct nat44_cfg_spool); return (space); } static int setup_redir_addr(char *buf, int *ac, char ***av) { struct nat44_cfg_redir *r; char *sep; size_t space; r = (struct nat44_cfg_redir *)buf; r->mode = REDIR_ADDR; /* Skip nat44_cfg_redir at beginning of buf. */ buf = &buf[sizeof(struct nat44_cfg_redir)]; space = sizeof(struct nat44_cfg_redir); /* Extract local address. */ if (strchr(**av, ',') != NULL) { struct nat44_cfg_spool *spool; /* Setup LSNAT server pool. */ r->laddr.s_addr = INADDR_NONE; sep = strtok(**av, ","); while (sep != NULL) { spool = (struct nat44_cfg_spool *)buf; space += sizeof(struct nat44_cfg_spool); StrToAddr(sep, &spool->addr); spool->port = ~0; r->spool_cnt++; /* Point to the next possible nat44_cfg_spool. */ buf = &buf[sizeof(struct nat44_cfg_spool)]; sep = strtok(NULL, ","); } } else StrToAddr(**av, &r->laddr); (*av)++; (*ac)--; /* Extract public address. */ StrToAddr(**av, &r->paddr); (*av)++; (*ac)--; return (space); } static int estimate_redir_port(int *ac, char ***av) { size_t space = sizeof(struct nat44_cfg_redir); char *sep = **av; u_int c = 0; (void)ac; /* UNUSED */ while ((sep = strchr(sep, ',')) != NULL) { c++; sep++; } if (c > 0) c++; space += c * sizeof(struct nat44_cfg_spool); return (space); } static int setup_redir_port(char *buf, int *ac, char ***av) { struct nat44_cfg_redir *r; char *sep, *protoName, *lsnat = NULL; size_t space; u_short numLocalPorts; port_range portRange; numLocalPorts = 0; r = (struct nat44_cfg_redir *)buf; r->mode = REDIR_PORT; /* Skip nat44_cfg_redir at beginning of buf. */ buf = &buf[sizeof(struct nat44_cfg_redir)]; space = sizeof(struct nat44_cfg_redir); /* * Extract protocol. */ r->proto = StrToProto(**av); protoName = **av; (*av)++; (*ac)--; /* * Extract local address. */ if (strchr(**av, ',') != NULL) { r->laddr.s_addr = INADDR_NONE; r->lport = ~0; numLocalPorts = 1; lsnat = **av; } else { /* * The sctp nat does not allow the port numbers to be mapped to * new port numbers. Therefore, no ports are to be specified * in the target port field. 
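The sizing rule described above (one nat44_cfg_redir, plus one nat44_cfg_spool per server in an LSNAT list such as "a,b,c") can be exercised on its own. The sketch below mirrors estimate_redir_addr(); the struct sizes are placeholders, not the kernel's real sizes.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

#define SIZEOF_REDIR    48      /* placeholder for sizeof(struct nat44_cfg_redir) */
#define SIZEOF_SPOOL    8       /* placeholder for sizeof(struct nat44_cfg_spool) */

static size_t
estimate_space(const char *laddr_arg)
{
    size_t space = SIZEOF_REDIR;
    const char *sep = laddr_arg;
    unsigned c = 0;

    while ((sep = strchr(sep, ',')) != NULL) {
        c++;
        sep++;
    }
    if (c > 0)
        c++;                    /* N commas => N+1 server pool entries */
    return (space + c * SIZEOF_SPOOL);
}

int
main(void)
{
    printf("%zu\n", estimate_space("10.0.0.1"));            /* 48: no pool      */
    printf("%zu\n", estimate_space("10.0.0.1,10.0.0.2"));   /* 48 + 2*8 = 64    */
    return (0);
}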
*/ if (r->proto == IPPROTO_SCTP) { if (strchr(**av, ':')) errx(EX_DATAERR, "redirect_port:" "port numbers do not change in sctp, so do " "not specify them as part of the target"); else StrToAddr(**av, &r->laddr); } else { if (StrToAddrAndPortRange(**av, &r->laddr, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port: " "invalid local port range"); r->lport = GETLOPORT(portRange); numLocalPorts = GETNUMPORTS(portRange); } } (*av)++; (*ac)--; /* * Extract public port and optionally address. */ if (strchr(**av, ':') != NULL) { if (StrToAddrAndPortRange(**av, &r->paddr, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port: " "invalid public port range"); } else { r->paddr.s_addr = INADDR_ANY; if (StrToPortRange(**av, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port: " "invalid public port range"); } r->pport = GETLOPORT(portRange); if (r->proto == IPPROTO_SCTP) { /* so the logic below still works */ numLocalPorts = GETNUMPORTS(portRange); r->lport = r->pport; } r->pport_cnt = GETNUMPORTS(portRange); (*av)++; (*ac)--; /* * Extract remote address and optionally port. */ /* * NB: isdigit(**av) => we've to check that next parameter is really an * option for this redirect entry, else stop here processing arg[cv]. */ if (*ac != 0 && isdigit(***av)) { if (strchr(**av, ':') != NULL) { if (StrToAddrAndPortRange(**av, &r->raddr, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port: " "invalid remote port range"); } else { SETLOPORT(portRange, 0); SETNUMPORTS(portRange, 1); StrToAddr(**av, &r->raddr); } (*av)++; (*ac)--; } else { SETLOPORT(portRange, 0); SETNUMPORTS(portRange, 1); r->raddr.s_addr = INADDR_ANY; } r->rport = GETLOPORT(portRange); r->rport_cnt = GETNUMPORTS(portRange); /* * Make sure port ranges match up, then add the redirect ports. */ if (numLocalPorts != r->pport_cnt) errx(EX_DATAERR, "redirect_port: " "port ranges must be equal in size"); /* Remote port range is allowed to be '0' which means all ports. */ if (r->rport_cnt != numLocalPorts && (r->rport_cnt != 1 || r->rport != 0)) errx(EX_DATAERR, "redirect_port: remote port must" "be 0 or equal to local port range in size"); /* Setup LSNAT server pool. */ if (lsnat != NULL) { struct nat44_cfg_spool *spool; sep = strtok(lsnat, ","); while (sep != NULL) { spool = (struct nat44_cfg_spool *)buf; space += sizeof(struct nat44_cfg_spool); /* * The sctp nat does not allow the port numbers to * be mapped to new port numbers. Therefore, no ports * are to be specified in the target port field. */ if (r->proto == IPPROTO_SCTP) { if (strchr (sep, ':')) { errx(EX_DATAERR, "redirect_port:" "port numbers do not change in " "sctp, so do not specify them as " "part of the target"); } else { StrToAddr(sep, &spool->addr); spool->port = r->pport; } } else { if (StrToAddrAndPortRange(sep, &spool->addr, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port:" "invalid local port range"); if (GETNUMPORTS(portRange) != 1) errx(EX_DATAERR, "redirect_port: " "local port must be single in " "this context"); spool->port = GETLOPORT(portRange); } r->spool_cnt++; /* Point to the next possible nat44_cfg_spool. */ buf = &buf[sizeof(struct nat44_cfg_spool)]; sep = strtok(NULL, ","); } } return (space); } static int setup_redir_proto(char *buf, int *ac, char ***av) { struct nat44_cfg_redir *r; struct protoent *protoent; size_t space; r = (struct nat44_cfg_redir *)buf; r->mode = REDIR_PROTO; /* Skip nat44_cfg_redir at beginning of buf. 
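The range-size constraints that setup_redir_port() enforces further down (local and public ranges equal in size, remote range either matching or the single wildcard port 0) reduce to a small predicate; a sketch, with made-up counts:

#include <stdio.h>

static int
ranges_ok(unsigned lcount, unsigned pcount, unsigned rcount, unsigned rport)
{
    if (lcount != pcount)
        return (0);     /* "port ranges must be equal in size" */
    if (rcount != lcount && !(rcount == 1 && rport == 0))
        return (0);     /* remote must be 0 or match the local range */
    return (1);
}

int
main(void)
{
    printf("%d\n", ranges_ok(100, 100, 1, 0));      /* 1: remote wildcard   */
    printf("%d\n", ranges_ok(100, 100, 100, 5000)); /* 1: matching remote   */
    printf("%d\n", ranges_ok(100, 50, 1, 0));       /* 0: local != public   */
    return (0);
}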
*/ buf = &buf[sizeof(struct nat44_cfg_redir)]; space = sizeof(struct nat44_cfg_redir); /* * Extract protocol. */ protoent = getprotobyname(**av); if (protoent == NULL) errx(EX_DATAERR, "redirect_proto: unknown protocol %s", **av); else r->proto = protoent->p_proto; (*av)++; (*ac)--; /* * Extract local address. */ StrToAddr(**av, &r->laddr); (*av)++; (*ac)--; /* * Extract optional public address. */ if (*ac == 0) { r->paddr.s_addr = INADDR_ANY; r->raddr.s_addr = INADDR_ANY; } else { /* see above in setup_redir_port() */ if (isdigit(***av)) { StrToAddr(**av, &r->paddr); (*av)++; (*ac)--; /* * Extract optional remote address. */ /* see above in setup_redir_port() */ if (*ac != 0 && isdigit(***av)) { StrToAddr(**av, &r->raddr); (*av)++; (*ac)--; } } } return (space); } static void nat_show_log(struct nat44_cfg_nat *n, void *arg) { char *buf; buf = (char *)(n + 1); if (buf[0] != '\0') printf("nat %s: %s\n", n->name, buf); } static void nat_show_cfg(struct nat44_cfg_nat *n, void *arg) { int i, cnt, flag, off; struct nat44_cfg_redir *t; struct nat44_cfg_spool *s; caddr_t buf; struct protoent *p; buf = (caddr_t)n; flag = 1; off = sizeof(*n); printf("ipfw nat %s config", n->name); if (strlen(n->if_name) != 0) printf(" if %s", n->if_name); else if (n->ip.s_addr != 0) printf(" ip %s", inet_ntoa(n->ip)); while (n->mode != 0) { if (n->mode & PKT_ALIAS_LOG) { printf(" log"); n->mode &= ~PKT_ALIAS_LOG; } else if (n->mode & PKT_ALIAS_DENY_INCOMING) { printf(" deny_in"); n->mode &= ~PKT_ALIAS_DENY_INCOMING; } else if (n->mode & PKT_ALIAS_SAME_PORTS) { printf(" same_ports"); n->mode &= ~PKT_ALIAS_SAME_PORTS; } else if (n->mode & PKT_ALIAS_SKIP_GLOBAL) { printf(" skip_global"); n->mode &= ~PKT_ALIAS_SKIP_GLOBAL; } else if (n->mode & PKT_ALIAS_UNREGISTERED_ONLY) { printf(" unreg_only"); n->mode &= ~PKT_ALIAS_UNREGISTERED_ONLY; } else if (n->mode & PKT_ALIAS_RESET_ON_ADDR_CHANGE) { printf(" reset"); n->mode &= ~PKT_ALIAS_RESET_ON_ADDR_CHANGE; } else if (n->mode & PKT_ALIAS_REVERSE) { printf(" reverse"); n->mode &= ~PKT_ALIAS_REVERSE; } else if (n->mode & PKT_ALIAS_PROXY_ONLY) { printf(" proxy_only"); n->mode &= ~PKT_ALIAS_PROXY_ONLY; } } /* Print all the redirect's data configuration. 
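nat_show_cfg() below prints one keyword per mode bit by testing and clearing flags one at a time. A compact table-driven illustration of the same idea, using placeholder flag values rather than libalias's real PKT_ALIAS_* constants:

#include <stdio.h>

#define FLAG_LOG        0x01    /* placeholder values */
#define FLAG_DENY_IN    0x02
#define FLAG_SAME_PORTS 0x04

static const struct {
    unsigned    flag;
    const char  *name;
} flagnames[] = {
    { FLAG_LOG,        "log" },
    { FLAG_DENY_IN,    "deny_in" },
    { FLAG_SAME_PORTS, "same_ports" },
};

int
main(void)
{
    unsigned mode = FLAG_LOG | FLAG_SAME_PORTS;
    size_t i;

    printf("ipfw nat 123 config");
    for (i = 0; i < sizeof(flagnames) / sizeof(flagnames[0]); i++)
        if (mode & flagnames[i].flag)
            printf(" %s", flagnames[i].name);
    printf("\n");       /* -> "ipfw nat 123 config log same_ports" */
    return (0);
}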
*/ for (cnt = 0; cnt < n->redir_cnt; cnt++) { t = (struct nat44_cfg_redir *)&buf[off]; off += sizeof(struct nat44_cfg_redir); switch (t->mode) { case REDIR_ADDR: printf(" redirect_addr"); if (t->spool_cnt == 0) printf(" %s", inet_ntoa(t->laddr)); else for (i = 0; i < t->spool_cnt; i++) { s = (struct nat44_cfg_spool *)&buf[off]; if (i) printf(","); else printf(" "); printf("%s", inet_ntoa(s->addr)); off += sizeof(struct nat44_cfg_spool); } printf(" %s", inet_ntoa(t->paddr)); break; case REDIR_PORT: p = getprotobynumber(t->proto); printf(" redirect_port %s ", p->p_name); if (!t->spool_cnt) { printf("%s:%u", inet_ntoa(t->laddr), t->lport); if (t->pport_cnt > 1) printf("-%u", t->lport + t->pport_cnt - 1); } else for (i=0; i < t->spool_cnt; i++) { s = (struct nat44_cfg_spool *)&buf[off]; if (i) printf(","); printf("%s:%u", inet_ntoa(s->addr), s->port); off += sizeof(struct nat44_cfg_spool); } printf(" "); if (t->paddr.s_addr) printf("%s:", inet_ntoa(t->paddr)); printf("%u", t->pport); if (!t->spool_cnt && t->pport_cnt > 1) printf("-%u", t->pport + t->pport_cnt - 1); if (t->raddr.s_addr) { printf(" %s", inet_ntoa(t->raddr)); if (t->rport) { printf(":%u", t->rport); if (!t->spool_cnt && t->rport_cnt > 1) printf("-%u", t->rport + t->rport_cnt - 1); } } break; case REDIR_PROTO: p = getprotobynumber(t->proto); printf(" redirect_proto %s %s", p->p_name, inet_ntoa(t->laddr)); if (t->paddr.s_addr != 0) { printf(" %s", inet_ntoa(t->paddr)); if (t->raddr.s_addr) printf(" %s", inet_ntoa(t->raddr)); } break; default: errx(EX_DATAERR, "unknown redir mode"); break; } } printf("\n"); } void ipfw_config_nat(int ac, char **av) { ipfw_obj_header *oh; struct nat44_cfg_nat *n; /* Nat instance configuration. */ int i, off, tok, ac1; char *id, *buf, **av1, *end; size_t len; av++; ac--; /* Nat id. */ if (ac == 0) errx(EX_DATAERR, "missing nat id"); id = *av; i = (int)strtol(id, &end, 0); if (i <= 0 || *end != '\0') errx(EX_DATAERR, "illegal nat id: %s", id); av++; ac--; if (ac == 0) errx(EX_DATAERR, "missing option"); len = sizeof(*oh) + sizeof(*n); ac1 = ac; av1 = av; while (ac1 > 0) { tok = match_token(nat_params, *av1); ac1--; av1++; switch (tok) { case TOK_IP: case TOK_IF: ac1--; av1++; break; case TOK_ALOG: case TOK_DENY_INC: case TOK_SAME_PORTS: case TOK_SKIP_GLOBAL: case TOK_UNREG_ONLY: case TOK_RESET_ADDR: case TOK_ALIAS_REV: case TOK_PROXY_ONLY: break; case TOK_REDIR_ADDR: if (ac1 < 2) errx(EX_DATAERR, "redirect_addr: " "not enough arguments"); len += estimate_redir_addr(&ac1, &av1); av1 += 2; ac1 -= 2; break; case TOK_REDIR_PORT: if (ac1 < 3) errx(EX_DATAERR, "redirect_port: " "not enough arguments"); av1++; ac1--; len += estimate_redir_port(&ac1, &av1); av1 += 2; ac1 -= 2; /* Skip optional remoteIP/port */ if (ac1 != 0 && isdigit(**av1)) { av1++; ac1--; } break; case TOK_REDIR_PROTO: if (ac1 < 2) errx(EX_DATAERR, "redirect_proto: " "not enough arguments"); len += sizeof(struct nat44_cfg_redir); av1 += 2; ac1 -= 2; /* Skip optional remoteIP/port */ if (ac1 != 0 && isdigit(**av1)) { av1++; ac1--; } if (ac1 != 0 && isdigit(**av1)) { av1++; ac1--; } break; default: errx(EX_DATAERR, "unrecognised option ``%s''", av1[-1]); } } if ((buf = malloc(len)) == NULL) errx(EX_OSERR, "malloc failed"); /* Offset in buf: save space for header at the beginning. 
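ipfw_config_nat() above uses a two-pass layout: the first argument walk only estimates the total buffer length, then each setup_* function writes its records at &buf[off] and returns the space it consumed so the offset can advance. A minimal sketch of that pattern with a made-up record type:

#include <stdio.h>
#include <stdlib.h>

struct rec {                    /* stand-in for nat44_cfg_redir */
    int value;
};

static size_t
setup_one(char *buf, int value) /* analogous to setup_redir_addr() etc. */
{
    struct rec *r = (struct rec *)buf;

    r->value = value;
    return (sizeof(*r));        /* caller adds this to its offset */
}

int
main(void)
{
    const int input[] = { 3, 1, 4 };
    const size_t n = sizeof(input) / sizeof(input[0]);
    size_t len, off, i;
    char *buf;

    len = n * sizeof(struct rec);           /* pass 1: size only */
    if ((buf = calloc(1, len)) == NULL)
        return (1);
    for (off = 0, i = 0; i < n; i++)        /* pass 2: fill      */
        off += setup_one(&buf[off], input[i]);
    printf("used %zu of %zu bytes\n", off, len);
    free(buf);
    return (0);
}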
*/ off = sizeof(*oh) + sizeof(*n); memset(buf, 0, len); oh = (ipfw_obj_header *)buf; n = (struct nat44_cfg_nat *)(oh + 1); oh->ntlv.head.length = sizeof(oh->ntlv); snprintf(oh->ntlv.name, sizeof(oh->ntlv.name), "%d", i); snprintf(n->name, sizeof(n->name), "%d", i); while (ac > 0) { tok = match_token(nat_params, *av); ac--; av++; switch (tok) { case TOK_IP: if (ac == 0) errx(EX_DATAERR, "missing option"); if (!inet_aton(av[0], &(n->ip))) errx(EX_DATAERR, "bad ip address ``%s''", av[0]); ac--; av++; break; case TOK_IF: if (ac == 0) errx(EX_DATAERR, "missing option"); set_addr_dynamic(av[0], n); ac--; av++; break; case TOK_ALOG: n->mode |= PKT_ALIAS_LOG; break; case TOK_DENY_INC: n->mode |= PKT_ALIAS_DENY_INCOMING; break; case TOK_SAME_PORTS: n->mode |= PKT_ALIAS_SAME_PORTS; break; case TOK_UNREG_ONLY: n->mode |= PKT_ALIAS_UNREGISTERED_ONLY; break; case TOK_SKIP_GLOBAL: n->mode |= PKT_ALIAS_SKIP_GLOBAL; break; case TOK_RESET_ADDR: n->mode |= PKT_ALIAS_RESET_ON_ADDR_CHANGE; break; case TOK_ALIAS_REV: n->mode |= PKT_ALIAS_REVERSE; break; case TOK_PROXY_ONLY: n->mode |= PKT_ALIAS_PROXY_ONLY; break; /* * All the setup_redir_* functions work directly in * the final buffer, see above for details. */ case TOK_REDIR_ADDR: case TOK_REDIR_PORT: case TOK_REDIR_PROTO: switch (tok) { case TOK_REDIR_ADDR: i = setup_redir_addr(&buf[off], &ac, &av); break; case TOK_REDIR_PORT: i = setup_redir_port(&buf[off], &ac, &av); break; case TOK_REDIR_PROTO: i = setup_redir_proto(&buf[off], &ac, &av); break; } n->redir_cnt++; off += i; break; } } i = do_set3(IP_FW_NAT44_XCONFIG, &oh->opheader, len); if (i != 0) err(1, "setsockopt(%s)", "IP_FW_NAT44_XCONFIG"); if (!co.do_quiet) { /* After every modification, we show the resultant rule. */ int _ac = 3; const char *_av[] = {"show", "config", id}; ipfw_show_nat(_ac, (char **)(void *)_av); } } struct nat_list_arg { uint16_t cmd; int is_all; }; static int nat_show_data(struct nat44_cfg_nat *cfg, void *arg) { struct nat_list_arg *nla; ipfw_obj_header *oh; nla = (struct nat_list_arg *)arg; switch (nla->cmd) { case IP_FW_NAT44_XGETCONFIG: if (nat_get_cmd(cfg->name, nla->cmd, &oh) != 0) { warnx("Error getting nat instance %s info", cfg->name); break; } nat_show_cfg((struct nat44_cfg_nat *)(oh + 1), NULL); free(oh); break; case IP_FW_NAT44_XGETLOG: if (nat_get_cmd(cfg->name, nla->cmd, &oh) == 0) { nat_show_log((struct nat44_cfg_nat *)(oh + 1), NULL); free(oh); break; } /* Handle error */ if (nla->is_all != 0 && errno == ENOENT) break; warn("Error getting nat instance %s info", cfg->name); break; } return (0); } /* * Compare nat names. * Honor number comparison. */ static int natname_cmp(const void *a, const void *b) { struct nat44_cfg_nat *ia, *ib; ia = (struct nat44_cfg_nat *)a; ib = (struct nat44_cfg_nat *)b; return (stringnum_cmp(ia->name, ib->name)); } /* * Retrieves nat list from kernel, * optionally sorts it and calls requested function for each table. * Returns 0 on success. 
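natname_cmp() below delegates to stringnum_cmp(), which lives elsewhere in ipfw2 and honors numeric ordering of names. An approximate standalone version of that comparison, for illustration only:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int
numname_cmp(const char *a, const char *b)
{
    char *ea, *eb;
    long na = strtol(a, &ea, 10);
    long nb = strtol(b, &eb, 10);

    /* If both names are fully numeric, compare their values. */
    if (*a != '\0' && *ea == '\0' && *b != '\0' && *eb == '\0')
        return ((na > nb) - (na < nb));
    return (strcmp(a, b));
}

int
main(void)
{
    /* Plain strcmp() would order "10" before "2"; numeric order does not. */
    printf("%d\n", numname_cmp("2", "10") < 0);     /* 1 */
    printf("%d\n", numname_cmp("abc", "abd") < 0);  /* 1 */
    return (0);
}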
*/ static int nat_foreach(nat_cb_t *f, void *arg, int sort) { ipfw_obj_lheader *olh; struct nat44_cfg_nat *cfg; size_t sz; int i, error; /* Start with reasonable default */ sz = sizeof(*olh) + 16 * sizeof(struct nat44_cfg_nat); for (;;) { if ((olh = calloc(1, sz)) == NULL) return (ENOMEM); olh->size = sz; if (do_get3(IP_FW_NAT44_LIST_NAT, &olh->opheader, &sz) != 0) { free(olh); if (errno == ENOMEM) { sz = olh->size; continue; } return (errno); } if (sort != 0) qsort(olh + 1, olh->count, olh->objsize, natname_cmp); cfg = (struct nat44_cfg_nat*)(olh + 1); for (i = 0; i < olh->count; i++) { error = f(cfg, arg); /* Ignore errors for now */ cfg = (struct nat44_cfg_nat *)((caddr_t)cfg + olh->objsize); } free(olh); break; } return (0); } static int nat_get_cmd(char *name, uint16_t cmd, ipfw_obj_header **ooh) { ipfw_obj_header *oh; struct nat44_cfg_nat *cfg; size_t sz; /* Start with reasonable default */ sz = sizeof(*oh) + sizeof(*cfg) + 128; for (;;) { if ((oh = calloc(1, sz)) == NULL) return (ENOMEM); cfg = (struct nat44_cfg_nat *)(oh + 1); oh->ntlv.head.length = sizeof(oh->ntlv); strlcpy(oh->ntlv.name, name, sizeof(oh->ntlv.name)); strlcpy(cfg->name, name, sizeof(cfg->name)); if (do_get3(cmd, &oh->opheader, &sz) != 0) { sz = cfg->size; free(oh); if (errno == ENOMEM) continue; return (errno); } *ooh = oh; break; } return (0); } void ipfw_show_nat(int ac, char **av) { ipfw_obj_header *oh; char *name; int cmd; struct nat_list_arg nla; ac--; av++; if (co.test_only) return; /* Parse parameters. */ cmd = 0; /* XXX: Change to IP_FW_NAT44_XGETLOG @ MFC */ name = NULL; for ( ; ac != 0; ac--, av++) { if (!strncmp(av[0], "config", strlen(av[0]))) { cmd = IP_FW_NAT44_XGETCONFIG; continue; } if (strcmp(av[0], "log") == 0) { cmd = IP_FW_NAT44_XGETLOG; continue; } if (name != NULL) err(EX_USAGE,"only one instance name may be specified"); name = av[0]; } if (cmd == 0) errx(EX_USAGE, "Please specify action. Available: config,log"); if (name == NULL) { memset(&nla, 0, sizeof(nla)); nla.cmd = cmd; nla.is_all = 1; nat_foreach(nat_show_data, &nla, 1); } else { if (nat_get_cmd(name, cmd, &oh) != 0) err(EX_OSERR, "Error getting nat %s instance info", name); nat_show_cfg((struct nat44_cfg_nat *)(oh + 1), NULL); free(oh); } } Index: projects/building-blocks/sbin/ipfw =================================================================== --- projects/building-blocks/sbin/ipfw (revision 277723) +++ projects/building-blocks/sbin/ipfw (revision 277724) Property changes on: projects/building-blocks/sbin/ipfw ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sbin/ipfw:r276595-277723 Index: projects/building-blocks/sbin =================================================================== --- projects/building-blocks/sbin (revision 277723) +++ projects/building-blocks/sbin (revision 277724) Property changes on: projects/building-blocks/sbin ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sbin:r277689-277723 Index: projects/building-blocks/sys/amd64/amd64/machdep.c =================================================================== --- projects/building-blocks/sys/amd64/amd64/machdep.c (revision 277723) +++ projects/building-blocks/sys/amd64/amd64/machdep.c (revision 277724) @@ -1,2808 +1,2824 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. 
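nat_foreach() and nat_get_cmd() above both use the same retry pattern: allocate a guessed buffer, issue the request, and if the kernel reports ENOMEM, retry with the size it wrote back. A self-contained sketch of that loop; query() is a hypothetical stand-in for do_get3().

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

static int
query(void *buf, size_t *sz)    /* pretends 512 bytes are needed */
{
    const size_t need = 512;

    if (*sz < need) {
        *sz = need;             /* report the required size back */
        errno = ENOMEM;
        return (-1);
    }
    memset(buf, 0, need);
    return (0);
}

int
main(void)
{
    size_t sz = 128;            /* reasonable first guess */
    void *buf;

    for (;;) {
        if ((buf = calloc(1, sz)) == NULL)
            return (1);
        if (query(buf, &sz) != 0) {
            free(buf);
            if (errno == ENOMEM)
                continue;       /* sz now holds the required size */
            return (1);
        }
        break;
    }
    printf("succeeded with %zu bytes\n", sz);
    free(buf);
    return (0);
}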
* * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_perfmon.h" #include "opt_platform.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! 
#endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #include #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_ATPIC #include #else #include #endif #include #include #include /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern u_int64_t hammer_time(u_int64_t, u_int64_t); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup(void *); static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len); static int set_fpcontext(struct thread *td, const mcontext_t *mcp, char *xfpustate, size_t xfpustate_len); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Preload data parse function */ static caddr_t native_parse_preload_data(u_int64_t); /* Native function to fetch and parse the e820 map */ static void native_parse_memmap(caddr_t, vm_paddr_t *, int *); /* Default init_ops implementation. */ struct init_ops init_ops = { .parse_preload_data = native_parse_preload_data, .early_clock_source_init = i8254_init, .early_delay = i8254_delay, .parse_memmap = native_parse_memmap, #ifdef SMP .mp_bootaddress = mp_bootaddress, .start_all_aps = native_start_all_aps, #endif .msi_init = msi_init, }; /* * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its value is * the physical address at which the kernel is loaded. */ extern char kernphys[]; struct msgbuf *msgbufp; /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; int cold = 1; long Maxmem = 0; long realmem = 0; /* * The number of PHYSMAP entries must be one less than the number of * PHYSSEG entries because the PHYSMAP entry that spans the largest * physical address that is accessible by ISA DMA is split into two * PHYSSEG entries. */ #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; struct region_descriptor r_gdt, r_idt; struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; struct mem_range_softc mem_range_softc; struct mtx dt_lock; /* lock for GDT and LDT */ void (*vmm_resume_p)(void); static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. 
*/ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count)) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_cnt.v_free_count), ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by call * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct pcb *pcb; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; char *xfpusave; size_t xfpusave_len; int sig; int oonstack; td = curthread; pcb = td->td_pcb; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); xfpusave = __builtin_alloca(xfpusave_len); } else { xfpusave_len = 0; xfpusave = NULL; } /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase; sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase; bzero(sf.sf_uc.uc_mcontext.mc_spare, sizeof(sf.sf_uc.uc_mcontext.mc_spare)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. 
*/ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size; #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_rsp - 128; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned long)sp & ~0x3Ful); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned long)sp & ~0xFul); /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */ bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ } else { /* Old FreeBSD-style arguments. */ regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) != 0)) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct pcb *pcb; struct proc *p; struct trapframe *regs; ucontext_t *ucp; char *xfpustate; size_t xfpustate_len; long rflags; int cs, error, ret; ksiginfo_t ksi; pcb = td->td_pcb; p = td->td_proc; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) { uprintf("pid %d (%s): sigreturn copyin failed\n", p->p_pid, td->td_name); return (error); } ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; rflags = ucp->uc_mcontext.mc_rflags; /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(rflags, regs->tf_rflags)) { uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, td->td_name, rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. 
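sendsig() above aligns the signal stack by clearing low address bits: ~0x3F rounds the XSAVE area down to a 64-byte boundary and ~0xF rounds the sigframe down to 16 bytes. The mask arithmetic, with an arbitrary example address:

#include <stdio.h>

int
main(void)
{
    unsigned long sp = 0x7fffffffe123UL;    /* example stack pointer */

    printf("%#lx\n", sp & ~0x3FUL); /* 0x7fffffffe100: 64-byte aligned */
    printf("%#lx\n", sp & ~0xFUL);  /* 0x7fffffffe120: 16-byte aligned */
    return (0);
}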
Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) { uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", p->p_pid, td->td_name, xfpustate_len); return (EINVAL); } xfpustate = __builtin_alloca(xfpustate_len); error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, xfpustate, xfpustate_len); if (error != 0) { uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", p->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); if (ret != 0) { uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", p->p_pid, td->td_name, ret); return (ret); } bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); set_pcb_flags(pcb, PCB_FULL_IRET); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sys_sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* Not applicable */ } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { uint64_t tsc1, tsc2; uint64_t acnt, mcnt, perf; register_t reg; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); /* * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, * DELAY(9) based logic fails. */ if (tsc_is_invariant && !tsc_perf_stat) return (EOPNOTSUPP); #ifdef SMP if (smp_cpus > 1) { /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); } #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); if (tsc_is_invariant) { wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); tsc1 = rdtsc(); DELAY(1000); mcnt = rdmsr(MSR_MPERF); acnt = rdmsr(MSR_APERF); tsc2 = rdtsc(); intr_restore(reg); perf = 1000 * acnt / mcnt; *rate = (tsc2 - tsc1) * perf; } else { tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); *rate = (tsc2 - tsc1) * 1000; } #ifdef SMP if (smp_cpus > 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) halt(); } void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. 
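cpu_est_clockrate() above measures the TSC delta across a 1 ms DELAY() and multiplies by 1000 to get cycles per second; on invariant-TSC CPUs it folds in the APERF/MPERF ratio so the estimate reflects the actual (possibly throttled) core frequency. The arithmetic, with fabricated counter values:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    uint64_t tsc1 = 100000, tsc2 = 2700000; /* 2,600,000 TSC ticks in 1 ms: 2.6 GHz TSC */
    uint64_t acnt = 1200, mcnt = 2400;      /* core ran at half the TSC rate */
    uint64_t perf, rate;

    perf = 1000 * acnt / mcnt;              /* 500: the 1000x scale and the APERF/MPERF ratio */
    rate = (tsc2 - tsc1) * perf;            /* 1,300,000,000 Hz = 1.3 GHz */
    printf("estimated rate: %ju Hz\n", (uintmax_t)rate);
    return (0);
}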
*/ SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait, 0, "Use MONITOR/MWAIT for short idle"); #define STATE_RUNNING 0x0 #define STATE_MWAIT 0x1 #define STATE_SLEEPING 0x2 static void cpu_idle_acpi(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) cpu_idle_hook(sbt); else __asm __volatile("sti; hlt"); *state = STATE_RUNNING; } static void cpu_idle_hlt(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; /* * Since we may be in a critical section from cpu_idle(), if * an interrupt fires during that critical section we may have * a pending preemption. If the CPU halts, then that thread * may not execute until a later interrupt awakens the CPU. * To handle this race, check for a runnable thread after * disabling interrupts and immediately return if one is * found. Also, we must absolutely guarentee that hlt is * the next instruction after sti. This ensures that any * interrupt that fires after the call to disable_intr() will * immediately awaken the CPU from hlt. Finally, please note * that on x86 this works fine because of interrupts enabled only * after the instruction following sti takes place, while IF is set * to 1 immediately, allowing hlt instruction to acknowledge the * interrupt. */ disable_intr(); if (sched_runnable()) enable_intr(); else __asm __volatile("sti; hlt"); *state = STATE_RUNNING; } /* * MWAIT cpu power states. Lower 4 bits are sub-states. */ #define MWAIT_C0 0xf0 #define MWAIT_C1 0x00 #define MWAIT_C2 0x10 #define MWAIT_C3 0x20 #define MWAIT_C4 0x30 static void cpu_idle_mwait(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_MWAIT; /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) { enable_intr(); *state = STATE_RUNNING; return; } cpu_monitor(state, 0, 0); if (*state == STATE_MWAIT) __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); else enable_intr(); *state = STATE_RUNNING; } static void cpu_idle_spin(sbintime_t sbt) { int *state; int i; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_RUNNING; /* * The sched_runnable() call is racy but as long as there is * a loop missing it one time will have just a little impact if any * (and it is much better than missing the check at all). */ for (i = 0; i < 1000; i++) { if (sched_runnable()) return; cpu_spinwait(); } } /* * C1E renders the local APIC timer dead, so we disable it by * reading the Interrupt Pending Message register and clearing * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). * * Reference: * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" * #32559 revision 3.00+ */ #define MSR_AMDK8_IPM 0xc0010055 #define AMDK8_SMIONCMPHALT (1ULL << 27) #define AMDK8_C1EONCMPHALT (1ULL << 28) #define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) static void cpu_probe_amdc1e(void) { /* * Detect the presence of C1E capability mostly on latest * dual-cores (or future) k8 family. */ if (cpu_vendor_id == CPU_VENDOR_AMD && (cpu_id & 0x00000f00) == 0x00000f00 && (cpu_id & 0x0fff0000) >= 0x00040000) { cpu_ident_amdc1e = 1; } } void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; void cpu_idle(int busy) { uint64_t msr; sbintime_t sbt = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); #ifdef MP_WATCHDOG ap_watchdog(PCPU_GET(cpuid)); #endif /* If we are busy - try to use fast methods. 
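The AMD C1E workaround applied a few lines below in cpu_idle() clears bits 27 (SmiOnCmpHalt) and 28 (C1eOnCmpHalt) of MSR 0xc0010055 whenever either is set, using the AMDK8_* masks defined above. The bit arithmetic, with a fabricated MSR value:

#include <stdio.h>
#include <stdint.h>

#define AMDK8_SMIONCMPHALT  (1ULL << 27)
#define AMDK8_C1EONCMPHALT  (1ULL << 28)
#define AMDK8_CMPHALT       (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)

int
main(void)
{
    uint64_t msr = 0x18001234ULL;           /* both bits happen to be set */

    if (msr & AMDK8_CMPHALT)
        msr &= ~AMDK8_CMPHALT;
    printf("%#jx\n", (uintmax_t)msr);       /* 0x1234 */
    return (0);
}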
*/ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { cpu_idle_mwait(busy); goto out; } } /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); sbt = cpu_idleclock(); } /* Apply AMD APIC timer C1E workaround. */ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) { msr = rdmsr(MSR_AMDK8_IPM); if (msr & AMDK8_CMPHALT) wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); } /* Call main idle method. */ cpu_idle_fn(sbt); /* Switch timers back into active mode. */ if (!busy) { cpu_activeclock(); critical_exit(); } out: CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } int cpu_idle_wakeup(int cpu) { struct pcpu *pcpu; int *state; pcpu = pcpu_find(cpu); state = (int *)pcpu->pc_monitorbuf; /* * This doesn't need to be atomic since missing the race will * simply result in unnecessary IPIs. */ if (*state == STATE_SLEEPING) return (0); if (*state == STATE_MWAIT) *state = STATE_RUNNING; return (1); } /* * Ordered by speed/power consumption. */ struct { void *id_fn; char *id_name; } idle_tbl[] = { { cpu_idle_spin, "spin" }, { cpu_idle_mwait, "mwait" }, { cpu_idle_hlt, "hlt" }, { cpu_idle_acpi, "acpi" }, { NULL, NULL } }; static int idle_sysctl_available(SYSCTL_HANDLER_ARGS) { char *avail, *p; int error; int i; avail = malloc(256, M_TEMP, M_WAITOK); p = avail; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; p += sprintf(p, "%s%s", p != avail ? ", " : "", idle_tbl[i].id_name); } error = sysctl_handle_string(oidp, avail, 0, req); free(avail, M_TEMP); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, idle_sysctl_available, "A", "list of available idle functions"); static int idle_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; int error; char *p; int i; p = "unknown"; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (idle_tbl[i].id_fn == cpu_idle_fn) { p = idle_tbl[i].id_name; break; } } strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; if (strcmp(idle_tbl[i].id_name, buf)) continue; cpu_idle_fn = idle_tbl[i].id_fn; return (0); } return (EINVAL); } SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, idle_sysctl, "A", "currently selected idle function"); /* * Reset registers to default values on exec. 
*/ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; mtx_lock(&dt_lock); if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); else mtx_unlock(&dt_lock); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; set_pcb_flags(pcb, PCB_FULL_IRET); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; regs->tf_rdi = stack; /* argv */ regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; td->td_retval[1] = 0; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } clear_pcb_flags(pcb, PCB_DBREGS); } /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } void cpu_setregs(void) { register_t cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the * BSP. See the comments there about why we set them. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); } /* * Initialize amd64 and configure to run kernel */ /* * Initialize segments & interrupt table */ struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[PAGE_SIZE] __aligned(16); static char nmi0_stack[PAGE_SIZE] __aligned(16); CTASSERT(sizeof(struct nmi_pcpu) == 16); struct amd64tss common_tss[MAXCPU]; /* * Software prototypes -- in more palatable form. * * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same * slots as corresponding segments for i386 kernel. 
*/ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GNULL2_SEL 1 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUFS32_SEL 2 32 bit %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS32_SEL 3 32 bit %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GUCODE32_SEL 6 32 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 32/64 bit Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 8 64 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1, .ssd_type = SDT_SYSTSS, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Actually, the TSS is a system descriptor which is double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 LDT Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 12 LDT Descriptor, double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; void setidt(idx, func, typ, dpl, ist) int idx; inthand_t *func; int typ; int dpl; int ist; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (uintptr_t)func; ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); ip->gd_ist = ist; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((uintptr_t)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), #endif #ifdef XENHVM IDTVEC(xen_intr_upcall), #endif IDTVEC(fast_syscall), IDTVEC(fast_syscall32); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. 
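setidt() above stores the low 16 bits of a handler address in gd_looffset and the rest in gd_hioffset, and db_show_idt() later reassembles them. A quick check of that split and rebuild with an arbitrary address:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    uint64_t func = 0xffffffff80a1b2c3ULL;  /* example handler address */
    uint16_t looffset = (uint16_t)func;     /* low 16 bits             */
    uint64_t hioffset = func >> 16;         /* everything above them   */
    uint64_t rebuilt = (hioffset << 16) | looffset;

    printf("match: %d\n", rebuilt == func); /* 1 */
    return (0);
}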
*/ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } /* Show privileged registers. */ DB_SHOW_COMMAND(sysregs, db_show_sysregs) { struct { uint16_t limit; uint64_t base; } __packed idtr, gdtr; uint16_t ldt, tr; __asm __volatile("sidt %0" : "=m" (idtr)); db_printf("idtr\t0x%016lx/%04x\n", (u_long)idtr.base, (u_int)idtr.limit); __asm __volatile("sgdt %0" : "=m" (gdtr)); db_printf("gdtr\t0x%016lx/%04x\n", (u_long)gdtr.base, (u_int)gdtr.limit); __asm __volatile("sldt %0" : "=r" (ldt)); db_printf("ldtr\t0x%04x\n", ldt); __asm __volatile("str %0" : "=r" (tr)); db_printf("tr\t0x%04x\n", tr); db_printf("cr0\t0x%016lx\n", rcr0()); db_printf("cr2\t0x%016lx\n", rcr2()); db_printf("cr3\t0x%016lx\n", rcr3()); db_printf("cr4\t0x%016lx\n", rcr4()); db_printf("EFER\t%016lx\n", rdmsr(MSR_EFER)); db_printf("FEATURES_CTL\t%016lx\n", rdmsr(MSR_IA32_FEATURE_CONTROL)); db_printf("DEBUG_CTL\t%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); db_printf("PAT\t%016lx\n", rdmsr(MSR_PAT)); db_printf("GSBASE\t%016lx\n", rdmsr(MSR_GSBASE)); } #endif void sdtossd(sd, ssd) struct user_segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_long = sd->sd_long; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void ssdtosd(ssd, sd) struct soft_segment_descriptor *ssd; struct user_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_long = ssd->ssd_long; sd->sd_def32 = ssd->ssd_def32; sd->sd_gran = ssd->ssd_gran; } void ssdtosyssd(ssd, sd) struct soft_segment_descriptor *ssd; struct system_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_gran = ssd->ssd_gran; } #if !defined(DEV_ATPIC) && defined(DEV_ISA) #include #include /* * Return a bitmap of the current interrupt requests. This is 8259-specific * and is only suitable for use at probe time. * This is only here to pacify sio. It is NOT FATAL if this doesn't work. * It shouldn't be here. There should probably be an APIC centric * implementation in the apic driver code, if at all. */ intrmask_t isa_irq_pending(void) { u_char irr1; u_char irr2; irr1 = inb(IO_ICU1); irr2 = inb(IO_ICU2); return ((irr2 << 8) | irr1); } #endif u_int basemem; static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) { int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. 
*/ insert_idx = physmap_idx + 2; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = physmap_idx; i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } void bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap, *smapend; smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016lx len=%016lx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) continue; if (!add_physmap_entry(smap->base, smap->length, physmap, physmap_idx)) break; } } #define efi_next_descriptor(ptr, size) \ ((struct efi_md *)(((uint8_t *) ptr) + size)) static void add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap, int *physmap_idx) { struct efi_md *map, *p; const char *type; size_t efisz; int ndesc, i; static const char *types[] = { "Reserved", "LoaderCode", "LoaderData", "BootServicesCode", "BootServicesData", "RuntimeServicesCode", "RuntimeServicesData", "ConventionalMemory", "UnusableMemory", "ACPIReclaimMemory", "ACPIMemoryNVS", "MemoryMappedIO", "MemoryMappedIOPortSpace", "PalCode" }; /* * Memory map data provided by UEFI via the GetMemoryMap * Boot Services API. */ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; map = (struct efi_md *)((uint8_t *)efihdr + efisz); if (efihdr->descriptor_size == 0) return; ndesc = efihdr->memory_size / efihdr->descriptor_size; if (boothowto & RB_VERBOSE) printf("%23s %12s %12s %8s %4s\n", "Type", "Physical", "Virtual", "#Pages", "Attr"); for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p, efihdr->descriptor_size)) { if (boothowto & RB_VERBOSE) { if (p->md_type <= EFI_MD_TYPE_PALCODE) type = types[p->md_type]; else type = ""; printf("%23s %012lx %12p %08lx ", type, p->md_phys, p->md_virt, p->md_pages); if (p->md_attr & EFI_MD_ATTR_UC) printf("UC "); if (p->md_attr & EFI_MD_ATTR_WC) printf("WC "); if (p->md_attr & EFI_MD_ATTR_WT) printf("WT "); if (p->md_attr & EFI_MD_ATTR_WB) printf("WB "); if (p->md_attr & EFI_MD_ATTR_UCE) printf("UCE "); if (p->md_attr & EFI_MD_ATTR_WP) printf("WP "); if (p->md_attr & EFI_MD_ATTR_RP) printf("RP "); if (p->md_attr & EFI_MD_ATTR_XP) printf("XP "); if (p->md_attr & EFI_MD_ATTR_RT) printf("RUNTIME"); printf("\n"); } switch (p->md_type) { case EFI_MD_TYPE_CODE: case EFI_MD_TYPE_DATA: case EFI_MD_TYPE_BS_CODE: case EFI_MD_TYPE_BS_DATA: case EFI_MD_TYPE_FREE: /* * We're allowed to use any entry with these types. 
*/ break; default: continue; } if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE), physmap, physmap_idx)) break; } } static char bootmethod[16] = ""; SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, "System firmware boot method"); static void native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap; struct efi_map_header *efihdr; u_int32_t size; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes smap. */ efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); smap = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (efihdr == NULL && smap == NULL) panic("No BIOS smap or EFI map info from loader!"); if (efihdr != NULL) { add_efi_map_entries(efihdr, physmap, physmap_idx); strlcpy(bootmethod, "UEFI", sizeof(bootmethod)); } else { size = *((u_int32_t *)smap - 1); bios_add_smap_entries(smap, size, physmap, physmap_idx); strlcpy(bootmethod, "BIOS", sizeof(bootmethod)); } } +#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE) + /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(caddr_t kmdp, u_int64_t first) { int i, physmap_idx, pa_indx, da_indx; vm_paddr_t pa, physmap[PHYSMAP_SIZE]; u_long physmem_start, physmem_tunable, memtest; pt_entry_t *pte; quad_t dcons_addr, dcons_size; + int page_counter; bzero(physmap, sizeof(physmap)); basemem = 0; physmap_idx = 0; init_ops.parse_memmap(kmdp, physmap, &physmap_idx); /* * Find the 'base memory' segment for SMP */ basemem = 0; for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] == 0x00000000) { basemem = physmap[i + 1] / 1024; break; } } if (basemem == 0) panic("BIOS smap did not include a basemem segment!"); /* * Make hole for "AP -> long mode" bootstrap code. The * mp_bootaddress vector is only available when the kernel * is configured to support APs and APs for the system start * in 32bit mode (e.g. SMP bare metal). */ if (init_ops.mp_bootaddress) physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024); /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * The boot memory test is disabled by default, as it takes a * significant amount of time on large-memory systems, and is * unfriendly to virtual machines as it unnecessarily touches all * pages. * * A general name is used as the code may be extended to support * additional tests beyond the current "page present" test. */ memtest = 0; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); /* * Don't allow MAXMEM or hw.physmem to extend the amount of memory * in the system. 
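 * Both the hw.physmem cap and the hw.memtest.tests knob fetched above are
 * loader tunables; purely as an illustration, /boot/loader.conf could contain
 *
 *	hw.physmem="4G"
 *	hw.memtest.tests="1"
 *
 * to cap usable RAM at 4GB and enable the boot-time memory test
 * (size suffixes such as G are accepted by the tunable parser).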
*/ if (Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(&first); /* * Size up each available chunk of physical memory. * * XXX Some BIOSes corrupt low 64KB between suspend and resume. * By default, mask off the first 16 pages unless we appear to be * running in a VM. */ physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT; TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start); if (physmem_start < PAGE_SIZE) physmap[0] = PAGE_SIZE; else if (physmem_start >= physmap[1]) physmap[0] = round_page(physmap[1] - PAGE_SIZE); else physmap[0] = round_page(physmem_start); pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ + page_counter = 0; + if (memtest != 0) + printf("Testing system memory"); for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= (vm_paddr_t)kernphys && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* + * Print a "." every GB to show we're making + * progress. + */ + page_counter++; + if ((page_counter % PAGES_PER_GB) == 0) + printf("."); + + /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. 
*/ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); + if (memtest != 0) + printf("\n"); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. */ msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]); } static caddr_t native_parse_preload_data(u_int64_t modulep) { caddr_t kmdp; #ifdef DDB vm_offset_t ksym_start; vm_offset_t ksym_end; #endif preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE); preload_bootstrap_relocate(KERNBASE); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE; #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); db_fetch_ksymtab(ksym_start, ksym_end); #endif return (kmdp); } u_int64_t hammer_time(u_int64_t modulep, u_int64_t physfree) { caddr_t kmdp; int gsel_tss, x; struct pcpu *pc; struct nmi_pcpu *np; struct xstate_hdr *xhdr; u_int64_t msr; char *env; size_t kstack0_sz; thread0.td_kstack = physfree + KERNBASE; thread0.td_kstack_pages = KSTACK_PAGES; kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; bzero((void *)thread0.td_kstack, kstack0_sz); physfree += kstack0_sz; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. 
*/ proc_linkup0(&proc0, &thread0); kmdp = init_ops.parse_preload_data(modulep); /* Init basic tunables, hz etc */ init_param1(); /* * make gdt memory segments */ for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1) ssdtosd(&gdt_segs[x], &gdt[x]); } gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0]; ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (long) gdt; lgdt(&r_gdt); pc = &__pcpu[0]; wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ pcpu_init(pc, 0, sizeof(struct pcpu)); dpcpu_init((void *)(physfree + KERNBASE), 0); physfree += DPCPU_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(tssp, &common_tss[0]); PCPU_SET(commontssp, &common_tss[0]); PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]); PCPU_SET(fs32p, &gdt[GUFS32_SEL]); PCPU_SET(gs32p, &gdt[GUGS32_SEL]); /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); #endif #ifdef XENHVM setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0); #endif r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; lidt(&r_idt); /* * Initialize the clock before the console so that console * initialization can use DELAY(). */ clock_init(); /* * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4) * transition). */ if (kmdp != NULL && preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP) != NULL) vty_set_preferred(VTY_VT); /* * Initialize the console before we print anything out. */ cninit(); #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. 
*/ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); #endif #else #error "have you forgotten the isa device?"; #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif identify_cpu(); /* Final stage of CPU initialization */ initializecpu(); /* Initialize CPU registers */ initializecpucache(); /* doublefault stack space, runs on ist1 */ common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)]; /* * NMI stack, runs on ist2. The pcpu pointer is stored just * above the start of the ist2 stack. */ np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1; np->np_pcpu = (register_t) pc; common_tss[0].tss_ist2 = (long) np; /* Set the IO permission bitmap (empty due to tss seg limit) */ common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); /* Set up the fast syscall stuff */ msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); getmemsize(kmdp, physfree); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ msgbufinit(msgbufp, msgbufsize); fpuinit(); /* * Set up thread0 pcb after fpuinit calculated pcb + fpu save * area size. Zero out the extended state header in fpu save * area. */ thread0.td_pcb = get_pcb_td(&thread0); bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); if (use_xsave) { xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 1); xhdr->xstate_bv = xsave_mask; } /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb; /* Ensure the stack is aligned to 16 bytes */ common_tss[0].tss_rsp0 &= ~0xFul; PCPU_SET(rsp0, common_tss[0].tss_rsp0); PCPU_SET(curpcb, thread0.td_pcb); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); _ufssel = GSEL(GUFS32_SEL, SEL_UPL); _ugssel = GSEL(GUGS32_SEL, SEL_UPL); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; thread0.td_pcb->pcb_cr3 = KPML4phys; /* PCID 0 is reserved for kernel */ thread0.td_frame = &proc0_tf; env = kern_getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); cpu_probe_amdc1e(); #ifdef FDT x86_init_fdt(); #endif /* Location of kernel stack for locore */ return ((u_int64_t)thread0.td_pcb); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. 
*/ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) return (0); smapattr = (uint32_t *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP_XATTR); count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); error = 0; for (i = 0; i < count; i++) { smap.base = smapbase[i].base; smap.length = smapbase[i].length; smap.type = smapbase[i].type; if (smapattr != NULL) smap.xattr = smapattr[i]; else smap.xattr = 0; error = SYSCTL_OUT(req, &smap, sizeof(smap)); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); static int efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct efi_map_header *efihdr; caddr_t kmdp; uint32_t efisize; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr == NULL) return (0); efisize = *((uint32_t *)efihdr - 1); return (SYSCTL_OUT(req, efihdr, efisize)); } SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map"); void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; critical_exit(); flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(flags); } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. 
*/ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_r12 = tf->tf_r12; pcb->pcb_r13 = tf->tf_r13; pcb->pcb_r14 = tf->tf_r14; pcb->pcb_r15 = tf->tf_r15; pcb->pcb_rbp = tf->tf_rbp; pcb->pcb_rbx = tf->tf_rbx; pcb->pcb_rip = tf->tf_rip; pcb->pcb_rsp = tf->tf_rsp; } int ptrace_set_pc(struct thread *td, unsigned long addr) { td->td_frame->tf_rip = addr; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_rflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_rflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; tp = td->td_frame; return (fill_frame_regs(tp, regs)); } int fill_frame_regs(struct trapframe *tp, struct reg *regs) { regs->r_r15 = tp->tf_r15; regs->r_r14 = tp->tf_r14; regs->r_r13 = tp->tf_r13; regs->r_r12 = tp->tf_r12; regs->r_r11 = tp->tf_r11; regs->r_r10 = tp->tf_r10; regs->r_r9 = tp->tf_r9; regs->r_r8 = tp->tf_r8; regs->r_rdi = tp->tf_rdi; regs->r_rsi = tp->tf_rsi; regs->r_rbp = tp->tf_rbp; regs->r_rbx = tp->tf_rbx; regs->r_rdx = tp->tf_rdx; regs->r_rcx = tp->tf_rcx; regs->r_rax = tp->tf_rax; regs->r_rip = tp->tf_rip; regs->r_cs = tp->tf_cs; regs->r_rflags = tp->tf_rflags; regs->r_rsp = tp->tf_rsp; regs->r_ss = tp->tf_ss; if (tp->tf_flags & TF_HASSEGS) { regs->r_ds = tp->tf_ds; regs->r_es = tp->tf_es; regs->r_fs = tp->tf_fs; regs->r_gs = tp->tf_gs; } else { regs->r_ds = 0; regs->r_es = 0; regs->r_fs = 0; regs->r_gs = 0; } return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; register_t rflags; tp = td->td_frame; rflags = regs->r_rflags & 0xffffffff; if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_r15 = regs->r_r15; tp->tf_r14 = regs->r_r14; tp->tf_r13 = regs->r_r13; tp->tf_r12 = regs->r_r12; tp->tf_r11 = regs->r_r11; tp->tf_r10 = regs->r_r10; tp->tf_r9 = regs->r_r9; tp->tf_r8 = regs->r_r8; tp->tf_rdi = regs->r_rdi; tp->tf_rsi = regs->r_rsi; tp->tf_rbp = regs->r_rbp; tp->tf_rbx = regs->r_rbx; tp->tf_rdx = regs->r_rdx; tp->tf_rcx = regs->r_rcx; tp->tf_rax = regs->r_rax; tp->tf_rip = regs->r_rip; tp->tf_cs = regs->r_cs; tp->tf_rflags = rflags; tp->tf_rsp = regs->r_rsp; tp->tf_ss = regs->r_ss; if (0) { /* XXXKIB */ tp->tf_ds = regs->r_ds; tp->tf_es = regs->r_es; tp->tf_fs = regs->r_fs; tp->tf_gs = regs->r_gs; tp->tf_flags = TF_HASSEGS; } set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } /* XXX check all this stuff! 
*/ /* externalize from sv_xmm */ static void fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) { struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* pcb -> fpregs */ bzero(fpregs, sizeof(*fpregs)); /* FPU control/status */ penv_fpreg->en_cw = penv_xmm->en_cw; penv_fpreg->en_sw = penv_xmm->en_sw; penv_fpreg->en_tw = penv_xmm->en_tw; penv_fpreg->en_opcode = penv_xmm->en_opcode; penv_fpreg->en_rip = penv_xmm->en_rip; penv_fpreg->en_rdp = penv_xmm->en_rdp; penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); } /* internalize from fpregs into sv_xmm */ static void set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) { struct envxmm *penv_xmm = &sv_xmm->sv_env; struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; int i; /* fpregs -> pcb */ /* FPU control/status */ penv_xmm->en_cw = penv_fpreg->en_cw; penv_xmm->en_sw = penv_fpreg->en_sw; penv_xmm->en_tw = penv_fpreg->en_tw; penv_xmm->en_opcode = penv_fpreg->en_opcode; penv_xmm->en_rip = penv_fpreg->en_rip; penv_xmm->en_rdp = penv_fpreg->en_rdp; penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); } /* externalize from td->pcb */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { KASSERT(td == curthread || TD_IS_SUSPENDED(td) || P_SHOULDSTOP(td->td_proc), ("not suspended thread %p", td)); fpugetregs(td); fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs); return (0); } /* internalize to td->pcb */ int set_fpregs(struct thread *td, struct fpreg *fpregs) { set_fpregs_xmm(fpregs, get_pcb_user_save_td(td)); fpuuserinited(td); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct pcb *pcb; struct trapframe *tp; pcb = td->td_pcb; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_rsp); PROC_UNLOCK(curthread->td_proc); mcp->mc_r15 = tp->tf_r15; mcp->mc_r14 = tp->tf_r14; mcp->mc_r13 = tp->tf_r13; mcp->mc_r12 = tp->tf_r12; mcp->mc_r11 = tp->tf_r11; mcp->mc_r10 = tp->tf_r10; mcp->mc_r9 = tp->tf_r9; mcp->mc_r8 = tp->tf_r8; mcp->mc_rdi = tp->tf_rdi; mcp->mc_rsi = tp->tf_rsi; mcp->mc_rbp = tp->tf_rbp; mcp->mc_rbx = tp->tf_rbx; mcp->mc_rcx = tp->tf_rcx; mcp->mc_rflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_rax = 0; mcp->mc_rdx = 0; mcp->mc_rflags &= ~PSL_C; } else { mcp->mc_rax = tp->tf_rax; mcp->mc_rdx = tp->tf_rdx; } mcp->mc_rip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; mcp->mc_rsp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_ds = tp->tf_ds; mcp->mc_es = tp->tf_es; mcp->mc_fs = tp->tf_fs; mcp->mc_gs = tp->tf_gs; mcp->mc_flags = tp->tf_flags; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp, NULL, 0); mcp->mc_fsbase = pcb->pcb_fsbase; mcp->mc_gsbase = pcb->pcb_gsbase; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. 
*/ int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct pcb *pcb; struct trapframe *tp; char *xfpustate; long rflags; int ret; pcb = td->td_pcb; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp) || (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) return (EINVAL); rflags = (mcp->mc_rflags & PSL_USERCHANGE) | (tp->tf_rflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) return (EINVAL); xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); ret = copyin((void *)mcp->mc_xfpustate, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); } else xfpustate = NULL; ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); tp->tf_r15 = mcp->mc_r15; tp->tf_r14 = mcp->mc_r14; tp->tf_r13 = mcp->mc_r13; tp->tf_r12 = mcp->mc_r12; tp->tf_r11 = mcp->mc_r11; tp->tf_r10 = mcp->mc_r10; tp->tf_r9 = mcp->mc_r9; tp->tf_r8 = mcp->mc_r8; tp->tf_rdi = mcp->mc_rdi; tp->tf_rsi = mcp->mc_rsi; tp->tf_rbp = mcp->mc_rbp; tp->tf_rbx = mcp->mc_rbx; tp->tf_rdx = mcp->mc_rdx; tp->tf_rcx = mcp->mc_rcx; tp->tf_rax = mcp->mc_rax; tp->tf_rip = mcp->mc_rip; tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_rsp; tp->tf_ss = mcp->mc_ss; tp->tf_flags = mcp->mc_flags; if (tp->tf_flags & TF_HASSEGS) { tp->tf_ds = mcp->mc_ds; tp->tf_es = mcp->mc_es; tp->tf_fs = mcp->mc_fs; tp->tf_gs = mcp->mc_gs; } if (mcp->mc_flags & _MC_HASBASES) { pcb->pcb_fsbase = mcp->mc_fsbase; pcb->pcb_gsbase = mcp->mc_gsbase; } set_pcb_flags(pcb, PCB_FULL_IRET); return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len) { size_t max_len, len; mcp->mc_ownedfp = fpugetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = fpuformat(); if (!use_xsave || xfpusave_len == 0) return; max_len = cpu_max_ext_state_size - sizeof(struct savefpu); len = xfpusave_len; if (len > max_len) { len = max_len; bzero(xfpusave + max_len, len - max_len); } mcp->mc_flags |= _MC_HASFPXSTATE; mcp->mc_xfpustate_len = len; bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); } static int set_fpcontext(struct thread *td, const mcontext_t *mcp, char *xfpustate, size_t xfpustate_len) { struct savefpu *fpstate; int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { fpstate = (struct savefpu *)&mcp->mc_fpstate; fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask; error = fpusetregs(td, fpstate, xfpustate, xfpustate_len); } else return (EINVAL); return (error); } void fpstate_drop(struct thread *td) { KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); critical_enter(); if (PCPU_GET(fpcurthread) == td) fpudrop(); /* * XXX force a full drop of the fpu. The above only drops it if we * owned it. * * XXX I don't much like fpugetuserregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of fpugetuserregs()... perhaps we just * have too many layers. 
*/ clear_pcb_flags(curthread->td_pcb, PCB_FPUINITDONE | PCB_USERFPUINITDONE); critical_exit(); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[8] = 0; dbregs->dr[9] = 0; dbregs->dr[10] = 0; dbregs->dr[11] = 0; dbregs->dr[12] = 0; dbregs->dr[13] = 0; dbregs->dr[14] = 0; dbregs->dr[15] = 0; return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP or a general protection fault right here. * Upper bits of dr6 and dr7 must not be set */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (td->td_frame->tf_cs == _ucode32sel && DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) return (EINVAL); } if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || (dbregs->dr[7] & 0xffffffff00000000ul) != 0) return (EINVAL); pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; set_pcb_flags(pcb, PCB_DBREGS); } return (0); } void reset_dbregs(void) { load_dr7(0); /* Turn off the control bits first */ load_dr0(0); load_dr1(0); load_dr2(0); load_dr3(0); load_dr6(0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. 
*/ int user_dbreg_trap(void) { u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int64_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ Index: projects/building-blocks/sys/arm/ti/am335x/am335x_lcd.c =================================================================== --- projects/building-blocks/sys/arm/ti/am335x/am335x_lcd.c (revision 277723) +++ projects/building-blocks/sys/arm/ti/am335x/am335x_lcd.c (revision 277724) @@ -1,714 +1,759 @@ /*- * Copyright 2013 Oleksandr Tymoshenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); +#include "opt_syscons.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include - -/* syscons bits */ #include #include #include #include #include #include #include #include +#ifdef DEV_SC #include +#else /* VT */ +#include +#endif #include #include #include "am335x_lcd.h" #include "am335x_pwm.h" +#include "fb_if.h" + #define LCD_PID 0x00 #define LCD_CTRL 0x04 #define CTRL_DIV_MASK 0xff #define CTRL_DIV_SHIFT 8 #define CTRL_AUTO_UFLOW_RESTART (1 << 1) #define CTRL_RASTER_MODE 1 #define CTRL_LIDD_MODE 0 #define LCD_LIDD_CTRL 0x0C #define LCD_LIDD_CS0_CONF 0x10 #define LCD_LIDD_CS0_ADDR 0x14 #define LCD_LIDD_CS0_DATA 0x18 #define LCD_LIDD_CS1_CONF 0x1C #define LCD_LIDD_CS1_ADDR 0x20 #define LCD_LIDD_CS1_DATA 0x24 #define LCD_RASTER_CTRL 0x28 #define RASTER_CTRL_TFT24_UNPACKED (1 << 26) #define RASTER_CTRL_TFT24 (1 << 25) #define RASTER_CTRL_STN565 (1 << 24) #define RASTER_CTRL_TFTPMAP (1 << 23) #define RASTER_CTRL_NIBMODE (1 << 22) #define RASTER_CTRL_PALMODE_SHIFT 20 #define PALETTE_PALETTE_AND_DATA 0x00 #define PALETTE_PALETTE_ONLY 0x01 #define PALETTE_DATA_ONLY 0x02 #define RASTER_CTRL_REQDLY_SHIFT 12 #define RASTER_CTRL_MONO8B (1 << 9) #define RASTER_CTRL_RBORDER (1 << 8) #define RASTER_CTRL_LCDTFT (1 << 7) #define RASTER_CTRL_LCDBW (1 << 1) #define RASTER_CTRL_LCDEN (1 << 0) #define LCD_RASTER_TIMING_0 0x2C #define RASTER_TIMING_0_HBP_SHIFT 24 #define RASTER_TIMING_0_HFP_SHIFT 16 #define RASTER_TIMING_0_HSW_SHIFT 10 #define RASTER_TIMING_0_PPLLSB_SHIFT 4 #define RASTER_TIMING_0_PPLMSB_SHIFT 3 #define LCD_RASTER_TIMING_1 0x30 #define RASTER_TIMING_1_VBP_SHIFT 24 #define RASTER_TIMING_1_VFP_SHIFT 16 #define RASTER_TIMING_1_VSW_SHIFT 10 #define RASTER_TIMING_1_LPP_SHIFT 0 #define LCD_RASTER_TIMING_2 0x34 #define RASTER_TIMING_2_HSWHI_SHIFT 27 #define RASTER_TIMING_2_LPP_B10_SHIFT 26 #define RASTER_TIMING_2_PHSVS (1 << 25) #define RASTER_TIMING_2_PHSVS_RISE (1 << 24) #define RASTER_TIMING_2_PHSVS_FALL (0 << 24) #define RASTER_TIMING_2_IOE (1 << 23) #define RASTER_TIMING_2_IPC (1 << 22) #define RASTER_TIMING_2_IHS (1 << 21) #define RASTER_TIMING_2_IVS (1 << 20) #define RASTER_TIMING_2_ACBI_SHIFT 16 #define RASTER_TIMING_2_ACB_SHIFT 8 #define RASTER_TIMING_2_HBPHI_SHIFT 4 #define RASTER_TIMING_2_HFPHI_SHIFT 0 #define LCD_RASTER_SUBPANEL 0x38 #define LCD_RASTER_SUBPANEL2 0x3C #define LCD_LCDDMA_CTRL 0x40 #define LCDDMA_CTRL_DMA_MASTER_PRIO_SHIFT 16 #define LCDDMA_CTRL_TH_FIFO_RDY_SHIFT 8 #define LCDDMA_CTRL_BURST_SIZE_SHIFT 4 #define LCDDMA_CTRL_BYTES_SWAP (1 << 3) #define LCDDMA_CTRL_BE (1 << 1) #define LCDDMA_CTRL_FB0_ONLY 0 #define LCDDMA_CTRL_FB0_FB1 (1 << 0) #define LCD_LCDDMA_FB0_BASE 0x44 #define LCD_LCDDMA_FB0_CEILING 0x48 #define LCD_LCDDMA_FB1_BASE 0x4C #define LCD_LCDDMA_FB1_CEILING 0x50 #define LCD_SYSCONFIG 0x54 #define SYSCONFIG_STANDBY_FORCE (0 << 4) #define SYSCONFIG_STANDBY_NONE (1 << 4) #define SYSCONFIG_STANDBY_SMART (2 << 4) #define SYSCONFIG_IDLE_FORCE (0 << 2) #define SYSCONFIG_IDLE_NONE (1 << 2) #define SYSCONFIG_IDLE_SMART (2 << 2) #define LCD_IRQSTATUS_RAW 0x58 #define LCD_IRQSTATUS 0x5C #define LCD_IRQENABLE_SET 0x60 #define LCD_IRQENABLE_CLEAR 0x64 #define IRQ_EOF1 (1 << 9) #define IRQ_EOF0 (1 << 8) #define IRQ_PL (1 << 6) #define IRQ_FUF (1 << 5) #define IRQ_ACB (1 << 3) #define IRQ_SYNC_LOST (1 << 2) #define IRQ_RASTER_DONE (1 << 1) #define IRQ_FRAME_DONE (1 << 0) #define LCD_END_OF_INT_IND 0x68 #define LCD_CLKC_ENABLE 0x6C #define CLKC_ENABLE_DMA (1 << 2) 
#define CLKC_ENABLE_LDID (1 << 1) #define CLKC_ENABLE_CORE (1 << 0) #define LCD_CLKC_RESET 0x70 #define CLKC_RESET_MAIN (1 << 3) #define CLKC_RESET_DMA (1 << 2) #define CLKC_RESET_LDID (1 << 1) #define CLKC_RESET_CORE (1 << 0) #define LCD_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx) #define LCD_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx) #define LCD_LOCK_INIT(_sc) mtx_init(&(_sc)->sc_mtx, \ device_get_nameunit(_sc->sc_dev), "am335x_lcd", MTX_DEF) #define LCD_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->sc_mtx); #define LCD_READ4(_sc, reg) bus_read_4((_sc)->sc_mem_res, reg); #define LCD_WRITE4(_sc, reg, value) \ bus_write_4((_sc)->sc_mem_res, reg, value); /* Backlight is controlled by eCAS interface on PWM unit 0 */ #define PWM_UNIT 0 #define PWM_PERIOD 100 struct am335x_lcd_softc { device_t sc_dev; + struct fb_info sc_fb_info; struct resource *sc_mem_res; struct resource *sc_irq_res; void *sc_intr_hl; struct mtx sc_mtx; int sc_backlight; struct sysctl_oid *sc_oid; /* Framebuffer */ bus_dma_tag_t sc_dma_tag; bus_dmamap_t sc_dma_map; size_t sc_fb_size; bus_addr_t sc_fb_phys; uint8_t *sc_fb_base; }; static void am335x_fb_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err) { bus_addr_t *addr; if (err) return; addr = (bus_addr_t*)arg; *addr = segs[0].ds_addr; } static uint32_t am335x_lcd_calc_divisor(uint32_t reference, uint32_t freq) { uint32_t div; /* Raster mode case: divisors are in range from 2 to 255 */ for (div = 2; div < 255; div++) if (reference/div <= freq) return (div); return (255); } static int am335x_lcd_sysctl_backlight(SYSCTL_HANDLER_ARGS) { struct am335x_lcd_softc *sc = (struct am335x_lcd_softc*)arg1; int error; int backlight; backlight = sc->sc_backlight; error = sysctl_handle_int(oidp, &backlight, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (backlight < 0) backlight = 0; if (backlight > 100) backlight = 100; LCD_LOCK(sc); error = am335x_pwm_config_ecas(PWM_UNIT, PWM_PERIOD, backlight*PWM_PERIOD/100); if (error == 0) sc->sc_backlight = backlight; LCD_UNLOCK(sc); return (error); } static int am335x_read_panel_property(device_t dev, const char *name, uint32_t *val) { phandle_t node; pcell_t cell; node = ofw_bus_get_node(dev); if ((OF_getprop(node, name, &cell, sizeof(cell))) <= 0) { device_printf(dev, "missing '%s' attribute in LCD panel info\n", name); return (ENXIO); } *val = fdt32_to_cpu(cell); return (0); } static int am335x_read_panel_info(device_t dev, struct panel_info *panel) { int error; error = 0; if ((error = am335x_read_panel_property(dev, "panel_width", &panel->panel_width))) goto out; if ((error = am335x_read_panel_property(dev, "panel_height", &panel->panel_height))) goto out; if ((error = am335x_read_panel_property(dev, "panel_hfp", &panel->panel_hfp))) goto out; if ((error = am335x_read_panel_property(dev, "panel_hbp", &panel->panel_hbp))) goto out; if ((error = am335x_read_panel_property(dev, "panel_hsw", &panel->panel_hsw))) goto out; if ((error = am335x_read_panel_property(dev, "panel_vfp", &panel->panel_vfp))) goto out; if ((error = am335x_read_panel_property(dev, "panel_vbp", &panel->panel_vbp))) goto out; if ((error = am335x_read_panel_property(dev, "panel_vsw", &panel->panel_vsw))) goto out; if ((error = am335x_read_panel_property(dev, "panel_pxl_clk", &panel->panel_pxl_clk))) goto out; if ((error = am335x_read_panel_property(dev, "panel_invert_pxl_clk", &panel->panel_invert_pxl_clk))) goto out; if ((error = am335x_read_panel_property(dev, "ac_bias", &panel->ac_bias))) goto out; if ((error = am335x_read_panel_property(dev, "ac_bias_intrpt", 
&panel->ac_bias_intrpt))) goto out; if ((error = am335x_read_panel_property(dev, "dma_burst_sz", &panel->dma_burst_sz))) goto out; if ((error = am335x_read_panel_property(dev, "bpp", &panel->bpp))) goto out; if ((error = am335x_read_panel_property(dev, "fdd", &panel->fdd))) goto out; if ((error = am335x_read_panel_property(dev, "invert_line_clock", &panel->invert_line_clock))) goto out; if ((error = am335x_read_panel_property(dev, "invert_frm_clock", &panel->invert_frm_clock))) goto out; if ((error = am335x_read_panel_property(dev, "sync_edge", &panel->sync_edge))) goto out; error = am335x_read_panel_property(dev, "sync_ctrl", &panel->sync_ctrl); out: return (error); } static void am335x_lcd_intr(void *arg) { struct am335x_lcd_softc *sc = arg; uint32_t reg; reg = LCD_READ4(sc, LCD_IRQSTATUS); LCD_WRITE4(sc, LCD_IRQSTATUS, reg); /* Read value back to make sure it reached the hardware */ reg = LCD_READ4(sc, LCD_IRQSTATUS); if (reg & IRQ_SYNC_LOST) { reg = LCD_READ4(sc, LCD_RASTER_CTRL); reg &= ~RASTER_CTRL_LCDEN; LCD_WRITE4(sc, LCD_RASTER_CTRL, reg); reg = LCD_READ4(sc, LCD_RASTER_CTRL); reg |= RASTER_CTRL_LCDEN; LCD_WRITE4(sc, LCD_RASTER_CTRL, reg); goto done; } if (reg & IRQ_PL) { reg = LCD_READ4(sc, LCD_RASTER_CTRL); reg &= ~RASTER_CTRL_LCDEN; LCD_WRITE4(sc, LCD_RASTER_CTRL, reg); reg = LCD_READ4(sc, LCD_RASTER_CTRL); reg |= RASTER_CTRL_LCDEN; LCD_WRITE4(sc, LCD_RASTER_CTRL, reg); goto done; } if (reg & IRQ_EOF0) { LCD_WRITE4(sc, LCD_LCDDMA_FB0_BASE, sc->sc_fb_phys); LCD_WRITE4(sc, LCD_LCDDMA_FB0_CEILING, sc->sc_fb_phys + sc->sc_fb_size - 1); reg &= ~IRQ_EOF0; } if (reg & IRQ_EOF1) { LCD_WRITE4(sc, LCD_LCDDMA_FB1_BASE, sc->sc_fb_phys); LCD_WRITE4(sc, LCD_LCDDMA_FB1_CEILING, sc->sc_fb_phys + sc->sc_fb_size - 1); reg &= ~IRQ_EOF1; } if (reg & IRQ_FUF) { /* TODO: Handle FUF */ } if (reg & IRQ_ACB) { /* TODO: Handle ACB */ } done: LCD_WRITE4(sc, LCD_END_OF_INT_IND, 0); /* Read value back to make sure it reached the hardware */ reg = LCD_READ4(sc, LCD_END_OF_INT_IND); } static int am335x_lcd_probe(device_t dev) { +#ifdef DEV_SC int err; +#endif if (!ofw_bus_status_okay(dev)) return (ENXIO); if (!ofw_bus_is_compatible(dev, "ti,am335x-lcd")) return (ENXIO); device_set_desc(dev, "AM335x LCD controller"); +#ifdef DEV_SC err = sc_probe_unit(device_get_unit(dev), device_get_flags(dev) | SC_AUTODETECT_KBD); if (err != 0) return (err); +#endif return (BUS_PROBE_DEFAULT); } static int am335x_lcd_attach(device_t dev) { struct am335x_lcd_softc *sc; int rid; int div; struct panel_info panel; uint32_t reg, timing0, timing1, timing2; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; uint32_t burst_log; int err; size_t dma_size; uint32_t hbp, hfp, hsw; uint32_t vbp, vfp, vsw; uint32_t width, height; sc = device_get_softc(dev); sc->sc_dev = dev; if (am335x_read_panel_info(dev, &panel)) return (ENXIO); int ref_freq = 0; ti_prcm_clk_enable(LCDC_CLK); if (ti_prcm_clk_get_source_freq(LCDC_CLK, &ref_freq)) { device_printf(dev, "Can't get reference frequency\n"); return (ENXIO); } rid = 0; sc->sc_mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (!sc->sc_mem_res) { device_printf(dev, "cannot allocate memory window\n"); return (ENXIO); } rid = 0; sc->sc_irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (!sc->sc_irq_res) { bus_release_resource(dev, SYS_RES_MEMORY, 0, sc->sc_mem_res); device_printf(dev, "cannot allocate interrupt\n"); return (ENXIO); } if (bus_setup_intr(dev, sc->sc_irq_res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, am335x_lcd_intr, sc, &sc->sc_intr_hl) != 
0) { bus_release_resource(dev, SYS_RES_IRQ, rid, sc->sc_irq_res); bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->sc_mem_res); device_printf(dev, "Unable to setup the irq handler.\n"); return (ENXIO); } LCD_LOCK_INIT(sc); /* Panle initialization */ dma_size = round_page(panel.panel_width*panel.panel_height*panel.bpp/8); /* * Now allocate framebuffer memory */ err = bus_dma_tag_create( bus_get_dma_tag(dev), 4, 0, /* alignment, boundary */ BUS_SPACE_MAXADDR_32BIT, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ dma_size, 1, /* maxsize, nsegments */ dma_size, 0, /* maxsegsize, flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->sc_dma_tag); if (err) goto fail; err = bus_dmamem_alloc(sc->sc_dma_tag, (void **)&sc->sc_fb_base, BUS_DMA_COHERENT, &sc->sc_dma_map); if (err) { device_printf(dev, "cannot allocate framebuffer\n"); goto fail; } err = bus_dmamap_load(sc->sc_dma_tag, sc->sc_dma_map, sc->sc_fb_base, dma_size, am335x_fb_dmamap_cb, &sc->sc_fb_phys, BUS_DMA_NOWAIT); if (err) { device_printf(dev, "cannot load DMA map\n"); goto fail; } /* Make sure it's blank */ memset(sc->sc_fb_base, 0x00, dma_size); /* Calculate actual FB Size */ sc->sc_fb_size = panel.panel_width*panel.panel_height*panel.bpp/8; /* Only raster mode is supported */ reg = CTRL_RASTER_MODE; div = am335x_lcd_calc_divisor(ref_freq, panel.panel_pxl_clk); reg |= (div << CTRL_DIV_SHIFT); LCD_WRITE4(sc, LCD_CTRL, reg); /* Set timing */ timing0 = timing1 = timing2 = 0; hbp = panel.panel_hbp - 1; hfp = panel.panel_hfp - 1; hsw = panel.panel_hsw - 1; vbp = panel.panel_vbp; vfp = panel.panel_vfp; vsw = panel.panel_vsw - 1; height = panel.panel_height - 1; width = panel.panel_width - 1; /* Horizontal back porch */ timing0 |= (hbp & 0xff) << RASTER_TIMING_0_HBP_SHIFT; timing2 |= ((hbp >> 8) & 3) << RASTER_TIMING_2_HBPHI_SHIFT; /* Horizontal front porch */ timing0 |= (hfp & 0xff) << RASTER_TIMING_0_HFP_SHIFT; timing2 |= ((hfp >> 8) & 3) << RASTER_TIMING_2_HFPHI_SHIFT; /* Horizontal sync width */ timing0 |= (hsw & 0x3f) << RASTER_TIMING_0_HSW_SHIFT; timing2 |= ((hsw >> 6) & 0xf) << RASTER_TIMING_2_HSWHI_SHIFT; /* Vertical back porch, front porch, sync width */ timing1 |= (vbp & 0xff) << RASTER_TIMING_1_VBP_SHIFT; timing1 |= (vfp & 0xff) << RASTER_TIMING_1_VFP_SHIFT; timing1 |= (vsw & 0x3f) << RASTER_TIMING_1_VSW_SHIFT; /* Pixels per line */ timing0 |= ((width >> 10) & 1) << RASTER_TIMING_0_PPLMSB_SHIFT; timing0 |= ((width >> 4) & 0x3f) << RASTER_TIMING_0_PPLLSB_SHIFT; /* Lines per panel */ timing1 |= (height & 0x3ff) << RASTER_TIMING_1_LPP_SHIFT; timing2 |= ((height >> 10 ) & 1) << RASTER_TIMING_2_LPP_B10_SHIFT; /* clock signal settings */ if (panel.sync_ctrl) timing2 |= RASTER_TIMING_2_PHSVS; if (panel.sync_edge) timing2 |= RASTER_TIMING_2_PHSVS_RISE; else timing2 |= RASTER_TIMING_2_PHSVS_FALL; if (panel.invert_line_clock) timing2 |= RASTER_TIMING_2_IHS; if (panel.invert_frm_clock) timing2 |= RASTER_TIMING_2_IVS; if (panel.panel_invert_pxl_clk) timing2 |= RASTER_TIMING_2_IPC; /* AC bias */ timing2 |= (panel.ac_bias << RASTER_TIMING_2_ACB_SHIFT); timing2 |= (panel.ac_bias_intrpt << RASTER_TIMING_2_ACBI_SHIFT); LCD_WRITE4(sc, LCD_RASTER_TIMING_0, timing0); LCD_WRITE4(sc, LCD_RASTER_TIMING_1, timing1); LCD_WRITE4(sc, LCD_RASTER_TIMING_2, timing2); /* DMA settings */ reg = LCDDMA_CTRL_FB0_FB1; /* Find power of 2 for current burst size */ switch (panel.dma_burst_sz) { case 1: burst_log = 0; break; case 2: burst_log = 1; break; case 4: burst_log = 2; break; case 8: burst_log = 3; break; case 16: default: 
burst_log = 4; break; } reg |= (burst_log << LCDDMA_CTRL_BURST_SIZE_SHIFT); /* XXX: FIFO TH */ reg |= (0 << LCDDMA_CTRL_TH_FIFO_RDY_SHIFT); LCD_WRITE4(sc, LCD_LCDDMA_CTRL, reg); LCD_WRITE4(sc, LCD_LCDDMA_FB0_BASE, sc->sc_fb_phys); LCD_WRITE4(sc, LCD_LCDDMA_FB0_CEILING, sc->sc_fb_phys + sc->sc_fb_size - 1); LCD_WRITE4(sc, LCD_LCDDMA_FB1_BASE, sc->sc_fb_phys); LCD_WRITE4(sc, LCD_LCDDMA_FB1_CEILING, sc->sc_fb_phys + sc->sc_fb_size - 1); /* Enable LCD */ reg = RASTER_CTRL_LCDTFT; reg |= (panel.fdd << RASTER_CTRL_REQDLY_SHIFT); reg |= (PALETTE_DATA_ONLY << RASTER_CTRL_PALMODE_SHIFT); if (panel.bpp >= 24) reg |= RASTER_CTRL_TFT24; if (panel.bpp == 32) reg |= RASTER_CTRL_TFT24_UNPACKED; LCD_WRITE4(sc, LCD_RASTER_CTRL, reg); LCD_WRITE4(sc, LCD_CLKC_ENABLE, CLKC_ENABLE_DMA | CLKC_ENABLE_LDID | CLKC_ENABLE_CORE); LCD_WRITE4(sc, LCD_CLKC_RESET, CLKC_RESET_MAIN); DELAY(100); LCD_WRITE4(sc, LCD_CLKC_RESET, 0); reg = IRQ_EOF1 | IRQ_EOF0 | IRQ_FUF | IRQ_PL | IRQ_ACB | IRQ_SYNC_LOST | IRQ_RASTER_DONE | IRQ_FRAME_DONE; LCD_WRITE4(sc, LCD_IRQENABLE_SET, reg); reg = LCD_READ4(sc, LCD_RASTER_CTRL); reg |= RASTER_CTRL_LCDEN; LCD_WRITE4(sc, LCD_RASTER_CTRL, reg); LCD_WRITE4(sc, LCD_SYSCONFIG, SYSCONFIG_STANDBY_SMART | SYSCONFIG_IDLE_SMART); /* Init backlight interface */ ctx = device_get_sysctl_ctx(sc->sc_dev); tree = device_get_sysctl_tree(sc->sc_dev); sc->sc_oid = SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "backlight", CTLTYPE_INT | CTLFLAG_RW, sc, 0, am335x_lcd_sysctl_backlight, "I", "LCD backlight"); sc->sc_backlight = 0; /* Check if eCAS interface is available at this point */ if (am335x_pwm_config_ecas(PWM_UNIT, PWM_PERIOD, PWM_PERIOD) == 0) sc->sc_backlight = 100; + sc->sc_fb_info.fb_name = device_get_nameunit(sc->sc_dev); + sc->sc_fb_info.fb_vbase = (intptr_t)sc->sc_fb_base; + sc->sc_fb_info.fb_pbase = sc->sc_fb_phys; + sc->sc_fb_info.fb_size = sc->sc_fb_size; + sc->sc_fb_info.fb_bpp = sc->sc_fb_info.fb_depth = panel.bpp; + sc->sc_fb_info.fb_stride = panel.panel_width*panel.bpp / 8; + sc->sc_fb_info.fb_width = panel.panel_width; + sc->sc_fb_info.fb_height = panel.panel_height; + +#ifdef DEV_SC err = (sc_attach_unit(device_get_unit(dev), device_get_flags(dev) | SC_AUTODETECT_KBD)); if (err) { device_printf(dev, "failed to attach syscons\n"); goto fail; } am335x_lcd_syscons_setup((vm_offset_t)sc->sc_fb_base, sc->sc_fb_phys, &panel); +#else /* VT */ + device_t fbd = device_add_child(dev, "fbd", + device_get_unit(dev)); + if (fbd == NULL) { + device_printf(dev, "Failed to add fbd child\n"); + goto fail; + } + if (device_probe_and_attach(fbd) != 0) { + device_printf(dev, "Failed to attach fbd device\n"); + goto fail; + } +#endif return (0); fail: return (err); } static int am335x_lcd_detach(device_t dev) { /* Do not let unload driver */ return (EBUSY); } +static struct fb_info * +am335x_lcd_fb_getinfo(device_t dev) +{ + struct am335x_lcd_softc *sc; + + sc = device_get_softc(dev); + + return (&sc->sc_fb_info); +} + static device_method_t am335x_lcd_methods[] = { DEVMETHOD(device_probe, am335x_lcd_probe), DEVMETHOD(device_attach, am335x_lcd_attach), DEVMETHOD(device_detach, am335x_lcd_detach), + /* Framebuffer service methods */ + DEVMETHOD(fb_getinfo, am335x_lcd_fb_getinfo), + DEVMETHOD_END }; static driver_t am335x_lcd_driver = { - "am335x_lcd", + "fb", am335x_lcd_methods, sizeof(struct am335x_lcd_softc), }; static devclass_t am335x_lcd_devclass; DRIVER_MODULE(am335x_lcd, simplebus, am335x_lcd_driver, am335x_lcd_devclass, 0, 0); MODULE_VERSION(am335x_lcd, 1); MODULE_DEPEND(am335x_lcd, simplebus, 
1, 1, 1); Index: projects/building-blocks/sys/arm/ti/am335x/files.am335x =================================================================== --- projects/building-blocks/sys/arm/ti/am335x/files.am335x (revision 277723) +++ projects/building-blocks/sys/arm/ti/am335x/files.am335x (revision 277724) @@ -1,17 +1,17 @@ #$FreeBSD$ arm/ti/aintc.c standard arm/ti/am335x/am335x_dmtimer.c standard arm/ti/am335x/am335x_gpio.c optional gpio -arm/ti/am335x/am335x_lcd.c optional sc +arm/ti/am335x/am335x_lcd.c optional sc | vt arm/ti/am335x/am335x_lcd_syscons.c optional sc arm/ti/am335x/am335x_pmic.c optional am335x_pmic arm/ti/am335x/am335x_prcm.c standard arm/ti/am335x/am335x_pwm.c standard arm/ti/am335x/am335x_rtc.c optional am335x_rtc arm/ti/am335x/am335x_scm_padconf.c standard arm/ti/am335x/am335x_usbss.c optional musb fdt arm/ti/ti_edma3.c standard arm/ti/cpsw/if_cpsw.c optional cpsw Index: projects/building-blocks/sys/arm/ti/ti_i2c.c =================================================================== --- projects/building-blocks/sys/arm/ti/ti_i2c.c (revision 277723) +++ projects/building-blocks/sys/arm/ti/ti_i2c.c (revision 277724) @@ -1,945 +1,979 @@ /*- * Copyright (c) 2011 Ben Gray . * Copyright (c) 2014 Luiz Otavio O Souza . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /** * Driver for the I2C module on the TI SoC. * * This driver is heavily based on the TWI driver for the AT91 (at91_twi.c). * * CAUTION: The I2Ci registers are limited to 16 bit and 8 bit data accesses, * 32 bit data access is not allowed and can corrupt register content. * * This driver currently doesn't use DMA for the transfer, although I hope to * incorporate that sometime in the future. The idea being that for transaction * larger than a certain size the DMA engine is used, for anything less the * normal interrupt/fifo driven option is used. * * * WARNING: This driver uses mtx_sleep and interrupts to perform transactions, * which means you can't do a transaction during startup before the interrupts * have been enabled. Hint - the freebsd function config_intrhook_establish(). 
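 *
 * As a minimal sketch of that hint (all names below are hypothetical and
 * assume the client's softc embeds a struct intr_config_hook), a consumer
 * of this bus would defer its first transfer until interrupts are live:
 *
 *	static void
 *	foo_delayed_start(void *arg)
 *	{
 *		struct foo_softc *sc = arg;
 *
 *		foo_first_i2c_transfer(sc);
 *		config_intrhook_disestablish(&sc->sc_init_hook);
 *	}
 *
 *	sc->sc_init_hook.ich_func = foo_delayed_start;
 *	sc->sc_init_hook.ich_arg = sc;
 *	if (config_intrhook_establish(&sc->sc_init_hook) != 0)
 *		return (ENOMEM);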
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iicbus_if.h" /** * I2C device driver context, a pointer to this is stored in the device * driver structure. */ struct ti_i2c_softc { device_t sc_dev; uint32_t device_id; struct resource* sc_irq_res; struct resource* sc_mem_res; device_t sc_iicbus; void* sc_irq_h; struct mtx sc_mtx; struct iic_msg* sc_buffer; int sc_bus_inuse; int sc_buffer_pos; int sc_error; int sc_fifo_trsh; + int sc_timeout; uint16_t sc_con_reg; uint16_t sc_rev; }; struct ti_i2c_clock_config { u_int frequency; /* Bus frequency in Hz */ uint8_t psc; /* Fast/Standard mode prescale divider */ uint8_t scll; /* Fast/Standard mode SCL low time */ uint8_t sclh; /* Fast/Standard mode SCL high time */ uint8_t hsscll; /* High Speed mode SCL low time */ uint8_t hssclh; /* High Speed mode SCL high time */ }; #if defined(SOC_OMAP4) /* * OMAP4 i2c bus clock is 96MHz / ((psc + 1) * (scll + 7 + sclh + 5)). * The prescaler values for 100KHz and 400KHz modes come from the table in the * OMAP4 TRM. The table doesn't list 1MHz; these values should give that speed. */ static struct ti_i2c_clock_config ti_omap4_i2c_clock_configs[] = { { 100000, 23, 13, 15, 0, 0}, { 400000, 9, 5, 7, 0, 0}, { 1000000, 3, 5, 7, 0, 0}, /* { 3200000, 1, 113, 115, 7, 10}, - HS mode */ { 0 /* Table terminator */ } }; #endif #if defined(SOC_TI_AM335X) /* * AM335x i2c bus clock is 48MHZ / ((psc + 1) * (scll + 7 + sclh + 5)) * In all cases we prescale the clock to 24MHz as recommended in the manual. */ static struct ti_i2c_clock_config ti_am335x_i2c_clock_configs[] = { { 100000, 1, 111, 117, 0, 0}, { 400000, 1, 23, 25, 0, 0}, { 1000000, 1, 5, 7, 0, 0}, { 0 /* Table terminator */ } }; #endif /** * Locking macros used throughout the driver */ #define TI_I2C_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx) #define TI_I2C_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx) #define TI_I2C_LOCK_INIT(_sc) \ mtx_init(&_sc->sc_mtx, device_get_nameunit(_sc->sc_dev), \ "ti_i2c", MTX_DEF) #define TI_I2C_LOCK_DESTROY(_sc) mtx_destroy(&_sc->sc_mtx) #define TI_I2C_ASSERT_LOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_OWNED) #define TI_I2C_ASSERT_UNLOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_NOTOWNED) #ifdef DEBUG #define ti_i2c_dbg(_sc, fmt, args...) \ device_printf((_sc)->sc_dev, fmt, ##args) #else #define ti_i2c_dbg(_sc, fmt, args...) #endif /** * ti_i2c_read_2 - reads a 16-bit value from one of the I2C registers * @sc: I2C device context * @off: the byte offset within the register bank to read from. * * * LOCKING: * No locking required * * RETURNS: * 16-bit value read from the register. */ static inline uint16_t ti_i2c_read_2(struct ti_i2c_softc *sc, bus_size_t off) { return (bus_read_2(sc->sc_mem_res, off)); } /** * ti_i2c_write_2 - writes a 16-bit value to one of the I2C registers * @sc: I2C device context * @off: the byte offset within the register bank to read from. * @val: the value to write into the register * * LOCKING: * No locking required * * RETURNS: * 16-bit value read from the register. */ static inline void ti_i2c_write_2(struct ti_i2c_softc *sc, bus_size_t off, uint16_t val) { bus_write_2(sc->sc_mem_res, off, val); } static int ti_i2c_transfer_intr(struct ti_i2c_softc* sc, uint16_t status) { int amount, done, i; done = 0; amount = 0; /* Check for the error conditions. */ if (status & I2C_STAT_NACK) { /* No ACK from slave. 
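 * Both this case and the arbitration-lost case just below are surfaced to
 * the iicbus caller as ENXIO.  Purely as a sketch, a client doing a
 * one-byte register read ("parent", "slave_addr", "reg" and "val" are
 * hypothetical names) would see that error from iicbus_transfer():
 *
 *	struct iic_msg msgs[2] = {
 *		{ slave_addr, IIC_M_WR | IIC_M_NOSTOP, 1, &reg },
 *		{ slave_addr, IIC_M_RD, 1, &val },
 *	};
 *	error = iicbus_transfer(parent, msgs, nitems(msgs));
 *	// error == ENXIO when the slave never ACKed
 *
 * where slave_addr is the left-shifted 7-bit address, matching the
 * "slave >> 1" written to I2C_REG_SA in ti_i2c_transfer().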
*/ ti_i2c_dbg(sc, "NACK\n"); ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_NACK); sc->sc_error = ENXIO; } else if (status & I2C_STAT_AL) { /* Arbitration lost. */ ti_i2c_dbg(sc, "Arbitration lost\n"); ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_AL); sc->sc_error = ENXIO; } /* Check if we have finished. */ if (status & I2C_STAT_ARDY) { /* Register access ready - transaction complete basically. */ ti_i2c_dbg(sc, "ARDY transaction complete\n"); if (sc->sc_error != 0 && sc->sc_buffer->flags & IIC_M_NOSTOP) { ti_i2c_write_2(sc, I2C_REG_CON, sc->sc_con_reg | I2C_CON_STP); } ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_ARDY | I2C_STAT_RDR | I2C_STAT_RRDY | I2C_STAT_XDR | I2C_STAT_XRDY); return (1); } if (sc->sc_buffer->flags & IIC_M_RD) { /* Read some data. */ if (status & I2C_STAT_RDR) { /* * Receive draining interrupt - last data received. * The set FIFO threshold wont be reached to trigger * RRDY. */ ti_i2c_dbg(sc, "Receive draining interrupt\n"); /* * Drain the FIFO. Read the pending data in the FIFO. */ amount = sc->sc_buffer->len - sc->sc_buffer_pos; } else if (status & I2C_STAT_RRDY) { /* * Receive data ready interrupt - FIFO has reached the * set threshold. */ ti_i2c_dbg(sc, "Receive data ready interrupt\n"); amount = min(sc->sc_fifo_trsh, sc->sc_buffer->len - sc->sc_buffer_pos); } /* Read the bytes from the fifo. */ for (i = 0; i < amount; i++) sc->sc_buffer->buf[sc->sc_buffer_pos++] = (uint8_t)(ti_i2c_read_2(sc, I2C_REG_DATA) & 0xff); if (status & I2C_STAT_RDR) ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_RDR); if (status & I2C_STAT_RRDY) ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_RRDY); } else { /* Write some data. */ if (status & I2C_STAT_XDR) { /* * Transmit draining interrupt - FIFO level is below * the set threshold and the amount of data still to * be transferred wont reach the set FIFO threshold. */ ti_i2c_dbg(sc, "Transmit draining interrupt\n"); /* * Drain the TX data. Write the pending data in the * FIFO. */ amount = sc->sc_buffer->len - sc->sc_buffer_pos; } else if (status & I2C_STAT_XRDY) { /* * Transmit data ready interrupt - the FIFO level * is below the set threshold. */ ti_i2c_dbg(sc, "Transmit data ready interrupt\n"); amount = min(sc->sc_fifo_trsh, sc->sc_buffer->len - sc->sc_buffer_pos); } /* Write the bytes from the fifo. */ for (i = 0; i < amount; i++) ti_i2c_write_2(sc, I2C_REG_DATA, sc->sc_buffer->buf[sc->sc_buffer_pos++]); if (status & I2C_STAT_XDR) ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_XDR); if (status & I2C_STAT_XRDY) ti_i2c_write_2(sc, I2C_REG_STATUS, I2C_STAT_XRDY); } return (done); } /** * ti_i2c_intr - interrupt handler for the I2C module * @dev: i2c device handle * * * * LOCKING: * Called from timer context * * RETURNS: * EH_HANDLED or EH_NOT_HANDLED */ static void ti_i2c_intr(void *arg) { int done; struct ti_i2c_softc *sc; uint16_t events, status; sc = (struct ti_i2c_softc *)arg; TI_I2C_LOCK(sc); status = ti_i2c_read_2(sc, I2C_REG_STATUS); if (status == 0) { TI_I2C_UNLOCK(sc); return; } /* Save enabled interrupts. */ events = ti_i2c_read_2(sc, I2C_REG_IRQENABLE_SET); /* We only care about enabled interrupts. */ status &= events; done = 0; if (sc->sc_buffer != NULL) done = ti_i2c_transfer_intr(sc, status); else { ti_i2c_dbg(sc, "Transfer interrupt without buffer\n"); sc->sc_error = EINVAL; done = 1; } if (done) /* Wakeup the process that started the transaction. 
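 * The waiter in ti_i2c_transfer() and this wakeup share the softc pointer
 * as the wait channel; as a sketch of the handshake (both sides hold
 * sc_mtx):
 *
 *	// transfer side:
 *	err = mtx_sleep(sc, &sc->sc_mtx, 0, "i2ciowait", sc->sc_timeout);
 *	// err == EWOULDBLOCK if no interrupt fired within sc_timeout ticks
 *
 *	// interrupt side, once the transfer is done or has failed:
 *	wakeup(sc);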
*/ wakeup(sc); TI_I2C_UNLOCK(sc); } /** * ti_i2c_transfer - called to perform the transfer * @dev: i2c device handle * @msgs: the messages to send/receive * @nmsgs: the number of messages in the msgs array * * * LOCKING: * Internally locked * * RETURNS: * 0 on function succeeded * EINVAL if invalid message is passed as an arg */ static int ti_i2c_transfer(device_t dev, struct iic_msg *msgs, uint32_t nmsgs) { int err, i, repstart, timeout; struct ti_i2c_softc *sc; uint16_t reg; sc = device_get_softc(dev); TI_I2C_LOCK(sc); /* If the controller is busy wait until it is available. */ while (sc->sc_bus_inuse == 1) mtx_sleep(sc, &sc->sc_mtx, 0, "i2cbuswait", 0); /* Now we have control over the I2C controller. */ sc->sc_bus_inuse = 1; err = 0; repstart = 0; for (i = 0; i < nmsgs; i++) { sc->sc_buffer = &msgs[i]; sc->sc_buffer_pos = 0; sc->sc_error = 0; /* Zero byte transfers aren't allowed. */ if (sc->sc_buffer == NULL || sc->sc_buffer->buf == NULL || sc->sc_buffer->len == 0) { err = EINVAL; break; } /* Check if the i2c bus is free. */ if (repstart == 0) { /* * On repeated start we send the START condition while * the bus _is_ busy. */ timeout = 0; while (ti_i2c_read_2(sc, I2C_REG_STATUS_RAW) & I2C_STAT_BB) { if (timeout++ > 100) { err = EBUSY; goto out; } DELAY(1000); } timeout = 0; } else repstart = 0; if (sc->sc_buffer->flags & IIC_M_NOSTOP) repstart = 1; /* Set the slave address. */ ti_i2c_write_2(sc, I2C_REG_SA, msgs[i].slave >> 1); /* Write the data length. */ ti_i2c_write_2(sc, I2C_REG_CNT, sc->sc_buffer->len); /* Clear the RX and the TX FIFO. */ reg = ti_i2c_read_2(sc, I2C_REG_BUF); reg |= I2C_BUF_RXFIFO_CLR | I2C_BUF_TXFIFO_CLR; ti_i2c_write_2(sc, I2C_REG_BUF, reg); reg = sc->sc_con_reg | I2C_CON_STT; if (repstart == 0) reg |= I2C_CON_STP; if ((sc->sc_buffer->flags & IIC_M_RD) == 0) reg |= I2C_CON_TRX; ti_i2c_write_2(sc, I2C_REG_CON, reg); /* Wait for an event. */ - err = mtx_sleep(sc, &sc->sc_mtx, 0, "i2ciowait", hz); + err = mtx_sleep(sc, &sc->sc_mtx, 0, "i2ciowait", sc->sc_timeout); if (err == 0) err = sc->sc_error; if (err) break; } out: if (timeout == 0) { while (ti_i2c_read_2(sc, I2C_REG_STATUS_RAW) & I2C_STAT_BB) { if (timeout++ > 100) break; DELAY(1000); } } /* Put the controller in master mode again. */ if ((ti_i2c_read_2(sc, I2C_REG_CON) & I2C_CON_MST) == 0) ti_i2c_write_2(sc, I2C_REG_CON, sc->sc_con_reg); sc->sc_buffer = NULL; sc->sc_bus_inuse = 0; /* Wake up the processes that are waiting for the bus. */ wakeup(sc); TI_I2C_UNLOCK(sc); return (err); } static int ti_i2c_reset(struct ti_i2c_softc *sc, u_char speed) { int timeout; struct ti_i2c_clock_config *clkcfg; u_int busfreq; uint16_t fifo_trsh, reg, scll, sclh; switch (ti_chip()) { #ifdef SOC_OMAP4 case CHIP_OMAP_4: clkcfg = ti_omap4_i2c_clock_configs; break; #endif #ifdef SOC_TI_AM335X case CHIP_AM335X: clkcfg = ti_am335x_i2c_clock_configs; break; #endif default: panic("Unknown Ti SoC, unable to reset the i2c"); } /* * If we haven't attached the bus yet, just init at the default slow * speed. This lets us get the hardware initialized enough to attach * the bus which is where the real speed configuration is handled. After * the bus is attached, get the configured speed from it. Search the * configuration table for the best speed we can do that doesn't exceed * the requested speed. 
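 *
 * As a worked example of the divider formula given with the tables above,
 * the AM335x 100KHz entry {psc = 1, scll = 111, sclh = 117} yields
 * 48MHz / ((1 + 1) * (111 + 7 + 117 + 5)) = 48MHz / 480 = 100KHz, and the
 * 400KHz entry {1, 23, 25} yields 48MHz / (2 * 60) = 400KHz; the loop
 * below simply stops at the last table entry that does not exceed the
 * requested bus frequency.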
*/ if (sc->sc_iicbus == NULL) busfreq = 100000; else busfreq = IICBUS_GET_FREQUENCY(sc->sc_iicbus, speed); for (;;) { if (clkcfg[1].frequency == 0 || clkcfg[1].frequency > busfreq) break; clkcfg++; } /* * 23.1.4.3 - HS I2C Software Reset * From OMAP4 TRM at page 4068. * * 1. Ensure that the module is disabled. */ sc->sc_con_reg = 0; ti_i2c_write_2(sc, I2C_REG_CON, sc->sc_con_reg); /* 2. Issue a softreset to the controller. */ bus_write_2(sc->sc_mem_res, I2C_REG_SYSC, I2C_REG_SYSC_SRST); /* * 3. Enable the module. * The I2Ci.I2C_SYSS[0] RDONE bit is asserted only after the module * is enabled by setting the I2Ci.I2C_CON[15] I2C_EN bit to 1. */ ti_i2c_write_2(sc, I2C_REG_CON, I2C_CON_I2C_EN); /* 4. Wait for the software reset to complete. */ timeout = 0; while ((ti_i2c_read_2(sc, I2C_REG_SYSS) & I2C_SYSS_RDONE) == 0) { if (timeout++ > 100) return (EBUSY); DELAY(100); } /* * Disable the I2C controller once again, now that the reset has * finished. */ ti_i2c_write_2(sc, I2C_REG_CON, sc->sc_con_reg); /* * The following sequence is taken from the OMAP4 TRM at page 4077. * * 1. Enable the functional and interface clocks (see Section * 23.1.5.1.1.1.1). Done at ti_i2c_activate(). * * 2. Program the prescaler to obtain an approximately 12MHz internal * sampling clock (I2Ci_INTERNAL_CLK) by programming the * corresponding value in the I2Ci.I2C_PSC[3:0] PSC field. * This value depends on the frequency of the functional clock * (I2Ci_FCLK). Because this frequency is 96MHz, the * I2Ci.I2C_PSC[7:0] PSC field value is 0x7. */ ti_i2c_write_2(sc, I2C_REG_PSC, clkcfg->psc); /* * 3. Program the I2Ci.I2C_SCLL[7:0] SCLL and I2Ci.I2C_SCLH[7:0] SCLH * bit fields to obtain a bit rate of 100 Kbps, 400 Kbps or 1Mbps. * These values depend on the internal sampling clock frequency * (see Table 23-8). */ scll = clkcfg->scll & I2C_SCLL_MASK; sclh = clkcfg->sclh & I2C_SCLH_MASK; /* * 4. (Optional) Program the I2Ci.I2C_SCLL[15:8] HSSCLL and * I2Ci.I2C_SCLH[15:8] HSSCLH fields to obtain a bit rate of * 400K bps or 3.4M bps (for the second phase of HS mode). These * values depend on the internal sampling clock frequency (see * Table 23-8). * * 5. (Optional) If a bit rate of 3.4M bps is used and the bus line * capacitance exceeds 45 pF, (see Section 18.4.8, PAD Functional * Multiplexing and Configuration). */ switch (ti_chip()) { #ifdef SOC_OMAP4 case CHIP_OMAP_4: if ((clkcfg->hsscll + clkcfg->hssclh) > 0) { scll |= clkcfg->hsscll << I2C_HSSCLL_SHIFT; sclh |= clkcfg->hssclh << I2C_HSSCLH_SHIFT; sc->sc_con_reg |= I2C_CON_OPMODE_HS; } break; #endif } /* Write the selected bit rate. */ ti_i2c_write_2(sc, I2C_REG_SCLL, scll); ti_i2c_write_2(sc, I2C_REG_SCLH, sclh); /* * 6. Configure the Own Address of the I2C controller by storing it in * the I2Ci.I2C_OA0 register. Up to four Own Addresses can be * programmed in the I2Ci.I2C_OAi registers (where i = 0, 1, 2, 3) * for each I2C controller. * * Note: For a 10-bit address, set the corresponding expand Own Address * bit in the I2Ci.I2C_CON register. * * Driver currently always in single master mode so ignore this step. */ /* * 7. Set the TX threshold (in transmitter mode) and the RX threshold * (in receiver mode) by setting the I2Ci.I2C_BUF[5:0]XTRSH field to * (TX threshold - 1) and the I2Ci.I2C_BUF[13:8]RTRSH field to (RX * threshold - 1), where the TX and RX thresholds are greater than * or equal to 1. * * The threshold is set to 5 for now. 
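 *
 * With sc_fifo_trsh = 5 the write below programs 4 into both fields
 * (reg = 0x0404, assuming RTRSH really sits at bits 13:8 as described in
 * step 7), so XRDY fires while the TX FIFO level is below 5 and RRDY
 * fires once at least 5 received bytes are pending.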
*/ fifo_trsh = (sc->sc_fifo_trsh - 1) & I2C_BUF_TRSH_MASK; reg = fifo_trsh | (fifo_trsh << I2C_BUF_RXTRSH_SHIFT); ti_i2c_write_2(sc, I2C_REG_BUF, reg); /* * 8. Take the I2C controller out of reset by setting the * I2Ci.I2C_CON[15] I2C_EN bit to 1. * * 23.1.5.1.1.1.2 - Initialize the I2C Controller * * To initialize the I2C controller, perform the following steps: * * 1. Configure the I2Ci.I2C_CON register: * . For master or slave mode, set the I2Ci.I2C_CON[10] MST bit * (0: slave, 1: master). * . For transmitter or receiver mode, set the I2Ci.I2C_CON[9] TRX * bit (0: receiver, 1: transmitter). */ /* Enable the I2C controller in master mode. */ sc->sc_con_reg |= I2C_CON_I2C_EN | I2C_CON_MST; ti_i2c_write_2(sc, I2C_REG_CON, sc->sc_con_reg); /* * 2. If using an interrupt to transmit/receive data, set the * corresponding bit in the I2Ci.I2C_IE register (the I2Ci.I2C_IE[4] * XRDY_IE bit for the transmit interrupt, the I2Ci.I2C_IE[3] RRDY * bit for the receive interrupt). */ /* Set the interrupts we want to be notified. */ reg = I2C_IE_XDR | /* Transmit draining interrupt. */ I2C_IE_XRDY | /* Transmit Data Ready interrupt. */ I2C_IE_RDR | /* Receive draining interrupt. */ I2C_IE_RRDY | /* Receive Data Ready interrupt. */ I2C_IE_ARDY | /* Register Access Ready interrupt. */ I2C_IE_NACK | /* No Acknowledgment interrupt. */ I2C_IE_AL; /* Arbitration lost interrupt. */ /* Enable the interrupts. */ ti_i2c_write_2(sc, I2C_REG_IRQENABLE_SET, reg); /* * 3. If using DMA to receive/transmit data, set to 1 the corresponding * bit in the I2Ci.I2C_BUF register (the I2Ci.I2C_BUF[15] RDMA_EN * bit for the receive DMA channel, the I2Ci.I2C_BUF[7] XDMA_EN bit * for the transmit DMA channel). * * Not using DMA for now, so ignore this. */ return (0); } static int ti_i2c_iicbus_reset(device_t dev, u_char speed, u_char addr, u_char *oldaddr) { struct ti_i2c_softc *sc; int err; sc = device_get_softc(dev); TI_I2C_LOCK(sc); err = ti_i2c_reset(sc, speed); TI_I2C_UNLOCK(sc); if (err) return (err); return (IIC_ENOADDR); } static int ti_i2c_activate(device_t dev) { clk_ident_t clk; int err; struct ti_i2c_softc *sc; sc = (struct ti_i2c_softc*)device_get_softc(dev); /* * 1. Enable the functional and interface clocks (see Section * 23.1.5.1.1.1.1). */ clk = I2C0_CLK + sc->device_id; err = ti_prcm_clk_enable(clk); if (err) return (err); return (ti_i2c_reset(sc, IIC_UNKNOWN)); } /** * ti_i2c_deactivate - deactivates the controller and releases resources * @dev: i2c device handle * * * * LOCKING: * Assumed called in an atomic context. * * RETURNS: * nothing */ static void ti_i2c_deactivate(device_t dev) { struct ti_i2c_softc *sc = device_get_softc(dev); clk_ident_t clk; /* Disable the controller - cancel all transactions. */ ti_i2c_write_2(sc, I2C_REG_IRQENABLE_CLR, 0xffff); ti_i2c_write_2(sc, I2C_REG_STATUS, 0xffff); ti_i2c_write_2(sc, I2C_REG_CON, 0); /* Release the interrupt handler. */ if (sc->sc_irq_h != NULL) { bus_teardown_intr(dev, sc->sc_irq_res, sc->sc_irq_h); sc->sc_irq_h = NULL; } bus_generic_detach(sc->sc_dev); /* Unmap the I2C controller registers. */ if (sc->sc_mem_res != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, 0, sc->sc_mem_res); sc->sc_mem_res = NULL; } /* Release the IRQ resource. */ if (sc->sc_irq_res != NULL) { bus_release_resource(dev, SYS_RES_IRQ, 0, sc->sc_irq_res); sc->sc_irq_res = NULL; } /* Finally disable the functional and interface clocks. 
*/ clk = I2C0_CLK + sc->device_id; ti_prcm_clk_disable(clk); } static int ti_i2c_sysctl_clk(SYSCTL_HANDLER_ARGS) { - device_t dev; int clk, psc, sclh, scll; struct ti_i2c_softc *sc; - dev = (device_t)arg1; - sc = device_get_softc(dev); + sc = arg1; TI_I2C_LOCK(sc); /* Get the system prescaler value. */ psc = (int)ti_i2c_read_2(sc, I2C_REG_PSC) + 1; /* Get the bitrate. */ scll = (int)ti_i2c_read_2(sc, I2C_REG_SCLL) & I2C_SCLL_MASK; sclh = (int)ti_i2c_read_2(sc, I2C_REG_SCLH) & I2C_SCLH_MASK; clk = I2C_CLK / psc / (scll + 7 + sclh + 5); TI_I2C_UNLOCK(sc); return (sysctl_handle_int(oidp, &clk, 0, req)); } static int +ti_i2c_sysctl_timeout(SYSCTL_HANDLER_ARGS) +{ + struct ti_i2c_softc *sc; + unsigned int val; + int err; + + sc = arg1; + + /* + * MTX_DEF lock can't be held while doing uimove in + * sysctl_handle_int + */ + TI_I2C_LOCK(sc); + val = sc->sc_timeout; + TI_I2C_UNLOCK(sc); + + err = sysctl_handle_int(oidp, &val, 0, req); + /* Write request? */ + if ((err == 0) && (req->newptr != NULL)) { + TI_I2C_LOCK(sc); + sc->sc_timeout = val; + TI_I2C_UNLOCK(sc); + } + + return (err); +} + +static int ti_i2c_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); if (!ofw_bus_is_compatible(dev, "ti,i2c")) return (ENXIO); device_set_desc(dev, "TI I2C Controller"); return (0); } static int ti_i2c_attach(device_t dev) { int err, rid; phandle_t node; struct ti_i2c_softc *sc; struct sysctl_ctx_list *ctx; struct sysctl_oid_list *tree; uint16_t fifosz; sc = device_get_softc(dev); sc->sc_dev = dev; /* Get the i2c device id from FDT. */ node = ofw_bus_get_node(dev); if ((OF_getencprop(node, "i2c-device-id", &sc->device_id, sizeof(sc->device_id))) <= 0) { device_printf(dev, "missing i2c-device-id attribute in FDT\n"); return (ENXIO); } /* Get the memory resource for the register mapping. */ rid = 0; sc->sc_mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->sc_mem_res == NULL) { device_printf(dev, "Cannot map registers.\n"); return (ENXIO); } /* Allocate our IRQ resource. */ rid = 0; sc->sc_irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE | RF_SHAREABLE); if (sc->sc_irq_res == NULL) { bus_release_resource(dev, SYS_RES_MEMORY, 0, sc->sc_mem_res); device_printf(dev, "Cannot allocate interrupt.\n"); return (ENXIO); } TI_I2C_LOCK_INIT(sc); /* First of all, we _must_ activate the H/W. */ err = ti_i2c_activate(dev); if (err) { device_printf(dev, "ti_i2c_activate failed\n"); goto out; } /* Read the version number of the I2C module */ sc->sc_rev = ti_i2c_read_2(sc, I2C_REG_REVNB_HI) & 0xff; /* Get the fifo size. */ fifosz = ti_i2c_read_2(sc, I2C_REG_BUFSTAT); fifosz >>= I2C_BUFSTAT_FIFODEPTH_SHIFT; fifosz &= I2C_BUFSTAT_FIFODEPTH_MASK; device_printf(dev, "I2C revision %d.%d FIFO size: %d bytes\n", sc->sc_rev >> 4, sc->sc_rev & 0xf, 8 << fifosz); /* Set the FIFO threshold to 5 for now. */ sc->sc_fifo_trsh = 5; + /* Set I2C bus timeout */ + sc->sc_timeout = 5*hz; + ctx = device_get_sysctl_ctx(dev); tree = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_PROC(ctx, tree, OID_AUTO, "i2c_clock", - CTLFLAG_RD | CTLTYPE_UINT | CTLFLAG_MPSAFE, dev, 0, + CTLFLAG_RD | CTLTYPE_UINT | CTLFLAG_MPSAFE, sc, 0, ti_i2c_sysctl_clk, "IU", "I2C bus clock"); + + SYSCTL_ADD_PROC(ctx, tree, OID_AUTO, "i2c_timeout", + CTLFLAG_RW | CTLTYPE_UINT | CTLFLAG_MPSAFE, sc, 0, + ti_i2c_sysctl_timeout, "IU", "I2C bus timeout (in ticks)"); /* Activate the interrupt. 
*/ err = bus_setup_intr(dev, sc->sc_irq_res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, ti_i2c_intr, sc, &sc->sc_irq_h); if (err) goto out; /* Attach the iicbus. */ if ((sc->sc_iicbus = device_add_child(dev, "iicbus", -1)) == NULL) { device_printf(dev, "could not allocate iicbus instance\n"); err = ENXIO; goto out; } /* Probe and attach the iicbus */ bus_generic_attach(dev); out: if (err) { ti_i2c_deactivate(dev); TI_I2C_LOCK_DESTROY(sc); } return (err); } static int ti_i2c_detach(device_t dev) { struct ti_i2c_softc *sc; int rv; sc = device_get_softc(dev); ti_i2c_deactivate(dev); TI_I2C_LOCK_DESTROY(sc); if (sc->sc_iicbus && (rv = device_delete_child(dev, sc->sc_iicbus)) != 0) return (rv); return (0); } static phandle_t ti_i2c_get_node(device_t bus, device_t dev) { /* Share controller node with iibus device. */ return (ofw_bus_get_node(bus)); } static device_method_t ti_i2c_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ti_i2c_probe), DEVMETHOD(device_attach, ti_i2c_attach), DEVMETHOD(device_detach, ti_i2c_detach), /* OFW methods */ DEVMETHOD(ofw_bus_get_node, ti_i2c_get_node), /* iicbus interface */ DEVMETHOD(iicbus_callback, iicbus_null_callback), DEVMETHOD(iicbus_reset, ti_i2c_iicbus_reset), DEVMETHOD(iicbus_transfer, ti_i2c_transfer), DEVMETHOD_END }; static driver_t ti_i2c_driver = { "iichb", ti_i2c_methods, sizeof(struct ti_i2c_softc), }; static devclass_t ti_i2c_devclass; DRIVER_MODULE(ti_iic, simplebus, ti_i2c_driver, ti_i2c_devclass, 0, 0); DRIVER_MODULE(iicbus, ti_iic, iicbus_driver, iicbus_devclass, 0, 0); MODULE_DEPEND(ti_iic, ti_prcm, 1, 1, 1); MODULE_DEPEND(ti_iic, iicbus, 1, 1, 1); Index: projects/building-blocks/sys/dev/mps/mps.c =================================================================== --- projects/building-blocks/sys/dev/mps/mps.c (revision 277723) +++ projects/building-blocks/sys/dev/mps/mps.c (revision 277724) @@ -1,2699 +1,2705 @@ /*- * Copyright (c) 2009 Yahoo! Inc. * Copyright (c) 2012 LSI Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * LSI MPT-Fusion Host Adapter FreeBSD * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); /* Communications core for LSI MPT2 */ /* TODO Move headers to mpsvar */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int mps_diag_reset(struct mps_softc *sc, int sleep_flag); static int mps_init_queues(struct mps_softc *sc); static int mps_message_unit_reset(struct mps_softc *sc, int sleep_flag); static int mps_transition_operational(struct mps_softc *sc); static int mps_iocfacts_allocate(struct mps_softc *sc, uint8_t attaching); static void mps_iocfacts_free(struct mps_softc *sc); static void mps_startup(void *arg); static int mps_send_iocinit(struct mps_softc *sc); static int mps_alloc_queues(struct mps_softc *sc); static int mps_alloc_replies(struct mps_softc *sc); static int mps_alloc_requests(struct mps_softc *sc); static int mps_attach_log(struct mps_softc *sc); static __inline void mps_complete_command(struct mps_softc *sc, struct mps_command *cm); static void mps_dispatch_event(struct mps_softc *sc, uintptr_t data, MPI2_EVENT_NOTIFICATION_REPLY *reply); static void mps_config_complete(struct mps_softc *sc, struct mps_command *cm); static void mps_periodic(void *); static int mps_reregister_events(struct mps_softc *sc); static void mps_enqueue_request(struct mps_softc *sc, struct mps_command *cm); static int mps_get_iocfacts(struct mps_softc *sc, MPI2_IOC_FACTS_REPLY *facts); static int mps_wait_db_ack(struct mps_softc *sc, int timeout, int sleep_flag); SYSCTL_NODE(_hw, OID_AUTO, mps, CTLFLAG_RD, 0, "MPS Driver Parameters"); MALLOC_DEFINE(M_MPT2, "mps", "mpt2 driver memory"); /* * Do a "Diagnostic Reset" aka a hard reset. This should get the chip out of * any state and back to its initialization state machine. */ static char mpt2_reset_magic[] = { 0x00, 0x0f, 0x04, 0x0b, 0x02, 0x07, 0x0d }; /* Added this union to smoothly convert le64toh cm->cm_desc.Words. * Compiler only support unint64_t to be passed as argument. * Otherwise it will through below error * "aggregate value used where an integer was expected" */ typedef union _reply_descriptor { u64 word; struct { u32 low; u32 high; } u; }reply_descriptor,address_descriptor; /* Rate limit chain-fail messages to 1 per minute */ static struct timeval mps_chainfail_interval = { 60, 0 }; /* * sleep_flag can be either CAN_SLEEP or NO_SLEEP. * If this function is called from process context, it can sleep * and there is no harm to sleep, in case if this fuction is called * from Interrupt handler, we can not sleep and need NO_SLEEP flag set. * based on sleep flags driver will call either msleep, pause or DELAY. * msleep and pause are of same variant, but pause is used when mps_mtx * is not hold by driver. * */ static int mps_diag_reset(struct mps_softc *sc,int sleep_flag) { uint32_t reg; int i, error, tries = 0; uint8_t first_wait_done = FALSE; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); /* Clear any pending interrupts */ mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); /*Force NO_SLEEP for threads prohibited to sleep * e.a Thread from interrupt handler are prohibited to sleep. 
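 *
 * (The sleeping policy is re-checked below via curthread->td_no_sleeping
 * rather than trusting the caller.  The loop that follows pushes the
 * 7-byte mpt2_reset_magic sequence into the WriteSequence register to
 * unlock the Host Diagnostic register, and only proceeds to set the
 * RESET_ADAPTER bit once DIAG_WRITE_ENABLE reads back as set.)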
*/ if (curthread->td_no_sleeping != 0) sleep_flag = NO_SLEEP; /* Push the magic sequence */ error = ETIMEDOUT; while (tries++ < 20) { for (i = 0; i < sizeof(mpt2_reset_magic); i++) mps_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET, mpt2_reset_magic[i]); /* wait 100 msec */ if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP) msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0, "mpsdiag", hz/10); else if (sleep_flag == CAN_SLEEP) pause("mpsdiag", hz/10); else DELAY(100 * 1000); reg = mps_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET); if (reg & MPI2_DIAG_DIAG_WRITE_ENABLE) { error = 0; break; } } if (error) return (error); /* Send the actual reset. XXX need to refresh the reg? */ mps_regwrite(sc, MPI2_HOST_DIAGNOSTIC_OFFSET, reg | MPI2_DIAG_RESET_ADAPTER); /* Wait up to 300 seconds in 50ms intervals */ error = ETIMEDOUT; for (i = 0; i < 6000; i++) { /* * Wait 50 msec. If this is the first time through, wait 256 * msec to satisfy Diag Reset timing requirements. */ if (first_wait_done) { if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP) msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0, "mpsdiag", hz/20); else if (sleep_flag == CAN_SLEEP) pause("mpsdiag", hz/20); else DELAY(50 * 1000); } else { DELAY(256 * 1000); first_wait_done = TRUE; } /* * Check for the RESET_ADAPTER bit to be cleared first, then * wait for the RESET state to be cleared, which takes a little * longer. */ reg = mps_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET); if (reg & MPI2_DIAG_RESET_ADAPTER) { continue; } reg = mps_regread(sc, MPI2_DOORBELL_OFFSET); if ((reg & MPI2_IOC_STATE_MASK) != MPI2_IOC_STATE_RESET) { error = 0; break; } } if (error) return (error); mps_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET, 0x0); return (0); } static int mps_message_unit_reset(struct mps_softc *sc, int sleep_flag) { MPS_FUNCTRACE(sc); mps_regwrite(sc, MPI2_DOORBELL_OFFSET, MPI2_FUNCTION_IOC_MESSAGE_UNIT_RESET << MPI2_DOORBELL_FUNCTION_SHIFT); if (mps_wait_db_ack(sc, 5, sleep_flag) != 0) { mps_dprint(sc, MPS_FAULT, "Doorbell handshake failed : <%s>\n", __func__); return (ETIMEDOUT); } return (0); } static int mps_transition_ready(struct mps_softc *sc) { uint32_t reg, state; int error, tries = 0; int sleep_flags; MPS_FUNCTRACE(sc); /* If we are in attach call, do not sleep */ sleep_flags = (sc->mps_flags & MPS_FLAGS_ATTACH_DONE) ? CAN_SLEEP:NO_SLEEP; error = 0; while (tries++ < 1200) { reg = mps_regread(sc, MPI2_DOORBELL_OFFSET); mps_dprint(sc, MPS_INIT, "Doorbell= 0x%x\n", reg); /* * Ensure the IOC is ready to talk. If it's not, try * resetting it. */ if (reg & MPI2_DOORBELL_USED) { mps_diag_reset(sc, sleep_flags); DELAY(50000); continue; } /* Is the adapter owned by another peer? */ if ((reg & MPI2_DOORBELL_WHO_INIT_MASK) == (MPI2_WHOINIT_PCI_PEER << MPI2_DOORBELL_WHO_INIT_SHIFT)) { device_printf(sc->mps_dev, "IOC is under the control " "of another peer host, aborting initialization.\n"); return (ENXIO); } state = reg & MPI2_IOC_STATE_MASK; if (state == MPI2_IOC_STATE_READY) { /* Ready to go! 
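 * (To summarize the state handling below: READY means we are done; FAULT
 * triggers a diag reset; OPERATIONAL means the IOC is already running, so
 * a message unit reset is sent to take ownership; RESET just waits for
 * the transition to finish; any other state is treated as an error.)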
*/ error = 0; break; } else if (state == MPI2_IOC_STATE_FAULT) { mps_dprint(sc, MPS_FAULT, "IOC in fault state 0x%x, resetting\n", state & MPI2_DOORBELL_FAULT_CODE_MASK); mps_diag_reset(sc, sleep_flags); } else if (state == MPI2_IOC_STATE_OPERATIONAL) { /* Need to take ownership */ mps_message_unit_reset(sc, sleep_flags); } else if (state == MPI2_IOC_STATE_RESET) { /* Wait a bit, IOC might be in transition */ mps_dprint(sc, MPS_FAULT, "IOC in unexpected reset state\n"); } else { mps_dprint(sc, MPS_FAULT, "IOC in unknown state 0x%x\n", state); error = EINVAL; break; } /* Wait 50ms for things to settle down. */ DELAY(50000); } if (error) device_printf(sc->mps_dev, "Cannot transition IOC to ready\n"); return (error); } static int mps_transition_operational(struct mps_softc *sc) { uint32_t reg, state; int error; MPS_FUNCTRACE(sc); error = 0; reg = mps_regread(sc, MPI2_DOORBELL_OFFSET); mps_dprint(sc, MPS_INIT, "Doorbell= 0x%x\n", reg); state = reg & MPI2_IOC_STATE_MASK; if (state != MPI2_IOC_STATE_READY) { if ((error = mps_transition_ready(sc)) != 0) { mps_dprint(sc, MPS_FAULT, "%s failed to transition ready\n", __func__); return (error); } } error = mps_send_iocinit(sc); return (error); } /* * This is called during attach and when re-initializing due to a Diag Reset. * IOC Facts is used to allocate many of the structures needed by the driver. * If called from attach, de-allocation is not required because the driver has * not allocated any structures yet, but if called from a Diag Reset, previously * allocated structures based on IOC Facts will need to be freed and re- * allocated bases on the latest IOC Facts. */ static int mps_iocfacts_allocate(struct mps_softc *sc, uint8_t attaching) { int error; Mpi2IOCFactsReply_t saved_facts; uint8_t saved_mode, reallocating; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); /* Save old IOC Facts and then only reallocate if Facts have changed */ if (!attaching) { bcopy(sc->facts, &saved_facts, sizeof(MPI2_IOC_FACTS_REPLY)); } /* * Get IOC Facts. In all cases throughout this function, panic if doing * a re-initialization and only return the error if attaching so the OS * can handle it. */ if ((error = mps_get_iocfacts(sc, sc->facts)) != 0) { if (attaching) { mps_dprint(sc, MPS_FAULT, "%s failed to get IOC Facts " "with error %d\n", __func__, error); return (error); } else { panic("%s failed to get IOC Facts with error %d\n", __func__, error); } } mps_print_iocfacts(sc, sc->facts); snprintf(sc->fw_version, sizeof(sc->fw_version), "%02d.%02d.%02d.%02d", sc->facts->FWVersion.Struct.Major, sc->facts->FWVersion.Struct.Minor, sc->facts->FWVersion.Struct.Unit, sc->facts->FWVersion.Struct.Dev); mps_printf(sc, "Firmware: %s, Driver: %s\n", sc->fw_version, MPS_DRIVER_VERSION); mps_printf(sc, "IOCCapabilities: %b\n", sc->facts->IOCCapabilities, "\20" "\3ScsiTaskFull" "\4DiagTrace" "\5SnapBuf" "\6ExtBuf" "\7EEDP" "\10BiDirTarg" "\11Multicast" "\14TransRetry" "\15IR" "\16EventReplay" "\17RaidAccel" "\20MSIXIndex" "\21HostDisc"); /* * If the chip doesn't support event replay then a hard reset will be * required to trigger a full discovery. Do the reset here then * retransition to Ready. A hard reset might have already been done, * but it doesn't hurt to do it again. Only do this if attaching, not * for a Diag Reset. 
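 * (The reasoning being that without the EVENT_REPLAY capability the
 * firmware cannot replay discovery events generated before the driver
 * attached, so the hard reset below is what forces a fresh, complete
 * discovery of the topology.)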
*/ if (attaching) { if ((sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_EVENT_REPLAY) == 0) { mps_diag_reset(sc, NO_SLEEP); if ((error = mps_transition_ready(sc)) != 0) { mps_dprint(sc, MPS_FAULT, "%s failed to " "transition to ready with error %d\n", __func__, error); return (error); } } } /* * Set flag if IR Firmware is loaded. If the RAID Capability has * changed from the previous IOC Facts, log a warning, but only if * checking this after a Diag Reset and not during attach. */ saved_mode = sc->ir_firmware; if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_INTEGRATED_RAID) sc->ir_firmware = 1; if (!attaching) { if (sc->ir_firmware != saved_mode) { mps_dprint(sc, MPS_FAULT, "%s new IR/IT mode in IOC " "Facts does not match previous mode\n", __func__); } } /* Only deallocate and reallocate if relevant IOC Facts have changed */ reallocating = FALSE; if ((!attaching) && ((saved_facts.MsgVersion != sc->facts->MsgVersion) || (saved_facts.HeaderVersion != sc->facts->HeaderVersion) || (saved_facts.MaxChainDepth != sc->facts->MaxChainDepth) || (saved_facts.RequestCredit != sc->facts->RequestCredit) || (saved_facts.ProductID != sc->facts->ProductID) || (saved_facts.IOCCapabilities != sc->facts->IOCCapabilities) || (saved_facts.IOCRequestFrameSize != sc->facts->IOCRequestFrameSize) || (saved_facts.MaxTargets != sc->facts->MaxTargets) || (saved_facts.MaxSasExpanders != sc->facts->MaxSasExpanders) || (saved_facts.MaxEnclosures != sc->facts->MaxEnclosures) || (saved_facts.HighPriorityCredit != sc->facts->HighPriorityCredit) || (saved_facts.MaxReplyDescriptorPostQueueDepth != sc->facts->MaxReplyDescriptorPostQueueDepth) || (saved_facts.ReplyFrameSize != sc->facts->ReplyFrameSize) || (saved_facts.MaxVolumes != sc->facts->MaxVolumes) || (saved_facts.MaxPersistentEntries != sc->facts->MaxPersistentEntries))) { reallocating = TRUE; } /* * Some things should be done if attaching or re-allocating after a Diag * Reset, but are not needed after a Diag Reset if the FW has not * changed. */ if (attaching || reallocating) { /* * Check if controller supports FW diag buffers and set flag to * enable each type. */ if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_DIAG_TRACE_BUFFER) sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_TRACE]. enabled = TRUE; if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_SNAPSHOT_BUFFER) sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_SNAPSHOT]. enabled = TRUE; if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_EXTENDED_BUFFER) sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_EXTENDED]. enabled = TRUE; /* * Set flag if EEDP is supported and if TLR is supported. */ if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_EEDP) sc->eedp_enabled = TRUE; if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_TLR) sc->control_TLR = TRUE; /* * Size the queues. Since the reply queues always need one free * entry, we'll just deduct one reply message here. */ sc->num_reqs = MIN(MPS_REQ_FRAMES, sc->facts->RequestCredit); sc->num_replies = MIN(MPS_REPLY_FRAMES + MPS_EVT_REPLY_FRAMES, sc->facts->MaxReplyDescriptorPostQueueDepth) - 1; /* * Initialize all Tail Queues */ TAILQ_INIT(&sc->req_list); TAILQ_INIT(&sc->high_priority_req_list); TAILQ_INIT(&sc->chain_list); TAILQ_INIT(&sc->tm_list); } /* * If doing a Diag Reset and the FW is significantly different * (reallocating will be set above in IOC Facts comparison), then all * buffers based on the IOC Facts will need to be freed before they are * reallocated. 
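 * (Only the fixed list of facts compared above - versions, credits,
 * frame sizes, queue depth, chain depth, capability bits, product ID and
 * the various maximum target/expander/enclosure/volume/persistent-entry
 * counts - can set "reallocating"; a change in, say, FWVersion alone does
 * not force the free/reallocate cycle.)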
*/ if (reallocating) { mps_iocfacts_free(sc); mpssas_realloc_targets(sc, saved_facts.MaxTargets); } /* * Any deallocation has been completed. Now start reallocating * if needed. Will only need to reallocate if attaching or if the new * IOC Facts are different from the previous IOC Facts after a Diag * Reset. Targets have already been allocated above if needed. */ if (attaching || reallocating) { if (((error = mps_alloc_queues(sc)) != 0) || ((error = mps_alloc_replies(sc)) != 0) || ((error = mps_alloc_requests(sc)) != 0)) { if (attaching ) { mps_dprint(sc, MPS_FAULT, "%s failed to alloc " "queues with error %d\n", __func__, error); mps_free(sc); return (error); } else { panic("%s failed to alloc queues with error " "%d\n", __func__, error); } } } /* Always initialize the queues */ bzero(sc->free_queue, sc->fqdepth * 4); mps_init_queues(sc); /* * Always get the chip out of the reset state, but only panic if not * attaching. If attaching and there is an error, that is handled by * the OS. */ error = mps_transition_operational(sc); if (error != 0) { if (attaching) { mps_printf(sc, "%s failed to transition to operational " "with error %d\n", __func__, error); mps_free(sc); return (error); } else { panic("%s failed to transition to operational with " "error %d\n", __func__, error); } } /* * Finish the queue initialization. * These are set here instead of in mps_init_queues() because the * IOC resets these values during the state transition in * mps_transition_operational(). The free index is set to 1 * because the corresponding index in the IOC is set to 0, and the * IOC treats the queues as full if both are set to the same value. * Hence the reason that the queue can't hold all of the possible * replies. */ sc->replypostindex = 0; mps_regwrite(sc, MPI2_REPLY_FREE_HOST_INDEX_OFFSET, sc->replyfreeindex); mps_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET, 0); /* * Attach the subsystems so they can prepare their event masks. */ /* XXX Should be dynamic so that IM/IR and user modules can attach */ if (attaching) { if (((error = mps_attach_log(sc)) != 0) || ((error = mps_attach_sas(sc)) != 0) || ((error = mps_attach_user(sc)) != 0)) { mps_printf(sc, "%s failed to attach all subsystems: " "error %d\n", __func__, error); mps_free(sc); return (error); } if ((error = mps_pci_setup_interrupts(sc)) != 0) { mps_printf(sc, "%s failed to setup interrupts\n", __func__); mps_free(sc); return (error); } } /* * Set flag if this is a WD controller. This shouldn't ever change, but * reset it after a Diag Reset, just in case. */ sc->WD_available = FALSE; if (pci_get_device(sc->mps_dev) == MPI2_MFGPAGE_DEVID_SSS6200) sc->WD_available = TRUE; return (error); } /* * This is called if memory is being free (during detach for example) and when * buffers need to be reallocated due to a Diag Reset. 
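 *
 * Every DMA-backed buffer below is torn down in the usual busdma order;
 * as a sketch of the pattern repeated for each of them ("busaddr",
 * "vaddr", "tag" and "map" stand in for the per-buffer softc fields):
 *
 *	if (busaddr != 0)
 *		bus_dmamap_unload(tag, map);	  // undo bus_dmamap_load()
 *	if (vaddr != NULL)
 *		bus_dmamem_free(tag, vaddr, map); // undo bus_dmamem_alloc()
 *	if (tag != NULL)
 *		bus_dma_tag_destroy(tag);	  // undo bus_dma_tag_create()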
*/ static void mps_iocfacts_free(struct mps_softc *sc) { struct mps_command *cm; int i; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); if (sc->free_busaddr != 0) bus_dmamap_unload(sc->queues_dmat, sc->queues_map); if (sc->free_queue != NULL) bus_dmamem_free(sc->queues_dmat, sc->free_queue, sc->queues_map); if (sc->queues_dmat != NULL) bus_dma_tag_destroy(sc->queues_dmat); if (sc->chain_busaddr != 0) bus_dmamap_unload(sc->chain_dmat, sc->chain_map); if (sc->chain_frames != NULL) bus_dmamem_free(sc->chain_dmat, sc->chain_frames, sc->chain_map); if (sc->chain_dmat != NULL) bus_dma_tag_destroy(sc->chain_dmat); if (sc->sense_busaddr != 0) bus_dmamap_unload(sc->sense_dmat, sc->sense_map); if (sc->sense_frames != NULL) bus_dmamem_free(sc->sense_dmat, sc->sense_frames, sc->sense_map); if (sc->sense_dmat != NULL) bus_dma_tag_destroy(sc->sense_dmat); if (sc->reply_busaddr != 0) bus_dmamap_unload(sc->reply_dmat, sc->reply_map); if (sc->reply_frames != NULL) bus_dmamem_free(sc->reply_dmat, sc->reply_frames, sc->reply_map); if (sc->reply_dmat != NULL) bus_dma_tag_destroy(sc->reply_dmat); if (sc->req_busaddr != 0) bus_dmamap_unload(sc->req_dmat, sc->req_map); if (sc->req_frames != NULL) bus_dmamem_free(sc->req_dmat, sc->req_frames, sc->req_map); if (sc->req_dmat != NULL) bus_dma_tag_destroy(sc->req_dmat); if (sc->chains != NULL) free(sc->chains, M_MPT2); if (sc->commands != NULL) { for (i = 1; i < sc->num_reqs; i++) { cm = &sc->commands[i]; bus_dmamap_destroy(sc->buffer_dmat, cm->cm_dmamap); } free(sc->commands, M_MPT2); } if (sc->buffer_dmat != NULL) bus_dma_tag_destroy(sc->buffer_dmat); } /* * The terms diag reset and hard reset are used interchangeably in the MPI * docs to mean resetting the controller chip. In this code diag reset * cleans everything up, and the hard reset function just sends the reset * sequence to the chip. This should probably be refactored so that every * subsystem gets a reset notification of some sort, and can clean up * appropriately. */ int mps_reinit(struct mps_softc *sc) { int error; struct mpssas_softc *sassc; sassc = sc->sassc; MPS_FUNCTRACE(sc); mtx_assert(&sc->mps_mtx, MA_OWNED); if (sc->mps_flags & MPS_FLAGS_DIAGRESET) { mps_dprint(sc, MPS_INIT, "%s reset already in progress\n", __func__); return 0; } mps_dprint(sc, MPS_INFO, "Reinitializing controller,\n"); /* make sure the completion callbacks can recognize they're getting * a NULL cm_reply due to a reset. */ sc->mps_flags |= MPS_FLAGS_DIAGRESET; /* * Mask interrupts here. */ mps_dprint(sc, MPS_INIT, "%s mask interrupts\n", __func__); mps_mask_intr(sc); error = mps_diag_reset(sc, CAN_SLEEP); if (error != 0) { /* XXXSL No need to panic here */ panic("%s hard reset failed with error %d\n", __func__, error); } /* Restore the PCI state, including the MSI-X registers */ mps_pci_restore(sc); /* Give the I/O subsystem special priority to get itself prepared */ mpssas_handle_reinit(sc); /* * Get IOC Facts and allocate all structures based on this information. * The attach function will also call mps_iocfacts_allocate at startup. * If relevant values have changed in IOC Facts, this function will free * all of the memory based on IOC Facts and reallocate that memory. */ if ((error = mps_iocfacts_allocate(sc, FALSE)) != 0) { panic("%s IOC Facts based allocation failed with error %d\n", __func__, error); } /* * Mapping structures will be re-allocated after getting IOC Page8, so * free these structures here. */ mps_mapping_exit(sc); /* * The static page function currently read is IOC Page8. Others can be * added in future. 
It's possible that the values in IOC Page8 have * changed after a Diag Reset due to user modification, so always read * these. Interrupts are masked, so unmask them before getting config * pages. */ mps_unmask_intr(sc); sc->mps_flags &= ~MPS_FLAGS_DIAGRESET; mps_base_static_config_pages(sc); /* * Some mapping info is based in IOC Page8 data, so re-initialize the * mapping tables. */ mps_mapping_initialize(sc); /* * Restart will reload the event masks clobbered by the reset, and * then enable the port. */ mps_reregister_events(sc); /* the end of discovery will release the simq, so we're done. */ mps_dprint(sc, MPS_INFO, "%s finished sc %p post %u free %u\n", __func__, sc, sc->replypostindex, sc->replyfreeindex); mpssas_release_simq_reinit(sassc); return 0; } /* Wait for the chip to ACK a word that we've put into its FIFO * Wait for seconds. In single loop wait for busy loop * for 500 microseconds. * Total is [ 0.5 * (2000 * ) ] in miliseconds. * */ static int mps_wait_db_ack(struct mps_softc *sc, int timeout, int sleep_flag) { u32 cntdn, count; u32 int_status; u32 doorbell; count = 0; cntdn = (sleep_flag == CAN_SLEEP) ? 1000*timeout : 2000*timeout; do { int_status = mps_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET); if (!(int_status & MPI2_HIS_SYS2IOC_DB_STATUS)) { mps_dprint(sc, MPS_INIT, "%s: successfull count(%d), timeout(%d)\n", __func__, count, timeout); return 0; } else if (int_status & MPI2_HIS_IOC2SYS_DB_STATUS) { doorbell = mps_regread(sc, MPI2_DOORBELL_OFFSET); if ((doorbell & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) { mps_dprint(sc, MPS_FAULT, "fault_state(0x%04x)!\n", doorbell); return (EFAULT); } } else if (int_status == 0xFFFFFFFF) goto out; /* If it can sleep, sleep for 1 milisecond, else busy loop for * 0.5 milisecond */ if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP) msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0, "mpsdba", hz/1000); else if (sleep_flag == CAN_SLEEP) pause("mpsdba", hz/1000); else DELAY(500); count++; } while (--cntdn); out: mps_dprint(sc, MPS_FAULT, "%s: failed due to timeout count(%d), " "int_status(%x)!\n", __func__, count, int_status); return (ETIMEDOUT); } /* Wait for the chip to signal that the next word in its FIFO can be fetched */ static int mps_wait_db_int(struct mps_softc *sc) { int retry; for (retry = 0; retry < MPS_DB_MAX_WAIT; retry++) { if ((mps_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET) & MPI2_HIS_IOC2SYS_DB_STATUS) != 0) return (0); DELAY(2000); } return (ETIMEDOUT); } /* Step through the synchronous command state machine, i.e. "Doorbell mode" */ static int mps_request_sync(struct mps_softc *sc, void *req, MPI2_DEFAULT_REPLY *reply, int req_sz, int reply_sz, int timeout) { uint32_t *data32; uint16_t *data16; int i, count, ioc_sz, residual; int sleep_flags = CAN_SLEEP; if (curthread->td_no_sleeping != 0) sleep_flags = NO_SLEEP; /* Step 1 */ mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); /* Step 2 */ if (mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED) return (EBUSY); /* Step 3 * Announce that a message is coming through the doorbell. Messages * are pushed at 32bit words, so round up if needed. 
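 * For example a 26-byte request rounds up to (26 + 3) / 4 = 7 dwords,
 * which is the count programmed into the ADD_DWORDS field of the
 * doorbell register just below.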
*/ count = (req_sz + 3) / 4; mps_regwrite(sc, MPI2_DOORBELL_OFFSET, (MPI2_FUNCTION_HANDSHAKE << MPI2_DOORBELL_FUNCTION_SHIFT) | (count << MPI2_DOORBELL_ADD_DWORDS_SHIFT)); /* Step 4 */ if (mps_wait_db_int(sc) || (mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED) == 0) { mps_dprint(sc, MPS_FAULT, "Doorbell failed to activate\n"); return (ENXIO); } mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); if (mps_wait_db_ack(sc, 5, sleep_flags) != 0) { mps_dprint(sc, MPS_FAULT, "Doorbell handshake failed\n"); return (ENXIO); } /* Step 5 */ /* Clock out the message data synchronously in 32-bit dwords*/ data32 = (uint32_t *)req; for (i = 0; i < count; i++) { mps_regwrite(sc, MPI2_DOORBELL_OFFSET, htole32(data32[i])); if (mps_wait_db_ack(sc, 5, sleep_flags) != 0) { mps_dprint(sc, MPS_FAULT, "Timeout while writing doorbell\n"); return (ENXIO); } } /* Step 6 */ /* Clock in the reply in 16-bit words. The total length of the * message is always in the 4th byte, so clock out the first 2 words * manually, then loop the rest. */ data16 = (uint16_t *)reply; if (mps_wait_db_int(sc) != 0) { mps_dprint(sc, MPS_FAULT, "Timeout reading doorbell 0\n"); return (ENXIO); } data16[0] = mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK; mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); if (mps_wait_db_int(sc) != 0) { mps_dprint(sc, MPS_FAULT, "Timeout reading doorbell 1\n"); return (ENXIO); } data16[1] = mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK; mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); /* Number of 32bit words in the message */ ioc_sz = reply->MsgLength; /* * Figure out how many 16bit words to clock in without overrunning. * The precision loss with dividing reply_sz can safely be * ignored because the messages can only be multiples of 32bits. */ residual = 0; count = MIN((reply_sz / 4), ioc_sz) * 2; if (count < ioc_sz * 2) { residual = ioc_sz * 2 - count; mps_dprint(sc, MPS_ERROR, "Driver error, throwing away %d " "residual message words\n", residual); } for (i = 2; i < count; i++) { if (mps_wait_db_int(sc) != 0) { mps_dprint(sc, MPS_FAULT, "Timeout reading doorbell %d\n", i); return (ENXIO); } data16[i] = mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK; mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); } /* * Pull out residual words that won't fit into the provided buffer. * This keeps the chip from hanging due to a driver programming * error. 
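 * For example, an 8-dword reply (ioc_sz = 8) read into a 16-byte caller
 * buffer gives count = MIN(16 / 4, 8) * 2 = 8 sixteen-bit words actually
 * stored, leaving residual = 8 * 2 - 8 = 8 words to be read and discarded
 * below.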
*/ while (residual--) { if (mps_wait_db_int(sc) != 0) { mps_dprint(sc, MPS_FAULT, "Timeout reading doorbell\n"); return (ENXIO); } (void)mps_regread(sc, MPI2_DOORBELL_OFFSET); mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); } /* Step 7 */ if (mps_wait_db_int(sc) != 0) { mps_dprint(sc, MPS_FAULT, "Timeout waiting to exit doorbell\n"); return (ENXIO); } if (mps_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED) mps_dprint(sc, MPS_FAULT, "Warning, doorbell still active\n"); mps_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0); return (0); } static void mps_enqueue_request(struct mps_softc *sc, struct mps_command *cm) { reply_descriptor rd; MPS_FUNCTRACE(sc); mps_dprint(sc, MPS_TRACE, "SMID %u cm %p ccb %p\n", cm->cm_desc.Default.SMID, cm, cm->cm_ccb); if (sc->mps_flags & MPS_FLAGS_ATTACH_DONE && !(sc->mps_flags & MPS_FLAGS_SHUTDOWN)) mtx_assert(&sc->mps_mtx, MA_OWNED); if (++sc->io_cmds_active > sc->io_cmds_highwater) sc->io_cmds_highwater++; rd.u.low = cm->cm_desc.Words.Low; rd.u.high = cm->cm_desc.Words.High; rd.word = htole64(rd.word); /* TODO-We may need to make below regwrite atomic */ mps_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_LOW_OFFSET, rd.u.low); mps_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_HIGH_OFFSET, rd.u.high); } /* * Just the FACTS, ma'am. */ static int mps_get_iocfacts(struct mps_softc *sc, MPI2_IOC_FACTS_REPLY *facts) { MPI2_DEFAULT_REPLY *reply; MPI2_IOC_FACTS_REQUEST request; int error, req_sz, reply_sz; MPS_FUNCTRACE(sc); req_sz = sizeof(MPI2_IOC_FACTS_REQUEST); reply_sz = sizeof(MPI2_IOC_FACTS_REPLY); reply = (MPI2_DEFAULT_REPLY *)facts; bzero(&request, req_sz); request.Function = MPI2_FUNCTION_IOC_FACTS; error = mps_request_sync(sc, &request, reply, req_sz, reply_sz, 5); return (error); } static int mps_send_iocinit(struct mps_softc *sc) { MPI2_IOC_INIT_REQUEST init; MPI2_DEFAULT_REPLY reply; int req_sz, reply_sz, error; struct timeval now; uint64_t time_in_msec; MPS_FUNCTRACE(sc); req_sz = sizeof(MPI2_IOC_INIT_REQUEST); reply_sz = sizeof(MPI2_IOC_INIT_REPLY); bzero(&init, req_sz); bzero(&reply, reply_sz); /* * Fill in the init block. Note that most addresses are * deliberately in the lower 32bits of memory. This is a micro- * optimzation for PCI/PCIX, though it's not clear if it helps PCIe. 
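 * All multi-byte fields below go through htole16()/htole32(), i.e. the
 * IOC expects the message in little-endian layout; e.g. the millisecond
 * timestamp from getmicrotime() is split as
 *
 *	init.TimeStamp.High = htole32((time_in_msec >> 32) & 0xFFFFFFFF);
 *	init.TimeStamp.Low = htole32(time_in_msec & 0xFFFFFFFF);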
*/ init.Function = MPI2_FUNCTION_IOC_INIT; init.WhoInit = MPI2_WHOINIT_HOST_DRIVER; init.MsgVersion = htole16(MPI2_VERSION); init.HeaderVersion = htole16(MPI2_HEADER_VERSION); init.SystemRequestFrameSize = htole16(sc->facts->IOCRequestFrameSize); init.ReplyDescriptorPostQueueDepth = htole16(sc->pqdepth); init.ReplyFreeQueueDepth = htole16(sc->fqdepth); init.SenseBufferAddressHigh = 0; init.SystemReplyAddressHigh = 0; init.SystemRequestFrameBaseAddress.High = 0; init.SystemRequestFrameBaseAddress.Low = htole32((uint32_t)sc->req_busaddr); init.ReplyDescriptorPostQueueAddress.High = 0; init.ReplyDescriptorPostQueueAddress.Low = htole32((uint32_t)sc->post_busaddr); init.ReplyFreeQueueAddress.High = 0; init.ReplyFreeQueueAddress.Low = htole32((uint32_t)sc->free_busaddr); getmicrotime(&now); time_in_msec = (now.tv_sec * 1000 + now.tv_usec/1000); init.TimeStamp.High = htole32((time_in_msec >> 32) & 0xFFFFFFFF); init.TimeStamp.Low = htole32(time_in_msec & 0xFFFFFFFF); error = mps_request_sync(sc, &init, &reply, req_sz, reply_sz, 5); if ((reply.IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS) error = ENXIO; mps_dprint(sc, MPS_INIT, "IOCInit status= 0x%x\n", reply.IOCStatus); return (error); } void mps_memaddr_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { bus_addr_t *addr; addr = arg; *addr = segs[0].ds_addr; } static int mps_alloc_queues(struct mps_softc *sc) { bus_addr_t queues_busaddr; uint8_t *queues; int qsize, fqsize, pqsize; /* * The reply free queue contains 4 byte entries in multiples of 16 and * aligned on a 16 byte boundary. There must always be an unused entry. * This queue supplies fresh reply frames for the firmware to use. * * The reply descriptor post queue contains 8 byte entries in * multiples of 16 and aligned on a 16 byte boundary. This queue * contains filled-in reply frames sent from the firmware to the host. * * These two queues are allocated together for simplicity. */ sc->fqdepth = roundup2((sc->num_replies + 1), 16); sc->pqdepth = roundup2((sc->num_replies + 1), 16); fqsize= sc->fqdepth * 4; pqsize = sc->pqdepth * 8; qsize = fqsize + pqsize; if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 16, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR_32BIT,/* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ qsize, /* maxsize */ 1, /* nsegments */ qsize, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->queues_dmat)) { device_printf(sc->mps_dev, "Cannot allocate queues DMA tag\n"); return (ENOMEM); } if (bus_dmamem_alloc(sc->queues_dmat, (void **)&queues, BUS_DMA_NOWAIT, &sc->queues_map)) { device_printf(sc->mps_dev, "Cannot allocate queues memory\n"); return (ENOMEM); } bzero(queues, qsize); bus_dmamap_load(sc->queues_dmat, sc->queues_map, queues, qsize, mps_memaddr_cb, &queues_busaddr, 0); sc->free_queue = (uint32_t *)queues; sc->free_busaddr = queues_busaddr; sc->post_queue = (MPI2_REPLY_DESCRIPTORS_UNION *)(queues + fqsize); sc->post_busaddr = queues_busaddr + fqsize; return (0); } static int mps_alloc_replies(struct mps_softc *sc) { int rsize, num_replies; /* * sc->num_replies should be one less than sc->fqdepth. We need to * allocate space for sc->fqdepth replies, but only sc->num_replies * replies can be used at once. 
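 * As a hypothetical sizing example: with num_replies = 100,
 * mps_alloc_queues() above used fqdepth = roundup2(100 + 1, 16) = 112, so
 * 112 reply frames get allocated here even though at most 100 can be in
 * use at any one time.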
*/ num_replies = max(sc->fqdepth, sc->num_replies); rsize = sc->facts->ReplyFrameSize * num_replies * 4; if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 4, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR_32BIT,/* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ rsize, /* maxsize */ 1, /* nsegments */ rsize, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->reply_dmat)) { device_printf(sc->mps_dev, "Cannot allocate replies DMA tag\n"); return (ENOMEM); } if (bus_dmamem_alloc(sc->reply_dmat, (void **)&sc->reply_frames, BUS_DMA_NOWAIT, &sc->reply_map)) { device_printf(sc->mps_dev, "Cannot allocate replies memory\n"); return (ENOMEM); } bzero(sc->reply_frames, rsize); bus_dmamap_load(sc->reply_dmat, sc->reply_map, sc->reply_frames, rsize, mps_memaddr_cb, &sc->reply_busaddr, 0); return (0); } static int mps_alloc_requests(struct mps_softc *sc) { struct mps_command *cm; struct mps_chain *chain; int i, rsize, nsegs; rsize = sc->facts->IOCRequestFrameSize * sc->num_reqs * 4; if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 16, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR_32BIT,/* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ rsize, /* maxsize */ 1, /* nsegments */ rsize, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->req_dmat)) { device_printf(sc->mps_dev, "Cannot allocate request DMA tag\n"); return (ENOMEM); } if (bus_dmamem_alloc(sc->req_dmat, (void **)&sc->req_frames, BUS_DMA_NOWAIT, &sc->req_map)) { device_printf(sc->mps_dev, "Cannot allocate request memory\n"); return (ENOMEM); } bzero(sc->req_frames, rsize); bus_dmamap_load(sc->req_dmat, sc->req_map, sc->req_frames, rsize, mps_memaddr_cb, &sc->req_busaddr, 0); rsize = sc->facts->IOCRequestFrameSize * sc->max_chains * 4; if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 16, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR_32BIT,/* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ rsize, /* maxsize */ 1, /* nsegments */ rsize, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->chain_dmat)) { device_printf(sc->mps_dev, "Cannot allocate chain DMA tag\n"); return (ENOMEM); } if (bus_dmamem_alloc(sc->chain_dmat, (void **)&sc->chain_frames, BUS_DMA_NOWAIT, &sc->chain_map)) { device_printf(sc->mps_dev, "Cannot allocate chain memory\n"); return (ENOMEM); } bzero(sc->chain_frames, rsize); bus_dmamap_load(sc->chain_dmat, sc->chain_map, sc->chain_frames, rsize, mps_memaddr_cb, &sc->chain_busaddr, 0); rsize = MPS_SENSE_LEN * sc->num_reqs; if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 1, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR_32BIT,/* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ rsize, /* maxsize */ 1, /* nsegments */ rsize, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->sense_dmat)) { device_printf(sc->mps_dev, "Cannot allocate sense DMA tag\n"); return (ENOMEM); } if (bus_dmamem_alloc(sc->sense_dmat, (void **)&sc->sense_frames, BUS_DMA_NOWAIT, &sc->sense_map)) { device_printf(sc->mps_dev, "Cannot allocate sense memory\n"); return (ENOMEM); } bzero(sc->sense_frames, rsize); bus_dmamap_load(sc->sense_dmat, sc->sense_map, sc->sense_frames, rsize, mps_memaddr_cb, &sc->sense_busaddr, 0); sc->chains = malloc(sizeof(struct mps_chain) * sc->max_chains, M_MPT2, M_WAITOK | M_ZERO); if(!sc->chains) { device_printf(sc->mps_dev, "Cannot allocate chains memory %s %d\n", __func__, __LINE__); return 
(ENOMEM); } for (i = 0; i < sc->max_chains; i++) { chain = &sc->chains[i]; chain->chain = (MPI2_SGE_IO_UNION *)(sc->chain_frames + i * sc->facts->IOCRequestFrameSize * 4); chain->chain_busaddr = sc->chain_busaddr + i * sc->facts->IOCRequestFrameSize * 4; mps_free_chain(sc, chain); sc->chain_free_lowwater++; } /* XXX Need to pick a more precise value */ nsegs = (MAXPHYS / PAGE_SIZE) + 1; if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 1, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ BUS_SPACE_MAXSIZE_32BIT,/* maxsize */ nsegs, /* nsegments */ BUS_SPACE_MAXSIZE_24BIT,/* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ busdma_lock_mutex, /* lockfunc */ &sc->mps_mtx, /* lockarg */ &sc->buffer_dmat)) { device_printf(sc->mps_dev, "Cannot allocate buffer DMA tag\n"); return (ENOMEM); } /* * SMID 0 cannot be used as a free command per the firmware spec. * Just drop that command instead of risking accounting bugs. */ sc->commands = malloc(sizeof(struct mps_command) * sc->num_reqs, M_MPT2, M_WAITOK | M_ZERO); if(!sc->commands) { device_printf(sc->mps_dev, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } for (i = 1; i < sc->num_reqs; i++) { cm = &sc->commands[i]; cm->cm_req = sc->req_frames + i * sc->facts->IOCRequestFrameSize * 4; cm->cm_req_busaddr = sc->req_busaddr + i * sc->facts->IOCRequestFrameSize * 4; cm->cm_sense = &sc->sense_frames[i]; cm->cm_sense_busaddr = sc->sense_busaddr + i * MPS_SENSE_LEN; cm->cm_desc.Default.SMID = i; cm->cm_sc = sc; TAILQ_INIT(&cm->cm_chain_list); callout_init_mtx(&cm->cm_callout, &sc->mps_mtx, 0); /* XXX Is a failure here a critical problem? */ if (bus_dmamap_create(sc->buffer_dmat, 0, &cm->cm_dmamap) == 0) if (i <= sc->facts->HighPriorityCredit) mps_free_high_priority_command(sc, cm); else mps_free_command(sc, cm); else { panic("failed to allocate command %d\n", i); sc->num_reqs = i; break; } } return (0); } static int mps_init_queues(struct mps_softc *sc) { int i; memset((uint8_t *)sc->post_queue, 0xff, sc->pqdepth * 8); /* * According to the spec, we need to use one less reply than we * have space for on the queue. So sc->num_replies (the number we * use) should be less than sc->fqdepth (allocated size). */ if (sc->num_replies >= sc->fqdepth) return (EINVAL); /* * Initialize all of the free queue entries. */ for (i = 0; i < sc->fqdepth; i++) sc->free_queue[i] = sc->reply_busaddr + (i * sc->facts->ReplyFrameSize * 4); sc->replyfreeindex = sc->num_replies; return (0); } /* Get the driver parameter tunables. Lowest priority are the driver defaults. * Next are the global settings, if they exist. Highest are the per-unit * settings, if they exist. */ static void mps_get_tunables(struct mps_softc *sc) { char tmpstr[80]; /* XXX default to some debugging for now */ sc->mps_debug = MPS_INFO|MPS_FAULT; sc->disable_msix = 0; sc->disable_msi = 0; sc->max_chains = MPS_CHAIN_FRAMES; /* * Grab the global variables. 
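 *
 * For illustration (example values, not defaults), these knobs are
 * normally set from /boot/loader.conf, with the per-unit dev.mps.<unit>.*
 * variants fetched below overriding the global hw.mps.* ones:
 *
 *     hw.mps.max_chains="4096"
 *     hw.mps.disable_msix="1"
 *     dev.mps.0.debug_level="0x3"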
*/ TUNABLE_INT_FETCH("hw.mps.debug_level", &sc->mps_debug); TUNABLE_INT_FETCH("hw.mps.disable_msix", &sc->disable_msix); TUNABLE_INT_FETCH("hw.mps.disable_msi", &sc->disable_msi); TUNABLE_INT_FETCH("hw.mps.max_chains", &sc->max_chains); /* Grab the unit-instance variables */ snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.debug_level", device_get_unit(sc->mps_dev)); TUNABLE_INT_FETCH(tmpstr, &sc->mps_debug); snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.disable_msix", device_get_unit(sc->mps_dev)); TUNABLE_INT_FETCH(tmpstr, &sc->disable_msix); snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.disable_msi", device_get_unit(sc->mps_dev)); TUNABLE_INT_FETCH(tmpstr, &sc->disable_msi); snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.max_chains", device_get_unit(sc->mps_dev)); TUNABLE_INT_FETCH(tmpstr, &sc->max_chains); bzero(sc->exclude_ids, sizeof(sc->exclude_ids)); snprintf(tmpstr, sizeof(tmpstr), "dev.mps.%d.exclude_ids", device_get_unit(sc->mps_dev)); TUNABLE_STR_FETCH(tmpstr, sc->exclude_ids, sizeof(sc->exclude_ids)); } static void mps_setup_sysctl(struct mps_softc *sc) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; char tmpstr[80], tmpstr2[80]; /* * Setup the sysctl variable so the user can change the debug level * on the fly. */ snprintf(tmpstr, sizeof(tmpstr), "MPS controller %d", device_get_unit(sc->mps_dev)); snprintf(tmpstr2, sizeof(tmpstr2), "%d", device_get_unit(sc->mps_dev)); sysctl_ctx = device_get_sysctl_ctx(sc->mps_dev); if (sysctl_ctx != NULL) sysctl_tree = device_get_sysctl_tree(sc->mps_dev); if (sysctl_tree == NULL) { sysctl_ctx_init(&sc->sysctl_ctx); sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_hw_mps), OID_AUTO, tmpstr2, CTLFLAG_RD, 0, tmpstr); if (sc->sysctl_tree == NULL) return; sysctl_ctx = &sc->sysctl_ctx; sysctl_tree = sc->sysctl_tree; } SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "debug_level", CTLFLAG_RW, &sc->mps_debug, 0, "mps debug level"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "disable_msix", CTLFLAG_RD, &sc->disable_msix, 0, "Disable the use of MSI-X interrupts"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "disable_msi", CTLFLAG_RD, &sc->disable_msi, 0, "Disable the use of MSI interrupts"); SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "firmware_version", CTLFLAG_RW, sc->fw_version, strlen(sc->fw_version), "firmware version"); SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "driver_version", CTLFLAG_RW, MPS_DRIVER_VERSION, strlen(MPS_DRIVER_VERSION), "driver version"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "io_cmds_active", CTLFLAG_RD, &sc->io_cmds_active, 0, "number of currently active commands"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "io_cmds_highwater", CTLFLAG_RD, &sc->io_cmds_highwater, 0, "maximum active commands seen"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "chain_free", CTLFLAG_RD, &sc->chain_free, 0, "number of free chain elements"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "chain_free_lowwater", CTLFLAG_RD, &sc->chain_free_lowwater, 0,"lowest number of free chain elements"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_chains", CTLFLAG_RD, &sc->max_chains, 0,"maximum chain frames that will be allocated"); #if __FreeBSD_version >= 900030 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "chain_alloc_fail", CTLFLAG_RD, 
&sc->chain_alloc_fail, "chain allocation failures"); #endif //FreeBSD_version >= 900030 } int mps_attach(struct mps_softc *sc) { int error; mps_get_tunables(sc); MPS_FUNCTRACE(sc); mtx_init(&sc->mps_mtx, "MPT2SAS lock", NULL, MTX_DEF); callout_init_mtx(&sc->periodic, &sc->mps_mtx, 0); TAILQ_INIT(&sc->event_list); timevalclear(&sc->lastfail); if ((error = mps_transition_ready(sc)) != 0) { mps_printf(sc, "%s failed to transition ready\n", __func__); return (error); } sc->facts = malloc(sizeof(MPI2_IOC_FACTS_REPLY), M_MPT2, M_ZERO|M_NOWAIT); if(!sc->facts) { device_printf(sc->mps_dev, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } /* * Get IOC Facts and allocate all structures based on this information. * A Diag Reset will also call mps_iocfacts_allocate and re-read the IOC * Facts. If relevant values have changed in IOC Facts, this function * will free all of the memory based on IOC Facts and reallocate that * memory. If this fails, any allocated memory should already be freed. */ if ((error = mps_iocfacts_allocate(sc, TRUE)) != 0) { mps_dprint(sc, MPS_FAULT, "%s IOC Facts based allocation " "failed with error %d\n", __func__, error); return (error); } /* Start the periodic watchdog check on the IOC Doorbell */ mps_periodic(sc); /* * The portenable will kick off discovery events that will drive the * rest of the initialization process. The CAM/SAS module will * hold up the boot sequence until discovery is complete. */ sc->mps_ich.ich_func = mps_startup; sc->mps_ich.ich_arg = sc; if (config_intrhook_establish(&sc->mps_ich) != 0) { mps_dprint(sc, MPS_ERROR, "Cannot establish MPS config hook\n"); error = EINVAL; } /* * Allow IR to shutdown gracefully when shutdown occurs. */ sc->shutdown_eh = EVENTHANDLER_REGISTER(shutdown_final, mpssas_ir_shutdown, sc, SHUTDOWN_PRI_DEFAULT); if (sc->shutdown_eh == NULL) mps_dprint(sc, MPS_ERROR, "shutdown event registration " "failed\n"); mps_setup_sysctl(sc); sc->mps_flags |= MPS_FLAGS_ATTACH_DONE; return (error); } /* Run through any late-start handlers. */ static void mps_startup(void *arg) { struct mps_softc *sc; sc = (struct mps_softc *)arg; mps_lock(sc); mps_unmask_intr(sc); /* initialize device mapping tables */ mps_base_static_config_pages(sc); mps_mapping_initialize(sc); mpssas_startup(sc); mps_unlock(sc); } /* Periodic watchdog. Is called with the driver lock already held. 
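 * (An illustrative reading of the check below, using a made-up doorbell
 * value: if mps_regread() returned 0x40002622, the high nibble would
 * match MPI2_IOC_STATE_FAULT, the low 16 bits would be the firmware
 * fault code 0x2622 per the MPI2 spec, and the watchdog would log the
 * fault and call mps_reinit().)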
*/ static void mps_periodic(void *arg) { struct mps_softc *sc; uint32_t db; sc = (struct mps_softc *)arg; if (sc->mps_flags & MPS_FLAGS_SHUTDOWN) return; db = mps_regread(sc, MPI2_DOORBELL_OFFSET); if ((db & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) { mps_dprint(sc, MPS_FAULT, "IOC Fault 0x%08x, Resetting\n", db); mps_reinit(sc); } callout_reset(&sc->periodic, MPS_PERIODIC_DELAY * hz, mps_periodic, sc); } static void mps_log_evt_handler(struct mps_softc *sc, uintptr_t data, MPI2_EVENT_NOTIFICATION_REPLY *event) { MPI2_EVENT_DATA_LOG_ENTRY_ADDED *entry; mps_print_event(sc, event); switch (event->Event) { case MPI2_EVENT_LOG_DATA: mps_dprint(sc, MPS_EVENT, "MPI2_EVENT_LOG_DATA:\n"); if (sc->mps_debug & MPS_EVENT) hexdump(event->EventData, event->EventDataLength, NULL, 0); break; case MPI2_EVENT_LOG_ENTRY_ADDED: entry = (MPI2_EVENT_DATA_LOG_ENTRY_ADDED *)event->EventData; mps_dprint(sc, MPS_EVENT, "MPI2_EVENT_LOG_ENTRY_ADDED event " "0x%x Sequence %d:\n", entry->LogEntryQualifier, entry->LogSequence); break; default: break; } return; } static int mps_attach_log(struct mps_softc *sc) { u32 events[MPI2_EVENT_NOTIFY_EVENTMASK_WORDS]; bzero(events, 16); setbit(events, MPI2_EVENT_LOG_DATA); setbit(events, MPI2_EVENT_LOG_ENTRY_ADDED); mps_register_events(sc, events, mps_log_evt_handler, NULL, &sc->mps_log_eh); return (0); } static int mps_detach_log(struct mps_softc *sc) { if (sc->mps_log_eh != NULL) mps_deregister_events(sc, sc->mps_log_eh); return (0); } /* * Free all of the driver resources and detach submodules. Should be called * without the lock held. */ int mps_free(struct mps_softc *sc) { int error; /* Turn off the watchdog */ mps_lock(sc); sc->mps_flags |= MPS_FLAGS_SHUTDOWN; mps_unlock(sc); /* Lock must not be held for this */ callout_drain(&sc->periodic); if (((error = mps_detach_log(sc)) != 0) || ((error = mps_detach_sas(sc)) != 0)) return (error); mps_detach_user(sc); /* Put the IOC back in the READY state. */ mps_lock(sc); if ((error = mps_transition_ready(sc)) != 0) { mps_unlock(sc); return (error); } mps_unlock(sc); if (sc->facts != NULL) free(sc->facts, M_MPT2); /* * Free all buffers that are based on IOC Facts. A Diag Reset may need * to free these buffers too. 
*/ mps_iocfacts_free(sc); if (sc->sysctl_tree != NULL) sysctl_ctx_free(&sc->sysctl_ctx); /* Deregister the shutdown function */ if (sc->shutdown_eh != NULL) EVENTHANDLER_DEREGISTER(shutdown_final, sc->shutdown_eh); mtx_destroy(&sc->mps_mtx); return (0); } static __inline void mps_complete_command(struct mps_softc *sc, struct mps_command *cm) { MPS_FUNCTRACE(sc); if (cm == NULL) { mps_dprint(sc, MPS_ERROR, "Completing NULL command\n"); return; } if (cm->cm_flags & MPS_CM_FLAGS_POLLED) cm->cm_flags |= MPS_CM_FLAGS_COMPLETE; if (cm->cm_complete != NULL) { mps_dprint(sc, MPS_TRACE, "%s cm %p calling cm_complete %p data %p reply %p\n", __func__, cm, cm->cm_complete, cm->cm_complete_data, cm->cm_reply); cm->cm_complete(sc, cm); } if (cm->cm_flags & MPS_CM_FLAGS_WAKEUP) { mps_dprint(sc, MPS_TRACE, "waking up %p\n", cm); wakeup(cm); } if (cm->cm_sc->io_cmds_active != 0) { cm->cm_sc->io_cmds_active--; } else { mps_dprint(sc, MPS_ERROR, "Warning: io_cmds_active is " "out of sync - resynching to 0\n"); } } static void mps_sas_log_info(struct mps_softc *sc , u32 log_info) { union loginfo_type { u32 loginfo; struct { u32 subcode:16; u32 code:8; u32 originator:4; u32 bus_type:4; } dw; }; union loginfo_type sas_loginfo; char *originator_str = NULL; sas_loginfo.loginfo = log_info; if (sas_loginfo.dw.bus_type != 3 /*SAS*/) return; /* each nexus loss loginfo */ if (log_info == 0x31170000) return; /* eat the loginfos associated with task aborts */ if ((log_info == 30050000 || log_info == 0x31140000 || log_info == 0x31130000)) return; switch (sas_loginfo.dw.originator) { case 0: originator_str = "IOP"; break; case 1: originator_str = "PL"; break; case 2: originator_str = "IR"; break; } mps_dprint(sc, MPS_LOG, "log_info(0x%08x): originator(%s), " "code(0x%02x), sub_code(0x%04x)\n", log_info, originator_str, sas_loginfo.dw.code, sas_loginfo.dw.subcode); } static void mps_display_reply_info(struct mps_softc *sc, uint8_t *reply) { MPI2DefaultReply_t *mpi_reply; u16 sc_status; mpi_reply = (MPI2DefaultReply_t*)reply; sc_status = le16toh(mpi_reply->IOCStatus); if (sc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE) mps_sas_log_info(sc, le32toh(mpi_reply->IOCLogInfo)); } void mps_intr(void *data) { struct mps_softc *sc; uint32_t status; sc = (struct mps_softc *)data; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); /* * Check interrupt status register to flush the bus. This is * needed for both INTx interrupts and driver-driven polling */ status = mps_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET); if ((status & MPI2_HIS_REPLY_DESCRIPTOR_INTERRUPT) == 0) return; mps_lock(sc); mps_intr_locked(data); mps_unlock(sc); return; } /* * In theory, MSI/MSIX interrupts shouldn't need to read any registers on the * chip. Hopefully this theory is correct. */ void mps_intr_msi(void *data) { struct mps_softc *sc; sc = (struct mps_softc *)data; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); mps_lock(sc); mps_intr_locked(data); mps_unlock(sc); return; } /* * The locking is overly broad and simplistic, but easy to deal with for now. 
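 *
 * A rough sketch of the reply-post ring walk done below (illustrative
 * pseudocode, not a substitute for the real loop):
 *
 *     while (descriptor at sc->replypostindex is not UNUSED/0xffffffff) {
 *         advance replypostindex, wrapping at sc->pqdepth;
 *         complete the command named by the descriptor's SMID, or
 *             dispatch the event/diag reply it addresses;
 *         mark the descriptor slot 0xffffffff again;
 *     }
 *     if anything was consumed, write replypostindex back to the
 *         MPI2_REPLY_POST_HOST_INDEX register so the IOC can reuse
 *         the slots.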
*/ void mps_intr_locked(void *data) { MPI2_REPLY_DESCRIPTORS_UNION *desc; struct mps_softc *sc; struct mps_command *cm = NULL; uint8_t flags; u_int pq; MPI2_DIAG_RELEASE_REPLY *rel_rep; mps_fw_diagnostic_buffer_t *pBuffer; sc = (struct mps_softc *)data; pq = sc->replypostindex; mps_dprint(sc, MPS_TRACE, "%s sc %p starting with replypostindex %u\n", __func__, sc, sc->replypostindex); for ( ;; ) { cm = NULL; desc = &sc->post_queue[sc->replypostindex]; flags = desc->Default.ReplyFlags & MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK; if ((flags == MPI2_RPY_DESCRIPT_FLAGS_UNUSED) || (le32toh(desc->Words.High) == 0xffffffff)) break; /* increment the replypostindex now, so that event handlers * and cm completion handlers which decide to do a diag * reset can zero it without it getting incremented again * afterwards, and we break out of this loop on the next * iteration since the reply post queue has been cleared to * 0xFF and all descriptors look unused (which they are). */ if (++sc->replypostindex >= sc->pqdepth) sc->replypostindex = 0; switch (flags) { case MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS: cm = &sc->commands[le16toh(desc->SCSIIOSuccess.SMID)]; cm->cm_reply = NULL; break; case MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY: { uint32_t baddr; uint8_t *reply; /* * Re-compose the reply address from the address * sent back from the chip. The ReplyFrameAddress * is the lower 32 bits of the physical address of * particular reply frame. Convert that address to * host format, and then use that to provide the * offset against the virtual address base * (sc->reply_frames). */ baddr = le32toh(desc->AddressReply.ReplyFrameAddress); reply = sc->reply_frames + (baddr - ((uint32_t)sc->reply_busaddr)); /* * Make sure the reply we got back is in a valid * range. If not, go ahead and panic here, since * we'll probably panic as soon as we deference the * reply pointer anyway. */ if ((reply < sc->reply_frames) || (reply > (sc->reply_frames + (sc->fqdepth * sc->facts->ReplyFrameSize * 4)))) { printf("%s: WARNING: reply %p out of range!\n", __func__, reply); printf("%s: reply_frames %p, fqdepth %d, " "frame size %d\n", __func__, sc->reply_frames, sc->fqdepth, sc->facts->ReplyFrameSize * 4); printf("%s: baddr %#x,\n", __func__, baddr); /* LSI-TODO. See Linux Code. Need Gracefull exit*/ panic("Reply address out of range"); } if (le16toh(desc->AddressReply.SMID) == 0) { if (((MPI2_DEFAULT_REPLY *)reply)->Function == MPI2_FUNCTION_DIAG_BUFFER_POST) { /* * If SMID is 0 for Diag Buffer Post, * this implies that the reply is due to * a release function with a status that * the buffer has been released. Set * the buffer flags accordingly. 
*/ rel_rep = (MPI2_DIAG_RELEASE_REPLY *)reply; if (le16toh(rel_rep->IOCStatus) == MPI2_IOCSTATUS_DIAGNOSTIC_RELEASED) { pBuffer = &sc->fw_diag_buffer_list[ rel_rep->BufferType]; pBuffer->valid_data = TRUE; pBuffer->owned_by_firmware = FALSE; pBuffer->immediate = FALSE; } } else mps_dispatch_event(sc, baddr, (MPI2_EVENT_NOTIFICATION_REPLY *) reply); } else { cm = &sc->commands[le16toh(desc->AddressReply.SMID)]; cm->cm_reply = reply; cm->cm_reply_data = le32toh(desc->AddressReply.ReplyFrameAddress); } break; } case MPI2_RPY_DESCRIPT_FLAGS_TARGETASSIST_SUCCESS: case MPI2_RPY_DESCRIPT_FLAGS_TARGET_COMMAND_BUFFER: case MPI2_RPY_DESCRIPT_FLAGS_RAID_ACCELERATOR_SUCCESS: default: /* Unhandled */ mps_dprint(sc, MPS_ERROR, "Unhandled reply 0x%x\n", desc->Default.ReplyFlags); cm = NULL; break; } if (cm != NULL) { // Print Error reply frame if (cm->cm_reply) mps_display_reply_info(sc,cm->cm_reply); mps_complete_command(sc, cm); } desc->Words.Low = 0xffffffff; desc->Words.High = 0xffffffff; } if (pq != sc->replypostindex) { mps_dprint(sc, MPS_TRACE, "%s sc %p writing postindex %d\n", __func__, sc, sc->replypostindex); mps_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET, sc->replypostindex); } return; } static void mps_dispatch_event(struct mps_softc *sc, uintptr_t data, MPI2_EVENT_NOTIFICATION_REPLY *reply) { struct mps_event_handle *eh; int event, handled = 0; event = le16toh(reply->Event); TAILQ_FOREACH(eh, &sc->event_list, eh_list) { if (isset(eh->mask, event)) { eh->callback(sc, data, reply); handled++; } } if (handled == 0) mps_dprint(sc, MPS_EVENT, "Unhandled event 0x%x\n", le16toh(event)); /* * This is the only place that the event/reply should be freed. * Anything wanting to hold onto the event data should have * already copied it into their own storage. */ mps_free_reply(sc, data); } static void mps_reregister_events_complete(struct mps_softc *sc, struct mps_command *cm) { mps_dprint(sc, MPS_TRACE, "%s\n", __func__); if (cm->cm_reply) mps_print_event(sc, (MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply); mps_free_command(sc, cm); /* next, send a port enable */ mpssas_startup(sc); } /* * For both register_events and update_events, the caller supplies a bitmap * of events that it _wants_. These functions then turn that into a bitmask * suitable for the controller. 
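 *
 * Put differently, the mask handed to the firmware is "set bit =
 * suppress event": the driver starts from all ones and clears the bits
 * that any registered handler asked for. A small illustration (the
 * event numbers are made up):
 *
 *     setbit(wanted, 14); setbit(wanted, 20);     caller's request
 *     event_mask[i] = -1;                         everything masked
 *     event_mask[i] &= ~wanted[i];                bits 14 and 20 unmasked
 *     evtreq->EventMasks[i] = htole32(event_mask[i]);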
*/ int mps_register_events(struct mps_softc *sc, u32 *mask, mps_evt_callback_t *cb, void *data, struct mps_event_handle **handle) { struct mps_event_handle *eh; int error = 0; eh = malloc(sizeof(struct mps_event_handle), M_MPT2, M_WAITOK|M_ZERO); if(!eh) { device_printf(sc->mps_dev, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } eh->callback = cb; eh->data = data; TAILQ_INSERT_TAIL(&sc->event_list, eh, eh_list); if (mask != NULL) error = mps_update_events(sc, eh, mask); *handle = eh; return (error); } int mps_update_events(struct mps_softc *sc, struct mps_event_handle *handle, u32 *mask) { MPI2_EVENT_NOTIFICATION_REQUEST *evtreq; MPI2_EVENT_NOTIFICATION_REPLY *reply; struct mps_command *cm; int error, i; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); if ((mask != NULL) && (handle != NULL)) bcopy(mask, &handle->mask[0], sizeof(u32) * MPI2_EVENT_NOTIFY_EVENTMASK_WORDS); for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++) sc->event_mask[i] = -1; for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++) sc->event_mask[i] &= ~handle->mask[i]; if ((cm = mps_alloc_command(sc)) == NULL) return (EBUSY); evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req; evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION; evtreq->MsgFlags = 0; evtreq->SASBroadcastPrimitiveMasks = 0; #ifdef MPS_DEBUG_ALL_EVENTS { u_char fullmask[16]; memset(fullmask, 0x00, 16); bcopy(fullmask, &evtreq->EventMasks[0], sizeof(u32) * MPI2_EVENT_NOTIFY_EVENTMASK_WORDS); } #else for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++) evtreq->EventMasks[i] = htole32(sc->event_mask[i]); #endif cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_data = NULL; error = mps_request_polled(sc, cm); reply = (MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply; if ((reply == NULL) || (reply->IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS) error = ENXIO; mps_print_event(sc, reply); mps_dprint(sc, MPS_TRACE, "%s finished error %d\n", __func__, error); mps_free_command(sc, cm); return (error); } static int mps_reregister_events(struct mps_softc *sc) { MPI2_EVENT_NOTIFICATION_REQUEST *evtreq; struct mps_command *cm; struct mps_event_handle *eh; int error, i; mps_dprint(sc, MPS_TRACE, "%s\n", __func__); /* first, reregister events */ for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++) sc->event_mask[i] = -1; TAILQ_FOREACH(eh, &sc->event_list, eh_list) { for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++) sc->event_mask[i] &= ~eh->mask[i]; } if ((cm = mps_alloc_command(sc)) == NULL) return (EBUSY); evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req; evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION; evtreq->MsgFlags = 0; evtreq->SASBroadcastPrimitiveMasks = 0; #ifdef MPS_DEBUG_ALL_EVENTS { u_char fullmask[16]; memset(fullmask, 0x00, 16); bcopy(fullmask, &evtreq->EventMasks[0], sizeof(u32) * MPI2_EVENT_NOTIFY_EVENTMASK_WORDS); } #else for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++) evtreq->EventMasks[i] = htole32(sc->event_mask[i]); #endif cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_data = NULL; cm->cm_complete = mps_reregister_events_complete; error = mps_map_command(sc, cm); mps_dprint(sc, MPS_TRACE, "%s finished with error %d\n", __func__, error); return (error); } void mps_deregister_events(struct mps_softc *sc, struct mps_event_handle *handle) { TAILQ_REMOVE(&sc->event_list, handle, eh_list); free(handle, M_MPT2); } /* * Add a chain element as the next SGE for the specified command. 
* Reset cm_sge and cm_sgesize to indicate all the available space. */ static int mps_add_chain(struct mps_command *cm) { MPI2_SGE_CHAIN32 *sgc; struct mps_chain *chain; int space; if (cm->cm_sglsize < MPS_SGC_SIZE) panic("MPS: Need SGE Error Code\n"); chain = mps_alloc_chain(cm->cm_sc); if (chain == NULL) return (ENOBUFS); space = (int)cm->cm_sc->facts->IOCRequestFrameSize * 4; /* * Note: a double-linked list is used to make it easier to * walk for debugging. */ TAILQ_INSERT_TAIL(&cm->cm_chain_list, chain, chain_link); sgc = (MPI2_SGE_CHAIN32 *)&cm->cm_sge->MpiChain; sgc->Length = htole16(space); sgc->NextChainOffset = 0; /* TODO Looks like bug in Setting sgc->Flags. * sgc->Flags = ( MPI2_SGE_FLAGS_CHAIN_ELEMENT | MPI2_SGE_FLAGS_64_BIT_ADDRESSING | * MPI2_SGE_FLAGS_SYSTEM_ADDRESS) << MPI2_SGE_FLAGS_SHIFT * This is fine.. because we are not using simple element. In case of * MPI2_SGE_CHAIN32, we have seperate Length and Flags feild. */ sgc->Flags = MPI2_SGE_FLAGS_CHAIN_ELEMENT; sgc->Address = htole32(chain->chain_busaddr); cm->cm_sge = (MPI2_SGE_IO_UNION *)&chain->chain->MpiSimple; cm->cm_sglsize = space; return (0); } /* * Add one scatter-gather element (chain, simple, transaction context) * to the scatter-gather list for a command. Maintain cm_sglsize and * cm_sge as the remaining size and pointer to the next SGE to fill * in, respectively. */ int mps_push_sge(struct mps_command *cm, void *sgep, size_t len, int segsleft) { MPI2_SGE_TRANSACTION_UNION *tc = sgep; MPI2_SGE_SIMPLE64 *sge = sgep; int error, type; uint32_t saved_buf_len, saved_address_low, saved_address_high; type = (tc->Flags & MPI2_SGE_FLAGS_ELEMENT_MASK); #ifdef INVARIANTS switch (type) { case MPI2_SGE_FLAGS_TRANSACTION_ELEMENT: { if (len != tc->DetailsLength + 4) panic("TC %p length %u or %zu?", tc, tc->DetailsLength + 4, len); } break; case MPI2_SGE_FLAGS_CHAIN_ELEMENT: /* Driver only uses 32-bit chain elements */ if (len != MPS_SGC_SIZE) panic("CHAIN %p length %u or %zu?", sgep, MPS_SGC_SIZE, len); break; case MPI2_SGE_FLAGS_SIMPLE_ELEMENT: /* Driver only uses 64-bit SGE simple elements */ if (len != MPS_SGE64_SIZE) panic("SGE simple %p length %u or %zu?", sge, MPS_SGE64_SIZE, len); if (((le32toh(sge->FlagsLength) >> MPI2_SGE_FLAGS_SHIFT) & MPI2_SGE_FLAGS_ADDRESS_SIZE) == 0) panic("SGE simple %p not marked 64-bit?", sge); break; default: panic("Unexpected SGE %p, flags %02x", tc, tc->Flags); } #endif /* * case 1: 1 more segment, enough room for it * case 2: 2 more segments, enough room for both * case 3: >=2 more segments, only enough room for 1 and a chain * case 4: >=1 more segment, enough room for only a chain * case 5: >=1 more segment, no room for anything (error) */ /* * There should be room for at least a chain element, or this * code is buggy. Case (5). */ if (cm->cm_sglsize < MPS_SGC_SIZE) panic("MPS: Need SGE Error Code\n"); if (segsleft >= 2 && cm->cm_sglsize < len + MPS_SGC_SIZE + MPS_SGE64_SIZE) { /* * There are 2 or more segments left to add, and only * enough room for 1 and a chain. Case (3). * * Mark as last element in this chain if necessary. */ if (type == MPI2_SGE_FLAGS_SIMPLE_ELEMENT) { sge->FlagsLength |= htole32( MPI2_SGE_FLAGS_LAST_ELEMENT << MPI2_SGE_FLAGS_SHIFT); } /* * Add the item then a chain. Do the chain now, * rather than on the next iteration, to simplify * understanding the code. 
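 *
 * A worked example of this case, using the driver's usual element sizes
 * (12-byte 64-bit simple SGEs and 8-byte 32-bit chain SGEs; both sizes
 * are stated here only for illustration): with cm_sglsize = 24 and
 * segsleft = 3, fitting this SGE plus another SGE plus a chain would
 * need 12 + 12 + 8 = 32 bytes, which does not fit, so we take case (3):
 * flag this SGE as LAST_ELEMENT, copy it in, and immediately append a
 * chain element pointing at a freshly allocated chain frame.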
*/ cm->cm_sglsize -= len; bcopy(sgep, cm->cm_sge, len); cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len); return (mps_add_chain(cm)); } if (segsleft >= 1 && cm->cm_sglsize < len + MPS_SGC_SIZE) { /* * 1 or more segment, enough room for only a chain. * Hope the previous element wasn't a Simple entry * that needed to be marked with * MPI2_SGE_FLAGS_LAST_ELEMENT. Case (4). */ if ((error = mps_add_chain(cm)) != 0) return (error); } #ifdef INVARIANTS /* Case 1: 1 more segment, enough room for it. */ if (segsleft == 1 && cm->cm_sglsize < len) panic("1 seg left and no room? %u versus %zu", cm->cm_sglsize, len); /* Case 2: 2 more segments, enough room for both */ if (segsleft == 2 && cm->cm_sglsize < len + MPS_SGE64_SIZE) panic("2 segs left and no room? %u versus %zu", cm->cm_sglsize, len); #endif if (segsleft == 1 && type == MPI2_SGE_FLAGS_SIMPLE_ELEMENT) { /* * If this is a bi-directional request, need to account for that * here. Save the pre-filled sge values. These will be used * either for the 2nd SGL or for a single direction SGL. If * cm_out_len is non-zero, this is a bi-directional request, so * fill in the OUT SGL first, then the IN SGL, otherwise just * fill in the IN SGL. Note that at this time, when filling in * 2 SGL's for a bi-directional request, they both use the same * DMA buffer (same cm command). */ saved_buf_len = le32toh(sge->FlagsLength) & 0x00FFFFFF; saved_address_low = sge->Address.Low; saved_address_high = sge->Address.High; if (cm->cm_out_len) { sge->FlagsLength = htole32(cm->cm_out_len | ((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_HOST_TO_IOC | MPI2_SGE_FLAGS_64_BIT_ADDRESSING) << MPI2_SGE_FLAGS_SHIFT)); cm->cm_sglsize -= len; bcopy(sgep, cm->cm_sge, len); cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len); } saved_buf_len |= ((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_LAST_ELEMENT | MPI2_SGE_FLAGS_END_OF_LIST | MPI2_SGE_FLAGS_64_BIT_ADDRESSING) << MPI2_SGE_FLAGS_SHIFT); if (cm->cm_flags & MPS_CM_FLAGS_DATAIN) { saved_buf_len |= ((uint32_t)(MPI2_SGE_FLAGS_IOC_TO_HOST) << MPI2_SGE_FLAGS_SHIFT); } else { saved_buf_len |= ((uint32_t)(MPI2_SGE_FLAGS_HOST_TO_IOC) << MPI2_SGE_FLAGS_SHIFT); } sge->FlagsLength = htole32(saved_buf_len); sge->Address.Low = saved_address_low; sge->Address.High = saved_address_high; } cm->cm_sglsize -= len; bcopy(sgep, cm->cm_sge, len); cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len); return (0); } /* * Add one dma segment to the scatter-gather list for a command. */ int mps_add_dmaseg(struct mps_command *cm, vm_paddr_t pa, size_t len, u_int flags, int segsleft) { MPI2_SGE_SIMPLE64 sge; /* * This driver always uses 64-bit address elements for simplicity. */ bzero(&sge, sizeof(sge)); flags |= MPI2_SGE_FLAGS_SIMPLE_ELEMENT | MPI2_SGE_FLAGS_64_BIT_ADDRESSING; sge.FlagsLength = htole32(len | (flags << MPI2_SGE_FLAGS_SHIFT)); mps_from_u64(pa, &sge.Address); return (mps_push_sge(cm, &sge, sizeof sge, segsleft)); } static void mps_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { struct mps_softc *sc; struct mps_command *cm; u_int i, dir, sflags; cm = (struct mps_command *)arg; sc = cm->cm_sc; /* * In this case, just print out a warning and let the chip tell the * user they did the wrong thing. 
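 * (A hypothetical illustration: if a command could only accept two
 * segments, cm_max_segs == 2, and busdma handed back three, the warning
 * fires but the loop below still builds all three SGEs and the firmware,
 * not the driver, gets to reject the request.)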
*/ if ((cm->cm_max_segs != 0) && (nsegs > cm->cm_max_segs)) { mps_dprint(sc, MPS_ERROR, "%s: warning: busdma returned %d segments, " "more than the %d allowed\n", __func__, nsegs, cm->cm_max_segs); } /* * Set up DMA direction flags. Bi-directional requests are also handled * here. In that case, both direction flags will be set. */ sflags = 0; if (cm->cm_flags & MPS_CM_FLAGS_SMP_PASS) { /* * We have to add a special case for SMP passthrough, there * is no easy way to generically handle it. The first * S/G element is used for the command (therefore the * direction bit needs to be set). The second one is used * for the reply. We'll leave it to the caller to make * sure we only have two buffers. */ /* * Even though the busdma man page says it doesn't make * sense to have both direction flags, it does in this case. * We have one s/g element being accessed in each direction. */ dir = BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD; /* * Set the direction flag on the first buffer in the SMP * passthrough request. We'll clear it for the second one. */ sflags |= MPI2_SGE_FLAGS_DIRECTION | MPI2_SGE_FLAGS_END_OF_BUFFER; } else if (cm->cm_flags & MPS_CM_FLAGS_DATAOUT) { sflags |= MPI2_SGE_FLAGS_HOST_TO_IOC; dir = BUS_DMASYNC_PREWRITE; } else dir = BUS_DMASYNC_PREREAD; for (i = 0; i < nsegs; i++) { if ((cm->cm_flags & MPS_CM_FLAGS_SMP_PASS) && (i != 0)) { sflags &= ~MPI2_SGE_FLAGS_DIRECTION; } error = mps_add_dmaseg(cm, segs[i].ds_addr, segs[i].ds_len, sflags, nsegs - i); if (error != 0) { /* Resource shortage, roll back! */ if (ratecheck(&sc->lastfail, &mps_chainfail_interval)) mps_dprint(sc, MPS_INFO, "Out of chain frames, " "consider increasing hw.mps.max_chains.\n"); cm->cm_flags |= MPS_CM_FLAGS_CHAIN_FAILED; mps_complete_command(sc, cm); return; } } bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir); mps_enqueue_request(sc, cm); return; } static void mps_data_cb2(void *arg, bus_dma_segment_t *segs, int nsegs, bus_size_t mapsize, int error) { mps_data_cb(arg, segs, nsegs, error); } /* * This is the routine to enqueue commands ansynchronously. * Note that the only error path here is from bus_dmamap_load(), which can * return EINPROGRESS if it is waiting for resources. Other than this, it's * assumed that if you have a command in-hand, then you have enough credits * to use it. */ int mps_map_command(struct mps_softc *sc, struct mps_command *cm) { int error = 0; if (cm->cm_flags & MPS_CM_FLAGS_USE_UIO) { error = bus_dmamap_load_uio(sc->buffer_dmat, cm->cm_dmamap, &cm->cm_uio, mps_data_cb2, cm, 0); } else if (cm->cm_flags & MPS_CM_FLAGS_USE_CCB) { error = bus_dmamap_load_ccb(sc->buffer_dmat, cm->cm_dmamap, cm->cm_data, mps_data_cb, cm, 0); } else if ((cm->cm_data != NULL) && (cm->cm_length != 0)) { error = bus_dmamap_load(sc->buffer_dmat, cm->cm_dmamap, cm->cm_data, cm->cm_length, mps_data_cb, cm, 0); } else { /* Add a zero-length element as needed */ if (cm->cm_sge != NULL) mps_add_dmaseg(cm, 0, 0, 0, 1); mps_enqueue_request(sc, cm); } return (error); } /* * This is the routine to enqueue commands synchronously. An error of * EINPROGRESS from mps_map_command() is ignored since the command will * be executed and enqueued automatically. Other errors come from msleep(). 
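 *
 * Sketch of the synchronous pattern used by callers in this file
 * (locking and error handling trimmed, so treat it as illustrative only;
 * MPI2_XXX_REQUEST stands in for a real request type):
 *
 *     cm = mps_alloc_command(sc);
 *     req = (MPI2_XXX_REQUEST *)cm->cm_req;       fill in the request
 *     cm->cm_desc.Default.RequestFlags =
 *         MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
 *     error = mps_wait_command(sc, cm, 60, CAN_SLEEP);   60s timeout
 *     rpl = (MPI2_DEFAULT_REPLY *)cm->cm_reply;   NULL if no reply
 *     mps_free_command(sc, cm);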
*/ int mps_wait_command(struct mps_softc *sc, struct mps_command *cm, int timeout, int sleep_flag) { int error, rc; struct timeval cur_time, start_time; if (sc->mps_flags & MPS_FLAGS_DIAGRESET) return EBUSY; cm->cm_complete = NULL; cm->cm_flags |= (MPS_CM_FLAGS_WAKEUP + MPS_CM_FLAGS_POLLED); error = mps_map_command(sc, cm); if ((error != 0) && (error != EINPROGRESS)) return (error); // Check for context and wait for 50 mSec at a time until time has // expired or the command has finished. If msleep can't be used, need // to poll. if (curthread->td_no_sleeping != 0) sleep_flag = NO_SLEEP; getmicrotime(&start_time); if (mtx_owned(&sc->mps_mtx) && sleep_flag == CAN_SLEEP) { error = msleep(cm, &sc->mps_mtx, 0, "mpswait", timeout*hz); } else { while ((cm->cm_flags & MPS_CM_FLAGS_COMPLETE) == 0) { mps_intr_locked(sc); if (sleep_flag == CAN_SLEEP) pause("mpswait", hz/20); else DELAY(50000); getmicrotime(&cur_time); if ((cur_time.tv_sec - start_time.tv_sec) > timeout) { error = EWOULDBLOCK; break; } } } if (error == EWOULDBLOCK) { mps_dprint(sc, MPS_FAULT, "Calling Reinit from %s\n", __func__); rc = mps_reinit(sc); mps_dprint(sc, MPS_FAULT, "Reinit %s\n", (rc == 0) ? "success" : "failed"); error = ETIMEDOUT; } return (error); } /* * This is the routine to enqueue a command synchonously and poll for * completion. Its use should be rare. */ int mps_request_polled(struct mps_softc *sc, struct mps_command *cm) { int error, timeout = 0, rc; struct timeval cur_time, start_time; error = 0; cm->cm_flags |= MPS_CM_FLAGS_POLLED; cm->cm_complete = NULL; mps_map_command(sc, cm); getmicrotime(&start_time); while ((cm->cm_flags & MPS_CM_FLAGS_COMPLETE) == 0) { mps_intr_locked(sc); if (mtx_owned(&sc->mps_mtx)) msleep(&sc->msleep_fake_chan, &sc->mps_mtx, 0, "mpspoll", hz/20); else pause("mpsdiag", hz/20); /* * Check for real-time timeout and fail if more than 60 seconds. */ getmicrotime(&cur_time); timeout = cur_time.tv_sec - start_time.tv_sec; if (timeout > 60) { mps_dprint(sc, MPS_FAULT, "polling failed\n"); error = ETIMEDOUT; break; } } if (error) { mps_dprint(sc, MPS_FAULT, "Calling Reinit from %s\n", __func__); rc = mps_reinit(sc); mps_dprint(sc, MPS_FAULT, "Reinit %s\n", (rc == 0) ? "success" : "failed"); } return (error); } /* * The MPT driver had a verbose interface for config pages. In this driver, * reduce it to much simplier terms, similar to the Linux driver. 
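 *
 * A minimal sketch of the simplified interface, modelled on the callers
 * in mps_user.c (the page type chosen here is an example only):
 *
 *     struct mps_config_params params;
 *
 *     bzero(&params, sizeof(params));
 *     params.action = MPI2_CONFIG_ACTION_PAGE_HEADER;
 *     params.hdr.Struct.PageType = MPI2_CONFIG_PAGETYPE_MANUFACTURING;
 *     params.hdr.Struct.PageNumber = 0;
 *     params.buffer = NULL;           header-only request
 *     params.length = 0;
 *     params.callback = NULL;         NULL means run synchronously
 *     error = mps_read_config_page(sc, &params);
 *     then re-issue with action = PAGE_READ_CURRENT and a real buffer
 *     and length once the header reports the page size.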
*/ int mps_read_config_page(struct mps_softc *sc, struct mps_config_params *params) { MPI2_CONFIG_REQUEST *req; struct mps_command *cm; int error; if (sc->mps_flags & MPS_FLAGS_BUSY) { return (EBUSY); } cm = mps_alloc_command(sc); if (cm == NULL) { return (EBUSY); } req = (MPI2_CONFIG_REQUEST *)cm->cm_req; req->Function = MPI2_FUNCTION_CONFIG; req->Action = params->action; req->SGLFlags = 0; req->ChainOffset = 0; req->PageAddress = params->page_address; if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) { MPI2_CONFIG_EXTENDED_PAGE_HEADER *hdr; hdr = ¶ms->hdr.Ext; req->ExtPageType = hdr->ExtPageType; req->ExtPageLength = hdr->ExtPageLength; req->Header.PageType = MPI2_CONFIG_PAGETYPE_EXTENDED; req->Header.PageLength = 0; /* Must be set to zero */ req->Header.PageNumber = hdr->PageNumber; req->Header.PageVersion = hdr->PageVersion; } else { MPI2_CONFIG_PAGE_HEADER *hdr; hdr = ¶ms->hdr.Struct; req->Header.PageType = hdr->PageType; req->Header.PageNumber = hdr->PageNumber; req->Header.PageLength = hdr->PageLength; req->Header.PageVersion = hdr->PageVersion; } cm->cm_data = params->buffer; cm->cm_length = params->length; - cm->cm_sge = &req->PageBufferSGE; - cm->cm_sglsize = sizeof(MPI2_SGE_IO_UNION); - cm->cm_flags = MPS_CM_FLAGS_SGE_SIMPLE | MPS_CM_FLAGS_DATAIN; + if (cm->cm_data != NULL) { + cm->cm_sge = &req->PageBufferSGE; + cm->cm_sglsize = sizeof(MPI2_SGE_IO_UNION); + cm->cm_flags = MPS_CM_FLAGS_SGE_SIMPLE | MPS_CM_FLAGS_DATAIN; + } else + cm->cm_sge = NULL; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_complete_data = params; if (params->callback != NULL) { cm->cm_complete = mps_config_complete; return (mps_map_command(sc, cm)); } else { error = mps_wait_command(sc, cm, 0, CAN_SLEEP); if (error) { mps_dprint(sc, MPS_FAULT, "Error %d reading config page\n", error); mps_free_command(sc, cm); return (error); } mps_config_complete(sc, cm); } return (0); } int mps_write_config_page(struct mps_softc *sc, struct mps_config_params *params) { return (EINVAL); } static void mps_config_complete(struct mps_softc *sc, struct mps_command *cm) { MPI2_CONFIG_REPLY *reply; struct mps_config_params *params; MPS_FUNCTRACE(sc); params = cm->cm_complete_data; if (cm->cm_data != NULL) { bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap); } /* * XXX KDM need to do more error recovery? This results in the * device in question not getting probed. 
*/ if ((cm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) { params->status = MPI2_IOCSTATUS_BUSY; goto done; } reply = (MPI2_CONFIG_REPLY *)cm->cm_reply; if (reply == NULL) { params->status = MPI2_IOCSTATUS_BUSY; goto done; } params->status = reply->IOCStatus; - if (params->hdr.Ext.ExtPageType != 0) { + if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) { params->hdr.Ext.ExtPageType = reply->ExtPageType; params->hdr.Ext.ExtPageLength = reply->ExtPageLength; + params->hdr.Ext.PageType = reply->Header.PageType; + params->hdr.Ext.PageNumber = reply->Header.PageNumber; + params->hdr.Ext.PageVersion = reply->Header.PageVersion; } else { params->hdr.Struct.PageType = reply->Header.PageType; params->hdr.Struct.PageNumber = reply->Header.PageNumber; params->hdr.Struct.PageLength = reply->Header.PageLength; params->hdr.Struct.PageVersion = reply->Header.PageVersion; } done: mps_free_command(sc, cm); if (params->callback != NULL) params->callback(sc, params); return; } Index: projects/building-blocks/sys/dev/mps/mps_user.c =================================================================== --- projects/building-blocks/sys/dev/mps/mps_user.c (revision 277723) +++ projects/building-blocks/sys/dev/mps/mps_user.c (revision 277724) @@ -1,2423 +1,2427 @@ /*- * Copyright (c) 2008 Yahoo!, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * LSI MPT-Fusion Host Adapter FreeBSD userland interface */ /*- * Copyright (c) 2011, 2012 LSI Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * LSI MPT-Fusion Host Adapter FreeBSD * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" /* TODO Move headers to mpsvar */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static d_open_t mps_open; static d_close_t mps_close; static d_ioctl_t mps_ioctl_devsw; static struct cdevsw mps_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = mps_open, .d_close = mps_close, .d_ioctl = mps_ioctl_devsw, .d_name = "mps", }; typedef int (mps_user_f)(struct mps_command *, struct mps_usr_command *); static mps_user_f mpi_pre_ioc_facts; static mps_user_f mpi_pre_port_facts; static mps_user_f mpi_pre_fw_download; static mps_user_f mpi_pre_fw_upload; static mps_user_f mpi_pre_sata_passthrough; static mps_user_f mpi_pre_smp_passthrough; static mps_user_f mpi_pre_config; static mps_user_f mpi_pre_sas_io_unit_control; static int mps_user_read_cfg_header(struct mps_softc *, struct mps_cfg_page_req *); static int mps_user_read_cfg_page(struct mps_softc *, struct mps_cfg_page_req *, void *); static int mps_user_read_extcfg_header(struct mps_softc *, struct mps_ext_cfg_page_req *); static int mps_user_read_extcfg_page(struct mps_softc *, struct mps_ext_cfg_page_req *, void *); static int mps_user_write_cfg_page(struct mps_softc *, struct mps_cfg_page_req *, void *); static int mps_user_setup_request(struct mps_command *, struct mps_usr_command *); static int mps_user_command(struct mps_softc *, struct mps_usr_command *); static int mps_user_pass_thru(struct mps_softc *sc, mps_pass_thru_t *data); static void mps_user_get_adapter_data(struct mps_softc *sc, mps_adapter_data_t *data); static void mps_user_read_pci_info(struct mps_softc *sc, mps_pci_info_t *data); static uint8_t mps_get_fw_diag_buffer_number(struct mps_softc *sc, uint32_t unique_id); static int mps_post_fw_diag_buffer(struct mps_softc *sc, mps_fw_diagnostic_buffer_t *pBuffer, uint32_t *return_code); static int mps_release_fw_diag_buffer(struct mps_softc *sc, mps_fw_diagnostic_buffer_t *pBuffer, uint32_t *return_code, uint32_t diag_type); static int mps_diag_register(struct mps_softc *sc, mps_fw_diag_register_t *diag_register, uint32_t *return_code); static int mps_diag_unregister(struct mps_softc *sc, mps_fw_diag_unregister_t *diag_unregister, uint32_t *return_code); static int mps_diag_query(struct mps_softc *sc, mps_fw_diag_query_t *diag_query, uint32_t *return_code); static int mps_diag_read_buffer(struct mps_softc *sc, mps_diag_read_buffer_t *diag_read_buffer, uint8_t *ioctl_buf, uint32_t *return_code); static 
int mps_diag_release(struct mps_softc *sc, mps_fw_diag_release_t *diag_release, uint32_t *return_code); static int mps_do_diag_action(struct mps_softc *sc, uint32_t action, uint8_t *diag_action, uint32_t length, uint32_t *return_code); static int mps_user_diag_action(struct mps_softc *sc, mps_diag_action_t *data); static void mps_user_event_query(struct mps_softc *sc, mps_event_query_t *data); static void mps_user_event_enable(struct mps_softc *sc, mps_event_enable_t *data); static int mps_user_event_report(struct mps_softc *sc, mps_event_report_t *data); static int mps_user_reg_access(struct mps_softc *sc, mps_reg_access_t *data); static int mps_user_btdh(struct mps_softc *sc, mps_btdh_mapping_t *data); static MALLOC_DEFINE(M_MPSUSER, "mps_user", "Buffers for mps(4) ioctls"); /* Macros from compat/freebsd32/freebsd32.h */ #define PTRIN(v) (void *)(uintptr_t)(v) #define PTROUT(v) (uint32_t)(uintptr_t)(v) #define CP(src,dst,fld) do { (dst).fld = (src).fld; } while (0) #define PTRIN_CP(src,dst,fld) \ do { (dst).fld = PTRIN((src).fld); } while (0) #define PTROUT_CP(src,dst,fld) \ do { (dst).fld = PTROUT((src).fld); } while (0) int mps_attach_user(struct mps_softc *sc) { int unit; unit = device_get_unit(sc->mps_dev); sc->mps_cdev = make_dev(&mps_cdevsw, unit, UID_ROOT, GID_OPERATOR, 0640, "mps%d", unit); if (sc->mps_cdev == NULL) { return (ENOMEM); } sc->mps_cdev->si_drv1 = sc; return (0); } void mps_detach_user(struct mps_softc *sc) { /* XXX: do a purge of pending requests? */ if (sc->mps_cdev != NULL) destroy_dev(sc->mps_cdev); } static int mps_open(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } static int mps_close(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } static int mps_user_read_cfg_header(struct mps_softc *sc, struct mps_cfg_page_req *page_req) { MPI2_CONFIG_PAGE_HEADER *hdr; struct mps_config_params params; int error; hdr = ¶ms.hdr.Struct; params.action = MPI2_CONFIG_ACTION_PAGE_HEADER; params.page_address = le32toh(page_req->page_address); hdr->PageVersion = 0; hdr->PageLength = 0; hdr->PageNumber = page_req->header.PageNumber; hdr->PageType = page_req->header.PageType; params.buffer = NULL; params.length = 0; params.callback = NULL; if ((error = mps_read_config_page(sc, ¶ms)) != 0) { /* * Leave the request. Without resetting the chip, it's * still owned by it and we'll just get into trouble * freeing it now. Mark it as abandoned so that if it * shows up later it can be freed. 
*/ mps_printf(sc, "read_cfg_header timed out\n"); return (ETIMEDOUT); } page_req->ioc_status = htole16(params.status); if ((page_req->ioc_status & MPI2_IOCSTATUS_MASK) == MPI2_IOCSTATUS_SUCCESS) { bcopy(hdr, &page_req->header, sizeof(page_req->header)); } return (0); } static int mps_user_read_cfg_page(struct mps_softc *sc, struct mps_cfg_page_req *page_req, void *buf) { MPI2_CONFIG_PAGE_HEADER *reqhdr, *hdr; struct mps_config_params params; int error; reqhdr = buf; hdr = ¶ms.hdr.Struct; hdr->PageVersion = reqhdr->PageVersion; hdr->PageLength = reqhdr->PageLength; hdr->PageNumber = reqhdr->PageNumber; hdr->PageType = reqhdr->PageType & MPI2_CONFIG_PAGETYPE_MASK; params.action = MPI2_CONFIG_ACTION_PAGE_READ_CURRENT; params.page_address = le32toh(page_req->page_address); params.buffer = buf; params.length = le32toh(page_req->len); params.callback = NULL; if ((error = mps_read_config_page(sc, ¶ms)) != 0) { mps_printf(sc, "mps_user_read_cfg_page timed out\n"); return (ETIMEDOUT); } page_req->ioc_status = htole16(params.status); return (0); } static int mps_user_read_extcfg_header(struct mps_softc *sc, struct mps_ext_cfg_page_req *ext_page_req) { MPI2_CONFIG_EXTENDED_PAGE_HEADER *hdr; struct mps_config_params params; int error; hdr = ¶ms.hdr.Ext; params.action = MPI2_CONFIG_ACTION_PAGE_HEADER; hdr->PageVersion = ext_page_req->header.PageVersion; hdr->PageType = MPI2_CONFIG_PAGETYPE_EXTENDED; hdr->ExtPageLength = 0; hdr->PageNumber = ext_page_req->header.PageNumber; hdr->ExtPageType = ext_page_req->header.ExtPageType; params.page_address = le32toh(ext_page_req->page_address); + params.buffer = NULL; + params.length = 0; + params.callback = NULL; + if ((error = mps_read_config_page(sc, ¶ms)) != 0) { /* * Leave the request. Without resetting the chip, it's * still owned by it and we'll just get into trouble * freeing it now. Mark it as abandoned so that if it * shows up later it can be freed. 
*/ mps_printf(sc, "mps_user_read_extcfg_header timed out\n"); return (ETIMEDOUT); } ext_page_req->ioc_status = htole16(params.status); if ((ext_page_req->ioc_status & MPI2_IOCSTATUS_MASK) == MPI2_IOCSTATUS_SUCCESS) { ext_page_req->header.PageVersion = hdr->PageVersion; ext_page_req->header.PageNumber = hdr->PageNumber; ext_page_req->header.PageType = hdr->PageType; ext_page_req->header.ExtPageLength = hdr->ExtPageLength; ext_page_req->header.ExtPageType = hdr->ExtPageType; } return (0); } static int mps_user_read_extcfg_page(struct mps_softc *sc, struct mps_ext_cfg_page_req *ext_page_req, void *buf) { MPI2_CONFIG_EXTENDED_PAGE_HEADER *reqhdr, *hdr; struct mps_config_params params; int error; reqhdr = buf; hdr = ¶ms.hdr.Ext; params.action = MPI2_CONFIG_ACTION_PAGE_READ_CURRENT; params.page_address = le32toh(ext_page_req->page_address); hdr->PageVersion = reqhdr->PageVersion; hdr->PageType = MPI2_CONFIG_PAGETYPE_EXTENDED; hdr->PageNumber = reqhdr->PageNumber; hdr->ExtPageType = reqhdr->ExtPageType; hdr->ExtPageLength = reqhdr->ExtPageLength; params.buffer = buf; params.length = le32toh(ext_page_req->len); params.callback = NULL; if ((error = mps_read_config_page(sc, ¶ms)) != 0) { mps_printf(sc, "mps_user_read_extcfg_page timed out\n"); return (ETIMEDOUT); } ext_page_req->ioc_status = htole16(params.status); return (0); } static int mps_user_write_cfg_page(struct mps_softc *sc, struct mps_cfg_page_req *page_req, void *buf) { MPI2_CONFIG_PAGE_HEADER *reqhdr, *hdr; struct mps_config_params params; u_int hdr_attr; int error; reqhdr = buf; hdr = ¶ms.hdr.Struct; hdr_attr = reqhdr->PageType & MPI2_CONFIG_PAGEATTR_MASK; if (hdr_attr != MPI2_CONFIG_PAGEATTR_CHANGEABLE && hdr_attr != MPI2_CONFIG_PAGEATTR_PERSISTENT) { mps_printf(sc, "page type 0x%x not changeable\n", reqhdr->PageType & MPI2_CONFIG_PAGETYPE_MASK); return (EINVAL); } /* * There isn't any point in restoring stripped out attributes * if you then mask them going down to issue the request. */ hdr->PageVersion = reqhdr->PageVersion; hdr->PageLength = reqhdr->PageLength; hdr->PageNumber = reqhdr->PageNumber; hdr->PageType = reqhdr->PageType; params.action = MPI2_CONFIG_ACTION_PAGE_WRITE_CURRENT; params.page_address = le32toh(page_req->page_address); params.buffer = buf; params.length = le32toh(page_req->len); params.callback = NULL; if ((error = mps_write_config_page(sc, ¶ms)) != 0) { mps_printf(sc, "mps_write_cfg_page timed out\n"); return (ETIMEDOUT); } page_req->ioc_status = htole16(params.status); return (0); } void mpi_init_sge(struct mps_command *cm, void *req, void *sge) { int off, space; space = (int)cm->cm_sc->facts->IOCRequestFrameSize * 4; off = (uintptr_t)sge - (uintptr_t)req; KASSERT(off < space, ("bad pointers %p %p, off %d, space %d", req, sge, off, space)); cm->cm_sge = sge; cm->cm_sglsize = space - off; } /* * Prepare the mps_command for an IOC_FACTS request. */ static int mpi_pre_ioc_facts(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_IOC_FACTS_REQUEST *req = (void *)cm->cm_req; MPI2_IOC_FACTS_REPLY *rpl; if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); cm->cm_sge = NULL; cm->cm_sglsize = 0; return (0); } /* * Prepare the mps_command for a PORT_FACTS request. 
*/ static int mpi_pre_port_facts(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_PORT_FACTS_REQUEST *req = (void *)cm->cm_req; MPI2_PORT_FACTS_REPLY *rpl; if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); cm->cm_sge = NULL; cm->cm_sglsize = 0; return (0); } /* * Prepare the mps_command for a FW_DOWNLOAD request. */ static int mpi_pre_fw_download(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_FW_DOWNLOAD_REQUEST *req = (void *)cm->cm_req; MPI2_FW_DOWNLOAD_REPLY *rpl; MPI2_FW_DOWNLOAD_TCSGE tc; int error; /* * This code assumes there is room in the request's SGL for * the TransactionContext plus at least a SGL chain element. */ CTASSERT(sizeof req->SGL >= sizeof tc + MPS_SGC_SIZE); if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); if (cmd->len == 0) return (EINVAL); error = copyin(cmd->buf, cm->cm_data, cmd->len); if (error != 0) return (error); mpi_init_sge(cm, req, &req->SGL); bzero(&tc, sizeof tc); /* * For now, the F/W image must be provided in a single request. */ if ((req->MsgFlags & MPI2_FW_DOWNLOAD_MSGFLGS_LAST_SEGMENT) == 0) return (EINVAL); if (req->TotalImageSize != cmd->len) return (EINVAL); /* * The value of the first two elements is specified in the * Fusion-MPT Message Passing Interface document. */ tc.ContextSize = 0; tc.DetailsLength = 12; tc.ImageOffset = 0; tc.ImageSize = cmd->len; cm->cm_flags |= MPS_CM_FLAGS_DATAOUT; return (mps_push_sge(cm, &tc, sizeof tc, 0)); } /* * Prepare the mps_command for a FW_UPLOAD request. */ static int mpi_pre_fw_upload(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_FW_UPLOAD_REQUEST *req = (void *)cm->cm_req; MPI2_FW_UPLOAD_REPLY *rpl; MPI2_FW_UPLOAD_TCSGE tc; /* * This code assumes there is room in the request's SGL for * the TransactionContext plus at least a SGL chain element. */ CTASSERT(sizeof req->SGL >= sizeof tc + MPS_SGC_SIZE); if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); mpi_init_sge(cm, req, &req->SGL); bzero(&tc, sizeof tc); /* * The value of the first two elements is specified in the * Fusion-MPT Message Passing Interface document. */ tc.ContextSize = 0; tc.DetailsLength = 12; /* * XXX Is there any reason to fetch a partial image? I.e. to * set ImageOffset to something other than 0? */ tc.ImageOffset = 0; tc.ImageSize = cmd->len; cm->cm_flags |= MPS_CM_FLAGS_DATAIN; return (mps_push_sge(cm, &tc, sizeof tc, 0)); } /* * Prepare the mps_command for a SATA_PASSTHROUGH request. */ static int mpi_pre_sata_passthrough(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_SATA_PASSTHROUGH_REQUEST *req = (void *)cm->cm_req; MPI2_SATA_PASSTHROUGH_REPLY *rpl; if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); mpi_init_sge(cm, req, &req->SGL); return (0); } /* * Prepare the mps_command for a SMP_PASSTHROUGH request. */ static int mpi_pre_smp_passthrough(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_SMP_PASSTHROUGH_REQUEST *req = (void *)cm->cm_req; MPI2_SMP_PASSTHROUGH_REPLY *rpl; if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); mpi_init_sge(cm, req, &req->SGL); return (0); } /* * Prepare the mps_command for a CONFIG request. 
*/ static int mpi_pre_config(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_CONFIG_REQUEST *req = (void *)cm->cm_req; MPI2_CONFIG_REPLY *rpl; if (cmd->req_len != sizeof *req) return (EINVAL); if (cmd->rpl_len != sizeof *rpl) return (EINVAL); mpi_init_sge(cm, req, &req->PageBufferSGE); return (0); } /* * Prepare the mps_command for a SAS_IO_UNIT_CONTROL request. */ static int mpi_pre_sas_io_unit_control(struct mps_command *cm, struct mps_usr_command *cmd) { cm->cm_sge = NULL; cm->cm_sglsize = 0; return (0); } /* * A set of functions to prepare an mps_command for the various * supported requests. */ struct mps_user_func { U8 Function; mps_user_f *f_pre; } mps_user_func_list[] = { { MPI2_FUNCTION_IOC_FACTS, mpi_pre_ioc_facts }, { MPI2_FUNCTION_PORT_FACTS, mpi_pre_port_facts }, { MPI2_FUNCTION_FW_DOWNLOAD, mpi_pre_fw_download }, { MPI2_FUNCTION_FW_UPLOAD, mpi_pre_fw_upload }, { MPI2_FUNCTION_SATA_PASSTHROUGH, mpi_pre_sata_passthrough }, { MPI2_FUNCTION_SMP_PASSTHROUGH, mpi_pre_smp_passthrough}, { MPI2_FUNCTION_CONFIG, mpi_pre_config}, { MPI2_FUNCTION_SAS_IO_UNIT_CONTROL, mpi_pre_sas_io_unit_control }, { 0xFF, NULL } /* list end */ }; static int mps_user_setup_request(struct mps_command *cm, struct mps_usr_command *cmd) { MPI2_REQUEST_HEADER *hdr = (MPI2_REQUEST_HEADER *)cm->cm_req; struct mps_user_func *f; for (f = mps_user_func_list; f->f_pre != NULL; f++) { if (hdr->Function == f->Function) return (f->f_pre(cm, cmd)); } return (EINVAL); } static int mps_user_command(struct mps_softc *sc, struct mps_usr_command *cmd) { MPI2_REQUEST_HEADER *hdr; MPI2_DEFAULT_REPLY *rpl; void *buf = NULL; struct mps_command *cm = NULL; int err = 0; int sz; mps_lock(sc); cm = mps_alloc_command(sc); if (cm == NULL) { mps_printf(sc, "%s: no mps requests\n", __func__); err = ENOMEM; goto Ret; } mps_unlock(sc); hdr = (MPI2_REQUEST_HEADER *)cm->cm_req; mps_dprint(sc, MPS_USER, "%s: req %p %d rpl %p %d\n", __func__, cmd->req, cmd->req_len, cmd->rpl, cmd->rpl_len); if (cmd->req_len > (int)sc->facts->IOCRequestFrameSize * 4) { err = EINVAL; goto RetFreeUnlocked; } err = copyin(cmd->req, hdr, cmd->req_len); if (err != 0) goto RetFreeUnlocked; mps_dprint(sc, MPS_USER, "%s: Function %02X MsgFlags %02X\n", __func__, hdr->Function, hdr->MsgFlags); if (cmd->len > 0) { buf = malloc(cmd->len, M_MPSUSER, M_WAITOK|M_ZERO); if(!buf) { mps_printf(sc, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } cm->cm_data = buf; cm->cm_length = cmd->len; } else { cm->cm_data = NULL; cm->cm_length = 0; } cm->cm_flags = MPS_CM_FLAGS_SGE_SIMPLE; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; err = mps_user_setup_request(cm, cmd); if (err == EINVAL) { mps_printf(sc, "%s: unsupported parameter or unsupported " "function in request (function = 0x%X)\n", __func__, hdr->Function); } if (err != 0) goto RetFreeUnlocked; mps_lock(sc); err = mps_wait_command(sc, cm, 60, CAN_SLEEP); if (err) { mps_printf(sc, "%s: invalid request: error %d\n", __func__, err); goto Ret; } rpl = (MPI2_DEFAULT_REPLY *)cm->cm_reply; if (rpl != NULL) sz = rpl->MsgLength * 4; else sz = 0; if (sz > cmd->rpl_len) { mps_printf(sc, "%s: user reply buffer (%d) smaller than " "returned buffer (%d)\n", __func__, cmd->rpl_len, sz); sz = cmd->rpl_len; } mps_unlock(sc); copyout(rpl, cmd->rpl, sz); if (buf != NULL) copyout(buf, cmd->buf, cmd->len); mps_dprint(sc, MPS_USER, "%s: reply size %d\n", __func__, sz); RetFreeUnlocked: mps_lock(sc); if (cm != NULL) mps_free_command(sc, cm); Ret: mps_unlock(sc); if (buf != NULL) 
free(buf, M_MPSUSER); return (err); } static int mps_user_pass_thru(struct mps_softc *sc, mps_pass_thru_t *data) { MPI2_REQUEST_HEADER *hdr, tmphdr; MPI2_DEFAULT_REPLY *rpl; struct mps_command *cm = NULL; int err = 0, dir = 0, sz; uint8_t function = 0; u_int sense_len; /* * Only allow one passthru command at a time. Use the MPS_FLAGS_BUSY * bit to denote that a passthru is being processed. */ mps_lock(sc); if (sc->mps_flags & MPS_FLAGS_BUSY) { mps_dprint(sc, MPS_USER, "%s: Only one passthru command " "allowed at a single time.", __func__); mps_unlock(sc); return (EBUSY); } sc->mps_flags |= MPS_FLAGS_BUSY; mps_unlock(sc); /* * Do some validation on data direction. Valid cases are: * 1) DataSize is 0 and direction is NONE * 2) DataSize is non-zero and one of: * a) direction is READ or * b) direction is WRITE or * c) direction is BOTH and DataOutSize is non-zero * If valid and the direction is BOTH, change the direction to READ. * if valid and the direction is not BOTH, make sure DataOutSize is 0. */ if (((data->DataSize == 0) && (data->DataDirection == MPS_PASS_THRU_DIRECTION_NONE)) || ((data->DataSize != 0) && ((data->DataDirection == MPS_PASS_THRU_DIRECTION_READ) || (data->DataDirection == MPS_PASS_THRU_DIRECTION_WRITE) || ((data->DataDirection == MPS_PASS_THRU_DIRECTION_BOTH) && (data->DataOutSize != 0))))) { if (data->DataDirection == MPS_PASS_THRU_DIRECTION_BOTH) data->DataDirection = MPS_PASS_THRU_DIRECTION_READ; else data->DataOutSize = 0; } else return (EINVAL); mps_dprint(sc, MPS_USER, "%s: req 0x%jx %d rpl 0x%jx %d " "data in 0x%jx %d data out 0x%jx %d data dir %d\n", __func__, data->PtrRequest, data->RequestSize, data->PtrReply, data->ReplySize, data->PtrData, data->DataSize, data->PtrDataOut, data->DataOutSize, data->DataDirection); /* * copy in the header so we know what we're dealing with before we * commit to allocating a command for it. */ err = copyin(PTRIN(data->PtrRequest), &tmphdr, data->RequestSize); if (err != 0) goto RetFreeUnlocked; if (data->RequestSize > (int)sc->facts->IOCRequestFrameSize * 4) { err = EINVAL; goto RetFreeUnlocked; } function = tmphdr.Function; mps_dprint(sc, MPS_USER, "%s: Function %02X MsgFlags %02X\n", __func__, function, tmphdr.MsgFlags); /* * Handle a passthru TM request. */ if (function == MPI2_FUNCTION_SCSI_TASK_MGMT) { MPI2_SCSI_TASK_MANAGE_REQUEST *task; mps_lock(sc); cm = mpssas_alloc_tm(sc); if (cm == NULL) { err = EINVAL; goto Ret; } /* Copy the header in. Only a small fixup is needed. */ task = (MPI2_SCSI_TASK_MANAGE_REQUEST *)cm->cm_req; bcopy(&tmphdr, task, data->RequestSize); task->TaskMID = cm->cm_desc.Default.SMID; cm->cm_data = NULL; cm->cm_desc.HighPriority.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY; cm->cm_complete = NULL; cm->cm_complete_data = NULL; err = mps_wait_command(sc, cm, 30, CAN_SLEEP); if (err != 0) { err = EIO; mps_dprint(sc, MPS_FAULT, "%s: task management failed", __func__); } /* * Copy the reply data and sense data to user space. 
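 * Note: only data->ReplySize bytes are copied out here; when the
 * firmware reply is larger, a warning is logged and the excess is
 * dropped.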
*/ if (cm->cm_reply != NULL) { rpl = (MPI2_DEFAULT_REPLY *)cm->cm_reply; sz = rpl->MsgLength * 4; if (sz > data->ReplySize) { mps_printf(sc, "%s: user reply buffer (%d) " "smaller than returned buffer (%d)\n", __func__, data->ReplySize, sz); } mps_unlock(sc); copyout(cm->cm_reply, PTRIN(data->PtrReply), data->ReplySize); mps_lock(sc); } mpssas_free_tm(sc, cm); goto Ret; } mps_lock(sc); cm = mps_alloc_command(sc); if (cm == NULL) { mps_printf(sc, "%s: no mps requests\n", __func__); err = ENOMEM; goto Ret; } mps_unlock(sc); hdr = (MPI2_REQUEST_HEADER *)cm->cm_req; bcopy(&tmphdr, hdr, data->RequestSize); /* * Do some checking to make sure the IOCTL request contains a valid * request. Then set the SGL info. */ mpi_init_sge(cm, hdr, (void *)((uint8_t *)hdr + data->RequestSize)); /* * Set up for read, write or both. From check above, DataOutSize will * be 0 if direction is READ or WRITE, but it will have some non-zero * value if the direction is BOTH. So, just use the biggest size to get * the cm_data buffer size. If direction is BOTH, 2 SGLs need to be set * up; the first is for the request and the second will contain the * response data. cm_out_len needs to be set here and this will be used * when the SGLs are set up. */ cm->cm_data = NULL; cm->cm_length = MAX(data->DataSize, data->DataOutSize); cm->cm_out_len = data->DataOutSize; cm->cm_flags = 0; if (cm->cm_length != 0) { cm->cm_data = malloc(cm->cm_length, M_MPSUSER, M_WAITOK | M_ZERO); if (cm->cm_data == NULL) { mps_dprint(sc, MPS_FAULT, "%s: alloc failed for IOCTL " "passthru length %d\n", __func__, cm->cm_length); } else { cm->cm_flags = MPS_CM_FLAGS_DATAIN; if (data->DataOutSize) { cm->cm_flags |= MPS_CM_FLAGS_DATAOUT; err = copyin(PTRIN(data->PtrDataOut), cm->cm_data, data->DataOutSize); } else if (data->DataDirection == MPS_PASS_THRU_DIRECTION_WRITE) { cm->cm_flags = MPS_CM_FLAGS_DATAOUT; err = copyin(PTRIN(data->PtrData), cm->cm_data, data->DataSize); } if (err != 0) mps_dprint(sc, MPS_FAULT, "%s: failed to copy " "IOCTL data from user space\n", __func__); } } cm->cm_flags |= MPS_CM_FLAGS_SGE_SIMPLE; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; /* * Set up Sense buffer and SGL offset for IO passthru. SCSI IO request * uses SCSI IO descriptor. */ if ((function == MPI2_FUNCTION_SCSI_IO_REQUEST) || (function == MPI2_FUNCTION_RAID_SCSI_IO_PASSTHROUGH)) { MPI2_SCSI_IO_REQUEST *scsi_io_req; scsi_io_req = (MPI2_SCSI_IO_REQUEST *)hdr; /* * Put SGE for data and data_out buffer at the end of * scsi_io_request message header (64 bytes in total). * Following above SGEs, the residual space will be used by * sense data. */ scsi_io_req->SenseBufferLength = (uint8_t)(data->RequestSize - 64); scsi_io_req->SenseBufferLowAddress = htole32(cm->cm_sense_busaddr); /* * Set SGLOffset0 value. This is the number of dwords that SGL * is offset from the beginning of MPI2_SCSI_IO_REQUEST struct. */ scsi_io_req->SGLOffset0 = 24; /* * Setup descriptor info. RAID passthrough must use the * default request descriptor which is already set, so if this * is a SCSI IO request, change the descriptor to SCSI IO. * Also, if this is a SCSI IO request, handle the reply in the * mpssas_scsio_complete function. */ if (function == MPI2_FUNCTION_SCSI_IO_REQUEST) { cm->cm_desc.SCSIIO.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO; cm->cm_desc.SCSIIO.DevHandle = scsi_io_req->DevHandle; /* * Make sure the DevHandle is not 0 because this is a * likely error. 
*/ if (scsi_io_req->DevHandle == 0) { err = EINVAL; goto RetFreeUnlocked; } } } mps_lock(sc); err = mps_wait_command(sc, cm, 30, CAN_SLEEP); if (err) { mps_printf(sc, "%s: invalid request: error %d\n", __func__, err); mps_unlock(sc); goto RetFreeUnlocked; } /* * Sync the DMA data, if any. Then copy the data to user space. */ if (cm->cm_data != NULL) { if (cm->cm_flags & MPS_CM_FLAGS_DATAIN) dir = BUS_DMASYNC_POSTREAD; else if (cm->cm_flags & MPS_CM_FLAGS_DATAOUT) dir = BUS_DMASYNC_POSTWRITE; bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir); bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap); if (cm->cm_flags & MPS_CM_FLAGS_DATAIN) { mps_unlock(sc); err = copyout(cm->cm_data, PTRIN(data->PtrData), data->DataSize); mps_lock(sc); if (err != 0) mps_dprint(sc, MPS_FAULT, "%s: failed to copy " "IOCTL data to user space\n", __func__); } } /* * Copy the reply data and sense data to user space. */ if (cm->cm_reply != NULL) { rpl = (MPI2_DEFAULT_REPLY *)cm->cm_reply; sz = rpl->MsgLength * 4; if (sz > data->ReplySize) { mps_printf(sc, "%s: user reply buffer (%d) smaller " "than returned buffer (%d)\n", __func__, data->ReplySize, sz); } mps_unlock(sc); copyout(cm->cm_reply, PTRIN(data->PtrReply), data->ReplySize); mps_lock(sc); if ((function == MPI2_FUNCTION_SCSI_IO_REQUEST) || (function == MPI2_FUNCTION_RAID_SCSI_IO_PASSTHROUGH)) { if (((MPI2_SCSI_IO_REPLY *)rpl)->SCSIState & MPI2_SCSI_STATE_AUTOSENSE_VALID) { sense_len = MIN((le32toh(((MPI2_SCSI_IO_REPLY *)rpl)->SenseCount)), sizeof(struct scsi_sense_data)); mps_unlock(sc); copyout(cm->cm_sense, cm->cm_req + 64, sense_len); mps_lock(sc); } } } mps_unlock(sc); RetFreeUnlocked: mps_lock(sc); if (cm != NULL) { if (cm->cm_data) free(cm->cm_data, M_MPSUSER); mps_free_command(sc, cm); } Ret: sc->mps_flags &= ~MPS_FLAGS_BUSY; mps_unlock(sc); return (err); } static void mps_user_get_adapter_data(struct mps_softc *sc, mps_adapter_data_t *data) { Mpi2ConfigReply_t mpi_reply; Mpi2BiosPage3_t config_page; /* * Use the PCI interface functions to get the Bus, Device, and Function * information. */ data->PciInformation.u.bits.BusNumber = pci_get_bus(sc->mps_dev); data->PciInformation.u.bits.DeviceNumber = pci_get_slot(sc->mps_dev); data->PciInformation.u.bits.FunctionNumber = pci_get_function(sc->mps_dev); /* * Get the FW version that should already be saved in IOC Facts. */ data->MpiFirmwareVersion = sc->facts->FWVersion.Word; /* * General device info. */ data->AdapterType = MPSIOCTL_ADAPTER_TYPE_SAS2; if (sc->mps_flags & MPS_FLAGS_WD_AVAILABLE) data->AdapterType = MPSIOCTL_ADAPTER_TYPE_SAS2_SSS6200; data->PCIDeviceHwId = pci_get_device(sc->mps_dev); data->PCIDeviceHwRev = pci_read_config(sc->mps_dev, PCIR_REVID, 1); data->SubSystemId = pci_get_subdevice(sc->mps_dev); data->SubsystemVendorId = pci_get_subvendor(sc->mps_dev); /* * Get the driver version. */ strcpy((char *)&data->DriverVersion[0], MPS_DRIVER_VERSION); /* * Need to get BIOS Config Page 3 for the BIOS Version. */ data->BiosVersion = 0; mps_lock(sc); if (mps_config_get_bios_pg3(sc, &mpi_reply, &config_page)) printf("%s: Error while retrieving BIOS Version\n", __func__); else data->BiosVersion = config_page.BiosVersion; mps_unlock(sc); } static void mps_user_read_pci_info(struct mps_softc *sc, mps_pci_info_t *data) { int i; /* * Use the PCI interface functions to get the Bus, Device, and Function * information. 
*/ data->BusNumber = pci_get_bus(sc->mps_dev); data->DeviceNumber = pci_get_slot(sc->mps_dev); data->FunctionNumber = pci_get_function(sc->mps_dev); /* * Now get the interrupt vector and the pci header. The vector can * only be 0 right now. The header is the first 256 bytes of config * space. */ data->InterruptVector = 0; for (i = 0; i < sizeof (data->PciHeader); i++) { data->PciHeader[i] = pci_read_config(sc->mps_dev, i, 1); } } static uint8_t mps_get_fw_diag_buffer_number(struct mps_softc *sc, uint32_t unique_id) { uint8_t index; for (index = 0; index < MPI2_DIAG_BUF_TYPE_COUNT; index++) { if (sc->fw_diag_buffer_list[index].unique_id == unique_id) { return (index); } } return (MPS_FW_DIAGNOSTIC_UID_NOT_FOUND); } static int mps_post_fw_diag_buffer(struct mps_softc *sc, mps_fw_diagnostic_buffer_t *pBuffer, uint32_t *return_code) { MPI2_DIAG_BUFFER_POST_REQUEST *req; MPI2_DIAG_BUFFER_POST_REPLY *reply; struct mps_command *cm = NULL; int i, status; /* * If buffer is not enabled, just leave. */ *return_code = MPS_FW_DIAG_ERROR_POST_FAILED; if (!pBuffer->enabled) { return (MPS_DIAG_FAILURE); } /* * Clear some flags initially. */ pBuffer->force_release = FALSE; pBuffer->valid_data = FALSE; pBuffer->owned_by_firmware = FALSE; /* * Get a command. */ cm = mps_alloc_command(sc); if (cm == NULL) { mps_printf(sc, "%s: no mps requests\n", __func__); return (MPS_DIAG_FAILURE); } /* * Build the request for releasing the FW Diag Buffer and send it. */ req = (MPI2_DIAG_BUFFER_POST_REQUEST *)cm->cm_req; req->Function = MPI2_FUNCTION_DIAG_BUFFER_POST; req->BufferType = pBuffer->buffer_type; req->ExtendedType = pBuffer->extended_type; req->BufferLength = pBuffer->size; for (i = 0; i < (sizeof(req->ProductSpecific) / 4); i++) req->ProductSpecific[i] = pBuffer->product_specific[i]; mps_from_u64(sc->fw_diag_busaddr, &req->BufferAddress); cm->cm_data = NULL; cm->cm_length = 0; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_complete_data = NULL; /* * Send command synchronously. */ status = mps_wait_command(sc, cm, 30, CAN_SLEEP); if (status) { mps_printf(sc, "%s: invalid request: error %d\n", __func__, status); status = MPS_DIAG_FAILURE; goto done; } /* * Process POST reply. */ reply = (MPI2_DIAG_BUFFER_POST_REPLY *)cm->cm_reply; if (reply->IOCStatus != MPI2_IOCSTATUS_SUCCESS) { status = MPS_DIAG_FAILURE; mps_dprint(sc, MPS_FAULT, "%s: post of FW Diag Buffer failed " "with IOCStatus = 0x%x, IOCLogInfo = 0x%x and " "TransferLength = 0x%x\n", __func__, reply->IOCStatus, reply->IOCLogInfo, reply->TransferLength); goto done; } /* * Post was successful. */ pBuffer->valid_data = TRUE; pBuffer->owned_by_firmware = TRUE; *return_code = MPS_FW_DIAG_ERROR_SUCCESS; status = MPS_DIAG_SUCCESS; done: mps_free_command(sc, cm); return (status); } static int mps_release_fw_diag_buffer(struct mps_softc *sc, mps_fw_diagnostic_buffer_t *pBuffer, uint32_t *return_code, uint32_t diag_type) { MPI2_DIAG_RELEASE_REQUEST *req; MPI2_DIAG_RELEASE_REPLY *reply; struct mps_command *cm = NULL; int status; /* * If buffer is not enabled, just leave. */ *return_code = MPS_FW_DIAG_ERROR_RELEASE_FAILED; if (!pBuffer->enabled) { mps_dprint(sc, MPS_USER, "%s: This buffer type is not " "supported by the IOC", __func__); return (MPS_DIAG_FAILURE); } /* * Clear some flags initially. */ pBuffer->force_release = FALSE; pBuffer->valid_data = FALSE; pBuffer->owned_by_firmware = FALSE; /* * Get a command. 
*/ cm = mps_alloc_command(sc); if (cm == NULL) { mps_printf(sc, "%s: no mps requests\n", __func__); return (MPS_DIAG_FAILURE); } /* * Build the request for releasing the FW Diag Buffer and send it. */ req = (MPI2_DIAG_RELEASE_REQUEST *)cm->cm_req; req->Function = MPI2_FUNCTION_DIAG_RELEASE; req->BufferType = pBuffer->buffer_type; cm->cm_data = NULL; cm->cm_length = 0; cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE; cm->cm_complete_data = NULL; /* * Send command synchronously. */ status = mps_wait_command(sc, cm, 30, CAN_SLEEP); if (status) { mps_printf(sc, "%s: invalid request: error %d\n", __func__, status); status = MPS_DIAG_FAILURE; goto done; } /* * Process RELEASE reply. */ reply = (MPI2_DIAG_RELEASE_REPLY *)cm->cm_reply; if ((reply->IOCStatus != MPI2_IOCSTATUS_SUCCESS) || pBuffer->owned_by_firmware) { status = MPS_DIAG_FAILURE; mps_dprint(sc, MPS_FAULT, "%s: release of FW Diag Buffer " "failed with IOCStatus = 0x%x and IOCLogInfo = 0x%x\n", __func__, reply->IOCStatus, reply->IOCLogInfo); goto done; } /* * Release was successful. */ *return_code = MPS_FW_DIAG_ERROR_SUCCESS; status = MPS_DIAG_SUCCESS; /* * If this was for an UNREGISTER diag type command, clear the unique ID. */ if (diag_type == MPS_FW_DIAG_TYPE_UNREGISTER) { pBuffer->unique_id = MPS_FW_DIAG_INVALID_UID; } done: return (status); } static int mps_diag_register(struct mps_softc *sc, mps_fw_diag_register_t *diag_register, uint32_t *return_code) { mps_fw_diagnostic_buffer_t *pBuffer; uint8_t extended_type, buffer_type, i; uint32_t buffer_size; uint32_t unique_id; int status; extended_type = diag_register->ExtendedType; buffer_type = diag_register->BufferType; buffer_size = diag_register->RequestedBufferSize; unique_id = diag_register->UniqueId; /* * Check for valid buffer type */ if (buffer_type >= MPI2_DIAG_BUF_TYPE_COUNT) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; return (MPS_DIAG_FAILURE); } /* * Get the current buffer and look up the unique ID. The unique ID * should not be found. If it is, the ID is already in use. */ i = mps_get_fw_diag_buffer_number(sc, unique_id); pBuffer = &sc->fw_diag_buffer_list[buffer_type]; if (i != MPS_FW_DIAGNOSTIC_UID_NOT_FOUND) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } /* * The buffer's unique ID should not be registered yet, and the given * unique ID cannot be 0. */ if ((pBuffer->unique_id != MPS_FW_DIAG_INVALID_UID) || (unique_id == MPS_FW_DIAG_INVALID_UID)) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } /* * If this buffer is already posted as immediate, just change owner. */ if (pBuffer->immediate && pBuffer->owned_by_firmware && (pBuffer->unique_id == MPS_FW_DIAG_INVALID_UID)) { pBuffer->immediate = FALSE; pBuffer->unique_id = unique_id; return (MPS_DIAG_SUCCESS); } /* * Post a new buffer after checking if it's enabled. The DMA buffer * that is allocated will be contiguous (nsegments = 1). 
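 * The sequence below is: create a DMA tag limited to a single segment,
 * allocate and zero the buffer, then load the map; mps_memaddr_cb
 * records the bus address in sc->fw_diag_busaddr for use by the
 * DIAG_BUFFER_POST request.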
*/ if (!pBuffer->enabled) { *return_code = MPS_FW_DIAG_ERROR_NO_BUFFER; return (MPS_DIAG_FAILURE); } if (bus_dma_tag_create( sc->mps_parent_dmat, /* parent */ 1, 0, /* algnmnt, boundary */ BUS_SPACE_MAXADDR_32BIT,/* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ buffer_size, /* maxsize */ 1, /* nsegments */ buffer_size, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &sc->fw_diag_dmat)) { device_printf(sc->mps_dev, "Cannot allocate FW diag buffer DMA " "tag\n"); return (ENOMEM); } if (bus_dmamem_alloc(sc->fw_diag_dmat, (void **)&sc->fw_diag_buffer, BUS_DMA_NOWAIT, &sc->fw_diag_map)) { device_printf(sc->mps_dev, "Cannot allocate FW diag buffer " "memory\n"); return (ENOMEM); } bzero(sc->fw_diag_buffer, buffer_size); bus_dmamap_load(sc->fw_diag_dmat, sc->fw_diag_map, sc->fw_diag_buffer, buffer_size, mps_memaddr_cb, &sc->fw_diag_busaddr, 0); pBuffer->size = buffer_size; /* * Copy the given info to the diag buffer and post the buffer. */ pBuffer->buffer_type = buffer_type; pBuffer->immediate = FALSE; if (buffer_type == MPI2_DIAG_BUF_TYPE_TRACE) { for (i = 0; i < (sizeof (pBuffer->product_specific) / 4); i++) { pBuffer->product_specific[i] = diag_register->ProductSpecific[i]; } } pBuffer->extended_type = extended_type; pBuffer->unique_id = unique_id; status = mps_post_fw_diag_buffer(sc, pBuffer, return_code); /* * In case there was a failure, free the DMA buffer. */ if (status == MPS_DIAG_FAILURE) { if (sc->fw_diag_busaddr != 0) bus_dmamap_unload(sc->fw_diag_dmat, sc->fw_diag_map); if (sc->fw_diag_buffer != NULL) bus_dmamem_free(sc->fw_diag_dmat, sc->fw_diag_buffer, sc->fw_diag_map); if (sc->fw_diag_dmat != NULL) bus_dma_tag_destroy(sc->fw_diag_dmat); } return (status); } static int mps_diag_unregister(struct mps_softc *sc, mps_fw_diag_unregister_t *diag_unregister, uint32_t *return_code) { mps_fw_diagnostic_buffer_t *pBuffer; uint8_t i; uint32_t unique_id; int status; unique_id = diag_unregister->UniqueId; /* * Get the current buffer and look up the unique ID. The unique ID * should be there. */ i = mps_get_fw_diag_buffer_number(sc, unique_id); if (i == MPS_FW_DIAGNOSTIC_UID_NOT_FOUND) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } pBuffer = &sc->fw_diag_buffer_list[i]; /* * Try to release the buffer from FW before freeing it. If release * fails, don't free the DMA buffer in case FW tries to access it * later. If buffer is not owned by firmware, can't release it. */ if (!pBuffer->owned_by_firmware) { status = MPS_DIAG_SUCCESS; } else { status = mps_release_fw_diag_buffer(sc, pBuffer, return_code, MPS_FW_DIAG_TYPE_UNREGISTER); } /* * At this point, return the current status no matter what happens with * the DMA buffer. */ pBuffer->unique_id = MPS_FW_DIAG_INVALID_UID; if (status == MPS_DIAG_SUCCESS) { if (sc->fw_diag_busaddr != 0) bus_dmamap_unload(sc->fw_diag_dmat, sc->fw_diag_map); if (sc->fw_diag_buffer != NULL) bus_dmamem_free(sc->fw_diag_dmat, sc->fw_diag_buffer, sc->fw_diag_map); if (sc->fw_diag_dmat != NULL) bus_dma_tag_destroy(sc->fw_diag_dmat); } return (status); } static int mps_diag_query(struct mps_softc *sc, mps_fw_diag_query_t *diag_query, uint32_t *return_code) { mps_fw_diagnostic_buffer_t *pBuffer; uint8_t i; uint32_t unique_id; unique_id = diag_query->UniqueId; /* * If ID is valid, query on ID. * If ID is invalid, query on buffer type. 
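 * Example (illustrative): UniqueId == MPS_FW_DIAG_INVALID_UID together
 * with BufferType == MPI2_DIAG_BUF_TYPE_TRACE selects the trace buffer
 * directly; any other UniqueId is resolved through
 * mps_get_fw_diag_buffer_number().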
*/ if (unique_id == MPS_FW_DIAG_INVALID_UID) { i = diag_query->BufferType; if (i >= MPI2_DIAG_BUF_TYPE_COUNT) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } } else { i = mps_get_fw_diag_buffer_number(sc, unique_id); if (i == MPS_FW_DIAGNOSTIC_UID_NOT_FOUND) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } } /* * Fill query structure with the diag buffer info. */ pBuffer = &sc->fw_diag_buffer_list[i]; diag_query->BufferType = pBuffer->buffer_type; diag_query->ExtendedType = pBuffer->extended_type; if (diag_query->BufferType == MPI2_DIAG_BUF_TYPE_TRACE) { for (i = 0; i < (sizeof(diag_query->ProductSpecific) / 4); i++) { diag_query->ProductSpecific[i] = pBuffer->product_specific[i]; } } diag_query->TotalBufferSize = pBuffer->size; diag_query->DriverAddedBufferSize = 0; diag_query->UniqueId = pBuffer->unique_id; diag_query->ApplicationFlags = 0; diag_query->DiagnosticFlags = 0; /* * Set/Clear application flags */ if (pBuffer->immediate) { diag_query->ApplicationFlags &= ~MPS_FW_DIAG_FLAG_APP_OWNED; } else { diag_query->ApplicationFlags |= MPS_FW_DIAG_FLAG_APP_OWNED; } if (pBuffer->valid_data || pBuffer->owned_by_firmware) { diag_query->ApplicationFlags |= MPS_FW_DIAG_FLAG_BUFFER_VALID; } else { diag_query->ApplicationFlags &= ~MPS_FW_DIAG_FLAG_BUFFER_VALID; } if (pBuffer->owned_by_firmware) { diag_query->ApplicationFlags |= MPS_FW_DIAG_FLAG_FW_BUFFER_ACCESS; } else { diag_query->ApplicationFlags &= ~MPS_FW_DIAG_FLAG_FW_BUFFER_ACCESS; } return (MPS_DIAG_SUCCESS); } static int mps_diag_read_buffer(struct mps_softc *sc, mps_diag_read_buffer_t *diag_read_buffer, uint8_t *ioctl_buf, uint32_t *return_code) { mps_fw_diagnostic_buffer_t *pBuffer; uint8_t i, *pData; uint32_t unique_id; int status; unique_id = diag_read_buffer->UniqueId; /* * Get the current buffer and look up the unique ID. The unique ID * should be there. */ i = mps_get_fw_diag_buffer_number(sc, unique_id); if (i == MPS_FW_DIAGNOSTIC_UID_NOT_FOUND) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } pBuffer = &sc->fw_diag_buffer_list[i]; /* * Make sure requested read is within limits */ if (diag_read_buffer->StartingOffset + diag_read_buffer->BytesToRead > pBuffer->size) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; return (MPS_DIAG_FAILURE); } /* * Copy the requested data from DMA to the diag_read_buffer. The DMA * buffer that was allocated is one contiguous buffer. */ pData = (uint8_t *)(sc->fw_diag_buffer + diag_read_buffer->StartingOffset); if (copyout(pData, ioctl_buf, diag_read_buffer->BytesToRead) != 0) return (MPS_DIAG_FAILURE); diag_read_buffer->Status = 0; /* * Set or clear the Force Release flag. */ if (pBuffer->force_release) { diag_read_buffer->Flags |= MPS_FW_DIAG_FLAG_FORCE_RELEASE; } else { diag_read_buffer->Flags &= ~MPS_FW_DIAG_FLAG_FORCE_RELEASE; } /* * If buffer is to be reregistered, make sure it's not already owned by * firmware first. */ status = MPS_DIAG_SUCCESS; if (!pBuffer->owned_by_firmware) { if (diag_read_buffer->Flags & MPS_FW_DIAG_FLAG_REREGISTER) { status = mps_post_fw_diag_buffer(sc, pBuffer, return_code); } } return (status); } static int mps_diag_release(struct mps_softc *sc, mps_fw_diag_release_t *diag_release, uint32_t *return_code) { mps_fw_diagnostic_buffer_t *pBuffer; uint8_t i; uint32_t unique_id; int status; unique_id = diag_release->UniqueId; /* * Get the current buffer and look up the unique ID. The unique ID * should be there. 
*/ i = mps_get_fw_diag_buffer_number(sc, unique_id); if (i == MPS_FW_DIAGNOSTIC_UID_NOT_FOUND) { *return_code = MPS_FW_DIAG_ERROR_INVALID_UID; return (MPS_DIAG_FAILURE); } pBuffer = &sc->fw_diag_buffer_list[i]; /* * If buffer is not owned by firmware, it's already been released. */ if (!pBuffer->owned_by_firmware) { *return_code = MPS_FW_DIAG_ERROR_ALREADY_RELEASED; return (MPS_DIAG_FAILURE); } /* * Release the buffer. */ status = mps_release_fw_diag_buffer(sc, pBuffer, return_code, MPS_FW_DIAG_TYPE_RELEASE); return (status); } static int mps_do_diag_action(struct mps_softc *sc, uint32_t action, uint8_t *diag_action, uint32_t length, uint32_t *return_code) { mps_fw_diag_register_t diag_register; mps_fw_diag_unregister_t diag_unregister; mps_fw_diag_query_t diag_query; mps_diag_read_buffer_t diag_read_buffer; mps_fw_diag_release_t diag_release; int status = MPS_DIAG_SUCCESS; uint32_t original_return_code; original_return_code = *return_code; *return_code = MPS_FW_DIAG_ERROR_SUCCESS; switch (action) { case MPS_FW_DIAG_TYPE_REGISTER: if (!length) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; status = MPS_DIAG_FAILURE; break; } if (copyin(diag_action, &diag_register, sizeof(diag_register)) != 0) return (MPS_DIAG_FAILURE); status = mps_diag_register(sc, &diag_register, return_code); break; case MPS_FW_DIAG_TYPE_UNREGISTER: if (length < sizeof(diag_unregister)) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; status = MPS_DIAG_FAILURE; break; } if (copyin(diag_action, &diag_unregister, sizeof(diag_unregister)) != 0) return (MPS_DIAG_FAILURE); status = mps_diag_unregister(sc, &diag_unregister, return_code); break; case MPS_FW_DIAG_TYPE_QUERY: if (length < sizeof (diag_query)) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; status = MPS_DIAG_FAILURE; break; } if (copyin(diag_action, &diag_query, sizeof(diag_query)) != 0) return (MPS_DIAG_FAILURE); status = mps_diag_query(sc, &diag_query, return_code); if (status == MPS_DIAG_SUCCESS) if (copyout(&diag_query, diag_action, sizeof (diag_query)) != 0) return (MPS_DIAG_FAILURE); break; case MPS_FW_DIAG_TYPE_READ_BUFFER: if (copyin(diag_action, &diag_read_buffer, sizeof(diag_read_buffer)) != 0) return (MPS_DIAG_FAILURE); if (length < diag_read_buffer.BytesToRead) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; status = MPS_DIAG_FAILURE; break; } status = mps_diag_read_buffer(sc, &diag_read_buffer, PTRIN(diag_read_buffer.PtrDataBuffer), return_code); if (status == MPS_DIAG_SUCCESS) { if (copyout(&diag_read_buffer, diag_action, sizeof(diag_read_buffer) - sizeof(diag_read_buffer.PtrDataBuffer)) != 0) return (MPS_DIAG_FAILURE); } break; case MPS_FW_DIAG_TYPE_RELEASE: if (length < sizeof(diag_release)) { *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; status = MPS_DIAG_FAILURE; break; } if (copyin(diag_action, &diag_release, sizeof(diag_release)) != 0) return (MPS_DIAG_FAILURE); status = mps_diag_release(sc, &diag_release, return_code); break; default: *return_code = MPS_FW_DIAG_ERROR_INVALID_PARAMETER; status = MPS_DIAG_FAILURE; break; } if ((status == MPS_DIAG_FAILURE) && (original_return_code == MPS_FW_DIAG_NEW) && (*return_code != MPS_FW_DIAG_ERROR_SUCCESS)) status = MPS_DIAG_SUCCESS; return (status); } static int mps_user_diag_action(struct mps_softc *sc, mps_diag_action_t *data) { int status; /* * Only allow one diag action at one time. 
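 * As in the pass-thru path, MPS_FLAGS_BUSY serializes these requests;
 * the caller holds the mps lock around this function.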
*/ if (sc->mps_flags & MPS_FLAGS_BUSY) { mps_dprint(sc, MPS_USER, "%s: Only one FW diag command " "allowed at a single time.", __func__); return (EBUSY); } sc->mps_flags |= MPS_FLAGS_BUSY; /* * Send diag action request */ if (data->Action == MPS_FW_DIAG_TYPE_REGISTER || data->Action == MPS_FW_DIAG_TYPE_UNREGISTER || data->Action == MPS_FW_DIAG_TYPE_QUERY || data->Action == MPS_FW_DIAG_TYPE_READ_BUFFER || data->Action == MPS_FW_DIAG_TYPE_RELEASE) { status = mps_do_diag_action(sc, data->Action, PTRIN(data->PtrDiagAction), data->Length, &data->ReturnCode); } else status = EINVAL; sc->mps_flags &= ~MPS_FLAGS_BUSY; return (status); } /* * Copy the event recording mask and the event queue size out. For * clarification, the event recording mask (events_to_record) is not the same * thing as the event mask (event_mask). events_to_record has a bit set for * every event type that is to be recorded by the driver, and event_mask has a * bit cleared for every event that is allowed into the driver from the IOC. * They really have nothing to do with each other. */ static void mps_user_event_query(struct mps_softc *sc, mps_event_query_t *data) { uint8_t i; mps_lock(sc); data->Entries = MPS_EVENT_QUEUE_SIZE; for (i = 0; i < 4; i++) { data->Types[i] = sc->events_to_record[i]; } mps_unlock(sc); } /* * Set the driver's event mask according to what's been given. See * mps_user_event_query for explanation of the event recording mask and the IOC * event mask. It's the app's responsibility to enable event logging by setting * the bits in events_to_record. Initially, no events will be logged. */ static void mps_user_event_enable(struct mps_softc *sc, mps_event_enable_t *data) { uint8_t i; mps_lock(sc); for (i = 0; i < 4; i++) { sc->events_to_record[i] = data->Types[i]; } mps_unlock(sc); } /* * Copy out the events that have been recorded, up to the max events allowed. */ static int mps_user_event_report(struct mps_softc *sc, mps_event_report_t *data) { int status = 0; uint32_t size; mps_lock(sc); size = data->Size; if ((size >= sizeof(sc->recorded_events)) && (status == 0)) { mps_unlock(sc); if (copyout((void *)sc->recorded_events, PTRIN(data->PtrEvents), size) != 0) status = EFAULT; mps_lock(sc); } else { /* * data->Size value is not large enough to copy event data. */ status = EFAULT; } /* * Change size value to match the number of bytes that were copied. */ if (status == 0) data->Size = sizeof(sc->recorded_events); mps_unlock(sc); return (status); } /* * Record events into the driver from the IOC if they are not masked. */ void mpssas_record_event(struct mps_softc *sc, MPI2_EVENT_NOTIFICATION_REPLY *event_reply) { uint32_t event; int i, j; uint16_t event_data_len; boolean_t sendAEN = FALSE; event = event_reply->Event; /* * Generate a system event to let anyone who cares know that a * LOG_ENTRY_ADDED event has occurred. This is sent no matter what the * event mask is set to. */ if (event == MPI2_EVENT_LOG_ENTRY_ADDED) { sendAEN = TRUE; } /* * Record the event only if its corresponding bit is set in * events_to_record. event_index is the index into recorded_events and * event_number is the overall number of an event being recorded since * start-of-day. event_index will roll over; event_number will never * roll over. 
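 * Example (illustrative): event 0x21 maps to word i = 1, bit j = 1, so
 * it is recorded only if bit 1 of sc->events_to_record[1] is set.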
*/ i = (uint8_t)(event / 32); j = (uint8_t)(event % 32); if ((i < 4) && ((1 << j) & sc->events_to_record[i])) { i = sc->event_index; sc->recorded_events[i].Type = event; sc->recorded_events[i].Number = ++sc->event_number; bzero(sc->recorded_events[i].Data, MPS_MAX_EVENT_DATA_LENGTH * 4); event_data_len = event_reply->EventDataLength; if (event_data_len > 0) { /* * Limit data to size in m_event entry */ if (event_data_len > MPS_MAX_EVENT_DATA_LENGTH) { event_data_len = MPS_MAX_EVENT_DATA_LENGTH; } for (j = 0; j < event_data_len; j++) { sc->recorded_events[i].Data[j] = event_reply->EventData[j]; } /* * check for index wrap-around */ if (++i == MPS_EVENT_QUEUE_SIZE) { i = 0; } sc->event_index = (uint8_t)i; /* * Set flag to send the event. */ sendAEN = TRUE; } } /* * Generate a system event if flag is set to let anyone who cares know * that an event has occurred. */ if (sendAEN) { //SLM-how to send a system event (see kqueue, kevent) // (void) ddi_log_sysevent(mpt->m_dip, DDI_VENDOR_LSI, "MPT_SAS", // "SAS", NULL, NULL, DDI_NOSLEEP); } } static int mps_user_reg_access(struct mps_softc *sc, mps_reg_access_t *data) { int status = 0; switch (data->Command) { /* * IO access is not supported. */ case REG_IO_READ: case REG_IO_WRITE: mps_dprint(sc, MPS_USER, "IO access is not supported. " "Use memory access."); status = EINVAL; break; case REG_MEM_READ: data->RegData = mps_regread(sc, data->RegOffset); break; case REG_MEM_WRITE: mps_regwrite(sc, data->RegOffset, data->RegData); break; default: status = EINVAL; break; } return (status); } static int mps_user_btdh(struct mps_softc *sc, mps_btdh_mapping_t *data) { uint8_t bt2dh = FALSE; uint8_t dh2bt = FALSE; uint16_t dev_handle, bus, target; bus = data->Bus; target = data->TargetID; dev_handle = data->DevHandle; /* * When DevHandle is 0xFFFF and Bus/Target are not 0xFFFF, use Bus/ * Target to get DevHandle. When Bus/Target are 0xFFFF and DevHandle is * not 0xFFFF, use DevHandle to get Bus/Target. Anything else is * invalid. */ if ((bus == 0xFFFF) && (target == 0xFFFF) && (dev_handle != 0xFFFF)) dh2bt = TRUE; if ((dev_handle == 0xFFFF) && (bus != 0xFFFF) && (target != 0xFFFF)) bt2dh = TRUE; if (!dh2bt && !bt2dh) return (EINVAL); /* * Only handle bus of 0. Make sure target is within range. 
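 * Example (illustrative): Bus = 0, TargetID = 5, DevHandle = 0xFFFF
 * requests a Bus/Target to DevHandle lookup through
 * sc->mapping_table[5]; the reverse direction uses
 * mps_mapping_get_sas_id_from_handle().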
*/ if (bt2dh) { if (bus != 0) return (EINVAL); if (target > sc->max_devices) { mps_dprint(sc, MPS_FAULT, "Target ID is out of range " "for Bus/Target to DevHandle mapping."); return (EINVAL); } dev_handle = sc->mapping_table[target].dev_handle; if (dev_handle) data->DevHandle = dev_handle; } else { bus = 0; target = mps_mapping_get_sas_id_from_handle(sc, dev_handle); data->Bus = bus; data->TargetID = target; } return (0); } static int mps_ioctl(struct cdev *dev, u_long cmd, void *arg, int flag, struct thread *td) { struct mps_softc *sc; struct mps_cfg_page_req *page_req; struct mps_ext_cfg_page_req *ext_page_req; void *mps_page; int error, msleep_ret; mps_page = NULL; sc = dev->si_drv1; page_req = (void *)arg; ext_page_req = (void *)arg; switch (cmd) { case MPSIO_READ_CFG_HEADER: mps_lock(sc); error = mps_user_read_cfg_header(sc, page_req); mps_unlock(sc); break; case MPSIO_READ_CFG_PAGE: mps_page = malloc(page_req->len, M_MPSUSER, M_WAITOK | M_ZERO); if(!mps_page) { mps_printf(sc, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } error = copyin(page_req->buf, mps_page, sizeof(MPI2_CONFIG_PAGE_HEADER)); if (error) break; mps_lock(sc); error = mps_user_read_cfg_page(sc, page_req, mps_page); mps_unlock(sc); if (error) break; error = copyout(mps_page, page_req->buf, page_req->len); break; case MPSIO_READ_EXT_CFG_HEADER: mps_lock(sc); error = mps_user_read_extcfg_header(sc, ext_page_req); mps_unlock(sc); break; case MPSIO_READ_EXT_CFG_PAGE: mps_page = malloc(ext_page_req->len, M_MPSUSER, M_WAITOK|M_ZERO); if(!mps_page) { mps_printf(sc, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } error = copyin(ext_page_req->buf, mps_page, sizeof(MPI2_CONFIG_EXTENDED_PAGE_HEADER)); if (error) break; mps_lock(sc); error = mps_user_read_extcfg_page(sc, ext_page_req, mps_page); mps_unlock(sc); if (error) break; error = copyout(mps_page, ext_page_req->buf, ext_page_req->len); break; case MPSIO_WRITE_CFG_PAGE: mps_page = malloc(page_req->len, M_MPSUSER, M_WAITOK|M_ZERO); if(!mps_page) { mps_printf(sc, "Cannot allocate memory %s %d\n", __func__, __LINE__); return (ENOMEM); } error = copyin(page_req->buf, mps_page, page_req->len); if (error) break; mps_lock(sc); error = mps_user_write_cfg_page(sc, page_req, mps_page); mps_unlock(sc); break; case MPSIO_MPS_COMMAND: error = mps_user_command(sc, (struct mps_usr_command *)arg); break; case MPTIOCTL_PASS_THRU: /* * The user has requested to pass through a command to be * executed by the MPT firmware. Call our routine which does * this. Only allow one passthru IOCTL at one time. */ error = mps_user_pass_thru(sc, (mps_pass_thru_t *)arg); break; case MPTIOCTL_GET_ADAPTER_DATA: /* * The user has requested to read adapter data. Call our * routine which does this. */ error = 0; mps_user_get_adapter_data(sc, (mps_adapter_data_t *)arg); break; case MPTIOCTL_GET_PCI_INFO: /* * The user has requested to read pci info. Call * our routine which does this. */ mps_lock(sc); error = 0; mps_user_read_pci_info(sc, (mps_pci_info_t *)arg); mps_unlock(sc); break; case MPTIOCTL_RESET_ADAPTER: mps_lock(sc); sc->port_enable_complete = 0; uint32_t reinit_start = time_uptime; error = mps_reinit(sc); /* Sleep for 300 second. 
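 * (i.e. wait up to 300 seconds for port enable to complete after the
 * diag reset)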
*/ msleep_ret = msleep(&sc->port_enable_complete, &sc->mps_mtx, PRIBIO, "mps_porten", 300 * hz); mps_unlock(sc); if (msleep_ret) printf("Port Enable did not complete after Diag " "Reset msleep error %d.\n", msleep_ret); else mps_dprint(sc, MPS_USER, "Hard Reset with Port Enable completed in %d seconds.\n", (uint32_t) (time_uptime - reinit_start)); break; case MPTIOCTL_DIAG_ACTION: /* * The user has done a diag buffer action. Call our routine * which does this. Only allow one diag action at one time. */ mps_lock(sc); error = mps_user_diag_action(sc, (mps_diag_action_t *)arg); mps_unlock(sc); break; case MPTIOCTL_EVENT_QUERY: /* * The user has done an event query. Call our routine which does * this. */ error = 0; mps_user_event_query(sc, (mps_event_query_t *)arg); break; case MPTIOCTL_EVENT_ENABLE: /* * The user has done an event enable. Call our routine which * does this. */ error = 0; mps_user_event_enable(sc, (mps_event_enable_t *)arg); break; case MPTIOCTL_EVENT_REPORT: /* * The user has done an event report. Call our routine which * does this. */ error = mps_user_event_report(sc, (mps_event_report_t *)arg); break; case MPTIOCTL_REG_ACCESS: /* * The user has requested register access. Call our routine * which does this. */ mps_lock(sc); error = mps_user_reg_access(sc, (mps_reg_access_t *)arg); mps_unlock(sc); break; case MPTIOCTL_BTDH_MAPPING: /* * The user has requested to translate a bus/target to a * DevHandle or a DevHandle to a bus/target. Call our routine * which does this. */ error = mps_user_btdh(sc, (mps_btdh_mapping_t *)arg); break; default: error = ENOIOCTL; break; } if (mps_page != NULL) free(mps_page, M_MPSUSER); return (error); } #ifdef COMPAT_FREEBSD32 struct mps_cfg_page_req32 { MPI2_CONFIG_PAGE_HEADER header; uint32_t page_address; uint32_t buf; int len; uint16_t ioc_status; }; struct mps_ext_cfg_page_req32 { MPI2_CONFIG_EXTENDED_PAGE_HEADER header; uint32_t page_address; uint32_t buf; int len; uint16_t ioc_status; }; struct mps_raid_action32 { uint8_t action; uint8_t volume_bus; uint8_t volume_id; uint8_t phys_disk_num; uint32_t action_data_word; uint32_t buf; int len; uint32_t volume_status; uint32_t action_data[4]; uint16_t action_status; uint16_t ioc_status; uint8_t write; }; struct mps_usr_command32 { uint32_t req; uint32_t req_len; uint32_t rpl; uint32_t rpl_len; uint32_t buf; int len; uint32_t flags; }; #define MPSIO_READ_CFG_HEADER32 _IOWR('M', 200, struct mps_cfg_page_req32) #define MPSIO_READ_CFG_PAGE32 _IOWR('M', 201, struct mps_cfg_page_req32) #define MPSIO_READ_EXT_CFG_HEADER32 _IOWR('M', 202, struct mps_ext_cfg_page_req32) #define MPSIO_READ_EXT_CFG_PAGE32 _IOWR('M', 203, struct mps_ext_cfg_page_req32) #define MPSIO_WRITE_CFG_PAGE32 _IOWR('M', 204, struct mps_cfg_page_req32) #define MPSIO_RAID_ACTION32 _IOWR('M', 205, struct mps_raid_action32) #define MPSIO_MPS_COMMAND32 _IOWR('M', 210, struct mps_usr_command32) static int mps_ioctl32(struct cdev *dev, u_long cmd32, void *_arg, int flag, struct thread *td) { struct mps_cfg_page_req32 *page32 = _arg; struct mps_ext_cfg_page_req32 *ext32 = _arg; struct mps_raid_action32 *raid32 = _arg; struct mps_usr_command32 *user32 = _arg; union { struct mps_cfg_page_req page; struct mps_ext_cfg_page_req ext; struct mps_raid_action raid; struct mps_usr_command user; } arg; u_long cmd; int error; switch (cmd32) { case MPSIO_READ_CFG_HEADER32: case MPSIO_READ_CFG_PAGE32: case MPSIO_WRITE_CFG_PAGE32: if (cmd32 == MPSIO_READ_CFG_HEADER32) cmd = MPSIO_READ_CFG_HEADER; else if (cmd32 == MPSIO_READ_CFG_PAGE32) cmd = 
MPSIO_READ_CFG_PAGE; else cmd = MPSIO_WRITE_CFG_PAGE; CP(*page32, arg.page, header); CP(*page32, arg.page, page_address); PTRIN_CP(*page32, arg.page, buf); CP(*page32, arg.page, len); CP(*page32, arg.page, ioc_status); break; case MPSIO_READ_EXT_CFG_HEADER32: case MPSIO_READ_EXT_CFG_PAGE32: if (cmd32 == MPSIO_READ_EXT_CFG_HEADER32) cmd = MPSIO_READ_EXT_CFG_HEADER; else cmd = MPSIO_READ_EXT_CFG_PAGE; CP(*ext32, arg.ext, header); CP(*ext32, arg.ext, page_address); PTRIN_CP(*ext32, arg.ext, buf); CP(*ext32, arg.ext, len); CP(*ext32, arg.ext, ioc_status); break; case MPSIO_RAID_ACTION32: cmd = MPSIO_RAID_ACTION; CP(*raid32, arg.raid, action); CP(*raid32, arg.raid, volume_bus); CP(*raid32, arg.raid, volume_id); CP(*raid32, arg.raid, phys_disk_num); CP(*raid32, arg.raid, action_data_word); PTRIN_CP(*raid32, arg.raid, buf); CP(*raid32, arg.raid, len); CP(*raid32, arg.raid, volume_status); bcopy(raid32->action_data, arg.raid.action_data, sizeof arg.raid.action_data); CP(*raid32, arg.raid, ioc_status); CP(*raid32, arg.raid, write); break; case MPSIO_MPS_COMMAND32: cmd = MPSIO_MPS_COMMAND; PTRIN_CP(*user32, arg.user, req); CP(*user32, arg.user, req_len); PTRIN_CP(*user32, arg.user, rpl); CP(*user32, arg.user, rpl_len); PTRIN_CP(*user32, arg.user, buf); CP(*user32, arg.user, len); CP(*user32, arg.user, flags); break; default: return (ENOIOCTL); } error = mps_ioctl(dev, cmd, &arg, flag, td); if (error == 0 && (cmd32 & IOC_OUT) != 0) { switch (cmd32) { case MPSIO_READ_CFG_HEADER32: case MPSIO_READ_CFG_PAGE32: case MPSIO_WRITE_CFG_PAGE32: CP(arg.page, *page32, header); CP(arg.page, *page32, page_address); PTROUT_CP(arg.page, *page32, buf); CP(arg.page, *page32, len); CP(arg.page, *page32, ioc_status); break; case MPSIO_READ_EXT_CFG_HEADER32: case MPSIO_READ_EXT_CFG_PAGE32: CP(arg.ext, *ext32, header); CP(arg.ext, *ext32, page_address); PTROUT_CP(arg.ext, *ext32, buf); CP(arg.ext, *ext32, len); CP(arg.ext, *ext32, ioc_status); break; case MPSIO_RAID_ACTION32: CP(arg.raid, *raid32, action); CP(arg.raid, *raid32, volume_bus); CP(arg.raid, *raid32, volume_id); CP(arg.raid, *raid32, phys_disk_num); CP(arg.raid, *raid32, action_data_word); PTROUT_CP(arg.raid, *raid32, buf); CP(arg.raid, *raid32, len); CP(arg.raid, *raid32, volume_status); bcopy(arg.raid.action_data, raid32->action_data, sizeof arg.raid.action_data); CP(arg.raid, *raid32, ioc_status); CP(arg.raid, *raid32, write); break; case MPSIO_MPS_COMMAND32: PTROUT_CP(arg.user, *user32, req); CP(arg.user, *user32, req_len); PTROUT_CP(arg.user, *user32, rpl); CP(arg.user, *user32, rpl_len); PTROUT_CP(arg.user, *user32, buf); CP(arg.user, *user32, len); CP(arg.user, *user32, flags); break; } } return (error); } #endif /* COMPAT_FREEBSD32 */ static int mps_ioctl_devsw(struct cdev *dev, u_long com, caddr_t arg, int flag, struct thread *td) { #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (mps_ioctl32(dev, com, arg, flag, td)); #endif return (mps_ioctl(dev, com, arg, flag, td)); } Index: projects/building-blocks/sys/kern/kern_timeout.c =================================================================== --- projects/building-blocks/sys/kern/kern_timeout.c (revision 277723) +++ projects/building-blocks/sys/kern/kern_timeout.c (revision 277724) @@ -1,1475 +1,1479 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_callout_profiling.h" #if defined(__arm__) #include "opt_timer.h" #endif #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifndef NO_EVENTTIMERS DPCPU_DECLARE(sbintime_t, hardclocktime); #endif SDT_PROVIDER_DEFINE(callout_execute); SDT_PROBE_DEFINE1(callout_execute, kernel, , callout__start, "struct callout *"); SDT_PROBE_DEFINE1(callout_execute, kernel, , callout__end, "struct callout *"); #ifdef CALLOUT_PROFILING static int avg_depth; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0, "Average number of items examined per softclock call. Units = 1/1000"); static int avg_gcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0, "Average number of Giant callouts made per softclock call. Units = 1/1000"); static int avg_lockcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, "Average number of lock callouts made per softclock call. Units = 1/1000"); static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); static int avg_depth_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0, "Average number of direct callouts examined per callout_process call. " "Units = 1/1000"); static int avg_lockcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD, &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per " "callout_process call. 
Units = 1/1000"); static int avg_mpcalls_dir; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir, 0, "Average number of MP direct callouts made per callout_process call. " "Units = 1/1000"); #endif static int ncallout; SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &ncallout, 0, "Number of entries in callwheel and size of timeout() preallocation"); #ifdef RSS static int pin_default_swi = 1; static int pin_pcpu_swi = 1; #else static int pin_default_swi = 0; static int pin_pcpu_swi = 0; #endif SYSCTL_INT(_kern, OID_AUTO, pin_default_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_default_swi, 0, "Pin the default (non-per-cpu) swi (shared with PCPU 0 swi)"); SYSCTL_INT(_kern, OID_AUTO, pin_pcpu_swi, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pin_pcpu_swi, 0, "Pin the per-CPU swis (except PCPU 0, which is also default"); /* * TODO: * allocate more timeout table slots when table overflows. */ u_int callwheelsize, callwheelmask; /* * The callout cpu exec entities represent informations necessary for * describing the state of callouts currently running on the CPU and the ones * necessary for migrating callouts to the new callout cpu. In particular, * the first entry of the array cc_exec_entity holds informations for callout * running in SWI thread context, while the second one holds informations * for callout running directly from hardware interrupt context. * The cached informations are very important for deferring migration when * the migrating callout is already running. */ struct cc_exec { struct callout *cc_next; struct callout *cc_curr; #ifdef SMP void (*ce_migration_func)(void *); void *ce_migration_arg; int ce_migration_cpu; sbintime_t ce_migration_time; sbintime_t ce_migration_prec; #endif bool cc_cancel; bool cc_waiting; }; /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. 
*/ struct callout_cpu { struct mtx_padalign cc_lock; struct cc_exec cc_exec_entity[2]; struct callout *cc_callout; struct callout_list *cc_callwheel; struct callout_tailq cc_expireq; struct callout_slist cc_callfree; sbintime_t cc_firstevent; sbintime_t cc_lastscan; void *cc_cookie; u_int cc_bucket; char cc_ktr_event_name[20]; }; #define cc_exec_curr cc_exec_entity[0].cc_curr #define cc_exec_next cc_exec_entity[0].cc_next #define cc_exec_cancel cc_exec_entity[0].cc_cancel #define cc_exec_waiting cc_exec_entity[0].cc_waiting #define cc_exec_curr_dir cc_exec_entity[1].cc_curr #define cc_exec_next_dir cc_exec_entity[1].cc_next #define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel #define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting #ifdef SMP #define cc_migration_func cc_exec_entity[0].ce_migration_func #define cc_migration_arg cc_exec_entity[0].ce_migration_arg #define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu #define cc_migration_time cc_exec_entity[0].ce_migration_time #define cc_migration_prec cc_exec_entity[0].ce_migration_prec #define cc_migration_func_dir cc_exec_entity[1].ce_migration_func #define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg #define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu #define cc_migration_time_dir cc_exec_entity[1].ce_migration_time #define cc_migration_prec_dir cc_exec_entity[1].ce_migration_prec struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU #define CC_CPU(cpu) (&cc_cpu[(cpu)]) #define CC_SELF() CC_CPU(PCPU_GET(cpuid)) #else struct callout_cpu cc_cpu; #define CC_CPU(cpu) &cc_cpu #define CC_SELF() &cc_cpu #endif #define CC_LOCK(cc) mtx_lock_spin(&(cc)->cc_lock) #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) static int timeout_cpu; static void callout_cpu_init(struct callout_cpu *cc, int cpu); static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct); static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: * cc_curr - If a callout is in progress, it is cc_curr. * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. * The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when * cc_curr is non-NULL. */ /* * Resets the execution entity tied to a specific callout cpu. */ static void cc_cce_cleanup(struct callout_cpu *cc, int direct) { cc->cc_exec_entity[direct].cc_curr = NULL; cc->cc_exec_entity[direct].cc_next = NULL; cc->cc_exec_entity[direct].cc_cancel = false; cc->cc_exec_entity[direct].cc_waiting = false; #ifdef SMP cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK; cc->cc_exec_entity[direct].ce_migration_time = 0; cc->cc_exec_entity[direct].ce_migration_prec = 0; cc->cc_exec_entity[direct].ce_migration_func = NULL; cc->cc_exec_entity[direct].ce_migration_arg = NULL; #endif } /* * Checks if migration is requested by a specific callout cpu. 
*/ static int cc_cce_migrating(struct callout_cpu *cc, int direct) { #ifdef SMP return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK); #else return (0); #endif } /* * Kernel low level callwheel initialization * called on cpu0 during kernel startup. */ static void callout_callwheel_init(void *dummy) { struct callout_cpu *cc; /* * Calculate the size of the callout wheel and the preallocated * timeout() structures. * XXX: Clip callout to result of previous function of maxusers * maximum 384. This is still huge, but acceptable. */ ncallout = imin(16 + maxproc + maxfiles, 18508); TUNABLE_INT_FETCH("kern.ncallout", &ncallout); /* * Calculate callout wheel size, should be next power of two higher * than 'ncallout'. */ callwheelsize = 1 << fls(ncallout); callwheelmask = callwheelsize - 1; /* * Fetch whether we're pinning the swi's or not. */ TUNABLE_INT_FETCH("kern.pin_default_swi", &pin_default_swi); TUNABLE_INT_FETCH("kern.pin_pcpu_swi", &pin_pcpu_swi); /* * Only cpu0 handles timeout(9) and receives a preallocation. * * XXX: Once all timeout(9) consumers are converted this can * be removed. */ timeout_cpu = PCPU_GET(cpuid); cc = CC_CPU(timeout_cpu); cc->cc_callout = malloc(ncallout * sizeof(struct callout), M_CALLOUT, M_WAITOK); callout_cpu_init(cc, timeout_cpu); } SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL); /* * Initialize the per-cpu callout structures. */ static void callout_cpu_init(struct callout_cpu *cc, int cpu) { struct callout *c; int i; mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); SLIST_INIT(&cc->cc_callfree); cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize, M_CALLOUT, M_WAITOK); for (i = 0; i < callwheelsize; i++) LIST_INIT(&cc->cc_callwheel[i]); TAILQ_INIT(&cc->cc_expireq); cc->cc_firstevent = SBT_MAX; for (i = 0; i < 2; i++) cc_cce_cleanup(cc, i); snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name), "callwheel cpu %d", cpu); if (cc->cc_callout == NULL) /* Only cpu0 handles timeout(9) */ return; for (i = 0; i < ncallout; i++) { c = &cc->cc_callout[i]; callout_init(c, 0); c->c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } } #ifdef SMP /* * Switches the cpu tied to a specific callout. * The function expects a locked incoming callout cpu and returns with * locked outcoming callout cpu. */ static struct callout_cpu * callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu) { struct callout_cpu *new_cc; MPASS(c != NULL && cc != NULL); CC_LOCK_ASSERT(cc); /* * Avoid interrupts and preemption firing after the callout cpu * is blocked in order to avoid deadlocks as the new thread * may be willing to acquire the callout cpu lock. */ c->c_cpu = CPUBLOCK; spinlock_enter(); CC_UNLOCK(cc); new_cc = CC_CPU(new_cpu); CC_LOCK(new_cc); spinlock_exit(); c->c_cpu = new_cpu; return (new_cc); } #endif /* * Start standard softclock thread. 
*/ static void start_softclock(void *dummy) { struct callout_cpu *cc; char name[MAXCOMLEN]; #ifdef SMP int cpu; struct intr_event *ie; #endif cc = CC_CPU(timeout_cpu); snprintf(name, sizeof(name), "clock (%d)", timeout_cpu); if (swi_add(&clk_intr_event, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_default_swi && (intr_event_bind(clk_intr_event, timeout_cpu) != 0)) { printf("%s: timeout clock couldn't be pinned to cpu %d\n", __func__, timeout_cpu); } #ifdef SMP CPU_FOREACH(cpu) { if (cpu == timeout_cpu) continue; cc = CC_CPU(cpu); cc->cc_callout = NULL; /* Only cpu0 handles timeout(9). */ callout_cpu_init(cc, cpu); snprintf(name, sizeof(name), "clock (%d)", cpu); ie = NULL; if (swi_add(&ie, name, softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); if (pin_pcpu_swi && (intr_event_bind(ie, cpu) != 0)) { printf("%s: per-cpu clock couldn't be pinned to " "cpu %d\n", __func__, cpu); } } #endif } SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); #define CC_HASH_SHIFT 8 static inline u_int callout_hash(sbintime_t sbt) { return (sbt >> (32 - CC_HASH_SHIFT)); } static inline u_int callout_get_bucket(sbintime_t sbt) { return (callout_hash(sbt) & callwheelmask); } void callout_process(sbintime_t now) { struct callout *tmp, *tmpn; struct callout_cpu *cc; struct callout_list *sc; sbintime_t first, last, max, tmp_max; uint32_t lookahead; u_int firstb, lastb, nowb; #ifdef CALLOUT_PROFILING int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; #endif cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); /* Compute the buckets of the last scan and present times. */ firstb = callout_hash(cc->cc_lastscan); cc->cc_lastscan = now; nowb = callout_hash(now); /* Compute the last bucket and minimum time of the bucket after it. */ if (nowb == firstb) lookahead = (SBT_1S / 16); else if (nowb - firstb == 1) lookahead = (SBT_1S / 8); else lookahead = (SBT_1S / 2); first = last = now; first += (lookahead / 2); last += lookahead; last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT)); lastb = callout_hash(last) - 1; max = last; /* * Check if we wrapped around the entire wheel from the last scan. * In case, we need to scan entirely the wheel for pending callouts. */ if (lastb - firstb >= callwheelsize) { lastb = firstb + callwheelsize - 1; if (nowb - firstb >= callwheelsize) nowb = lastb; } /* Iterate callwheel from firstb to nowb and then up to lastb. */ do { sc = &cc->cc_callwheel[firstb & callwheelmask]; tmp = LIST_FIRST(sc); while (tmp != NULL) { /* Run the callout if present time within allowed. */ if (tmp->c_time <= now) { /* * Consumer told us the callout may be run * directly from hardware interrupt context. */ if (tmp->c_flags & CALLOUT_DIRECT) { #ifdef CALLOUT_PROFILING ++depth_dir; #endif cc->cc_exec_next_dir = LIST_NEXT(tmp, c_links.le); cc->cc_bucket = firstb & callwheelmask; LIST_REMOVE(tmp, c_links.le); softclock_call_cc(tmp, cc, #ifdef CALLOUT_PROFILING &mpcalls_dir, &lockcalls_dir, NULL, #endif 1); tmp = cc->cc_exec_next_dir; } else { tmpn = LIST_NEXT(tmp, c_links.le); LIST_REMOVE(tmp, c_links.le); TAILQ_INSERT_TAIL(&cc->cc_expireq, tmp, c_links.tqe); tmp->c_flags |= CALLOUT_PROCESSED; tmp = tmpn; } continue; } /* Skip events from distant future. */ if (tmp->c_time >= max) goto next; /* * Event minimal time is bigger than present maximal * time, so it cannot be aggregated. 
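 * (i.e. this callout starts after the end of the current aggregation
 * window, so stop extending the scan past the present-time bucket)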
*/ if (tmp->c_time > last) { lastb = nowb; goto next; } /* Update first and last time, respecting this event. */ if (tmp->c_time < first) first = tmp->c_time; tmp_max = tmp->c_time + tmp->c_precision; if (tmp_max < last) last = tmp_max; next: tmp = LIST_NEXT(tmp, c_links.le); } /* Proceed with the next bucket. */ firstb++; /* * Stop if we looked after present time and found * some event we can't execute at now. * Stop if we looked far enough into the future. */ } while (((int)(firstb - lastb)) <= 0); cc->cc_firstevent = last; #ifndef NO_EVENTTIMERS cpu_new_callout(curcpu, last, first); #endif #ifdef CALLOUT_PROFILING avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8; avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; #endif mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. */ if (!TAILQ_EMPTY(&cc->cc_expireq)) swi_sched(cc->cc_cookie, 0); } static struct callout_cpu * callout_lock(struct callout *c) { struct callout_cpu *cc; int cpu; for (;;) { cpu = c->c_cpu; #ifdef SMP if (cpu == CPUBLOCK) { while (c->c_cpu == CPUBLOCK) cpu_spinwait(); continue; } #endif cc = CC_CPU(cpu); CC_LOCK(cc); if (cpu == c->c_cpu) break; CC_UNLOCK(cc); } return (cc); } static void callout_cc_add(struct callout *c, struct callout_cpu *cc, sbintime_t sbt, sbintime_t precision, void (*func)(void *), void *arg, int cpu, int flags) { int bucket; CC_LOCK_ASSERT(cc); if (sbt < cc->cc_lastscan) sbt = cc->cc_lastscan; c->c_arg = arg; c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); if (flags & C_DIRECT_EXEC) c->c_flags |= CALLOUT_DIRECT; c->c_flags &= ~CALLOUT_PROCESSED; c->c_func = func; c->c_time = sbt; c->c_precision = precision; bucket = callout_get_bucket(c->c_time); CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", c, (int)(c->c_precision >> 32), (u_int)(c->c_precision & 0xffffffff)); LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); if (cc->cc_bucket == bucket) cc->cc_exec_next_dir = c; #ifndef NO_EVENTTIMERS /* * Inform the eventtimers(4) subsystem there's a new callout * that has been inserted, but only if really required. */ if (SBT_MAX - c->c_time < c->c_precision) c->c_precision = SBT_MAX - c->c_time; sbt = c->c_time + c->c_precision; if (sbt < cc->cc_firstevent) { cc->cc_firstevent = sbt; cpu_new_callout(cpu, sbt, c->c_time); } #endif } static void callout_cc_del(struct callout *c, struct callout_cpu *cc) { if ((c->c_flags & CALLOUT_LOCAL_ALLOC) == 0) return; c->c_func = NULL; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } static void softclock_call_cc(struct callout *c, struct callout_cpu *cc, #ifdef CALLOUT_PROFILING int *mpcalls, int *lockcalls, int *gcalls, #endif int direct) { struct rm_priotracker tracker; void (*c_func)(void *); void *c_arg; struct lock_class *class; struct lock_object *c_lock; uintptr_t lock_status; int c_flags; #ifdef SMP struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; int flags, new_cpu; sbintime_t new_prec, new_time; #endif #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbintime_t sbt1, sbt2; struct timespec ts2; static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */ static timeout_t *lastfunc; #endif KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) == (CALLOUT_PENDING | CALLOUT_ACTIVE), ("softclock_call_cc: pend|act %p %x", c, c->c_flags)); class = (c->c_lock != NULL) ? 
LOCK_CLASS(c->c_lock) : NULL; lock_status = 0; if (c->c_flags & CALLOUT_SHAREDLOCK) { if (class == &lock_class_rm) lock_status = (uintptr_t)&tracker; else lock_status = 1; } c_lock = c->c_lock; c_func = c->c_func; c_arg = c->c_arg; c_flags = c->c_flags; if (c->c_flags & CALLOUT_LOCAL_ALLOC) c->c_flags = CALLOUT_LOCAL_ALLOC; else c->c_flags &= ~CALLOUT_PENDING; cc->cc_exec_entity[direct].cc_curr = c; cc->cc_exec_entity[direct].cc_cancel = false; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, lock_status); /* * The callout may have been cancelled * while we switched locks. */ if (cc->cc_exec_entity[direct].cc_cancel) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ cc->cc_exec_entity[direct].cc_cancel = true; if (c_lock == &Giant.lock_object) { #ifdef CALLOUT_PROFILING (*gcalls)++; #endif CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p", c, c_func, c_arg); } else { #ifdef CALLOUT_PROFILING (*lockcalls)++; #endif CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { #ifdef CALLOUT_PROFILING (*mpcalls)++; #endif CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running", "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt1 = sbinuptime(); #endif THREAD_NO_SLEEPING(); SDT_PROBE(callout_execute, kernel, , callout__start, c, 0, 0, 0, 0); c_func(c_arg); SDT_PROBE(callout_execute, kernel, , callout__end, c, 0, 0, 0, 0); THREAD_SLEEPING_OK(); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbt2 = sbinuptime(); sbt2 -= sbt1; if (sbt2 > maxdt) { if (lastfunc != c_func || sbt2 > maxdt * 2) { ts2 = sbttots(sbt2); printf( "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } maxdt = sbt2; lastfunc = c_func; } #endif KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle"); CTR1(KTR_CALLOUT, "callout %p finished", c); if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0) class->lc_unlock(c_lock); skip: CC_LOCK(cc); KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr")); cc->cc_exec_entity[direct].cc_curr = NULL; if (cc->cc_exec_entity[direct].cc_waiting) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. */ if (cc_cce_migrating(cc, direct)) { cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not * destroyed but that is not easy. */ c->c_flags &= ~CALLOUT_DFRMIGRATION; } cc->cc_exec_entity[direct].cc_waiting = false; CC_UNLOCK(cc); wakeup(&cc->cc_exec_entity[direct].cc_waiting); CC_LOCK(cc); } else if (cc_cce_migrating(cc, direct)) { KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0, ("Migrating legacy callout %p", c)); #ifdef SMP /* * If the callout was scheduled for * migration just perform it now. */ new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu; new_time = cc->cc_exec_entity[direct].ce_migration_time; new_prec = cc->cc_exec_entity[direct].ce_migration_prec; new_func = cc->cc_exec_entity[direct].ce_migration_func; new_arg = cc->cc_exec_entity[direct].ce_migration_arg; cc_cce_cleanup(cc, direct); /* * It should be assert here that the callout is not destroyed * but that is not easy. * * As first thing, handle deferred callout stops. 
*/ if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) { CTR3(KTR_CALLOUT, "deferred cancelled %p func %p arg %p", c, new_func, new_arg); callout_cc_del(c, cc); return; } c->c_flags &= ~CALLOUT_DFRMIGRATION; new_cc = callout_cpu_switch(c, cc, new_cpu); flags = (direct) ? C_DIRECT_EXEC : 0; callout_cc_add(c, new_cc, new_time, new_prec, new_func, new_arg, new_cpu, flags); CC_UNLOCK(new_cc); CC_LOCK(cc); #else panic("migration should not happen"); #endif } /* * If the current callout is locally allocated (from * timeout(9)) then put it on the freelist. * * Note: we need to check the cached copy of c_flags because * if it was not local, then it's not safe to deref the * callout pointer. */ KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0 || c->c_flags == CALLOUT_LOCAL_ALLOC, ("corrupted callout")); if (c_flags & CALLOUT_LOCAL_ALLOC) callout_cc_del(c, cc); } /* * The callout mechanism is based on the work of Adam M. Costello and * George Varghese, published in a technical report entitled "Redesigning * the BSD Callout and Timer Facilities" and modified slightly for inclusion * in FreeBSD by Justin T. Gibbs. The original work on the data structures * used in this implementation was published by G. Varghese and T. Lauck in * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for * the Efficient Implementation of a Timer Facility" in the Proceedings of * the 11th ACM Annual Symposium on Operating Systems Principles, * Austin, Texas Nov 1987. */ /* * Software (low priority) clock interrupt. * Run periodic events from timeout queue. */ void softclock(void *arg) { struct callout_cpu *cc; struct callout *c; #ifdef CALLOUT_PROFILING int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0; #endif cc = (struct callout_cpu *)arg; CC_LOCK(cc); while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) { TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); softclock_call_cc(c, cc, #ifdef CALLOUT_PROFILING &mpcalls, &lockcalls, &gcalls, #endif 0); #ifdef CALLOUT_PROFILING ++depth; #endif } #ifdef CALLOUT_PROFILING avg_depth += (depth * 1000 - avg_depth) >> 8; avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; #endif CC_UNLOCK(cc); } /* * timeout -- * Execute a function after a specified length of time. * * untimeout -- * Cancel previous timeout function call. * * callout_handle_init -- * Initialize a handle so that using it with untimeout is benign. * * See AT&T BCI Driver Reference Manual for specification. This * implementation differs from that one in that although an * identification value is returned from timeout, the original * arguments to timeout as well as the identifier are used to * identify entries for untimeout. */ struct callout_handle timeout(timeout_t *ftn, void *arg, int to_ticks) { struct callout_cpu *cc; struct callout *new; struct callout_handle handle; cc = CC_CPU(timeout_cpu); CC_LOCK(cc); /* Fill in the next free callout structure. */ new = SLIST_FIRST(&cc->cc_callfree); if (new == NULL) /* XXX Attempt to malloc first */ panic("timeout table full"); SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle); callout_reset(new, to_ticks, ftn, arg); handle.callout = new; CC_UNLOCK(cc); return (handle); } void untimeout(timeout_t *ftn, void *arg, struct callout_handle handle) { struct callout_cpu *cc; /* * Check for a handle that was initialized * by callout_handle_init, but never used * for a real timeout. 
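/*
 * Example sketch (illustration only, not part of this change): the legacy
 * timeout(9) consumer pattern serviced by timeout()/untimeout() above.
 * The "demo" names are hypothetical; <sys/systm.h> and <sys/callout.h>
 * are assumed to be included.
 */
static struct callout_handle demo_handle;

static void
demo_expire(void *arg)
{
	/* Runs from softclock() roughly hz ticks after being armed. */
}

static void
demo_arm(void *arg)
{
	callout_handle_init(&demo_handle);	/* makes untimeout() benign */
	demo_handle = timeout(demo_expire, arg, hz);
}

static void
demo_disarm(void *arg)
{
	/* Both the function and the argument identify the entry to cancel. */
	untimeout(demo_expire, arg, demo_handle);
}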
*/ if (handle.callout == NULL) return; cc = callout_lock(handle.callout); if (handle.callout->c_func == ftn && handle.callout->c_arg == arg) callout_stop(handle.callout); CC_UNLOCK(cc); } void callout_handle_init(struct callout_handle *handle) { handle->callout = NULL; } /* * New interface; clients allocate their own callout structures. * * callout_reset() - establish or change a timeout * callout_stop() - disestablish a timeout * callout_init() - initialize a callout structure so that it can * safely be passed to callout_reset() and callout_stop() * * defines three convenience macros: * * callout_active() - returns truth if callout has not been stopped, * drained, or deactivated since the last time the callout was * reset. * callout_pending() - returns truth if callout is still waiting for timeout * callout_deactivate() - marks the callout as having been serviced */ int callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, void (*ftn)(void *), void *arg, int cpu, int flags) { sbintime_t to_sbt, pr; struct callout_cpu *cc; int cancelled, direct; cancelled = 0; if (flags & C_ABSOLUTE) { to_sbt = sbt; } else { if ((flags & C_HARDCLOCK) && (sbt < tick_sbt)) sbt = tick_sbt; if ((flags & C_HARDCLOCK) || #ifdef NO_EVENTTIMERS sbt >= sbt_timethreshold) { to_sbt = getsbinuptime(); /* Add safety belt for the case of hz > 1000. */ to_sbt += tc_tick_sbt - tick_sbt; #else sbt >= sbt_tickthreshold) { /* * Obtain the time of the last hardclock() call on * this CPU directly from the kern_clocksource.c. * This value is per-CPU, but it is equal for all * active ones. */ #ifdef __LP64__ to_sbt = DPCPU_GET(hardclocktime); #else spinlock_enter(); to_sbt = DPCPU_GET(hardclocktime); spinlock_exit(); #endif #endif if ((flags & C_HARDCLOCK) == 0) to_sbt += tick_sbt; } else to_sbt = sbinuptime(); if (SBT_MAX - to_sbt < sbt) to_sbt = SBT_MAX; else to_sbt += sbt; pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : sbt >> C_PRELGET(flags)); if (pr > precision) precision = pr; } /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced. */ if (c->c_flags & CALLOUT_LOCAL_ALLOC) cpu = c->c_cpu; direct = (c->c_flags & CALLOUT_DIRECT) != 0; KASSERT(!direct || c->c_lock == NULL, ("%s: direct callout %p has lock", __func__, c)); cc = callout_lock(c); if (cc->cc_exec_entity[direct].cc_curr == c) { /* * We're being asked to reschedule a callout which is * currently in progress. If there is a lock then we * can cancel the callout if it has not really started. */ if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel) cancelled = cc->cc_exec_entity[direct].cc_cancel = true; if (cc->cc_exec_entity[direct].cc_waiting) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. */ CTR4(KTR_CALLOUT, "%s %p func %p arg %p", cancelled ? "cancelled" : "failed to cancel", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (cancelled); } } if (c->c_flags & CALLOUT_PENDING) { if ((c->c_flags & CALLOUT_PROCESSED) == 0) { if (cc->cc_exec_next_dir == c) cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); cancelled = 1; c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); } #ifdef SMP /* * If the callout must migrate try to perform it immediately. * If the callout is currently running, just defer the migration * to a more appropriate moment. 
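/*
 * Example sketch (illustration only, not part of this change): the usual
 * consumer-side pattern for the interface described above.  All "demo"
 * names are hypothetical; callout_init_mtx(), the conventional wrapper
 * around _callout_init_lock() for mutex-protected consumers, and the
 * headers <sys/lock.h>, <sys/mutex.h> and <sys/callout.h> are assumed.
 */
struct demo_softc {
	struct mtx	d_mtx;
	struct callout	d_tick;
	int		d_stopping;
};

static void
demo_tick(void *arg)
{
	struct demo_softc *sc = arg;

	/* d_mtx is held on entry because it was handed to callout_init_mtx(). */
	if (sc->d_stopping == 0)
		callout_reset(&sc->d_tick, hz, demo_tick, sc);	/* re-arm */
}

static void
demo_start(struct demo_softc *sc)
{
	mtx_init(&sc->d_mtx, "demo", NULL, MTX_DEF);
	callout_init_mtx(&sc->d_tick, &sc->d_mtx, 0);
	mtx_lock(&sc->d_mtx);
	callout_reset(&sc->d_tick, hz, demo_tick, sc);	/* first shot in ~1 s */
	mtx_unlock(&sc->d_mtx);
}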
*/ if (c->c_cpu != cpu) { if (cc->cc_exec_entity[direct].cc_curr == c) { cc->cc_exec_entity[direct].ce_migration_cpu = cpu; cc->cc_exec_entity[direct].ce_migration_time = to_sbt; cc->cc_exec_entity[direct].ce_migration_prec = precision; cc->cc_exec_entity[direct].ce_migration_func = ftn; cc->cc_exec_entity[direct].ce_migration_arg = arg; c->c_flags |= CALLOUT_DFRMIGRATION; CTR6(KTR_CALLOUT, "migration of %p func %p arg %p in %d.%08x to %u deferred", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff), cpu); CC_UNLOCK(cc); return (cancelled); } cc = callout_cpu_switch(c, cc, cpu); } #endif callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), (u_int)(to_sbt & 0xffffffff)); CC_UNLOCK(cc); return (cancelled); } /* * Common idioms that can be optimized in the future. */ int callout_schedule_on(struct callout *c, int to_ticks, int cpu) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu); } int callout_schedule(struct callout *c, int to_ticks) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu); } int _callout_stop_safe(struct callout *c, int safe) { struct callout_cpu *cc, *old_cc; struct lock_class *class; int direct, sq_locked, use_lock; + if (safe) + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock, + "calling %s", __func__); + /* * Some old subsystems don't hold Giant while running a callout_stop(), * so just discard this check for the moment. */ if (!safe && c->c_lock != NULL) { if (c->c_lock == &Giant.lock_object) use_lock = mtx_owned(&Giant); else { use_lock = 1; class = LOCK_CLASS(c->c_lock); class->lc_assert(c->c_lock, LA_XLOCKED); } } else use_lock = 0; direct = (c->c_flags & CALLOUT_DIRECT) != 0; sq_locked = 0; old_cc = NULL; again: cc = callout_lock(c); /* * If the callout was migrating while the callout cpu lock was * dropped, just drop the sleepqueue lock and check the states * again. */ if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting); sq_locked = 0; old_cc = NULL; goto again; #else panic("migration should not happen"); #endif } /* * If the callout isn't pending, it's not on the queue, so * don't attempt to remove it from the queue. We can try to * stop it by other means however. */ if (!(c->c_flags & CALLOUT_PENDING)) { c->c_flags &= ~CALLOUT_ACTIVE; /* * If it wasn't on the queue and it isn't the current * callout, then we can't stop it, so just bail. */ if (cc->cc_exec_entity[direct].cc_curr != c) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); if (sq_locked) sleepq_release( &cc->cc_exec_entity[direct].cc_waiting); return (0); } if (safe) { /* * The current callout is running (or just * about to run) and blocking is allowed, so * just wait for the current invocation to * finish. */ while (cc->cc_exec_entity[direct].cc_curr == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid * a LOR between cc_lock and sleepqueue * chain spinlocks. This piece of code * emulates a msleep_spin() call actually. * * If we already have the sleepqueue chain * locked, then we can safely block. If we * don't already have it locked, however, * we have to drop the cc_lock to lock * it. This opens several races, so we * restart at the beginning once we have * both locks. If nothing has changed, then * we will end up back here with sq_locked * set. 
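/*
 * Example sketch (illustration only, not part of this change): rearming a
 * callout through the sbintime_t interface above rather than the
 * tick-based callout_reset().  With flags == 0 the time is relative, and
 * the effective precision is at least the requested slack (it may be
 * widened according to tc_precexp).  Names are hypothetical.
 */
static void
demo_rearm_50ms(struct callout *c, void (*fn)(void *), void *arg)
{

	/* ~50 ms from now, allow ~1 ms of coalescing, target this CPU. */
	callout_reset_sbt_on(c, 50 * SBT_1MS, SBT_1MS, fn, arg,
	    PCPU_GET(cpuid), 0);
}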
*/ if (!sq_locked) { CC_UNLOCK(cc); sleepq_lock( &cc->cc_exec_entity[direct].cc_waiting); sq_locked = 1; old_cc = cc; goto again; } /* * Migration could be cancelled here, but * as long as it is still not sure when it * will be packed up, just let softclock() * take care of it. */ cc->cc_exec_entity[direct].cc_waiting = true; DROP_GIANT(); CC_UNLOCK(cc); sleepq_add( &cc->cc_exec_entity[direct].cc_waiting, &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); sleepq_wait( &cc->cc_exec_entity[direct].cc_waiting, 0); sq_locked = 0; old_cc = NULL; /* Reacquire locks previously released. */ PICKUP_GIANT(); CC_LOCK(cc); } } else if (use_lock && !cc->cc_exec_entity[direct].cc_cancel) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout * and return. After our caller drops the * lock, the callout will be skipped in * softclock(). */ cc->cc_exec_entity[direct].cc_cancel = true; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); KASSERT(!cc_cce_migrating(cc, direct), ("callout wrongly scheduled for migration")); CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); return (1); } else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) { c->c_flags &= ~CALLOUT_DFRMIGRATION; CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (1); } CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain still locked")); return (0); } if (sq_locked) sleepq_release(&cc->cc_exec_entity[direct].cc_waiting); c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); if ((c->c_flags & CALLOUT_PROCESSED) == 0) { if (cc->cc_exec_next_dir == c) cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); LIST_REMOVE(c, c_links.le); } else TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); callout_cc_del(c, cc); CC_UNLOCK(cc); return (1); } void callout_init(struct callout *c, int mpsafe) { bzero(c, sizeof *c); if (mpsafe) { c->c_lock = NULL; c->c_flags = CALLOUT_RETURNUNLOCKED; } else { c->c_lock = &Giant.lock_object; c->c_flags = 0; } c->c_cpu = timeout_cpu; } void _callout_init_lock(struct callout *c, struct lock_object *lock, int flags) { bzero(c, sizeof *c); c->c_lock = lock; KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, ("callout_init_lock: bad flags %d", flags)); KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock")); KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags & (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class", __func__)); c->c_flags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK); c->c_cpu = timeout_cpu; } #ifdef APM_FIXUP_CALLTODO /* * Adjust the kernel calltodo timeout list. This routine is used after * an APM resume to recalculate the calltodo timer list values with the * number of hz's we have been sleeping. The next hardclock() will detect * that there are fired timers and run softclock() to execute them. * * Please note, I have not done an exhaustive analysis of what code this * might break. I am motivated to have my select()'s and alarm()'s that * have expired during suspend firing upon resume so that the applications * which set the timer can do the maintanence the timer was for as close * as possible to the originally intended time. 
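/*
 * Example sketch (illustration only, not part of this change), continuing
 * the hypothetical demo_softc above: tearing the callout down.
 * callout_stop() only cancels a pending callout and never sleeps, while
 * callout_drain() takes the "safe" path above and may sleep until a
 * running handler returns, which is what the WITNESS_WARN() added by this
 * change asserts is permitted.
 */
static void
demo_stop(struct demo_softc *sc)
{

	mtx_lock(&sc->d_mtx);
	sc->d_stopping = 1;
	callout_stop(&sc->d_tick);	/* cancel if merely pending */
	mtx_unlock(&sc->d_mtx);

	callout_drain(&sc->d_tick);	/* wait out a running instance */
	mtx_destroy(&sc->d_mtx);
}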
Testing this code for a * week showed that resuming from a suspend resulted in 22 to 25 timers * firing, which seemed independant on whether the suspend was 2 hours or * 2 days. Your milage may vary. - Ken Key */ void adjust_timeout_calltodo(struct timeval *time_change) { register struct callout *p; unsigned long delta_ticks; /* * How many ticks were we asleep? * (stolen from tvtohz()). */ /* Don't do anything */ if (time_change->tv_sec < 0) return; else if (time_change->tv_sec <= LONG_MAX / 1000000) delta_ticks = (time_change->tv_sec * 1000000 + time_change->tv_usec + (tick - 1)) / tick + 1; else if (time_change->tv_sec <= LONG_MAX / hz) delta_ticks = time_change->tv_sec * hz + (time_change->tv_usec + (tick - 1)) / tick + 1; else delta_ticks = LONG_MAX; if (delta_ticks > INT_MAX) delta_ticks = INT_MAX; /* * Now rip through the timer calltodo list looking for timers * to expire. */ /* don't collide with softclock() */ CC_LOCK(cc); for (p = calltodo.c_next; p != NULL; p = p->c_next) { p->c_time -= delta_ticks; /* Break if the timer had more time on it than delta_ticks */ if (p->c_time > 0) break; /* take back the ticks the timer didn't use (p->c_time <= 0) */ delta_ticks = -p->c_time; } CC_UNLOCK(cc); return; } #endif /* APM_FIXUP_CALLTODO */ static int flssbt(sbintime_t sbt) { sbt += (uint64_t)sbt >> 1; if (sizeof(long) >= sizeof(sbintime_t)) return (flsl(sbt)); if (sbt >= SBT_1S) return (flsl(((uint64_t)sbt) >> 32) + 32); return (flsl(sbt)); } /* * Dump immediate statistic snapshot of the scheduled callouts. */ static int sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS) { struct callout *tmp; struct callout_cpu *cc; struct callout_list *sc; sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t; int ct[64], cpr[64], ccpbk[32]; int error, val, i, count, tcum, pcum, maxc, c, medc; #ifdef SMP int cpu; #endif val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); count = maxc = 0; st = spr = maxt = maxpr = 0; bzero(ccpbk, sizeof(ccpbk)); bzero(ct, sizeof(ct)); bzero(cpr, sizeof(cpr)); now = sbinuptime(); #ifdef SMP CPU_FOREACH(cpu) { cc = CC_CPU(cpu); #else cc = CC_CPU(timeout_cpu); #endif CC_LOCK(cc); for (i = 0; i < callwheelsize; i++) { sc = &cc->cc_callwheel[i]; c = 0; LIST_FOREACH(tmp, sc, c_links.le) { c++; t = tmp->c_time - now; if (t < 0) t = 0; st += t / SBT_1US; spr += tmp->c_precision / SBT_1US; if (t > maxt) maxt = t; if (tmp->c_precision > maxpr) maxpr = tmp->c_precision; ct[flssbt(t)]++; cpr[flssbt(tmp->c_precision)]++; } if (c > maxc) maxc = c; ccpbk[fls(c + c / 2)]++; count += c; } CC_UNLOCK(cc); #ifdef SMP } #endif for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++) tcum += ct[i]; medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++) pcum += cpr[i]; medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0; for (i = 0, c = 0; i < 32 && c < count / 2; i++) c += ccpbk[i]; medc = (i >= 2) ? 
(1 << (i - 2)) : 0; printf("Scheduled callouts statistic snapshot:\n"); printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n", count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT); printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n", medc, count / callwheelsize / mp_ncpus, (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000, maxc); printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32, (st / count) / 1000000, (st / count) % 1000000, maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32); printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n", medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32, (spr / count) / 1000000, (spr / count) % 1000000, maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32); printf(" Distribution: \tbuckets\t time\t tcum\t" " prec\t pcum\n"); for (i = 0, tcum = pcum = 0; i < 64; i++) { if (ct[i] == 0 && cpr[i] == 0) continue; t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0; tcum += ct[i]; pcum += cpr[i]; printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n", t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32, i - 1 - (32 - CC_HASH_SHIFT), ct[i], tcum, cpr[i], pcum); } return (error); } SYSCTL_PROC(_kern, OID_AUTO, callout_stat, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_callout_stat, "I", "Dump immediate statistic snapshot of the scheduled callouts"); Index: projects/building-blocks/sys/kern/vfs_subr.c =================================================================== --- projects/building-blocks/sys/kern/vfs_subr.c (revision 277723) +++ projects/building-blocks/sys/kern/vfs_subr.c (revision 277724) @@ -1,4878 +1,4878 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void v_incr_usecount(struct vnode *); static void v_decr_usecount(struct vnode *); static void v_decr_useonly(struct vnode *); static void v_upgrade_usecount(struct vnode *); static void vnlru_free(int); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_locked(void *arg); static void vfs_knl_assert_unlocked(void *arg); static void destroy_vpollinfo(struct vpollinfo *vi); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode. */ static unsigned long numvnodes; SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Free vnode target. Free vnodes may simply be files which have been stat'd * but not read. This is somewhat common, and a small cache of such files * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of vnodes in the free list"); static int vlru_allow_cache_src; SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW, &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode"); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "Number of calls to reassignbuf"); /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. 
*/ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. 
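/*
 * Worked example (illustration only): hashinit(SYNCER_MAXDELAY, ...)
 * yields a 32-entry ring with syncer_mask == 31.  An item given a
 * 15-second delay while syncer_delayno == 20 therefore lands in slot
 * (20 + 15) & 31 == 3, wrapping around the ring, and is visited by the
 * syncer 15 one-second passes later.
 */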
*/ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ static int vnsz2log; /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return uma_zalloc(buf_trie_zone, M_NOWAIT); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree(buf_trie_zone, node); } PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of physical pages to vnodes approaches sixteen to one. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16)) #endif static void vntblinit(void *dummy __unused) { u_int i; int physvnodes, virtvnodes; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to physical pages * is one to four until desiredvnodes exceeds 98,304. Thereafter, the * marginal ratio of desiredvnodes to physical pages is one to * sixteen. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * may not exceed one seventh of the kernel's heap size. */ physvnodes = maxproc + vm_cnt.v_page_count / 16 + 3 * min(98304 * 4, vm_cnt.v_page_count) / 16; virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) + sizeof(struct vnode))); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %d -> %d\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); uma_prealloc(buf_trie_zone, nbuf); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); for (i = 1; i <= sizeof(struct vnode); i <<= 1) vnsz2log++; vnsz2log--; } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. 
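/*
 * Worked example (illustration only, assuming 4 KiB pages): on a machine
 * with 1 GiB of RAM, vm_cnt.v_page_count is roughly 262144, so
 * physvnodes ~= maxproc + 262144 / 16 + 3 * 262144 / 16, i.e. about
 * maxproc + 65536 (the 98304 * 4 clamp only starts to matter above
 * roughly 1.5 GiB).  desiredvnodes is then the smaller of that figure
 * and the kernel-heap-derived virtvnodes.
 */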
* For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * VOP_LOOKUP() obtains B while A is held * vfs_busy() obtains a shared lock on F while A and B are held * vput() releases lock on B * vput() releases lock on A * VFS_ROOT() obtains lock on D while shared lock on F is held * vfs_unbusy() releases shared lock on F * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. * * dounmount() locks B while F is drained. */ int vfs_busy(struct mount *mp, int flags) { MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); MNT_ILOCK(mp); MNT_REF(mp); /* * If mount point is currenly being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller that mount mount it tried to busy is no longer * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_ILOCK(mp); MNT_REL(mp); KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref")); mp->mnt_lockref--; if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. * * To avoid congestion on mountlist_mtx, implement simple direct-mapped * cache for popular filesystem identifiers. 
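/*
 * Example sketch (illustration only, not part of this change): the
 * typical caller-side pattern for vfs_busy()/vfs_unbusy() described
 * above.  The function name is hypothetical.
 */
static int
demo_with_busy_mount(struct mount *mp)
{
	int error;

	error = vfs_busy(mp, 0);	/* may sleep across an unmount attempt */
	if (error != 0)
		return (error);		/* ENOENT: the mount did not survive */
	/* ... mp may be used here; dounmount() is held off ... */
	vfs_unbusy(mp);
	return (0);
}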
The cache is lockess, using * the fact that struct mount's are never freed. In worst case we may * get pointer to unmounted or even different filesystem, so we have to * check what we got, and go slow way if so. */ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || mp->mnt_stat.f_fsid.val[0] != fsid->val[0] || mp->mnt_stat.f_fsid.val[1] != fsid->val[1]) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; /* * If the thread is jailed, but this is not a jail-friendly file * system, deny immediately. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred)) return (EPERM); /* * If the file system was mounted outside the jail of the calling * thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. 
* 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; -static int timestamp_precision = TSP_SEC; +static int timestamp_precision = TSP_USEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desireable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int vlrureclaim(struct mount *mp) { struct vnode *vp; int done; int trigger; int usevnodes; int count; /* * Calculate the trigger point, don't allow user * screwups to blow us up. This prevents us from * recycling vnodes with lots of resident pages. We * aren't trying to free memory, we are trying to * free vnodes. */ usevnodes = desiredvnodes; if (usevnodes <= 0) usevnodes = 1; trigger = vm_cnt.v_page_count * 2 / usevnodes; done = 0; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); count = mp->mnt_nvnodelistsize / 10 + 1; while (count != 0) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; if (!VI_TRYLOCK(vp)) goto next_iter; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. 
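/*
 * Worked example (illustration only): with vm_cnt.v_page_count == 262144
 * and desiredvnodes == 100000, trigger == 262144 * 2 / 100000 == 5, so
 * this pass skips any vnode with more than five resident pages rather
 * than throwing away cached data just to recycle a vnode.
 */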
*/ if (vp->v_usecount || (!vlru_allow_cache_src && !LIST_EMPTY(&(vp)->v_cache_src)) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); goto next_iter; } MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { vdrop(vp); goto next_iter_mntunlocked; } VI_LOCK(vp); /* * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || (!vlru_allow_cache_src && !LIST_EMPTY(&(vp)->v_cache_src)) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK); vdrop(vp); goto next_iter_mntunlocked; } KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("VI_DOOMED unexpectedly detected in vlrureclaim()")); vgonel(vp); VOP_UNLOCK(vp, 0); vdropl(vp); done++; next_iter_mntunlocked: if (!should_yield()) goto relock_mnt; goto yield; next_iter: if (!should_yield()) continue; MNT_IUNLOCK(mp); yield: kern_yield(PRI_USER); relock_mnt: MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_write(mp); return done; } /* * Attempt to keep the free list at wantfreevnodes length. */ static void vnlru_free(int count) { struct vnode *vp; mtx_assert(&vnode_free_list_mtx, MA_OWNED); for (; count > 0; count--) { vp = TAILQ_FIRST(&vnode_free_list); /* * The list can be modified while the free_list_mtx * has been dropped and vp could be NULL here. */ if (!vp) break; VNASSERT(vp->v_op != NULL, vp, ("vnlru_free: vnode already reclaimed.")); KASSERT((vp->v_iflag & VI_FREE) != 0, ("Removing vnode not on freelist")); KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Mangling active vnode")); TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); /* * Don't recycle if we can't get the interlock. */ if (!VI_TRYLOCK(vp)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); continue; } VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0, vp, ("vp inconsistent on freelist")); /* * The clear of VI_FREE prevents activation of the * vnode. There is no sense in putting the vnode on * the mount point active list, only to remove it * later during recycling. Inline the relevant part * of vholdl(), to avoid triggering assertions or * activating. */ freevnodes--; vp->v_iflag &= ~VI_FREE; vp->v_holdcnt++; mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); vtryrecycle(vp); /* * If the recycled succeeded this vdrop will actually free * the vnode. If not it will simply place it back on * the free list. */ vdrop(vp); mtx_lock(&vnode_free_list_mtx); } } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. 
*/ static struct proc *vnlruproc; static int vnlruproc_sig; static void vnlru_proc(void) { struct mount *mp, *nmp; int done; struct proc *p = vnlruproc; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST); for (;;) { kproc_suspend_check(p); mtx_lock(&vnode_free_list_mtx); if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); if (numvnodes <= desiredvnodes * 9 / 10) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } mtx_unlock(&vnode_free_list_mtx); done = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } done += vlrureclaim(mp); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); if (done == 0) { #if 0 /* These messages are temporary debugging aids */ if (vnlru_nowhere < 5) printf("vnlru process getting nowhere..\n"); else if (vnlru_nowhere == 5) printf("vnlru process messages stopped.\n"); #endif vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else kern_yield(PRI_USER); } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp, LK_INTERLOCK); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if ((vp->v_iflag & VI_DOOMED) == 0) vgonel(vp); VOP_UNLOCK(vp, LK_INTERLOCK); vn_finished_write(vnmp); return (0); } /* * Wait for available vnodes. */ static int getnewvnode_wait(int suspended) { mtx_assert(&vnode_free_list_mtx, MA_OWNED); if (numvnodes > desiredvnodes) { if (suspended) { /* * File system is beeing suspended, we cannot risk a * deadlock here, so allocate new vnode anyway. */ if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); return (0); } if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ wakeup(vnlruproc); } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); } return (numvnodes > desiredvnodes ? 
ENFILE : 0); } void getnewvnode_reserve(u_int count) { struct thread *td; td = curthread; /* First try to be quick and racy. */ if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { td->td_vp_reserv += count; return; } else atomic_subtract_long(&numvnodes, count); mtx_lock(&vnode_free_list_mtx); while (count > 0) { if (getnewvnode_wait(0) == 0) { count--; td->td_vp_reserv++; atomic_add_long(&numvnodes, 1); } } mtx_unlock(&vnode_free_list_mtx); } void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; atomic_subtract_long(&numvnodes, td->td_vp_reserv); td->td_vp_reserv = 0; } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct bufobj *bo; struct thread *td; int error; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); vp = NULL; td = curthread; if (td->td_vp_reserv > 0) { td->td_vp_reserv -= 1; goto alloc; } mtx_lock(&vnode_free_list_mtx); /* * Lend our context to reclaim vnodes if they've exceeded the max. */ if (freevnodes > wantfreevnodes) vnlru_free(1); error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ if (error != 0) { mtx_unlock(&vnode_free_list_mtx); return (error); } #endif atomic_add_long(&numvnodes, 1); mtx_unlock(&vnode_free_list_mtx); alloc: vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems * opt-in. */ lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bo = &vp->v_bufobj; bo->__bo_vnode = vp; rw_init(BO_LOCKPTR(bo), "bufobj interlock"); bo->bo_ops = &buf_ops_bio; bo->bo_private = vp; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); /* * Initialize namecache. */ LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); /* * Finalize various vnode identity bits. */ vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_incr_usecount(vp); vp->v_data = NULL; #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); else if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode()\n"); #endif if (mp != NULL) { bo->bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } rangelock_init(&vp->v_rl); /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } /* * Delete from old mount point vnode list, if on one. 
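/*
 * Example sketch (illustration only, not part of this change): how a
 * filesystem's vget-style path typically pairs getnewvnode() with
 * insmntque().  The "demofs" tag and function name are hypothetical.
 */
static int
demo_new_vnode(struct mount *mp, struct vop_vector *ops, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	error = getnewvnode("demofs", mp, ops, &vp);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* insmntque() requires the vnode to be exclusively locked. */
	error = insmntque(vp, mp);
	if (error != 0)
		return (error);	/* insmntque_stddtr() already disposed of vp */
	*vpp = vp;
	return (0);
}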
*/ static void delmntque(struct vnode *vp) { struct mount *mp; int active; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); VI_LOCK(vp); KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, ("Active vnode list size %d > Vnode list size %d", mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); active = vp->v_iflag & VI_ACTIVE; vp->v_iflag &= ~VI_ACTIVE; if (active) { mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize--; mtx_unlock(&vnode_free_list_mtx); } vp->v_mount = NULL; VI_UNLOCK(vp); VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. */ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Activating already active vnode")); vp->v_iflag |= VI_ACTIVE; mtx_lock(&vnode_free_list_mtx); TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize++; mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. 
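 * flushbuflist() returns EAGAIN whenever it flushed a buffer or had to drop * the bufobj lock, so the do/while loop below simply rescans until a pass * completes with nothing left to process.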
*/ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); if (bo->bo_object != NULL) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_pip_wait(bo->bo_object, "bovlbx"); VM_OBJECT_WUNLOCK(bo->bo_object); } BO_LOCK(bo); } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { continue; } lblkno = 0; xflags = 0; if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if (bp->b_bufobj != bo) { /* XXX: necessary ? */ BUF_UNLOCK(bp); BO_LOCK(bo); return (EAGAIN); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp != NULL && (nbp->b_bufobj != bo || nbp->b_lblkno != lblkno || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags)) break; /* nbp invalid */ } return (retval); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. 
*/ int vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize) { struct buf *bp, *nbp; int anyfreed; int trunclbn; struct bufobj *bo; CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__, vp, cred, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. */ trunclbn = (length + blksize - 1) / blksize; ASSERT_VOP_LOCKED(vp, "vtruncbuf"); restart: bo = &vp->v_bufobj; BO_LOCK(bo); anyfreed = 1; for (;anyfreed;) { anyfreed = 0; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { BO_UNLOCK(bo); goto restart; } } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { BO_UNLOCK(bo); goto restart; } } } if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { goto restart; } VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), ("buf_vlist_remove: Buf %p is on two lists", bp)); if (bp->b_xflags & BX_VNDIRTY) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. 
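 * For example, a file being extended sequentially always satisfies the * TAILQ_LAST() comparison below, so the common append case skips the * BUF_PCTRIE_LOOKUP_LE() call entirely.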
*/ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. Note that we specifically avoid * shadow buffers used in background bitmap writes. * * This code isn't quite as efficient as it could be because we are maintaining * two sorted lists and do not know which list the block resides in, so both * the clean and the dirty trie may have to be searched. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("brelvp: Buffer %p not on queue.", bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue.
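 * The delay argument selects one of the syncer's one-second buckets: the * bufobj is queued on syncer_workitem_pending[(syncer_delayno + delay) & * syncer_mask], which sched_sync() will reach roughly `delay' seconds from now.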
*/ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = (*bo)->__bo_vnode; /* XXX */ if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining..."); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. 
*/ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. 
*/ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from old vnode list, if on one. */ BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } /* * Increment the use and hold counts on the vnode, taking care to reference * the driver's usecount if this is a chardev. The vholdl() will remove * the vnode from the free list if it is presently free. Requires the * vnode interlock and returns with it held. */ static void v_incr_usecount(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vholdl(vp); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Turn a holdcnt into a use+holdcnt such that only one call to * v_decr_usecount is needed. */ static void v_upgrade_usecount(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Decrement the vnode use and hold count along with the driver's usecount * if this is a chardev. The vdropl() below releases the vnode interlock * as it may free the vnode. */ static void v_decr_usecount(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_usecount: negative usecount")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } vdropl(vp); } /* * Decrement only the use count and driver use count. This is intended to * be paired with a follow on vdropl() to release the remaining hold count. * In this way we may vgone() a vnode with a 0 usecount without risk of * having it end up on a free list because the hold count is kept above 0. 
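 * vputx() relies on this pairing: it calls v_decr_useonly(), performs the * inactive processing with only the hold count keeping the vnode alive, and * then releases that hold with vdropl().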
*/ static void v_decr_useonly(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_useonly: negative usecount")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VI_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. */ int vget(struct vnode *vp, int flags, struct thread *td) { int error; error = 0; VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vget: invalid lock operation")); CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); vholdl(vp); if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) { vdrop(vp); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); VI_LOCK(vp); /* Upgrade our holdcnt to a usecount. */ v_upgrade_usecount(vp); /* * We don't guarantee that any particular close will * trigger inactive processing so just make a best effort * here at preventing a reference to a removed file. If * we don't succeed no harm is done. */ if (vp->v_iflag & VI_OWEINACT) { if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && (flags & LK_NOWAIT) == 0) vinactive(vp, td); vp->v_iflag &= ~VI_OWEINACT; } VI_UNLOCK(vp); return (0); } /* * Increase the reference count of a vnode. */ void vref(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VI_LOCK(vp); v_incr_usecount(vp); VI_UNLOCK(vp); } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism other * than the VI lock is used to stop other processes from gaining references * to the vnode. This may be the case if the caller holds the only reference. * This is also useful when stale data is acceptable as race conditions may * be accounted for by some other means. */ int vrefcnt(struct vnode *vp) { int usecnt; VI_LOCK(vp); usecnt = vp->v_usecount; VI_UNLOCK(vp); return (usecnt); } #define VPUTX_VRELE 1 #define VPUTX_VPUT 2 #define VPUTX_VUNREF 3 static void vputx(struct vnode *vp, int func) { int error; KASSERT(vp != NULL, ("vputx: null vp")); if (func == VPUTX_VUNREF) ASSERT_VOP_LOCKED(vp, "vunref"); else if (func == VPUTX_VPUT) ASSERT_VOP_LOCKED(vp, "vput"); else KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vputx: missed vn_close")); error = 0; if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { if (func == VPUTX_VPUT) VOP_UNLOCK(vp, 0); v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { vprint("vputx: negative ref count", vp); panic("vputx: negative ref cnt"); } CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); /* * We must call VOP_INACTIVE with the node locked. Mark * as VI_DOINGINACT to avoid recursion. 
*/ vp->v_iflag |= VI_OWEINACT; switch (func) { case VPUTX_VRELE: error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; case VPUTX_VPUT: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VPUTX_VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (error == 0) { if (vp->v_iflag & VI_OWEINACT) vinactive(vp, curthread); if (func != VPUTX_VUNREF) VOP_UNLOCK(vp, 0); } vdropl(vp); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(struct vnode *vp) { vputx(vp, VPUTX_VRELE); } /* * Release an already locked vnode. This gives the same effect as * unlock+vrele(), but takes less time and avoids releasing and * re-acquiring the lock (as vrele() acquires the lock internally.) */ void vput(struct vnode *vp) { vputx(vp, VPUTX_VPUT); } /* * Release an exclusively locked vnode. Do not unlock the vnode lock. */ void vunref(struct vnode *vp) { vputx(vp, VPUTX_VUNREF); } /* * Somebody doesn't want the vnode recycled. */ void vhold(struct vnode *vp) { VI_LOCK(vp); vholdl(vp); VI_UNLOCK(vp); } /* * Increase the hold count and activate if this is the first reference. */ void vholdl(struct vnode *vp) { struct mount *mp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS /* getnewvnode() calls v_incr_usecount() without holding interlock. */ if (vp->v_type != VNON || vp->v_data != NULL) { ASSERT_VI_LOCKED(vp, "vholdl"); VNASSERT(vp->v_holdcnt > 0 || (vp->v_iflag & VI_FREE) != 0, vp, ("vholdl: free vnode is held")); } #endif vp->v_holdcnt++; if ((vp->v_iflag & VI_FREE) == 0) return; VNASSERT(vp->v_holdcnt == 1, vp, ("vholdl: wrong hold count")); VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed.")); /* * Remove a vnode from the free list, mark it as in use, * and put it on the active list. */ mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); freevnodes--; vp->v_iflag &= ~(VI_FREE|VI_AGE); KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Activating already active vnode")); vp->v_iflag |= VI_ACTIVE; mp = vp->v_mount; TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize++; mtx_unlock(&vnode_free_list_mtx); } /* * Note that there is one less holder who cares about this vnode. * vdrop() is the opposite of vhold(). */ void vdrop(struct vnode *vp) { VI_LOCK(vp); vdropl(vp); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VI_DOOMED) in which case we will free it. */ void vdropl(struct vnode *vp) { struct bufobj *bo; struct mount *mp; int active; ASSERT_VI_LOCKED(vp, "vdropl"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); vp->v_holdcnt--; VNASSERT(vp->v_holdcnt >= vp->v_usecount, vp, ("hold count less than use count")); if (vp->v_holdcnt > 0) { VI_UNLOCK(vp); return; } if ((vp->v_iflag & VI_DOOMED) == 0) { /* * Mark a vnode as free: remove it from its active list * and put it up for recycling on the freelist.
*/ VNASSERT(vp->v_op != NULL, vp, ("vdropl: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(vp->v_holdcnt == 0, vp, ("vdropl: freeing when we shouldn't")); active = vp->v_iflag & VI_ACTIVE; vp->v_iflag &= ~VI_ACTIVE; mp = vp->v_mount; mtx_lock(&vnode_free_list_mtx); if (active) { TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); mp->mnt_activevnodelistsize--; } if (vp->v_iflag & VI_AGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); } freevnodes++; vp->v_iflag &= ~VI_AGE; vp->v_iflag |= VI_FREE; mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); return; } /* * The vnode has been marked for destruction, so free it. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); atomic_subtract_long(&numvnodes, 1); bo = &vp->v_bufobj; VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("cleaned vnode still on the free list.")); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) destroy_vpollinfo(vp->v_pollinfo); #ifdef INVARIANTS /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); rw_destroy(BO_LOCKPTR(bo)); uma_zfree(vnode_zone, vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade. */ void vinactive(struct vnode *vp, struct thread *td) { struct vm_object *obj; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are on the vnode's dirty list since these will * no longer be checked once the vnode is on the inactive list. * Because the vnode vm object keeps a hold reference on the vnode * if there is at least one resident non-cached page, the vnode * cannot leave the active list without the page cleanup done. */ obj = vp->v_object; if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC); VM_OBJECT_WUNLOCK(obj); } VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. 
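 * * Callers are normally the per-filesystem unmount routines; as a purely * illustrative example, a filesystem discarding every vnode on a forced * unmount might use vflush(mp, 0, FORCECLOSE, td).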
* * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); if (error != 0) { VOP_UNLOCK(vp, 0); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount == 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { VNASSERT(vp->v_usecount == 0 || (vp->v_type != VCHR && vp->v_type != VBLK), vp, ("device VNODE %p is FORCECLOSED", vp)); vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif } VOP_UNLOCK(vp, 0); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. 
*/ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp, 0); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, "vrecycle"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; VI_LOCK(vp); if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } VI_UNLOCK(vp); return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } static void notify_lowervp_vfs_dummy(struct mount *mp __unused, struct vnode *lowervp __unused) { } /* * Notify upper mounts about reclaimed or unlinked vnode. */ void vfs_notify_upper(struct vnode *vp, int event) { static struct vfsops vgonel_vfsops = { .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, }; struct mount *mp, *ump, *mmp; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); if (TAILQ_EMPTY(&mp->mnt_uppers)) goto unlock; MNT_IUNLOCK(mp); mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); mmp->mnt_op = &vgonel_vfsops; mmp->mnt_kern_flag |= MNTK_MARKER; MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_VGONE_UPPER; for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { ump = TAILQ_NEXT(ump, mnt_upper_link); continue; } TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump, vp); break; default: KASSERT(0, ("invalid event %d", event)); break; } MNT_ILOCK(mp); ump = TAILQ_NEXT(mmp, mnt_upper_link); TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); } free(mmp, M_TEMP); mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; wakeup(&mp->mnt_uppers); } unlock: MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ void vgonel(struct vnode *vp) { struct thread *td; int oweinact; int active; struct mount *mp; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. 
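 * The first vinvalbuf() call below uses V_SAVE to try to write dirty buffers * back; if that fails, the retry loop passes 0 and simply discards them.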
*/ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } #ifdef INVARIANTS BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); #endif /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Clear the advisory locks and wake up waiting threads. */ (void)VOP_ADVLOCKPURGE(vp); /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. */ int vcount(struct vnode *vp) { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev *as argument */ int count_dev(struct cdev *dev) { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_TEXT) strlcat(buf, "|VV_TEXT", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_AGE) strlcat(buf, "|VI_AGE", sizeof(buf)); if (vp->v_iflag & VI_DOOMED) strlcat(buf, "|VI_DOOMED", sizeof(buf)); if (vp->v_iflag & VI_FREE) strlcat(buf, "|VI_FREE", sizeof(buf)); if (vp->v_iflag & 
VI_ACTIVE) strlcat(buf, "|VI_ACTIVE", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE | VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d " "cleanbuf %d dirtybuf %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count, vp->v_bufobj.bo_dirty.bv_cnt, vp->v_bufobj.bo_clean.bv_cnt); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); TAILQ_FOREACH(mp, &mountlist, mnt_list) { TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vprint("", vp); } } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; uint64_t mflags; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. 
*/ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; mflags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (mflags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ mflags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_SUJ); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (mflags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%016jx", mflags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_NOINSMNTQ); MNT_KERN_FLAG(MNTK_DRAINING); MNT_KERN_FLAG(MNTK_REFEXPIRE); MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); MNT_KERN_FLAG(MNTK_VGONE_UPPER); MNT_KERN_FLAG(MNTK_VGONE_WAITER); MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); MNT_KERN_FLAG(MNTK_MARKER); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); MNT_KERN_FLAG(MNTK_NOKNOTE); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, 
(u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d\n", mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_activevnodelistsize = %d\n", mp->mnt_activevnodelistsize); db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { if (vp->v_type != VMARKER) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; xvfsp.vfc_vfsops = 0; xvfsp.vfc_next = 0; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp; struct thread *td; int error; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); td = curthread; /* * Since this only runs when rebooting, it is not interlocked. 
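 * TAILQ_LAST() is used below so that mounts are removed most recently * mounted first.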
*/ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, td); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); /* * XXX: Due to the way in which we mount the root * file system off of devfs, devfs will generate a * "busy" warning when we try to unmount it before * the root. Don't print a warning as a result in * order to avoid false positive errors that may * cause needless upset. */ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } else { /* The unmount has removed mp from the mountlist */ } } } /* * Perform msync on all vnodes under a mount point. * The mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { obj = vp->v_object; if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_WUNLOCK(obj); } vput(vp); } } else VI_UNLOCK(vp); } } static void destroy_vpollinfo_free(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); uma_zfree(vnodepoll_zone, vi); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); } /* * Initialize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = uma_zalloc(vnodepoll_zone, M_WAITOK); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo_free(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode.
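 * Each mounted filesystem normally gets one such pseudo-vnode; sched_sync() * fsyncs it with MNT_LAZY, which sync_fsync() below turns into a vfs_msync() * and a VFS_SYNC(mp, MNT_LAZY) pass over the whole mount.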
*/ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. */ void vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); if (error != 0) panic("vfs_allocate_syncvnode: getnewvnode() failed"); vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque() failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp, 0); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; if (mp->mnt_syncer == NULL) { mp->mnt_syncer = vp; vp = NULL; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); if (vp != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } } void vfs_deallocate_syncvnode(struct mount *mp) { struct vnode *vp; mtx_lock(&sync_mtx); vp = mp->mnt_syncer; if (vp != NULL) mp->mnt_syncer = NULL; mtx_unlock(&sync_mtx); if (vp != NULL) vrele(vp); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error, save; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ if (vfs_busy(mp, MBF_NOWAIT) != 0) return (0); if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp); return (0); } save = curthread_pflags_set(TDP_SYNCIO); vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); curthread_pflags_restore(save); vn_finished_write(mp); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. 
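/*
 * Illustrative standalone sketch (not part of this revision) of the
 * start/incr/next arithmetic in vfs_allocate_syncvnode() above: each
 * new syncer vnode gets a different worklist slot so filesystems
 * mounted back-to-back do not all sync in the same tick.  The two
 * tunables below are stand-ins for the kernel's syncer_maxdelay and
 * syncdelay.
 */
#include <stdio.h>

static long syncer_maxdelay = 32;    /* illustrative value */
static long syncdelay = 30;          /* illustrative value */

static long
next_sync_slot(void)
{
    static long start, incr, next;

    next += incr;
    if (next == 0 || next > syncer_maxdelay) {
        start /= 2;
        incr /= 2;
        if (start == 0) {
            start = syncer_maxdelay / 2;
            incr = syncer_maxdelay;
        }
        next = start;
    }
    return (syncdelay > 0 ? next % syncdelay : 0);
}

int
main(void)
{
    for (int i = 0; i < 8; i++)
        printf("mount %d -> slot %ld\n", i, next_sync_slot());
    return (0);
}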
*/ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } /* * Check if vnode represents a disk device */ int vn_isdisk(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. 
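/*
 * Illustrative userland sketch (not part of this revision): a
 * simplified re-derivation of the owner/group/other logic in vaccess()
 * above, using only the classic mode bits.  It deliberately omits the
 * privilege fallback and the VADMIN/VAPPEND refinements; like
 * vaccess(), only the first matching class is consulted.
 */
#include <sys/types.h>
#include <stdbool.h>
#include <stdio.h>

static bool
dac_allows(mode_t file_mode, uid_t file_uid, gid_t file_gid,
    uid_t uid, gid_t gid, bool want_read, bool want_write, bool want_exec)
{
    mode_t granted;

    if (uid == file_uid)
        granted = (file_mode >> 6) & 0x7;   /* owner class */
    else if (gid == file_gid)
        granted = (file_mode >> 3) & 0x7;   /* group class */
    else
        granted = file_mode & 0x7;          /* other class */

    if (want_read && (granted & 0x4) == 0)
        return (false);
    if (want_write && (granted & 0x2) == 0)
        return (false);
    if (want_exec && (granted & 0x1) == 0)
        return (false);
    return (true);
}

int
main(void)
{
    /* 0640 file owned by uid 0/gid 0; can uid 1001 in gid 0 read it? */
    printf("%s\n", dac_allows(0640, 0, 0, 1001, 0, true, false, false) ?
        "allowed" : "denied");
    return (0);
}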
*/ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ, 0)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to supress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ (vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
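/*
 * Illustrative userland sketch (not part of this revision): the
 * EXTATTR_NAMESPACE_USER branch that extattr_check_cred() above
 * authorizes with a plain VOP_ACCESS() check.  File name and
 * attribute name are purely examples.
 */
#include <sys/types.h>
#include <sys/extattr.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
    const char *path = "/tmp/example";
    const char *value = "demo";
    char buf[64];
    ssize_t n;

    if (extattr_set_file(path, EXTATTR_NAMESPACE_USER, "comment",
        value, strlen(value)) < 0) {
        perror("extattr_set_file");
        return (1);
    }
    n = extattr_get_file(path, EXTATTR_NAMESPACE_USER, "comment",
        buf, sizeof(buf) - 1);
    if (n < 0) {
        perror("extattr_get_file");
        return (1);
    }
    buf[n] = '\0';
    printf("user.comment = %s\n", buf);
    return (0);
}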
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { int locked; if (!IGNORE_LOCK(vp)) { locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) vfs_badlock("is not locked but should be", str, vp); } } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER) vfs_badlock("is not exclusive locked by another thread", str, vp); } void assert_vop_slocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED) vfs_badlock("is not locked shared but should be", str, vp); } #endif /* 0 */ #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } void vop_strategy_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. 
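/*
 * Illustrative userland sketch (not part of this revision): the
 * DEBUG_VFS_LOCKS knobs above are exported under the debug sysctl
 * tree, e.g. debug.vfs_badlock_print.  This reads the current value;
 * the OID only exists on kernels built with DEBUG_VFS_LOCKS.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    int val;
    size_t len = sizeof(val);

    if (sysctlbyname("debug.vfs_badlock_print", &val, &len,
        NULL, 0) == -1) {
        perror("sysctlbyname");
        return (1);
    }
    printf("debug.vfs_badlock_print = %d\n", val);
    return (0);
}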
*/ if ((bp->b_flags & B_CLUSTER) != 0) return; if (panicstr == NULL && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } #endif } void vop_lock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_lock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_unlock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_unlock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; if (!rc) { VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? 
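/*
 * Illustrative userland sketch (not part of this revision): the
 * vop_*_post() hooks above deliver NOTE_WRITE, NOTE_DELETE,
 * NOTE_ATTRIB, NOTE_RENAME, ... to EVFILT_VNODE knotes.  Watch a file
 * for those notes; the path is only an example.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct kevent ev;
    int kq, fd;

    fd = open("/tmp/example", O_RDONLY);
    kq = kqueue();
    if (fd < 0 || kq < 0) {
        perror("open/kqueue");
        return (1);
    }
    EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
        NOTE_WRITE | NOTE_DELETE | NOTE_ATTRIB | NOTE_RENAME, 0, NULL);
    if (kevent(kq, &ev, 1, NULL, 0, NULL) < 0) {
        perror("kevent register");
        return (1);
    }
    /* Block until another process touches the file. */
    if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
        printf("vnode event fflags=%#x\n", (unsigned)ev.fflags);
    close(fd);
    close(kq);
    return (0);
}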
*/ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; static struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; static struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0); } static void vfs_knl_assert_locked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); #endif } static void vfs_knl_assert_unlocked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int 
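/*
 * Illustrative userland sketch (not part of this revision):
 * vfs_event_signal() above posts events to the global fs_knlist, which
 * userland can receive through an EVFILT_FS knote.  The ident is
 * unused for this filter (f_isfd is 0); the VQ_* bits come from
 * sys/mount.h.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/mount.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct kevent ev;
    int kq;

    kq = kqueue();
    if (kq < 0) {
        perror("kqueue");
        return (1);
    }
    EV_SET(&ev, 0, EVFILT_FS, EV_ADD | EV_CLEAR, 0, 0, NULL);
    if (kevent(kq, &ev, 1, NULL, 0, NULL) < 0) {
        perror("kevent register");
        return (1);
    }
    /* Block until a filesystem event (e.g. mount or unmount) arrives. */
    if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
        printf("fs event fflags=%#x%s%s\n", (unsigned)ev.fflags,
            (ev.fflags & VQ_MOUNT) ? " (mount)" : "",
            (ev.fflags & VQ_UNMOUNT) ? " (unmount)" : "");
    close(kq);
    return (0);
}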
filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_data != 0); VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VOP_MARKATIME. This functionality is used by execve and * mmap, so we want to avoid the I/O implied by directly setting * va_atime for the sake of efficiency. */ void vfs_mark_atime(struct vnode *vp, struct ucred *cred) { struct mount *mp; mp = vp->v_mount; ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) (void)VOP_MARKATIME(vp); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 
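/*
 * Illustrative userland sketch (not part of this revision) of the
 * filt_vfsread() contract above: for a regular file the returned
 * kevent data field is va_size - f_offset, i.e. the bytes remaining
 * past the descriptor's current offset.  /etc/hosts is only an
 * example file.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct kevent ev;
    int kq, fd;

    fd = open("/etc/hosts", O_RDONLY);
    kq = kqueue();
    if (fd < 0 || kq < 0) {
        perror("open/kqueue");
        return (1);
    }
    lseek(fd, 10, SEEK_SET);    /* skip the first 10 bytes */
    EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
    if (kevent(kq, &ev, 1, &ev, 1, NULL) == 1)    /* register and poll */
        printf("bytes past current offset: %jd\n", (intmax_t)ev.data);
    close(fd);
    close(kq);
    return (0);
}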
* * This interface replaces MNT_VNODE_FOREACH. */ MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; if (should_yield()) kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); vp = TAILQ_NEXT(*mvp, v_nmntvnodes); while (vp != NULL && (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)) vp = TAILQ_NEXT(vp, v_nmntvnodes); /* Check if we are done */ if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); VI_LOCK(vp); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); MNT_REF(mp); (*mvp)->v_type = VMARKER; vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)) vp = TAILQ_NEXT(vp, v_nmntvnodes); /* Check if we are done */ if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; return (NULL); } (*mvp)->v_mount = mp; TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); VI_LOCK(vp); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * active vnodes. 
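/*
 * Illustrative userland sketch (not part of this revision) of the
 * marker pattern used by the __mnt_vnode_*_all() iterators above: a
 * dummy marker node stays in the list to remember the iteration
 * position, so the list lock can be dropped between steps without
 * losing the cursor.  Types and names here are made up for the
 * example, not the kernel's.
 */
#include <sys/queue.h>
#include <stdio.h>

struct node {
    int val;                    /* payload; < 0 marks a marker node */
    TAILQ_ENTRY(node) link;
};
TAILQ_HEAD(nodelist, node);

static struct node *
next_after_marker(struct nodelist *list, struct node *marker)
{
    struct node *n;

    n = TAILQ_NEXT(marker, link);
    while (n != NULL && n->val < 0)     /* skip other markers */
        n = TAILQ_NEXT(n, link);
    TAILQ_REMOVE(list, marker, link);
    if (n != NULL)                      /* re-park the marker after n */
        TAILQ_INSERT_AFTER(list, n, marker, link);
    return (n);
}

int
main(void)
{
    struct nodelist list = TAILQ_HEAD_INITIALIZER(list);
    struct node nodes[4], marker = { .val = -1 };
    struct node *n;

    for (int i = 0; i < 4; i++) {
        nodes[i].val = i;
        TAILQ_INSERT_TAIL(&list, &nodes[i], link);
    }
    TAILQ_INSERT_HEAD(&list, &marker, link);
    while ((n = next_after_marker(&list, &marker)) != NULL)
        printf("visited %d\n", n->val);     /* prints 0 1 2 3 */
    return (0);
}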
See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h */ static void mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); *mvp = NULL; } static struct vnode * mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) { struct vnode *vp, *nvp; mtx_assert(&vnode_free_list_mtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_actfreelist); TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_actfreelist); continue; } if (!VI_TRYLOCK(vp)) { if (mp_ncpus == 1 || should_yield()) { TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); mtx_unlock(&vnode_free_list_mtx); pause("vnacti", 1); mtx_lock(&vnode_free_list_mtx); goto restart; } continue; } KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the active list %p %p", vp, mp)); if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) break; nvp = TAILQ_NEXT(vp, v_actfreelist); VI_UNLOCK(vp); vp = nvp; } /* Check if we are done */ if (vp == NULL) { mtx_unlock(&vnode_free_list_mtx); mnt_vnode_markerfree_active(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); mtx_unlock(&vnode_free_list_mtx); ASSERT_VI_LOCKED(vp, "active iter"); KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); return (vp); } struct vnode * __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) { if (should_yield()) kern_yield(PRI_USER); mtx_lock(&vnode_free_list_mtx); return (mnt_vnode_next_active(mvp, mp)); } struct vnode * __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); (*mvp)->v_type = VMARKER; (*mvp)->v_mount = mp; mtx_lock(&vnode_free_list_mtx); vp = TAILQ_FIRST(&mp->mnt_activevnodelist); if (vp == NULL) { mtx_unlock(&vnode_free_list_mtx); mnt_vnode_markerfree_active(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); return (mnt_vnode_next_active(mvp, mp)); } void __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); mtx_unlock(&vnode_free_list_mtx); mnt_vnode_markerfree_active(mvp, mp); } Index: projects/building-blocks/sys =================================================================== --- projects/building-blocks/sys (revision 277723) +++ projects/building-blocks/sys (revision 277724) Property changes on: projects/building-blocks/sys ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys:r277711-277723 Index: projects/building-blocks =================================================================== --- projects/building-blocks (revision 277723) +++ projects/building-blocks (revision 277724) Property changes on: projects/building-blocks ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r277711-277723