diff --git a/usr.sbin/arp/arp.c b/usr.sbin/arp/arp.c index 562d982196c3..e5b94ca053e1 100644 --- a/usr.sbin/arp/arp.c +++ b/usr.sbin/arp/arp.c @@ -1,890 +1,890 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1984, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Sun Microsystems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ /* * arp - display, set, and delete arp table entries */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "arp.h" typedef void (action_fn)(struct sockaddr_dl *sdl, struct sockaddr_in *s_in, struct rt_msghdr *rtm); static void nuke_entries(uint32_t ifindex, struct in_addr addr); static int print_entries(uint32_t ifindex, struct in_addr addr); static int delete(char *host); static void usage(void) __dead2; static int set(int argc, char **argv); static int get(char *host); static int file(char *name); static struct rt_msghdr *rtmsg(int cmd, struct sockaddr_in *dst, struct sockaddr_dl *sdl); static int get_ether_addr(in_addr_t ipaddr, struct ether_addr *hwaddr); static int set_rtsock(struct sockaddr_in *dst, struct sockaddr_dl *sdl_m, char *host); struct if_nameindex *ifnameindex; struct arp_opts opts = {}; /* which function we're supposed to do */ #define F_GET 1 #define F_SET 2 #define F_FILESET 3 #define F_REPLACE 4 #define F_DELETE 5 #define SETFUNC(f) { if (func) usage(); func = (f); } #define ARP_XO_VERSION "1" int main(int argc, char *argv[]) { int ch, func = 0; int rtn = 0; argc = xo_parse_args(argc, argv); if (argc < 0) exit(1); while ((ch = getopt(argc, argv, "andfsSi:")) != -1) switch(ch) { case 'a': opts.aflag = true; break; case 'd': SETFUNC(F_DELETE); break; case 'n': opts.nflag = true; break; case 'S': SETFUNC(F_REPLACE); break; case 's': SETFUNC(F_SET); break; case 'f' : SETFUNC(F_FILESET); break; case 'i': opts.rifname = optarg; break; case '?': default: usage(); } argc -= optind; argv += optind; if (!func) func = F_GET; if (opts.rifname) { if (func != F_GET && !(func == F_DELETE && opts.aflag)) xo_errx(1, "-i not applicable to this operation"); if ((opts.rifindex = if_nametoindex(opts.rifname)) == 0) { if (errno == ENXIO) xo_errx(1, 
"interface %s does not exist", opts.rifname); else xo_err(1, "if_nametoindex(%s)", opts.rifname); } } switch (func) { case F_GET: if (opts.aflag) { if (argc != 0) usage(); xo_set_version(ARP_XO_VERSION); xo_open_container("arp"); xo_open_list("arp-cache"); struct in_addr all_addrs = {}; print_entries(opts.rifindex, all_addrs); xo_close_list("arp-cache"); xo_close_container("arp"); xo_finish(); } else { if (argc != 1) usage(); rtn = get(argv[0]); } break; case F_SET: case F_REPLACE: if (argc < 2 || argc > 6) usage(); if (func == F_REPLACE) (void)delete(argv[0]); rtn = set(argc, argv) ? 1 : 0; break; case F_DELETE: if (opts.aflag) { if (argc != 0) usage(); struct in_addr all_addrs = {}; nuke_entries(0, all_addrs); } else { if (argc != 1) usage(); rtn = delete(argv[0]); } break; case F_FILESET: if (argc != 1) usage(); rtn = file(argv[0]); break; } if (ifnameindex != NULL) if_freenameindex(ifnameindex); return (rtn); } /* * Process a file to set standard arp entries */ static int file(char *name) { FILE *fp; int i, retval; char line[100], arg[5][50], *args[5], *p; if ((fp = fopen(name, "r")) == NULL) xo_err(1, "cannot open %s", name); args[0] = &arg[0][0]; args[1] = &arg[1][0]; args[2] = &arg[2][0]; args[3] = &arg[3][0]; args[4] = &arg[4][0]; retval = 0; while(fgets(line, sizeof(line), fp) != NULL) { if ((p = strchr(line, '#')) != NULL) *p = '\0'; for (p = line; isblank(*p); p++); if (*p == '\n' || *p == '\0') continue; i = sscanf(p, "%49s %49s %49s %49s %49s", arg[0], arg[1], arg[2], arg[3], arg[4]); if (i < 2) { xo_warnx("bad line: %s", line); retval = 1; continue; } if (set(i, args)) retval = 1; } fclose(fp); return (retval); } /* * Given a hostname, fills up a (static) struct sockaddr_in with * the address of the host and returns a pointer to the * structure. 
*/ struct sockaddr_in * getaddr(char *host) { struct hostent *hp; static struct sockaddr_in reply; bzero(&reply, sizeof(reply)); reply.sin_len = sizeof(reply); reply.sin_family = AF_INET; reply.sin_addr.s_addr = inet_addr(host); if (reply.sin_addr.s_addr == INADDR_NONE) { if (!(hp = gethostbyname(host))) { xo_warnx("%s: %s", host, hstrerror(h_errno)); return (NULL); } bcopy((char *)hp->h_addr, (char *)&reply.sin_addr, sizeof reply.sin_addr); } return (&reply); } int valid_type(int type); /* * Returns true if the type is a valid one for ARP. */ int valid_type(int type) { switch (type) { case IFT_ETHER: case IFT_FDDI: case IFT_IEEE1394: case IFT_INFINIBAND: case IFT_ISO88023: case IFT_ISO88024: case IFT_L2VLAN: case IFT_BRIDGE: return (1); default: return (0); } } /* * Set an individual arp entry */ static int set(int argc, char **argv) { struct sockaddr_in *dst; /* what are we looking for */ struct ether_addr *ea; char *host = argv[0], *eaddr = argv[1]; struct sockaddr_dl sdl_m; argc -= 2; argv += 2; bzero(&sdl_m, sizeof(sdl_m)); sdl_m.sdl_len = sizeof(sdl_m); sdl_m.sdl_family = AF_LINK; dst = getaddr(host); if (dst == NULL) return (1); while (argc-- > 0) { if (strcmp(argv[0], "temp") == 0) { int max_age; size_t len = sizeof(max_age); if (sysctlbyname("net.link.ether.inet.max_age", &max_age, &len, NULL, 0) != 0) xo_err(1, "sysctlbyname"); opts.expire_time = max_age; } else if (strcmp(argv[0], "pub") == 0) { opts.flags |= RTF_ANNOUNCE; if (argc && strcmp(argv[1], "only") == 0) { /* * Compatibility: in pre FreeBSD 8 times * the "only" keyword used to mean that * an ARP entry should be announced, but * not installed into routing table. 
*/ argc--; argv++; } } else if (strcmp(argv[0], "blackhole") == 0) { if (opts.flags & RTF_REJECT) { xo_errx(1, "Choose one of blackhole or reject, " "not both."); } opts.flags |= RTF_BLACKHOLE; } else if (strcmp(argv[0], "reject") == 0) { if (opts.flags & RTF_BLACKHOLE) { xo_errx(1, "Choose one of blackhole or reject, " "not both."); } opts.flags |= RTF_REJECT; } else { xo_warnx("Invalid parameter '%s'", argv[0]); usage(); } argv++; } ea = (struct ether_addr *)LLADDR(&sdl_m); if ((opts.flags & RTF_ANNOUNCE) && !strcmp(eaddr, "auto")) { if (!get_ether_addr(dst->sin_addr.s_addr, ea)) { xo_warnx("no interface found for %s", inet_ntoa(dst->sin_addr)); return (1); } sdl_m.sdl_alen = ETHER_ADDR_LEN; } else { struct ether_addr *ea1 = ether_aton(eaddr); if (ea1 == NULL) { xo_warnx("invalid Ethernet address '%s'", eaddr); return (1); } else { *ea = *ea1; sdl_m.sdl_alen = ETHER_ADDR_LEN; } } #ifndef WITHOUT_NETLINK return (set_nl(0, dst, &sdl_m, host)); #else return (set_rtsock(dst, &sdl_m, host)); #endif } #ifdef WITHOUT_NETLINK static int set_rtsock(struct sockaddr_in *dst, struct sockaddr_dl *sdl_m, char *host) { struct sockaddr_in *addr; struct sockaddr_dl *sdl; struct rt_msghdr *rtm; /* * In the case a proxy-arp entry is being added for * a remote end point, the RTF_ANNOUNCE flag in the * RTM_GET command is an indication to the kernel * routing code that the interface associated with * the prefix route covering the local end of the * PPP link should be returned, on which ARP applies. 
*/ rtm = rtmsg(RTM_GET, dst, NULL); if (rtm == NULL) { xo_warn("%s", host); return (1); } addr = (struct sockaddr_in *)(rtm + 1); sdl = (struct sockaddr_dl *)(SA_SIZE(addr) + (char *)addr); if ((sdl->sdl_family != AF_LINK) || (rtm->rtm_flags & RTF_GATEWAY) || !valid_type(sdl->sdl_type)) { xo_warnx("cannot intuit interface index and type for %s", host); return (1); } sdl_m->sdl_type = sdl->sdl_type; sdl_m->sdl_index = sdl->sdl_index; return (rtmsg(RTM_ADD, dst, sdl_m) == NULL); } #endif /* * Display an individual arp entry */ static int get(char *host) { struct sockaddr_in *addr; int found; addr = getaddr(host); if (addr == NULL) return (1); xo_set_version(ARP_XO_VERSION); xo_open_container("arp"); xo_open_list("arp-cache"); found = print_entries(opts.rifindex, addr->sin_addr); if (found == 0) { xo_emit("{d:hostname/%s} ({d:ip-address/%s}) -- no entry", host, inet_ntoa(addr->sin_addr)); if (opts.rifname) xo_emit(" on {d:interface/%s}", opts.rifname); xo_emit("\n"); } xo_close_list("arp-cache"); xo_close_container("arp"); xo_finish(); return (found == 0); } /* * Delete an arp entry */ #ifdef WITHOUT_NETLINK static int delete_rtsock(char *host) { struct sockaddr_in *addr, *dst; struct rt_msghdr *rtm; struct sockaddr_dl *sdl; dst = getaddr(host); if (dst == NULL) return (1); /* * Perform a regular entry delete first. */ opts.flags &= ~RTF_ANNOUNCE; for (;;) { /* try twice */ rtm = rtmsg(RTM_GET, dst, NULL); if (rtm == NULL) { xo_warn("%s", host); return (1); } addr = (struct sockaddr_in *)(rtm + 1); sdl = (struct sockaddr_dl *)(SA_SIZE(addr) + (char *)addr); /* * With the new L2/L3 restructure, the route * returned is a prefix route. The important * piece of information from the previous * RTM_GET is the interface index. In the * case of ECMP, the kernel will traverse * the route group for the given entry. 
*/ if (sdl->sdl_family == AF_LINK && !(rtm->rtm_flags & RTF_GATEWAY) && valid_type(sdl->sdl_type) ) { addr->sin_addr.s_addr = dst->sin_addr.s_addr; break; } /* * Regular entry delete failed, now check if there * is a proxy-arp entry to remove. */ if (opts.flags & RTF_ANNOUNCE) { xo_warnx("delete: cannot locate %s", host); return (1); } opts.flags |= RTF_ANNOUNCE; } rtm->rtm_flags |= RTF_LLDATA; if (rtmsg(RTM_DELETE, dst, NULL) != NULL) { printf("%s (%s) deleted\n", host, inet_ntoa(addr->sin_addr)); return (0); } return (1); } #endif static int delete(char *host) { #ifdef WITHOUT_NETLINK return (delete_rtsock(host)); #else return (delete_nl(0, host)); #endif } /* * Search the arp table and do some action on matching entries */ static int search(u_long addr, action_fn *action) { int mib[6]; size_t needed; char *lim, *buf, *next; struct rt_msghdr *rtm; struct sockaddr_in *sin2; struct sockaddr_dl *sdl; int st, found_entry = 0; mib[0] = CTL_NET; mib[1] = PF_ROUTE; mib[2] = 0; mib[3] = AF_INET; mib[4] = NET_RT_FLAGS; #ifdef RTF_LLINFO mib[5] = RTF_LLINFO; #else mib[5] = 0; #endif if (sysctl(mib, 6, NULL, &needed, NULL, 0) < 0) xo_err(1, "route-sysctl-estimate"); if (needed == 0) /* empty table */ return 0; buf = NULL; for (;;) { buf = reallocf(buf, needed); if (buf == NULL) xo_errx(1, "could not reallocate memory"); st = sysctl(mib, 6, buf, &needed, NULL, 0); if (st == 0 || errno != ENOMEM) break; needed += needed / 8; } if (st == -1) xo_err(1, "actual retrieval of routing table"); lim = buf + needed; for (next = buf; next < lim; next += rtm->rtm_msglen) { rtm = (struct rt_msghdr *)next; sin2 = (struct sockaddr_in *)(rtm + 1); sdl = (struct sockaddr_dl *)((char *)sin2 + SA_SIZE(sin2)); if (opts.rifindex && (opts.rifindex != sdl->sdl_index)) continue; if (addr && (addr != sin2->sin_addr.s_addr)) continue; found_entry = 1; (*action)(sdl, sin2, rtm); } free(buf); return (found_entry); } /* * Display an arp entry */ static void print_entry(struct sockaddr_dl *sdl, struct 
sockaddr_in *addr, struct rt_msghdr *rtm) { const char *host; struct hostent *hp; struct if_nameindex *p; if (ifnameindex == NULL) if ((ifnameindex = if_nameindex()) == NULL) xo_err(1, "cannot retrieve interface names"); xo_open_instance("arp-cache"); if (!opts.nflag) hp = gethostbyaddr((caddr_t)&(addr->sin_addr), sizeof addr->sin_addr, AF_INET); else hp = 0; if (hp) host = hp->h_name; else { host = "?"; if (h_errno == TRY_AGAIN) opts.nflag = true; } xo_emit("{:hostname/%s} ({:ip-address/%s}) at ", host, inet_ntoa(addr->sin_addr)); if (sdl->sdl_alen) { if ((sdl->sdl_type == IFT_ETHER || sdl->sdl_type == IFT_L2VLAN || sdl->sdl_type == IFT_BRIDGE) && sdl->sdl_alen == ETHER_ADDR_LEN) xo_emit("{:mac-address/%s}", ether_ntoa((struct ether_addr *)LLADDR(sdl))); else { int n = sdl->sdl_nlen > 0 ? sdl->sdl_nlen + 1 : 0; xo_emit("{:mac-address/%s}", link_ntoa(sdl) + n); } } else xo_emit("{d:/(incomplete)}{en:incomplete/true}"); for (p = ifnameindex; p && p->if_index && p->if_name; p++) { if (p->if_index == sdl->sdl_index) { xo_emit(" on {:interface/%s}", p->if_name); break; } } if (rtm->rtm_rmx.rmx_expire == 0) xo_emit("{d:/ permanent}{en:permanent/true}"); else { static struct timespec tp; time_t expire_time = 0; if (tp.tv_sec == 0) clock_gettime(CLOCK_MONOTONIC, &tp); if ((expire_time = rtm->rtm_rmx.rmx_expire - tp.tv_sec) > 0) xo_emit(" expires in {:expires/%d} seconds", (int)expire_time); else xo_emit("{d:/ expired}{en:expired/true}"); } if (rtm->rtm_flags & RTF_ANNOUNCE) xo_emit("{d:/ published}{en:published/true}"); switch(sdl->sdl_type) { case IFT_ETHER: xo_emit(" [{:type/ethernet}]"); break; case IFT_FDDI: xo_emit(" [{:type/fddi}]"); break; case IFT_ATM: xo_emit(" [{:type/atm}]"); break; case IFT_L2VLAN: xo_emit(" [{:type/vlan}]"); break; case IFT_IEEE1394: xo_emit(" [{:type/firewire}]"); break; case IFT_BRIDGE: xo_emit(" [{:type/bridge}]"); break; case IFT_INFINIBAND: xo_emit(" [{:type/infiniband}]"); break; default: break; } xo_emit("\n"); 
xo_close_instance("arp-cache"); } static int print_entries(uint32_t ifindex, struct in_addr addr) { #ifndef WITHOUT_NETLINK return (print_entries_nl(ifindex, addr)); #else return (search(addr.s_addr, print_entry)); #endif } /* * Nuke an arp entry */ static void nuke_entry(struct sockaddr_dl *sdl __unused, struct sockaddr_in *addr, struct rt_msghdr *rtm) { char ip[20]; if (rtm->rtm_flags & RTF_PINNED) return; snprintf(ip, sizeof(ip), "%s", inet_ntoa(addr->sin_addr)); delete(ip); } static void nuke_entries(uint32_t ifindex, struct in_addr addr) { search(addr.s_addr, nuke_entry); } static void usage(void) { fprintf(stderr, "%s\n%s\n%s\n%s\n%s\n%s\n%s\n", "usage: arp [-n] [-i interface] hostname", " arp [-n] [-i interface] -a", " arp -d hostname [pub]", " arp -d [-i interface] -a", " arp -s hostname ether_addr [temp] [reject | blackhole] [pub [only]]", " arp -S hostname ether_addr [temp] [reject | blackhole] [pub [only]]", " arp -f filename"); exit(1); } static struct rt_msghdr * rtmsg(int cmd, struct sockaddr_in *dst, struct sockaddr_dl *sdl) { static int seq; int rlen; int l; static int s = -1; static pid_t pid; static struct { struct rt_msghdr m_rtm; char m_space[512]; } m_rtmsg; struct rt_msghdr *rtm = &m_rtmsg.m_rtm; char *cp = m_rtmsg.m_space; if (s < 0) { /* first time: open socket, get pid */ s = socket(PF_ROUTE, SOCK_RAW, 0); if (s < 0) xo_err(1, "socket"); pid = getpid(); } errno = 0; /* * XXX RTM_DELETE relies on a previous RTM_GET to fill the buffer * appropriately. 
*/ if (cmd == RTM_DELETE) goto doit; bzero((char *)&m_rtmsg, sizeof(m_rtmsg)); rtm->rtm_flags = opts.flags; rtm->rtm_version = RTM_VERSION; switch (cmd) { default: xo_errx(1, "internal wrong cmd"); case RTM_ADD: rtm->rtm_addrs |= RTA_GATEWAY; if (opts.expire_time != 0) { struct timespec tp; clock_gettime(CLOCK_MONOTONIC, &tp); rtm->rtm_rmx.rmx_expire = opts.expire_time + tp.tv_sec; } rtm->rtm_inits = RTV_EXPIRE; rtm->rtm_flags |= (RTF_HOST | RTF_STATIC | RTF_LLDATA); /* FALLTHROUGH */ case RTM_GET: rtm->rtm_addrs |= RTA_DST; } #define NEXTADDR(w, s) \ do { \ if ((s) != NULL && rtm->rtm_addrs & (w)) { \ bcopy((s), cp, sizeof(*(s))); \ cp += SA_SIZE(s); \ } \ } while (0) NEXTADDR(RTA_DST, dst); NEXTADDR(RTA_GATEWAY, sdl); rtm->rtm_msglen = cp - (char *)&m_rtmsg; doit: l = rtm->rtm_msglen; rtm->rtm_seq = ++seq; rtm->rtm_type = cmd; if ((rlen = write(s, (char *)&m_rtmsg, l)) < 0) { if (errno != ESRCH || cmd != RTM_DELETE) { xo_warn("writing to routing socket"); return (NULL); } } do { l = read(s, (char *)&m_rtmsg, sizeof(m_rtmsg)); } while (l > 0 && (rtm->rtm_type != cmd || rtm->rtm_seq != seq || rtm->rtm_pid != pid)); if (l < 0) xo_warn("read from routing socket"); return (rtm); } /* * get_ether_addr - get the hardware address of an interface on the - * the same subnet as ipaddr. + * same subnet as ipaddr. */ static int get_ether_addr(in_addr_t ipaddr, struct ether_addr *hwaddr) { struct ifaddrs *ifa, *ifd, *ifas = NULL; in_addr_t ina, mask; struct sockaddr_dl *dla; int retval = 0; /* * Scan through looking for an interface with an Internet * address on the same subnet as `ipaddr'. */ if (getifaddrs(&ifas) < 0) { xo_warnx("getifaddrs"); goto done; } for (ifa = ifas; ifa != NULL; ifa = ifa->ifa_next) { if (ifa->ifa_addr == NULL || ifa->ifa_netmask == NULL) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; /* * Check that the interface is up, * and not point-to-point or loopback. 
*/ if ((ifa->ifa_flags & (IFF_UP|IFF_BROADCAST|IFF_POINTOPOINT| IFF_LOOPBACK|IFF_NOARP)) != (IFF_UP|IFF_BROADCAST)) continue; /* Get its netmask and check that it's on the right subnet. */ mask = ((struct sockaddr_in *) ifa->ifa_netmask)->sin_addr.s_addr; ina = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr.s_addr; if ((ipaddr & mask) == (ina & mask)) break; /* ok, we got it! */ } if (ifa == NULL) goto done; /* * Now scan through again looking for a link-level address * for this interface. */ for (ifd = ifas; ifd != NULL; ifd = ifd->ifa_next) { if (ifd->ifa_addr == NULL) continue; if (strcmp(ifa->ifa_name, ifd->ifa_name) == 0 && ifd->ifa_addr->sa_family == AF_LINK) break; } if (ifd == NULL) goto done; /* * Found the link-level address - copy it out */ dla = (struct sockaddr_dl *)ifd->ifa_addr; memcpy(hwaddr, LLADDR(dla), dla->sdl_alen); printf("using interface %s for proxy with address %s\n", ifa->ifa_name, ether_ntoa(hwaddr)); retval = dla->sdl_alen; done: if (ifas != NULL) freeifaddrs(ifas); return (retval); } diff --git a/usr.sbin/bhyve/amd64/spinup_ap.c b/usr.sbin/bhyve/amd64/spinup_ap.c index df90ad6443d0..294775bb2c96 100644 --- a/usr.sbin/bhyve/amd64/spinup_ap.c +++ b/usr.sbin/bhyve/amd64/spinup_ap.c @@ -1,84 +1,84 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include "bhyverun.h" #include "spinup_ap.h" static void spinup_ap_realmode(struct vcpu *newcpu, uint64_t rip) { int vector, error; uint16_t cs; uint64_t desc_base; uint32_t desc_limit, desc_access; vector = rip >> PAGE_SHIFT; /* * Update the %cs and %rip of the guest so that it starts - * executing real mode code at at 'vector << 12'. + * executing real mode code at 'vector << 12'. 
*/ error = vm_set_register(newcpu, VM_REG_GUEST_RIP, 0); assert(error == 0); error = vm_get_desc(newcpu, VM_REG_GUEST_CS, &desc_base, &desc_limit, &desc_access); assert(error == 0); desc_base = vector << PAGE_SHIFT; error = vm_set_desc(newcpu, VM_REG_GUEST_CS, desc_base, desc_limit, desc_access); assert(error == 0); cs = (vector << PAGE_SHIFT) >> 4; error = vm_set_register(newcpu, VM_REG_GUEST_CS, cs); assert(error == 0); } void spinup_ap(struct vcpu *newcpu, uint64_t rip) { int error; error = vcpu_reset(newcpu); assert(error == 0); spinup_ap_realmode(newcpu, rip); vm_resume_cpu(newcpu); } diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index 8eaa9b71fa15..00e9138d3910 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -1,2783 +1,2783 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi.h" #include "bhyverun.h" #include "config.h" #include "debug.h" #ifdef __amd64__ #include "amd64/inout.h" #endif #include "mem.h" #include "pci_emul.h" #ifdef __amd64__ #include "amd64/pci_lpc.h" #include "pci_passthru.h" #endif #include "qemu_fwcfg.h" #define CONF1_ADDR_PORT 0x0cf8 #define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) #define GB (1024 * 1024 * 1024UL) struct funcinfo { nvlist_t *fi_config; struct pci_devemu *fi_pde; struct pci_devinst *fi_devi; }; struct intxinfo { int ii_count; struct pci_irq ii_irq; }; struct slotinfo { struct intxinfo si_intpins[4]; struct funcinfo si_funcs[MAXFUNCS]; }; struct businfo { uint16_t iobase, iolimit; /* I/O window */ uint32_t membase32, memlimit32; /* mmio window below 4GB */ uint64_t membase64, memlimit64; /* mmio window above 4GB */ struct slotinfo slotinfo[MAXSLOTS]; }; static struct businfo *pci_businfo[MAXBUSES]; SET_DECLARE(pci_devemu_set, struct pci_devemu); static uint64_t pci_emul_iobase; static uint8_t *pci_emul_rombase; static uint64_t pci_emul_romoffset; static uint8_t *pci_emul_romlim; static uint64_t pci_emul_membase32; static uint64_t pci_emul_membase64; static uint64_t pci_emul_memlim64; struct 
pci_bar_allocation { TAILQ_ENTRY(pci_bar_allocation) chain; struct pci_devinst *pdi; int idx; enum pcibar_type type; uint64_t size; }; static TAILQ_HEAD(pci_bar_list, pci_bar_allocation) pci_bars = TAILQ_HEAD_INITIALIZER(pci_bars); struct boot_device { TAILQ_ENTRY(boot_device) boot_device_chain; struct pci_devinst *pdi; int bootindex; }; static TAILQ_HEAD(boot_list, boot_device) boot_devices = TAILQ_HEAD_INITIALIZER( boot_devices); #if defined(__amd64__) #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 #define PCI_EMUL_IOMASK 0xffff /* * OVMF always uses 0xc0000000 as base address for 32 bit PCI MMIO. Don't * change this address without changing it in OVMF. */ #define PCI_EMUL_MEMBASE32 0xc0000000 #elif defined(__aarch64__) #define PCI_EMUL_IOBASE 0xdf000000UL #define PCI_EMUL_IOLIMIT 0xe0000000UL #define PCI_EMUL_MEMBASE32 0xa0000000UL #else #error Unsupported platform #endif #define PCI_EMUL_ROMSIZE 0x10000000 #define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ #define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ #ifdef __amd64__ SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); #endif #define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE #define PCI_EMUL_MEMSIZE64 (32*GB) static void pci_lintr_route(struct pci_devinst *pi); static void pci_lintr_update(struct pci_devinst *pi); static struct pci_devemu *pci_emul_finddev(const char *name); static void pci_cfgrw(int in, int bus, int slot, int func, int coff, int bytes, uint32_t *val); static __inline void CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) { if (bytes == 1) pci_set_cfgdata8(pi, coff, val); else if (bytes == 2) pci_set_cfgdata16(pi, coff, val); else pci_set_cfgdata32(pi, coff, val); } static __inline uint32_t CFGREAD(struct pci_devinst *pi, int coff, int bytes) { if (bytes == 1) return (pci_get_cfgdata8(pi, coff)); else if (bytes == 2) return (pci_get_cfgdata16(pi, coff)); else return (pci_get_cfgdata32(pi, coff)); } static int is_pcir_bar(int coff) { return 
(coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)); } static int is_pcir_bios(int coff) { return (coff >= PCIR_BIOS && coff < PCIR_BIOS + 4); } /* * I/O access */ /* * Slot options are in the form: * * ::,[,] * [:],[,] * * slot is 0..31 * func is 0..7 * emul is a string describing the type of PCI device e.g. virtio-net * config is an optional string, depending on the device, that can be * used for configuration. * Examples are: * 1,virtio-net,tap0 * 3:0,dummy */ static void pci_parse_slot_usage(char *aopt) { EPRINTLN("Invalid PCI slot info field \"%s\"", aopt); } /* * Helper function to parse a list of comma-separated options where * each option is formatted as "name[=value]". If no value is * provided, the option is treated as a boolean and is given a value * of true. */ int pci_parse_legacy_config(nvlist_t *nvl, const char *opt) { char *config, *name, *tofree, *value; if (opt == NULL) return (0); config = tofree = strdup(opt); while ((name = strsep(&config, ",")) != NULL) { value = strchr(name, '='); if (value != NULL) { *value = '\0'; value++; set_config_value_node(nvl, name, value); } else set_config_bool_node(nvl, name, true); } free(tofree); return (0); } /* * PCI device configuration is stored in MIBs that encode the device's * location: * * pci... * * Where "bus", "slot", and "func" are all decimal values without * leading zeroes. Each valid device must have a "device" node which * identifies the driver model of the device. * * Device backends can provide a parser for the "config" string. If * a custom parser is not provided, pci_parse_legacy_config() is used * to parse the string. 
*/ int pci_parse_slot(char *opt) { char node_name[sizeof("pci.XXX.XX.X")]; struct pci_devemu *pde; char *emul, *config, *str, *cp; int error, bnum, snum, fnum; nvlist_t *nvl; error = -1; str = strdup(opt); emul = config = NULL; if ((cp = strchr(str, ',')) != NULL) { *cp = '\0'; emul = cp + 1; if ((cp = strchr(emul, ',')) != NULL) { *cp = '\0'; config = cp + 1; } } else { pci_parse_slot_usage(opt); goto done; } /* :: */ if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { bnum = 0; /* : */ if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { fnum = 0; /* */ if (sscanf(str, "%d", &snum) != 1) { snum = -1; } } } if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) { pci_parse_slot_usage(opt); goto done; } pde = pci_emul_finddev(emul); if (pde == NULL) { EPRINTLN("pci slot %d:%d:%d: unknown device \"%s\"", bnum, snum, fnum, emul); goto done; } snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bnum, snum, fnum); nvl = find_config_node(node_name); if (nvl != NULL) { EPRINTLN("pci slot %d:%d:%d already occupied!", bnum, snum, fnum); goto done; } nvl = create_config_node(node_name); if (pde->pe_alias != NULL) set_config_value_node(nvl, "device", pde->pe_alias); else set_config_value_node(nvl, "device", pde->pe_emu); if (pde->pe_legacy_config != NULL) error = pde->pe_legacy_config(nvl, config); else error = pci_parse_legacy_config(nvl, config); done: free(str); return (error); } void pci_print_supported_devices(void) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; printf("%s\n", pdp->pe_emu); } } uint32_t pci_config_read_reg(const struct pcisel *const host_sel, nvlist_t *nvl, const uint32_t reg, const uint8_t size, const uint32_t def) { const char *config; const nvlist_t *pci_regs; assert(size == 1 || size == 2 || size == 4); pci_regs = find_relative_config_node(nvl, "pcireg"); if (pci_regs == NULL) { return def; } switch (reg) { case PCIR_DEVICE: config = get_config_value_node(pci_regs, 
"device"); break; case PCIR_VENDOR: config = get_config_value_node(pci_regs, "vendor"); break; case PCIR_REVID: config = get_config_value_node(pci_regs, "revid"); break; case PCIR_SUBVEND_0: config = get_config_value_node(pci_regs, "subvendor"); break; case PCIR_SUBDEV_0: config = get_config_value_node(pci_regs, "subdevice"); break; default: return (-1); } if (config == NULL) { return def; } else if (host_sel != NULL && strcmp(config, "host") == 0) { #ifdef __amd64__ return pci_host_read_config(host_sel, reg, size); #else errx(1, "cannot fetch host PCI configuration"); #endif } else { return strtol(config, NULL, 16); } } static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { if (offset < pi->pi_msix.pba_offset) return (0); if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { return (0); } return (1); } int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value) { int msix_entry_offset; int tab_index; char *dest; /* support only 4 or 8 byte writes */ if (size != 4 && size != 8) return (-1); /* * Return if table index is beyond what device supports */ tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index >= pi->pi_msix.table_count) return (-1); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned writes */ if ((msix_entry_offset % size) != 0) return (-1); dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 4) *((uint32_t *)dest) = value; else *((uint64_t *)dest) = value; return (0); } uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) { char *dest; int msix_entry_offset; int tab_index; uint64_t retval = ~0; /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X * table but we also allow 1 byte access to accommodate reads from * ddb. 
 */
	if (size != 1 && size != 4 && size != 8)
		return (retval);

	msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	/* support only aligned reads */
	if ((msix_entry_offset % size) != 0) {
		return (retval);
	}

	tab_index = offset / MSIX_TABLE_ENTRY_SIZE;

	if (tab_index < pi->pi_msix.table_count) {
		/* valid MSI-X Table access */
		dest = (char *)(pi->pi_msix.table + tab_index);
		dest += msix_entry_offset;

		if (size == 1)
			retval = *((uint8_t *)dest);
		else if (size == 4)
			retval = *((uint32_t *)dest);
		else
			retval = *((uint64_t *)dest);
	} else if (pci_valid_pba_offset(pi, offset)) {
		/* return 0 for PBA access */
		retval = 0;
	}

	return (retval);
}

/* BAR index holding the MSI-X table, or -1 if no table was allocated. */
int
pci_msix_table_bar(struct pci_devinst *pi)
{
	if (pi->pi_msix.table != NULL)
		return (pi->pi_msix.table_bar);
	else
		return (-1);
}

/* BAR index holding the MSI-X PBA, or -1 if no table was allocated. */
int
pci_msix_pba_bar(struct pci_devinst *pi)
{
	if (pi->pi_msix.table != NULL)
		return (pi->pi_msix.pba_bar);
	else
		return (-1);
}

#ifdef __amd64__
/*
 * amd64: x86 in/out port handler.  Scan the device's BARs for the I/O BAR
 * containing 'port' and forward the access to the device model's
 * barread/barwrite callback.  Returns -1 if no BAR claims the port.
 */
static int
pci_emul_io_handler(struct vmctx *ctx __unused, int in, int port,
    int bytes, uint32_t *eax, void *arg)
{
	struct pci_devinst *pdi = arg;
	struct pci_devemu *pe = pdi->pi_d;
	uint64_t offset;
	int i;

	assert(port >= 0);

	for (i = 0; i <= PCI_BARMAX; i++) {
		if (pdi->pi_bar[i].type == PCIBAR_IO &&
		    (uint64_t)port >= pdi->pi_bar[i].addr &&
		    (uint64_t)port + bytes <=
		    pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
			offset = port - pdi->pi_bar[i].addr;
			if (in)
				*eax = (*pe->pe_barread)(pdi, i, offset,
				    bytes);
			else
				(*pe->pe_barwrite)(pdi, i, offset, bytes,
				    *eax);
			return (0);
		}
	}
	return (-1);
}
#else
/*
 * Non-amd64: I/O BARs are registered as memory ranges (see
 * modify_bar_registration), so dispatch with the BAR index in arg2.
 */
static int
pci_emul_iomem_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr,
    int size, uint64_t *val, void *arg1, long arg2)
{
	struct pci_devinst *pdi = arg1;
	struct pci_devemu *pe = pdi->pi_d;
	uint64_t offset;
	int bidx = (int)arg2;

	assert(bidx <= PCI_BARMAX);
	assert(pdi->pi_bar[bidx].type == PCIBAR_IO);
	assert(addr >= pdi->pi_bar[bidx].addr &&
	    addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size);
	assert(size == 1 || size == 2 || size == 4);

	offset = addr -
	    pdi->pi_bar[bidx].addr;

	if (dir == MEM_F_READ)
		*val = (*pe->pe_barread)(pdi, bidx, offset, size);
	else
		(*pe->pe_barwrite)(pdi, bidx, offset, size, *val);

	return (0);
}
#endif /* !__amd64__ */

/*
 * MMIO handler for 32/64-bit memory BARs.  The BAR index is passed in
 * arg2 at registration time.  8-byte accesses are split into two 4-byte
 * barread/barwrite callbacks (low dword first).
 */
static int
pci_emul_mem_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr,
    int size, uint64_t *val, void *arg1, long arg2)
{
	struct pci_devinst *pdi = arg1;
	struct pci_devemu *pe = pdi->pi_d;
	uint64_t offset;
	int bidx = (int)arg2;

	assert(bidx <= PCI_BARMAX);
	assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 ||
	    pdi->pi_bar[bidx].type == PCIBAR_MEM64);
	assert(addr >= pdi->pi_bar[bidx].addr &&
	    addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size);

	offset = addr - pdi->pi_bar[bidx].addr;

	if (dir == MEM_F_WRITE) {
		if (size == 8) {
			(*pe->pe_barwrite)(pdi, bidx, offset, 4,
			    *val & 0xffffffff);
			(*pe->pe_barwrite)(pdi, bidx, offset + 4, 4,
			    *val >> 32);
		} else {
			(*pe->pe_barwrite)(pdi, bidx, offset, size, *val);
		}
	} else {
		if (size == 8) {
			*val = (*pe->pe_barread)(pdi, bidx, offset, 4);
			*val |= (*pe->pe_barread)(pdi, bidx, offset + 4, 4) <<
			    32;
		} else {
			*val = (*pe->pe_barread)(pdi, bidx, offset, size);
		}
	}

	return (0);
}

/*
 * Carve 'size' bytes out of the region tracked by '*baseptr', aligning the
 * start to 'size' (power of 2 required).  On success, store the assigned
 * address in '*addr', advance '*baseptr', and return 0; return -1 if the
 * allocation would exceed 'limit'.
 *
 * NOTE(review): 'base + size' is not checked for uint64_t wraparound; with
 * the limits used by the callers this presumably cannot overflow — confirm.
 */
static int
pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
    uint64_t *addr)
{
	uint64_t base;

	assert((size & (size - 1)) == 0);	/* must be a power of 2 */

	base = roundup2(*baseptr, size);

	if (base + size <= limit) {
		*addr = base;
		*baseptr = base + size;
		return (0);
	} else
		return (-1);
}

/*
 * Register (or unregister) the MMIO or I/O region associated with the BAR
 * register 'idx' of an emulated pci device.
 */
static void
modify_bar_registration(struct pci_devinst *pi, int idx, int registration)
{
	struct pci_devemu *pe;
	int error;
	enum pcibar_type type;

	pe = pi->pi_d;
	type = pi->pi_bar[idx].type;
	switch (type) {
	case PCIBAR_IO:
	{
#ifdef __amd64__
		/* amd64 decodes I/O BARs via in/out port traps. */
		struct inout_port iop;

		bzero(&iop, sizeof(struct inout_port));
		iop.name = pi->pi_name;
		iop.port = pi->pi_bar[idx].addr;
		iop.size = pi->pi_bar[idx].size;
		if (registration) {
			iop.flags = IOPORT_F_INOUT;
			iop.handler = pci_emul_io_handler;
			iop.arg = pi;
			error = register_inout(&iop);
		} else
			error = unregister_inout(&iop);
#else
		/* Other platforms map I/O BARs as MMIO ranges. */
		struct mem_range mr;

		bzero(&mr, sizeof(struct mem_range));
		mr.name = pi->pi_name;
		mr.base = pi->pi_bar[idx].addr;
		mr.size = pi->pi_bar[idx].size;
		if (registration) {
			mr.flags = MEM_F_RW;
			mr.handler = pci_emul_iomem_handler;
			mr.arg1 = pi;
			mr.arg2 = idx;
			error = register_mem(&mr);
		} else
			error = unregister_mem(&mr);
#endif
		break;
	}
	case PCIBAR_MEM32:
	case PCIBAR_MEM64:
	{
		struct mem_range mr;

		bzero(&mr, sizeof(struct mem_range));
		mr.name = pi->pi_name;
		mr.base = pi->pi_bar[idx].addr;
		mr.size = pi->pi_bar[idx].size;
		if (registration) {
			mr.flags = MEM_F_RW;
			mr.handler = pci_emul_mem_handler;
			mr.arg1 = pi;	/* device instance */
			mr.arg2 = idx;	/* BAR index for the handler */
			error = register_mem(&mr);
		} else
			error = unregister_mem(&mr);
		break;
	}
	case PCIBAR_ROM:
		/* ROM BARs have no trap handler; registration is a no-op. */
		error = 0;
		break;
	default:
		error = EINVAL;
		break;
	}
	assert(error == 0);

	/* Let the device model observe the (un)mapping of its BAR. */
	if (pe->pe_baraddr != NULL)
		(*pe->pe_baraddr)(pi, idx, registration,
		    pi->pi_bar[idx].addr);
}

/* Stop decoding accesses to BAR 'idx'. */
static void
unregister_bar(struct pci_devinst *pi, int idx)
{
	modify_bar_registration(pi, idx, 0);
}

/* Start decoding accesses to BAR 'idx'. */
static void
register_bar(struct pci_devinst *pi, int idx)
{
	modify_bar_registration(pi, idx, 1);
}

/* Is the ROM enabled for the emulated pci device? */
static int
romen(struct pci_devinst *pi)
{
	return (pi->pi_bar[PCI_ROM_IDX].lobits & PCIM_BIOS_ENABLE) ==
	    PCIM_BIOS_ENABLE;
}

/* Are we decoding i/o port accesses for the emulated pci device?
 */
static int
porten(struct pci_devinst *pi)
{
	uint16_t cmd;

	cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);

	return (cmd & PCIM_CMD_PORTEN);
}

/* Are we decoding memory accesses for the emulated pci device? */
static int
memen(struct pci_devinst *pi)
{
	uint16_t cmd;

	cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);

	return (cmd & PCIM_CMD_MEMEN);
}

/*
 * Update the MMIO or I/O address that is decoded by the BAR register.
 *
 * If the pci device has enabled the address space decoding then intercept
 * the address range decoded by the BAR register.
 */
static void
update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type)
{
	int decode;

	if (pi->pi_bar[idx].type == PCIBAR_IO)
		decode = porten(pi);
	else
		decode = memen(pi);

	/* Drop the old mapping before changing the decoded address. */
	if (decode)
		unregister_bar(pi, idx);

	switch (type) {
	case PCIBAR_IO:
	case PCIBAR_MEM32:
		pi->pi_bar[idx].addr = addr;
		break;
	case PCIBAR_MEM64:
		/* 'addr' carries the low 32 bits of a 64-bit BAR. */
		pi->pi_bar[idx].addr &= ~0xffffffffUL;
		pi->pi_bar[idx].addr |= addr;
		break;
	case PCIBAR_MEMHI64:
		/* 'addr' carries the high 32 bits of a 64-bit BAR. */
		pi->pi_bar[idx].addr &= 0xffffffff;
		pi->pi_bar[idx].addr |= addr;
		break;
	default:
		assert(0);
	}

	if (decode)
		register_bar(pi, idx);
}

/*
 * Reserve a BAR of the given type and size for device 'pdi'.  The size is
 * rounded up to a power of 2 and to the PCI-mandated minimum (4 bytes for
 * I/O, 16 bytes for memory, ROM minimum derived from PCIM_BIOS_ADDR_MASK).
 * The address itself is assigned later from a size-sorted list.
 */
int
pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
    uint64_t size)
{
	assert((type == PCIBAR_ROM) || (idx >= 0 && idx <= PCI_BARMAX));
	assert((type != PCIBAR_ROM) || (idx == PCI_ROM_IDX));

	if ((size & (size - 1)) != 0)
		size = 1UL << flsl(size);	/* round up to a power of 2 */

	/* Enforce minimum BAR sizes required by the PCI standard */
	if (type == PCIBAR_IO) {
		if (size < 4)
			size = 4;
	} else if (type == PCIBAR_ROM) {
		if (size < ~PCIM_BIOS_ADDR_MASK + 1)
			size = ~PCIM_BIOS_ADDR_MASK + 1;
	} else {
		if (size < 16)
			size = 16;
	}

	/*
	 * To reduce fragmentation of the MMIO space, we allocate the BARs by
	 * size. Therefore, don't allocate the BAR yet. We create a list of all
	 * BAR allocation which is sorted by BAR size. When all PCI devices are
	 * initialized, we will assign an address to the BARs.
*/ /* create a new list entry */ struct pci_bar_allocation *const new_bar = malloc(sizeof(*new_bar)); memset(new_bar, 0, sizeof(*new_bar)); new_bar->pdi = pdi; new_bar->idx = idx; new_bar->type = type; new_bar->size = size; /* * Search for a BAR which size is lower than the size of our newly * allocated BAR. */ struct pci_bar_allocation *bar = NULL; TAILQ_FOREACH(bar, &pci_bars, chain) { if (bar->size < size) { break; } } if (bar == NULL) { /* * Either the list is empty or new BAR is the smallest BAR of * the list. Append it to the end of our list. */ TAILQ_INSERT_TAIL(&pci_bars, new_bar, chain); } else { /* * The found BAR is smaller than our new BAR. For that reason, * insert our new BAR before the found BAR. */ TAILQ_INSERT_BEFORE(bar, new_bar, chain); } /* * pci_passthru devices synchronize their physical and virtual command * register on init. For that reason, the virtual cmd reg should be * updated as early as possible. */ uint16_t enbit = 0; switch (type) { case PCIBAR_IO: enbit = PCIM_CMD_PORTEN; break; case PCIBAR_MEM64: case PCIBAR_MEM32: enbit = PCIM_CMD_MEMEN; break; default: enbit = 0; break; } const uint16_t cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND); pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit); return (0); } static int pci_emul_assign_bar(struct pci_devinst *const pdi, const int idx, const enum pcibar_type type, const uint64_t size) { int error; uint64_t *baseptr, limit, addr, mask, lobits, bar; switch (type) { case PCIBAR_NONE: baseptr = NULL; addr = mask = lobits = 0; break; case PCIBAR_IO: baseptr = &pci_emul_iobase; limit = PCI_EMUL_IOLIMIT; mask = PCIM_BAR_IO_BASE; lobits = PCIM_BAR_IO_SPACE; break; case PCIBAR_MEM64: /* * XXX * Some drivers do not work well if the 64-bit BAR is allocated * above 4GB. Allow for this by allocating small requests under * 4GB unless then allocation size is larger than some arbitrary * number (128MB currently). 
*/ if (size > 128 * 1024 * 1024) { baseptr = &pci_emul_membase64; limit = pci_emul_memlim64; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; } else { baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; } break; case PCIBAR_MEM32: baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; break; case PCIBAR_ROM: /* do not claim memory for ROM. OVMF will do it for us. */ baseptr = NULL; limit = 0; mask = PCIM_BIOS_ADDR_MASK; lobits = 0; break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); assert(0); } if (baseptr != NULL) { error = pci_emul_alloc_resource(baseptr, limit, size, &addr); if (error != 0) return (error); } else { addr = 0; } pdi->pi_bar[idx].type = type; pdi->pi_bar[idx].addr = addr; pdi->pi_bar[idx].size = size; /* * passthru devices are using same lobits as physical device they set * this property */ if (pdi->pi_bar[idx].lobits != 0) { lobits = pdi->pi_bar[idx].lobits; } else { pdi->pi_bar[idx].lobits = lobits; } /* Initialize the BAR register in config space */ bar = (addr & mask) | lobits; pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); if (type == PCIBAR_MEM64) { assert(idx + 1 <= PCI_BARMAX); pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } if (type != PCIBAR_ROM) { register_bar(pdi, idx); } return (0); } int pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size, void **const addr) { /* allocate ROM space once on first call */ if (pci_emul_rombase == 0) { pci_emul_rombase = vm_create_devmem(pdi->pi_vmctx, VM_PCIROM, "pcirom", PCI_EMUL_ROMSIZE); if (pci_emul_rombase == MAP_FAILED) { warnx("%s: failed to create rom segment", __func__); return (-1); } pci_emul_romlim = pci_emul_rombase + PCI_EMUL_ROMSIZE; pci_emul_romoffset = 0; } /* ROM size should be a power of 2 
and greater than 2 KB */ const uint64_t rom_size = MAX(1UL << flsl(size), ~PCIM_BIOS_ADDR_MASK + 1); /* check if ROM fits into ROM space */ if (pci_emul_romoffset + rom_size > PCI_EMUL_ROMSIZE) { warnx("%s: no space left in rom segment:", __func__); warnx("%16lu bytes left", PCI_EMUL_ROMSIZE - pci_emul_romoffset); warnx("%16lu bytes required by %d/%d/%d", rom_size, pdi->pi_bus, pdi->pi_slot, pdi->pi_func); return (-1); } /* allocate ROM BAR */ const int error = pci_emul_alloc_bar(pdi, PCI_ROM_IDX, PCIBAR_ROM, rom_size); if (error) return error; /* return address */ *addr = pci_emul_rombase + pci_emul_romoffset; /* save offset into ROM Space */ pdi->pi_romoffset = pci_emul_romoffset; /* increase offset for next ROM */ pci_emul_romoffset += rom_size; return (0); } int pci_emul_add_boot_device(struct pci_devinst *pi, int bootindex) { struct boot_device *new_device, *device; /* don't permit a negative bootindex */ if (bootindex < 0) { errx(4, "Invalid bootindex %d for %s", bootindex, pi->pi_name); } /* alloc new boot device */ new_device = calloc(1, sizeof(struct boot_device)); if (new_device == NULL) { return (ENOMEM); } new_device->pdi = pi; new_device->bootindex = bootindex; /* search for boot device with higher boot index */ TAILQ_FOREACH(device, &boot_devices, boot_device_chain) { if (device->bootindex == bootindex) { errx(4, "Could not set bootindex %d for %s. 
Bootindex already occupied by %s", bootindex, pi->pi_name, device->pdi->pi_name); } else if (device->bootindex > bootindex) { break; } } /* add boot device to queue */ if (device == NULL) { TAILQ_INSERT_TAIL(&boot_devices, new_device, boot_device_chain); } else { TAILQ_INSERT_BEFORE(device, new_device, boot_device_chain); } return (0); } #define CAP_START_OFFSET 0x40 static int pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) { int i, capoff, reallen; uint16_t sts; assert(caplen > 0); reallen = roundup2(caplen, 4); /* dword aligned */ sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) == 0) capoff = CAP_START_OFFSET; else capoff = pi->pi_capend + 1; /* Check if we have enough space */ if (capoff + reallen > PCI_REGMAX + 1) return (-1); /* Set the previous capability pointer */ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); } else pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); /* Copy the capability */ for (i = 0; i < caplen; i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); /* Set the next capability pointer */ pci_set_cfgdata8(pi, capoff + 1, 0); pi->pi_prevcap = capoff; pi->pi_capend = capoff + reallen - 1; return (0); } static struct pci_devemu * pci_emul_finddev(const char *name) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; if (!strcmp(pdp->pe_emu, name)) { return (pdp); } } return (NULL); } static int pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, int func, struct funcinfo *fi) { struct pci_devinst *pdi; int err; pdi = calloc(1, sizeof(struct pci_devinst)); pdi->pi_vmctx = ctx; pdi->pi_bus = bus; pdi->pi_slot = slot; pdi->pi_func = func; pthread_mutex_init(&pdi->pi_lintr.lock, NULL); pdi->pi_lintr.pin = 0; pdi->pi_lintr.state = IDLE; pci_irq_init_irq(&pdi->pi_lintr.irq); pdi->pi_d = pde; snprintf(pdi->pi_name, PI_NAMESZ, 
"%s@pci.%d.%d.%d", pde->pe_emu, bus, slot, func); /* Disable legacy interrupts */ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN); err = (*pde->pe_init)(pdi, fi->fi_config); if (err == 0) fi->fi_devi = pdi; else free(pdi); return (err); } void pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; bzero(msicap, sizeof(struct msicap)); msicap->capid = PCIY_MSI; msicap->nextptr = nextptr; msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); } int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) { struct msicap msicap; pci_populate_msicap(&msicap, msgnum, 0); return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); } static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { assert(msix_tab_size % 4096 == 0); bzero(msixcap, sizeof(struct msixcap)); msixcap->capid = PCIY_MSIX; /* * Message Control Register, all fields set to * zero except for the Table Size. 
* Note: Table size N is encoded as N-1 */ msixcap->msgctrl = msgnum - 1; /* * MSI-X BAR setup: * - MSI-X table start at offset 0 * - PBA table starts at a 4K aligned offset after the MSI-X table */ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); } static void pci_msix_table_init(struct pci_devinst *pi, int table_entries) { int i, table_size; assert(table_entries > 0); assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* set mask bit of vector control register */ for (i = 0; i < table_entries; i++) pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) { uint32_t tab_size; struct msixcap msixcap; assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ tab_size = roundup2(tab_size, 4096); pi->pi_msix.table_bar = barnum; pi->pi_msix.pba_bar = barnum; pi->pi_msix.table_offset = 0; pi->pi_msix.table_count = msgnum; pi->pi_msix.pba_offset = tab_size; pi->pi_msix.pba_size = PBA_SIZE(msgnum); pci_msix_table_init(pi, msgnum); pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); /* allocate memory for MSI-X Table and PBA */ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, tab_size + pi->pi_msix.pba_size); return (pci_emul_add_capability(pi, (u_char *)&msixcap, sizeof(msixcap))); } static void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; int off; off = offset - capoff; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; pi->pi_msix.enabled = val & 
PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } static void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask, msgdata, mme; uint32_t addrlo; /* * If guest is writing to the message control register make sure * we do not overwrite read-only fields. */ if ((offset - capoff) == 2 && bytes == 2) { rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; } CFGWRITE(pi, offset, val, bytes); msgctrl = pci_get_cfgdata16(pi, capoff + 2); addrlo = pci_get_cfgdata32(pi, capoff + 4); if (msgctrl & PCIM_MSICTRL_64BIT) msgdata = pci_get_cfgdata16(pi, capoff + 12); else msgdata = pci_get_cfgdata16(pi, capoff + 8); mme = msgctrl & PCIM_MSICTRL_MME_MASK; pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; if (pi->pi_msi.enabled) { pi->pi_msi.addr = addrlo; pi->pi_msi.msg_data = msgdata; pi->pi_msi.maxmsgnum = 1 << (mme >> 4); } else { pi->pi_msi.maxmsgnum = 0; } pci_lintr_update(pi); } static void pciecap_cfgwrite(struct pci_devinst *pi, int capoff __unused, int offset, int bytes, uint32_t val) { /* XXX don't write to the readonly parts */ CFGWRITE(pi, offset, val, bytes); } #define PCIECAP_VERSION 0x2 int pci_emul_add_pciecap(struct pci_devinst *pi, int type) { int err; struct pciecap pciecap; bzero(&pciecap, sizeof(pciecap)); /* * Use the integrated endpoint type for endpoints on a root complex bus. * * NB: bhyve currently only supports a single PCI bus that is the root * complex bus, so all endpoints are integrated. 
*/ if ((type == PCIEM_TYPE_ENDPOINT) && (pi->pi_bus == 0)) type = PCIEM_TYPE_ROOT_INT_EP; pciecap.capid = PCIY_EXPRESS; pciecap.pcie_capabilities = PCIECAP_VERSION | type; if (type != PCIEM_TYPE_ROOT_INT_EP) { pciecap.link_capabilities = 0x411; /* gen1, x1 */ pciecap.link_status = 0x11; /* gen1, x1 */ } err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); return (err); } /* * This function assumes that 'coff' is in the capabilities region of the * config space. A capoff parameter of zero will force a search for the * offset and type. */ void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val, uint8_t capoff, int capid) { uint8_t nextoff; /* Do not allow un-aligned writes */ if ((offset & (bytes - 1)) != 0) return; if (capoff == 0) { /* Find the capability that we want to update */ capoff = CAP_START_OFFSET; while (1) { nextoff = pci_get_cfgdata8(pi, capoff + 1); if (nextoff == 0) break; if (offset >= capoff && offset < nextoff) break; capoff = nextoff; } assert(offset >= capoff); capid = pci_get_cfgdata8(pi, capoff); } /* * Capability ID and Next Capability Pointer are readonly. * However, some o/s's do 4-byte writes that include these. * For this case, trim the write back to 2 bytes and adjust * the data. 
*/ if (offset == capoff || offset == capoff + 1) { if (offset == capoff && bytes == 4) { bytes = 2; offset += 2; val >>= 16; } else return; } switch (capid) { case PCIY_MSI: msicap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_MSIX: msixcap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_EXPRESS: pciecap_cfgwrite(pi, capoff, offset, bytes, val); break; default: break; } } static int pci_emul_iscap(struct pci_devinst *pi, int offset) { uint16_t sts; sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) return (1); } return (0); } static int pci_emul_fallback_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr __unused, int size __unused, uint64_t *val, void *arg1 __unused, long arg2 __unused) { /* * Ignore writes; return 0xff's for reads. The mem read code * will take care of truncating to the correct size. */ if (dir == MEM_F_READ) { *val = 0xffffffffffffffff; } return (0); } static int pci_emul_ecfg_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, int bytes, uint64_t *val, void *arg1 __unused, long arg2 __unused) { int bus, slot, func, coff, in; coff = addr & 0xfff; func = (addr >> 12) & 0x7; slot = (addr >> 15) & 0x1f; bus = (addr >> 20) & 0xff; in = (dir == MEM_F_READ); if (in) *val = ~0UL; pci_cfgrw(in, bus, slot, func, coff, bytes, (uint32_t *)val); return (0); } uint64_t pci_ecfg_base(void) { return (PCI_EMUL_ECFG_BASE); } static int init_bootorder(void) { struct boot_device *device; FILE *fp; char *bootorder; size_t bootorder_len; if (TAILQ_EMPTY(&boot_devices)) return (0); fp = open_memstream(&bootorder, &bootorder_len); TAILQ_FOREACH(device, &boot_devices, boot_device_chain) { fprintf(fp, "/pci@i0cf8/pci@%d,%d\n", device->pdi->pi_slot, device->pdi->pi_func); } fclose(fp); return (qemu_fwcfg_add_file("bootorder", bootorder_len, bootorder)); } #define BUSIO_ROUNDUP 32 #define BUSMEM32_ROUNDUP (1024 * 1024) #define 
BUSMEM64_ROUNDUP (512 * 1024 * 1024) int init_pci(struct vmctx *ctx) { char node_name[sizeof("pci.XXX.XX.X")]; struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; nvlist_t *nvl; const char *emul; size_t lowmem; int bus, slot, func; int error; if (vm_get_lowmem_limit(ctx) > PCI_EMUL_MEMBASE32) errx(EX_OSERR, "Invalid lowmem limit"); pci_emul_iobase = PCI_EMUL_IOBASE; pci_emul_membase32 = PCI_EMUL_MEMBASE32; pci_emul_membase64 = vm_get_highmem_base(ctx) + vm_get_highmem_size(ctx); pci_emul_membase64 = roundup2(pci_emul_membase64, PCI_EMUL_MEMSIZE64); pci_emul_memlim64 = pci_emul_membase64 + PCI_EMUL_MEMSIZE64; TAILQ_INIT(&boot_devices); for (bus = 0; bus < MAXBUSES; bus++) { snprintf(node_name, sizeof(node_name), "pci.%d", bus); nvl = find_config_node(node_name); if (nvl == NULL) continue; pci_businfo[bus] = calloc(1, sizeof(struct businfo)); bi = pci_businfo[bus]; /* * Keep track of the i/o and memory resources allocated to * this bus. 
*/ bi->iobase = pci_emul_iobase; bi->membase32 = pci_emul_membase32; bi->membase64 = pci_emul_membase64; /* first run: init devices */ for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bus, slot, func); nvl = find_config_node(node_name); if (nvl == NULL) continue; fi->fi_config = nvl; emul = get_config_value_node(nvl, "device"); if (emul == NULL) { EPRINTLN("pci slot %d:%d:%d: missing " "\"device\" value", bus, slot, func); return (EINVAL); } pde = pci_emul_finddev(emul); if (pde == NULL) { EPRINTLN("pci slot %d:%d:%d: unknown " "device \"%s\"", bus, slot, func, emul); return (EINVAL); } if (pde->pe_alias != NULL) { EPRINTLN("pci slot %d:%d:%d: legacy " "device \"%s\", use \"%s\" instead", bus, slot, func, emul, pde->pe_alias); return (EINVAL); } fi->fi_pde = pde; error = pci_emul_init(ctx, pde, bus, slot, func, fi); if (error) return (error); } } /* second run: assign BARs and free list */ struct pci_bar_allocation *bar; struct pci_bar_allocation *bar_tmp; TAILQ_FOREACH_SAFE(bar, &pci_bars, chain, bar_tmp) { pci_emul_assign_bar(bar->pdi, bar->idx, bar->type, bar->size); free(bar); } TAILQ_INIT(&pci_bars); /* * Add some slop to the I/O and memory resources decoded by * this bus to give a guest some flexibility if it wants to * reprogram the BARs. */ pci_emul_iobase += BUSIO_ROUNDUP; pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); bi->iolimit = pci_emul_iobase; pci_emul_membase32 += BUSMEM32_ROUNDUP; pci_emul_membase32 = roundup2(pci_emul_membase32, BUSMEM32_ROUNDUP); bi->memlimit32 = pci_emul_membase32; pci_emul_membase64 += BUSMEM64_ROUNDUP; pci_emul_membase64 = roundup2(pci_emul_membase64, BUSMEM64_ROUNDUP); bi->memlimit64 = pci_emul_membase64; } /* * PCI backends are initialized before routing INTx interrupts * so that LPC devices are able to reserve ISA IRQs before * routing PIRQ pins. 
*/ for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; pci_lintr_route(fi->fi_devi); } } } #ifdef __amd64__ lpc_pirq_routed(); #endif if ((error = init_bootorder()) != 0) { warnx("%s: Unable to init bootorder", __func__); return (error); } /* * The guest physical memory map looks like the following on amd64: * [0, lowmem) guest system memory * [lowmem, 0xC0000000) memory hole (may be absent) * [0xC0000000, 0xE0000000) PCI hole (32-bit BAR allocation) * [0xE0000000, 0xF0000000) PCI extended config window * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) guest system memory * [roundup(4GB + highmem, 32GB), ...) PCI 64-bit BAR allocation * * On arm64 the guest physical memory map looks like this: * [0x0DF00000, 0x10000000) PCI I/O memory * [0xA0000000, 0xE0000000) PCI 32-bit BAR allocation * [0xE0000000, 0xF0000000) PCI extended config window * [4GB, 4GB + highmem) guest system memory * [roundup(4GB + highmem, 32GB), ...) PCI 64-bit BAR allocation * * "lowmem" is guest memory below 0xC0000000. amd64 guests provisioned * with less than 3GB of RAM will have no memory above the 4GB boundary. * System memory for arm64 guests is all above the 4GB boundary. */ /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. 
*/ lowmem = vm_get_lowmem_size(ctx); bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI hole"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = lowmem; mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; mr.handler = pci_emul_fallback_handler; error = register_mem_fallback(&mr); assert(error == 0); /* PCI extended config space */ bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI ECFG"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = PCI_EMUL_ECFG_BASE; mr.size = PCI_EMUL_ECFG_SIZE; mr.handler = pci_emul_ecfg_handler; error = register_mem(&mr); assert(error == 0); return (0); } #ifdef __amd64__ static void pci_apic_prt_entry(int bus __unused, int slot, int pin, struct pci_irq *irq, void *arg __unused) { dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" Zero,"); dsdt_line(" 0x%X", irq->ioapic_irq); dsdt_line(" },"); } static void pci_pirq_prt_entry(int bus __unused, int slot, int pin, struct pci_irq *irq, void *arg __unused) { char *name; name = lpc_pirq_name(irq->pirq_pin); if (name == NULL) return; dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" %s,", name); dsdt_line(" 0x00"); dsdt_line(" },"); free(name); } #endif /* * A bhyve virtual machine has a flat PCI hierarchy with a root port * corresponding to each PCI bus. */ static void pci_bus_write_dsdt(int bus) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; int func, slot; /* * If there are no devices on this 'bus' then just return. */ if ((bi = pci_businfo[bus]) == NULL) { /* * Bus 0 is special because it decodes the I/O ports used * for PCI config space access even if there are no devices * on it. 
*/ if (bus != 0) return; } dsdt_line(" Device (PC%02X)", bus); dsdt_line(" {"); dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); dsdt_line(" Method (_BBN, 0, NotSerialized)"); dsdt_line(" {"); dsdt_line(" Return (0x%08X)", bus); dsdt_line(" }"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " "MaxFixed, PosDecode,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bus); dsdt_line(" 0x%04X, // Range Maximum", bus); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0001, // Length"); dsdt_line(" ,, )"); #ifdef __amd64__ if (bus == 0) { dsdt_indent(3); dsdt_fixed_ioport(0xCF8, 8); dsdt_unindent(3); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0000, // Range Minimum"); dsdt_line(" 0x0CF7, // Range Maximum"); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0CF8, // Length"); dsdt_line(" ,, , TypeStatic)"); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0D00, // Range Minimum"); dsdt_line(" 0x%04X, // Range Maximum", PCI_EMUL_IOBASE - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", PCI_EMUL_IOBASE - 0x0D00); dsdt_line(" ,, , TypeStatic)"); if (bi == NULL) { dsdt_line(" })"); goto done; } } #endif assert(bi != NULL); /* i/o window */ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); dsdt_line(" 0x%04X, // Range Maximum", bi->iolimit - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", bi->iolimit - bi->iobase); dsdt_line(" ,, , TypeStatic)"); /* mmio window (32-bit) */ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); 
dsdt_line(" 0x00000000, // Granularity"); dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); dsdt_line(" 0x%08X, // Range Maximum\n", bi->memlimit32 - 1); dsdt_line(" 0x00000000, // Translation Offset"); dsdt_line(" 0x%08X, // Length\n", bi->memlimit32 - bi->membase32); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); /* mmio window (64-bit) */ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x0000000000000000, // Granularity"); dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); dsdt_line(" 0x%016lX, // Range Maximum\n", bi->memlimit64 - 1); dsdt_line(" 0x0000000000000000, // Translation Offset"); dsdt_line(" 0x%016lX, // Length\n", bi->memlimit64 - bi->membase64); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); dsdt_line(" })"); #ifdef __amd64__ if (pci_count_lintr(bus) != 0) { dsdt_indent(2); dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); dsdt_line(" {"); dsdt_line(" Return (APRT)"); dsdt_line(" }"); dsdt_line(" Else"); dsdt_line(" {"); dsdt_line(" Return (PPRT)"); dsdt_line(" }"); dsdt_line("}"); dsdt_unindent(2); } #endif dsdt_indent(2); for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { pi = si->si_funcs[func].fi_devi; if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) pi->pi_d->pe_write_dsdt(pi); } } dsdt_unindent(2); #ifdef __amd64__ done: #endif dsdt_line(" }"); } void pci_write_dsdt(void) { int bus; dsdt_indent(1); dsdt_line("Name (PICM, 0x00)"); dsdt_line("Method (_PIC, 1, NotSerialized)"); dsdt_line("{"); dsdt_line(" Store (Arg0, PICM)"); dsdt_line("}"); dsdt_line(""); dsdt_line("Scope (_SB)"); dsdt_line("{"); for (bus = 0; bus < MAXBUSES; 
bus++) pci_bus_write_dsdt(bus); dsdt_line("}"); dsdt_unindent(1); } int pci_bus_configured(int bus) { assert(bus >= 0 && bus < MAXBUSES); return (pci_businfo[bus] != NULL); } int pci_msi_enabled(struct pci_devinst *pi) { return (pi->pi_msi.enabled); } int pci_msi_maxmsgnum(struct pci_devinst *pi) { if (pi->pi_msi.enabled) return (pi->pi_msi.maxmsgnum); else return (0); } int pci_msix_enabled(struct pci_devinst *pi) { return (pi->pi_msix.enabled && !pi->pi_msi.enabled); } void pci_generate_msix(struct pci_devinst *pi, int index) { struct msix_table_entry *mte; if (!pci_msix_enabled(pi)) return; if (pi->pi_msix.function_mask) return; if (index >= pi->pi_msix.table_count) return; mte = &pi->pi_msix.table[index]; if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* XXX Set PBA bit if interrupt is disabled */ vm_raise_msi(pi->pi_vmctx, mte->addr, mte->msg_data, pi->pi_bus, pi->pi_slot, pi->pi_func); } } void pci_generate_msi(struct pci_devinst *pi, int index) { if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { vm_raise_msi(pi->pi_vmctx, pi->pi_msi.addr, pi->pi_msi.msg_data + index, pi->pi_bus, pi->pi_slot, pi->pi_func); } } static bool pci_lintr_permitted(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || (cmd & PCIM_CMD_INTxDIS))); } void pci_lintr_request(struct pci_devinst *pi) { struct businfo *bi; struct slotinfo *si; int bestpin, bestcount, pin; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); /* * Just allocate a pin from our slot. The pin will be * assigned IRQs later when interrupts are routed. 
*/
	si = &bi->slotinfo[pi->pi_slot];

	/* Pick the least-used INTx pin on this slot (ties go to the lowest pin). */
	bestpin = 0;
	bestcount = si->si_intpins[0].ii_count;
	for (pin = 1; pin < 4; pin++) {
		if (si->si_intpins[pin].ii_count < bestcount) {
			bestpin = pin;
			bestcount = si->si_intpins[pin].ii_count;
		}
	}

	si->si_intpins[bestpin].ii_count++;
	/* PCI INTPIN config encoding is 1-based (1 = INTA#). */
	pi->pi_lintr.pin = bestpin + 1;
	pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1);
}

/*
 * Route the legacy INTx pin previously allocated by pci_lintr_request()
 * to a host interrupt line and record the result in the INTLINE config
 * register.  No-op for devices that never requested a pin.
 */
static void
pci_lintr_route(struct pci_devinst *pi)
{
	struct businfo *bi;
	struct intxinfo *ii;
	struct pci_irq *irq;

	if (pi->pi_lintr.pin == 0)
		return;

	bi = pci_businfo[pi->pi_bus];
	assert(bi != NULL);
	ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1];
	irq = &ii->ii_irq;
	pci_irq_route(pi, irq);
	pi->pi_lintr.irq = *irq;
	pci_set_cfgdata8(pi, PCIR_INTLINE, pci_irq_intline(irq));
}

/*
 * Assert this device's INTx line.  If INTx delivery is currently not
 * permitted (see pci_lintr_permitted(): MSI/MSI-X enabled, or INTx
 * disabled via the command register), the assertion is remembered as
 * PENDING and delivered later by pci_lintr_update().
 */
void
pci_lintr_assert(struct pci_devinst *pi)
{
	assert(pi->pi_lintr.pin > 0);

	pthread_mutex_lock(&pi->pi_lintr.lock);
	if (pi->pi_lintr.state == IDLE) {
		if (pci_lintr_permitted(pi)) {
			pi->pi_lintr.state = ASSERTED;
			pci_irq_assert(pi);
		} else
			pi->pi_lintr.state = PENDING;
	}
	pthread_mutex_unlock(&pi->pi_lintr.lock);
}

/* Deassert this device's INTx line (also cancels a PENDING assertion). */
void
pci_lintr_deassert(struct pci_devinst *pi)
{
	assert(pi->pi_lintr.pin > 0);

	pthread_mutex_lock(&pi->pi_lintr.lock);
	if (pi->pi_lintr.state == ASSERTED) {
		pi->pi_lintr.state = IDLE;
		pci_irq_deassert(pi);
	} else if (pi->pi_lintr.state == PENDING)
		pi->pi_lintr.state = IDLE;
	pthread_mutex_unlock(&pi->pi_lintr.lock);
}

/*
 * Re-evaluate INTx delivery after a change to the conditions checked by
 * pci_lintr_permitted(): park an ASSERTED interrupt as PENDING when INTx
 * becomes masked, and deliver a PENDING one when it becomes unmasked.
 */
static void
pci_lintr_update(struct pci_devinst *pi)
{
	pthread_mutex_lock(&pi->pi_lintr.lock);
	if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) {
		pci_irq_deassert(pi);
		pi->pi_lintr.state = PENDING;
	} else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) {
		pi->pi_lintr.state = ASSERTED;
		pci_irq_assert(pi);
	}
	pthread_mutex_unlock(&pi->pi_lintr.lock);
}

/*
 * Count the allocated INTx pins on 'bus'.  Returns 0 when the bus does
 * not exist.
 */
int
pci_count_lintr(int bus)
{
	int count, slot, pin;
	struct slotinfo *slotinfo;

	count = 0;
	if (pci_businfo[bus] != NULL) {
		for (slot = 0; slot < MAXSLOTS; slot++) {
			slotinfo = &pci_businfo[bus]->slotinfo[slot];
			for (pin = 0; pin < 4;
pin++) {
				if (slotinfo->si_intpins[pin].ii_count != 0)
					count++;
			}
		}
	}

	return (count);
}

/*
 * Invoke 'cb' once for every allocated INTx pin on 'bus', passing the
 * 1-based pin number and the routed IRQ.  'arg' is forwarded verbatim.
 */
void
pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg)
{
	struct businfo *bi;
	struct slotinfo *si;
	struct intxinfo *ii;
	int slot, pin;

	if ((bi = pci_businfo[bus]) == NULL)
		return;

	for (slot = 0; slot < MAXSLOTS; slot++) {
		si = &bi->slotinfo[slot];
		for (pin = 0; pin < 4; pin++) {
			ii = &si->si_intpins[pin];
			if (ii->ii_count != 0)
				cb(bus, slot, pin + 1, &ii->ii_irq, arg);
		}
	}
}

/*
 * Return 1 if the emulated device in 'slot' is a multi-function device.
 * Return 0 otherwise.
 */
static int
pci_emul_is_mfdev(int bus, int slot)
{
	struct businfo *bi;
	struct slotinfo *si;
	int f, numfuncs;

	numfuncs = 0;
	if ((bi = pci_businfo[bus]) != NULL) {
		si = &bi->slotinfo[slot];
		for (f = 0; f < MAXFUNCS; f++) {
			if (si->si_funcs[f].fi_devi != NULL) {
				numfuncs++;
			}
		}
	}
	return (numfuncs > 1);
}

/*
 * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on
 * whether or not is a multi-function being emulated in the pci 'slot'.
 */
static void
pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)
{
	int mfdev;

	/* Only patch reads that actually cover the header-type register. */
	if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) {
		mfdev = pci_emul_is_mfdev(bus, slot);
		switch (bytes) {
		case 1:
		case 2:
			*rv &= ~PCIM_MFDEV;
			if (mfdev) {
				*rv |= PCIM_MFDEV;
			}
			break;
		case 4:
			/* HDRTYPE sits in the third byte of a dword read. */
			*rv &= ~(PCIM_MFDEV << 16);
			if (mfdev) {
				*rv |= (PCIM_MFDEV << 16);
			}
			break;
		}
	}
}

/*
 * Update device state in response to changes to the PCI command
 * register.  'old' is the previous command register value; the current
 * value is re-read from config space.
 */
void
pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old)
{
	int i;
	uint16_t changed, new;

	new = pci_get_cfgdata16(pi, PCIR_COMMAND);
	changed = old ^ new;

	/*
	 * If the MMIO or I/O address space decoding has changed then
	 * register/unregister all BARs that decode that address space.
	 */
	for (i = 0; i <= PCI_BARMAX_WITH_ROM; i++) {
		switch (pi->pi_bar[i].type) {
		case PCIBAR_NONE:
		case PCIBAR_MEMHI64:
			break;
		case PCIBAR_IO:
			/* I/O address space decoding changed?
*/ if (changed & PCIM_CMD_PORTEN) { if (new & PCIM_CMD_PORTEN) register_bar(pi, i); else unregister_bar(pi, i); } break; case PCIBAR_ROM: /* skip (un-)register of ROM if it disabled */ if (!romen(pi)) break; /* fallthrough */ case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ if (changed & PCIM_CMD_MEMEN) { if (new & PCIM_CMD_MEMEN) register_bar(pi, i); else unregister_bar(pi, i); } break; default: assert(0); } } /* * If INTx has been unmasked and is pending, assert the * interrupt. */ pci_lintr_update(pi); } static void pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) { int rshift; uint32_t cmd, old, readonly; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ /* * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. * * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are * 'write 1 to clear'. However these bits are not set to '1' by * any device emulation so it is simpler to treat them as readonly. */ rshift = (coff & 0x3) * 8; readonly = 0xFFFFF880 >> rshift; old = CFGREAD(pi, coff, bytes); new &= ~readonly; new |= (old & readonly); CFGWRITE(pi, coff, new, bytes); /* update config */ pci_emul_cmd_changed(pi, cmd); } static void pci_cfgrw(int in, int bus, int slot, int func, int coff, int bytes, uint32_t *valp) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; struct pci_devemu *pe; int idx, needcfg; uint64_t addr, bar, mask; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; pi = si->si_funcs[func].fi_devi; } else pi = NULL; /* * Just return if there is no device at this slot:func or if the - * the guest is doing an un-aligned access. + * guest is doing an un-aligned access. */ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || (coff & (bytes - 1)) != 0) { if (in) *valp = 0xffffffff; return; } /* * Ignore all writes beyond the standard config space and return all * ones on reads. 
*/ if (coff >= PCI_REGMAX + 1) { if (in) { *valp = 0xffffffff; /* * Extended capabilities begin at offset 256 in config * space. Absence of extended capabilities is signaled * with all 0s in the extended capability header at * offset 256. */ if (coff <= PCI_REGMAX + 4) *valp = 0x00000000; } return; } pe = pi->pi_d; /* * Config read */ if (in) { /* Let the device emulation override the default handler */ if (pe->pe_cfgread != NULL) { needcfg = pe->pe_cfgread(pi, coff, bytes, valp); } else { needcfg = 1; } if (needcfg) *valp = CFGREAD(pi, coff, bytes); pci_emul_hdrtype_fixup(bus, slot, coff, bytes, valp); } else { /* Let the device emulation override the default handler */ if (pe->pe_cfgwrite != NULL && (*pe->pe_cfgwrite)(pi, coff, bytes, *valp) == 0) return; /* * Special handling for write to BAR and ROM registers */ if (is_pcir_bar(coff) || is_pcir_bios(coff)) { /* * Ignore writes to BAR registers that are not * 4-byte aligned. */ if (bytes != 4 || (coff & 0x3) != 0) return; if (is_pcir_bar(coff)) { idx = (coff - PCIR_BAR(0)) / 4; } else if (is_pcir_bios(coff)) { idx = PCI_ROM_IDX; } else { errx(4, "%s: invalid BAR offset %d", __func__, coff); } mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: pi->pi_bar[idx].addr = bar = 0; break; case PCIBAR_IO: addr = *valp & mask; #if defined(PCI_EMUL_IOMASK) addr &= PCI_EMUL_IOMASK; #endif bar = addr | pi->pi_bar[idx].lobits; /* * Register the new BAR value for interception */ if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_IO); } break; case PCIBAR_MEM32: addr = bar = *valp & mask; bar |= pi->pi_bar[idx].lobits; if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM32); } break; case PCIBAR_MEM64: addr = bar = *valp & mask; bar |= pi->pi_bar[idx].lobits; if (addr != (uint32_t)pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM64); } break; case PCIBAR_MEMHI64: mask = ~(pi->pi_bar[idx - 1].size - 1); addr = 
((uint64_t)*valp << 32) & mask; bar = addr >> 32; if (bar != pi->pi_bar[idx - 1].addr >> 32) { update_bar_address(pi, addr, idx - 1, PCIBAR_MEMHI64); } break; case PCIBAR_ROM: addr = bar = *valp & mask; if (memen(pi) && romen(pi)) { unregister_bar(pi, idx); } pi->pi_bar[idx].addr = addr; pi->pi_bar[idx].lobits = *valp & PCIM_BIOS_ENABLE; /* romen could have changed it value */ if (memen(pi) && romen(pi)) { register_bar(pi, idx); } bar |= pi->pi_bar[idx].lobits; break; default: assert(0); } pci_set_cfgdata32(pi, coff, bar); } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *valp, 0, 0); } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { pci_emul_cmdsts_write(pi, coff, *valp, bytes); } else { CFGWRITE(pi, coff, *valp, bytes); } } } #ifdef __amd64__ static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; static int pci_emul_cfgaddr(struct vmctx *ctx __unused, int in, int port __unused, int bytes, uint32_t *eax, void *arg __unused) { uint32_t x; if (bytes != 4) { if (in) *eax = (bytes == 2) ? 
0xffff : 0xff;
		return (0);
	}

	if (in) {
		/* Reconstruct the current CONF1 address register value. */
		x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff;
		if (cfgenable)
			x |= CONF1_ENABLE;
		*eax = x;
	} else {
		x = *eax;
		cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE;
		/* Latched register offset is kept dword-aligned. */
		cfgoff = (x & PCI_REGMAX) & ~0x03;
		cfgfunc = (x >> 8) & PCI_FUNCMAX;
		cfgslot = (x >> 11) & PCI_SLOTMAX;
		cfgbus = (x >> 16) & PCI_BUSMAX;
	}

	return (0);
}
INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);

/*
 * Handler for the CONF1 data window (CONF1_DATA_PORT .. +3): forward the
 * access to pci_cfgrw() for the bus/slot/function/offset latched in the
 * address register.  When decoding is not enabled, reads return all-ones
 * and writes are dropped.
 */
static int
pci_emul_cfgdata(struct vmctx *ctx __unused, int in, int port, int bytes,
    uint32_t *eax, void *arg __unused)
{
	int coff;

	assert(bytes == 1 || bytes == 2 || bytes == 4);

	/* Sub-dword accesses are offset by the port within the 4-byte window. */
	coff = cfgoff + (port - CONF1_DATA_PORT);
	if (cfgenable) {
		pci_cfgrw(in, cfgbus, cfgslot, cfgfunc, coff, bytes, eax);
	} else {
		/* Ignore accesses to cfgdata if not enabled by cfgaddr */
		if (in)
			*eax = 0xffffffff;
	}
	return (0);
}

INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);

#endif

#ifdef BHYVE_SNAPSHOT
/*
 * Saves/restores PCI device emulated state. Returns 0 on success.
*/
static int
pci_snapshot_pci_dev(struct vm_snapshot_meta *meta)
{
	struct pci_devinst *pi;
	int i;
	int ret;

	pi = meta->dev_data;

	/* Generic MSI state. */
	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done);

	/* Generic MSI-X state. */
	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done);

	/* Raw config space contents. */
	SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata),
	    meta, ret, done);

	/* BAR layout. */
	for (i = 0; i < (int)nitems(pi->pi_bar); i++) {
		SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done);
	}

	/* Restore MSI-X table. */
	for (i = 0; i < pi->pi_msix.table_count; i++) {
		SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr,
		    meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data,
		    meta, ret, done);
		SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control,
		    meta, ret, done);
	}

done:
	return (ret);
}

/*
 * Snapshot one PCI device: the generic PCI state above, then the
 * device-model-specific state via its pe_snapshot callback.  Returns
 * ENOTSUP for device models that do not implement snapshotting.
 */
int
pci_snapshot(struct vm_snapshot_meta *meta)
{
	struct pci_devemu *pde;
	struct pci_devinst *pdi;
	int ret;

	assert(meta->dev_name != NULL);

	pdi = meta->dev_data;
	pde = pdi->pi_d;

	if (pde->pe_snapshot == NULL)
		return (ENOTSUP);

	ret = pci_snapshot_pci_dev(meta);
	if (ret == 0)
		ret = (*pde->pe_snapshot)(meta);

	return (ret);
}

/* Pause device activity via the optional pe_pause callback. */
int
pci_pause(struct pci_devinst *pdi)
{
	struct pci_devemu *pde = pdi->pi_d;

	if (pde->pe_pause == NULL) {
		/* The pause/resume functionality is optional.
*/ return (0); } return (*pde->pe_pause)(pdi); } int pci_resume(struct pci_devinst *pdi) { struct pci_devemu *pde = pdi->pi_d; if (pde->pe_resume == NULL) { /* The pause/resume functionality is optional. */ return (0); } return (*pde->pe_resume)(pdi); } #endif #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* * Define a dummy test device */ #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { uint8_t ioregs[DIOSZ]; uint8_t memregs[2][DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 #define PCI_EMUL_MSIX_MSGS 16 static int pci_emul_dinit(struct pci_devinst *pi, nvlist_t *nvl __unused) { int error; struct pci_emul_dsoftc *sc; sc = calloc(1, sizeof(struct pci_emul_dsoftc)); pi->pi_arg = sc; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); assert(error == 0); error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); assert(error == 0); return (0); } static void pci_emul_diow(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { int i; struct pci_emul_dsoftc *sc = pi->pi_arg; if (baridx == 0) { if (offset + size > DIOSZ) { printf("diow: iow too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->ioregs[offset] = value & 0xff; } else if (size == 2) { *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; } else if (size == 4) { *(uint32_t *)&sc->ioregs[offset] = value; } else { printf("diow: iow unknown size %d\n", size); } /* * Special magic value to generate an interrupt */ if (offset == 4 && size == 4 && pci_msi_enabled(pi)) pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); if (value == 0xabcdef) { for (i = 0; i < pci_msi_maxmsgnum(pi); i++) pci_generate_msi(pi, i); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("diow: 
memw too large, offset %ld size %d\n", offset, size); return; } i = baridx - 1; /* 'memregs' index */ if (size == 1) { sc->memregs[i][offset] = value; } else if (size == 2) { *(uint16_t *)&sc->memregs[i][offset] = value; } else if (size == 4) { *(uint32_t *)&sc->memregs[i][offset] = value; } else if (size == 8) { *(uint64_t *)&sc->memregs[i][offset] = value; } else { printf("diow: memw unknown size %d\n", size); } /* * magic interrupt ?? */ } if (baridx > 2 || baridx < 0) { printf("diow: unknown bar idx %d\n", baridx); } } static uint64_t pci_emul_dior(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; int i; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } value = 0; if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->ioregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->ioregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } i = baridx - 1; /* 'memregs' index */ if (size == 1) { value = sc->memregs[i][offset]; } else if (size == 2) { value = *(uint16_t *) &sc->memregs[i][offset]; } else if (size == 4) { value = *(uint32_t *) &sc->memregs[i][offset]; } else if (size == 8) { value = *(uint64_t *) &sc->memregs[i][offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx > 2 || baridx < 0) { printf("dior: unknown bar idx %d\n", baridx); return (0); } return (value); } #ifdef BHYVE_SNAPSHOT struct pci_devinst * pci_next(const struct pci_devinst *cursor) { unsigned bus = 0, slot = 0, func = 0; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; bus = cursor ? cursor->pi_bus : 0; slot = cursor ? cursor->pi_slot : 0; func = cursor ? 
(cursor->pi_func + 1) : 0; for (; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; if (slot >= MAXSLOTS) slot = 0; for (; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; if (func >= MAXFUNCS) func = 0; for (; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; return (fi->fi_devi); } } } return (NULL); } static int pci_emul_snapshot(struct vm_snapshot_meta *meta __unused) { return (0); } #endif static const struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, .pe_barread = pci_emul_dior, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_emul_snapshot, #endif }; PCI_EMUL_SET(pci_dummy); #endif /* PCI_EMUL_TEST */ diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h index 2b72b862ab21..a4f37034d93d 100644 --- a/usr.sbin/bhyve/virtio.h +++ b/usr.sbin/bhyve/virtio.h @@ -1,435 +1,435 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 Chris Torek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _BHYVE_VIRTIO_H_ #define _BHYVE_VIRTIO_H_ #include #include #include #include /* * These are derived from several virtio specifications. * * Some useful links: * https://github.com/rustyrussell/virtio-spec * http://people.redhat.com/pbonzini/virtio-spec.pdf */ /* * A virtual device has zero or more "virtual queues" (virtqueue). * Each virtqueue uses at least two 4096-byte pages, laid out thus: * * +-----------------------------------------------+ * | "desc": descriptors, 16 bytes each | * | ----------------------------------------- | * | "avail": 2 uint16; uint16; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * | "used": 2 x uint16; elems; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * * The number that appears here is always a power of two and is * limited to no more than 32768 (as it must fit in a 16-bit field). * If is sufficiently large, the above will occupy more than * two pages. In any case, all pages must be physically contiguous * within the guest's physical address space. * * The 16-byte "desc" descriptors consist of a 64-bit guest * physical address , a 32-bit length , a 16-bit * , and a 16-bit field (all in guest byte order). 
* * There are three flags that may be set : * NEXT descriptor is chained, so use its "next" field * WRITE descriptor is for host to write into guest RAM * (else host is to read from guest RAM) * INDIRECT descriptor address field is (guest physical) * address of a linear array of descriptors * * Unless INDIRECT is set, is the number of bytes that may * be read/written from guest physical address . If * INDIRECT is set, WRITE is ignored and provides the length * of the indirect descriptors (and must be a multiple of * 16). Note that NEXT may still be set in the main descriptor * pointing to the indirect, and should be set in each indirect * descriptor that uses the next descriptor (these should generally * be numbered sequentially). However, INDIRECT must not be set * in the indirect descriptors. Upon reaching an indirect descriptor * without a NEXT bit, control returns to the direct descriptors. * * Except inside an indirect, each value must be in the * range [0 .. N) (i.e., the half-open interval). (Inside an * indirect, each must be in the range [0 .. /16).) * * The "avail" data structures reside in the same pages as the * "desc" structures since both together are used by the device to * pass information to the hypervisor's virtual driver. These * begin with a 16-bit field and 16-bit index , then * have 16-bit values, followed by one final 16-bit * field . The entries are simply indices - * indices into the descriptor ring (and thus must meet the same + * into the descriptor ring (and thus must meet the same * constraints as each value). However, is counted * up from 0 (initially) and simply wraps around after 65535; it * is taken mod to find the next available entry. * * The "used" ring occupies a separate page or pages, and contains * values written from the virtual driver back to the guest OS. * This begins with a 16-bit and 16-bit , then there * are "vring_used" elements, followed by a 16-bit . 
* The "vring_used" elements consist of a 32-bit and a * 32-bit (vu_tlen below). The is simply the index of * the head of a descriptor chain the guest made available * earlier, and the is the number of bytes actually written, * e.g., in the case of a network driver that provided a large * receive buffer but received only a small amount of data. * * The two event fields, and , in the * avail and used rings (respectively -- note the reversal!), are * always provided, but are used only if the virtual device * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature * negotiation. Similarly, both rings provide a flag -- * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in * their field, indicating that the guest does not need an * interrupt, or that the hypervisor driver does not need a * notify, when descriptors are added to the corresponding ring. * (These are provided only for interrupt optimization and need * not be implemented.) */ #define VRING_ALIGN 4096 /* * The address of any given virtual queue is determined by a single * Page Frame Number register. The guest writes the PFN into the * PCI config space. However, a device that has two or more * virtqueues can have a different PFN, and size, for each queue. * The number of queues is determinable via the PCI config space * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means * queue #0, 1 means queue#1, etc. Once a queue is selected, the * remaining PFN and QNUM registers refer to that queue. * * QNUM is a read-only register containing a nonzero power of two * that indicates the (hypervisor's) queue size. Or, if reading it * produces zero, the hypervisor does not have a corresponding * queue. (The number of possible queues depends on the virtual * device. The block device has just one; the network device * provides either two -- 0 = receive, 1 = transmit -- or three, * with 2 = control.) 
* * PFN is a read/write register giving the physical page address of * the virtqueue in guest memory (the guest must allocate enough space * based on the hypervisor's provided QNUM). * * QNOTIFY is effectively write-only: when the guest writes a queue * number to the register, the hypervisor should scan the specified * virtqueue. (Reading QNOTIFY currently always gets 0). */ /* * PFN register shift amount */ #define VRING_PFN 12 /* * PCI vendor/device IDs */ #define VIRTIO_VENDOR 0x1AF4 #define VIRTIO_DEV_NET 0x1000 #define VIRTIO_DEV_BLOCK 0x1001 #define VIRTIO_DEV_CONSOLE 0x1003 #define VIRTIO_DEV_SCSI 0x1004 #define VIRTIO_DEV_RANDOM 0x1005 #define VIRTIO_DEV_9P 0x1009 #define VIRTIO_DEV_INPUT 0x1052 /* * PCI revision IDs */ #define VIRTIO_REV_INPUT 1 /* * PCI subvendor IDs */ #define VIRTIO_SUBVEN_INPUT 0x108E /* * PCI subdevice IDs */ #define VIRTIO_SUBDEV_INPUT 0x1100 /* From section 2.3, "Virtqueue Configuration", of the virtio specification */ static inline int vring_size_aligned(u_int qsz) { return (roundup2(vring_size(qsz, VRING_ALIGN), VRING_ALIGN)); } struct pci_devinst; struct vqueue_info; struct vm_snapshot_meta; /* * A virtual device, with some number (possibly 0) of virtual * queues and some size (possibly 0) of configuration-space * registers private to the device. The virtio_softc should come * at the front of each "derived class", so that a pointer to the * virtio_softc is also a pointer to the more specific, derived- * from-virtio driver's softc. * * Note: inside each hypervisor virtio driver, changes to these * data structures must be locked against other threads, if any. * Except for PCI config space register read/write, we assume each * driver does the required locking, but we need a pointer to the * lock (if there is one) for PCI config space read/write ops. * * When the guest reads or writes the device's config space, the * generic layer checks for operations on the special registers * described above. 
If the offset of the register(s) being read * or written is past the CFG area (CFG0 or CFG1), the request is * passed on to the virtual device, after subtracting off the * generic-layer size. (So, drivers can just use the offset as * an offset into "struct config", for instance.) * * (The virtio layer also makes sure that the read or write is to/ * from a "good" config offset, hence vc_cfgsize, and on BAR #0. * However, the driver must verify the read or write size and offset * and that no one is writing a readonly register.) * * The BROKED flag ("this thing done gone and broked") is for future * use. */ #define VIRTIO_USE_MSIX 0x01 #define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ #define VIRTIO_BROKED 0x08 /* ??? */ struct virtio_softc { struct virtio_consts *vs_vc; /* constants (see below) */ int vs_flags; /* VIRTIO_* flags from above */ pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ struct pci_devinst *vs_pi; /* PCI device instance */ uint32_t vs_negotiated_caps; /* negotiated capabilities */ struct vqueue_info *vs_queues; /* one per vc_nvq */ int vs_curq; /* current queue */ uint8_t vs_status; /* value from last status write */ uint8_t vs_isr; /* ISR flags, if not MSI-X */ uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */ }; #define VS_LOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_lock(vs->vs_mtx); \ } while (0) #define VS_UNLOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_unlock(vs->vs_mtx); \ } while (0) struct virtio_consts { const char *vc_name; /* name of driver (for diagnostics) */ int vc_nvq; /* number of virtual queues */ size_t vc_cfgsize; /* size of dev-specific config regs */ void (*vc_reset)(void *); /* called on virtual device reset */ void (*vc_qnotify)(void *, struct vqueue_info *); /* called on QNOTIFY if no VQ notify */ int (*vc_cfgread)(void *, int, int, uint32_t *); /* called to read config regs */ int (*vc_cfgwrite)(void *, int, int, uint32_t); /* called to write config regs */ void 
(*vc_apply_features)(void *, uint64_t); /* called to apply negotiated features */ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ void (*vc_pause)(void *); /* called to pause device activity */ void (*vc_resume)(void *); /* called to resume device activity */ int (*vc_snapshot)(void *, struct vm_snapshot_meta *); /* called to save / restore device state */ }; /* * Data structure allocated (statically) per virtual queue. * * Drivers may change vq_qsize after a reset. When the guest OS * requests a device reset, the hypervisor first calls * vs->vs_vc->vc_reset(); then the data structure below is * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). * * The remaining fields should only be fussed-with by the generic * code. * * Note: the addresses of vq_desc, vq_avail, and vq_used are all * computable from each other, but it's a lot simpler if we just * keep a pointer to each one. The event indices are similarly * (but more easily) computable, and this time we'll compute them: * they're just XX_ring[N]. */ #define VQ_ALLOC 0x01 /* set once we have a pfn */ #define VQ_BROKED 0x02 /* ??? */ struct vqueue_info { uint16_t vq_qsize; /* size of this queue (a power of 2) */ void (*vq_notify)(void *, struct vqueue_info *); /* called instead of vc_notify, if not NULL */ struct virtio_softc *vq_vs; /* backpointer to softc */ uint16_t vq_num; /* we're the num'th queue in the softc */ uint16_t vq_flags; /* flags (see above) */ uint16_t vq_last_avail; /* a recent value of vq_avail->idx */ uint16_t vq_next_used; /* index of the next used slot to be filled */ uint16_t vq_save_used; /* saved vq_used->idx; see vq_endchains */ uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */ uint32_t vq_pfn; /* PFN of virt queue (not shifted!) 
*/
	struct vring_desc *vq_desc;	/* descriptor array */
	struct vring_avail *vq_avail;	/* the "avail" ring */
	struct vring_used *vq_used;	/* the "used" ring */
};

/* as noted above, these are sort of backwards, name-wise */
#define VQ_AVAIL_EVENT_IDX(vq) \
	(*(uint16_t *)&(vq)->vq_used->ring[(vq)->vq_qsize])
#define VQ_USED_EVENT_IDX(vq) \
	((vq)->vq_avail->ring[(vq)->vq_qsize])

/*
 * Is this ring ready for I/O?  True once a PFN has been assigned
 * (VQ_ALLOC set).
 */
static inline int
vq_ring_ready(struct vqueue_info *vq)
{
	return (vq->vq_flags & VQ_ALLOC);
}

/*
 * Are there "available" descriptors?  (This does not count
 * how many, just returns True if there are some.)
 */
static inline int
vq_has_descs(struct vqueue_info *vq)
{
	return (vq_ring_ready(vq) && vq->vq_last_avail !=
	    vq->vq_avail->idx);
}

/*
 * Deliver an interrupt to the guest for a specific MSI-X queue or
 * event.  Falls back to setting the ISR bits and raising a legacy
 * MSI/INTx when MSI-X is not enabled.
 */
static inline void
vi_interrupt(struct virtio_softc *vs, uint8_t isr, uint16_t msix_idx)
{
	if (pci_msix_enabled(vs->vs_pi))
		pci_generate_msix(vs->vs_pi, msix_idx);
	else {
		VS_LOCK(vs);
		vs->vs_isr |= isr;
		pci_generate_msi(vs->vs_pi, 0);
		pci_lintr_assert(vs->vs_pi);
		VS_UNLOCK(vs);
	}
}

/*
 * Deliver an interrupt to the guest on the given virtual queue (if
 * possible, or a generic MSI interrupt if not using MSI-X).
 */
static inline void
vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq)
{
	vi_interrupt(vs, VIRTIO_PCI_ISR_INTR, vq->vq_msix_idx);
}

/* Re-enable guest "kick" notifications on this queue. */
static inline void
vq_kick_enable(struct vqueue_info *vq)
{
	vq->vq_used->flags &= ~VRING_USED_F_NO_NOTIFY;
	/*
	 * Full memory barrier to make sure the store to vq_used->flags
	 * happens before the load from vq_avail->idx, which results from a
	 * subsequent call to vq_has_descs().
	 */
	atomic_thread_fence_seq_cst();
}

/* Suppress guest "kick" notifications on this queue (advisory only). */
static inline void
vq_kick_disable(struct vqueue_info *vq)
{
	vq->vq_used->flags |= VRING_USED_F_NO_NOTIFY;
}

struct iovec;

/*
 * Request description returned by vq_getchain.
 *
 * Writable iovecs start at iov[req.readable].
*/
struct vi_req {
	int readable;		/* num of readable iovecs */
	int writable;		/* num of writable iovecs */
	unsigned int idx;	/* ring index */
};

/* Generic virtio device/queue API implemented in virtio.c. */
void	vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
			void *dev_softc, struct pci_devinst *pi,
			struct vqueue_info *queues);
int	vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix);
void	vi_reset_dev(struct virtio_softc *);
void	vi_set_io_bar(struct virtio_softc *, int);

int	vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov,
	    struct vi_req *reqp);
void	vq_retchains(struct vqueue_info *vq, uint16_t n_chains);
void	vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx,
			    uint32_t iolen);
void	vq_relchain_publish(struct vqueue_info *vq);
void	vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen);
void	vq_endchains(struct vqueue_info *vq, int used_all_avail);

/* PCI config-space accessors for the virtio BAR(s). */
uint64_t vi_pci_read(struct pci_devinst *pi, int baridx, uint64_t offset,
    int size);
void	vi_pci_write(struct pci_devinst *pi, int baridx, uint64_t offset,
    int size, uint64_t value);
#ifdef BHYVE_SNAPSHOT
int	vi_pci_snapshot(struct vm_snapshot_meta *meta);
int	vi_pci_pause(struct pci_devinst *pi);
int	vi_pci_resume(struct pci_devinst *pi);
#endif
#endif	/* _BHYVE_VIRTIO_H_ */
diff --git a/usr.sbin/crunch/crunchgen/crunchgen.c b/usr.sbin/crunch/crunchgen/crunchgen.c
index 1deb2bb2f7ea..462a13f9c897 100644
--- a/usr.sbin/crunch/crunchgen/crunchgen.c
+++ b/usr.sbin/crunch/crunchgen/crunchgen.c
@@ -1,1246 +1,1246 @@
/*
 * Copyright (c) 1994 University of Maryland
 * All Rights Reserved.
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of U.M.
not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. U.M. makes no representations about the * suitability of this software for any purpose. It is provided "as is" * without express or implied warranty. * * U.M. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL U.M. * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * Author: James da Silva, Systems Design and Analysis Group * Computer Science Department * University of Maryland at College Park */ /* * ======================================================================== * crunchgen.c * * Generates a Makefile and main C file for a crunched executable, * from specs given in a .conf file. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #define CRUNCH_VERSION "0.2" #define MAXLINELEN 16384 #define MAXFIELDS 2048 /* internal representation of conf file: */ /* simple lists of strings suffice for most parms */ typedef struct strlst { struct strlst *next; char *str; } strlst_t; /* progs have structure, each field can be set with "special" or calculated */ typedef struct prog { struct prog *next; /* link field */ char *name; /* program name */ char *ident; /* C identifier for the program name */ char *srcdir; char *realsrcdir; char *objdir; char *objvar; /* Makefile variable to replace OBJS */ strlst_t *objs, *objpaths; strlst_t *buildopts; strlst_t *keeplist; strlst_t *links; strlst_t *libs; strlst_t *libs_so; int goterror; } prog_t; /* global state */ static strlst_t *buildopts = NULL; static strlst_t *srcdirs = NULL; static strlst_t *libs = NULL; static strlst_t *libs_so = NULL; static prog_t *progs = NULL; static char confname[MAXPATHLEN], infilename[MAXPATHLEN]; static char outmkname[MAXPATHLEN], outcfname[MAXPATHLEN], execfname[MAXPATHLEN]; static char tempfname[MAXPATHLEN], cachename[MAXPATHLEN]; static char curfilename[MAXPATHLEN]; static bool tempfname_initialized = false; static char outhdrname[MAXPATHLEN] ; /* user-supplied header for *.mk */ static const char *objprefix; /* where are the objects ? 
*/ static const char *path_make; static int linenum = -1; static int goterror = 0; static int verbose, readcache; /* options */ static int reading_cache; static int makeobj = 0; /* add 'make obj' rules to the makefile */ static int list_mode; /* general library routines */ void status(const char *str); void out_of_memory(void); void add_string(strlst_t **listp, char *str); int is_dir(const char *pathname); int is_nonempty_file(const char *pathname); int subtract_strlst(strlst_t **lista, strlst_t **listb); int in_list(strlst_t **listp, char *str); /* helper routines for main() */ void usage(void); void parse_conf_file(void); void gen_outputs(void); extern const char *crunched_skel[]; int main(int argc, char **argv) { char *p; int optc; verbose = 1; readcache = 1; *outmkname = *outcfname = *execfname = '\0'; path_make = getenv("MAKE"); if (path_make == NULL || *path_make == '\0') path_make = "make"; p = getenv("MAKEOBJDIRPREFIX"); if (p == NULL || *p == '\0') objprefix = "/usr/obj"; /* default */ else if ((objprefix = strdup(p)) == NULL) out_of_memory(); while((optc = getopt(argc, argv, "lh:m:c:e:p:foq")) != -1) { switch(optc) { case 'f': readcache = 0; break; case 'o': makeobj = 1; break; case 'q': verbose = 0; break; case 'm': strlcpy(outmkname, optarg, sizeof(outmkname)); break; case 'p': if ((objprefix = strdup(optarg)) == NULL) out_of_memory(); break; case 'h': strlcpy(outhdrname, optarg, sizeof(outhdrname)); break; case 'c': strlcpy(outcfname, optarg, sizeof(outcfname)); break; case 'e': strlcpy(execfname, optarg, sizeof(execfname)); break; case 'l': list_mode++; verbose = 0; break; case '?': default: usage(); } } argc -= optind; argv += optind; if (argc != 1) usage(); /* * generate filenames */ strlcpy(infilename, argv[0], sizeof(infilename)); /* confname = `basename infilename .conf` */ if ((p=strrchr(infilename, '/')) != NULL) strlcpy(confname, p + 1, sizeof(confname)); else strlcpy(confname, infilename, sizeof(confname)); if ((p=strrchr(confname, '.')) != 
NULL && !strcmp(p, ".conf")) *p = '\0'; if (!*outmkname) snprintf(outmkname, sizeof(outmkname), "%s.mk", confname); if (!*outcfname) snprintf(outcfname, sizeof(outcfname), "%s.c", confname); if (!*execfname) snprintf(execfname, sizeof(execfname), "%s", confname); snprintf(cachename, sizeof(cachename), "%s.cache", confname); snprintf(tempfname, sizeof(tempfname), "%s/crunchgen_%sXXXXXX", getenv("TMPDIR") ? getenv("TMPDIR") : _PATH_TMP, confname); tempfname_initialized = false; parse_conf_file(); if (list_mode) exit(goterror); gen_outputs(); exit(goterror); } void usage(void) { fprintf(stderr, "%s%s\n\t%s%s\n", "usage: crunchgen [-foq] ", "[-h ] [-m ]", "[-p ] [-c ] [-e ] ", ""); exit(1); } /* * ======================================================================== * parse_conf_file subsystem * */ /* helper routines for parse_conf_file */ void parse_one_file(char *filename); void parse_line(char *pline, int *fc, char **fv, int nf); void add_srcdirs(int argc, char **argv); void add_progs(int argc, char **argv); void add_link(int argc, char **argv); void add_libs(int argc, char **argv); void add_libs_so(int argc, char **argv); void add_buildopts(int argc, char **argv); void add_special(int argc, char **argv); prog_t *find_prog(char *str); void add_prog(char *progname); void parse_conf_file(void) { if (!is_nonempty_file(infilename)) errx(1, "fatal: input file \"%s\" not found", infilename); parse_one_file(infilename); if (readcache && is_nonempty_file(cachename)) { reading_cache = 1; parse_one_file(cachename); } } void parse_one_file(char *filename) { char *fieldv[MAXFIELDS]; int fieldc; void (*f)(int c, char **v); FILE *cf; char line[MAXLINELEN]; snprintf(line, sizeof(line), "reading %s", filename); status(line); strlcpy(curfilename, filename, sizeof(curfilename)); if ((cf = fopen(curfilename, "r")) == NULL) { warn("%s", curfilename); goterror = 1; return; } linenum = 0; while (fgets(line, MAXLINELEN, cf) != NULL) { linenum++; parse_line(line, &fieldc, fieldv, 
MAXFIELDS);
		if (fieldc < 1)
			continue;

		/* dispatch on the command keyword in field 0 */
		if (!strcmp(fieldv[0], "srcdirs"))
			f = add_srcdirs;
		else if(!strcmp(fieldv[0], "progs"))
			f = add_progs;
		else if(!strcmp(fieldv[0], "ln"))
			f = add_link;
		else if(!strcmp(fieldv[0], "libs"))
			f = add_libs;
		else if(!strcmp(fieldv[0], "libs_so"))
			f = add_libs_so;
		else if(!strcmp(fieldv[0], "buildopts"))
			f = add_buildopts;
		else if(!strcmp(fieldv[0], "special"))
			f = add_special;
		else {
			warnx("%s:%d: skipping unknown command `%s'",
			    curfilename, linenum, fieldv[0]);
			goterror = 1;
			continue;
		}
		if (fieldc < 2) {
			warnx("%s:%d: %s %s", curfilename, linenum,
			    fieldv[0],
			    "command needs at least 1 argument, skipping");
			goterror = 1;
			continue;
		}
		f(fieldc, fieldv);
	}
	if (ferror(cf)) {
		warn("%s", curfilename);
		goterror = 1;
	}
	fclose(cf);
}

/*
 * Tokenize one config line in place: NUL-terminate each whitespace-
 * separated field and record up to nf field pointers in fv, with the
 * field count in *fc.  A '#' begins a comment and terminates the line.
 * Fields beyond nf are scanned but not recorded.
 */
void
parse_line(char *pline, int *fc, char **fv, int nf)
{
	char *p;

	p = pline;
	*fc = 0;
	while (1) {
		while (isspace((unsigned char)*p))
			p++;
		if (*p == '\0' || *p == '#')
			break;
		if (*fc < nf)
			fv[(*fc)++] = p;
		while (*p && !isspace((unsigned char)*p) && *p != '#')
			p++;
		if (*p == '\0' || *p == '#')
			break;
		*p++ = '\0';
	}
	if (*p)
		*p = '\0';	/* needed for '#' case */
}

/*
 * "srcdirs" command: append each existing directory argument to the
 * global srcdirs list; warn and set goterror for non-directories.
 */
void
add_srcdirs(int argc, char **argv)
{
	int i;

	for (i = 1; i < argc; i++) {
		if (is_dir(argv[i]))
			add_string(&srcdirs, argv[i]);
		else {
			warnx("%s:%d: `%s' is not a directory, skipping it",
			    curfilename, linenum, argv[i]);
			goterror = 1;
		}
	}
}

/* "progs" command: add each argument as a program. */
void
add_progs(int argc, char **argv)
{
	int i;

	for (i = 1; i < argc; i++)
		add_prog(argv[i]);
}

/*
 * Append a prog_t for progname to the global progs list, ignoring
 * duplicates (matched by name).  Aborts via out_of_memory() on
 * allocation failure.
 */
void
add_prog(char *progname)
{
	prog_t *p1, *p2;

	/* add to end, but be smart about dups */
	for (p1 = NULL, p2 = progs; p2 != NULL; p1 = p2, p2 = p2->next)
		if (!strcmp(p2->name, progname))
			return;

	p2 = malloc(sizeof(prog_t));
	if(p2) {
		memset(p2, 0, sizeof(prog_t));
		p2->name = strdup(progname);
	}
	if (!p2 || !p2->name)
		out_of_memory();

	p2->next = NULL;
	if (p1 == NULL)
		progs = p2;
	else
		p1->next = p2;

	/* NOTE: fields below were already zeroed by memset above. */
	p2->ident = NULL;
	p2->srcdir = NULL;
	p2->realsrcdir = NULL;
	p2->objdir = NULL;
	p2->links = NULL;
	p2->libs = NULL;
	p2->libs_so =
NULL; p2->objs = NULL; p2->keeplist = NULL; p2->buildopts = NULL; p2->goterror = 0; if (list_mode) printf("%s\n",progname); } void add_link(int argc, char **argv) { int i; prog_t *p = find_prog(argv[1]); if (p == NULL) { warnx("%s:%d: no prog %s previously declared, skipping link", curfilename, linenum, argv[1]); goterror = 1; return; } for (i = 2; i < argc; i++) { if (list_mode) printf("%s\n",argv[i]); add_string(&p->links, argv[i]); } } void add_libs(int argc, char **argv) { int i; for(i = 1; i < argc; i++) { add_string(&libs, argv[i]); if ( in_list(&libs_so, argv[i]) ) warnx("%s:%d: " "library `%s' specified as dynamic earlier", curfilename, linenum, argv[i]); } } void add_libs_so(int argc, char **argv) { int i; for(i = 1; i < argc; i++) { add_string(&libs_so, argv[i]); if ( in_list(&libs, argv[i]) ) warnx("%s:%d: " "library `%s' specified as static earlier", curfilename, linenum, argv[i]); } } void add_buildopts(int argc, char **argv) { int i; for (i = 1; i < argc; i++) add_string(&buildopts, argv[i]); } void add_special(int argc, char **argv) { int i; prog_t *p = find_prog(argv[1]); if (p == NULL) { if (reading_cache) return; warnx("%s:%d: no prog %s previously declared, skipping special", curfilename, linenum, argv[1]); goterror = 1; return; } if (!strcmp(argv[2], "ident")) { if (argc != 4) goto argcount; if ((p->ident = strdup(argv[3])) == NULL) out_of_memory(); } else if (!strcmp(argv[2], "srcdir")) { if (argc != 4) goto argcount; if ((p->srcdir = strdup(argv[3])) == NULL) out_of_memory(); } else if (!strcmp(argv[2], "objdir")) { if(argc != 4) goto argcount; if((p->objdir = strdup(argv[3])) == NULL) out_of_memory(); } else if (!strcmp(argv[2], "objs")) { p->objs = NULL; for (i = 3; i < argc; i++) add_string(&p->objs, argv[i]); } else if (!strcmp(argv[2], "objpaths")) { p->objpaths = NULL; for (i = 3; i < argc; i++) add_string(&p->objpaths, argv[i]); } else if (!strcmp(argv[2], "keep")) { p->keeplist = NULL; for(i = 3; i < argc; i++) add_string(&p->keeplist, 
argv[i]); } else if (!strcmp(argv[2], "objvar")) { if(argc != 4) goto argcount; if ((p->objvar = strdup(argv[3])) == NULL) out_of_memory(); } else if (!strcmp(argv[2], "buildopts")) { p->buildopts = NULL; for (i = 3; i < argc; i++) add_string(&p->buildopts, argv[i]); } else if (!strcmp(argv[2], "lib")) { for (i = 3; i < argc; i++) add_string(&p->libs, argv[i]); } else { warnx("%s:%d: bad parameter name `%s', skipping line", curfilename, linenum, argv[2]); goterror = 1; } return; argcount: warnx("%s:%d: too %s arguments, expected \"special %s %s \"", curfilename, linenum, argc < 4? "few" : "many", argv[1], argv[2]); goterror = 1; } prog_t *find_prog(char *str) { prog_t *p; for (p = progs; p != NULL; p = p->next) if (!strcmp(p->name, str)) return p; return NULL; } /* * ======================================================================== * gen_outputs subsystem * */ /* helper subroutines */ void remove_error_progs(void); void fillin_program(prog_t *p); void gen_specials_cache(void); void gen_output_makefile(void); void gen_output_cfile(void); void fillin_program_objs(prog_t *p, char *path); void top_makefile_rules(FILE *outmk); void prog_makefile_rules(FILE *outmk, prog_t *p); void output_strlst(FILE *outf, strlst_t *lst); char *genident(char *str); char *dir_search(char *progname); void gen_outputs(void) { prog_t *p; for (p = progs; p != NULL; p = p->next) fillin_program(p); remove_error_progs(); gen_specials_cache(); gen_output_cfile(); gen_output_makefile(); status(""); fprintf(stderr, "Run \"%s -f %s\" to build crunched binary.\n", path_make, outmkname); } /* * run the makefile for the program to find which objects are necessary */ void fillin_program(prog_t *p) { char path[MAXPATHLEN]; char line[MAXLINELEN]; snprintf(line, MAXLINELEN, "filling in parms for %s", p->name); status(line); if (!p->ident) p->ident = genident(p->name); /* look for the source directory if one wasn't specified by a special */ if (!p->srcdir) { p->srcdir = dir_search(p->name); } /* 
Determine the actual srcdir (maybe symlinked). */ if (p->srcdir) { p->realsrcdir = realpath(p->srcdir, NULL); if (p->realsrcdir == NULL) errx(1, "Can't resolve path: %s\n", p->srcdir); } /* Unless the option to make object files was specified the - * the objects will be built in the source directory unless + * objects will be built in the source directory unless * an object directory already exists. */ if (!makeobj && !p->objdir && p->srcdir) { char *auto_obj; auto_obj = NULL; snprintf(line, sizeof line, "%s/%s", objprefix, p->realsrcdir); if (is_dir(line) || ((auto_obj = getenv("MK_AUTO_OBJ")) != NULL && strcmp(auto_obj, "yes") == 0)) { if ((p->objdir = strdup(line)) == NULL) out_of_memory(); } else p->objdir = p->realsrcdir; } /* * XXX look for a Makefile.{name} in local directory first. * This lets us override the original Makefile. */ snprintf(path, sizeof(path), "Makefile.%s", p->name); if (is_nonempty_file(path)) { snprintf(line, MAXLINELEN, "Using %s for %s", path, p->name); status(line); } else if (p->srcdir) snprintf(path, sizeof(path), "%s/Makefile", p->srcdir); if (!p->objs && p->srcdir && is_nonempty_file(path)) fillin_program_objs(p, path); if (!p->srcdir && !p->objdir && verbose) warnx("%s: %s: %s", "warning: could not find source directory", infilename, p->name); if (!p->objs && verbose) warnx("%s: %s: warning: could not find any .o files", infilename, p->name); if ((!p->srcdir || !p->objdir) && !p->objs) p->goterror = 1; } void fillin_program_objs(prog_t *p, char *path) { char *obj, *cp; int fd, rc; FILE *f; const char *objvar="OBJS"; strlst_t *s; char line[MAXLINELEN]; /* discover the objs from the srcdir Makefile */ /* * We reuse the same temporary file name for multiple objects. However, * some libc implementations (such as glibc) return EINVAL if there * are no XXXXX characters in the template. This happens after the * first call to mkstemp since the argument is modified in-place. 
* To avoid this error we use open() instead of mkstemp() after the * call to mkstemp(). */ if (tempfname_initialized) { if ((fd = open(tempfname, O_CREAT | O_EXCL | O_RDWR, 0600)) == -1) { err(EX_OSERR, "open(%s)", tempfname); } } else if ((fd = mkstemp(tempfname)) == -1) { err(EX_OSERR, "mkstemp(%s)", tempfname); } tempfname_initialized = true; if ((f = fdopen(fd, "w")) == NULL) { warn("fdopen(%s)", tempfname); goterror = 1; goto out; } if (p->objvar) objvar = p->objvar; /* * XXX include outhdrname (e.g. to contain Make variables) */ if (outhdrname[0] != '\0') fprintf(f, ".include \"%s\"\n", outhdrname); fprintf(f, ".include \"%s\"\n", path); fprintf(f, ".POSIX:\n"); if (buildopts) { fprintf(f, "BUILDOPTS+="); output_strlst(f, buildopts); } fprintf(f, ".if defined(PROG)\n"); fprintf(f, "%s?=${PROG}.o\n", objvar); fprintf(f, ".endif\n"); fprintf(f, "loop:\n\t@echo 'OBJS= '${%s}\n", objvar); fprintf(f, "crunchgen_objs:\n" "\t@cd %s && %s -f %s $(BUILDOPTS) $(%s_OPTS)", p->srcdir, path_make, tempfname, p->ident); for (s = p->buildopts; s != NULL; s = s->next) fprintf(f, " %s", s->str); fprintf(f, " loop\n"); fclose(f); snprintf(line, MAXLINELEN, "cd %s && %s -f %s -B crunchgen_objs", p->srcdir, path_make, tempfname); if ((f = popen(line, "r")) == NULL) { warn("submake pipe"); goterror = 1; goto out; } while(fgets(line, MAXLINELEN, f)) { if (strncmp(line, "OBJS= ", 6)) { warnx("make error: %s", line); goterror = 1; goto out; } cp = line + 6; while (isspace((unsigned char)*cp)) cp++; while(*cp) { obj = cp; while (*cp && !isspace((unsigned char)*cp)) cp++; if (*cp) *cp++ = '\0'; add_string(&p->objs, obj); while (isspace((unsigned char)*cp)) cp++; } } if ((rc=pclose(f)) != 0) { warnx("make error: make returned %d", rc); goterror = 1; } out: unlink(tempfname); } void remove_error_progs(void) { prog_t *p1, *p2; p1 = NULL; p2 = progs; while (p2 != NULL) { if (!p2->goterror) p1 = p2, p2 = p2->next; else { /* delete it from linked list */ warnx("%s: %s: ignoring program 
because of errors", infilename, p2->name); if (p1) p1->next = p2->next; else progs = p2->next; p2 = p2->next; } } } void gen_specials_cache(void) { FILE *cachef; prog_t *p; char line[MAXLINELEN]; snprintf(line, MAXLINELEN, "generating %s", cachename); status(line); if ((cachef = fopen(cachename, "w")) == NULL) { warn("%s", cachename); goterror = 1; return; } fprintf(cachef, "# %s - parm cache generated from %s by crunchgen " " %s\n\n", cachename, infilename, CRUNCH_VERSION); for (p = progs; p != NULL; p = p->next) { fprintf(cachef, "\n"); if (p->srcdir) fprintf(cachef, "special %s srcdir %s\n", p->name, p->srcdir); if (p->objdir) fprintf(cachef, "special %s objdir %s\n", p->name, p->objdir); if (p->objs) { fprintf(cachef, "special %s objs", p->name); output_strlst(cachef, p->objs); } if (p->objpaths) { fprintf(cachef, "special %s objpaths", p->name); output_strlst(cachef, p->objpaths); } } fclose(cachef); } void gen_output_makefile(void) { prog_t *p; FILE *outmk; char line[MAXLINELEN]; snprintf(line, MAXLINELEN, "generating %s", outmkname); status(line); if ((outmk = fopen(outmkname, "w")) == NULL) { warn("%s", outmkname); goterror = 1; return; } fprintf(outmk, "# %s - generated from %s by crunchgen %s\n\n", outmkname, infilename, CRUNCH_VERSION); if (outhdrname[0] != '\0') fprintf(outmk, ".include \"%s\"\n", outhdrname); top_makefile_rules(outmk); for (p = progs; p != NULL; p = p->next) prog_makefile_rules(outmk, p); fprintf(outmk, "\n# ========\n"); fclose(outmk); } void gen_output_cfile(void) { const char **cp; FILE *outcf; prog_t *p; strlst_t *s; char line[MAXLINELEN]; snprintf(line, MAXLINELEN, "generating %s", outcfname); status(line); if((outcf = fopen(outcfname, "w")) == NULL) { warn("%s", outcfname); goterror = 1; return; } fprintf(outcf, "/* %s - generated from %s by crunchgen %s */\n", outcfname, infilename, CRUNCH_VERSION); fprintf(outcf, "#define EXECNAME \"%s\"\n", execfname); for (cp = crunched_skel; *cp != NULL; cp++) fprintf(outcf, "%s\n", *cp); 
for (p = progs; p != NULL; p = p->next) fprintf(outcf, "extern crunched_stub_t _crunched_%s_stub;\n", p->ident); fprintf(outcf, "\nstruct stub entry_points[] = {\n"); for (p = progs; p != NULL; p = p->next) { fprintf(outcf, "\t{ \"%s\", _crunched_%s_stub },\n", p->name, p->ident); for (s = p->links; s != NULL; s = s->next) fprintf(outcf, "\t{ \"%s\", _crunched_%s_stub },\n", s->str, p->ident); } fprintf(outcf, "\t{ EXECNAME, crunched_main },\n"); fprintf(outcf, "\t{ NULL, NULL }\n};\n"); fclose(outcf); } char *genident(char *str) { char *n, *s, *d; /* * generates a Makefile/C identifier from a program name, * mapping '-' to '_' and ignoring all other non-identifier * characters. This leads to programs named "foo.bar" and * "foobar" to map to the same identifier. */ if ((n = strdup(str)) == NULL) return NULL; for (d = s = n; *s != '\0'; s++) { if (*s == '-') *d++ = '_'; else if (*s == '_' || isalnum((unsigned char)*s)) *d++ = *s; } *d = '\0'; return n; } char *dir_search(char *progname) { char path[MAXPATHLEN]; strlst_t *dir; char *srcdir; for (dir = srcdirs; dir != NULL; dir = dir->next) { snprintf(path, MAXPATHLEN, "%s/%s", dir->str, progname); if (!is_dir(path)) continue; if ((srcdir = strdup(path)) == NULL) out_of_memory(); return srcdir; } return NULL; } void top_makefile_rules(FILE *outmk) { prog_t *p; fprintf(outmk, "LD?= ld\n"); if ( subtract_strlst(&libs, &libs_so) ) fprintf(outmk, "# NOTE: Some LIBS declarations below overridden by LIBS_SO\n"); fprintf(outmk, "LIBS+="); output_strlst(outmk, libs); fprintf(outmk, "LIBS_SO+="); output_strlst(outmk, libs_so); if (makeobj) { fprintf(outmk, "MAKEOBJDIRPREFIX?=%s\n", objprefix); fprintf(outmk, "MAKEENV=env MAKEOBJDIRPREFIX=$(MAKEOBJDIRPREFIX)\n"); fprintf(outmk, "CRUNCHMAKE=$(MAKEENV) $(MAKE)\n"); } else { fprintf(outmk, "CRUNCHMAKE=$(MAKE)\n"); } if (buildopts) { fprintf(outmk, "BUILDOPTS+="); output_strlst(outmk, buildopts); } fprintf(outmk, "CRUNCHED_OBJS="); for (p = progs; p != NULL; p = p->next) 
fprintf(outmk, " %s.lo", p->name); fprintf(outmk, "\n"); fprintf(outmk, "SUBMAKE_TARGETS="); for (p = progs; p != NULL; p = p->next) fprintf(outmk, " %s_make", p->ident); fprintf(outmk, "\nSUBCLEAN_TARGETS="); for (p = progs; p != NULL; p = p->next) fprintf(outmk, " %s_clean", p->ident); fprintf(outmk, "\n\n"); fprintf(outmk, "all: objs exe\nobjs: $(SUBMAKE_TARGETS)\n"); fprintf(outmk, "exe: %s\n", execfname); fprintf(outmk, "%s: %s.o $(CRUNCHED_OBJS) $(SUBMAKE_TARGETS)\n", execfname, execfname); fprintf(outmk, ".if defined(LIBS_SO) && !empty(LIBS_SO)\n"); fprintf(outmk, "\t$(CC) -o %s %s.o $(CRUNCHED_OBJS) \\\n", execfname, execfname); fprintf(outmk, "\t\t-Xlinker -Bstatic $(LIBS) \\\n"); fprintf(outmk, "\t\t-Xlinker -Bdynamic $(LIBS_SO)\n"); fprintf(outmk, ".else\n"); fprintf(outmk, "\t$(CC) -static -o %s %s.o $(CRUNCHED_OBJS) $(LIBS)\n", execfname, execfname); fprintf(outmk, ".endif\n"); fprintf(outmk, "realclean: clean subclean\n"); fprintf(outmk, "clean:\n\trm -f %s *.lo *.o *_stub.c\n", execfname); fprintf(outmk, "subclean: $(SUBCLEAN_TARGETS)\n"); } void prog_makefile_rules(FILE *outmk, prog_t *p) { strlst_t *lst; fprintf(outmk, "\n# -------- %s\n\n", p->name); fprintf(outmk, "%s_OBJDIR=", p->ident); if (p->objdir) fprintf(outmk, "%s", p->objdir); else fprintf(outmk, "$(MAKEOBJDIRPREFIX)/$(%s_REALSRCDIR)\n", p->ident); fprintf(outmk, "\n"); fprintf(outmk, "%s_OBJPATHS=", p->ident); if (p->objpaths) output_strlst(outmk, p->objpaths); else { for (lst = p->objs; lst != NULL; lst = lst->next) { fprintf(outmk, " $(%s_OBJDIR)/%s", p->ident, lst->str); } fprintf(outmk, "\n"); } fprintf(outmk, "$(%s_OBJPATHS): .NOMETA\n", p->ident); if (p->srcdir && p->objs) { fprintf(outmk, "%s_SRCDIR=%s\n", p->ident, p->srcdir); fprintf(outmk, "%s_REALSRCDIR=%s\n", p->ident, p->realsrcdir); fprintf(outmk, "%s_OBJS=", p->ident); output_strlst(outmk, p->objs); if (p->buildopts != NULL) { fprintf(outmk, "%s_OPTS+=", p->ident); output_strlst(outmk, p->buildopts); } #if 0 
fprintf(outmk, "$(%s_OBJPATHS): %s_make\n\n", p->ident, p->ident); #endif fprintf(outmk, "%s_make:\n", p->ident); fprintf(outmk, "\t(cd $(%s_SRCDIR) && ", p->ident); if (makeobj) fprintf(outmk, "$(CRUNCHMAKE) obj && "); fprintf(outmk, "\\\n"); fprintf(outmk, "\t\t$(CRUNCHMAKE) $(BUILDOPTS) $(%s_OPTS) depend &&", p->ident); fprintf(outmk, "\\\n"); fprintf(outmk, "\t\t$(CRUNCHMAKE) $(BUILDOPTS) $(%s_OPTS) " "$(%s_OBJS))", p->ident, p->ident); fprintf(outmk, "\n"); fprintf(outmk, "%s_clean:\n", p->ident); fprintf(outmk, "\t(cd $(%s_SRCDIR) && $(CRUNCHMAKE) $(BUILDOPTS) clean cleandepend)\n\n", p->ident); } else { fprintf(outmk, "%s_make:\n", p->ident); fprintf(outmk, "\t@echo \"** cannot make objs for %s\"\n\n", p->name); } if (p->libs) { fprintf(outmk, "%s_LIBS=", p->ident); output_strlst(outmk, p->libs); } fprintf(outmk, "%s_stub.c:\n", p->name); fprintf(outmk, "\techo \"" "extern int main(int argc, char **argv, char **envp); " "int _crunched_%s_stub(int argc, char **argv, char **envp);" "int _crunched_%s_stub(int argc, char **argv, char **envp)" "{return main(argc,argv,envp);}\" >%s_stub.c\n", p->ident, p->ident, p->name); fprintf(outmk, "%s.lo: %s_stub.o $(%s_OBJPATHS) %s", p->name, p->name, p->ident, outmkname); if (p->libs) fprintf(outmk, " $(%s_LIBS)", p->ident); fprintf(outmk, "\n"); fprintf(outmk, "\t$(CC) -nostdlib -r -o %s.lo %s_stub.o $(%s_OBJPATHS)", p->name, p->name, p->ident); if (p->libs) fprintf(outmk, " $(%s_LIBS)", p->ident); fprintf(outmk, "\n"); fprintf(outmk, "\tcrunchide -k _crunched_%s_stub ", p->ident); for (lst = p->keeplist; lst != NULL; lst = lst->next) fprintf(outmk, "-k %s ", lst->str); fprintf(outmk, "%s.lo\n", p->name); } void output_strlst(FILE *outf, strlst_t *lst) { for (; lst != NULL; lst = lst->next) if ( strlen(lst->str) ) fprintf(outf, " %s", lst->str); fprintf(outf, "\n"); } /* * ======================================================================== * general library routines * */ void status(const char *str) { static int 
lastlen = 0;
	int len, spaces;

	if (!verbose)
		return;

	/* pad with spaces to overwrite any longer previous status line */
	len = strlen(str);
	spaces = lastlen - len;
	if (spaces < 1)
		spaces = 1;

	fprintf(stderr, " [%s]%*.*s\r", str, spaces, spaces, " ");
	fflush(stderr);
	lastlen = len;
}

/*
 * Report an allocation failure at the current input position and exit.
 * NOTE(review): err() appends strerror(errno); errno may be stale here
 * if the failing call did not set it — errx() may be intended, confirm.
 */
void
out_of_memory(void)
{

	err(1, "%s: %d: out of memory, stopping", infilename, linenum);
}

/*
 * Append a copy of str to the string list *listp, ignoring exact
 * duplicates.  Aborts via out_of_memory() on allocation failure.
 */
void
add_string(strlst_t **listp, char *str)
{
	strlst_t *p1, *p2;

	/* add to end, but be smart about dups */
	for (p1 = NULL, p2 = *listp; p2 != NULL; p1 = p2, p2 = p2->next)
		if (!strcmp(p2->str, str))
			return;

	p2 = malloc(sizeof(strlst_t));
	if (p2) {
		p2->next = NULL;
		p2->str = strdup(str);
	}
	if (!p2 || !p2->str)
		out_of_memory();

	if (p1 == NULL)
		*listp = p2;
	else
		p1->next = p2;
}

/*
 * Warn about every string in *listb that also appears in *lista and
 * return how many were found.  Despite the name, nothing is removed:
 * strcat(p1->str, "") appends nothing and is a no-op — presumably a
 * placeholder for an actual removal; confirm against callers.
 */
int
subtract_strlst(strlst_t **lista, strlst_t **listb)
{
	int subtract_count = 0;
	strlst_t *p1;

	for (p1 = *listb; p1 != NULL; p1 = p1->next)
		if ( in_list(lista, p1->str) ) {
			warnx("Will compile library `%s' dynamically", p1->str);
			strcat(p1->str, "");
			subtract_count++;
		}

	return subtract_count;
}

/* Return 1 if str is present in the list *listp, else 0. */
int
in_list(strlst_t **listp, char *str)
{
	strlst_t *p1;

	for (p1 = *listp; p1 != NULL; p1 = p1->next)
		if (!strcmp(p1->str, str))
			return 1;
	return 0;
}

/* Return non-zero if pathname exists and is a directory. */
int
is_dir(const char *pathname)
{
	struct stat buf;

	if (stat(pathname, &buf) == -1)
		return 0;

	return S_ISDIR(buf.st_mode);
}

/* Return non-zero if pathname is a regular file with size > 0. */
int
is_nonempty_file(const char *pathname)
{
	struct stat buf;

	if (stat(pathname, &buf) == -1)
		return 0;

	return S_ISREG(buf.st_mode) && buf.st_size > 0;
}
diff --git a/usr.sbin/fstyp/hammer2_disk.h b/usr.sbin/fstyp/hammer2_disk.h
index eae40b618966..6efefe9a2495 100644
--- a/usr.sbin/fstyp/hammer2_disk.h
+++ b/usr.sbin/fstyp/hammer2_disk.h
@@ -1,1388 +1,1388 @@
/*-
 * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon
 * by Venkatesh Srinivas
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * 3. Neither the name of The DragonFly Project nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific, prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #ifndef _HAMMER2_DISK_H_ #define _HAMMER2_DISK_H_ #ifndef _SYS_UUID_H_ #include #endif #ifndef _SYS_DMSG_H_ /* * dmsg_hdr must be 64 bytes */ struct dmsg_hdr { uint16_t magic; /* 00 sanity, synchro, endian */ uint16_t reserved02; /* 02 */ uint32_t salt; /* 04 random salt helps w/crypto */ uint64_t msgid; /* 08 message transaction id */ uint64_t circuit; /* 10 circuit id or 0 */ uint64_t reserved18; /* 18 */ uint32_t cmd; /* 20 flags | cmd | hdr_size / ALIGN */ uint32_t aux_crc; /* 24 auxiliary data crc */ uint32_t aux_bytes; /* 28 auxiliary data length (bytes) */ uint32_t error; /* 2C error code or 0 */ uint64_t aux_descr; /* 30 negotiated OOB data descr */ uint32_t reserved38; /* 38 */ uint32_t hdr_crc; /* 3C (aligned) extended header crc */ }; typedef struct dmsg_hdr dmsg_hdr_t; #endif /* * The structures below represent the on-disk media structures for the HAMMER2 * filesystem. Note that all fields for on-disk structures are naturally * aligned. The host endian format is typically used - compatibility is * possible if the implementation detects reversed endian and adjusts accesses * accordingly. * * HAMMER2 primarily revolves around the directory topology: inodes, * directory entries, and block tables. Block device buffer cache buffers * are always 64KB. Logical file buffers are typically 16KB. All data * references utilize 64-bit byte offsets. * * Free block management is handled independently using blocks reserved by * the media topology. */ /* * The data at the end of a file or directory may be a fragment in order * to optimize storage efficiency. The minimum fragment size is 1KB. * Since allocations are in powers of 2 fragments must also be sized in * powers of 2 (1024, 2048, ... 65536). * * For the moment the maximum allocation size is HAMMER2_PBUFSIZE (64K), * which is 2^16. Larger extents may be supported in the future. Smaller * fragments might be supported in the future (down to 64 bytes is possible), * but probably will not be. 
* * A full indirect block use supports 512 x 128-byte blockrefs in a 64KB * buffer. Indirect blocks down to 1KB are supported to keep small * directories small. * * A maximally sized file (2^64-1 bytes) requires ~6 indirect block levels * using 64KB indirect blocks (128 byte refs, 512 or radix 9 per indblk). * * 16(datablk) + 9 + 9 + 9 + 9 + 9 + 9 = ~70. * 16(datablk) + 7 + 9 + 9 + 9 + 9 + 9 = ~68. (smaller top level indblk) * * The actual depth depends on copies redundancy and whether the filesystem * has chosen to use a smaller indirect block size at the top level or not. */ #define HAMMER2_ALLOC_MIN 1024 /* minimum allocation size */ #define HAMMER2_RADIX_MIN 10 /* minimum allocation size 2^N */ #define HAMMER2_ALLOC_MAX 65536 /* maximum allocation size */ #define HAMMER2_RADIX_MAX 16 /* maximum allocation size 2^N */ #define HAMMER2_RADIX_KEY 64 /* number of bits in key */ /* * MINALLOCSIZE - The minimum allocation size. This can be smaller * or larger than the minimum physical IO size. * * NOTE: Should not be larger than 1K since inodes * are 1K. * * MINIOSIZE - The minimum IO size. This must be less than * or equal to HAMMER2_LBUFSIZE. * * HAMMER2_LBUFSIZE - Nominal buffer size for I/O rollups. * * HAMMER2_PBUFSIZE - Topological block size used by files for all * blocks except the block straddling EOF. * * HAMMER2_SEGSIZE - Allocation map segment size, typically 4MB * (space represented by a level0 bitmap). 
*/ #define HAMMER2_SEGSIZE (1 << HAMMER2_FREEMAP_LEVEL0_RADIX) #define HAMMER2_SEGRADIX HAMMER2_FREEMAP_LEVEL0_RADIX #define HAMMER2_PBUFRADIX 16 /* physical buf (1<<16) bytes */ #define HAMMER2_PBUFSIZE 65536 #define HAMMER2_LBUFRADIX 14 /* logical buf (1<<14) bytes */ #define HAMMER2_LBUFSIZE 16384 /* * Generally speaking we want to use 16K and 64K I/Os */ #define HAMMER2_MINIORADIX HAMMER2_LBUFRADIX #define HAMMER2_MINIOSIZE HAMMER2_LBUFSIZE #define HAMMER2_IND_BYTES_MIN 4096 #define HAMMER2_IND_BYTES_NOM HAMMER2_LBUFSIZE #define HAMMER2_IND_BYTES_MAX HAMMER2_PBUFSIZE #define HAMMER2_IND_RADIX_MIN 12 #define HAMMER2_IND_RADIX_NOM HAMMER2_LBUFRADIX #define HAMMER2_IND_RADIX_MAX HAMMER2_PBUFRADIX #define HAMMER2_IND_COUNT_MIN (HAMMER2_IND_BYTES_MIN / \ sizeof(hammer2_blockref_t)) #define HAMMER2_IND_COUNT_MAX (HAMMER2_IND_BYTES_MAX / \ sizeof(hammer2_blockref_t)) /* * In HAMMER2, arrays of blockrefs are fully set-associative, meaning that * any element can occur at any index and holes can be anywhere. As a * future optimization we will be able to flag that such arrays are sorted * and thus optimize lookups, but for now we don't. * * Inodes embed either 512 bytes of direct data or an array of 4 blockrefs, * resulting in highly efficient storage for files <= 512 bytes and for files * <= 512KB. Up to 4 directory entries can be referenced from a directory * without requiring an indirect block. * * Indirect blocks are typically either 4KB (64 blockrefs / ~4MB represented), * or 64KB (1024 blockrefs / ~64MB represented). 
*/ #define HAMMER2_SET_RADIX 2 /* radix 2 = 4 entries */ #define HAMMER2_SET_COUNT (1 << HAMMER2_SET_RADIX) #define HAMMER2_EMBEDDED_BYTES 512 /* inode blockset/dd size */ #define HAMMER2_EMBEDDED_RADIX 9 #define HAMMER2_PBUFMASK (HAMMER2_PBUFSIZE - 1) #define HAMMER2_LBUFMASK (HAMMER2_LBUFSIZE - 1) #define HAMMER2_SEGMASK (HAMMER2_SEGSIZE - 1) #define HAMMER2_LBUFMASK64 ((hammer2_off_t)HAMMER2_LBUFMASK) #define HAMMER2_PBUFSIZE64 ((hammer2_off_t)HAMMER2_PBUFSIZE) #define HAMMER2_PBUFMASK64 ((hammer2_off_t)HAMMER2_PBUFMASK) #define HAMMER2_SEGSIZE64 ((hammer2_off_t)HAMMER2_SEGSIZE) #define HAMMER2_SEGMASK64 ((hammer2_off_t)HAMMER2_SEGMASK) #define HAMMER2_UUID_STRING "5cbb9ad1-862d-11dc-a94d-01301bb8a9f5" /* * A 4MB segment is reserved at the beginning of each 2GB zone. This segment * contains the volume header (or backup volume header), the free block * table, and possibly other information in the future. A 4MB segment for * freemap is reserved at the beginning of every 1GB. * * 4MB = 64 x 64K blocks. 
Each 4MB segment is broken down as follows: * * ========== * 0 volume header (for the first four 2GB zones) * 1 freemap00 level1 FREEMAP_LEAF (256 x 128B bitmap data per 1GB) * 2 level2 FREEMAP_NODE (256 x 128B indirect block per 256GB) * 3 level3 FREEMAP_NODE (256 x 128B indirect block per 64TB) * 4 level4 FREEMAP_NODE (256 x 128B indirect block per 16PB) * 5 level5 FREEMAP_NODE (256 x 128B indirect block per 4EB) * 6 freemap01 level1 (rotation) * 7 level2 * 8 level3 * 9 level4 * 10 level5 * 11 freemap02 level1 (rotation) * 12 level2 * 13 level3 * 14 level4 * 15 level5 * 16 freemap03 level1 (rotation) * 17 level2 * 18 level3 * 19 level4 * 20 level5 * 21 freemap04 level1 (rotation) * 22 level2 * 23 level3 * 24 level4 * 25 level5 * 26 freemap05 level1 (rotation) * 27 level2 * 28 level3 * 29 level4 * 30 level5 * 31 freemap06 level1 (rotation) * 32 level2 * 33 level3 * 34 level4 * 35 level5 * 36 freemap07 level1 (rotation) * 37 level2 * 38 level3 * 39 level4 * 40 level5 * 41 unused * .. unused * 63 unused * ========== * * The first four 2GB zones contain volume headers and volume header backups. * After that the volume header block# is reserved for future use. Similarly, * there are many blocks related to various Freemap levels which are not * used in every segment and those are also reserved for future use. * Note that each FREEMAP_LEAF or FREEMAP_NODE uses 32KB out of 64KB slot. * * Freemap (see the FREEMAP document) * * The freemap utilizes blocks #1-40 in 8 sets of 5 blocks. Each block in * a set represents a level of depth in the freemap topology. Eight sets * exist to prevent live updates from disturbing the state of the freemap * were a crash/reboot to occur. That is, a live update is not committed * until the update's flush reaches the volume root. There are FOUR volume * roots representing the last four synchronization points, so the freemap * must be consistent no matter which volume root is chosen by the mount * code. 
* * Each freemap set is 5 x 64K blocks and represents the 1GB, 256GB, 64TB, * 16PB and 4EB indirect map. The volume header itself has a set of 4 freemap * blockrefs representing another 2 bits, giving us a total 64 bits of * representable address space. * * The Level 0 64KB block represents 1GB of storage represented by 32KB * (256 x struct hammer2_bmap_data). Each structure represents 4MB of storage * and has a 512 bit bitmap, using 2 bits to represent a 16KB chunk of * storage. These 2 bits represent the following states: * * 00 Free * 01 (reserved) (Possibly partially allocated) * 10 Possibly free * 11 Allocated * * One important thing to note here is that the freemap resolution is 16KB, * but the minimum storage allocation size is 1KB. The hammer2 vfs keeps * track of sub-allocations in memory, which means that on a unmount or reboot * the entire 16KB of a partially allocated block will be considered fully * allocated. It is possible for fragmentation to build up over time, but * defragmentation is fairly easy to accomplish since all modifications * allocate a new block. * * The Second thing to note is that due to the way snapshots and inode * replication works, deleting a file cannot immediately free the related * space. Furthermore, deletions often do not bother to traverse the * block subhierarchy being deleted. And to go even further, whole * sub-directory trees can be deleted simply by deleting the directory inode * at the top. So even though we have a symbol to represent a 'possibly free' * block (binary 10), only the bulk free scanning code can actually use it. * Normal 'rm's or other deletions do not. * * WARNING! ZONE_SEG and VOLUME_ALIGN must be a multiple of 1<= ZONE_SEG. * * In Summary: * * (1) Modifications to freemap blocks 'allocate' a new copy (aka use a block * from the next set). The new copy is reused until a flush occurs at * which point the next modification will then rotate to the next set. 
*/ #define HAMMER2_VOLUME_ALIGN (8 * 1024 * 1024) #define HAMMER2_VOLUME_ALIGN64 ((hammer2_off_t)HAMMER2_VOLUME_ALIGN) #define HAMMER2_VOLUME_ALIGNMASK (HAMMER2_VOLUME_ALIGN - 1) #define HAMMER2_VOLUME_ALIGNMASK64 ((hammer2_off_t)HAMMER2_VOLUME_ALIGNMASK) #define HAMMER2_NEWFS_ALIGN (HAMMER2_VOLUME_ALIGN) #define HAMMER2_NEWFS_ALIGN64 ((hammer2_off_t)HAMMER2_VOLUME_ALIGN) #define HAMMER2_NEWFS_ALIGNMASK (HAMMER2_VOLUME_ALIGN - 1) #define HAMMER2_NEWFS_ALIGNMASK64 ((hammer2_off_t)HAMMER2_NEWFS_ALIGNMASK) #define HAMMER2_ZONE_BYTES64 (2LLU * 1024 * 1024 * 1024) #define HAMMER2_ZONE_MASK64 (HAMMER2_ZONE_BYTES64 - 1) #define HAMMER2_ZONE_SEG (4 * 1024 * 1024) #define HAMMER2_ZONE_SEG64 ((hammer2_off_t)HAMMER2_ZONE_SEG) #define HAMMER2_ZONE_BLOCKS_SEG (HAMMER2_ZONE_SEG / HAMMER2_PBUFSIZE) #define HAMMER2_ZONE_FREEMAP_INC 5 /* 5 deep */ #define HAMMER2_ZONE_VOLHDR 0 /* volume header or backup */ #define HAMMER2_ZONE_FREEMAP_00 1 /* normal freemap rotation */ #define HAMMER2_ZONE_FREEMAP_01 6 /* normal freemap rotation */ #define HAMMER2_ZONE_FREEMAP_02 11 /* normal freemap rotation */ #define HAMMER2_ZONE_FREEMAP_03 16 /* normal freemap rotation */ #define HAMMER2_ZONE_FREEMAP_04 21 /* normal freemap rotation */ #define HAMMER2_ZONE_FREEMAP_05 26 /* normal freemap rotation */ #define HAMMER2_ZONE_FREEMAP_06 31 /* normal freemap rotation */ #define HAMMER2_ZONE_FREEMAP_07 36 /* normal freemap rotation */ #define HAMMER2_ZONE_FREEMAP_END 41 /* (non-inclusive) */ #define HAMMER2_ZONE_UNUSED41 41 #define HAMMER2_ZONE_UNUSED42 42 #define HAMMER2_ZONE_UNUSED43 43 #define HAMMER2_ZONE_UNUSED44 44 #define HAMMER2_ZONE_UNUSED45 45 #define HAMMER2_ZONE_UNUSED46 46 #define HAMMER2_ZONE_UNUSED47 47 #define HAMMER2_ZONE_UNUSED48 48 #define HAMMER2_ZONE_UNUSED49 49 #define HAMMER2_ZONE_UNUSED50 50 #define HAMMER2_ZONE_UNUSED51 51 #define HAMMER2_ZONE_UNUSED52 52 #define HAMMER2_ZONE_UNUSED53 53 #define HAMMER2_ZONE_UNUSED54 54 #define HAMMER2_ZONE_UNUSED55 55 #define 
HAMMER2_ZONE_UNUSED56 56 #define HAMMER2_ZONE_UNUSED57 57 #define HAMMER2_ZONE_UNUSED58 58 #define HAMMER2_ZONE_UNUSED59 59 #define HAMMER2_ZONE_UNUSED60 60 #define HAMMER2_ZONE_UNUSED61 61 #define HAMMER2_ZONE_UNUSED62 62 #define HAMMER2_ZONE_UNUSED63 63 #define HAMMER2_ZONE_END 64 /* non-inclusive */ #define HAMMER2_NFREEMAPS 8 /* FREEMAP_00 - FREEMAP_07 */ /* relative to FREEMAP_x */ #define HAMMER2_ZONEFM_LEVEL1 0 /* 1GB leafmap */ #define HAMMER2_ZONEFM_LEVEL2 1 /* 256GB indmap */ #define HAMMER2_ZONEFM_LEVEL3 2 /* 64TB indmap */ #define HAMMER2_ZONEFM_LEVEL4 3 /* 16PB indmap */ #define HAMMER2_ZONEFM_LEVEL5 4 /* 4EB indmap */ /* LEVEL6 is a set of 4 blockrefs in the volume header 16EB */ /* * Freemap radix. Assumes a set-count of 4, 128-byte blockrefs, * 32KB indirect block for freemap (LEVELN_PSIZE below). * * Leaf entry represents 4MB of storage broken down into a 512-bit * bitmap, 2-bits per entry. So course bitmap item represents 16KB. */ #if HAMMER2_SET_COUNT != 4 #error "hammer2_disk.h - freemap assumes SET_COUNT is 4" #endif #define HAMMER2_FREEMAP_LEVEL6_RADIX 64 /* 16EB (end) */ #define HAMMER2_FREEMAP_LEVEL5_RADIX 62 /* 4EB */ #define HAMMER2_FREEMAP_LEVEL4_RADIX 54 /* 16PB */ #define HAMMER2_FREEMAP_LEVEL3_RADIX 46 /* 64TB */ #define HAMMER2_FREEMAP_LEVEL2_RADIX 38 /* 256GB */ #define HAMMER2_FREEMAP_LEVEL1_RADIX 30 /* 1GB */ #define HAMMER2_FREEMAP_LEVEL0_RADIX 22 /* 4MB (128by in l-1 leaf) */ #define HAMMER2_FREEMAP_LEVELN_PSIZE 32768 /* physical bytes */ #define HAMMER2_FREEMAP_LEVEL5_SIZE ((hammer2_off_t)1 << \ HAMMER2_FREEMAP_LEVEL5_RADIX) #define HAMMER2_FREEMAP_LEVEL4_SIZE ((hammer2_off_t)1 << \ HAMMER2_FREEMAP_LEVEL4_RADIX) #define HAMMER2_FREEMAP_LEVEL3_SIZE ((hammer2_off_t)1 << \ HAMMER2_FREEMAP_LEVEL3_RADIX) #define HAMMER2_FREEMAP_LEVEL2_SIZE ((hammer2_off_t)1 << \ HAMMER2_FREEMAP_LEVEL2_RADIX) #define HAMMER2_FREEMAP_LEVEL1_SIZE ((hammer2_off_t)1 << \ HAMMER2_FREEMAP_LEVEL1_RADIX) #define HAMMER2_FREEMAP_LEVEL0_SIZE ((hammer2_off_t)1 
<< \ HAMMER2_FREEMAP_LEVEL0_RADIX) #define HAMMER2_FREEMAP_LEVEL5_MASK (HAMMER2_FREEMAP_LEVEL5_SIZE - 1) #define HAMMER2_FREEMAP_LEVEL4_MASK (HAMMER2_FREEMAP_LEVEL4_SIZE - 1) #define HAMMER2_FREEMAP_LEVEL3_MASK (HAMMER2_FREEMAP_LEVEL3_SIZE - 1) #define HAMMER2_FREEMAP_LEVEL2_MASK (HAMMER2_FREEMAP_LEVEL2_SIZE - 1) #define HAMMER2_FREEMAP_LEVEL1_MASK (HAMMER2_FREEMAP_LEVEL1_SIZE - 1) #define HAMMER2_FREEMAP_LEVEL0_MASK (HAMMER2_FREEMAP_LEVEL0_SIZE - 1) #define HAMMER2_FREEMAP_COUNT (int)(HAMMER2_FREEMAP_LEVELN_PSIZE / \ sizeof(hammer2_bmap_data_t)) /* * XXX I made a mistake and made the reserved area begin at each LEVEL1 zone, * which is on a 1GB demark. This will eat a little more space but for * now we retain compatibility and make FMZONEBASE every 1GB */ #define H2FMZONEBASE(key) ((key) & ~HAMMER2_FREEMAP_LEVEL1_MASK) #define H2FMBASE(key, radix) ((key) & ~(((hammer2_off_t)1 << (radix)) - 1)) /* * 16KB bitmap granularity (x2 bits per entry). */ #define HAMMER2_FREEMAP_BLOCK_RADIX 14 #define HAMMER2_FREEMAP_BLOCK_SIZE (1 << HAMMER2_FREEMAP_BLOCK_RADIX) #define HAMMER2_FREEMAP_BLOCK_MASK (HAMMER2_FREEMAP_BLOCK_SIZE - 1) /* * bitmap[] structure. 2 bits per HAMMER2_FREEMAP_BLOCK_SIZE. * * 8 x 64-bit elements, 2 bits per block. * 32 blocks (radix 5) per element. * representing INDEX_SIZE bytes worth of storage per element. 
*/ typedef uint64_t hammer2_bitmap_t; #define HAMMER2_BMAP_ALLONES ((hammer2_bitmap_t)-1) #define HAMMER2_BMAP_ELEMENTS 8 #define HAMMER2_BMAP_BITS_PER_ELEMENT 64 #define HAMMER2_BMAP_INDEX_RADIX 5 /* 32 blocks per element */ #define HAMMER2_BMAP_BLOCKS_PER_ELEMENT (1 << HAMMER2_BMAP_INDEX_RADIX) #define HAMMER2_BMAP_INDEX_SIZE (HAMMER2_FREEMAP_BLOCK_SIZE * \ HAMMER2_BMAP_BLOCKS_PER_ELEMENT) #define HAMMER2_BMAP_INDEX_MASK (HAMMER2_BMAP_INDEX_SIZE - 1) #define HAMMER2_BMAP_SIZE (HAMMER2_BMAP_INDEX_SIZE * \ HAMMER2_BMAP_ELEMENTS) #define HAMMER2_BMAP_MASK (HAMMER2_BMAP_SIZE - 1) /* * Two linear areas can be reserved after the initial 4MB segment in the base * zone (the one starting at offset 0). These areas are NOT managed by the * block allocator and do not fall under HAMMER2 crc checking rules based * at the volume header (but can be self-CRCd internally, depending). */ #define HAMMER2_BOOT_MIN_BYTES HAMMER2_VOLUME_ALIGN #define HAMMER2_BOOT_NOM_BYTES (64*1024*1024) #define HAMMER2_BOOT_MAX_BYTES (256*1024*1024) #define HAMMER2_REDO_MIN_BYTES HAMMER2_VOLUME_ALIGN #define HAMMER2_REDO_NOM_BYTES (256*1024*1024) #define HAMMER2_REDO_MAX_BYTES (1024*1024*1024) /* * Most HAMMER2 types are implemented as unsigned 64-bit integers. * Transaction ids are monotonic. * * We utilize 32-bit iSCSI CRCs. */ typedef uint64_t hammer2_tid_t; typedef uint64_t hammer2_off_t; typedef uint64_t hammer2_key_t; typedef uint32_t hammer2_crc32_t; /* * Miscellaneous ranges (all are unsigned). */ #define HAMMER2_TID_MIN 1ULL #define HAMMER2_TID_MAX 0xFFFFFFFFFFFFFFFFULL #define HAMMER2_KEY_MIN 0ULL #define HAMMER2_KEY_MAX 0xFFFFFFFFFFFFFFFFULL #define HAMMER2_OFFSET_MIN 0ULL #define HAMMER2_OFFSET_MAX 0xFFFFFFFFFFFFFFFFULL /* * HAMMER2 data offset special cases and masking. * * All HAMMER2 data offsets have to be broken down into a 64K buffer base * offset (HAMMER2_OFF_MASK_HI) and a 64K buffer index (HAMMER2_OFF_MASK_LO). * * Indexes into physical buffers are always 64-byte aligned. 
The low 6 bits
 * of the data offset field specify, as a power of 2, how large the data
 * chunk being pointed to is.
* * This is important: The mount device path you specify serves to bootstrap * your entry into the cluster, but your mount will make active connections * to ALL copy elements in the hammer2_volconf[] array which match the * PFSID of the directory in the super-root that you specified. The local * media path does not have to be mentioned in this array but becomes part * of the cluster based on its type and access rights. ALL ELEMENTS ARE * TREATED ACCORDING TO TYPE NO MATTER WHICH ONE YOU MOUNT FROM. * * The actual cluster may be far larger than the elements you list in the * hammer2_volconf[] array. You list only the elements you wish to * directly connect to and you are able to access the rest of the cluster * indirectly through those connections. * * WARNING! This structure must be exactly 128 bytes long for its config * array to fit in the volume header. */ struct hammer2_volconf { uint8_t copyid; /* 00 copyid 0-255 (must match slot) */ uint8_t inprog; /* 01 operation in progress, or 0 */ uint8_t chain_to; /* 02 operation chaining to, or 0 */ uint8_t chain_from; /* 03 operation chaining from, or 0 */ uint16_t flags; /* 04-05 flags field */ uint8_t error; /* 06 last operational error */ uint8_t priority; /* 07 priority and round-robin flag */ uint8_t remote_pfs_type;/* 08 probed direct remote PFS type */ uint8_t reserved08[23]; /* 09-1F */ uuid_t pfs_clid; /* 20-2F copy target must match this uuid */ uint8_t label[16]; /* 30-3F import/export label */ uint8_t path[64]; /* 40-7F target specification string or key */ } __packed; typedef struct hammer2_volconf hammer2_volconf_t; #define DMSG_VOLF_ENABLED 0x0001 #define DMSG_VOLF_INPROG 0x0002 #define DMSG_VOLF_CONN_RR 0x80 /* round-robin at same priority */ #define DMSG_VOLF_CONN_EF 0x40 /* media errors flagged */ #define DMSG_VOLF_CONN_PRI 0x0F /* select priority 0-15 (15=best) */ struct dmsg_lnk_hammer2_volconf { dmsg_hdr_t head; hammer2_volconf_t copy; /* copy spec */ int32_t index; int32_t unused01; uuid_t 
mediaid; int64_t reserved02[32]; } __packed; typedef struct dmsg_lnk_hammer2_volconf dmsg_lnk_hammer2_volconf_t; #define DMSG_LNK_HAMMER2_VOLCONF DMSG_LNK(DMSG_LNK_CMD_HAMMER2_VOLCONF, \ dmsg_lnk_hammer2_volconf) #define H2_LNK_VOLCONF(msg) ((dmsg_lnk_hammer2_volconf_t *)(msg)->any.buf) /* * HAMMER2 directory entry header (embedded in blockref) exactly 16 bytes */ struct hammer2_dirent_head { hammer2_tid_t inum; /* inode number */ uint16_t namlen; /* name length */ uint8_t type; /* OBJTYPE_* */ uint8_t unused0B; uint8_t unused0C[4]; } __packed; typedef struct hammer2_dirent_head hammer2_dirent_head_t; /* * The media block reference structure. This forms the core of the HAMMER2 * media topology recursion. This 128-byte data structure is embedded in the * volume header, in inodes (which are also directory entries), and in * indirect blocks. * * A blockref references a single media item, which typically can be a * directory entry (aka inode), indirect block, or data block. * * The primary feature a blockref represents is the ability to validate * the entire tree underneath it via its check code. Any modification to * anything propagates up the blockref tree all the way to the root, replacing * the related blocks and compounding the generated check code. * * The check code can be a simple 32-bit iscsi code, a 64-bit crc, or as * complex as a 512 bit cryptographic hash. I originally used a 64-byte * blockref but later expanded it to 128 bytes to be able to support the * larger check code as well as to embed statistics for quota operation. * * Simple check codes are not sufficient for unverified dedup. Even with * a maximally-sized check code unverified dedup should only be used in - * in subdirectory trees where you do not need 100% data integrity. + * subdirectory trees where you do not need 100% data integrity. * * Unverified dedup is deduping based on meta-data only without verifying * that the data blocks are actually identical. 
Verified dedup guarantees * integrity but is a far more I/O-expensive operation. * * -- * * mirror_tid - per cluster node modified (propagated upward by flush) * modify_tid - clc record modified (not propagated). * update_tid - clc record updated (propagated upward on verification) * * CLC - Stands for 'Cluster Level Change', identifiers which are identical * within the topology across all cluster nodes (when fully * synchronized). * * NOTE: The range of keys represented by the blockref is (key) to * ((key) + (1LL << keybits) - 1). HAMMER2 usually populates * blocks bottom-up, inserting a new root when radix expansion * is required. * * leaf_count - Helps manage leaf collapse calculations when indirect * blocks become mostly empty. This value caps out at * HAMMER2_BLOCKREF_LEAF_MAX (65535). * * Used by the chain code to determine when to pull leafs up * from nearly empty indirect blocks. For the purposes of this * calculation, BREF_TYPE_INODE is considered a leaf, along * with DIRENT and DATA. * * RESERVED FIELDS * * A number of blockref fields are reserved and should generally be set to * 0 for future compatibility. * * FUTURE BLOCKREF EXPANSION * * CONTENT ADDRESSABLE INDEXING (future) - Using a 256 or 512-bit check code. 
*/ struct hammer2_blockref { /* MUST BE EXACTLY 64 BYTES */ uint8_t type; /* type of underlying item */ uint8_t methods; /* check method & compression method */ uint8_t copyid; /* specify which copy this is */ uint8_t keybits; /* #of keybits masked off 0=leaf */ uint8_t vradix; /* virtual data/meta-data size */ uint8_t flags; /* blockref flags */ uint16_t leaf_count; /* leaf aggregation count */ hammer2_key_t key; /* key specification */ hammer2_tid_t mirror_tid; /* media flush topology & freemap */ hammer2_tid_t modify_tid; /* clc modify (not propagated) */ hammer2_off_t data_off; /* low 6 bits is phys size (radix)*/ hammer2_tid_t update_tid; /* clc modify (propagated upward) */ union { char buf[16]; /* * Directory entry header (BREF_TYPE_DIRENT) * * NOTE: check.buf contains filename if <= 64 bytes. Longer * filenames are stored in a data reference of size * HAMMER2_ALLOC_MIN (at least 256, typically 1024). * * NOTE: inode structure may contain a copy of a recently * associated filename, for recovery purposes. * * NOTE: Superroot entries are INODEs, not DIRENTs. Code * allows both cases. */ hammer2_dirent_head_t dirent; /* * Statistics aggregation (BREF_TYPE_INODE, BREF_TYPE_INDIRECT) */ struct { hammer2_key_t data_count; hammer2_key_t inode_count; } stats; } embed; union { /* check info */ char buf[64]; struct { uint32_t value; uint32_t reserved[15]; } iscsi32; struct { uint64_t value; uint64_t reserved[7]; } xxhash64; struct { char data[24]; char reserved[40]; } sha192; struct { char data[32]; char reserved[32]; } sha256; struct { char data[64]; } sha512; /* * Freemap hints are embedded in addition to the icrc32. * * bigmask - Radixes available for allocation (0-31). * Heuristical (may be permissive but not * restrictive). Typically only radix values * 10-16 are used (i.e. (1<<10) through (1<<16)). 
* * avail - Total available space remaining, in bytes */ struct { uint32_t icrc32; uint32_t bigmask; /* available radixes */ uint64_t avail; /* total available bytes */ char reserved[48]; } freemap; } check; } __packed; typedef struct hammer2_blockref hammer2_blockref_t; #define HAMMER2_BLOCKREF_BYTES 128 /* blockref struct in bytes */ #define HAMMER2_BLOCKREF_RADIX 7 #define HAMMER2_BLOCKREF_LEAF_MAX 65535 /* * On-media and off-media blockref types. * * types >= 128 are pseudo values that should never be present on-media. */ #define HAMMER2_BREF_TYPE_EMPTY 0 #define HAMMER2_BREF_TYPE_INODE 1 #define HAMMER2_BREF_TYPE_INDIRECT 2 #define HAMMER2_BREF_TYPE_DATA 3 #define HAMMER2_BREF_TYPE_DIRENT 4 #define HAMMER2_BREF_TYPE_FREEMAP_NODE 5 #define HAMMER2_BREF_TYPE_FREEMAP_LEAF 6 #define HAMMER2_BREF_TYPE_FREEMAP 254 /* pseudo-type */ #define HAMMER2_BREF_TYPE_VOLUME 255 /* pseudo-type */ #define HAMMER2_BREF_FLAG_PFSROOT 0x01 /* see also related opflag */ #define HAMMER2_BREF_FLAG_ZERO 0x02 /* * Encode/decode check mode and compression mode for * bref.methods. The compression level is not encoded in * bref.methods. */ #define HAMMER2_ENC_CHECK(n) (((n) & 15) << 4) #define HAMMER2_DEC_CHECK(n) (((n) >> 4) & 15) #define HAMMER2_ENC_COMP(n) ((n) & 15) #define HAMMER2_DEC_COMP(n) ((n) & 15) #define HAMMER2_CHECK_NONE 0 #define HAMMER2_CHECK_DISABLED 1 #define HAMMER2_CHECK_ISCSI32 2 #define HAMMER2_CHECK_XXHASH64 3 #define HAMMER2_CHECK_SHA192 4 #define HAMMER2_CHECK_FREEMAP 5 #define HAMMER2_CHECK_DEFAULT HAMMER2_CHECK_XXHASH64 /* user-specifiable check modes only */ #define HAMMER2_CHECK_STRINGS { "none", "disabled", "crc32", \ "xxhash64", "sha192" } #define HAMMER2_CHECK_STRINGS_COUNT 5 /* * Encode/decode check or compression algorithm request in * ipdata->meta.check_algo and ipdata->meta.comp_algo. 
*/ #define HAMMER2_ENC_ALGO(n) (n) #define HAMMER2_DEC_ALGO(n) ((n) & 15) #define HAMMER2_ENC_LEVEL(n) ((n) << 4) #define HAMMER2_DEC_LEVEL(n) (((n) >> 4) & 15) #define HAMMER2_COMP_NONE 0 #define HAMMER2_COMP_AUTOZERO 1 #define HAMMER2_COMP_LZ4 2 #define HAMMER2_COMP_ZLIB 3 #define HAMMER2_COMP_NEWFS_DEFAULT HAMMER2_COMP_LZ4 #define HAMMER2_COMP_STRINGS { "none", "autozero", "lz4", "zlib" } #define HAMMER2_COMP_STRINGS_COUNT 4 /* * Passed to hammer2_chain_create(), causes methods to be inherited from * parent. */ #define HAMMER2_METH_DEFAULT -1 /* * HAMMER2 block references are collected into sets of 4 blockrefs. These * sets are fully associative, meaning the elements making up a set are * not sorted in any way and may contain duplicate entries, holes, or * entries which shortcut multiple levels of indirection. Sets are used * in various ways: * * (1) When redundancy is desired a set may contain several duplicate * entries pointing to different copies of the same data. Up to 4 copies * are supported. * * (2) The blockrefs in a set can shortcut multiple levels of indirections * within the bounds imposed by the parent of set. * * When a set fills up another level of indirection is inserted, moving * some or all of the set's contents into indirect blocks placed under the * set. This is a top-down approach in that indirect blocks are not created * until the set actually becomes full (that is, the entries in the set can * shortcut the indirect blocks when the set is not full). Depending on how * things are filled multiple indirect blocks will eventually be created. * * Indirect blocks are typically 4KB (64 entres) or 64KB (1024 entries) and * are also treated as fully set-associative. 
*/ struct hammer2_blockset { hammer2_blockref_t blockref[HAMMER2_SET_COUNT]; }; typedef struct hammer2_blockset hammer2_blockset_t; /* * Catch programmer snafus */ #if (1 << HAMMER2_SET_RADIX) != HAMMER2_SET_COUNT #error "hammer2 direct radix is incorrect" #endif #if (1 << HAMMER2_PBUFRADIX) != HAMMER2_PBUFSIZE #error "HAMMER2_PBUFRADIX and HAMMER2_PBUFSIZE are inconsistent" #endif #if (1 << HAMMER2_RADIX_MIN) != HAMMER2_ALLOC_MIN #error "HAMMER2_RADIX_MIN and HAMMER2_ALLOC_MIN are inconsistent" #endif /* * hammer2_bmap_data - A freemap entry in the LEVEL1 block. * * Each 128-byte entry contains the bitmap and meta-data required to manage * a LEVEL0 (4MB) block of storage. The storage is managed in 256 x 16KB * chunks. * * A smaller allocation granularity is supported via a linear iterator and/or * must otherwise be tracked in ram. * * (data structure must be 128 bytes exactly) * * linear - A BYTE linear allocation offset used for sub-16KB allocations * only. May contain values between 0 and 4MB. Must be ignored * if 16KB-aligned (i.e. force bitmap scan), otherwise may be * used to sub-allocate within the 16KB block (which is already * marked as allocated in the bitmap). * * Sub-allocations need only be 1KB-aligned and do not have to be * size-aligned, and 16KB or larger allocations do not update this * field, resulting in pretty good packing. * * Please note that file data granularity may be limited by * other issues such as buffer cache direct-mapping and the * desire to support sector sizes up to 16KB (so H2 only issues * I/O's in multiples of 16KB anyway). * * class - Clustering class. Cleared to 0 only if the entire leaf becomes * free. Used to cluster device buffers so all elements must have * the same device block size, but may mix logical sizes. * * Typically integrated with the blockref type in the upper 8 bits * to localize inodes and indrect blocks, improving bulk free scans * and directory scans. 
* * bitmap - Two bits per 16KB allocation block arranged in arrays of * 64-bit elements, 256x2 bits representing ~4MB worth of media * storage. Bit patterns are as follows: * * 00 Unallocated * 01 (reserved) * 10 Possibly free * 11 Allocated */ struct hammer2_bmap_data { int32_t linear; /* 00 linear sub-granular allocation offset */ uint16_t class; /* 04-05 clustering class ((type<<8)|radix) */ uint8_t reserved06; /* 06 */ uint8_t reserved07; /* 07 */ uint32_t reserved08; /* 08 */ uint32_t reserved0C; /* 0C */ uint32_t reserved10; /* 10 */ uint32_t reserved14; /* 14 */ uint32_t reserved18; /* 18 */ uint32_t avail; /* 1C */ uint32_t reserved20[8]; /* 20-3F 256 bits manages 128K/1KB/2-bits */ /* 40-7F 512 bits manages 4MB of storage */ hammer2_bitmap_t bitmapq[HAMMER2_BMAP_ELEMENTS]; } __packed; typedef struct hammer2_bmap_data hammer2_bmap_data_t; /* * XXX "Inodes ARE directory entries" is no longer the case. Hardlinks are * dirents which refer to the same inode#, which is how filesystems usually * implement hardlink. The following comments need to be updated. * * In HAMMER2 inodes ARE directory entries, with a special exception for * hardlinks. The inode number is stored in the inode rather than being * based on the location of the inode (since the location moves every time * the inode or anything underneath the inode is modified). * * The inode is 1024 bytes, made up of 256 bytes of meta-data, 256 bytes * for the filename, and 512 bytes worth of direct file data OR an embedded * blockset. The in-memory hammer2_inode structure contains only the mostly- * node-independent meta-data portion (some flags are node-specific and will * not be synchronized). The rest of the inode is node-specific and chain I/O * is required to obtain it. * * Directories represent one inode per blockref. Inodes are not laid out * as a file but instead are represented by the related blockrefs. The * blockrefs, in turn, are indexed by the 64-bit directory hash key. 
Remember * that blocksets are fully associative, so a certain degree efficiency is * achieved just from that. * * Up to 512 bytes of direct data can be embedded in an inode, and since * inodes are essentially directory entries this also means that small data * files end up simply being laid out linearly in the directory, resulting * in fewer seeks and highly optimal access. * * The compression mode can be changed at any time in the inode and is * recorded on a blockref-by-blockref basis. * * Hardlinks are supported via the inode map. Essentially the way a hardlink * works is that all individual directory entries representing the same file * are special cased and specify the same inode number. The actual file * is placed in the nearest parent directory that is parent to all instances * of the hardlink. If all hardlinks to a file are in the same directory * the actual file will also be placed in that directory. This file uses * the inode number as the directory entry key and is invisible to normal * directory scans. Real directory entry keys are differentiated from the * inode number key via bit 63. Access to the hardlink silently looks up * the real file and forwards all operations to that file. Removal of the * last hardlink also removes the real file. 
*/ #define HAMMER2_INODE_BYTES 1024 /* (asserted by code) */ #define HAMMER2_INODE_MAXNAME 256 /* maximum name in bytes */ #define HAMMER2_INODE_VERSION_ONE 1 #define HAMMER2_INODE_START 1024 /* dynamically allocated */ struct hammer2_inode_meta { uint16_t version; /* 0000 inode data version */ uint8_t reserved02; /* 0002 */ uint8_t pfs_subtype; /* 0003 pfs sub-type */ /* * core inode attributes, inode type, misc flags */ uint32_t uflags; /* 0004 chflags */ uint32_t rmajor; /* 0008 available for device nodes */ uint32_t rminor; /* 000C available for device nodes */ uint64_t ctime; /* 0010 inode change time */ uint64_t mtime; /* 0018 modified time */ uint64_t atime; /* 0020 access time (unsupported) */ uint64_t btime; /* 0028 birth time */ uuid_t uid; /* 0030 uid / degenerate unix uid */ uuid_t gid; /* 0040 gid / degenerate unix gid */ uint8_t type; /* 0050 object type */ uint8_t op_flags; /* 0051 operational flags */ uint16_t cap_flags; /* 0052 capability flags */ uint32_t mode; /* 0054 unix modes (typ low 16 bits) */ /* * inode size, identification, localized recursive configuration * for compression and backup copies. * * NOTE: Nominal parent inode number (iparent) is only applicable * for directories but can also help for files during * catastrophic recovery. */ hammer2_tid_t inum; /* 0058 inode number */ hammer2_off_t size; /* 0060 size of file */ uint64_t nlinks; /* 0068 hard links (typ only dirs) */ hammer2_tid_t iparent; /* 0070 nominal parent inum */ hammer2_key_t name_key; /* 0078 full filename key */ uint16_t name_len; /* 0080 filename length */ uint8_t ncopies; /* 0082 ncopies to local media */ uint8_t comp_algo; /* 0083 compression request & algo */ /* * These fields are currently only applicable to PFSROOTs. * * NOTE: We can't use {volume_data->fsid, pfs_clid} to uniquely * identify an instance of a PFS in the cluster because * a mount may contain more than one copy of the PFS as * a separate node. 
{pfs_clid, pfs_fsid} must be used for * registration in the cluster. */ uint8_t target_type; /* 0084 hardlink target type */ uint8_t check_algo; /* 0085 check code request & algo */ uint8_t pfs_nmasters; /* 0086 (if PFSROOT) if multi-master */ uint8_t pfs_type; /* 0087 (if PFSROOT) node type */ uint64_t pfs_inum; /* 0088 (if PFSROOT) inum allocator */ uuid_t pfs_clid; /* 0090 (if PFSROOT) cluster uuid */ uuid_t pfs_fsid; /* 00A0 (if PFSROOT) unique uuid */ /* * Quotas and aggregate sub-tree inode and data counters. Note that * quotas are not replicated downward, they are explicitly set by * the sysop and in-memory structures keep track of inheritance. */ hammer2_key_t data_quota; /* 00B0 subtree quota in bytes */ hammer2_key_t unusedB8; /* 00B8 subtree byte count */ hammer2_key_t inode_quota; /* 00C0 subtree quota inode count */ hammer2_key_t unusedC8; /* 00C8 subtree inode count */ /* * The last snapshot tid is tested against modify_tid to determine * when a copy must be made of a data block whose check mode has been * disabled (a disabled check mode allows data blocks to be updated * in place instead of copy-on-write). */ hammer2_tid_t pfs_lsnap_tid; /* 00D0 last snapshot tid */ hammer2_tid_t reservedD8; /* 00D8 (avail) */ /* * Tracks (possibly degenerate) free areas covering all sub-tree * allocations under inode, not counting the inode itself. * 0/0 indicates empty entry. fully set-associative. 
* * (not yet implemented) */ uint64_t decrypt_check; /* 00E0 decryption validator */ hammer2_off_t reservedE0[3]; /* 00E8/F0/F8 */ } __packed; typedef struct hammer2_inode_meta hammer2_inode_meta_t; struct hammer2_inode_data { hammer2_inode_meta_t meta; /* 0000-00FF */ unsigned char filename[HAMMER2_INODE_MAXNAME]; /* 0100-01FF (256 char, unterminated) */ union { /* 0200-03FF (64x8 = 512 bytes) */ hammer2_blockset_t blockset; char data[HAMMER2_EMBEDDED_BYTES]; } u; } __packed; typedef struct hammer2_inode_data hammer2_inode_data_t; #define HAMMER2_OPFLAG_DIRECTDATA 0x01 #define HAMMER2_OPFLAG_PFSROOT 0x02 /* (see also bref flag) */ #define HAMMER2_OPFLAG_COPYIDS 0x04 /* copyids override parent */ #define HAMMER2_OBJTYPE_UNKNOWN 0 #define HAMMER2_OBJTYPE_DIRECTORY 1 #define HAMMER2_OBJTYPE_REGFILE 2 #define HAMMER2_OBJTYPE_FIFO 4 #define HAMMER2_OBJTYPE_CDEV 5 #define HAMMER2_OBJTYPE_BDEV 6 #define HAMMER2_OBJTYPE_SOFTLINK 7 #define HAMMER2_OBJTYPE_UNUSED08 8 #define HAMMER2_OBJTYPE_SOCKET 9 #define HAMMER2_OBJTYPE_WHITEOUT 10 #define HAMMER2_COPYID_NONE 0 #define HAMMER2_COPYID_LOCAL ((uint8_t)-1) #define HAMMER2_COPYID_COUNT 256 /* * PFS types identify the role of a PFS within a cluster. The PFS types * is stored on media and in LNK_SPAN messages and used in other places. * * The low 4 bits specify the current active type while the high 4 bits * specify the transition target if the PFS is being upgraded or downgraded, * If the upper 4 bits are not zero it may effect how a PFS is used during * the transition. * * Generally speaking, downgrading a MASTER to a SLAVE cannot complete until * at least all MASTERs have updated their pfs_nmasters field. And upgrading * a SLAVE to a MASTER cannot complete until the new prospective master has * been fully synchronized (though theoretically full synchronization is * not required if a (new) quorum of other masters are fully synchronized). 
* * It generally does not matter which PFS element you actually mount, you * are mounting 'the cluster'. So, for example, a network mount will mount * a DUMMY PFS type on a memory filesystem. However, there are two exceptions. * In order to gain the benefits of a SOFT_MASTER or SOFT_SLAVE, those PFSs * must be directly mounted. */ #define HAMMER2_PFSTYPE_NONE 0x00 #define HAMMER2_PFSTYPE_CACHE 0x01 #define HAMMER2_PFSTYPE_UNUSED02 0x02 #define HAMMER2_PFSTYPE_SLAVE 0x03 #define HAMMER2_PFSTYPE_SOFT_SLAVE 0x04 #define HAMMER2_PFSTYPE_SOFT_MASTER 0x05 #define HAMMER2_PFSTYPE_MASTER 0x06 #define HAMMER2_PFSTYPE_UNUSED07 0x07 #define HAMMER2_PFSTYPE_SUPROOT 0x08 #define HAMMER2_PFSTYPE_DUMMY 0x09 #define HAMMER2_PFSTYPE_MAX 16 #define HAMMER2_PFSTRAN_NONE 0x00 /* no transition in progress */ #define HAMMER2_PFSTRAN_CACHE 0x10 #define HAMMER2_PFSTRAN_UNMUSED20 0x20 #define HAMMER2_PFSTRAN_SLAVE 0x30 #define HAMMER2_PFSTRAN_SOFT_SLAVE 0x40 #define HAMMER2_PFSTRAN_SOFT_MASTER 0x50 #define HAMMER2_PFSTRAN_MASTER 0x60 #define HAMMER2_PFSTRAN_UNUSED70 0x70 #define HAMMER2_PFSTRAN_SUPROOT 0x80 #define HAMMER2_PFSTRAN_DUMMY 0x90 #define HAMMER2_PFS_DEC(n) ((n) & 0x0F) #define HAMMER2_PFS_DEC_TRANSITION(n) (((n) >> 4) & 0x0F) #define HAMMER2_PFS_ENC_TRANSITION(n) (((n) & 0x0F) << 4) #define HAMMER2_PFSSUBTYPE_NONE 0 #define HAMMER2_PFSSUBTYPE_SNAPSHOT 1 /* manual/managed snapshot */ #define HAMMER2_PFSSUBTYPE_AUTOSNAP 2 /* automatic snapshot */ /* * PFS mode of operation is a bitmask. This is typically not stored * on-media, but defined here because the field may be used in dmsgs. */ #define HAMMER2_PFSMODE_QUORUM 0x01 #define HAMMER2_PFSMODE_RW 0x02 /* * Allocation Table * */ /* * Flags (8 bits) - blockref, for freemap only * * Note that the minimum chunk size is 1KB so we could theoretically have * 10 bits here, but we might have some future extension that allows a * chunk size down to 256 bytes and if so we will need bits 8 and 9. 
*/ #define HAMMER2_AVF_SELMASK 0x03 /* select group */ #define HAMMER2_AVF_ALL_ALLOC 0x04 /* indicate all allocated */ #define HAMMER2_AVF_ALL_FREE 0x08 /* indicate all free */ #define HAMMER2_AVF_RESERVED10 0x10 #define HAMMER2_AVF_RESERVED20 0x20 #define HAMMER2_AVF_RESERVED40 0x40 #define HAMMER2_AVF_RESERVED80 0x80 #define HAMMER2_AVF_AVMASK32 ((uint32_t)0xFFFFFF00LU) #define HAMMER2_AVF_AVMASK64 ((uint64_t)0xFFFFFFFFFFFFFF00LLU) #define HAMMER2_AV_SELECT_A 0x00 #define HAMMER2_AV_SELECT_B 0x01 #define HAMMER2_AV_SELECT_C 0x02 #define HAMMER2_AV_SELECT_D 0x03 /* * The volume header eats a 64K block. There is currently an issue where * we want to try to fit all nominal filesystem updates in a 512-byte section * but it may be a lost cause due to the need for a blockset. * * All information is stored in host byte order. The volume header's magic * number may be checked to determine the byte order. If you wish to mount * between machines w/ different endian modes you'll need filesystem code * which acts on the media data consistently (either all one way or all the * other). Our code currently does not do that. * * A read-write mount may have to recover missing allocations by doing an * incremental mirror scan looking for modifications made after alloc_tid. * If alloc_tid == last_tid then no recovery operation is needed. Recovery * operations are usually very, very fast. * * Read-only mounts do not need to do any recovery, access to the filesystem * topology is always consistent after a crash (is always consistent, period). * However, there may be shortcutted blockref updates present from deep in * the tree which are stored in the volume header and must be tracked on * the fly. * * NOTE: The copyinfo[] array contains the configuration for both the * cluster connections and any local media copies. The volume * header will be replicated for each local media copy. 
* * The mount command may specify multiple medias or just one and * allow HAMMER2 to pick up the others when it checks the copyinfo[] * array on mount. * * NOTE: root_blockref points to the super-root directory, not the root * directory. The root directory will be a subdirectory under the * super-root. * * The super-root directory contains all root directories and all * snapshots (readonly or writable). It is possible to do a * null-mount of the super-root using special path constructions * relative to your mounted root. * * NOTE: HAMMER2 allows any subdirectory tree to be managed as if it were * a PFS, including mirroring and storage quota operations, and this is * preferred over creating discrete PFSs in the super-root. Instead * the super-root is most typically used to create writable snapshots, * alternative roots, and so forth. The super-root is also used by * the automatic snapshotting mechanism. */ #define HAMMER2_VOLUME_ID_HBO 0x48414d3205172011LLU #define HAMMER2_VOLUME_ID_ABO 0x11201705324d4148LLU struct hammer2_volume_data { /* * sector #0 - 512 bytes */ uint64_t magic; /* 0000 Signature */ hammer2_off_t boot_beg; /* 0008 Boot area (future) */ hammer2_off_t boot_end; /* 0010 (size = end - beg) */ hammer2_off_t aux_beg; /* 0018 Aux area (future) */ hammer2_off_t aux_end; /* 0020 (size = end - beg) */ hammer2_off_t volu_size; /* 0028 Volume size, bytes */ uint32_t version; /* 0030 */ uint32_t flags; /* 0034 */ uint8_t copyid; /* 0038 copyid of phys vol */ uint8_t freemap_version; /* 0039 freemap algorithm */ uint8_t peer_type; /* 003A HAMMER2_PEER_xxx */ uint8_t reserved003B; /* 003B */ uint32_t reserved003C; /* 003C */ uuid_t fsid; /* 0040 */ uuid_t fstype; /* 0050 */ /* * allocator_size is precalculated at newfs time and does not include * reserved blocks, boot, or redo areas. * * Initial non-reserved-area allocations do not use the freemap * but instead adjust alloc_iterator. Dynamic allocations take * over starting at (allocator_beg). 
This makes newfs_hammer2's * job a lot easier and can also serve as a testing jig. */ hammer2_off_t allocator_size; /* 0060 Total data space */ hammer2_off_t allocator_free; /* 0068 Free space */ hammer2_off_t allocator_beg; /* 0070 Initial allocations */ /* * mirror_tid reflects the highest committed change for this * block device regardless of whether it is to the super-root * or to a PFS or whatever. * * freemap_tid reflects the highest committed freemap change for * this block device. */ hammer2_tid_t mirror_tid; /* 0078 committed tid (vol) */ hammer2_tid_t reserved0080; /* 0080 */ hammer2_tid_t reserved0088; /* 0088 */ hammer2_tid_t freemap_tid; /* 0090 committed tid (fmap) */ hammer2_tid_t bulkfree_tid; /* 0098 bulkfree incremental */ hammer2_tid_t reserved00A0[5]; /* 00A0-00C7 */ /* * Copyids are allocated dynamically from the copyexists bitmap. * An id from the active copies set (up to 8, see copyinfo later on) * may still exist after the copy set has been removed from the * volume header and its bit will remain active in the bitmap and * cannot be reused until it is 100% removed from the hierarchy. */ uint32_t copyexists[8]; /* 00C8-00E7 copy exists bmap */ char reserved0140[248]; /* 00E8-01DF */ /* * 32 bit CRC array at the end of the first 512 byte sector. * * icrc_sects[7] - First 512-4 bytes of volume header (including all * the other icrc's except this one). * * icrc_sects[6] - Sector 1 (512 bytes) of volume header, which is * the blockset for the root. * * icrc_sects[5] - Sector 2 * icrc_sects[4] - Sector 3 * icrc_sects[3] - Sector 4 (the freemap blockset) */ hammer2_crc32_t icrc_sects[8]; /* 01E0-01FF */ /* * sector #1 - 512 bytes * * The entire sector is used by a blockset. 
*/ hammer2_blockset_t sroot_blockset; /* 0200-03FF Superroot dir */ /* * sector #2-7 */ char sector2[512]; /* 0400-05FF reserved */ char sector3[512]; /* 0600-07FF reserved */ hammer2_blockset_t freemap_blockset; /* 0800-09FF freemap */ char sector5[512]; /* 0A00-0BFF reserved */ char sector6[512]; /* 0C00-0DFF reserved */ char sector7[512]; /* 0E00-0FFF reserved */ /* * sector #8-71 - 32768 bytes * * Contains the configuration for up to 256 copyinfo targets. These * specify local and remote copies operating as masters or slaves. * copyid's 0 and 255 are reserved (0 indicates an empty slot and 255 * indicates the local media). * * Each inode contains a set of up to 8 copyids, either inherited * from its parent or explicitly specified in the inode, which * indexes into this array. */ /* 1000-8FFF copyinfo config */ hammer2_volconf_t copyinfo[HAMMER2_COPYID_COUNT]; /* * Remaining sections are reserved for future use. */ char reserved0400[0x6FFC]; /* 9000-FFFB reserved */ /* * icrc on entire volume header */ hammer2_crc32_t icrc_volheader; /* FFFC-FFFF full volume icrc*/ } __packed; typedef struct hammer2_volume_data hammer2_volume_data_t; /* * Various parts of the volume header have their own iCRCs. * * The first 512 bytes has its own iCRC stored at the end of the 512 bytes * and not included the icrc calculation. * * The second 512 bytes also has its own iCRC but it is stored in the first * 512 bytes so it covers the entire second 512 bytes. * * The whole volume block (64KB) has an iCRC covering all but the last 4 bytes, * which is where the iCRC for the whole volume is stored. This is currently * a catch-all for anything not individually iCRCd. 
*/ #define HAMMER2_VOL_ICRC_SECT0 7 #define HAMMER2_VOL_ICRC_SECT1 6 #define HAMMER2_VOLUME_BYTES 65536 #define HAMMER2_VOLUME_ICRC0_OFF 0 #define HAMMER2_VOLUME_ICRC1_OFF 512 #define HAMMER2_VOLUME_ICRCVH_OFF 0 #define HAMMER2_VOLUME_ICRC0_SIZE (512 - 4) #define HAMMER2_VOLUME_ICRC1_SIZE (512) #define HAMMER2_VOLUME_ICRCVH_SIZE (65536 - 4) #define HAMMER2_VOL_VERSION_MIN 1 #define HAMMER2_VOL_VERSION_DEFAULT 1 #define HAMMER2_VOL_VERSION_WIP 2 #define HAMMER2_NUM_VOLHDRS 4 union hammer2_media_data { hammer2_volume_data_t voldata; hammer2_inode_data_t ipdata; hammer2_blockset_t blkset; hammer2_blockref_t npdata[HAMMER2_IND_COUNT_MAX]; hammer2_bmap_data_t bmdata[HAMMER2_FREEMAP_COUNT]; char buf[HAMMER2_PBUFSIZE]; } __packed; typedef union hammer2_media_data hammer2_media_data_t; #endif /* !_HAMMER2_DISK_H_ */ diff --git a/usr.sbin/kldxref/kldxref.c b/usr.sbin/kldxref/kldxref.c index 8f3d24718211..c88769ce1824 100644 --- a/usr.sbin/kldxref/kldxref.c +++ b/usr.sbin/kldxref/kldxref.c @@ -1,852 +1,852 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2000, Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Boris Popov. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ef.h" #define MAXRECSIZE (64 << 10) /* 64k */ #define check(val) if ((error = (val)) != 0) break static bool dflag; /* do not create a hint file, only write on stdout */ static int verbose; static FILE *fxref; /* current hints file */ static int byte_order; static GElf_Ehdr ehdr; static char *ehdr_filename; static const char *xref_file = "linker.hints"; /* * A record is stored in the static buffer recbuf before going to disk. */ static char recbuf[MAXRECSIZE]; static int recpos; /* current write position */ static int reccnt; /* total record written to this file so far */ static void intalign(void) { recpos = roundup2(recpos, sizeof(int)); } static void write_int(int val) { char buf[4]; assert(byte_order != ELFDATANONE); if (byte_order == ELFDATA2LSB) le32enc(buf, val); else be32enc(buf, val); fwrite(buf, sizeof(buf), 1, fxref); } static void record_start(void) { recpos = 0; memset(recbuf, 0, MAXRECSIZE); } static int record_end(void) { if (recpos == 0) { /* * Pretend to have written a record in debug mode so * the architecture check works. 
*/ if (dflag) reccnt++; return (0); } if (reccnt == 0) { /* File version record. */ write_int(1); } reccnt++; intalign(); write_int(recpos); return (fwrite(recbuf, recpos, 1, fxref) != 1 ? errno : 0); } static int record_buf(const void *buf, size_t size) { if (MAXRECSIZE - recpos < size) errx(1, "record buffer overflow"); memcpy(recbuf + recpos, buf, size); recpos += size; return (0); } /* * An int is stored in target byte order and aligned */ static int record_int(int val) { char buf[4]; assert(byte_order != ELFDATANONE); if (byte_order == ELFDATA2LSB) le32enc(buf, val); else be32enc(buf, val); intalign(); return (record_buf(buf, sizeof(buf))); } /* * A string is stored as 1-byte length plus data, no padding */ static int record_string(const char *str) { int error; size_t len; u_char val; if (dflag) return (0); val = len = strlen(str); if (len > 255) errx(1, "string %s too long", str); error = record_buf(&val, sizeof(val)); if (error != 0) return (error); return (record_buf(str, len)); } /* From sys/isa/pnp.c */ static char * pnp_eisaformat(uint32_t id) { uint8_t *data; static char idbuf[8]; const char hextoascii[] = "0123456789abcdef"; id = htole32(id); data = (uint8_t *)&id; idbuf[0] = '@' + ((data[0] & 0x7c) >> 2); idbuf[1] = '@' + (((data[0] & 0x3) << 3) + ((data[1] & 0xe0) >> 5)); idbuf[2] = '@' + (data[1] & 0x1f); idbuf[3] = hextoascii[(data[2] >> 4)]; idbuf[4] = hextoascii[(data[2] & 0xf)]; idbuf[5] = hextoascii[(data[3] >> 4)]; idbuf[6] = hextoascii[(data[3] & 0xf)]; idbuf[7] = 0; return (idbuf); } struct pnp_elt { int pe_kind; /* What kind of entry */ #define TYPE_SZ_MASK 0x0f #define TYPE_FLAGGED 0x10 /* all f's is a wildcard */ #define TYPE_INT 0x20 /* Is a number */ #define TYPE_PAIRED 0x40 #define TYPE_LE 0x80 /* Matches <= this value */ #define TYPE_GE 0x100 /* Matches >= this value */ #define TYPE_MASK 0x200 /* Specifies a mask to follow */ #define TYPE_U8 (1 | TYPE_INT) #define TYPE_V8 (1 | TYPE_INT | TYPE_FLAGGED) #define TYPE_G16 (2 | TYPE_INT | 
TYPE_GE) #define TYPE_L16 (2 | TYPE_INT | TYPE_LE) #define TYPE_M16 (2 | TYPE_INT | TYPE_MASK) #define TYPE_U16 (2 | TYPE_INT) #define TYPE_V16 (2 | TYPE_INT | TYPE_FLAGGED) #define TYPE_U32 (4 | TYPE_INT) #define TYPE_V32 (4 | TYPE_INT | TYPE_FLAGGED) #define TYPE_W32 (4 | TYPE_INT | TYPE_PAIRED) #define TYPE_D 7 #define TYPE_Z 8 #define TYPE_P 9 #define TYPE_E 10 #define TYPE_T 11 int pe_offset; /* Offset within the element */ char * pe_key; /* pnp key name */ TAILQ_ENTRY(pnp_elt) next; /* Link */ }; typedef TAILQ_HEAD(pnp_head, pnp_elt) pnp_list; /* * this function finds the data from the pnp table, as described by the - * the description and creates a new output (new_desc). This output table + * description and creates a new output (new_desc). This output table * is a form that's easier for the agent that's automatically loading the * modules. * * The format output is the simplified string from this routine in the * same basic format as the pnp string, as documented in sys/module.h. * First a string describing the format is output, then a count of the * number of records, then each record. The format string also describes * the length of each entry (though it isn't a fixed length when strings * are present). * * type Output Meaning * I uint32_t Integer equality comparison * J uint32_t Pair of uint16_t fields converted to native * byte order. The two fields both must match. * G uint32_t Greater than or equal to * L uint32_t Less than or equal to * M uint32_t Mask of which fields to test. Fields that * take up space increment the count. This * field must be first, and resets the count. * D string Description of the device this pnp info is for * Z string pnp string must match this * T nothing T fields set pnp values that must be true for * the entire table. * Values are packed the same way that other values are packed in this file. * Strings and int32_t's start on a 32-bit boundary and are padded with 0 * bytes. 
Objects that are smaller than uint32_t are converted, without * sign extension to uint32_t to simplify parsing downstream. */ static int parse_pnp_list(struct elf_file *ef, const char *desc, char **new_desc, pnp_list *list) { const char *walker, *ep; const char *colon, *semi; struct pnp_elt *elt; char type[8], key[32]; int off; size_t new_desc_size; FILE *fp; TAILQ_INIT(list); walker = desc; ep = desc + strlen(desc); off = 0; fp = open_memstream(new_desc, &new_desc_size); if (fp == NULL) err(1, "Could not open new memory stream"); if (verbose > 1) printf("Converting %s into a list\n", desc); while (walker < ep) { colon = strchr(walker, ':'); semi = strchr(walker, ';'); if (semi != NULL && semi < colon) goto err; if (colon - walker > sizeof(type)) goto err; strncpy(type, walker, colon - walker); type[colon - walker] = '\0'; if (semi != NULL) { if (semi - colon >= sizeof(key)) goto err; strncpy(key, colon + 1, semi - colon - 1); key[semi - colon - 1] = '\0'; walker = semi + 1; /* Fail safe if we have spaces after ; */ while (walker < ep && isspace(*walker)) walker++; } else { if (strlen(colon + 1) >= sizeof(key)) goto err; strcpy(key, colon + 1); walker = ep; } if (verbose > 1) printf("Found type %s for name %s\n", type, key); /* Skip pointer place holders */ if (strcmp(type, "P") == 0) { off += elf_pointer_size(ef); continue; } /* * Add a node of the appropriate type */ elt = malloc(sizeof(struct pnp_elt) + strlen(key) + 1); TAILQ_INSERT_TAIL(list, elt, next); elt->pe_key = (char *)(elt + 1); elt->pe_offset = off; if (strcmp(type, "U8") == 0) elt->pe_kind = TYPE_U8; else if (strcmp(type, "V8") == 0) elt->pe_kind = TYPE_V8; else if (strcmp(type, "G16") == 0) elt->pe_kind = TYPE_G16; else if (strcmp(type, "L16") == 0) elt->pe_kind = TYPE_L16; else if (strcmp(type, "M16") == 0) elt->pe_kind = TYPE_M16; else if (strcmp(type, "U16") == 0) elt->pe_kind = TYPE_U16; else if (strcmp(type, "V16") == 0) elt->pe_kind = TYPE_V16; else if (strcmp(type, "U32") == 0) elt->pe_kind = 
TYPE_U32; else if (strcmp(type, "V32") == 0) elt->pe_kind = TYPE_V32; else if (strcmp(type, "W32") == 0) elt->pe_kind = TYPE_W32; else if (strcmp(type, "D") == 0) /* description char * */ elt->pe_kind = TYPE_D; else if (strcmp(type, "Z") == 0) /* char * to match */ elt->pe_kind = TYPE_Z; else if (strcmp(type, "P") == 0) /* Pointer -- ignored */ elt->pe_kind = TYPE_P; else if (strcmp(type, "E") == 0) /* EISA PNP ID, as uint32_t */ elt->pe_kind = TYPE_E; else if (strcmp(type, "T") == 0) elt->pe_kind = TYPE_T; else goto err; /* * Maybe the rounding here needs to be more nuanced and/or somehow * architecture specific. Fortunately, most tables in the system * have sane ordering of types. */ if (elt->pe_kind & TYPE_INT) { elt->pe_offset = roundup2(elt->pe_offset, elt->pe_kind & TYPE_SZ_MASK); off = elt->pe_offset + (elt->pe_kind & TYPE_SZ_MASK); } else if (elt->pe_kind == TYPE_E) { /* Type E stored as Int, displays as string */ elt->pe_offset = roundup2(elt->pe_offset, sizeof(uint32_t)); off = elt->pe_offset + sizeof(uint32_t); } else if (elt->pe_kind == TYPE_T) { /* doesn't actually consume space in the table */ off = elt->pe_offset; } else { elt->pe_offset = roundup2(elt->pe_offset, elf_pointer_size(ef)); off = elt->pe_offset + elf_pointer_size(ef); } if (elt->pe_kind & TYPE_PAIRED) { char *word, *ctx, newtype; for (word = strtok_r(key, "/", &ctx); word; word = strtok_r(NULL, "/", &ctx)) { newtype = elt->pe_kind & TYPE_FLAGGED ? 
'J' : 'I'; fprintf(fp, "%c:%s;", newtype, word); } } else { char newtype; if (elt->pe_kind & TYPE_FLAGGED) newtype = 'J'; else if (elt->pe_kind & TYPE_GE) newtype = 'G'; else if (elt->pe_kind & TYPE_LE) newtype = 'L'; else if (elt->pe_kind & TYPE_MASK) newtype = 'M'; else if (elt->pe_kind & TYPE_INT) newtype = 'I'; else if (elt->pe_kind == TYPE_D) newtype = 'D'; else if (elt->pe_kind == TYPE_Z || elt->pe_kind == TYPE_E) newtype = 'Z'; else if (elt->pe_kind == TYPE_T) newtype = 'T'; else errx(1, "Impossible type %x\n", elt->pe_kind); fprintf(fp, "%c:%s;", newtype, key); } } if (ferror(fp) != 0) { fclose(fp); errx(1, "Exhausted space converting description %s", desc); } if (fclose(fp) != 0) errx(1, "Failed to close memory stream"); return (0); err: errx(1, "Parse error of description string %s", desc); } static void free_pnp_list(char *new_desc, pnp_list *list) { struct pnp_elt *elt, *elt_tmp; TAILQ_FOREACH_SAFE(elt, list, next, elt_tmp) { TAILQ_REMOVE(list, elt, next); free(elt); } free(new_desc); } static uint16_t parse_16(const void *p) { if (byte_order == ELFDATA2LSB) return (le16dec(p)); else return (be16dec(p)); } static uint32_t parse_32(const void *p) { if (byte_order == ELFDATA2LSB) return (le32dec(p)); else return (be32dec(p)); } static void parse_pnp_entry(struct elf_file *ef, struct pnp_elt *elt, const char *walker) { uint8_t v1; uint16_t v2; uint32_t v4; int value; char buffer[1024]; if (elt->pe_kind == TYPE_W32) { v4 = parse_32(walker + elt->pe_offset); value = v4 & 0xffff; record_int(value); if (verbose > 1) printf("W32:%#x", value); value = (v4 >> 16) & 0xffff; record_int(value); if (verbose > 1) printf(":%#x;", value); } else if (elt->pe_kind & TYPE_INT) { switch (elt->pe_kind & TYPE_SZ_MASK) { case 1: memcpy(&v1, walker + elt->pe_offset, sizeof(v1)); if ((elt->pe_kind & TYPE_FLAGGED) && v1 == 0xff) value = -1; else value = v1; break; case 2: v2 = parse_16(walker + elt->pe_offset); if ((elt->pe_kind & TYPE_FLAGGED) && v2 == 0xffff) value = -1; else 
value = v2; break; case 4: v4 = parse_32(walker + elt->pe_offset); if ((elt->pe_kind & TYPE_FLAGGED) && v4 == 0xffffffff) value = -1; else value = v4; break; default: errx(1, "Invalid size somehow %#x", elt->pe_kind); } if (verbose > 1) printf("I:%#x;", value); record_int(value); } else if (elt->pe_kind == TYPE_T) { /* Do nothing */ } else { /* E, Z or D -- P already filtered */ if (elt->pe_kind == TYPE_E) { v4 = parse_32(walker + elt->pe_offset); strcpy(buffer, pnp_eisaformat(v4)); } else { GElf_Addr address; address = elf_address_from_pointer(ef, walker + elt->pe_offset); buffer[0] = '\0'; if (address != 0) { elf_read_string(ef, address, buffer, sizeof(buffer)); buffer[sizeof(buffer) - 1] = '\0'; } } if (verbose > 1) printf("%c:%s;", elt->pe_kind == TYPE_E ? 'E' : (elt->pe_kind == TYPE_Z ? 'Z' : 'D'), buffer); record_string(buffer); } } static void record_pnp_info(struct elf_file *ef, const char *cval, struct Gmod_pnp_match_info *pnp, const char *descr) { pnp_list list; struct pnp_elt *elt; char *new_descr, *walker; void *table; size_t len; int error, i; if (verbose > 1) printf(" pnp info for bus %s format %s %d entries of %d bytes\n", cval, descr, pnp->num_entry, pnp->entry_len); /* * Parse descr to weed out the chaff and to create a list * of offsets to output. */ parse_pnp_list(ef, descr, &new_descr, &list); record_int(MDT_PNP_INFO); record_string(cval); record_string(new_descr); record_int(pnp->num_entry); len = pnp->num_entry * pnp->entry_len; error = elf_read_relocated_data(ef, pnp->table, len, &table); if (error != 0) { free_pnp_list(new_descr, &list); return; } /* * Walk the list and output things. We've collapsed all the * variant forms of the table down to just ints and strings. 
*/ walker = table; for (i = 0; i < pnp->num_entry; i++) { TAILQ_FOREACH(elt, &list, next) { parse_pnp_entry(ef, elt, walker); } if (verbose > 1) printf("\n"); walker += pnp->entry_len; } /* Now free it */ free_pnp_list(new_descr, &list); free(table); } static int parse_entry(struct Gmod_metadata *md, const char *cval, struct elf_file *ef, const char *kldname) { struct Gmod_depend mdp; struct Gmod_version mdv; struct Gmod_pnp_match_info pnp; char descr[1024]; GElf_Addr data; int error; data = md->md_data; error = 0; record_start(); switch (md->md_type) { case MDT_DEPEND: if (!dflag) break; check(elf_read_mod_depend(ef, data, &mdp)); printf(" depends on %s.%d (%d,%d)\n", cval, mdp.md_ver_preferred, mdp.md_ver_minimum, mdp.md_ver_maximum); break; case MDT_VERSION: check(elf_read_mod_version(ef, data, &mdv)); if (dflag) { printf(" interface %s.%d\n", cval, mdv.mv_version); } else { record_int(MDT_VERSION); record_string(cval); record_int(mdv.mv_version); record_string(kldname); } break; case MDT_MODULE: if (dflag) { printf(" module %s\n", cval); } else { record_int(MDT_MODULE); record_string(cval); record_string(kldname); } break; case MDT_PNP_INFO: check(elf_read_mod_pnp_match_info(ef, data, &pnp)); check(elf_read_string(ef, pnp.descr, descr, sizeof(descr))); if (dflag) { printf(" pnp info for bus %s format %s %d entries of %d bytes\n", cval, descr, pnp.num_entry, pnp.entry_len); } else { record_pnp_info(ef, cval, &pnp, descr); } break; default: warnx("unknown metadata record %d in file %s", md->md_type, kldname); } if (!error) record_end(); return (error); } static int read_kld(char *filename, char *kldname) { struct Gmod_metadata md; struct elf_file ef; GElf_Addr *p; int error; long entries, i; char cval[MAXMODNAME + 1]; if (verbose || dflag) printf("%s\n", filename); error = elf_open_file(&ef, filename, verbose); if (error != 0) return (error); if (reccnt == 0) { ehdr = ef.ef_hdr; byte_order = elf_encoding(&ef); free(ehdr_filename); ehdr_filename = 
strdup(filename); } else if (!elf_compatible(&ef, &ehdr)) { warnx("%s does not match architecture of %s", filename, ehdr_filename); elf_close_file(&ef); return (EINVAL); } do { check(elf_read_linker_set(&ef, MDT_SETNAME, &p, &entries)); /* * Do a first pass to find MDT_MODULE. It is required to be * ordered first in the output linker.hints stream because it * serves as an implicit record boundary between distinct klds * in the stream. Other MDTs only make sense in the context of * a specific MDT_MODULE. * * Some compilers (e.g., GCC 6.4.0 xtoolchain) or binutils * (e.g., GNU binutils 2.32 objcopy/ld.bfd) can reorder * MODULE_METADATA set entries relative to the source ordering. * This is permitted by the C standard; memory layout of * file-scope objects is left implementation-defined. There is * no requirement that source code ordering is retained. * * Handle that here by taking two passes to ensure MDT_MODULE * records are emitted to linker.hints before other MDT records * in the same kld. */ for (i = 0; i < entries; i++) { check(elf_read_mod_metadata(&ef, p[i], &md)); check(elf_read_string(&ef, md.md_cval, cval, sizeof(cval))); if (md.md_type == MDT_MODULE) { parse_entry(&md, cval, &ef, kldname); break; } } if (error != 0) { free(p); warnc(error, "error while reading %s", filename); break; } /* * Second pass for all !MDT_MODULE entries. 
*/ for (i = 0; i < entries; i++) { check(elf_read_mod_metadata(&ef, p[i], &md)); check(elf_read_string(&ef, md.md_cval, cval, sizeof(cval))); if (md.md_type != MDT_MODULE) parse_entry(&md, cval, &ef, kldname); } if (error != 0) warnc(error, "error while reading %s", filename); free(p); } while(0); elf_close_file(&ef); return (error); } /* * Create a temp file in directory root, make sure we don't * overflow the buffer for the destination name */ static FILE * maketempfile(char *dest, const char *root) { int fd; if (snprintf(dest, MAXPATHLEN, "%s/lhint.XXXXXX", root) >= MAXPATHLEN) { errno = ENAMETOOLONG; return (NULL); } fd = mkstemp(dest); if (fd < 0) return (NULL); fchmod(fd, 0644); /* nothing secret in the file */ return (fdopen(fd, "w+")); } static char xrefname[MAXPATHLEN], tempname[MAXPATHLEN]; static void usage(void) { fprintf(stderr, "%s\n", "usage: kldxref [-Rdv] [-f hintsfile] path ..." ); exit(1); } static int #if defined(__GLIBC__) || defined(__APPLE__) compare(const FTSENT **a, const FTSENT **b) #else compare(const FTSENT *const *a, const FTSENT *const *b) #endif { if ((*a)->fts_info == FTS_D && (*b)->fts_info != FTS_D) return (1); if ((*a)->fts_info != FTS_D && (*b)->fts_info == FTS_D) return (-1); return (strcmp((*a)->fts_name, (*b)->fts_name)); } int main(int argc, char *argv[]) { FTS *ftsp; FTSENT *p; char *dot = NULL; int opt, fts_options; struct stat sb; fts_options = FTS_PHYSICAL; while ((opt = getopt(argc, argv, "Rdf:v")) != -1) { switch (opt) { case 'd': /* no hint file, only print on stdout */ dflag = true; break; case 'f': /* use this name instead of linker.hints */ xref_file = optarg; break; case 'v': verbose++; break; case 'R': /* recurse on directories */ fts_options |= FTS_COMFOLLOW; break; default: usage(); /* NOTREACHED */ } } if (argc - optind < 1) usage(); argc -= optind; argv += optind; if (stat(argv[0], &sb) != 0) err(1, "%s", argv[0]); if ((sb.st_mode & S_IFDIR) == 0 && !dflag) { errno = ENOTDIR; err(1, "%s", argv[0]); } if 
(elf_version(EV_CURRENT) == EV_NONE) errx(1, "unsupported libelf"); ftsp = fts_open(argv, fts_options, compare); if (ftsp == NULL) exit(1); for (;;) { p = fts_read(ftsp); if ((p == NULL || p->fts_info == FTS_D) && fxref) { /* close and rename the current hint file */ fclose(fxref); fxref = NULL; if (reccnt != 0) { rename(tempname, xrefname); } else { /* didn't find any entry, ignore this file */ unlink(tempname); unlink(xrefname); } } if (p == NULL) break; if (p->fts_info == FTS_D && !dflag) { /* visiting a new directory, create a new hint file */ snprintf(xrefname, sizeof(xrefname), "%s/%s", ftsp->fts_path, xref_file); fxref = maketempfile(tempname, ftsp->fts_path); if (fxref == NULL) err(1, "can't create %s", tempname); byte_order = ELFDATANONE; reccnt = 0; } /* skip non-files.. */ if (p->fts_info != FTS_F) continue; /* * Skip files that generate errors like .debug, .symbol and .pkgsave * by generally skipping all files not ending with ".ko" or that have * no dots in the name (like kernel). */ dot = strrchr(p->fts_name, '.'); if (dot != NULL && strcmp(dot, ".ko") != 0) continue; read_kld(p->fts_path, p->fts_name); } fts_close(ftsp); return (0); } diff --git a/usr.sbin/newsyslog/newsyslog.c b/usr.sbin/newsyslog/newsyslog.c index 9f9185c1ad20..e9b84bae342d 100644 --- a/usr.sbin/newsyslog/newsyslog.c +++ b/usr.sbin/newsyslog/newsyslog.c @@ -1,2928 +1,2928 @@ /*- * ------+---------+---------+-------- + --------+---------+---------+---------* * This file includes significant modifications done by: * Copyright (c) 2003, 2004 - Garance Alistair Drosehn . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * ------+---------+---------+-------- + --------+---------+---------+---------* */ /* * This file contains changes from the Open Software Foundation. */ /* * Copyright 1988, 1989 by the Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and its * documentation for any purpose and without fee is hereby granted, provided * that the above copyright notice appear in all copies and that both that * copyright notice and this permission notice appear in supporting * documentation, and that the names of M.I.T. and the M.I.T. S.I.P.B. not be * used in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. and the M.I.T. * S.I.P.B. make no representations about the suitability of this software * for any purpose. It is provided "as is" without express or implied * warranty. * */ /* * newsyslog - roll over selected logs at the appropriate time, keeping the a * specified number of backup files around. 
*/ #include #define OSF #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pathnames.h" #include "extern.h" /* * Compression types */ enum compress_types_enum { COMPRESS_NONE = 0, COMPRESS_GZIP = 1, COMPRESS_BZIP2 = 2, COMPRESS_XZ = 3, COMPRESS_ZSTD = 4, COMPRESS_LEGACY = 5, /* Special: use legacy type */ COMPRESS_TYPES = COMPRESS_LEGACY /* Number of supported compression types */ }; /* * Bit-values for the 'flags' parsed from a config-file entry. */ #define CE_BINARY 0x0008 /* Logfile is in binary, do not add status */ /* messages to logfile(s) when rotating. */ #define CE_NOSIGNAL 0x0010 /* There is no process to signal when */ /* trimming this file. */ #define CE_TRIMAT 0x0020 /* trim file at a specific time. */ #define CE_GLOB 0x0040 /* name of the log is file name pattern. */ #define CE_SIGNALGROUP 0x0080 /* Signal a process-group instead of a single */ /* process when trimming this file. */ #define CE_CREATE 0x0100 /* Create the log file if it does not exist. */ #define CE_NODUMP 0x0200 /* Set 'nodump' on newly created log file. 
*/ #define CE_PID2CMD 0x0400 /* Replace PID file with a shell command.*/ #define CE_PLAIN0 0x0800 /* Do not compress zero'th history file */ #define CE_RFC5424 0x1000 /* Use RFC5424 format rotation message */ #define CE_NOEMPTY 0x2000 /* Do not rotate the file when its size */ /* is zero */ #define MIN_PID 5 /* Don't touch pids lower than this */ #define MAX_PID 99999 /* was lower, see /usr/include/sys/proc.h */ #define kbytes(size) (((size) + 1023) >> 10) #define DEFAULT_MARKER "" #define DEBUG_MARKER "" #define INCLUDE_MARKER "" #define COMPRESS_MARKER "" #define DEFAULT_TIMEFNAME_FMT "%Y%m%dT%H%M%S" #define MAX_OLDLOGS 65536 /* Default maximum number of old logfiles */ struct compress_types { const char *name; /* Name of compression type */ const char *flag; /* Flag in configuration file */ const char *suffix; /* Compression suffix */ const char *path; /* Path to compression program */ const char **flags; /* Compression program flags */ int nflags; /* Program flags count */ }; static const char *gzip_flags[] = { "-f" }; #define bzip2_flags gzip_flags #define xz_flags gzip_flags static const char *zstd_flags[] = { "-q", "-T0", "--adapt", "--long", "--rm" }; static struct compress_types compress_type[COMPRESS_TYPES] = { [COMPRESS_NONE] = { .name = "none", .flag = "", .suffix = "", .path = "", .flags = NULL, .nflags = 0 }, [COMPRESS_GZIP] = { .name = "gzip", .flag = "Z", .suffix = ".gz", .path = _PATH_GZIP, .flags = gzip_flags, .nflags = nitems(gzip_flags) }, [COMPRESS_BZIP2] = { .name = "bzip2", .flag = "J", .suffix = ".bz2", .path = _PATH_BZIP2, .flags = bzip2_flags, .nflags = nitems(bzip2_flags) }, [COMPRESS_XZ] = { .name = "xz", .flag = "X", .suffix = ".xz", .path = _PATH_XZ, .flags = xz_flags, .nflags = nitems(xz_flags) }, [COMPRESS_ZSTD] = { .name = "zstd", .flag = "Y", .suffix = ".zst", .path = _PATH_ZSTD, .flags = zstd_flags, .nflags = nitems(zstd_flags) }, }; struct conf_entry { STAILQ_ENTRY(conf_entry) cf_nextp; char *log; /* Name of the log */ char 
*pid_cmd_file; /* PID or command file */ char *r_reason; /* The reason this file is being rotated */ int firstcreate; /* Creating log for the first time (-C). */ int rotate; /* Non-zero if this file should be rotated */ int fsize; /* size found for the log file */ uid_t uid; /* Owner of log */ gid_t gid; /* Group of log */ int numlogs; /* Number of logs to keep */ int trsize; /* Size cutoff to trigger trimming the log */ int hours; /* Hours between log trimming */ struct ptime_data *trim_at; /* Specific time to do trimming */ unsigned int permissions; /* File permissions on the log */ int flags; /* CE_BINARY */ int compress; /* Compression */ int sig; /* Signal to send */ int def_cfg; /* Using the rule for this file */ }; struct sigwork_entry { SLIST_ENTRY(sigwork_entry) sw_nextp; int sw_signum; /* the signal to send */ int sw_pidok; /* true if pid value is valid */ pid_t sw_pid; /* the process id from the PID file */ const char *sw_pidtype; /* "daemon" or "process group" */ int sw_runcmd; /* run command or send PID to signal */ char sw_fname[1]; /* file the PID was read from or shell cmd */ }; struct zipwork_entry { SLIST_ENTRY(zipwork_entry) zw_nextp; const struct conf_entry *zw_conf; /* for chown/perm/flag info */ const struct sigwork_entry *zw_swork; /* to know success of signal */ int zw_fsize; /* size of the file to compress */ char zw_fname[1]; /* the file to compress */ }; struct include_entry { STAILQ_ENTRY(include_entry) inc_nextp; const char *file; /* Name of file to process */ }; struct oldlog_entry { char *fname; /* Filename of the log file */ time_t t; /* Parsed timestamp of the logfile */ }; typedef enum { FREE_ENT, KEEP_ENT } fk_entry; STAILQ_HEAD(cflist, conf_entry); static SLIST_HEAD(swlisthead, sigwork_entry) swhead = SLIST_HEAD_INITIALIZER(swhead); static SLIST_HEAD(zwlisthead, zipwork_entry) zwhead = SLIST_HEAD_INITIALIZER(zwhead); STAILQ_HEAD(ilist, include_entry); int dbg_at_times; /* -D Show details of 'trim_at' code */ static int archtodir 
= 0; /* Archive old logfiles to other directory */ static int createlogs; /* Create (non-GLOB) logfiles which do not */ /* already exist. 1=='for entries with */ /* C flag', 2=='for all entries'. */ int verbose = 0; /* Print out what's going on */ static int needroot = 1; /* Root privs are necessary */ int noaction = 0; /* Don't do anything, just show it */ static int norotate = 0; /* Don't rotate */ static int nosignal; /* Do not send any signals */ static int enforcepid = 0; /* If PID file does not exist or empty, do nothing */ static int force = 0; /* Force the trim no matter what */ static int rotatereq = 0; /* -R = Always rotate the file(s) as given */ /* on the command (this also requires */ /* that a list of files *are* given on */ /* the run command). */ static char *requestor; /* The name given on a -R request */ static char *timefnamefmt = NULL;/* Use time based filenames instead of .0 */ static char *archdirname; /* Directory path to old logfiles archive */ static char *destdir = NULL; /* Directory to treat at root for logs */ static const char *conf; /* Configuration file to use */ static enum compress_types_enum compress_type_override = COMPRESS_LEGACY; /* Compression type */ static bool compress_type_set = false; static bool compress_type_seen = false; struct ptime_data *dbg_timenow; /* A "timenow" value set via -D option */ static struct ptime_data *timenow; /* The time to use for checking at-fields */ #define DAYTIME_LEN 16 static char daytime[DAYTIME_LEN];/* The current time in human readable form, * used for rotation-tracking messages. */ /* Another buffer to hold the current time in RFC5424 format. Fractional * seconds are allowed by the RFC, but are not included in the * rotation-tracking messages written by newsyslog and so are not accounted for * in the length below. 
*/ #define DAYTIME_RFC5424_LEN sizeof("YYYY-MM-DDTHH:MM:SS+00:00") static char daytime_rfc5424[DAYTIME_RFC5424_LEN]; static char hostname[MAXHOSTNAMELEN]; /* hostname */ static size_t hostname_shortlen; static const char *path_syslogpid = _PATH_SYSLOGPID; static struct cflist *get_worklist(char **files); static void parse_file(FILE *cf, struct cflist *work_p, struct cflist *glob_p, struct conf_entry **defconf, struct ilist *inclist); static void add_to_queue(const char *fname, struct ilist *inclist); static char *sob(char *p); static char *son(char *p); static int isnumberstr(const char *); static int isglobstr(const char *); static char *missing_field(char *p, char *errline); static void change_attrs(const char *, const struct conf_entry *); static const char *get_logfile_suffix(const char *logfile); static fk_entry do_entry(struct conf_entry *); static fk_entry do_rotate(const struct conf_entry *); static void do_sigwork(struct sigwork_entry *); static void do_zipwork(struct zipwork_entry *); static struct sigwork_entry * save_sigwork(const struct conf_entry *); static struct zipwork_entry * save_zipwork(const struct conf_entry *, const struct sigwork_entry *, int, const char *); static void set_swpid(struct sigwork_entry *, const struct conf_entry *); static int sizefile(const char *); static void expand_globs(struct cflist *work_p, struct cflist *glob_p); static void free_clist(struct cflist *list); static void free_entry(struct conf_entry *ent); static struct conf_entry *init_entry(const char *fname, struct conf_entry *src_entry); static void parse_args(int argc, char **argv); static int parse_doption(const char *doption); static void usage(void) __dead2; static int log_trim(const char *logname, const struct conf_entry *log_ent); static int age_old_log(const char *file); static void savelog(char *from, char *to); static void createdir(const struct conf_entry *ent, char *dirpart); static void createlog(const struct conf_entry *ent); static int 
parse_signal(const char *str); /* * All the following take a parameter of 'int', but expect values in the * range of unsigned char. Define wrappers which take values of type 'char', * whether signed or unsigned, and ensure they end up in the right range. */ #define isdigitch(Anychar) isdigit((u_char)(Anychar)) #define isprintch(Anychar) isprint((u_char)(Anychar)) #define isspacech(Anychar) isspace((u_char)(Anychar)) #define tolowerch(Anychar) tolower((u_char)(Anychar)) int main(int argc, char **argv) { struct cflist *worklist; struct conf_entry *p; struct sigwork_entry *stmp; struct zipwork_entry *ztmp; SLIST_INIT(&swhead); SLIST_INIT(&zwhead); parse_args(argc, argv); argc -= optind; argv += optind; if (needroot && getuid() && geteuid()) errx(1, "must have root privs"); worklist = get_worklist(argv); /* * Rotate all the files which need to be rotated. Note that * some users have *hundreds* of entries in newsyslog.conf! */ while (!STAILQ_EMPTY(worklist)) { p = STAILQ_FIRST(worklist); STAILQ_REMOVE_HEAD(worklist, cf_nextp); if (do_entry(p) == FREE_ENT) free_entry(p); } /* * Send signals to any processes which need a signal to tell * them to close and re-open the log file(s) we have rotated. * Note that zipwork_entries include pointers to these * sigwork_entry's, so we can not free the entries here. */ if (!SLIST_EMPTY(&swhead)) { if (noaction || verbose) printf("Signal all daemon process(es)...\n"); SLIST_FOREACH(stmp, &swhead, sw_nextp) do_sigwork(stmp); if (!(rotatereq && nosignal)) { if (noaction) printf("\tsleep 10\n"); else { if (verbose) printf("Pause 10 seconds to allow " "daemon(s) to close log file(s)\n"); sleep(10); } } } /* * Compress all files that we're expected to compress, now * that all processes should have closed the files which * have been rotated. 
*/ if (!SLIST_EMPTY(&zwhead)) { if (noaction || verbose) printf("Compress all rotated log file(s)...\n"); while (!SLIST_EMPTY(&zwhead)) { ztmp = SLIST_FIRST(&zwhead); do_zipwork(ztmp); SLIST_REMOVE_HEAD(&zwhead, zw_nextp); free(ztmp); } } /* Now free all the sigwork entries. */ while (!SLIST_EMPTY(&swhead)) { stmp = SLIST_FIRST(&swhead); SLIST_REMOVE_HEAD(&swhead, sw_nextp); free(stmp); } while (wait(NULL) > 0 || errno == EINTR) ; return (0); } static struct conf_entry * init_entry(const char *fname, struct conf_entry *src_entry) { struct conf_entry *tempwork; if (verbose > 4) printf("\t--> [creating entry for %s]\n", fname); tempwork = malloc(sizeof(struct conf_entry)); if (tempwork == NULL) err(1, "malloc of conf_entry for %s", fname); if (destdir == NULL || fname[0] != '/') tempwork->log = strdup(fname); else asprintf(&tempwork->log, "%s%s", destdir, fname); if (tempwork->log == NULL) err(1, "strdup for %s", fname); if (src_entry != NULL) { tempwork->pid_cmd_file = NULL; if (src_entry->pid_cmd_file) tempwork->pid_cmd_file = strdup(src_entry->pid_cmd_file); tempwork->r_reason = NULL; tempwork->firstcreate = 0; tempwork->rotate = 0; tempwork->fsize = -1; tempwork->uid = src_entry->uid; tempwork->gid = src_entry->gid; tempwork->numlogs = src_entry->numlogs; tempwork->trsize = src_entry->trsize; tempwork->hours = src_entry->hours; tempwork->trim_at = NULL; if (src_entry->trim_at != NULL) tempwork->trim_at = ptime_init(src_entry->trim_at); tempwork->permissions = src_entry->permissions; tempwork->flags = src_entry->flags; tempwork->compress = src_entry->compress; tempwork->sig = src_entry->sig; tempwork->def_cfg = src_entry->def_cfg; } else { /* Initialize as a "do-nothing" entry */ tempwork->pid_cmd_file = NULL; tempwork->r_reason = NULL; tempwork->firstcreate = 0; tempwork->rotate = 0; tempwork->fsize = -1; tempwork->uid = (uid_t)-1; tempwork->gid = (gid_t)-1; tempwork->numlogs = 1; tempwork->trsize = -1; tempwork->hours = -1; tempwork->trim_at = NULL; 
tempwork->permissions = 0; tempwork->flags = 0; tempwork->compress = COMPRESS_NONE; tempwork->sig = SIGHUP; tempwork->def_cfg = 0; } return (tempwork); } static void free_entry(struct conf_entry *ent) { if (ent == NULL) return; if (ent->log != NULL) { if (verbose > 4) printf("\t--> [freeing entry for %s]\n", ent->log); free(ent->log); ent->log = NULL; } if (ent->pid_cmd_file != NULL) { free(ent->pid_cmd_file); ent->pid_cmd_file = NULL; } if (ent->r_reason != NULL) { free(ent->r_reason); ent->r_reason = NULL; } if (ent->trim_at != NULL) { ptime_free(ent->trim_at); ent->trim_at = NULL; } free(ent); } static void free_clist(struct cflist *list) { struct conf_entry *ent; while (!STAILQ_EMPTY(list)) { ent = STAILQ_FIRST(list); STAILQ_REMOVE_HEAD(list, cf_nextp); free_entry(ent); } free(list); list = NULL; } static bool parse_compression_type(const char *str, enum compress_types_enum *type) { int i; for (i = 0; i < COMPRESS_TYPES; i++) { if (strcasecmp(str, compress_type[i].name) == 0) { *type = i; break; } } if (i == COMPRESS_TYPES) { if (strcasecmp(str, "legacy") == 0) compress_type_override = COMPRESS_LEGACY; else { return (false); } } return (true); } static const char * compression_type_name(enum compress_types_enum type) { if (type == COMPRESS_LEGACY) return ("legacy"); else return (compress_type[type].name); } static fk_entry do_entry(struct conf_entry * ent) { #define REASON_MAX 80 int modtime; fk_entry free_or_keep; double diffsecs; char temp_reason[REASON_MAX]; int oversized; free_or_keep = FREE_ENT; if (verbose) printf("%s <%d%s>: ", ent->log, ent->numlogs, compress_type[ent->compress].flag); ent->fsize = sizefile(ent->log); oversized = ((ent->trsize > 0) && (ent->fsize >= ent->trsize)); modtime = age_old_log(ent->log); ent->rotate = 0; ent->firstcreate = 0; if (ent->fsize < 0) { /* * If either the C flag or the -C option was specified, * and if we won't be creating the file, then have the * verbose message include a hint as to why the file * will not be 
created. */ temp_reason[0] = '\0'; if (createlogs > 1) ent->firstcreate = 1; else if ((ent->flags & CE_CREATE) && createlogs) ent->firstcreate = 1; else if (ent->flags & CE_CREATE) strlcpy(temp_reason, " (no -C option)", REASON_MAX); else if (createlogs) strlcpy(temp_reason, " (no C flag)", REASON_MAX); if (ent->firstcreate) { if (verbose) printf("does not exist -> will create.\n"); createlog(ent); } else if (verbose) { printf("does not exist, skipped%s.\n", temp_reason); } } else { if (ent->flags & CE_NOEMPTY && ent->fsize == 0) { if (verbose) printf("--> Not rotating empty file\n"); return (free_or_keep); } if (ent->flags & CE_TRIMAT && !force && !rotatereq && !oversized) { diffsecs = ptimeget_diff(timenow, ent->trim_at); if (diffsecs < 0.0) { /* trim_at is some time in the future. */ if (verbose) { ptime_adjust4dst(ent->trim_at, timenow); printf("--> will trim at %s", ptimeget_ctime(ent->trim_at)); } return (free_or_keep); } else if (diffsecs >= 3600.0) { /* * trim_at is more than an hour in the past, * so find the next valid trim_at time, and * tell the user what that will be. */ if (verbose && dbg_at_times) printf("\n\t--> prev trim at %s\t", ptimeget_ctime(ent->trim_at)); if (verbose) { ptimeset_nxtime(ent->trim_at); printf("--> will trim at %s", ptimeget_ctime(ent->trim_at)); } return (free_or_keep); } else if (verbose && noaction && dbg_at_times) { /* * If we are just debugging at-times, then * a detailed message is helpful. Also * skip "doing" any commands, since they * would all be turned off by no-action. */ printf("\n\t--> timematch at %s", ptimeget_ctime(ent->trim_at)); return (free_or_keep); } else if (verbose && ent->hours <= 0) { printf("--> time is up\n"); } } if (verbose && (ent->trsize > 0)) printf("size (Kb): %d [%d] ", ent->fsize, ent->trsize); if (verbose && (ent->hours > 0)) printf(" age (hr): %d [%d] ", modtime, ent->hours); /* * Figure out if this logfile needs to be rotated. 
*/ temp_reason[0] = '\0'; if (rotatereq) { ent->rotate = 1; snprintf(temp_reason, REASON_MAX, " due to -R from %s", requestor); } else if (force) { ent->rotate = 1; snprintf(temp_reason, REASON_MAX, " due to -F request"); } else if (oversized) { ent->rotate = 1; snprintf(temp_reason, REASON_MAX, " due to size>%dK", ent->trsize); } else if (ent->hours <= 0 && (ent->flags & CE_TRIMAT)) { ent->rotate = 1; } else if ((ent->hours > 0) && ((modtime >= ent->hours) || (modtime < 0))) { ent->rotate = 1; } /* * If the file needs to be rotated, then rotate it. */ if (ent->rotate && !norotate) { if (temp_reason[0] != '\0') ent->r_reason = strdup(temp_reason); if (verbose) { if (ent->compress == COMPRESS_NONE) printf("--> trimming log....\n"); else printf("--> trimming log and compressing with %s....\n", compression_type_name(ent->compress)); } if (noaction && !verbose) printf("%s <%d%s>: trimming\n", ent->log, ent->numlogs, compress_type[ent->compress].flag); free_or_keep = do_rotate(ent); } else { if (verbose) printf("--> skipping\n"); } } return (free_or_keep); #undef REASON_MAX } static void parse_args(int argc, char **argv) { int ch; char *p; timenow = ptime_init(NULL); ptimeset_time(timenow, time(NULL)); strlcpy(daytime, ptimeget_ctime(timenow) + 4, DAYTIME_LEN); ptimeget_ctime_rfc5424(timenow, daytime_rfc5424, DAYTIME_RFC5424_LEN); /* Let's get our hostname */ (void)gethostname(hostname, sizeof(hostname)); hostname_shortlen = strcspn(hostname, "."); /* Parse command line options. 
*/ while ((ch = getopt(argc, argv, "a:c:d:f:nrst:vCD:FNPR:S:")) != -1) switch (ch) { case 'a': archtodir++; archdirname = optarg; break; case 'c': if (!parse_compression_type(optarg, &compress_type_override)) { warnx("Unrecognized compression method '%s'.", optarg); usage(); } compress_type_set = true; break; case 'd': destdir = optarg; break; case 'f': conf = optarg; break; case 'n': noaction++; /* FALLTHROUGH */ case 'r': needroot = 0; break; case 's': nosignal = 1; break; case 't': if (optarg[0] == '\0' || strcmp(optarg, "DEFAULT") == 0) timefnamefmt = strdup(DEFAULT_TIMEFNAME_FMT); else timefnamefmt = strdup(optarg); break; case 'v': verbose++; break; case 'C': /* Useful for things like rc.diskless... */ createlogs++; break; case 'D': /* * Set some debugging option. The specific option * depends on the value of optarg. These options * may come and go without notice or documentation. */ if (parse_doption(optarg)) break; usage(); /* NOTREACHED */ case 'F': force++; break; case 'N': norotate++; break; case 'P': enforcepid++; break; case 'R': rotatereq++; requestor = strdup(optarg); break; case 'S': path_syslogpid = optarg; break; case 'm': /* Used by OpenBSD for "monitor mode" */ default: usage(); /* NOTREACHED */ } if (force && norotate) { warnx("Only one of -F and -N may be specified."); usage(); /* NOTREACHED */ } if (rotatereq) { if (optind == argc) { warnx("At least one filename must be given when -R is specified."); usage(); /* NOTREACHED */ } /* Make sure "requestor" value is safe for a syslog message. */ for (p = requestor; *p != '\0'; p++) { if (!isprintch(*p) && (*p != '\t')) *p = '.'; } } if (dbg_timenow) { /* * Note that the 'daytime' variable is not changed. * That is only used in messages that track when a * logfile is rotated, and if a file *is* rotated, * then it will still rotated at the "real now" time. 
*/ ptime_free(timenow); timenow = dbg_timenow; fprintf(stderr, "Debug: Running as if TimeNow is %s", ptimeget_ctime(dbg_timenow)); } } /* * These debugging options are mainly meant for developer use, such * as writing regression-tests. They would not be needed by users * during normal operation of newsyslog... */ static int parse_doption(const char *doption) { const char TN[] = "TN="; int res; if (strncmp(doption, TN, sizeof(TN) - 1) == 0) { /* * The "TimeNow" debugging option. This might be off * by an hour when crossing a timezone change. */ dbg_timenow = ptime_init(NULL); res = ptime_relparse(dbg_timenow, PTM_PARSE_ISO8601, time(NULL), doption + sizeof(TN) - 1); if (res == -2) { warnx("Non-existent time specified on -D %s", doption); return (0); /* failure */ } else if (res < 0) { warnx("Malformed time given on -D %s", doption); return (0); /* failure */ } return (1); /* successfully parsed */ } if (strcmp(doption, "ats") == 0) { dbg_at_times++; return (1); /* successfully parsed */ } /* XXX - This check could probably be dropped. */ if ((strcmp(doption, "neworder") == 0) || (strcmp(doption, "oldorder") == 0)) { warnx("NOTE: newsyslog always uses 'neworder'."); return (1); /* successfully parsed */ } warnx("Unknown -D (debug) option: '%s'", doption); return (0); /* failure */ } static void usage(void) { int i; char *alltypes = NULL, *tmp = NULL; for (i = 0; i < COMPRESS_TYPES; i++) { if (i == COMPRESS_NONE) { (void)asprintf(&tmp, "%s|legacy", compress_type[i].name); } else { (void)asprintf(&tmp, "%s|%s", alltypes, compress_type[i].name); } if (alltypes) free(alltypes); alltypes = tmp; tmp = NULL; } fprintf(stderr, "usage: newsyslog [-CFNPnrsv] [-a directory] [-c %s]\n" " [-d directory] [-f config_file]\n" " [-S pidfile] [-t timefmt] [[-R tagname] file ...]\n", alltypes); exit(1); } /* * Parse a configuration file and return a linked list of all the logs * which should be processed. 
*/ static struct cflist * get_worklist(char **files) { FILE *f; char **given; struct cflist *cmdlist, *filelist, *globlist; struct conf_entry *defconf, *dupent, *ent; struct ilist inclist; struct include_entry *inc; int gmatch, fnres; defconf = NULL; STAILQ_INIT(&inclist); filelist = malloc(sizeof(struct cflist)); if (filelist == NULL) err(1, "malloc of filelist"); STAILQ_INIT(filelist); globlist = malloc(sizeof(struct cflist)); if (globlist == NULL) err(1, "malloc of globlist"); STAILQ_INIT(globlist); inc = malloc(sizeof(struct include_entry)); if (inc == NULL) err(1, "malloc of inc"); inc->file = conf; if (inc->file == NULL) inc->file = _PATH_CONF; STAILQ_INSERT_TAIL(&inclist, inc, inc_nextp); STAILQ_FOREACH(inc, &inclist, inc_nextp) { if (strcmp(inc->file, "-") != 0) f = fopen(inc->file, "r"); else { f = stdin; inc->file = ""; } if (!f) err(1, "%s", inc->file); if (verbose) printf("Processing %s\n", inc->file); parse_file(f, filelist, globlist, &defconf, &inclist); (void) fclose(f); } /* * All config-file information has been read in and turned into * a filelist and a globlist. If there were no specific files * given on the run command, then the only thing left to do is to * call a routine which finds all files matched by the globlist * and adds them to the filelist. Then return the worklist. */ if (*files == NULL) { expand_globs(filelist, globlist); free_clist(globlist); if (defconf != NULL) free_entry(defconf); return (filelist); } /* * If newsyslog was given a specific list of files to process, * it may be that some of those files were not listed in any * config file. Those unlisted files should get the default * rotation action. First, create the default-rotation action * if none was found in a system config file. 
*/ if (defconf == NULL) { defconf = init_entry(DEFAULT_MARKER, NULL); defconf->numlogs = 3; defconf->trsize = 50; defconf->permissions = S_IRUSR|S_IWUSR; } /* * If newsyslog was run with a list of specific filenames, * then create a new worklist which has only those files in * it, picking up the rotation-rules for those files from * the original filelist. * * XXX - Note that this will copy multiple rules for a single * logfile, if multiple entries are an exact match for * that file. That matches the historic behavior, but do * we want to continue to allow it? If so, it should * probably be handled more intelligently. */ cmdlist = malloc(sizeof(struct cflist)); if (cmdlist == NULL) err(1, "malloc of cmdlist"); STAILQ_INIT(cmdlist); for (given = files; *given; ++given) { /* * First try to find exact-matches for this given file. */ gmatch = 0; STAILQ_FOREACH(ent, filelist, cf_nextp) { if (strcmp(ent->log, *given) == 0) { gmatch++; dupent = init_entry(*given, ent); STAILQ_INSERT_TAIL(cmdlist, dupent, cf_nextp); } } if (gmatch) { if (verbose > 2) printf("\t+ Matched entry %s\n", *given); continue; } /* * There was no exact-match for this given file, so look * for a "glob" entry which does match. */ gmatch = 0; if (verbose > 2) printf("\t+ Checking globs for %s\n", *given); STAILQ_FOREACH(ent, globlist, cf_nextp) { fnres = fnmatch(ent->log, *given, FNM_PATHNAME); if (verbose > 2) printf("\t+ = %d for pattern %s\n", fnres, ent->log); if (fnres == 0) { gmatch++; dupent = init_entry(*given, ent); /* This new entry is not a glob! */ dupent->flags &= ~CE_GLOB; STAILQ_INSERT_TAIL(cmdlist, dupent, cf_nextp); /* Only allow a match to one glob-entry */ break; } } if (gmatch) { if (verbose > 2) printf("\t+ Matched %s via %s\n", *given, ent->log); continue; } /* * This given file was not found in any config file, so * add a worklist item based on the default entry. 
*/ if (verbose > 2) printf("\t+ No entry matched %s (will use %s)\n", *given, DEFAULT_MARKER); dupent = init_entry(*given, defconf); /* Mark that it was *not* found in a config file */ dupent->def_cfg = 1; STAILQ_INSERT_TAIL(cmdlist, dupent, cf_nextp); } /* * Free all the entries in the original work list, the list of * glob entries, and the default entry. */ free_clist(filelist); free_clist(globlist); free_entry(defconf); /* And finally, return a worklist which matches the given files. */ return (cmdlist); } /* * Expand the list of entries with filename patterns, and add all files * which match those glob-entries onto the worklist. */ static void expand_globs(struct cflist *work_p, struct cflist *glob_p) { int gmatch, gres; size_t i; char *mfname; struct conf_entry *dupent, *ent, *globent; glob_t pglob; struct stat st_fm; /* * The worklist contains all fully-specified (non-GLOB) names. * * Now expand the list of filename-pattern (GLOB) entries into * a second list, which (by definition) will only match files * that already exist. Do not add a glob-related entry for any * file which already exists in the fully-specified list. */ STAILQ_FOREACH(globent, glob_p, cf_nextp) { gres = glob(globent->log, GLOB_NOCHECK, NULL, &pglob); if (gres != 0) { warn("cannot expand pattern (%d): %s", gres, globent->log); continue; } if (verbose > 2) printf("\t+ Expanding pattern %s\n", globent->log); for (i = 0; i < pglob.gl_matchc; i++) { mfname = pglob.gl_pathv[i]; /* See if this file already has a specific entry. */ gmatch = 0; STAILQ_FOREACH(ent, work_p, cf_nextp) { if (strcmp(mfname, ent->log) == 0) { gmatch++; break; } } if (gmatch) continue; /* Make sure the named matched is a file. */ gres = lstat(mfname, &st_fm); if (gres != 0) { /* Error on a file that glob() matched?!? */ warn("Skipping %s - lstat() error", mfname); continue; } if (!S_ISREG(st_fm.st_mode)) { /* We only rotate files! */ if (verbose > 2) printf("\t+ . 
skipping %s (!file)\n", mfname); continue; } if (verbose > 2) printf("\t+ . add file %s\n", mfname); dupent = init_entry(mfname, globent); /* This new entry is not a glob! */ dupent->flags &= ~CE_GLOB; /* Add to the worklist. */ STAILQ_INSERT_TAIL(work_p, dupent, cf_nextp); } globfree(&pglob); if (verbose > 2) printf("\t+ Done with pattern %s\n", globent->log); } } /* * Parse a configuration file and update a linked list of all the logs to * process. */ static void parse_file(FILE *cf, struct cflist *work_p, struct cflist *glob_p, struct conf_entry **defconf_p, struct ilist *inclist) { char line[BUFSIZ], *parse, *q; char *cp, *errline, *group; struct conf_entry *working; struct passwd *pwd; struct group *grp; glob_t pglob; int eol, ptm_opts, res, special; size_t i; errline = NULL; while (fgets(line, BUFSIZ, cf)) { if ((line[0] == '\n') || (line[0] == '#') || (strlen(line) == 0)) continue; if (errline != NULL) free(errline); errline = strdup(line); for (cp = line + 1; *cp != '\0'; cp++) { if (*cp != '#') continue; if (*(cp - 1) == '\\') { strcpy(cp - 1, cp); cp--; continue; } *cp = '\0'; break; } q = parse = missing_field(sob(line), errline); parse = son(line); if (!*parse) { warnx("malformed line (missing fields):\n%s", errline); continue; } *parse = '\0'; /* * Allow people to set debug options via the config file. * (NOTE: debug options are undocumented, and may disappear * at any time, etc). 
		 */
		if (strcasecmp(DEBUG_MARKER, q) == 0) {
			/* "<debug>" marker: set undocumented debug options. */
			q = parse = missing_field(sob(parse + 1), errline);
			parse = son(parse);
			if (!*parse)
				warnx("debug line specifies no option:\n%s",
				    errline);
			else {
				*parse = '\0';
				parse_doption(q);
			}
			continue;
		} else if (strcasecmp(INCLUDE_MARKER, q) == 0) {
			/* "<include>" marker: queue further config files. */
			if (verbose)
				printf("Found: %s", errline);
			q = parse = missing_field(sob(parse + 1), errline);
			parse = son(parse);
			if (!*parse) {
				warnx("include line missing argument:\n%s",
				    errline);
				continue;
			}
			*parse = '\0';
			if (isglobstr(q)) {
				/* Include argument is a pattern: expand it. */
				res = glob(q, GLOB_NOCHECK, NULL, &pglob);
				if (res != 0) {
					warn("cannot expand pattern (%d): %s",
					    res, q);
					continue;
				}
				if (verbose > 2)
					printf("\t+ Expanding pattern %s\n",
					    q);
				for (i = 0; i < pglob.gl_matchc; i++)
					add_to_queue(pglob.gl_pathv[i],
					    inclist);
				globfree(&pglob);
			} else
				add_to_queue(q, inclist);
			continue;
		} else if (strcasecmp(COMPRESS_MARKER, q) == 0) {
			/* "<compress>" marker: file-wide compression type. */
			enum compress_types_enum result;

			if (verbose)
				printf("Found: %s", errline);
			q = parse = missing_field(sob(parse + 1), errline);
			parse = son(parse);
			if (!*parse)
				warnx("compress line specifies no option:\n%s",
				    errline);
			else {
				*parse = '\0';
				if (parse_compression_type(q, &result)) {
					if (compress_type_set) {
						/* First setting wins. */
						warnx("Ignoring compress line "
						    "option '%s', using '%s' instead",
						    q,
						    compression_type_name(compress_type_override));
					} else {
						if (compress_type_seen)
							warnx("Compress type should appear before all log files:\n%s",
							    errline);
						compress_type_override = result;
						compress_type_set = true;
					}
				} else {
					warnx("Bad compress option '%s'", q);
				};
			}
			continue;
		}

/*
 * Report a malformed entry and abandon it: frees the partially-built
 * 'working' entry via the cleanup label at the bottom of the loop.
 */
#define badline(msg, ...) do {						\
		warnx(msg, __VA_ARGS__);				\
		goto cleanup;						\
	} while (0)

		special = 0;
		working = init_entry(q, NULL);
		if (strcasecmp(DEFAULT_MARKER, q) == 0) {
			/* "<default>" entry: template for unlisted files. */
			special = 1;
			if (*defconf_p != NULL)
				badline("Ignoring duplicate entry for %s!", q);
			*defconf_p = working;
		}

		/* Next field: optional "owner:group", else the mode. */
		q = parse = missing_field(sob(parse + 1), errline);
		parse = son(parse);
		if (!*parse)
			badline("malformed line (missing fields):\n%s",
			    errline);
		*parse = '\0';
		if ((group = strchr(q, ':')) != NULL ||
		    (group = strrchr(q, '.')) != NULL) {
			*group++ = '\0';
			if (*q) {
				if (!(isnumberstr(q))) {
					if ((pwd = getpwnam(q)) == NULL)
						badline(
				     "error in config file; unknown user:\n%s",
						    errline);
					working->uid = pwd->pw_uid;
				} else
					/*
					 * NOTE(review): atoi() silently maps
					 * garbage to 0; isnumberstr() above
					 * presumably guarantees digits only.
					 */
					working->uid = atoi(q);
			} else
				working->uid = (uid_t)-1;

			q = group;
			if (*q) {
				if (!(isnumberstr(q))) {
					if ((grp = getgrnam(q)) == NULL)
						badline(
				    "error in config file; unknown group:\n%s",
						    errline);
					working->gid = grp->gr_gid;
				} else
					working->gid = atoi(q);
			} else
				working->gid = (gid_t)-1;

			/* Owner field consumed; advance to the mode field. */
			q = parse = missing_field(sob(parse + 1), errline);
			parse = son(parse);
			if (!*parse)
				badline("malformed line (missing fields):\n%s",
				    errline);
			*parse = '\0';
		} else {
			working->uid = (uid_t)-1;
			working->gid = (gid_t)-1;
		}

		/* Mode field: octal permissions, masked to DEFFILEMODE. */
		if (!sscanf(q, "%o", &working->permissions))
			badline("error in config file; bad permissions:\n%s",
			    errline);
		if ((working->permissions & ~DEFFILEMODE) != 0) {
			warnx("File mode bits 0%o changed to 0%o in line:\n%s",
			    working->permissions,
			    working->permissions & DEFFILEMODE, errline);
			working->permissions &= DEFFILEMODE;
		}

		/* Count field: how many rotated logs to keep. */
		q = parse = missing_field(sob(parse + 1), errline);
		parse = son(parse);
		if (!*parse)
			badline("malformed line (missing fields):\n%s",
			    errline);
		*parse = '\0';
		if (!sscanf(q, "%d", &working->numlogs) || working->numlogs < 0)
			badline("error in config file; bad value for count of logs to save:\n%s",
			    errline);

		/* Size field: rotation threshold; '*' means no size limit. */
		q = parse = missing_field(sob(parse + 1), errline);
		parse = son(parse);
		if (!*parse)
			badline("malformed line (missing fields):\n%s",
			    errline);
		*parse = '\0';
		if (isdigitch(*q))
			working->trsize = atoi(q);
		else if (strcmp(q, "*") == 0)
			working->trsize = -1;
		else {
			warnx("Invalid value of '%s' for 'size' in line:\n%s",
			    q, errline);
			working->trsize = -1;
		}

		working->flags = 0;
		working->compress = COMPRESS_NONE;

		/* Interval field: hours and/or an '@'/'$' "trim at" time. */
		q = parse = missing_field(sob(parse + 1), errline);
		parse = son(parse);
		eol = !*parse;
		*parse = '\0';
		{
			char *ep;
			u_long ul;

			ul = strtoul(q, &ep, 10);
			if (ep == q)
				working->hours = 0;
			else if (*ep == '*')
				working->hours = -1;
			else if (ul > INT_MAX)
				badline("interval is too large:\n%s", errline);
			else
				working->hours = ul;

			if (*ep == '\0' || strcmp(ep, "*") == 0)
				goto no_trimat;
			if (*ep != '@' && *ep != '$')
				badline("malformed interval/at:\n%s", errline);

			/* '@' is ISO8601-style, '$' is day/week/month. */
			working->flags |= CE_TRIMAT;
			working->trim_at = ptime_init(NULL);
			ptm_opts = PTM_PARSE_ISO8601;
			if (*ep == '$')
				ptm_opts = PTM_PARSE_DWM;
			ptm_opts |= PTM_PARSE_MATCHDOM;
			res = ptime_relparse(working->trim_at, ptm_opts,
			    ptimeget_secs(timenow), ep + 1);
			if (res == -2)
				badline("nonexistent time for 'at' value:\n%s",
				    errline);
			else if (res < 0)
				badline("malformed 'at' value:\n%s", errline);
		}
no_trimat:

		/* Flags field (optional). */
		if (eol)
			q = NULL;
		else {
			q = parse = sob(parse + 1);	/* Optional field */
			parse = son(parse);
			if (!*parse)
				eol = 1;
			*parse = '\0';
		}

		for (; q && *q && !isspacech(*q); q++) {
			switch (tolowerch(*q)) {
			case 'b':
				working->flags |= CE_BINARY;
				break;
			case 'c':
				working->flags |= CE_CREATE;
				break;
			case 'd':
				working->flags |= CE_NODUMP;
				break;
			case 'e':
				working->flags |= CE_NOEMPTY;
				break;
			case 'g':
				working->flags |= CE_GLOB;
				break;
			case 'j':
				/* bzip2, unless a <compress> line overrode. */
				if (compress_type_override == COMPRESS_LEGACY)
					working->compress = COMPRESS_BZIP2;
				else
					working->compress = compress_type_override;
				compress_type_seen = true;
				break;
			case 'n':
				working->flags |= CE_NOSIGNAL;
				break;
			case 'p':
				working->flags |= CE_PLAIN0;
				break;
			case 'r':
				working->flags |= CE_PID2CMD;
				break;
			case 't':
				working->flags |= CE_RFC5424;
				break;
			case 'u':
				working->flags |= CE_SIGNALGROUP;
				break;
			case 'w':
				/* Deprecated flag - keep for compatibility
				 * purposes */
				break;
			case 'x':
				/* xz, unless a <compress> line overrode. */
				if (compress_type_override == COMPRESS_LEGACY)
					working->compress = COMPRESS_XZ;
				else
					working->compress = compress_type_override;
				compress_type_seen = true;
				break;
			case 'y':
				/* zstd, unless a <compress> line overrode. */
				if (compress_type_override == COMPRESS_LEGACY)
					working->compress = COMPRESS_ZSTD;
				else
					working->compress = compress_type_override;
				compress_type_seen = true;
				break;
			case 'z':
				/* gzip, unless a <compress> line overrode. */
				if (compress_type_override == COMPRESS_LEGACY)
					working->compress = COMPRESS_GZIP;
				else
					working->compress = compress_type_override;
				compress_type_seen = true;
				break;
			case '-':
				break;
			case 'f':	/* Used by OpenBSD for "CE_FOLLOW" */
			case 'm':	/* Used by OpenBSD for "CE_MONITOR" */
			default:
				badline("illegal flag in config file -- %c",
				    *q);
			}
		}

		/* Pid-file / signal-name field (optional). */
		if (eol)
			q = NULL;
		else {
			q = parse = sob(parse + 1);	/* Optional field */
			parse = son(parse);
			if (!*parse)
				eol = 1;
			*parse = '\0';
		}

		working->pid_cmd_file = NULL;
		if (q && *q) {
			if (*q == '/')
				working->pid_cmd_file = strdup(q);
			else if (isalnum(*q))
				/* Not a path: treat it as the signal field. */
				goto got_sig;
			else {
				badline(
			"illegal pid file or signal in config file:\n%s",
				    errline);
			}
		}

		/* Signal field (optional, defaults to SIGHUP). */
		if (eol)
			q = NULL;
		else {
			q = parse = sob(parse + 1);	/* Optional field */
			parse = son(parse);
			*parse = '\0';
		}

		working->sig = SIGHUP;
		if (q && *q) {
got_sig:
			working->sig = parse_signal(q);
			if (working->sig < 1 || working->sig >= sys_nsig) {
				badline(
				    "illegal signal in config file:\n%s",
				    errline);
			}
		}

		/*
		 * Finish figuring out what pid-file to use (if any) in
		 * later processing if this logfile needs to be rotated.
		 */
		if ((working->flags & CE_NOSIGNAL) == CE_NOSIGNAL) {
			/*
			 * This config-entry specified 'n' for nosignal,
			 * see if it also specified an explicit pid_cmd_file.
			 * This would be a pretty pointless combination.
		 */
			if (working->pid_cmd_file != NULL) {
				warnx("Ignoring '%s' because flag 'n' was specified in line:\n%s",
				    working->pid_cmd_file, errline);
				free(working->pid_cmd_file);
				working->pid_cmd_file = NULL;
			}
		} else if (working->pid_cmd_file == NULL) {
			/*
			 * This entry did not specify the 'n' flag, which
			 * means it should signal syslogd unless it had
			 * specified some other pid-file (and obviously the
			 * syslog pid-file will not be for a process-group).
			 * Also, we should only try to notify syslog if we
			 * are root.
			 */
			if (working->flags & CE_SIGNALGROUP) {
				warnx("Ignoring flag 'U' in line:\n%s",
				    errline);
				working->flags &= ~CE_SIGNALGROUP;
			}
			if (needroot)
				working->pid_cmd_file = strdup(path_syslogpid);
		}

		/*
		 * Add this entry to the appropriate list of entries, unless
		 * it was some kind of special entry (eg: <default>).
		 */
		if (special) {
			;	/* Do not add to any list */
		} else if (working->flags & CE_GLOB) {
			STAILQ_INSERT_TAIL(glob_p, working, cf_nextp);
		} else {
			STAILQ_INSERT_TAIL(work_p, working, cf_nextp);
		}
		continue;
cleanup:
		/* badline jumps here: discard the partially-built entry. */
		free_entry(working);
#undef badline
	} /* while (fgets(line, BUFSIZ, cf)) */

	if (errline != NULL)
		free(errline);
}

/*
 * Return the given config-file field, exiting with an error (and the
 * offending line) if the field is missing.
 */
static char *
missing_field(char *p, char *errline)
{
	if (!p || !*p)
		errx(1, "missing field in config file:\n%s", errline);
	return (p);
}

/*
 * In our sort we return it in the reverse of what qsort normally
 * would do, as we want the newest files first.  If we have two
 * entries with the same time we don't really care about order.
 *
 * Support function for qsort() in delete_oldest_timelog().
 */
static int
oldlog_entry_compare(const void *a, const void *b)
{
	const struct oldlog_entry *ola = a, *olb = b;

	if (ola->t > olb->t)
		return (-1);
	else if (ola->t < olb->t)
		return (1);
	else
		return (0);
}

/*
 * Check whether the file corresponding to dp is an archive of the logfile
 * logfname, based on the timefnamefmt format string. Return true and fill out
 * tm if this is the case; otherwise return false.
*/ static int validate_old_timelog(int fd, const struct dirent *dp, const char *logfname, struct tm *tm) { struct stat sb; size_t logfname_len; char *s; int c; logfname_len = strlen(logfname); if (dp->d_type != DT_REG) { /* * Some filesystems (e.g. NFS) don't fill out the d_type field * and leave it set to DT_UNKNOWN; in this case we must obtain * the file type ourselves. */ if (dp->d_type != DT_UNKNOWN || fstatat(fd, dp->d_name, &sb, AT_SYMLINK_NOFOLLOW) != 0 || !S_ISREG(sb.st_mode)) return (0); } /* Ignore everything but files with our logfile prefix. */ if (strncmp(dp->d_name, logfname, logfname_len) != 0) return (0); /* Ignore the actual non-rotated logfile. */ if (dp->d_namlen == logfname_len) return (0); /* * Make sure we created have found a logfile, so the * postfix is valid, IE format is: '.