diff --git a/sys/compat/linux/linux.c b/sys/compat/linux/linux.c index e72e5cbc7bfc..8cc678e1ec7d 100644 --- a/sys/compat/linux/linux.c +++ b/sys/compat/linux/linux.c @@ -1,724 +1,721 @@ /*- * Copyright (c) 2015 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -struct futex_list futex_list; -struct mtx futex_mtx; /* protects the futex list */ - CTASSERT(LINUX_IFNAMSIZ == IFNAMSIZ); static int bsd_to_linux_sigtbl[LINUX_SIGTBLSZ] = { LINUX_SIGHUP, /* SIGHUP */ LINUX_SIGINT, /* SIGINT */ LINUX_SIGQUIT, /* SIGQUIT */ LINUX_SIGILL, /* SIGILL */ LINUX_SIGTRAP, /* SIGTRAP */ LINUX_SIGABRT, /* SIGABRT */ 0, /* SIGEMT */ LINUX_SIGFPE, /* SIGFPE */ LINUX_SIGKILL, /* SIGKILL */ LINUX_SIGBUS, /* SIGBUS */ LINUX_SIGSEGV, /* SIGSEGV */ LINUX_SIGSYS, /* SIGSYS */ LINUX_SIGPIPE, /* SIGPIPE */ LINUX_SIGALRM, /* SIGALRM */ LINUX_SIGTERM, /* SIGTERM */ LINUX_SIGURG, /* SIGURG */ LINUX_SIGSTOP, /* SIGSTOP */ LINUX_SIGTSTP, /* SIGTSTP */ LINUX_SIGCONT, /* SIGCONT */ LINUX_SIGCHLD, /* SIGCHLD */ LINUX_SIGTTIN, /* SIGTTIN */ LINUX_SIGTTOU, /* SIGTTOU */ LINUX_SIGIO, /* SIGIO */ LINUX_SIGXCPU, /* SIGXCPU */ LINUX_SIGXFSZ, /* SIGXFSZ */ LINUX_SIGVTALRM,/* SIGVTALRM */ LINUX_SIGPROF, /* SIGPROF */ LINUX_SIGWINCH, /* SIGWINCH */ 0, /* SIGINFO */ LINUX_SIGUSR1, /* SIGUSR1 */ LINUX_SIGUSR2 /* SIGUSR2 */ }; static int linux_to_bsd_sigtbl[LINUX_SIGTBLSZ] = { SIGHUP, /* LINUX_SIGHUP */ SIGINT, /* LINUX_SIGINT */ SIGQUIT, /* LINUX_SIGQUIT */ SIGILL, /* LINUX_SIGILL */ SIGTRAP, /* LINUX_SIGTRAP */ SIGABRT, /* LINUX_SIGABRT */ SIGBUS, /* LINUX_SIGBUS */ SIGFPE, /* LINUX_SIGFPE */ SIGKILL, /* LINUX_SIGKILL */ SIGUSR1, /* LINUX_SIGUSR1 */ SIGSEGV, /* LINUX_SIGSEGV */ SIGUSR2, /* LINUX_SIGUSR2 */ SIGPIPE, /* LINUX_SIGPIPE */ SIGALRM, /* LINUX_SIGALRM */ SIGTERM, /* LINUX_SIGTERM */ SIGBUS, /* LINUX_SIGSTKFLT */ SIGCHLD, /* LINUX_SIGCHLD */ SIGCONT, /* LINUX_SIGCONT */ SIGSTOP, /* LINUX_SIGSTOP */ SIGTSTP, /* LINUX_SIGTSTP */ SIGTTIN, /* LINUX_SIGTTIN */ SIGTTOU, /* LINUX_SIGTTOU */ SIGURG, /* LINUX_SIGURG */ SIGXCPU, /* LINUX_SIGXCPU */ SIGXFSZ, /* LINUX_SIGXFSZ */ SIGVTALRM, /* LINUX_SIGVTALARM */ SIGPROF, /* LINUX_SIGPROF */ SIGWINCH, /* LINUX_SIGWINCH */ SIGIO, /* LINUX_SIGIO */ /* * FreeBSD does not have SIGPWR signal, map Linux SIGPWR signal * to the first unused FreeBSD signal number. Since Linux supports * signals from 1 to 64 we are ok here as our SIGRTMIN = 65. */ SIGRTMIN, /* LINUX_SIGPWR */ SIGSYS /* LINUX_SIGSYS */ }; static struct cdev *dev_shm_cdev; static struct cdevsw dev_shm_cdevsw = { .d_version = D_VERSION, .d_name = "dev_shm", }; /* * Map Linux RT signals to the FreeBSD RT signals. */ static inline int linux_to_bsd_rt_signal(int sig) { return (SIGRTMIN + 1 + sig - LINUX_SIGRTMIN); } static inline int bsd_to_linux_rt_signal(int sig) { return (sig - SIGRTMIN - 1 + LINUX_SIGRTMIN); } int linux_to_bsd_signal(int sig) { KASSERT(sig > 0 && sig <= LINUX_SIGRTMAX, ("invalid Linux signal %d\n", sig)); if (sig < LINUX_SIGRTMIN) return (linux_to_bsd_sigtbl[_SIG_IDX(sig)]); return (linux_to_bsd_rt_signal(sig)); } int bsd_to_linux_signal(int sig) { if (sig <= LINUX_SIGTBLSZ) return (bsd_to_linux_sigtbl[_SIG_IDX(sig)]); if (sig == SIGRTMIN) return (LINUX_SIGPWR); return (bsd_to_linux_rt_signal(sig)); } int linux_to_bsd_sigaltstack(int lsa) { int bsa = 0; if (lsa & LINUX_SS_DISABLE) bsa |= SS_DISABLE; /* * Linux ignores SS_ONSTACK flag for ss * parameter while FreeBSD prohibits it. */ return (bsa); } int bsd_to_linux_sigaltstack(int bsa) { int lsa = 0; if (bsa & SS_DISABLE) lsa |= LINUX_SS_DISABLE; if (bsa & SS_ONSTACK) lsa |= LINUX_SS_ONSTACK; return (lsa); } void linux_to_bsd_sigset(l_sigset_t *lss, sigset_t *bss) { int b, l; SIGEMPTYSET(*bss); for (l = 1; l <= LINUX_SIGRTMAX; l++) { if (LINUX_SIGISMEMBER(*lss, l)) { b = linux_to_bsd_signal(l); if (b) SIGADDSET(*bss, b); } } } void bsd_to_linux_sigset(sigset_t *bss, l_sigset_t *lss) { int b, l; LINUX_SIGEMPTYSET(*lss); for (b = 1; b <= SIGRTMAX; b++) { if (SIGISMEMBER(*bss, b)) { l = bsd_to_linux_signal(b); if (l) LINUX_SIGADDSET(*lss, l); } } } /* * Translate a Linux interface name to a FreeBSD interface name, * and return the associated ifnet structure * bsdname and lxname need to be least IFNAMSIZ bytes long, but * can point to the same buffer. */ struct ifnet * ifname_linux_to_bsd(struct thread *td, const char *lxname, char *bsdname) { struct ifnet *ifp; int len, unit; char *ep; int index; bool is_eth, is_lo; for (len = 0; len < LINUX_IFNAMSIZ; ++len) if (!isalpha(lxname[len]) || lxname[len] == '\0') break; if (len == 0 || len == LINUX_IFNAMSIZ) return (NULL); /* Linux loopback interface name is lo (not lo0) */ is_lo = (len == 2 && strncmp(lxname, "lo", len) == 0); unit = (int)strtoul(lxname + len, &ep, 10); if ((ep == NULL || ep == lxname + len || ep >= lxname + LINUX_IFNAMSIZ) && is_lo == 0) return (NULL); index = 0; is_eth = (len == 3 && strncmp(lxname, "eth", len) == 0); CURVNET_SET(TD_TO_VNET(td)); IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* * Allow Linux programs to use FreeBSD names. Don't presume * we never have an interface named "eth", so don't make * the test optional based on is_eth. */ if (strncmp(ifp->if_xname, lxname, LINUX_IFNAMSIZ) == 0) break; if (is_eth && IFP_IS_ETH(ifp) && unit == index++) break; if (is_lo && IFP_IS_LOOP(ifp)) break; } IFNET_RUNLOCK(); CURVNET_RESTORE(); if (ifp != NULL && bsdname != NULL) strlcpy(bsdname, ifp->if_xname, IFNAMSIZ); return (ifp); } void linux_ifflags(struct ifnet *ifp, short *flags) { unsigned short fl; fl = (ifp->if_flags | ifp->if_drv_flags) & 0xffff; *flags = 0; if (fl & IFF_UP) *flags |= LINUX_IFF_UP; if (fl & IFF_BROADCAST) *flags |= LINUX_IFF_BROADCAST; if (fl & IFF_DEBUG) *flags |= LINUX_IFF_DEBUG; if (fl & IFF_LOOPBACK) *flags |= LINUX_IFF_LOOPBACK; if (fl & IFF_POINTOPOINT) *flags |= LINUX_IFF_POINTOPOINT; if (fl & IFF_DRV_RUNNING) *flags |= LINUX_IFF_RUNNING; if (fl & IFF_NOARP) *flags |= LINUX_IFF_NOARP; if (fl & IFF_PROMISC) *flags |= LINUX_IFF_PROMISC; if (fl & IFF_ALLMULTI) *flags |= LINUX_IFF_ALLMULTI; if (fl & IFF_MULTICAST) *flags |= LINUX_IFF_MULTICAST; } int linux_ifhwaddr(struct ifnet *ifp, struct l_sockaddr *lsa) { struct ifaddr *ifa; struct sockaddr_dl *sdl; if (IFP_IS_LOOP(ifp)) { bzero(lsa, sizeof(*lsa)); lsa->sa_family = LINUX_ARPHRD_LOOPBACK; return (0); } if (!IFP_IS_ETH(ifp)) return (ENOENT); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sdl = (struct sockaddr_dl*)ifa->ifa_addr; if (sdl != NULL && (sdl->sdl_family == AF_LINK) && (sdl->sdl_type == IFT_ETHER)) { bzero(lsa, sizeof(*lsa)); lsa->sa_family = LINUX_ARPHRD_ETHER; bcopy(LLADDR(sdl), lsa->sa_data, LINUX_IFHWADDRLEN); return (0); } } return (ENOENT); } int linux_to_bsd_domain(int domain) { switch (domain) { case LINUX_AF_UNSPEC: return (AF_UNSPEC); case LINUX_AF_UNIX: return (AF_LOCAL); case LINUX_AF_INET: return (AF_INET); case LINUX_AF_INET6: return (AF_INET6); case LINUX_AF_AX25: return (AF_CCITT); case LINUX_AF_IPX: return (AF_IPX); case LINUX_AF_APPLETALK: return (AF_APPLETALK); } return (-1); } int bsd_to_linux_domain(int domain) { switch (domain) { case AF_UNSPEC: return (LINUX_AF_UNSPEC); case AF_LOCAL: return (LINUX_AF_UNIX); case AF_INET: return (LINUX_AF_INET); case AF_INET6: return (LINUX_AF_INET6); case AF_CCITT: return (LINUX_AF_AX25); case AF_IPX: return (LINUX_AF_IPX); case AF_APPLETALK: return (LINUX_AF_APPLETALK); } return (-1); } /* * Based on the fact that: * 1. Native and Linux storage of struct sockaddr * and struct sockaddr_in6 are equal. * 2. On Linux sa_family is the first member of all struct sockaddr. */ int bsd_to_linux_sockaddr(const struct sockaddr *sa, struct l_sockaddr **lsa, socklen_t len) { struct l_sockaddr *kosa; int error, bdom; *lsa = NULL; if (len < 2 || len > UCHAR_MAX) return (EINVAL); kosa = malloc(len, M_SONAME, M_WAITOK); bcopy(sa, kosa, len); bdom = bsd_to_linux_domain(sa->sa_family); if (bdom == -1) { error = EAFNOSUPPORT; goto out; } kosa->sa_family = bdom; *lsa = kosa; return (0); out: free(kosa, M_SONAME); return (error); } int linux_to_bsd_sockaddr(const struct l_sockaddr *osa, struct sockaddr **sap, socklen_t *len) { struct sockaddr *sa; struct l_sockaddr *kosa; #ifdef INET6 struct sockaddr_in6 *sin6; bool oldv6size; #endif char *name; int salen, bdom, error, hdrlen, namelen; if (*len < 2 || *len > UCHAR_MAX) return (EINVAL); salen = *len; #ifdef INET6 oldv6size = false; /* * Check for old (pre-RFC2553) sockaddr_in6. We may accept it * if it's a v4-mapped address, so reserve the proper space * for it. */ if (salen == sizeof(struct sockaddr_in6) - sizeof(uint32_t)) { salen += sizeof(uint32_t); oldv6size = true; } #endif kosa = malloc(salen, M_SONAME, M_WAITOK); if ((error = copyin(osa, kosa, *len))) goto out; bdom = linux_to_bsd_domain(kosa->sa_family); if (bdom == -1) { error = EAFNOSUPPORT; goto out; } #ifdef INET6 /* * Older Linux IPv6 code uses obsolete RFC2133 struct sockaddr_in6, * which lacks the scope id compared with RFC2553 one. If we detect * the situation, reject the address and write a message to system log. * * Still accept addresses for which the scope id is not used. */ if (oldv6size) { if (bdom == AF_INET6) { sin6 = (struct sockaddr_in6 *)kosa; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) || (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_V4COMPAT(&sin6->sin6_addr) && !IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) && !IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))) { sin6->sin6_scope_id = 0; } else { linux_msg(curthread, "obsolete pre-RFC2553 sockaddr_in6 rejected"); error = EINVAL; goto out; } } else salen -= sizeof(uint32_t); } #endif if (bdom == AF_INET) { if (salen < sizeof(struct sockaddr_in)) { error = EINVAL; goto out; } salen = sizeof(struct sockaddr_in); } if (bdom == AF_LOCAL && salen > sizeof(struct sockaddr_un)) { hdrlen = offsetof(struct sockaddr_un, sun_path); name = ((struct sockaddr_un *)kosa)->sun_path; if (*name == '\0') { /* * Linux abstract namespace starts with a NULL byte. * XXX We do not support abstract namespace yet. */ namelen = strnlen(name + 1, salen - hdrlen - 1) + 1; } else namelen = strnlen(name, salen - hdrlen); salen = hdrlen + namelen; if (salen > sizeof(struct sockaddr_un)) { error = ENAMETOOLONG; goto out; } } sa = (struct sockaddr *)kosa; sa->sa_family = bdom; sa->sa_len = salen; *sap = sa; *len = salen; return (0); out: free(kosa, M_SONAME); return (error); } void linux_dev_shm_create(void) { int error; error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &dev_shm_cdev, &dev_shm_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0, "shm/.mountpoint"); if (error != 0) { printf("%s: failed to create device node, error %d\n", __func__, error); } } void linux_dev_shm_destroy(void) { destroy_dev(dev_shm_cdev); } int bsd_to_linux_bits_(int value, struct bsd_to_linux_bitmap *bitmap, size_t mapcnt, int no_value) { int bsd_mask, bsd_value, linux_mask, linux_value; int linux_ret; size_t i; bool applied; applied = false; linux_ret = 0; for (i = 0; i < mapcnt; ++i) { bsd_mask = bitmap[i].bsd_mask; bsd_value = bitmap[i].bsd_value; if (bsd_mask == 0) bsd_mask = bsd_value; linux_mask = bitmap[i].linux_mask; linux_value = bitmap[i].linux_value; if (linux_mask == 0) linux_mask = linux_value; /* * If a mask larger than just the value is set, we explicitly * want to make sure that only this bit we mapped within that * mask is set. */ if ((value & bsd_mask) == bsd_value) { linux_ret = (linux_ret & ~linux_mask) | linux_value; applied = true; } } if (!applied) return (no_value); return (linux_ret); } int linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap, size_t mapcnt, int no_value) { int bsd_mask, bsd_value, linux_mask, linux_value; int bsd_ret; size_t i; bool applied; applied = false; bsd_ret = 0; for (i = 0; i < mapcnt; ++i) { bsd_mask = bitmap[i].bsd_mask; bsd_value = bitmap[i].bsd_value; if (bsd_mask == 0) bsd_mask = bsd_value; linux_mask = bitmap[i].linux_mask; linux_value = bitmap[i].linux_value; if (linux_mask == 0) linux_mask = linux_value; /* * If a mask larger than just the value is set, we explicitly * want to make sure that only this bit we mapped within that * mask is set. */ if ((value & linux_mask) == linux_value) { bsd_ret = (bsd_ret & ~bsd_mask) | bsd_value; applied = true; } } if (!applied) return (no_value); return (bsd_ret); } void linux_to_bsd_poll_events(struct thread *td, int fd, short lev, short *bev) { struct proc *p = td->td_proc; struct filedesc *fdp; struct file *fp; int error; short bits = 0; if (lev & LINUX_POLLIN) bits |= POLLIN; if (lev & LINUX_POLLPRI) bits |= POLLPRI; if (lev & LINUX_POLLOUT) bits |= POLLOUT; if (lev & LINUX_POLLERR) bits |= POLLERR; if (lev & LINUX_POLLHUP) bits |= POLLHUP; if (lev & LINUX_POLLNVAL) bits |= POLLNVAL; if (lev & LINUX_POLLRDNORM) bits |= POLLRDNORM; if (lev & LINUX_POLLRDBAND) bits |= POLLRDBAND; if (lev & LINUX_POLLWRBAND) bits |= POLLWRBAND; if (lev & LINUX_POLLWRNORM) bits |= POLLWRNORM; if (lev & LINUX_POLLRDHUP) { /* * It seems that the Linux silencly ignores POLLRDHUP * on non-socket file descriptors unlike FreeBSD, where * events bits is more strictly checked (POLLSTANDARD). */ fdp = p->p_fd; error = fget_unlocked(fdp, fd, &cap_no_rights, &fp); if (error == 0) { /* * XXX. On FreeBSD POLLRDHUP applies only to * stream sockets. */ if (fp->f_type == DTYPE_SOCKET) bits |= POLLRDHUP; fdrop(fp, td); } } if (lev & LINUX_POLLMSG) LINUX_RATELIMIT_MSG_OPT1("unsupported POLLMSG, events(%d)", lev); if (lev & LINUX_POLLREMOVE) LINUX_RATELIMIT_MSG_OPT1("unsupported POLLREMOVE, events(%d)", lev); *bev = bits; } void bsd_to_linux_poll_events(short bev, short *lev) { short bits = 0; if (bev & POLLIN) bits |= LINUX_POLLIN; if (bev & POLLPRI) bits |= LINUX_POLLPRI; if (bev & (POLLOUT | POLLWRNORM)) /* * POLLWRNORM is equal to POLLOUT on FreeBSD, * but not on Linux */ bits |= LINUX_POLLOUT; if (bev & POLLERR) bits |= LINUX_POLLERR; if (bev & POLLHUP) bits |= LINUX_POLLHUP; if (bev & POLLNVAL) bits |= LINUX_POLLNVAL; if (bev & POLLRDNORM) bits |= LINUX_POLLRDNORM; if (bev & POLLRDBAND) bits |= LINUX_POLLRDBAND; if (bev & POLLWRBAND) bits |= LINUX_POLLWRBAND; if (bev & POLLRDHUP) bits |= LINUX_POLLRDHUP; *lev = bits; } diff --git a/sys/compat/linux/linux.h b/sys/compat/linux/linux.h index 2ec12fd64d58..0b405b692796 100644 --- a/sys/compat/linux/linux.h +++ b/sys/compat/linux/linux.h @@ -1,265 +1,262 @@ /*- * Copyright (c) 2015 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_MI_H_ #define _LINUX_MI_H_ #include /* * Private Brandinfo flags */ #define LINUX_BI_FUTEX_REQUEUE 0x01000000 /* * poll() */ #define LINUX_POLLIN 0x0001 #define LINUX_POLLPRI 0x0002 #define LINUX_POLLOUT 0x0004 #define LINUX_POLLERR 0x0008 #define LINUX_POLLHUP 0x0010 #define LINUX_POLLNVAL 0x0020 #define LINUX_POLLRDNORM 0x0040 #define LINUX_POLLRDBAND 0x0080 #define LINUX_POLLWRNORM 0x0100 #define LINUX_POLLWRBAND 0x0200 #define LINUX_POLLMSG 0x0400 #define LINUX_POLLREMOVE 0x1000 #define LINUX_POLLRDHUP 0x2000 #define LINUX_IFHWADDRLEN 6 #define LINUX_IFNAMSIZ 16 struct l_sockaddr { unsigned short sa_family; char sa_data[14]; }; #define LINUX_ARPHRD_ETHER 1 #define LINUX_ARPHRD_LOOPBACK 772 /* * Supported address families */ #define LINUX_AF_UNSPEC 0 #define LINUX_AF_UNIX 1 #define LINUX_AF_INET 2 #define LINUX_AF_AX25 3 #define LINUX_AF_IPX 4 #define LINUX_AF_APPLETALK 5 #define LINUX_AF_INET6 10 #define LINUX_AF_NETLINK 16 #define LINUX_NETLINK_ROUTE 0 #define LINUX_NETLINK_SOCK_DIAG 4 #define LINUX_NETLINK_NFLOG 5 #define LINUX_NETLINK_SELINUX 7 #define LINUX_NETLINK_AUDIT 9 #define LINUX_NETLINK_FIB_LOOKUP 10 #define LINUX_NETLINK_NETFILTER 12 #define LINUX_NETLINK_KOBJECT_UEVENT 15 /* * net device flags */ #define LINUX_IFF_UP 0x0001 #define LINUX_IFF_BROADCAST 0x0002 #define LINUX_IFF_DEBUG 0x0004 #define LINUX_IFF_LOOPBACK 0x0008 #define LINUX_IFF_POINTOPOINT 0x0010 #define LINUX_IFF_NOTRAILERS 0x0020 #define LINUX_IFF_RUNNING 0x0040 #define LINUX_IFF_NOARP 0x0080 #define LINUX_IFF_PROMISC 0x0100 #define LINUX_IFF_ALLMULTI 0x0200 #define LINUX_IFF_MASTER 0x0400 #define LINUX_IFF_SLAVE 0x0800 #define LINUX_IFF_MULTICAST 0x1000 #define LINUX_IFF_PORTSEL 0x2000 #define LINUX_IFF_AUTOMEDIA 0x4000 #define LINUX_IFF_DYNAMIC 0x8000 /* sigaltstack */ #define LINUX_SS_ONSTACK 1 #define LINUX_SS_DISABLE 2 int linux_to_bsd_sigaltstack(int lsa); int bsd_to_linux_sigaltstack(int bsa); /* sigset */ typedef struct { uint64_t __mask; } l_sigset_t; /* primitives to manipulate sigset_t */ #define LINUX_SIGEMPTYSET(set) (set).__mask = 0 #define LINUX_SIGISMEMBER(set, sig) (1UL & ((set).__mask >> _SIG_IDX(sig))) #define LINUX_SIGADDSET(set, sig) (set).__mask |= 1UL << _SIG_IDX(sig) void linux_to_bsd_sigset(l_sigset_t *, sigset_t *); void bsd_to_linux_sigset(sigset_t *, l_sigset_t *); /* signaling */ #define LINUX_SIGHUP 1 #define LINUX_SIGINT 2 #define LINUX_SIGQUIT 3 #define LINUX_SIGILL 4 #define LINUX_SIGTRAP 5 #define LINUX_SIGABRT 6 #define LINUX_SIGIOT LINUX_SIGABRT #define LINUX_SIGBUS 7 #define LINUX_SIGFPE 8 #define LINUX_SIGKILL 9 #define LINUX_SIGUSR1 10 #define LINUX_SIGSEGV 11 #define LINUX_SIGUSR2 12 #define LINUX_SIGPIPE 13 #define LINUX_SIGALRM 14 #define LINUX_SIGTERM 15 #define LINUX_SIGSTKFLT 16 #define LINUX_SIGCHLD 17 #define LINUX_SIGCONT 18 #define LINUX_SIGSTOP 19 #define LINUX_SIGTSTP 20 #define LINUX_SIGTTIN 21 #define LINUX_SIGTTOU 22 #define LINUX_SIGURG 23 #define LINUX_SIGXCPU 24 #define LINUX_SIGXFSZ 25 #define LINUX_SIGVTALRM 26 #define LINUX_SIGPROF 27 #define LINUX_SIGWINCH 28 #define LINUX_SIGIO 29 #define LINUX_SIGPOLL LINUX_SIGIO #define LINUX_SIGPWR 30 #define LINUX_SIGSYS 31 #define LINUX_SIGTBLSZ 31 #define LINUX_SIGRTMIN 32 #define LINUX_SIGRTMAX 64 #define LINUX_SIG_VALID(sig) ((sig) <= LINUX_SIGRTMAX && (sig) > 0) int linux_to_bsd_signal(int sig); int bsd_to_linux_signal(int sig); -extern LIST_HEAD(futex_list, futex) futex_list; -extern struct mtx futex_mtx; - void linux_dev_shm_create(void); void linux_dev_shm_destroy(void); /* * mask=0 is not sensible for this application, so it will be taken to mean * a mask equivalent to the value. Otherwise, (word & mask) == value maps to * (word & ~mask) | value in a bitfield for the platform we're converting to. */ struct bsd_to_linux_bitmap { int bsd_mask; int bsd_value; int linux_mask; int linux_value; }; int bsd_to_linux_bits_(int value, struct bsd_to_linux_bitmap *bitmap, size_t mapcnt, int no_value); int linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap, size_t mapcnt, int no_value); /* * These functions are used for simplification of BSD <-> Linux bit conversions. * Given `value`, a bit field, these functions will walk the given bitmap table * and set the appropriate bits for the target platform. If any bits were * successfully converted, then the return value is the equivalent of value * represented with the bit values appropriate for the target platform. * Otherwise, the value supplied as `no_value` is returned. */ #define bsd_to_linux_bits(_val, _bmap, _noval) \ bsd_to_linux_bits_((_val), (_bmap), nitems((_bmap)), (_noval)) #define linux_to_bsd_bits(_val, _bmap, _noval) \ linux_to_bsd_bits_((_val), (_bmap), nitems((_bmap)), (_noval)) /* * Easy mapping helpers. BITMAP_EASY_LINUX represents a single bit to be * translated, and the FreeBSD and Linux values are supplied. BITMAP_1t1_LINUX * is the extreme version of this, where not only is it a single bit, but the * name of the macro used to represent the Linux version of a bit literally has * LINUX_ prepended to the normal name. */ #define BITMAP_EASY_LINUX(_name, _linux_name) \ { \ .bsd_value = (_name), \ .linux_value = (_linux_name), \ } #define BITMAP_1t1_LINUX(_name) BITMAP_EASY_LINUX(_name, LINUX_##_name) int bsd_to_linux_errno(int error); void linux_check_errtbl(void); #define STATX_BASIC_STATS 0x07ff #define STATX_BTIME 0x0800 #define STATX_ALL 0x0fff #define STATX_ATTR_COMPRESSED 0x0004 #define STATX_ATTR_IMMUTABLE 0x0010 #define STATX_ATTR_APPEND 0x0020 #define STATX_ATTR_NODUMP 0x0040 #define STATX_ATTR_ENCRYPTED 0x0800 #define STATX_ATTR_AUTOMOUNT 0x1000 struct l_statx_timestamp { int64_t tv_sec; int32_t tv_nsec; int32_t __spare0; }; struct l_statx { uint32_t stx_mask; uint32_t stx_blksize; uint64_t stx_attributes; uint32_t stx_nlink; uint32_t stx_uid; uint32_t stx_gid; uint16_t stx_mode; uint16_t __spare0[1]; uint64_t stx_ino; uint64_t stx_size; uint64_t stx_blocks; uint64_t stx_attributes_mask; struct l_statx_timestamp stx_atime; struct l_statx_timestamp stx_btime; struct l_statx_timestamp stx_ctime; struct l_statx_timestamp stx_mtime; uint32_t stx_rdev_major; uint32_t stx_rdev_minor; uint32_t stx_dev_major; uint32_t stx_dev_minor; uint64_t stx_mnt_id; uint64_t __spare2[13]; }; #endif /* _LINUX_MI_H_ */ diff --git a/sys/compat/linux/linux_common.c b/sys/compat/linux/linux_common.c index 4d81470649de..0009d9d26164 100644 --- a/sys/compat/linux/linux_common.c +++ b/sys/compat/linux/linux_common.c @@ -1,90 +1,86 @@ /*- * Copyright (c) 2014 Vassilis Laganakos * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include -#include #include #include #include #include #include #include SET_DECLARE(linux_device_handler_set, struct linux_device_handler); TAILQ_HEAD(, linux_ioctl_handler_element) linux_ioctl_handlers = TAILQ_HEAD_INITIALIZER(linux_ioctl_handlers); struct sx linux_ioctl_sx; SX_SYSINIT(linux_ioctl, &linux_ioctl_sx, "Linux ioctl handlers"); static int linux_common_modevent(module_t mod, int type, void *data) { struct linux_device_handler **ldhp; switch(type) { case MOD_LOAD: #ifdef INVARIANTS linux_check_errtbl(); #endif linux_dev_shm_create(); linux_osd_jail_register(); SET_FOREACH(ldhp, linux_device_handler_set) linux_device_register_handler(*ldhp); - LIST_INIT(&futex_list); - mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF); break; case MOD_UNLOAD: linux_dev_shm_destroy(); linux_osd_jail_deregister(); SET_FOREACH(ldhp, linux_device_handler_set) linux_device_unregister_handler(*ldhp); - mtx_destroy(&futex_mtx); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t linux_common_mod = { "linux_common", linux_common_modevent, 0 }; DECLARE_MODULE(linux_common, linux_common_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_VERSION(linux_common, 1); diff --git a/sys/compat/linux/linux_fork.c b/sys/compat/linux/linux_fork.c index 8230d5b4108b..ff2d5dcfc957 100644 --- a/sys/compat/linux/linux_fork.c +++ b/sys/compat/linux/linux_fork.c @@ -1,446 +1,439 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2002 Doug Rabson * Copyright (c) 2000 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #ifdef LINUX_LEGACY_SYSCALLS int linux_fork(struct thread *td, struct linux_fork_args *args) { struct fork_req fr; int error; struct proc *p2; struct thread *td2; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; fr.fr_procp = &p2; if ((error = fork1(td, &fr)) != 0) return (error); td2 = FIRST_THREAD_IN_PROC(p2); linux_proc_init(td, td2, 0); td->td_retval[0] = p2->p_pid; /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); return (0); } int linux_vfork(struct thread *td, struct linux_vfork_args *args) { struct fork_req fr; int error; struct proc *p2; struct thread *td2; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFMEM | RFPPWAIT | RFSTOPPED; fr.fr_procp = &p2; if ((error = fork1(td, &fr)) != 0) return (error); td2 = FIRST_THREAD_IN_PROC(p2); linux_proc_init(td, td2, 0); td->td_retval[0] = p2->p_pid; /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); return (0); } #endif static int linux_clone_proc(struct thread *td, struct linux_clone_args *args) { struct fork_req fr; int error, ff = RFPROC | RFSTOPPED, f2; struct proc *p2; struct thread *td2; int exit_signal; struct linux_emuldata *em; f2 = 0; exit_signal = args->flags & 0x000000ff; if (LINUX_SIG_VALID(exit_signal)) { exit_signal = linux_to_bsd_signal(exit_signal); } else if (exit_signal != 0) return (EINVAL); if (args->flags & LINUX_CLONE_VM) ff |= RFMEM; if (args->flags & LINUX_CLONE_SIGHAND) ff |= RFSIGSHARE; if (args->flags & LINUX_CLONE_FILES) { if (!(args->flags & LINUX_CLONE_FS)) f2 |= FR2_SHARE_PATHS; } else { ff |= RFFDG; if (args->flags & LINUX_CLONE_FS) f2 |= FR2_SHARE_PATHS; } if (args->flags & LINUX_CLONE_PARENT_SETTID) if (args->parent_tidptr == NULL) return (EINVAL); if (args->flags & LINUX_CLONE_VFORK) ff |= RFPPWAIT; bzero(&fr, sizeof(fr)); fr.fr_flags = ff; fr.fr_flags2 = f2; fr.fr_procp = &p2; error = fork1(td, &fr); if (error) return (error); td2 = FIRST_THREAD_IN_PROC(p2); /* create the emuldata */ linux_proc_init(td, td2, args->flags); em = em_find(td2); KASSERT(em != NULL, ("clone_proc: emuldata not found.\n")); if (args->flags & LINUX_CLONE_CHILD_SETTID) em->child_set_tid = args->child_tidptr; else em->child_set_tid = NULL; if (args->flags & LINUX_CLONE_CHILD_CLEARTID) em->child_clear_tid = args->child_tidptr; else em->child_clear_tid = NULL; if (args->flags & LINUX_CLONE_PARENT_SETTID) { error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid)); if (error) linux_msg(td, "copyout p_pid failed!"); } PROC_LOCK(p2); p2->p_sigparent = exit_signal; PROC_UNLOCK(p2); /* * In a case of stack = NULL, we are supposed to COW calling process * stack. This is what normal fork() does, so we just keep tf_rsp arg * intact. */ linux_set_upcall(td2, PTROUT(args->stack)); if (args->flags & LINUX_CLONE_SETTLS) linux_set_cloned_tls(td2, args->tls); /* * If CLONE_PARENT is set, then the parent of the new process will be * the same as that of the calling process. */ if (args->flags & LINUX_CLONE_PARENT) { sx_xlock(&proctree_lock); PROC_LOCK(p2); proc_reparent(p2, td->td_proc->p_pptr, true); PROC_UNLOCK(p2); sx_xunlock(&proctree_lock); } /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); td->td_retval[0] = p2->p_pid; return (0); } static int linux_clone_thread(struct thread *td, struct linux_clone_args *args) { struct linux_emuldata *em; struct thread *newtd; struct proc *p; int error; LINUX_CTR4(clone_thread, "thread(%d) flags %x ptid %p ctid %p", td->td_tid, (unsigned)args->flags, args->parent_tidptr, args->child_tidptr); if ((args->flags & LINUX_CLONE_PARENT) != 0) return (EINVAL); if (args->flags & LINUX_CLONE_PARENT_SETTID) if (args->parent_tidptr == NULL) return (EINVAL); /* Threads should be created with own stack */ if (args->stack == NULL) return (EINVAL); p = td->td_proc; #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_add(p, RACCT_NTHR, 1); PROC_UNLOCK(p); if (error != 0) return (EPROCLIM); } #endif /* Initialize our td */ error = kern_thr_alloc(p, 0, &newtd); if (error) goto fail; cpu_copy_thread(newtd, td); bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = p; thread_cow_get(newtd, td); /* create the emuldata */ linux_proc_init(td, newtd, args->flags); em = em_find(newtd); KASSERT(em != NULL, ("clone_thread: emuldata not found.\n")); if (args->flags & LINUX_CLONE_SETTLS) linux_set_cloned_tls(newtd, args->tls); if (args->flags & LINUX_CLONE_CHILD_SETTID) em->child_set_tid = args->child_tidptr; else em->child_set_tid = NULL; if (args->flags & LINUX_CLONE_CHILD_CLEARTID) em->child_clear_tid = args->child_tidptr; else em->child_clear_tid = NULL; cpu_thread_clean(newtd); linux_set_upcall(newtd, PTROUT(args->stack)); PROC_LOCK(p); p->p_flag |= P_HADTHREADS; thread_link(newtd, p); bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); thread_unlock(td); if (P_SHOULDSTOP(p)) newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (p->p_ptevents & PTRACE_LWP) newtd->td_dbgflags |= TDB_BORN; PROC_UNLOCK(p); tidhash_add(newtd); LINUX_CTR2(clone_thread, "thread(%d) successful clone to %d", td->td_tid, newtd->td_tid); if (args->flags & LINUX_CLONE_PARENT_SETTID) { error = copyout(&newtd->td_tid, args->parent_tidptr, sizeof(newtd->td_tid)); if (error) linux_msg(td, "clone_thread: copyout td_tid failed!"); } /* * Make this runnable after we are finished with it. */ thread_lock(newtd); TD_SET_CAN_RUN(newtd); sched_add(newtd, SRQ_BORING); td->td_retval[0] = newtd->td_tid; return (0); fail: #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NTHR, 1); PROC_UNLOCK(p); } #endif return (error); } int linux_clone(struct thread *td, struct linux_clone_args *args) { if (args->flags & LINUX_CLONE_THREAD) return (linux_clone_thread(td, args)); else return (linux_clone_proc(td, args)); } int linux_exit(struct thread *td, struct linux_exit_args *args) { struct linux_emuldata *em; em = em_find(td); KASSERT(em != NULL, ("exit: emuldata not found.\n")); LINUX_CTR2(exit, "thread(%d) (%d)", em->em_tid, args->rval); linux_thread_detach(td); /* * XXX. When the last two threads of a process * exit via pthread_exit() try thr_exit() first. */ kern_thr_exit(td); exit1(td, args->rval, 0); /* NOTREACHED */ } int linux_set_tid_address(struct thread *td, struct linux_set_tid_address_args *args) { struct linux_emuldata *em; em = em_find(td); KASSERT(em != NULL, ("set_tid_address: emuldata not found.\n")); em->child_clear_tid = args->tidptr; td->td_retval[0] = em->em_tid; LINUX_CTR3(set_tid_address, "tidptr(%d) %p, returns %d", em->em_tid, args->tidptr, td->td_retval[0]); return (0); } void linux_thread_detach(struct thread *td) { - struct linux_sys_futex_args cup; struct linux_emuldata *em; int *child_clear_tid; int error; em = em_find(td); KASSERT(em != NULL, ("thread_detach: emuldata not found.\n")); LINUX_CTR1(thread_detach, "thread(%d)", em->em_tid); release_futexes(td, em); child_clear_tid = em->child_clear_tid; if (child_clear_tid != NULL) { LINUX_CTR2(thread_detach, "thread(%d) %p", em->em_tid, child_clear_tid); error = suword32(child_clear_tid, 0); if (error != 0) return; - cup.uaddr = child_clear_tid; - cup.op = LINUX_FUTEX_WAKE; - cup.val = 1; /* wake one */ - cup.timeout = NULL; - cup.uaddr2 = NULL; - cup.val3 = 0; - error = linux_sys_futex(td, &cup); + error = futex_wake(td, child_clear_tid, 1, false); /* * this cannot happen at the moment and if this happens it * probably means there is a user space bug */ if (error != 0) linux_msg(td, "futex stuff in thread_detach failed."); } } diff --git a/sys/compat/linux/linux_futex.c b/sys/compat/linux/linux_futex.c index 547d52c8ca4a..df6c7cdbf74a 100644 --- a/sys/compat/linux/linux_futex.c +++ b/sys/compat/linux/linux_futex.c @@ -1,1255 +1,802 @@ /* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2005 Emmanuel Dreyfus * All rights reserved. * Copyright (c) 2009-2016 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Emmanuel Dreyfus * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #if 0 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $"); #endif #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); -/** - * Futex part for the special DTrace module "locks". - */ -LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, locked, "struct mtx *"); -LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, unlock, "struct mtx *"); - -/** - * Per futex probes. - */ -LIN_SDT_PROBE_DEFINE1(futex, futex, create, "struct sx *"); -LIN_SDT_PROBE_DEFINE1(futex, futex, destroy, "struct sx *"); - /** * DTrace probes in this module. */ -LIN_SDT_PROBE_DEFINE3(futex, futex_put, destroy, "uint32_t *", "uint32_t", - "int"); -LIN_SDT_PROBE_DEFINE3(futex, futex_put, unlock, "uint32_t *", "uint32_t", - "int"); -LIN_SDT_PROBE_DEFINE1(futex, futex_get0, umtx_key_get_error, "int"); -LIN_SDT_PROBE_DEFINE3(futex, futex_get0, shared, "uint32_t *", "uint32_t", - "int"); -LIN_SDT_PROBE_DEFINE1(futex, futex_get0, null, "uint32_t *"); -LIN_SDT_PROBE_DEFINE3(futex, futex_get0, new, "uint32_t *", "uint32_t", "int"); -LIN_SDT_PROBE_DEFINE0(futex, futex_get, error); -LIN_SDT_PROBE_DEFINE5(futex, futex_sleep, requeue_error, "int", "uint32_t *", - "struct waiting_proc *", "uint32_t *", "uint32_t"); -LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, sleep_error, "int", "uint32_t *", - "struct waiting_proc *"); -LIN_SDT_PROBE_DEFINE3(futex, futex_wake, iterate, "uint32_t", - "struct waiting_proc *", "uint32_t"); -LIN_SDT_PROBE_DEFINE1(futex, futex_wake, wakeup, "struct waiting_proc *"); -LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, wakeup, "struct waiting_proc *"); -LIN_SDT_PROBE_DEFINE3(futex, futex_requeue, requeue, "uint32_t *", - "struct waiting_proc *", "uint32_t"); -LIN_SDT_PROBE_DEFINE1(futex, futex_wait, sleep_error, "int"); LIN_SDT_PROBE_DEFINE4(futex, futex_atomic_op, decoded_op, "int", "int", "int", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_op, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_cmp, "int"); -LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unimplemented_clockswitch); -LIN_SDT_PROBE_DEFINE1(futex, linux_futex, copyin_error, "int"); -LIN_SDT_PROBE_DEFINE0(futex, linux_futex, invalid_cmp_requeue_use); LIN_SDT_PROBE_DEFINE3(futex, linux_futex, debug_wait, "uint32_t *", "uint32_t", "uint32_t"); -LIN_SDT_PROBE_DEFINE4(futex, linux_futex, debug_wait_value_neq, - "uint32_t *", "uint32_t", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, linux_futex, debug_wake, "uint32_t *", "uint32_t", "uint32_t"); LIN_SDT_PROBE_DEFINE5(futex, linux_futex, debug_cmp_requeue, "uint32_t *", "uint32_t", "uint32_t", "uint32_t *", "struct l_timespec *"); -LIN_SDT_PROBE_DEFINE2(futex, linux_futex, debug_cmp_requeue_value_neq, - "uint32_t", "int"); LIN_SDT_PROBE_DEFINE5(futex, linux_futex, debug_wake_op, "uint32_t *", "int", "uint32_t", "uint32_t *", "uint32_t"); -LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unhandled_efault); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unimplemented_lock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unimplemented_unlock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unimplemented_trylock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, deprecated_requeue); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unimplemented_wait_requeue_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_futex, unimplemented_cmp_requeue_pi); LIN_SDT_PROBE_DEFINE1(futex, linux_futex, unknown_operation, "int"); LIN_SDT_PROBE_DEFINE0(futex, linux_set_robust_list, size_error); LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, copyout_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, release_futexes, copyin_error, "int"); -struct futex; +#define FUTEX_SHARED 0x8 /* shared futex */ -struct waiting_proc { - uint32_t wp_flags; - struct futex *wp_futex; - TAILQ_ENTRY(waiting_proc) wp_list; -}; - -struct futex { - struct mtx f_lck; - uint32_t *f_uaddr; /* user-supplied value, for debug */ - struct umtx_key f_key; - uint32_t f_refcount; - uint32_t f_bitset; - LIST_ENTRY(futex) f_list; - TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc; -}; +#define GET_SHARED(a) (a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE -#define FUTEX_LOCK(f) mtx_lock(&(f)->f_lck) -#define FUTEX_LOCKED(f) mtx_owned(&(f)->f_lck) -#define FUTEX_UNLOCK(f) mtx_unlock(&(f)->f_lck) -#define FUTEX_INIT(f) do { \ - mtx_init(&(f)->f_lck, "ftlk", NULL, \ - MTX_DUPOK); \ - LIN_SDT_PROBE1(futex, futex, create, \ - &(f)->f_lck); \ - } while (0) -#define FUTEX_DESTROY(f) do { \ - LIN_SDT_PROBE1(futex, futex, destroy, \ - &(f)->f_lck); \ - mtx_destroy(&(f)->f_lck); \ - } while (0) -#define FUTEX_ASSERT_LOCKED(f) mtx_assert(&(f)->f_lck, MA_OWNED) -#define FUTEX_ASSERT_UNLOCKED(f) mtx_assert(&(f)->f_lck, MA_NOTOWNED) - -#define FUTEXES_LOCK do { \ - mtx_lock(&futex_mtx); \ - LIN_SDT_PROBE1(locks, futex_mtx, \ - locked, &futex_mtx); \ - } while (0) -#define FUTEXES_UNLOCK do { \ - LIN_SDT_PROBE1(locks, futex_mtx, \ - unlock, &futex_mtx); \ - mtx_unlock(&futex_mtx); \ - } while (0) - -/* flags for futex_get() */ -#define FUTEX_CREATE_WP 0x1 /* create waiting_proc */ -#define FUTEX_DONTCREATE 0x2 /* don't create futex if not exists */ -#define FUTEX_DONTEXISTS 0x4 /* return EINVAL if futex exists */ -#define FUTEX_SHARED 0x8 /* shared futex */ -#define FUTEX_DONTLOCK 0x10 /* don't lock futex */ - -/* wp_flags */ -#define FUTEX_WP_REQUEUED 0x1 /* wp requeued - wp moved from wp_list - * of futex where thread sleep to wp_list - * of another futex. - */ -#define FUTEX_WP_REMOVED 0x2 /* wp is woken up and removed from futex - * wp_list to prevent double wakeup. - */ - -static void futex_put(struct futex *, struct waiting_proc *); -static int futex_get0(uint32_t *, struct futex **f, uint32_t); -static int futex_get(uint32_t *, struct waiting_proc **, struct futex **, - uint32_t); -static int futex_sleep(struct futex *, struct waiting_proc *, struct timespec *); -static int futex_wake(struct futex *, int, uint32_t); -static int futex_requeue(struct futex *, int, struct futex *, int); -static void futex_lock(struct futex *); -static void futex_unlock(struct futex *); static int futex_atomic_op(struct thread *, int, uint32_t *); static int handle_futex_death(struct linux_emuldata *, uint32_t *, unsigned int); static int fetch_robust_entry(struct linux_robust_list **, struct linux_robust_list **, unsigned int *); struct linux_futex_args { uint32_t *uaddr; int32_t op; uint32_t flags; bool clockrt; uint32_t val; struct timespec *ts; uint32_t *uaddr2; uint32_t val3; bool val3_compare; struct timespec kts; }; +static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *, + struct linux_futex_args *); static int linux_futex(struct thread *, struct linux_futex_args *); static int linux_futex_wait(struct thread *, struct linux_futex_args *); static int linux_futex_wake(struct thread *, struct linux_futex_args *); static int linux_futex_requeue(struct thread *, struct linux_futex_args *); static int linux_futex_wakeop(struct thread *, struct linux_futex_args *); -static void -futex_put(struct futex *f, struct waiting_proc *wp) -{ - - if (wp != NULL) { - if ((wp->wp_flags & FUTEX_WP_REMOVED) == 0) - TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); - free(wp, M_FUTEX_WP); - } - - FUTEXES_LOCK; - if (--f->f_refcount == 0) { - LIST_REMOVE(f, f_list); - FUTEXES_UNLOCK; - if (FUTEX_LOCKED(f)) - futex_unlock(f); - - LIN_SDT_PROBE3(futex, futex_put, destroy, f->f_uaddr, - f->f_refcount, f->f_key.shared); - LINUX_CTR3(sys_futex, "futex_put destroy uaddr %p ref %d " - "shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); - umtx_key_release(&f->f_key); - FUTEX_DESTROY(f); - free(f, M_FUTEX); - return; - } - - LIN_SDT_PROBE3(futex, futex_put, unlock, f->f_uaddr, f->f_refcount, - f->f_key.shared); - LINUX_CTR3(sys_futex, "futex_put uaddr %p ref %d shared %d", - f->f_uaddr, f->f_refcount, f->f_key.shared); - if (FUTEX_LOCKED(f)) - futex_unlock(f); - FUTEXES_UNLOCK; -} - -static int -futex_get0(uint32_t *uaddr, struct futex **newf, uint32_t flags) -{ - struct futex *f, *tmpf; - struct umtx_key key; - int error; - - *newf = tmpf = NULL; - - error = umtx_key_get(uaddr, TYPE_FUTEX, (flags & FUTEX_SHARED) ? - AUTO_SHARE : THREAD_SHARE, &key); - if (error) { - LIN_SDT_PROBE1(futex, futex_get0, umtx_key_get_error, error); - return (error); - } -retry: - FUTEXES_LOCK; - LIST_FOREACH(f, &futex_list, f_list) { - if (umtx_key_match(&f->f_key, &key)) { - if (tmpf != NULL) { - if (FUTEX_LOCKED(tmpf)) - futex_unlock(tmpf); - FUTEX_DESTROY(tmpf); - free(tmpf, M_FUTEX); - } - if (flags & FUTEX_DONTEXISTS) { - FUTEXES_UNLOCK; - umtx_key_release(&key); - - return (EINVAL); - } - - /* - * Increment refcount of the found futex to - * prevent it from deallocation before FUTEX_LOCK() - */ - ++f->f_refcount; - FUTEXES_UNLOCK; - umtx_key_release(&key); - - if ((flags & FUTEX_DONTLOCK) == 0) - futex_lock(f); - *newf = f; - LIN_SDT_PROBE3(futex, futex_get0, shared, uaddr, - f->f_refcount, f->f_key.shared); - LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d", - uaddr, f->f_refcount, f->f_key.shared); - - return (0); - } - } - - if (flags & FUTEX_DONTCREATE) { - FUTEXES_UNLOCK; - umtx_key_release(&key); - LIN_SDT_PROBE1(futex, futex_get0, null, uaddr); - LINUX_CTR1(sys_futex, "futex_get uaddr %p null", uaddr); - - return (0); - } - - if (tmpf == NULL) { - FUTEXES_UNLOCK; - tmpf = malloc(sizeof(*tmpf), M_FUTEX, M_WAITOK | M_ZERO); - tmpf->f_uaddr = uaddr; - tmpf->f_key = key; - tmpf->f_refcount = 1; - tmpf->f_bitset = FUTEX_BITSET_MATCH_ANY; - FUTEX_INIT(tmpf); - TAILQ_INIT(&tmpf->f_waiting_proc); - - /* - * Lock the new futex before an insert into the futex_list - * to prevent futex usage by other. - */ - if ((flags & FUTEX_DONTLOCK) == 0) - futex_lock(tmpf); - goto retry; - } - - LIST_INSERT_HEAD(&futex_list, tmpf, f_list); - FUTEXES_UNLOCK; - - LIN_SDT_PROBE3(futex, futex_get0, new, uaddr, tmpf->f_refcount, - tmpf->f_key.shared); - LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d new", - uaddr, tmpf->f_refcount, tmpf->f_key.shared); - *newf = tmpf; - - return (0); -} - -static int -futex_get(uint32_t *uaddr, struct waiting_proc **wp, struct futex **f, - uint32_t flags) -{ - int error; - - if (flags & FUTEX_CREATE_WP) { - *wp = malloc(sizeof(struct waiting_proc), M_FUTEX_WP, M_WAITOK); - (*wp)->wp_flags = 0; - } - error = futex_get0(uaddr, f, flags); - if (error) { - LIN_SDT_PROBE0(futex, futex_get, error); - - if (flags & FUTEX_CREATE_WP) - free(*wp, M_FUTEX_WP); - - return (error); - } - if (flags & FUTEX_CREATE_WP) { - TAILQ_INSERT_HEAD(&(*f)->f_waiting_proc, *wp, wp_list); - (*wp)->wp_futex = *f; - } - - return (error); -} - -static inline void -futex_lock(struct futex *f) -{ - - LINUX_CTR3(sys_futex, "futex_lock uaddr %p ref %d shared %d", - f->f_uaddr, f->f_refcount, f->f_key.shared); - FUTEX_ASSERT_UNLOCKED(f); - FUTEX_LOCK(f); -} - -static inline void -futex_unlock(struct futex *f) -{ - - LINUX_CTR3(sys_futex, "futex_unlock uaddr %p ref %d shared %d", - f->f_uaddr, f->f_refcount, f->f_key.shared); - FUTEX_ASSERT_LOCKED(f); - FUTEX_UNLOCK(f); -} - -static int -futex_sleep(struct futex *f, struct waiting_proc *wp, struct timespec *ts) -{ - sbintime_t sbt, prec, tmp; - time_t over; - int error; - - FUTEX_ASSERT_LOCKED(f); - if (ts != NULL) { - if (ts->tv_sec > INT32_MAX / 2) { - over = ts->tv_sec - INT32_MAX / 2; - ts->tv_sec -= over; - } - tmp = tstosbt(*ts); - if (TIMESEL(&sbt, tmp)) - sbt += tc_tick_sbt; - sbt += tmp; - prec = tmp; - prec >>= tc_precexp; - } else { - sbt = 0; - prec = 0; - } - LINUX_CTR4(sys_futex, "futex_sleep enter uaddr %p wp %p timo %ld ref %d", - f->f_uaddr, wp, sbt, f->f_refcount); - - error = msleep_sbt(wp, &f->f_lck, PCATCH, "futex", sbt, prec, C_ABSOLUTE); - if (wp->wp_flags & FUTEX_WP_REQUEUED) { - KASSERT(f != wp->wp_futex, ("futex != wp_futex")); - - if (error) { - LIN_SDT_PROBE5(futex, futex_sleep, requeue_error, error, - f->f_uaddr, wp, wp->wp_futex->f_uaddr, - wp->wp_futex->f_refcount); - } - - LINUX_CTR5(sys_futex, "futex_sleep out error %d uaddr %p wp" - " %p requeued uaddr %p ref %d", - error, f->f_uaddr, wp, wp->wp_futex->f_uaddr, - wp->wp_futex->f_refcount); - futex_put(f, NULL); - f = wp->wp_futex; - futex_lock(f); - } else { - if (error) { - LIN_SDT_PROBE3(futex, futex_sleep, sleep_error, error, - f->f_uaddr, wp); - } - LINUX_CTR3(sys_futex, "futex_sleep out error %d uaddr %p wp %p", - error, f->f_uaddr, wp); - } - - futex_put(f, wp); - - return (error); -} - -static int -futex_wake(struct futex *f, int n, uint32_t bitset) +int +futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared) { - struct waiting_proc *wp, *wpt; - int count = 0; + struct linux_futex_args args; - if (bitset == 0) - return (EINVAL); + bzero(&args, sizeof(args)); + args.op = LINUX_FUTEX_WAKE; + args.uaddr = uaddr; + args.flags = shared == true ? FUTEX_SHARED : 0; + args.val = val; + args.val3 = FUTEX_BITSET_MATCH_ANY; - FUTEX_ASSERT_LOCKED(f); - TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) { - LIN_SDT_PROBE3(futex, futex_wake, iterate, f->f_uaddr, wp, - f->f_refcount); - LINUX_CTR3(sys_futex, "futex_wake uaddr %p wp %p ref %d", - f->f_uaddr, wp, f->f_refcount); - /* - * Unless we find a matching bit in - * the bitset, continue searching. - */ - if (!(wp->wp_futex->f_bitset & bitset)) - continue; - - wp->wp_flags |= FUTEX_WP_REMOVED; - TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); - LIN_SDT_PROBE1(futex, futex_wake, wakeup, wp); - wakeup_one(wp); - if (++count == n) - break; - } - - return (count); -} - -static int -futex_requeue(struct futex *f, int nrwake, struct futex *f2, - int nrrequeue) -{ - struct waiting_proc *wp, *wpt; - int count = 0; - - FUTEX_ASSERT_LOCKED(f); - FUTEX_ASSERT_LOCKED(f2); - - TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) { - if (++count <= nrwake) { - LINUX_CTR2(sys_futex, "futex_req_wake uaddr %p wp %p", - f->f_uaddr, wp); - wp->wp_flags |= FUTEX_WP_REMOVED; - TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); - LIN_SDT_PROBE1(futex, futex_requeue, wakeup, wp); - wakeup_one(wp); - } else { - LIN_SDT_PROBE3(futex, futex_requeue, requeue, - f->f_uaddr, wp, f2->f_uaddr); - LINUX_CTR3(sys_futex, "futex_requeue uaddr %p wp %p to %p", - f->f_uaddr, wp, f2->f_uaddr); - wp->wp_flags |= FUTEX_WP_REQUEUED; - /* Move wp to wp_list of f2 futex */ - TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); - TAILQ_INSERT_HEAD(&f2->f_waiting_proc, wp, wp_list); - - /* - * Thread which sleeps on wp after waking should - * acquire f2 lock, so increment refcount of f2 to - * prevent it from premature deallocation. - */ - wp->wp_futex = f2; - FUTEXES_LOCK; - ++f2->f_refcount; - FUTEXES_UNLOCK; - if (count - nrwake >= nrrequeue) - break; - } - } - - return (count); + return (linux_futex_wake(td, &args)); } static int futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; int oparg = (encoded_op << 8) >> 20; int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; LIN_SDT_PROBE4(futex, futex_atomic_op, decoded_op, op, cmp, oparg, cmparg); switch (op) { case FUTEX_OP_SET: ret = futex_xchgl(oparg, uaddr, &oldval); break; case FUTEX_OP_ADD: ret = futex_addl(oparg, uaddr, &oldval); break; case FUTEX_OP_OR: ret = futex_orl(oparg, uaddr, &oldval); break; case FUTEX_OP_ANDN: ret = futex_andl(~oparg, uaddr, &oldval); break; case FUTEX_OP_XOR: ret = futex_xorl(oparg, uaddr, &oldval); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_op, op); ret = -ENOSYS; break; } if (ret) return (ret); switch (cmp) { case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_cmp, cmp); ret = -ENOSYS; } return (ret); } static int linux_futex(struct thread *td, struct linux_futex_args *args) { struct linux_pemuldata *pem; struct proc *p; if (args->op & LINUX_FUTEX_PRIVATE_FLAG) { args->flags = 0; args->op &= ~LINUX_FUTEX_PRIVATE_FLAG; } else args->flags = FUTEX_SHARED; - /* - * Currently support for switching between CLOCK_MONOTONIC and - * CLOCK_REALTIME is not present. However Linux forbids the use of - * FUTEX_CLOCK_REALTIME with any op except FUTEX_WAIT_BITSET and - * FUTEX_WAIT_REQUEUE_PI. - */ args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME; args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME; - if (args->clockrt && args->op != LINUX_FUTEX_WAIT_BITSET && - args->op != LINUX_FUTEX_WAIT_REQUEUE_PI) { - LIN_SDT_PROBE0(futex, linux_futex, - unimplemented_clockswitch); - return (ENOSYS); - } switch (args->op) { case LINUX_FUTEX_WAIT: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAIT_BITSET: LIN_SDT_PROBE3(futex, linux_futex, debug_wait, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); return (linux_futex_wait(td, args)); case LINUX_FUTEX_WAKE: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAKE_BITSET: LIN_SDT_PROBE3(futex, linux_futex, debug_wake, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); return (linux_futex_wake(td, args)); case LINUX_FUTEX_REQUEUE: /* * Glibc does not use this operation since version 2.3.3, * as it is racy and replaced by FUTEX_CMP_REQUEUE operation. * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when * FUTEX_REQUEUE returned EINVAL. */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) { linux_msg(td, "unsupported FUTEX_REQUEUE"); pem->flags |= LINUX_XDEPR_REQUEUEOP; LIN_SDT_PROBE0(futex, linux_futex, deprecated_requeue); } /* * The above is true, however musl libc does make use of the * futex requeue operation, allow operation for brands which * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags. */ p = td->td_proc; Elf_Brandinfo *bi = p->p_elf_brandinfo; if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0) return (EINVAL); args->val3_compare = false; /* FALLTHROUGH */ case LINUX_FUTEX_CMP_REQUEUE: LIN_SDT_PROBE5(futex, linux_futex, debug_cmp_requeue, args->uaddr, args->val, args->val3, args->uaddr2, args->ts); LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p " "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x", args->uaddr, args->val, args->val3, args->uaddr2, args->ts); return (linux_futex_requeue(td, args)); case LINUX_FUTEX_WAKE_OP: LIN_SDT_PROBE5(futex, linux_futex, debug_wake_op, args->uaddr, args->op, args->val, args->uaddr2, args->val3); LINUX_CTR5(sys_futex, "WAKE_OP " "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x", args->uaddr, args->val, args->uaddr2, args->val3, args->ts); return (linux_futex_wakeop(td, args)); case LINUX_FUTEX_LOCK_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_LOCK_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_futex, unimplemented_lock_pi); } return (ENOSYS); case LINUX_FUTEX_UNLOCK_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_UNLOCK_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_futex, unimplemented_unlock_pi); } return (ENOSYS); case LINUX_FUTEX_TRYLOCK_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_TRYLOCK_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_futex, unimplemented_trylock_pi); } return (ENOSYS); case LINUX_FUTEX_WAIT_REQUEUE_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_futex, unimplemented_wait_requeue_pi); } return (ENOSYS); case LINUX_FUTEX_CMP_REQUEUE_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_futex, unimplemented_cmp_requeue_pi); } return (ENOSYS); default: linux_msg(td, "unsupported futex op %d", args->op); LIN_SDT_PROBE1(futex, linux_futex, unknown_operation, args->op); return (ENOSYS); } } static int linux_futex_wakeop(struct thread *td, struct linux_futex_args *args) { + struct umtx_key key, key2; int nrwake, op_ret, ret; - struct futex *f, *f2; - int error, save; - uint32_t val; + int error, count; if (args->uaddr == args->uaddr2) return (EINVAL); -retry: - f = f2 = NULL; - error = futex_get(args->uaddr, NULL, &f, args->flags | FUTEX_DONTLOCK); + error = umtx_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); if (error != 0) return (error); - - error = futex_get(args->uaddr2, NULL, &f2, args->flags | FUTEX_DONTLOCK); + error = umtx_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); if (error != 0) { - futex_put(f, NULL); + umtx_key_release(&key); return (error); } - futex_lock(f); - futex_lock(f2); - - /* - * This function returns positive number as results and - * negative as errors - */ - save = vm_fault_disable_pagefaults(); + umtxq_lock(&key); + umtxq_busy(&key); + umtxq_unlock(&key); op_ret = futex_atomic_op(td, args->val3, args->uaddr2); - vm_fault_enable_pagefaults(save); - - LINUX_CTR2(sys_futex, "WAKE_OP atomic_op uaddr %p ret 0x%x", - args->uaddr, op_ret); - if (op_ret < 0) { - if (f2 != NULL) - futex_put(f2, NULL); - futex_put(f, NULL); if (op_ret == -ENOSYS) - return (ENOSYS); - error = copyin(args->uaddr2, &val, sizeof(val)); - if (error == 0) - goto retry; - return (error); + error = ENOSYS; + else + error = EFAULT; } - - ret = futex_wake(f, args->val, args->val3); - + umtxq_lock(&key); + umtxq_unbusy(&key); + if (error != 0) + goto out; + ret = umtxq_signal_mask(&key, args->val, args->val3); if (op_ret > 0) { - op_ret = 0; nrwake = (int)(unsigned long)args->ts; - - if (f2 != NULL) - op_ret += futex_wake(f2, nrwake, args->val3); + umtxq_lock(&key2); + count = umtxq_count(&key2); + if (count > 0) + ret += umtxq_signal_mask(&key2, nrwake, args->val3); else - op_ret += futex_wake(f, nrwake, args->val3); - ret += op_ret; + ret += umtxq_signal_mask(&key, nrwake, args->val3); + umtxq_unlock(&key2); } - if (f2 != NULL) - futex_put(f2, NULL); - futex_put(f, NULL); td->td_retval[0] = ret; - return (0); +out: + umtxq_unlock(&key); + umtx_key_release(&key2); + umtx_key_release(&key); + return (error); } static int linux_futex_requeue(struct thread *td, struct linux_futex_args *args) { int nrwake, nrrequeue; - struct futex *f, *f2; + struct umtx_key key, key2; int error; - uint32_t val; + uint32_t uval; /* * Linux allows this, we would not, it is an incorrect * usage of declared ABI, so return EINVAL. */ - if (args->uaddr == args->uaddr2) { - LIN_SDT_PROBE0(futex, linux_futex, - invalid_cmp_requeue_use); + if (args->uaddr == args->uaddr2) return (EINVAL); - } nrrequeue = (int)(unsigned long)args->ts; nrwake = args->val; /* * Sanity check to prevent signed integer overflow, * see Linux CVE-2018-6927 */ if (nrwake < 0 || nrrequeue < 0) return (EINVAL); -retry: - f = f2 = NULL; - error = futex_get(args->uaddr, NULL, &f, args->flags | FUTEX_DONTLOCK); + error = umtx_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); if (error != 0) return (error); - - /* - * To avoid deadlocks return EINVAL if second futex - * exists at this time. - * - * Glibc fall back to FUTEX_WAKE in case of any error - * returned by FUTEX_CMP_REQUEUE. - */ - error = futex_get(args->uaddr2, NULL, &f2, - args->flags | FUTEX_DONTEXISTS | FUTEX_DONTLOCK); - if (error != 0) { - futex_put(f, NULL); - return (error); - } - futex_lock(f); - futex_lock(f2); - error = copyin_nofault(args->uaddr, &val, sizeof(val)); + error = umtx_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); if (error != 0) { - futex_put(f2, NULL); - futex_put(f, NULL); - error = copyin(args->uaddr, &val, sizeof(val)); - if (error == 0) - goto retry; - LIN_SDT_PROBE1(futex, linux_futex, copyin_error, - error); - LINUX_CTR1(sys_futex, "CMP_REQUEUE copyin failed %d", - error); + umtx_key_release(&key); return (error); } - if (args->val3_compare == true && val != args->val3) { - LIN_SDT_PROBE2(futex, linux_futex, - debug_cmp_requeue_value_neq, args->val, val); - LINUX_CTR2(sys_futex, "CMP_REQUEUE val 0x%x != uval 0x%x", - args->val, val); - futex_put(f2, NULL); - futex_put(f, NULL); - return (EAGAIN); - } - - td->td_retval[0] = futex_requeue(f, nrwake, f2, nrrequeue); - futex_put(f2, NULL); - futex_put(f, NULL); - return (0); + umtxq_lock(&key); + umtxq_busy(&key); + umtxq_unlock(&key); + error = fueword32(args->uaddr, &uval); + if (error != 0) + error = EFAULT; + else if (args->val3_compare == true && uval != args->val3) + error = EWOULDBLOCK; + umtxq_lock(&key); + umtxq_unbusy(&key); + if (error == 0) { + umtxq_lock(&key2); + td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue); + umtxq_unlock(&key2); + } + umtxq_unlock(&key); + umtx_key_release(&key2); + umtx_key_release(&key); + return (error); } static int linux_futex_wake(struct thread *td, struct linux_futex_args *args) { - struct futex *f; + struct umtx_key key; int error; - f = NULL; - error = futex_get(args->uaddr, NULL, &f, args->flags | FUTEX_DONTCREATE); - if (error != 0) - return (error); + if (args->val3 == 0) + return (EINVAL); - if (f == NULL) { - td->td_retval[0] = 0; + error = umtx_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); + if (error != 0) return (error); - } - td->td_retval[0] = futex_wake(f, args->val, args->val3); - futex_put(f, NULL); + umtxq_lock(&key); + td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3); + umtxq_unlock(&key); + umtx_key_release(&key); return (0); } static int linux_futex_wait(struct thread *td, struct linux_futex_args *args) { - struct waiting_proc *wp; - struct timespec kts; - struct futex *f; + struct umtx_abs_timeout timo; + struct umtx_q *uq; + uint32_t uval; int error; - uint32_t val; - - if (args->ts != NULL) { - if (args->clockrt) { - nanotime(&kts); - timespecsub(args->ts, &kts, args->ts); - } else if (args->op == LINUX_FUTEX_WAIT_BITSET) { - nanouptime(&kts); - timespecsub(args->ts, &kts, args->ts); - } - } -retry: - f = NULL; - error = futex_get(args->uaddr, &wp, &f, args->flags | FUTEX_CREATE_WP); - if (error != 0) - return (error); + if (args->val3 == 0) + error = EINVAL; - error = copyin_nofault(args->uaddr, &val, sizeof(val)); - if (error != 0) { - futex_put(f, wp); - error = copyin(args->uaddr, &val, sizeof(val)); - if (error == 0) - goto retry; - LIN_SDT_PROBE1(futex, linux_futex, copyin_error, error); - LINUX_CTR1(sys_futex, "WAIT copyin failed %d", error); + uq = td->td_umtxq; + error = umtx_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), + &uq->uq_key); + if (error != 0) return (error); - } - if (val != args->val) { - LIN_SDT_PROBE4(futex, linux_futex, - debug_wait_value_neq, args->uaddr, args->val, val, - args->val3); - LINUX_CTR3(sys_futex, - "WAIT uaddr %p val 0x%x != uval 0x%x", - args->uaddr, args->val, val); - futex_put(f, wp); - return (EWOULDBLOCK); - } - - if (args->val3 == 0) { - futex_put(f, wp); - return (EINVAL); - } - - f->f_bitset = args->val3; - error = futex_sleep(f, wp, args->ts); + if (args->ts != NULL) + linux_umtx_abs_timeout_init(&timo, args); + umtxq_lock(&uq->uq_key); + umtxq_busy(&uq->uq_key); + uq->uq_bitset = args->val3; + umtxq_insert(uq); + umtxq_unlock(&uq->uq_key); + error = fueword32(args->uaddr, &uval); if (error != 0) - LIN_SDT_PROBE1(futex, futex_wait, sleep_error, error); - if (error == EWOULDBLOCK) - error = ETIMEDOUT; + error = EFAULT; + else if (uval != args->val) + error = EWOULDBLOCK; + umtxq_lock(&uq->uq_key); + umtxq_unbusy(&uq->uq_key); + if (error == 0) { + error = umtxq_sleep(uq, "futex", + args->ts == NULL ? NULL : &timo); + if ((uq->uq_flags & UQF_UMTXQ) == 0) + error = 0; + else + umtxq_remove(uq); + } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { + umtxq_remove(uq); + } + umtxq_unlock(&uq->uq_key); + umtx_key_release(&uq->uq_key); + if (error == ERESTART) + error = EINTR; return (error); } +static void +linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo, + struct linux_futex_args *args) +{ + int clockid, absolute; + + /* + * The FUTEX_CLOCK_REALTIME option bit can be employed only with the + * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI. + * For FUTEX_WAIT, timeout is interpreted as a relative value, for other + * futex operations timeout is interpreted as an absolute value. + * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures + * the timeout against the CLOCK_REALTIME clock, otherwise the kernel + * measures the timeout against the CLOCK_MONOTONIC clock. + */ + clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC; + absolute = args->op == LINUX_FUTEX_WAIT ? false : true; + umtx_abs_timeout_init(timo, clockid, absolute, args->ts); +} + int linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) { struct linux_futex_args fargs = { .uaddr = args->uaddr, .op = args->op, .val = args->val, .ts = NULL, .uaddr2 = args->uaddr2, .val3 = args->val3, .val3_compare = true, }; struct l_timespec lts; int error; switch (args->op & LINUX_FUTEX_CMD_MASK) { case LINUX_FUTEX_WAIT: case LINUX_FUTEX_WAIT_BITSET: if (args->timeout != NULL) { error = copyin(args->timeout, <s, sizeof(lts)); if (error != 0) return (error); error = linux_to_native_timespec(&fargs.kts, <s); if (error != 0) return (error); fargs.ts = &fargs.kts; } break; default: fargs.ts = PTRIN(args->timeout); } return (linux_futex(td, &fargs)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_sys_futex_time64(struct thread *td, struct linux_sys_futex_time64_args *args) { struct linux_futex_args fargs = { .uaddr = args->uaddr, .op = args->op, .val = args->val, .ts = NULL, .uaddr2 = args->uaddr2, .val3 = args->val3, }; struct l_timespec64 lts; int error; switch (args->op & LINUX_FUTEX_CMD_MASK) { case LINUX_FUTEX_WAIT: case LINUX_FUTEX_WAIT_BITSET: if (args->timeout != NULL) { error = copyin(args->timeout, <s, sizeof(lts)); if (error != 0) return (error); error = linux_to_native_timespec64(&fargs.kts, <s); if (error != 0) return (error); fargs.ts = &fargs.kts; } break; default: fargs.ts = PTRIN(args->timeout); } return (linux_futex(td, &fargs)); } #endif int linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) { struct linux_emuldata *em; if (args->len != sizeof(struct linux_robust_list_head)) { LIN_SDT_PROBE0(futex, linux_set_robust_list, size_error); return (EINVAL); } em = em_find(td); em->robust_futexes = args->head; return (0); } int linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) { struct linux_emuldata *em; struct linux_robust_list_head *head; l_size_t len = sizeof(struct linux_robust_list_head); struct thread *td2; int error = 0; if (!args->pid) { em = em_find(td); KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); head = em->robust_futexes; } else { td2 = tdfind(args->pid, -1); if (td2 == NULL) return (ESRCH); if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) { PROC_UNLOCK(td2->td_proc); return (EPERM); } em = em_find(td2); KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); /* XXX: ptrace? */ if (priv_check(td, PRIV_CRED_SETUID) || priv_check(td, PRIV_CRED_SETEUID) || p_candebug(td, td2->td_proc)) { PROC_UNLOCK(td2->td_proc); return (EPERM); } head = em->robust_futexes; PROC_UNLOCK(td2->td_proc); } error = copyout(&len, args->len, sizeof(l_size_t)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); return (EFAULT); } error = copyout(&head, args->head, sizeof(head)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); } return (error); } static int handle_futex_death(struct linux_emuldata *em, uint32_t *uaddr, unsigned int pi) { uint32_t uval, nval, mval; - struct futex *f; int error; retry: error = copyin(uaddr, &uval, 4); if (error) { LIN_SDT_PROBE1(futex, handle_futex_death, copyin_error, error); return (EFAULT); } if ((uval & FUTEX_TID_MASK) == em->em_tid) { mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; nval = casuword32(uaddr, uval, mval); if (nval == -1) return (EFAULT); if (nval != uval) goto retry; if (!pi && (uval & FUTEX_WAITERS)) { - error = futex_get(uaddr, NULL, &f, - FUTEX_DONTCREATE | FUTEX_SHARED); - if (error) + error = futex_wake(curthread, uaddr, 1, true); + if (error != 0) return (error); - if (f != NULL) { - futex_wake(f, 1, FUTEX_BITSET_MATCH_ANY); - futex_put(f, NULL); - } } } return (0); } static int fetch_robust_entry(struct linux_robust_list **entry, struct linux_robust_list **head, unsigned int *pi) { l_ulong uentry; int error; error = copyin((const void *)head, &uentry, sizeof(l_ulong)); if (error) { LIN_SDT_PROBE1(futex, fetch_robust_entry, copyin_error, error); return (EFAULT); } *entry = (void *)(uentry & ~1UL); *pi = uentry & 1; return (0); } /* This walks the list of robust futexes releasing them. */ void release_futexes(struct thread *td, struct linux_emuldata *em) { struct linux_robust_list_head *head = NULL; struct linux_robust_list *entry, *next_entry, *pending; unsigned int limit = 2048, pi, next_pi, pip; l_long futex_offset; int rc, error; head = em->robust_futexes; if (head == NULL) return; if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) return; error = copyin(&head->futex_offset, &futex_offset, sizeof(futex_offset)); if (error) { LIN_SDT_PROBE1(futex, release_futexes, copyin_error, error); return; } if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) return; while (entry != &head->list) { rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi); if (entry != pending) if (handle_futex_death(em, (uint32_t *)((caddr_t)entry + futex_offset), pi)) { return; } if (rc) return; entry = next_entry; pi = next_pi; if (!--limit) break; sched_relinquish(curthread); } if (pending) handle_futex_death(em, (uint32_t *)((caddr_t)pending + futex_offset), pip); } diff --git a/sys/compat/linux/linux_futex.h b/sys/compat/linux/linux_futex.h index 6ea873d98411..4255cbdc7363 100644 --- a/sys/compat/linux/linux_futex.h +++ b/sys/compat/linux/linux_futex.h @@ -1,89 +1,90 @@ /* $NetBSD: linux_futex.h,v 1.2 2005/12/11 12:20:19 christos Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Emmanuel Dreyfus * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_FUTEX_H #define _LINUX_FUTEX_H #define LINUX_FUTEX_WAIT 0 #define LINUX_FUTEX_WAKE 1 #define LINUX_FUTEX_FD 2 /* unused */ #define LINUX_FUTEX_REQUEUE 3 #define LINUX_FUTEX_CMP_REQUEUE 4 #define LINUX_FUTEX_WAKE_OP 5 #define LINUX_FUTEX_LOCK_PI 6 #define LINUX_FUTEX_UNLOCK_PI 7 #define LINUX_FUTEX_TRYLOCK_PI 8 #define LINUX_FUTEX_WAIT_BITSET 9 #define LINUX_FUTEX_WAKE_BITSET 10 #define LINUX_FUTEX_WAIT_REQUEUE_PI 11 #define LINUX_FUTEX_CMP_REQUEUE_PI 12 #define LINUX_FUTEX_PRIVATE_FLAG 128 #define LINUX_FUTEX_CLOCK_REALTIME 256 #define LINUX_FUTEX_CMD_MASK ~(LINUX_FUTEX_PRIVATE_FLAG | \ LINUX_FUTEX_CLOCK_REALTIME) #define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ #define FUTEX_OP_ADD 1 /* *(int *)UADDR2 += OPARG; */ #define FUTEX_OP_OR 2 /* *(int *)UADDR2 |= OPARG; */ #define FUTEX_OP_ANDN 3 /* *(int *)UADDR2 &= ~OPARG; */ #define FUTEX_OP_XOR 4 /* *(int *)UADDR2 ^= OPARG; */ #define FUTEX_OP_OPARG_SHIFT 8 /* Use (1 << OPARG) instead of OPARG. */ #define FUTEX_OP_CMP_EQ 0 /* if (oldval == CMPARG) wake */ #define FUTEX_OP_CMP_NE 1 /* if (oldval != CMPARG) wake */ #define FUTEX_OP_CMP_LT 2 /* if (oldval < CMPARG) wake */ #define FUTEX_OP_CMP_LE 3 /* if (oldval <= CMPARG) wake */ #define FUTEX_OP_CMP_GT 4 /* if (oldval > CMPARG) wake */ #define FUTEX_OP_CMP_GE 5 /* if (oldval >= CMPARG) wake */ #define FUTEX_WAITERS 0x80000000 #define FUTEX_OWNER_DIED 0x40000000 #define FUTEX_TID_MASK 0x3fffffff #define FUTEX_BITSET_MATCH_ANY 0xffffffff int futex_xchgl(int oparg, uint32_t *uaddr, int *oldval); int futex_addl(int oparg, uint32_t *uaddr, int *oldval); int futex_orl(int oparg, uint32_t *uaddr, int *oldval); int futex_andl(int oparg, uint32_t *uaddr, int *oldval); int futex_xorl(int oparg, uint32_t *uaddr, int *oldval); +int futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared); void release_futexes(struct thread *, struct linux_emuldata *); #endif /* !_LINUX_FUTEX_H */ diff --git a/sys/compat/linux/linux_util.c b/sys/compat/linux/linux_util.c index 86266fd3178f..b807dd1f33a0 100644 --- a/sys/compat/linux/linux_util.c +++ b/sys/compat/linux/linux_util.c @@ -1,336 +1,334 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1994 Christos Zoulas * Copyright (c) 1995 Frank van der Linden * Copyright (c) 1995 Scott Bartram * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: svr4_util.c,v 1.5 1995/01/22 23:44:50 christos Exp */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures"); MALLOC_DEFINE(M_EPOLL, "lepoll", "Linux events structures"); -MALLOC_DEFINE(M_FUTEX, "futex", "Linux futexes"); -MALLOC_DEFINE(M_FUTEX_WP, "futex wp", "Linux futex waiting proc"); FEATURE(linuxulator_v4l, "V4L ioctl wrapper support in the linuxulator"); FEATURE(linuxulator_v4l2, "V4L2 ioctl wrapper support in the linuxulator"); /** * Special DTrace provider for the linuxulator. * * In this file we define the provider for the entire linuxulator. All * modules (= files of the linuxulator) use it. * * We define a different name depending on the emulated bitsize, see * ../..//linux{,32}/linux.h, e.g.: * native bitsize = linuxulator * amd64, 32bit emulation = linuxulator32 */ LIN_SDT_PROVIDER_DEFINE(linuxulator); LIN_SDT_PROVIDER_DEFINE(linuxulator32); char linux_emul_path[MAXPATHLEN] = "/compat/linux"; SYSCTL_STRING(_compat_linux, OID_AUTO, emul_path, CTLFLAG_RWTUN, linux_emul_path, sizeof(linux_emul_path), "Linux runtime environment path"); static bool use_real_ifnames = false; SYSCTL_BOOL(_compat_linux, OID_AUTO, use_real_ifnames, CTLFLAG_RWTUN, &use_real_ifnames, 0, "Use FreeBSD interface names instead of generating ethN aliases"); /* * Search an alternate path before passing pathname arguments on to * system calls. Useful for keeping a separate 'emulation tree'. * * If cflag is set, we check if an attempt can be made to create the * named file, i.e. we check if the directory it should be in exists. */ int linux_emul_convpath(struct thread *td, const char *path, enum uio_seg pathseg, char **pbuf, int cflag, int dfd) { int retval; retval = kern_alternate_path(td, linux_emul_path, path, pathseg, pbuf, cflag, dfd); return (retval); } void linux_msg(const struct thread *td, const char *fmt, ...) { va_list ap; struct proc *p; if (linux_debug == 0) return; p = td->td_proc; printf("linux: jid %d pid %d (%s): ", p->p_ucred->cr_prison->pr_id, (int)p->p_pid, p->p_comm); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("\n"); } struct device_element { TAILQ_ENTRY(device_element) list; struct linux_device_handler entry; }; static TAILQ_HEAD(, device_element) devices = TAILQ_HEAD_INITIALIZER(devices); static struct linux_device_handler null_handler = { "mem", "mem", "null", "null", 1, 3, 1}; DATA_SET(linux_device_handler_set, null_handler); char * linux_driver_get_name_dev(device_t dev) { struct device_element *de; const char *device_name = device_get_name(dev); if (device_name == NULL) return (NULL); TAILQ_FOREACH(de, &devices, list) { if (strcmp(device_name, de->entry.bsd_driver_name) == 0) return (de->entry.linux_driver_name); } return (NULL); } int linux_driver_get_major_minor(const char *node, int *major, int *minor) { struct device_element *de; unsigned long devno; size_t sz; if (node == NULL || major == NULL || minor == NULL) return (1); sz = sizeof("pts/") - 1; if (strncmp(node, "pts/", sz) == 0 && node[sz] != '\0') { /* * Linux checks major and minors of the slave device * to make sure it's a pty device, so let's make him * believe it is. */ devno = strtoul(node + sz, NULL, 10); *major = 136 + (devno / 256); *minor = devno % 256; return (0); } sz = sizeof("dri/card") - 1; if (strncmp(node, "dri/card", sz) == 0 && node[sz] != '\0') { devno = strtoul(node + sz, NULL, 10); *major = 226 + (devno / 256); *minor = devno % 256; return (0); } sz = sizeof("dri/controlD") - 1; if (strncmp(node, "dri/controlD", sz) == 0 && node[sz] != '\0') { devno = strtoul(node + sz, NULL, 10); *major = 226 + (devno / 256); *minor = devno % 256; return (0); } sz = sizeof("dri/renderD") - 1; if (strncmp(node, "dri/renderD", sz) == 0 && node[sz] != '\0') { devno = strtoul(node + sz, NULL, 10); *major = 226 + (devno / 256); *minor = devno % 256; return (0); } sz = sizeof("drm/") - 1; if (strncmp(node, "drm/", sz) == 0 && node[sz] != '\0') { devno = strtoul(node + sz, NULL, 10); *major = 226 + (devno / 256); *minor = devno % 256; return (0); } TAILQ_FOREACH(de, &devices, list) { if (strcmp(node, de->entry.bsd_device_name) == 0) { *major = de->entry.linux_major; *minor = de->entry.linux_minor; return (0); } } return (1); } int linux_vn_get_major_minor(const struct vnode *vp, int *major, int *minor) { int error; if (vp->v_type != VCHR) return (ENOTBLK); dev_lock(); if (vp->v_rdev == NULL) { dev_unlock(); return (ENXIO); } error = linux_driver_get_major_minor(devtoname(vp->v_rdev), major, minor); dev_unlock(); return (error); } char * linux_get_char_devices() { struct device_element *de; char *temp, *string, *last; char formated[256]; int current_size = 0, string_size = 1024; string = malloc(string_size, M_LINUX, M_WAITOK); string[0] = '\000'; last = ""; TAILQ_FOREACH(de, &devices, list) { if (!de->entry.linux_char_device) continue; temp = string; if (strcmp(last, de->entry.bsd_driver_name) != 0) { last = de->entry.bsd_driver_name; snprintf(formated, sizeof(formated), "%3d %s\n", de->entry.linux_major, de->entry.linux_device_name); if (strlen(formated) + current_size >= string_size) { string_size *= 2; string = malloc(string_size, M_LINUX, M_WAITOK); bcopy(temp, string, current_size); free(temp, M_LINUX); } strcat(string, formated); current_size = strlen(string); } } return (string); } void linux_free_get_char_devices(char *string) { free(string, M_LINUX); } static int linux_major_starting = 200; int linux_device_register_handler(struct linux_device_handler *d) { struct device_element *de; if (d == NULL) return (EINVAL); de = malloc(sizeof(*de), M_LINUX, M_WAITOK); if (d->linux_major < 0) { d->linux_major = linux_major_starting++; } bcopy(d, &de->entry, sizeof(*d)); /* Add the element to the list, sorted on span. */ TAILQ_INSERT_TAIL(&devices, de, list); return (0); } int linux_device_unregister_handler(struct linux_device_handler *d) { struct device_element *de; if (d == NULL) return (EINVAL); TAILQ_FOREACH(de, &devices, list) { if (bcmp(d, &de->entry, sizeof(*d)) == 0) { TAILQ_REMOVE(&devices, de, list); free(de, M_LINUX); return (0); } } return (EINVAL); } bool linux_use_real_ifname(const struct ifnet *ifp) { return (use_real_ifnames || !IFP_IS_ETH(ifp)); } diff --git a/sys/compat/linux/linux_util.h b/sys/compat/linux/linux_util.h index 2350e62c7270..7a8ec80cb1bf 100644 --- a/sys/compat/linux/linux_util.h +++ b/sys/compat/linux/linux_util.h @@ -1,191 +1,189 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1994 Christos Zoulas * Copyright (c) 1995 Frank van der Linden * Copyright (c) 1995 Scott Bartram * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: svr4_util.h,v 1.5 1994/11/18 02:54:31 christos Exp * from: linux_util.h,v 1.2 1995/03/05 23:23:50 fvdl Exp * $FreeBSD$ */ #ifndef _LINUX_UTIL_H_ #define _LINUX_UTIL_H_ #include #include #include #include #include #include #include #include MALLOC_DECLARE(M_LINUX); MALLOC_DECLARE(M_EPOLL); -MALLOC_DECLARE(M_FUTEX); -MALLOC_DECLARE(M_FUTEX_WP); extern char linux_emul_path[]; extern int linux_use_emul_path; int linux_emul_convpath(struct thread *, const char *, enum uio_seg, char **, int, int); #define LUSECONVPATH(td) atomic_load_int(&linux_use_emul_path) #define LCONVPATH_AT(td, upath, pathp, i, dfd) \ do { \ int _error; \ \ _error = linux_emul_convpath(td, upath, UIO_USERSPACE, \ pathp, i, dfd); \ if (*(pathp) == NULL) \ return (_error); \ } while (0) #define LCONVPATH(td, upath, pathp, i) \ LCONVPATH_AT(td, upath, pathp, i, AT_FDCWD) #define LCONVPATHEXIST(td, upath, pathp) LCONVPATH(td, upath, pathp, 0) #define LCONVPATHEXIST_AT(td, upath, pathp, dfd) LCONVPATH_AT(td, upath, pathp, 0, dfd) #define LCONVPATHCREAT(td, upath, pathp) LCONVPATH(td, upath, pathp, 1) #define LCONVPATHCREAT_AT(td, upath, pathp, dfd) LCONVPATH_AT(td, upath, pathp, 1, dfd) #define LFREEPATH(path) free(path, M_TEMP) #define DUMMY(s) \ LIN_SDT_PROBE_DEFINE0(dummy, s, not_implemented); \ int \ linux_ ## s(struct thread *td, struct linux_ ## s ## _args *args) \ { \ static pid_t pid; \ \ if (pid != td->td_proc->p_pid) { \ linux_msg(td, "syscall %s not implemented", #s); \ LIN_SDT_PROBE0(dummy, s, not_implemented); \ pid = td->td_proc->p_pid; \ }; \ \ return (ENOSYS); \ } \ struct __hack /* * This is for the syscalls that are not even yet implemented in Linux. * * They're marked as UNIMPL in syscall.master so it will * have nosys record in linux_sysent[]. */ #define UNIMPLEMENTED(s) void linux_msg(const struct thread *td, const char *fmt, ...) __printflike(2, 3); struct linux_device_handler { char *bsd_driver_name; char *linux_driver_name; char *bsd_device_name; char *linux_device_name; int linux_major; int linux_minor; int linux_char_device; }; int linux_device_register_handler(struct linux_device_handler *h); int linux_device_unregister_handler(struct linux_device_handler *h); char *linux_driver_get_name_dev(device_t dev); int linux_driver_get_major_minor(const char *node, int *major, int *minor); int linux_vn_get_major_minor(const struct vnode *vn, int *major, int *minor); char *linux_get_char_devices(void); void linux_free_get_char_devices(char *string); /* * Criteria for interface name translation */ #define IFP_IS_ETH(ifp) ((ifp)->if_type == IFT_ETHER) #define IFP_IS_LOOP(ifp) ((ifp)->if_type == IFT_LOOP) struct ifnet; bool linux_use_real_ifname(const struct ifnet *ifp); #if defined(KTR) #define KTR_LINUX KTR_SUBSYS #define LINUX_CTRFMT(nm, fmt) #nm"("fmt")" #define LINUX_CTR6(f, m, p1, p2, p3, p4, p5, p6) do { \ CTR6(KTR_LINUX, LINUX_CTRFMT(f, m), \ p1, p2, p3, p4, p5, p6); \ } while (0) #define LINUX_CTR(f) LINUX_CTR6(f, "", 0, 0, 0, 0, 0, 0) #define LINUX_CTR0(f, m) LINUX_CTR6(f, m, 0, 0, 0, 0, 0, 0) #define LINUX_CTR1(f, m, p1) LINUX_CTR6(f, m, p1, 0, 0, 0, 0, 0) #define LINUX_CTR2(f, m, p1, p2) LINUX_CTR6(f, m, p1, p2, 0, 0, 0, 0) #define LINUX_CTR3(f, m, p1, p2, p3) LINUX_CTR6(f, m, p1, p2, p3, 0, 0, 0) #define LINUX_CTR4(f, m, p1, p2, p3, p4) LINUX_CTR6(f, m, p1, p2, p3, p4, 0, 0) #define LINUX_CTR5(f, m, p1, p2, p3, p4, p5) LINUX_CTR6(f, m, p1, p2, p3, p4, p5, 0) #else #define LINUX_CTR(f) #define LINUX_CTR0(f, m) #define LINUX_CTR1(f, m, p1) #define LINUX_CTR2(f, m, p1, p2) #define LINUX_CTR3(f, m, p1, p2, p3) #define LINUX_CTR4(f, m, p1, p2, p3, p4) #define LINUX_CTR5(f, m, p1, p2, p3, p4, p5) #define LINUX_CTR6(f, m, p1, p2, p3, p4, p5, p6) #endif /* * Some macros for rate limiting messages: * - noisy if compat.linux.debug = 1 * - print only once if compat.linux.debug > 1 */ #define LINUX_RATELIMIT_MSG_NOTTESTED(_what) \ do { \ static int seen = 0; \ \ if (seen == 0 && linux_debug >= 2) { \ linux_msg(curthread, "%s is not tested, please report on emulation@FreeBSD.org how it works", _what); \ \ if (linux_debug < 3) \ seen = 1; \ } \ } while (0) #define LINUX_RATELIMIT_MSG_OPT1(_message, _opt1) \ do { \ static int seen = 0; \ \ if (seen == 0) { \ linux_msg(curthread, _message, _opt1); \ \ if (linux_debug < 3) \ seen = 1; \ } \ } while (0) #endif /* ! _LINUX_UTIL_H_ */ diff --git a/sys/i386/linux/linux_sysvec.c b/sys/i386/linux/linux_sysvec.c index 3a12f4eb7689..6e8ce848f6ab 100644 --- a/sys/i386/linux/linux_sysvec.c +++ b/sys/i386/linux/linux_sysvec.c @@ -1,1178 +1,1173 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include -#include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); #define LINUX_VDSOPAGE_SIZE PAGE_SIZE * 2 #define LINUX_VDSOPAGE (VM_MAXUSER_ADDRESS - LINUX_VDSOPAGE_SIZE) #define LINUX_SHAREDPAGE (LINUX_VDSOPAGE - PAGE_SIZE) /* * PAGE_SIZE - the size * of the native SHAREDPAGE */ #define LINUX_USRSTACK LINUX_SHAREDPAGE #define LINUX_PS_STRINGS (LINUX_USRSTACK - sizeof(struct ps_strings)) static int linux_szsigcode; static vm_object_t linux_vdso_obj; static char *linux_vdso_mapping; extern char _binary_linux_vdso_so_o_start; extern char _binary_linux_vdso_so_o_end; static vm_offset_t linux_vdso_base; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_fixup(uintptr_t *stack_base, struct image_params *iparams); static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *iparams); static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask); static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack); static void linux_exec_sysvec_init(void *param); static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp); static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base); static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel); static void linux_vdso_install(const void *param); static void linux_vdso_deinstall(const void *param); static void linux_vdso_reloc(char *mapping, Elf_Addr offset); #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)args->argc + 1); base--; suword(base, (intptr_t)envp); base--; suword(base, (intptr_t)argv); base--; suword(base, imgp->args->argc); *stack_base = (uintptr_t)base; return (0); } static int linux_copyout_auxargs(struct image_params *imgp, uintptr_t base) { struct proc *p; Elf32_Auxargs *args; Elf32_Auxinfo *argarray, *pos; struct ps_strings *arginfo; int error, issetugid; p = imgp->proc; issetugid = imgp->proc->p_flag & P_SUGID ? 1 : 0; arginfo = (struct ps_strings *)PROC_PS_STRINGS(p); args = (Elf32_Auxargs *)imgp->auxargs; argarray = pos = malloc(LINUX_AT_COUNT * sizeof(*pos), M_TEMP, M_WAITOK | M_ZERO); AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, linux_vdso_base); AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, __kernel_vsyscall); AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); /* * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0, * as it has appeared in the 2.4.0-rc7 first time. * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK), * glibc falls back to the hard-coded CLK_TCK value when aux entry * is not present. * Also see linux_times() implementation. */ if (linux_kernver(curthread) >= LINUX_KERNVER_2004000) AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); AUXARGS_ENTRY_PTR(pos, LINUX_AT_RANDOM, imgp->canary); if (imgp->execpathp != 0) AUXARGS_ENTRY_PTR(pos, LINUX_AT_EXECFN, imgp->execpathp); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; KASSERT(pos - argarray <= LINUX_AT_COUNT, ("Too many auxargs")); error = copyout(argarray, (void *)base, sizeof(*argarray) * LINUX_AT_COUNT); free(argarray, M_TEMP); return (error); } static int linux_fixup_elf(uintptr_t *stack_base, struct image_params *imgp) { register_t *base; base = (register_t *)*stack_base; base--; if (suword(base, (register_t)imgp->args->argc) == -1) return (EFAULT); *stack_base = (uintptr_t)base; return (0); } /* * Copied from kern/kern_exec.c */ static int linux_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc, error; char **vectp; char *stringp; uintptr_t destp, ustringp; struct ps_strings *arginfo; char canary[LINUX_AT_RANDOM_LEN]; size_t execpath_len; struct proc *p; p = imgp->proc; arginfo = (struct ps_strings *)PROC_PS_STRINGS(p); destp = (uintptr_t)arginfo; if (imgp->execpath != NULL && imgp->auxargs != NULL) { execpath_len = strlen(imgp->execpath) + 1; destp -= execpath_len; destp = rounddown2(destp, sizeof(void *)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= roundup(sizeof(canary), sizeof(void *)); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); /* Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has LINUX_AT_COUNT entries. */ destp -= LINUX_AT_COUNT * sizeof(Elf32_Auxinfo); destp = rounddown2(destp, sizeof(void *)); } vectp = (char **)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* vectp also becomes our initial stack base. */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* Fill in "ps_strings" struct for ps, w, etc. */ if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* A null vector table pointer separates the argp's from the envp's. */ if (suword(vectp++, 0) != 0) return (EFAULT); if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || suword(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* The end of the vector table is a null pointer. */ if (suword(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int sig, code; int oonstack; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; frame.sf_siginfo = &fp->sf_si; frame.sf_ucontext = &fp->sf_sc; /* Fill in POSIX parts. */ siginfo_to_lsiginfo(&ksi->ksi_info, &frame.sf_si, sig); /* Build the signal context to be used by sigreturn. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = NULL; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__mask; frame.sf_sc.uc_mcontext.sc_gs = rgs(); frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_edi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_esi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_ebp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_ebx; frame.sf_sc.uc_mcontext.sc_esp = regs->tf_esp; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_edx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_ecx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_eax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_eip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_esp = (int)fp; regs->tf_eip = __kernel_rt_sigreturn; regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int sig, code; int oonstack; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; sig = ksi->ksi_signo; code = ksi->ksi_code; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Build the argument list for the signal handler. */ sig = bsd_to_linux_signal(sig); bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__mask; frame.sf_sc.sc_gs = rgs(); frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_edi; frame.sf_sc.sc_esi = regs->tf_esi; frame.sf_sc.sc_ebp = regs->tf_ebp; frame.sf_sc.sc_ebx = regs->tf_ebx; frame.sf_sc.sc_esp = regs->tf_esp; frame.sf_sc.sc_edx = regs->tf_edx; frame.sf_sc.sc_ecx = regs->tf_ecx; frame.sf_sc.sc_eax = regs->tf_eax; frame.sf_sc.sc_eip = regs->tf_eip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_eflags; frame.sf_sc.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (register_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno); frame.sf_extramask[0] = lmask.__mask; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* Build context to run handler in. */ regs->tf_esp = (int)fp; regs->tf_eip = __kernel_sigreturn; regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct l_sigframe frame; struct trapframe *regs; l_sigset_t lmask; sigset_t bmask; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_eflags)) return (EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } lmask.__mask = frame.sf_sc.sc_mask; linux_to_bsd_sigset(&lmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* Restore signal context. */ /* %gs was restored by the trampoline. */ regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_es = frame.sf_sc.sc_es; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_edi = frame.sf_sc.sc_edi; regs->tf_esi = frame.sf_sc.sc_esi; regs->tf_ebp = frame.sf_sc.sc_ebp; regs->tf_ebx = frame.sf_sc.sc_ebx; regs->tf_edx = frame.sf_sc.sc_edx; regs->tf_ecx = frame.sf_sc.sc_ecx; regs->tf_eax = frame.sf_sc.sc_eax; regs->tf_eip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_eflags = eflags; regs->tf_esp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct l_ucontext uc; struct l_sigcontext *context; sigset_t bmask; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; ksiginfo_t ksi; regs = td->td_frame; /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; if (!EFLAGS_SECURE(eflags, regs->tf_eflags)) return (EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } linux_to_bsd_sigset(&uc.uc_sigmask, &bmask); kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0); /* Restore signal context. */ /* %gs was restored by the trampoline. */ regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_edi = context->sc_edi; regs->tf_esi = context->sc_esi; regs->tf_ebp = context->sc_ebp; regs->tf_ebx = context->sc_ebx; regs->tf_edx = context->sc_edx; regs->tf_ecx = context->sc_ecx; regs->tf_eax = context->sc_eax; regs->tf_eip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_eflags = eflags; regs->tf_esp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; /* Call sigaltstack & ignore results. */ lss = &uc.uc_stack; ss.ss_sp = PTRIN(lss->ss_sp); ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } static int linux_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->code = frame->tf_eax; sa->args[0] = frame->tf_ebx; sa->args[1] = frame->tf_ecx; sa->args[2] = frame->tf_edx; sa->args[3] = frame->tf_esi; sa->args[4] = frame->tf_edi; sa->args[5] = frame->tf_ebp; /* Unconfirmed */ if (sa->code >= p->p_sysent->sv_size) /* nosys */ sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; else sa->callp = &p->p_sysent->sv_table[sa->code]; td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; return (0); } static void linux_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame = td->td_frame; cpu_set_syscall_retval(td, error); if (__predict_false(error != 0)) { if (error != ERESTART && error != EJUSTRETURN) frame->tf_eax = bsd_to_linux_errno(error); } } /* * exec_setregs may initialize some registers differently than Linux * does, thus potentially confusing Linux binaries. If necessary, we * override the exec_setregs default(s) here. */ static void linux_exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct pcb *pcb = td->td_pcb; exec_setregs(td, imgp, stack); /* Linux sets %gs to 0, we default to _udatasel. */ pcb->pcb_gs = 0; load_gs(0); pcb->pcb_initial_npxcw = __LINUX_NPXCW__; } struct sysentvec linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux a.out", .sv_coredump = NULL, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = PS_STRINGS, .sv_psstringssz = sizeof(struct ps_strings), .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = exec_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_AOUT | SV_IA32 | SV_ILP32 | SV_SIG_DISCIGN | SV_SIG_WAITNDQ, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; INIT_SYSENTVEC(aout_sysvec, &linux_sysvec); struct sysentvec elf_linux_sysvec = { .sv_size = LINUX_SYS_MAXSYSCALL, .sv_table = linux_sysent, .sv_transtrap = linux_translate_traps, .sv_fixup = linux_fixup_elf, .sv_sendsig = linux_sendsig, .sv_sigcode = &_binary_linux_vdso_so_o_start, .sv_szsigcode = &linux_szsigcode, .sv_name = "Linux ELF32", .sv_coredump = elf32_coredump, .sv_elf_core_osabi = ELFOSABI_FREEBSD, .sv_elf_core_abi_vendor = FREEBSD_ABI_VENDOR, .sv_elf_core_prepare_notes = elf32_prepare_notes, .sv_imgact_try = linux_exec_imgact_try, .sv_minsigstksz = LINUX_MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = LINUX_USRSTACK, .sv_psstrings = LINUX_PS_STRINGS, .sv_psstringssz = sizeof(struct ps_strings), .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = linux_copyout_auxargs, .sv_copyout_strings = linux_copyout_strings, .sv_setregs = linux_exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_LINUX | SV_IA32 | SV_ILP32 | SV_SHP | SV_SIG_DISCIGN | SV_SIG_WAITNDQ | SV_TIMEKEEP, .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, .sv_shared_page_base = LINUX_SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, .sv_trap = NULL, .sv_onexec = linux_on_exec_vmspace, .sv_onexit = linux_on_exit, .sv_ontdexit = linux_thread_dtor, .sv_setid_allowed = &linux_setid_allowed_query, }; static int linux_on_exec_vmspace(struct proc *p, struct image_params *imgp) { int error = 0; if (SV_PROC_FLAG(p, SV_SHP) != 0) error = linux_map_vdso(p, linux_vdso_obj, linux_vdso_base, LINUX_VDSOPAGE_SIZE, imgp); if (error == 0) linux_on_exec(p, imgp); return (error); } /* * linux_vdso_install() and linux_exec_sysvec_init() must be called * after exec_sysvec_init() which is SI_SUB_EXEC (SI_ORDER_ANY). */ static void linux_exec_sysvec_init(void *param) { l_uintptr_t *ktimekeep_base, *ktsc_selector; struct sysentvec *sv; ptrdiff_t tkoff; sv = param; /* Fill timekeep_base */ exec_sysvec_init(sv); tkoff = kern_timekeep_base - linux_vdso_base; ktimekeep_base = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktimekeep_base = sv->sv_timekeep_base; tkoff = kern_tsc_selector - linux_vdso_base; ktsc_selector = (l_uintptr_t *)(linux_vdso_mapping + tkoff); *ktsc_selector = linux_vdso_tsc_selector_idx(); if (bootverbose) printf("Linux i386 vDSO tsc_selector: %u\n", *ktsc_selector); } SYSINIT(elf_linux_exec_sysvec_init, SI_SUB_EXEC + 1, SI_ORDER_ANY, linux_exec_sysvec_init, &elf_linux_sysvec); static void linux_vdso_install(const void *param) { char *vdso_start = &_binary_linux_vdso_so_o_start; char *vdso_end = &_binary_linux_vdso_so_o_end; linux_szsigcode = vdso_end - vdso_start; MPASS(linux_szsigcode <= LINUX_VDSOPAGE_SIZE); linux_vdso_base = LINUX_VDSOPAGE; __elfN(linux_vdso_fixup)(vdso_start, linux_vdso_base); linux_vdso_obj = __elfN(linux_shared_page_init) (&linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); bcopy(vdso_start, linux_vdso_mapping, linux_szsigcode); linux_vdso_reloc(linux_vdso_mapping, linux_vdso_base); } SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC + 1, SI_ORDER_FIRST, linux_vdso_install, NULL); static void linux_vdso_deinstall(const void *param) { __elfN(linux_shared_page_fini)(linux_vdso_obj, linux_vdso_mapping, LINUX_VDSOPAGE_SIZE); } SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, linux_vdso_deinstall, NULL); static void linux_vdso_reloc(char *mapping, Elf_Addr offset) { const Elf_Shdr *shdr; const Elf_Rel *rel; const Elf_Ehdr *ehdr; Elf_Addr *where; Elf_Size rtype, symidx; Elf_Addr addr, addend; int i, relcnt; MPASS(offset != 0); relcnt = 0; ehdr = (const Elf_Ehdr *)mapping; shdr = (const Elf_Shdr *)(mapping + ehdr->e_shoff); for (i = 0; i < ehdr->e_shnum; i++) { switch (shdr[i].sh_type) { case SHT_REL: rel = (const Elf_Rel *)(mapping + shdr[i].sh_offset); relcnt = shdr[i].sh_size / sizeof(*rel); break; case SHT_RELA: printf("Linux i386 vDSO: unexpected Rela section\n"); break; } } for (i = 0; i < relcnt; i++, rel++) { where = (Elf_Addr *)(mapping + rel->r_offset); addend = *where; rtype = ELF_R_TYPE(rel->r_info); symidx = ELF_R_SYM(rel->r_info); switch (rtype) { case R_386_NONE: /* none */ break; case R_386_RELATIVE: /* B + A */ addr = (Elf_Addr)PTROUT(offset + addend); if (*where != addr) *where = addr; break; case R_386_IRELATIVE: printf("Linux i386 vDSO: unexpected ifunc relocation, " "symbol index %d\n", symidx); break; default: printf("Linux i386 vDSO: unexpected relocation type %d, " "symbol index %d\n", rtype, symidx); } } } static char GNU_ABI_VENDOR[] = "GNU"; static int GNULINUX_ABI_DESC = 0; static bool linux_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNULINUX_ABI_DESC) return (false); /* * For Linux we encode osrel using the Linux convention of * (version << 16) | (major << 8) | (minor) * See macro in linux_mib.h */ *osrel = LINUX_KERNVER(desc[1], desc[2], desc[3]); return (true); } static Elf_Brandnote linux_brandnote = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = linux_trans_osrel }; static Elf32_Brandinfo linux_brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_glibc2brand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-linux.so.2", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; static Elf32_Brandinfo linux_muslbrand = { .brand = ELFOSABI_LINUX, .machine = EM_386, .compat_3_brand = "Linux", .emul_path = linux_emul_path, .interp_path = "/lib/ld-musl-i386.so.1", .sysvec = &elf_linux_sysvec, .interp_newpath = NULL, .brand_note = &linux_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE | LINUX_BI_FUTEX_REQUEUE }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, &linux_muslbrand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); - LIST_INIT(&futex_list); - mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF); linux_dev_shm_create(); linux_osd_jail_register(); stclohz = (stathz ? stathz : hz); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); - mtx_destroy(&futex_mtx); linux_dev_shm_destroy(); linux_osd_jail_deregister(); if (bootverbose) printf("Linux ELF exec handler removed\n"); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: return (EOPNOTSUPP); } return (error); } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); FEATURE(linux, "Linux 32bit support"); /* * linux_vdso_install() and linux_exec_sysvec_init() must be called * after exec_sysvec_init() which is SI_SUB_EXEC (SI_ORDER_ANY). */