Index: head/sys/compat/linux/linux.c =================================================================== --- head/sys/compat/linux/linux.c (revision 362058) +++ head/sys/compat/linux/linux.c (revision 362059) @@ -1,553 +1,553 @@ /*- * Copyright (c) 2015 Dmitry Chagin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct futex_list futex_list; struct mtx futex_mtx; /* protects the futex list */ CTASSERT(LINUX_IFNAMSIZ == IFNAMSIZ); static int bsd_to_linux_sigtbl[LINUX_SIGTBLSZ] = { LINUX_SIGHUP, /* SIGHUP */ LINUX_SIGINT, /* SIGINT */ LINUX_SIGQUIT, /* SIGQUIT */ LINUX_SIGILL, /* SIGILL */ LINUX_SIGTRAP, /* SIGTRAP */ LINUX_SIGABRT, /* SIGABRT */ 0, /* SIGEMT */ LINUX_SIGFPE, /* SIGFPE */ LINUX_SIGKILL, /* SIGKILL */ LINUX_SIGBUS, /* SIGBUS */ LINUX_SIGSEGV, /* SIGSEGV */ LINUX_SIGSYS, /* SIGSYS */ LINUX_SIGPIPE, /* SIGPIPE */ LINUX_SIGALRM, /* SIGALRM */ LINUX_SIGTERM, /* SIGTERM */ LINUX_SIGURG, /* SIGURG */ LINUX_SIGSTOP, /* SIGSTOP */ LINUX_SIGTSTP, /* SIGTSTP */ LINUX_SIGCONT, /* SIGCONT */ LINUX_SIGCHLD, /* SIGCHLD */ LINUX_SIGTTIN, /* SIGTTIN */ LINUX_SIGTTOU, /* SIGTTOU */ LINUX_SIGIO, /* SIGIO */ LINUX_SIGXCPU, /* SIGXCPU */ LINUX_SIGXFSZ, /* SIGXFSZ */ LINUX_SIGVTALRM,/* SIGVTALRM */ LINUX_SIGPROF, /* SIGPROF */ LINUX_SIGWINCH, /* SIGWINCH */ 0, /* SIGINFO */ LINUX_SIGUSR1, /* SIGUSR1 */ LINUX_SIGUSR2 /* SIGUSR2 */ }; static int linux_to_bsd_sigtbl[LINUX_SIGTBLSZ] = { SIGHUP, /* LINUX_SIGHUP */ SIGINT, /* LINUX_SIGINT */ SIGQUIT, /* LINUX_SIGQUIT */ SIGILL, /* LINUX_SIGILL */ SIGTRAP, /* LINUX_SIGTRAP */ SIGABRT, /* LINUX_SIGABRT */ SIGBUS, /* LINUX_SIGBUS */ SIGFPE, /* LINUX_SIGFPE */ SIGKILL, /* LINUX_SIGKILL */ SIGUSR1, /* LINUX_SIGUSR1 */ SIGSEGV, /* LINUX_SIGSEGV */ SIGUSR2, /* LINUX_SIGUSR2 */ SIGPIPE, /* LINUX_SIGPIPE */ SIGALRM, /* LINUX_SIGALRM */ SIGTERM, /* LINUX_SIGTERM */ SIGBUS, /* LINUX_SIGSTKFLT */ SIGCHLD, /* LINUX_SIGCHLD */ SIGCONT, /* LINUX_SIGCONT */ SIGSTOP, /* LINUX_SIGSTOP */ SIGTSTP, /* LINUX_SIGTSTP */ SIGTTIN, /* LINUX_SIGTTIN */ SIGTTOU, /* LINUX_SIGTTOU */ SIGURG, /* LINUX_SIGURG */ SIGXCPU, /* LINUX_SIGXCPU */ SIGXFSZ, /* 
LINUX_SIGXFSZ */ SIGVTALRM, /* LINUX_SIGVTALARM */ SIGPROF, /* LINUX_SIGPROF */ SIGWINCH, /* LINUX_SIGWINCH */ SIGIO, /* LINUX_SIGIO */ /* * FreeBSD does not have SIGPWR signal, map Linux SIGPWR signal * to the first unused FreeBSD signal number. Since Linux supports * signals from 1 to 64 we are ok here as our SIGRTMIN = 65. */ SIGRTMIN, /* LINUX_SIGPWR */ SIGSYS /* LINUX_SIGSYS */ }; static struct cdev *dev_shm_cdev; static struct cdevsw dev_shm_cdevsw = { .d_version = D_VERSION, .d_name = "dev_shm", }; /* * Map Linux RT signals to the FreeBSD RT signals. */ static inline int linux_to_bsd_rt_signal(int sig) { return (SIGRTMIN + 1 + sig - LINUX_SIGRTMIN); } static inline int bsd_to_linux_rt_signal(int sig) { return (sig - SIGRTMIN - 1 + LINUX_SIGRTMIN); } int linux_to_bsd_signal(int sig) { KASSERT(sig > 0 && sig <= LINUX_SIGRTMAX, ("invalid Linux signal %d\n", sig)); if (sig < LINUX_SIGRTMIN) return (linux_to_bsd_sigtbl[_SIG_IDX(sig)]); return (linux_to_bsd_rt_signal(sig)); } int bsd_to_linux_signal(int sig) { if (sig <= LINUX_SIGTBLSZ) return (bsd_to_linux_sigtbl[_SIG_IDX(sig)]); if (sig == SIGRTMIN) return (LINUX_SIGPWR); return (bsd_to_linux_rt_signal(sig)); } int linux_to_bsd_sigaltstack(int lsa) { int bsa = 0; if (lsa & LINUX_SS_DISABLE) bsa |= SS_DISABLE; /* * Linux ignores SS_ONSTACK flag for ss * parameter while FreeBSD prohibits it. 
*/ return (bsa); } int bsd_to_linux_sigaltstack(int bsa) { int lsa = 0; if (bsa & SS_DISABLE) lsa |= LINUX_SS_DISABLE; if (bsa & SS_ONSTACK) lsa |= LINUX_SS_ONSTACK; return (lsa); } void linux_to_bsd_sigset(l_sigset_t *lss, sigset_t *bss) { int b, l; SIGEMPTYSET(*bss); for (l = 1; l <= LINUX_SIGRTMAX; l++) { if (LINUX_SIGISMEMBER(*lss, l)) { b = linux_to_bsd_signal(l); if (b) SIGADDSET(*bss, b); } } } void bsd_to_linux_sigset(sigset_t *bss, l_sigset_t *lss) { int b, l; LINUX_SIGEMPTYSET(*lss); for (b = 1; b <= SIGRTMAX; b++) { if (SIGISMEMBER(*bss, b)) { l = bsd_to_linux_signal(b); if (l) LINUX_SIGADDSET(*lss, l); } } } /* * Translate a Linux interface name to a FreeBSD interface name, * and return the associated ifnet structure * bsdname and lxname need to be least IFNAMSIZ bytes long, but * can point to the same buffer. */ struct ifnet * ifname_linux_to_bsd(struct thread *td, const char *lxname, char *bsdname) { struct ifnet *ifp; int len, unit; char *ep; int index; bool is_eth, is_lo; for (len = 0; len < LINUX_IFNAMSIZ; ++len) if (!isalpha(lxname[len]) || lxname[len] == '\0') break; if (len == 0 || len == LINUX_IFNAMSIZ) return (NULL); /* Linux loopback interface name is lo (not lo0) */ is_lo = (len == 2 && strncmp(lxname, "lo", len) == 0); unit = (int)strtoul(lxname + len, &ep, 10); if ((ep == NULL || ep == lxname + len || ep >= lxname + LINUX_IFNAMSIZ) && is_lo == 0) return (NULL); index = 0; is_eth = (len == 3 && strncmp(lxname, "eth", len) == 0); CURVNET_SET(TD_TO_VNET(td)); IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* * Allow Linux programs to use FreeBSD names. Don't presume * we never have an interface named "eth", so don't make * the test optional based on is_eth. 
*/ if (strncmp(ifp->if_xname, lxname, LINUX_IFNAMSIZ) == 0) break; if (is_eth && IFP_IS_ETH(ifp) && unit == index++) break; if (is_lo && IFP_IS_LOOP(ifp)) break; } IFNET_RUNLOCK(); CURVNET_RESTORE(); if (ifp != NULL && bsdname != NULL) strlcpy(bsdname, ifp->if_xname, IFNAMSIZ); return (ifp); } void linux_ifflags(struct ifnet *ifp, short *flags) { unsigned short fl; fl = (ifp->if_flags | ifp->if_drv_flags) & 0xffff; *flags = 0; if (fl & IFF_UP) *flags |= LINUX_IFF_UP; if (fl & IFF_BROADCAST) *flags |= LINUX_IFF_BROADCAST; if (fl & IFF_DEBUG) *flags |= LINUX_IFF_DEBUG; if (fl & IFF_LOOPBACK) *flags |= LINUX_IFF_LOOPBACK; if (fl & IFF_POINTOPOINT) *flags |= LINUX_IFF_POINTOPOINT; if (fl & IFF_DRV_RUNNING) *flags |= LINUX_IFF_RUNNING; if (fl & IFF_NOARP) *flags |= LINUX_IFF_NOARP; if (fl & IFF_PROMISC) *flags |= LINUX_IFF_PROMISC; if (fl & IFF_ALLMULTI) *flags |= LINUX_IFF_ALLMULTI; if (fl & IFF_MULTICAST) *flags |= LINUX_IFF_MULTICAST; } int linux_ifhwaddr(struct ifnet *ifp, struct l_sockaddr *lsa) { struct ifaddr *ifa; struct sockaddr_dl *sdl; if (IFP_IS_LOOP(ifp)) { bzero(lsa, sizeof(*lsa)); lsa->sa_family = LINUX_ARPHRD_LOOPBACK; return (0); } if (!IFP_IS_ETH(ifp)) return (ENOENT); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sdl = (struct sockaddr_dl*)ifa->ifa_addr; if (sdl != NULL && (sdl->sdl_family == AF_LINK) && (sdl->sdl_type == IFT_ETHER)) { bzero(lsa, sizeof(*lsa)); lsa->sa_family = LINUX_ARPHRD_ETHER; bcopy(LLADDR(sdl), lsa->sa_data, LINUX_IFHWADDRLEN); return (0); } } return (ENOENT); } int linux_to_bsd_domain(int domain) { switch (domain) { case LINUX_AF_UNSPEC: return (AF_UNSPEC); case LINUX_AF_UNIX: return (AF_LOCAL); case LINUX_AF_INET: return (AF_INET); case LINUX_AF_INET6: return (AF_INET6); case LINUX_AF_AX25: return (AF_CCITT); case LINUX_AF_IPX: return (AF_IPX); case LINUX_AF_APPLETALK: return (AF_APPLETALK); } return (-1); } int bsd_to_linux_domain(int domain) { switch (domain) { case AF_UNSPEC: return (LINUX_AF_UNSPEC); case AF_LOCAL: 
return (LINUX_AF_UNIX); case AF_INET: return (LINUX_AF_INET); case AF_INET6: return (LINUX_AF_INET6); case AF_CCITT: return (LINUX_AF_AX25); case AF_IPX: return (LINUX_AF_IPX); case AF_APPLETALK: return (LINUX_AF_APPLETALK); } return (-1); } /* * Based on the fact that: * 1. Native and Linux storage of struct sockaddr * and struct sockaddr_in6 are equal. * 2. On Linux sa_family is the first member of all struct sockaddr. */ int bsd_to_linux_sockaddr(const struct sockaddr *sa, struct l_sockaddr **lsa, socklen_t len) { struct l_sockaddr *kosa; int error, bdom; *lsa = NULL; if (len < 2 || len > UCHAR_MAX) return (EINVAL); kosa = malloc(len, M_SONAME, M_WAITOK); bcopy(sa, kosa, len); bdom = bsd_to_linux_domain(sa->sa_family); if (bdom == -1) { error = EAFNOSUPPORT; goto out; } kosa->sa_family = bdom; *lsa = kosa; return (0); out: free(kosa, M_SONAME); return (error); } int linux_to_bsd_sockaddr(const struct l_sockaddr *osa, struct sockaddr **sap, socklen_t *len) { struct sockaddr *sa; struct l_sockaddr *kosa; #ifdef INET6 struct sockaddr_in6 *sin6; bool oldv6size; #endif char *name; int salen, bdom, error, hdrlen, namelen; if (*len < 2 || *len > UCHAR_MAX) return (EINVAL); salen = *len; #ifdef INET6 oldv6size = false; /* * Check for old (pre-RFC2553) sockaddr_in6. We may accept it * if it's a v4-mapped address, so reserve the proper space * for it. */ if (salen == sizeof(struct sockaddr_in6) - sizeof(uint32_t)) { salen += sizeof(uint32_t); oldv6size = true; } #endif kosa = malloc(salen, M_SONAME, M_WAITOK); if ((error = copyin(osa, kosa, *len))) goto out; bdom = linux_to_bsd_domain(kosa->sa_family); if (bdom == -1) { error = EAFNOSUPPORT; goto out; } #ifdef INET6 /* * Older Linux IPv6 code uses obsolete RFC2133 struct sockaddr_in6, * which lacks the scope id compared with RFC2553 one. If we detect * the situation, reject the address and write a message to system log. * * Still accept addresses for which the scope id is not used. 
*/ if (oldv6size) { if (bdom == AF_INET6) { sin6 = (struct sockaddr_in6 *)kosa; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) || (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_V4COMPAT(&sin6->sin6_addr) && !IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) && !IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))) { sin6->sin6_scope_id = 0; } else { linux_msg(curthread, - "obsolete pre-RFC2553 sockaddr_in6 rejected\n"); + "obsolete pre-RFC2553 sockaddr_in6 rejected"); error = EINVAL; goto out; } } else salen -= sizeof(uint32_t); } #endif if (bdom == AF_INET) { if (salen < sizeof(struct sockaddr_in)) { error = EINVAL; goto out; } salen = sizeof(struct sockaddr_in); } if (bdom == AF_LOCAL && salen > sizeof(struct sockaddr_un)) { hdrlen = offsetof(struct sockaddr_un, sun_path); name = ((struct sockaddr_un *)kosa)->sun_path; if (*name == '\0') { /* * Linux abstract namespace starts with a NULL byte. * XXX We do not support abstract namespace yet. */ namelen = strnlen(name + 1, salen - hdrlen - 1) + 1; } else namelen = strnlen(name, salen - hdrlen); salen = hdrlen + namelen; if (salen > sizeof(struct sockaddr_un)) { error = ENAMETOOLONG; goto out; } } sa = (struct sockaddr *)kosa; sa->sa_family = bdom; sa->sa_len = salen; *sap = sa; *len = salen; return (0); out: free(kosa, M_SONAME); return (error); } void linux_dev_shm_create(void) { int error; error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &dev_shm_cdev, &dev_shm_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0, "shm/.mountpoint"); if (error != 0) { printf("%s: failed to create device node, error %d\n", __func__, error); } } void linux_dev_shm_destroy(void) { destroy_dev(dev_shm_cdev); } Index: head/sys/compat/linux/linux_event.c =================================================================== --- head/sys/compat/linux/linux_event.c (revision 362058) +++ head/sys/compat/linux/linux_event.c (revision 362059) @@ -1,1343 +1,1343 @@ /*- * Copyright (c) 2007 Roman Divacky * Copyright (c) 2014 
Dmitry Chagin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include /* * epoll defines 'struct epoll_event' with the field 'data' as 64 bits * on all architectures. But on 32 bit architectures BSD 'struct kevent' only * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied * data verbatuim. 
Therefore we allocate 64-bit memory block to pass * user supplied data for every file descriptor. */ typedef uint64_t epoll_udata_t; struct epoll_emuldata { uint32_t fdc; /* epoll udata max index */ epoll_udata_t udata[1]; /* epoll user data vector */ }; #define EPOLL_DEF_SZ 16 #define EPOLL_SIZE(fdn) \ (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t)) struct epoll_event { uint32_t events; epoll_udata_t data; } #if defined(__amd64__) __attribute__((packed)) #endif ; #define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata); static int epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event, struct kevent *kevent, int *nkevents); static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); static int epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter, unsigned int flags); static int epoll_fd_registered(struct thread *td, struct file *epfp, int fd); static int epoll_delete_all_events(struct thread *td, struct file *epfp, int fd); struct epoll_copyin_args { struct kevent *changelist; }; struct epoll_copyout_args { struct epoll_event *leventlist; struct proc *p; uint32_t count; int error; }; /* eventfd */ typedef uint64_t eventfd_t; static fo_rdwr_t eventfd_read; static fo_rdwr_t eventfd_write; static fo_ioctl_t eventfd_ioctl; static fo_poll_t eventfd_poll; static fo_kqfilter_t eventfd_kqfilter; static fo_stat_t eventfd_stat; static fo_close_t eventfd_close; static fo_fill_kinfo_t eventfd_fill_kinfo; static struct fileops eventfdops = { .fo_read = eventfd_read, .fo_write = eventfd_write, .fo_truncate = invfo_truncate, .fo_ioctl = eventfd_ioctl, .fo_poll = eventfd_poll, .fo_kqfilter = eventfd_kqfilter, .fo_stat = eventfd_stat, .fo_close = eventfd_close, .fo_chmod = 
invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = eventfd_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_eventfddetach(struct knote *kn); static int filt_eventfdread(struct knote *kn, long hint); static int filt_eventfdwrite(struct knote *kn, long hint); static struct filterops eventfd_rfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdread }; static struct filterops eventfd_wfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdwrite }; /* timerfd */ typedef uint64_t timerfd_t; static fo_rdwr_t timerfd_read; static fo_poll_t timerfd_poll; static fo_kqfilter_t timerfd_kqfilter; static fo_stat_t timerfd_stat; static fo_close_t timerfd_close; static fo_fill_kinfo_t timerfd_fill_kinfo; static struct fileops timerfdops = { .fo_read = timerfd_read, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = eventfd_ioctl, .fo_poll = timerfd_poll, .fo_kqfilter = timerfd_kqfilter, .fo_stat = timerfd_stat, .fo_close = timerfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = timerfd_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_timerfddetach(struct knote *kn); static int filt_timerfdread(struct knote *kn, long hint); static struct filterops timerfd_rfiltops = { .f_isfd = 1, .f_detach = filt_timerfddetach, .f_event = filt_timerfdread }; struct eventfd { eventfd_t efd_count; uint32_t efd_flags; struct selinfo efd_sel; struct mtx efd_lock; }; struct timerfd { clockid_t tfd_clockid; struct itimerspec tfd_time; struct callout tfd_callout; timerfd_t tfd_count; bool tfd_canceled; struct selinfo tfd_sel; struct mtx tfd_lock; }; static int eventfd_create(struct thread *td, uint32_t initval, int flags); static void linux_timerfd_expire(void *); static void linux_timerfd_curval(struct timerfd *, struct itimerspec *); static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata) { struct 
linux_pemuldata *pem; struct epoll_emuldata *emd; struct proc *p; p = td->td_proc; pem = pem_find(p); KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); LINUX_PEM_XLOCK(pem); if (pem->epoll == NULL) { emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); emd->fdc = fd; pem->epoll = emd; } else { emd = pem->epoll; if (fd > emd->fdc) { emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); emd->fdc = fd; pem->epoll = emd; } } emd->udata[fd] = udata; LINUX_PEM_XUNLOCK(pem); } static int epoll_create_common(struct thread *td, int flags) { int error; error = kern_kqueue(td, flags, NULL); if (error != 0) return (error); epoll_fd_install(td, EPOLL_DEF_SZ, 0); return (0); } #ifdef LINUX_LEGACY_SYSCALLS int linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) { /* * args->size is unused. Linux just tests it * and then forgets it as well. */ if (args->size <= 0) return (EINVAL); return (epoll_create_common(td, 0)); } #endif int linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) { int flags; if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) return (EINVAL); flags = 0; if ((args->flags & LINUX_O_CLOEXEC) != 0) flags |= O_CLOEXEC; return (epoll_create_common(td, flags)); } /* Structure converting function from epoll to kevent. 
*/
/*
 * Translate the Linux epoll event mask in l_event->events into kevent
 * change entries for descriptor fd.
 *
 * Entries are written through the kevent pointer and *nkevents is
 * incremented once for each entry written.  The counter is only ever
 * incremented here, never reset, so the caller provides its starting
 * value.  At most two entries are produced (one EVFILT_READ and one
 * EVFILT_WRITE); a mask with neither read nor write bits yields a single
 * disabled EVFILT_READ entry.
 *
 * Returns 0 on success, or EINVAL when the mask contains bits outside
 * LINUX_EPOLL_EVSUP.  That check runs last, so entries may already have
 * been written and counted by the time EINVAL is returned.
 */
static int
epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
    struct kevent *kevent, int *nkevents)
{
	uint32_t levents = l_event->events;
	struct linux_pemuldata *pem;
	struct proc *p;
	unsigned short kev_flags = EV_ADD | EV_ENABLE;

	/* flags related to how event is registered */
	if ((levents & LINUX_EPOLLONESHOT) != 0)
		kev_flags |= EV_DISPATCH;
	if ((levents & LINUX_EPOLLET) != 0)
		kev_flags |= EV_CLEAR;
	if ((levents & LINUX_EPOLLERR) != 0)
		kev_flags |= EV_ERROR;
	if ((levents & LINUX_EPOLLRDHUP) != 0)
		kev_flags |= EV_EOF;

	/* flags related to what event is registered */
	if ((levents & LINUX_EPOLL_EVRD) != 0) {
		EV_SET(kevent++, fd, EVFILT_READ, kev_flags, 0, 0, 0);
		++(*nkevents);
	}
	if ((levents & LINUX_EPOLL_EVWR) != 0) {
		EV_SET(kevent++, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
		++(*nkevents);
	}
	/* zero event mask is legal */
	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
		++(*nkevents);
	}

	/* Reject masks carrying bits outside the supported set. */
	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
		p = td->td_proc;

		pem = pem_find(p);
		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
		KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n"));

		/*
		 * LINUX_XUNSUP_EPOLL is set in pem->flags the first time we
		 * get here, so the message below is emitted only once for
		 * this emuldata.  The lock is released before logging.
		 */
		LINUX_PEM_XLOCK(pem);
		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
			pem->flags |= LINUX_XUNSUP_EPOLL;
			LINUX_PEM_XUNLOCK(pem);
			/*
			 * Diff hunk: the '-' line is the old call, the '+'
			 * line is its replacement, which drops the trailing
			 * newline from the format string.
			 */
-			linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n",
+			linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
			    levents);
		} else
			LINUX_PEM_XUNLOCK(pem);
		return (EINVAL);
	}

	return (0);
}

/*
 * Structure converting function from kevent to epoll. A kevent
 * flagged with EV_ERROR is reported as LINUX_EPOLLERR only; the
 * error code itself is not copied into the epoll event here.
*/ static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) { if ((kevent->flags & EV_ERROR) != 0) { l_event->events = LINUX_EPOLLERR; return; } /* XXX EPOLLPRI, EPOLLHUP */ switch (kevent->filter) { case EVFILT_READ: l_event->events = LINUX_EPOLLIN; if ((kevent->flags & EV_EOF) != 0) l_event->events |= LINUX_EPOLLRDHUP; break; case EVFILT_WRITE: l_event->events = LINUX_EPOLLOUT; break; } } /* * Copyout callback used by kevent. This converts kevent * events to epoll events and copies them back to the * userspace. This is also called on error on registering * of the filter. */ static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count) { struct epoll_copyout_args *args; struct linux_pemuldata *pem; struct epoll_emuldata *emd; struct epoll_event *eep; int error, fd, i; args = (struct epoll_copyout_args*) arg; eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); pem = pem_find(args->p); KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); LINUX_PEM_SLOCK(pem); emd = pem->epoll; KASSERT(emd != NULL, ("epoll proc epolldata not found.\n")); for (i = 0; i < count; i++) { kevent_to_epoll(&kevp[i], &eep[i]); fd = kevp[i].ident; KASSERT(fd <= emd->fdc, ("epoll user data vector" " is too small.\n")); eep[i].data = emd->udata[fd]; } LINUX_PEM_SUNLOCK(pem); error = copyout(eep, args->leventlist, count * sizeof(*eep)); if (error == 0) { args->leventlist += count; args->count += count; } else if (args->error == 0) args->error = error; free(eep, M_EPOLL); return (error); } /* * Copyin callback used by kevent. This copies already * converted filters from kernel memory to the kevent * internal kernel memory. Hence the memcpy instead of * copyin. 
*/ static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count) { struct epoll_copyin_args *args; args = (struct epoll_copyin_args*) arg; memcpy(kevp, args->changelist, count * sizeof(*kevp)); args->changelist += count; return (0); } /* * Load epoll filter, convert it to kevent filter * and load it into kevent subsystem. */ int linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) { struct file *epfp, *fp; struct epoll_copyin_args ciargs; struct kevent kev[2]; struct kevent_copyops k_ops = { &ciargs, NULL, epoll_kev_copyin}; struct epoll_event le; cap_rights_t rights; int nchanges = 0; int error; if (args->op != LINUX_EPOLL_CTL_DEL) { error = copyin(args->event, &le, sizeof(le)); if (error != 0) return (error); } error = fget(td, args->epfd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp); if (error != 0) return (error); if (epfp->f_type != DTYPE_KQUEUE) { error = EINVAL; goto leave1; } /* Protect user data vector from incorrectly supplied fd. */ error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp); if (error != 0) goto leave1; /* Linux disallows spying on himself */ if (epfp == fp) { error = EINVAL; goto leave0; } ciargs.changelist = kev; if (args->op != LINUX_EPOLL_CTL_DEL) { error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges); if (error != 0) goto leave0; } switch (args->op) { case LINUX_EPOLL_CTL_MOD: error = epoll_delete_all_events(td, epfp, args->fd); if (error != 0) goto leave0; break; case LINUX_EPOLL_CTL_ADD: if (epoll_fd_registered(td, epfp, args->fd)) { error = EEXIST; goto leave0; } break; case LINUX_EPOLL_CTL_DEL: /* CTL_DEL means unregister this fd with this epoll */ error = epoll_delete_all_events(td, epfp, args->fd); goto leave0; default: error = EINVAL; goto leave0; } epoll_fd_install(td, args->fd, le.data); error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); leave0: fdrop(fp, td); leave1: fdrop(epfp, td); return (error); } /* * Wait for a filter to be triggered on the epoll file 
descriptor. */ static int linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, int maxevents, int timeout, sigset_t *uset) { struct epoll_copyout_args coargs; struct kevent_copyops k_ops = { &coargs, epoll_kev_copyout, NULL}; struct timespec ts, *tsp; cap_rights_t rights; struct file *epfp; sigset_t omask; int error; if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) return (EINVAL); error = fget(td, epfd, cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp); if (error != 0) return (error); if (epfp->f_type != DTYPE_KQUEUE) { error = EINVAL; goto leave; } if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &omask, 0); if (error != 0) goto leave; td->td_pflags |= TDP_OLDMASK; /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. */ thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } coargs.leventlist = events; coargs.p = td->td_proc; coargs.count = 0; coargs.error = 0; /* * Linux epoll_wait(2) man page states that timeout of -1 causes caller * to block indefinitely. Real implementation does it if any negative * timeout value is passed. */ if (timeout >= 0) { /* Convert from milliseconds to timespec. */ ts.tv_sec = timeout / 1000; ts.tv_nsec = (timeout % 1000) * 1000000; tsp = &ts; } else { tsp = NULL; } error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); if (error == 0 && coargs.error != 0) error = coargs.error; /* * kern_kevent might return ENOMEM which is not expected from epoll_wait. * Maybe we should translate that but I don't think it matters at all. 
*/ if (error == 0) td->td_retval[0] = coargs.count; if (uset != NULL) error = kern_sigprocmask(td, SIG_SETMASK, &omask, NULL, 0); leave: fdrop(epfp, td); return (error); } #ifdef LINUX_LEGACY_SYSCALLS int linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) { return (linux_epoll_wait_common(td, args->epfd, args->events, args->maxevents, args->timeout, NULL)); } #endif int linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) { sigset_t mask, *pmask; l_sigset_t lmask; int error; if (args->mask != NULL) { if (args->sigsetsize != sizeof(l_sigset_t)) return (EINVAL); error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); if (error != 0) return (error); linux_to_bsd_sigset(&lmask, &mask); pmask = &mask; } else pmask = NULL; return (linux_epoll_wait_common(td, args->epfd, args->events, args->maxevents, args->timeout, pmask)); } static int epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter, unsigned int flags) { struct epoll_copyin_args ciargs; struct kevent kev; struct kevent_copyops k_ops = { &ciargs, NULL, epoll_kev_copyin}; ciargs.changelist = &kev; EV_SET(&kev, fd, filter, flags, 0, 0, 0); return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL)); } static int epoll_fd_registered(struct thread *td, struct file *epfp, int fd) { /* * Set empty filter flags to avoid accidental modification of already * registered events. In the case of event re-registration: * 1. If event does not exists kevent() does nothing and returns ENOENT * 2. If event does exists, it's enabled/disabled state is preserved * but fflags, data and udata fields are overwritten. So we can not * set socket lowats and store user's context pointer in udata. 
*/ if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT || epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT) return (1); return (0); } static int epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) { int error1, error2; error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE); error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE); /* return 0 if at least one result positive */ return (error1 == 0 ? 0 : error2); } static int eventfd_create(struct thread *td, uint32_t initval, int flags) { struct filedesc *fdp; struct eventfd *efd; struct file *fp; int fflags, fd, error; fflags = 0; if ((flags & LINUX_O_CLOEXEC) != 0) fflags |= O_CLOEXEC; fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd, fflags); if (error != 0) return (error); efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO); efd->efd_flags = flags; efd->efd_count = initval; mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); fflags = FREAD | FWRITE; if ((flags & LINUX_O_NONBLOCK) != 0) fflags |= FNONBLOCK; finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops); fdrop(fp, td); td->td_retval[0] = fd; return (error); } #ifdef LINUX_LEGACY_SYSCALLS int linux_eventfd(struct thread *td, struct linux_eventfd_args *args) { return (eventfd_create(td, args->initval, 0)); } #endif int linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) { if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0) return (EINVAL); return (eventfd_create(td, args->initval, args->flags)); } static int eventfd_close(struct file *fp, struct thread *td) { struct eventfd *efd; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (EINVAL); seldrain(&efd->efd_sel); knlist_destroy(&efd->efd_sel.si_note); fp->f_ops = &badfileops; mtx_destroy(&efd->efd_lock); free(efd, M_EPOLL); return (0); } static int eventfd_read(struct file *fp, struct uio *uio, struct ucred 
*active_cred, int flags, struct thread *td) { struct eventfd *efd; eventfd_t count; int error; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (EINVAL); if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = 0; mtx_lock(&efd->efd_lock); retry: if (efd->efd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0); if (error == 0) goto retry; } if (error == 0) { if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) { count = 1; --efd->efd_count; } else { count = efd->efd_count; efd->efd_count = 0; } KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); mtx_unlock(&efd->efd_lock); error = uiomove(&count, sizeof(eventfd_t), uio); } else mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct eventfd *efd; eventfd_t count; int error; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (EINVAL); if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = uiomove(&count, sizeof(eventfd_t), uio); if (error != 0) return (error); if (count == UINT64_MAX) return (EINVAL); mtx_lock(&efd->efd_lock); retry: if (UINT64_MAX - efd->efd_count <= count) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); /* Do not not return the number of bytes written */ uio->uio_resid += sizeof(eventfd_t); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdwr", 0); if (error == 0) goto retry; } if (error == 0) { efd->efd_count += count; KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); } mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct eventfd *efd; int revents = 0; efd = fp->f_data; if 
(fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (POLLERR);

	mtx_lock(&efd->efd_lock);
	/* Readable: there is something in the counter. */
	if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0)
		revents |= events & (POLLIN|POLLRDNORM);
	/* Writable: the counter is still below UINT64_MAX - 1. */
	if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count)
		revents |= events & (POLLOUT|POLLWRNORM);
	/* Nothing ready: register for the next selwakeup(). */
	if (revents == 0)
		selrecord(td, &efd->efd_sel);
	mtx_unlock(&efd->efd_lock);

	return (revents);
}

/*
 * kqueue attach handler for an eventfd file.
 *
 * Accepts EVFILT_READ and EVFILT_WRITE only (EINVAL otherwise), stores the
 * eventfd in kn_hook and links the knote into the eventfd's knote list.
 */
/*ARGSUSED*/
static int
eventfd_kqfilter(struct file *fp, struct knote *kn)
{
	struct eventfd *efd;

	efd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
		return (EINVAL);

	mtx_lock(&efd->efd_lock);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &eventfd_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &eventfd_wfiltops;
		break;
	default:
		mtx_unlock(&efd->efd_lock);
		return (EINVAL);
	}

	kn->kn_hook = efd;
	/* Last argument 1: efd_lock (the knlist lock) is already held. */
	knlist_add(&efd->efd_sel.si_note, kn, 1);
	mtx_unlock(&efd->efd_lock);

	return (0);
}

/* kqueue detach handler: unlink the knote from the eventfd's list. */
static void
filt_eventfddetach(struct knote *kn)
{
	struct eventfd *efd = kn->kn_hook;

	mtx_lock(&efd->efd_lock);
	knlist_remove(&efd->efd_sel.si_note, kn, 1);
	mtx_unlock(&efd->efd_lock);
}

/*
 * EVFILT_READ event test: non-zero while the counter is non-zero.
 * Must be called with efd_lock held (asserted).
 */
/*ARGSUSED*/
static int
filt_eventfdread(struct knote *kn, long hint)
{
	struct eventfd *efd = kn->kn_hook;
	int ret;

	mtx_assert(&efd->efd_lock, MA_OWNED);
	ret = (efd->efd_count > 0);

	return (ret);
}

/*
 * EVFILT_WRITE event test: non-zero while the counter is below
 * UINT64_MAX - 1.  Must be called with efd_lock held (asserted).
 */
/*ARGSUSED*/
static int
filt_eventfdwrite(struct knote *kn, long hint)
{
	struct eventfd *efd = kn->kn_hook;
	int ret;

	mtx_assert(&efd->efd_lock, MA_OWNED);
	ret = (UINT64_MAX - 1 > efd->efd_count);

	return (ret);
}

/*
 * ioctl handler; the type check accepts both eventfd and timerfd files.
 *
 * FIONBIO sets or clears FNONBLOCK in f_flag, FIOASYNC is accepted and
 * ignored, every other command yields ENXIO.
 */
/*ARGSUSED*/
static int
eventfd_ioctl(struct file *fp, u_long cmd, void *data,
    struct ucred *active_cred, struct thread *td)
{

	if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD &&
	    fp->f_type != DTYPE_LINUXTFD))
		return (EINVAL);

	switch (cmd)
	{
	case FIONBIO:
		if ((*(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		/* FALLTHROUGH: both commands report success. */
	case FIOASYNC:
		return (0);
	default:
		return (ENXIO);
	}
}

/* stat handler for an eventfd file: always fails with ENXIO. */
/*ARGSUSED*/
static int
eventfd_stat(struct file *fp, struct stat
*st, struct ucred *active_cred, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_UNKNOWN; return (0); } int linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args) { struct filedesc *fdp; struct timerfd *tfd; struct file *fp; clockid_t clockid; int fflags, fd, error; if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0) return (EINVAL); error = linux_to_native_clockid(&clockid, args->clockid); if (error != 0) return (error); if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) return (EINVAL); fflags = 0; if ((args->flags & LINUX_TFD_CLOEXEC) != 0) fflags |= O_CLOEXEC; fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd, fflags); if (error != 0) return (error); tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO); tfd->tfd_clockid = clockid; mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); fflags = FREAD; if ((args->flags & LINUX_O_NONBLOCK) != 0) fflags |= FNONBLOCK; finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops); fdrop(fp, td); td->td_retval[0] = fd; return (error); } static int timerfd_close(struct file *fp, struct thread *td) { struct timerfd *tfd; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); timespecclear(&tfd->tfd_time.it_value); timespecclear(&tfd->tfd_time.it_interval); mtx_lock(&tfd->tfd_lock); callout_drain(&tfd->tfd_callout); mtx_unlock(&tfd->tfd_lock); seldrain(&tfd->tfd_sel); knlist_destroy(&tfd->tfd_sel.si_note); fp->f_ops = &badfileops; mtx_destroy(&tfd->tfd_lock); free(tfd, M_EPOLL); return (0); } static int timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct timerfd *tfd; timerfd_t count; int error; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); 
if (uio->uio_resid < sizeof(timerfd_t))
		return (EINVAL);

	error = 0;
	mtx_lock(&tfd->tfd_lock);
retry:
	/* A canceled timer discards pending expirations and fails the read. */
	if (tfd->tfd_canceled) {
		tfd->tfd_count = 0;
		mtx_unlock(&tfd->tfd_lock);
		return (ECANCELED);
	}
	if (tfd->tfd_count == 0) {
		/* No expiration yet: fail fast for non-blocking files. */
		if ((fp->f_flag & FNONBLOCK) != 0) {
			mtx_unlock(&tfd->tfd_lock);
			return (EAGAIN);
		}
		/*
		 * Sleep on the counter address; linux_timerfd_expire()
		 * wakes it.  PCATCH lets a signal interrupt the sleep.
		 */
		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
		if (error == 0)
			goto retry;
	}
	if (error == 0) {
		/* Hand out and reset the expiration count. */
		count = tfd->tfd_count;
		tfd->tfd_count = 0;
		mtx_unlock(&tfd->tfd_lock);
		/* Copy out only after the lock has been dropped. */
		error = uiomove(&count, sizeof(timerfd_t), uio);
	} else
		mtx_unlock(&tfd->tfd_lock);

	return (error);
}

/*
 * Poll handler for a timerfd file: readable while the expiration count is
 * non-zero; otherwise the thread is recorded for a later selwakeup().
 */
static int
timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct timerfd *tfd;
	int revents = 0;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (POLLERR);

	mtx_lock(&tfd->tfd_lock);
	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
		revents |= events & (POLLIN|POLLRDNORM);
	if (revents == 0)
		selrecord(td, &tfd->tfd_sel);
	mtx_unlock(&tfd->tfd_lock);

	return (revents);
}

/*
 * kqueue attach handler for a timerfd file.  Only EVFILT_READ is
 * supported; any other filter yields EINVAL.
 */
/*ARGSUSED*/
static int
timerfd_kqfilter(struct file *fp, struct knote *kn)
{
	struct timerfd *tfd;

	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
		return (EINVAL);

	if (kn->kn_filter == EVFILT_READ)
		kn->kn_fop = &timerfd_rfiltops;
	else
		return (EINVAL);

	kn->kn_hook = tfd;
	/* Last argument 0: the knlist lock is not held by this caller. */
	knlist_add(&tfd->tfd_sel.si_note, kn, 0);

	return (0);
}

/* kqueue detach handler: unlink the knote from the timerfd's list. */
static void
filt_timerfddetach(struct knote *kn)
{
	struct timerfd *tfd = kn->kn_hook;

	mtx_lock(&tfd->tfd_lock);
	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
	mtx_unlock(&tfd->tfd_lock);
}

/* EVFILT_READ event test: non-zero while expirations are pending. */
/*ARGSUSED*/
static int
filt_timerfdread(struct knote *kn, long hint)
{
	struct timerfd *tfd = kn->kn_hook;

	return (tfd->tfd_count > 0);
}

/* stat handler for a timerfd file: always fails with ENXIO. */
/*ARGSUSED*/
static int
timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
    struct thread *td)
{

	return (ENXIO);
}

/* kinfo handler for a timerfd file: reports the type as unknown. */
/*ARGSUSED*/
static int
timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif,
    struct filedesc *fdp)
{

	kif->kf_type = KF_TYPE_UNKNOWN;
return (0);
}

/*
 * Store the current time of the timerfd's clock in *ts: getnanotime() for
 * CLOCK_REALTIME, getnanouptime() for anything else (CLOCK_MONOTONIC is
 * the only other clock linux_timerfd_create() accepts).
 */
static void
linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
{

	if (tfd->tfd_clockid == CLOCK_REALTIME)
		getnanotime(ts);
	else	/* CLOCK_MONOTONIC */
		getnanouptime(ts);
}

/*
 * Fill *ots with the timer's current setting, with it_value expressed as
 * the time remaining until the next expiration.
 *
 * A zero it_value (disarmed timer) is returned unchanged.  For an armed
 * timer whose expiration time has already been reached the remaining time
 * is clamped to one nanosecond, so it is never reported as zero.
 */
static void
linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
{
	struct timespec cts;

	linux_timerfd_clocktime(tfd, &cts);
	*ots = tfd->tfd_time;
	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
		/* tfd_time.it_value is an absolute time; make it relative. */
		timespecsub(&ots->it_value, &cts, &ots->it_value);
		if (ots->it_value.tv_sec < 0 ||
		    (ots->it_value.tv_sec == 0 &&
		     ots->it_value.tv_nsec == 0)) {
			ots->it_value.tv_sec  = 0;
			ots->it_value.tv_nsec = 1;
		}
	}
}

/*
 * timerfd_gettime(2): copy the timer's current setting (remaining time
 * and interval) out to args->old_value in Linux layout.
 *
 * Returns the fget() error, EINVAL if the descriptor is not a timerfd, or
 * the conversion/copyout error.
 */
int
linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
{
	struct l_itimerspec lots;
	struct itimerspec ots;
	struct timerfd *tfd;
	struct file *fp;
	int error;

	error = fget(td, args->fd, &cap_read_rights, &fp);
	if (error != 0)
		return (error);
	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
		error = EINVAL;
		goto out;
	}

	/* Snapshot under the lock; convert and copy out without it. */
	mtx_lock(&tfd->tfd_lock);
	linux_timerfd_curval(tfd, &ots);
	mtx_unlock(&tfd->tfd_lock);

	error = native_to_linux_itimerspec(&lots, &ots);
	if (error == 0)
		error = copyout(&lots, args->old_value, sizeof(lots));

out:
	/* Release the reference obtained by fget(). */
	fdrop(fp, td);
	return (error);
}

/*
 * timerfd_settime(2): arm or disarm the timer.
 *
 * A non-zero new it_value arms the timer, either relative to now or, with
 * LINUX_TFD_TIMER_ABSTIME, at an absolute time of the timerfd's clock.
 * A zero it_value disarms it.  If args->old_value is not NULL the previous
 * setting is copied out.
 */
int
linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
{
	struct l_itimerspec lots;
	struct itimerspec nts, ots;
	struct timespec cts, ts;
	struct timerfd *tfd;
	struct timeval tv;
	struct file *fp;
	int error;

	if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
		return (EINVAL);

	error = copyin(args->new_value, &lots, sizeof(lots));
	if (error != 0)
		return (error);
	error = linux_to_native_itimerspec(&nts, &lots);
	if (error != 0)
		return (error);

	error = fget(td, args->fd, &cap_write_rights, &fp);
	if (error != 0)
		return (error);
	tfd = fp->f_data;
	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
		error = EINVAL;
		goto out;
	}

	mtx_lock(&tfd->tfd_lock);
	/* An interval is meaningless without an initial expiration. */
	if (!timespecisset(&nts.it_value))
		timespecclear(&nts.it_interval);
	if (args->old_value
!= NULL)
		/* Capture the previous setting before overwriting it. */
		linux_timerfd_curval(tfd, &ots);

	tfd->tfd_time = nts;
	if (timespecisset(&nts.it_value)) {
		linux_timerfd_clocktime(tfd, &cts);
		ts = nts.it_value;
		if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
			/*
			 * Relative request: ts already is the delay; store
			 * the absolute expiration time in tfd_time.
			 */
			timespecadd(&tfd->tfd_time.it_value, &cts,
				&tfd->tfd_time.it_value);
		} else {
			/* Absolute request: derive the delay from now. */
			timespecsub(&ts, &cts, &ts);
		}
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
			linux_timerfd_expire, tfd);
		tfd->tfd_canceled = false;
	} else {
		/*
		 * Disarm.  tfd_canceled makes timerfd_read() return
		 * ECANCELED until the timer is armed again.
		 * NOTE(review): confirm this matches the intended Linux
		 * semantics for reading a disarmed timer.
		 */
		tfd->tfd_canceled = true;
		callout_stop(&tfd->tfd_callout);
	}
	mtx_unlock(&tfd->tfd_lock);

	if (args->old_value != NULL) {
		error = native_to_linux_itimerspec(&lots, &ots);
		if (error == 0)
			error = copyout(&lots, args->old_value, sizeof(lots));
	}

out:
	/* Release the reference obtained by fget(). */
	fdrop(fp, td);
	return (error);
}

/*
 * Callout handler of a timerfd; arg is the struct timerfd.
 *
 * If the expiration time has been reached, one expiration is counted, the
 * next expiration is scheduled for periodic timers (single-shot timers are
 * disarmed) and readers, select and kqueue are notified.  If the callout
 * fired before the expiration time, it is simply re-armed for the
 * remaining time.
 *
 * KNOTE_LOCKED() is used, so the knote list lock (tfd_lock, the lock the
 * callout was initialized with) is expected to be held on entry.
 */
static void
linux_timerfd_expire(void *arg)
{
	struct timespec cts, ts;
	struct timeval tv;
	struct timerfd *tfd;

	tfd = (struct timerfd *)arg;

	linux_timerfd_clocktime(tfd, &cts);
	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
		if (timespecisset(&tfd->tfd_time.it_interval))
			/* Periodic: advance the expiration by one interval. */
			timespecadd(&tfd->tfd_time.it_value,
				    &tfd->tfd_time.it_interval,
				    &tfd->tfd_time.it_value);
		else
			/* single shot timer */
			timespecclear(&tfd->tfd_time.it_value);
		if (timespecisset(&tfd->tfd_time.it_value)) {
			/* Re-arm for the time left until the next expiration. */
			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
			TIMESPEC_TO_TIMEVAL(&tv, &ts);
			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
				linux_timerfd_expire, tfd);
		}
		tfd->tfd_count++;
		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
		selwakeup(&tfd->tfd_sel);
		wakeup(&tfd->tfd_count);
	} else if (timespecisset(&tfd->tfd_time.it_value)) {
		/* Fired early: sleep for the remainder. */
		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
		TIMESPEC_TO_TIMEVAL(&tv, &ts);
		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
		    linux_timerfd_expire, tfd);
	}
}
Index: head/sys/compat/linux/linux_futex.c =================================================================== --- head/sys/compat/linux/linux_futex.c (revision 362058) +++ head/sys/compat/linux/linux_futex.c (revision 362059) @@ -1,1319 +1,1319 @@ /* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49
manu Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2009-2016 Dmitry Chagin * Copyright (c) 2005 Emmanuel Dreyfus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Emmanuel Dreyfus * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #if 0 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $"); #endif #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); /** * Futex part for the special DTrace module "locks". */ LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, locked, "struct mtx *"); LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, unlock, "struct mtx *"); /** * Per futex probes. */ LIN_SDT_PROBE_DEFINE1(futex, futex, create, "struct sx *"); LIN_SDT_PROBE_DEFINE1(futex, futex, destroy, "struct sx *"); /** * DTrace probes in this module. */ LIN_SDT_PROBE_DEFINE2(futex, futex_put, entry, "struct futex *", "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE3(futex, futex_put, destroy, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_put, unlock, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE0(futex, futex_put, return); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, entry, "uint32_t *", "struct futex **", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, umtx_key_get_error, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, shared, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, null, "uint32_t *"); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, new, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_get, entry, "uint32_t *", "struct waiting_proc **", "struct futex **"); LIN_SDT_PROBE_DEFINE0(futex, futex_get, error); LIN_SDT_PROBE_DEFINE1(futex, futex_get, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, entry, "struct futex *", "struct waiting_proc **", "struct timespec *"); LIN_SDT_PROBE_DEFINE5(futex, futex_sleep, requeue_error, "int", "uint32_t *", 
"struct waiting_proc *", "uint32_t *", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, sleep_error, "int", "uint32_t *", "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE1(futex, futex_sleep, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_wake, entry, "struct futex *", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, futex_wake, iterate, "uint32_t", "struct waiting_proc *", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_wake, wakeup, "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE1(futex, futex_wake, return, "int"); LIN_SDT_PROBE_DEFINE4(futex, futex_requeue, entry, "struct futex *", "int", "struct futex *", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, wakeup, "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE3(futex, futex_requeue, requeue, "uint32_t *", "struct waiting_proc *", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, return, "int"); LIN_SDT_PROBE_DEFINE4(futex, futex_wait, entry, "struct futex *", "struct waiting_proc **", "struct timespec *", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_wait, sleep_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_wait, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_atomic_op, entry, "struct thread *", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE4(futex, futex_atomic_op, decoded_op, "int", "int", "int", "int"); LIN_SDT_PROBE_DEFINE0(futex, futex_atomic_op, missing_access_check); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_op, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_cmp, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, entry, "struct thread *", "struct linux_sys_futex_args *"); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_clockswitch); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, copyin_error, "int"); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, invalid_cmp_requeue_use); LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wait, "uint32_t *", "uint32_t", "uint32_t"); 
LIN_SDT_PROBE_DEFINE4(futex, linux_sys_futex, debug_wait_value_neq, "uint32_t *", "uint32_t", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wake, "uint32_t *", "uint32_t", "uint32_t"); LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_cmp_requeue, "uint32_t *", "uint32_t", "uint32_t", "uint32_t *", "struct l_timespec *"); LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq, "uint32_t", "int"); LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_wake_op, "uint32_t *", "int", "uint32_t", "uint32_t *", "uint32_t"); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unhandled_efault); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_lock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_unlock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_trylock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, deprecated_requeue); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, unknown_operation, "int"); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, linux_set_robust_list, entry, "struct thread *", "struct linux_set_robust_list_args *"); LIN_SDT_PROBE_DEFINE0(futex, linux_set_robust_list, size_error); LIN_SDT_PROBE_DEFINE1(futex, linux_set_robust_list, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, linux_get_robust_list, entry, "struct thread *", "struct linux_get_robust_list_args *"); LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, copyout_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, handle_futex_death, entry, "struct linux_emuldata *", "uint32_t *", "unsigned int"); LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, 
fetch_robust_entry, entry,
    "struct linux_robust_list **", "struct linux_robust_list **",
    "unsigned int *");
LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, copyin_error, "int");
LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, return, "int");
LIN_SDT_PROBE_DEFINE2(futex, release_futexes, entry, "struct thread *",
    "struct linux_emuldata *");
LIN_SDT_PROBE_DEFINE1(futex, release_futexes, copyin_error, "int");
LIN_SDT_PROBE_DEFINE0(futex, release_futexes, return);

struct futex;

/*
 * One sleeping thread.  It is linked into the f_waiting_proc list of the
 * futex it waits on and is also the channel the thread sleeps on.
 */
struct waiting_proc {
	uint32_t	wp_flags;	/* FUTEX_WP_* flags */
	struct futex	*wp_futex;	/* futex this waiter is queued on */
	TAILQ_ENTRY(waiting_proc) wp_list;
};

/*
 * Kernel-side state of one futex word.  Futexes live on the global
 * futex_list and are reference counted through f_refcount.
 */
struct futex {
	struct mtx	f_lck;		/* per-futex lock */
	uint32_t	*f_uaddr;	/* user-supplied value, for debug */
	struct umtx_key	f_key;		/* identity used for lookup */
	uint32_t	f_refcount;	/* changed under FUTEXES_LOCK */
	uint32_t	f_bitset;	/* bitset of the last wait */
	LIST_ENTRY(futex) f_list;	/* link in futex_list */
	TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc;
};

/* Per-futex lock helpers. */
#define	FUTEX_LOCK(f)		mtx_lock(&(f)->f_lck)
#define	FUTEX_LOCKED(f)		mtx_owned(&(f)->f_lck)
#define	FUTEX_UNLOCK(f)		mtx_unlock(&(f)->f_lck)
#define FUTEX_INIT(f)		do { \
				    mtx_init(&(f)->f_lck, "ftlk", NULL, \
					MTX_DUPOK); \
				    LIN_SDT_PROBE1(futex, futex, create, \
					&(f)->f_lck); \
				} while (0)
#define FUTEX_DESTROY(f)	do { \
				    LIN_SDT_PROBE1(futex, futex, destroy, \
					&(f)->f_lck); \
				    mtx_destroy(&(f)->f_lck); \
				} while (0)
#define FUTEX_ASSERT_LOCKED(f)	mtx_assert(&(f)->f_lck, MA_OWNED)
#define FUTEX_ASSERT_UNLOCKED(f) mtx_assert(&(f)->f_lck, MA_NOTOWNED)

/* Global futex list lock helpers (futex_mtx), with DTrace probes. */
#define FUTEXES_LOCK		do { \
				    mtx_lock(&futex_mtx); \
				    LIN_SDT_PROBE1(locks, futex_mtx, \
					locked, &futex_mtx); \
				} while (0)
#define FUTEXES_UNLOCK		do { \
				    LIN_SDT_PROBE1(locks, futex_mtx, \
					unlock, &futex_mtx); \
				    mtx_unlock(&futex_mtx); \
				} while (0)

/* flags for futex_get() */
#define FUTEX_CREATE_WP		0x1	/* create waiting_proc */
#define FUTEX_DONTCREATE	0x2	/* don't create futex if not exists */
#define FUTEX_DONTEXISTS	0x4	/* return EINVAL if futex exists */
#define	FUTEX_SHARED		0x8	/* shared futex */
#define	FUTEX_DONTLOCK		0x10	/* don't lock futex */

/* wp_flags */
#define FUTEX_WP_REQUEUED	0x1	/* wp requeued - wp
moved from wp_list * of futex where thread sleep to wp_list * of another futex. */ #define FUTEX_WP_REMOVED 0x2 /* wp is woken up and removed from futex * wp_list to prevent double wakeup. */ static void futex_put(struct futex *, struct waiting_proc *); static int futex_get0(uint32_t *, struct futex **f, uint32_t); static int futex_get(uint32_t *, struct waiting_proc **, struct futex **, uint32_t); static int futex_sleep(struct futex *, struct waiting_proc *, struct timespec *); static int futex_wake(struct futex *, int, uint32_t); static int futex_requeue(struct futex *, int, struct futex *, int); static int futex_copyin_timeout(int, struct l_timespec *, int, struct timespec *); static int futex_wait(struct futex *, struct waiting_proc *, struct timespec *, uint32_t); static void futex_lock(struct futex *); static void futex_unlock(struct futex *); static int futex_atomic_op(struct thread *, int, uint32_t *); static int handle_futex_death(struct linux_emuldata *, uint32_t *, unsigned int); static int fetch_robust_entry(struct linux_robust_list **, struct linux_robust_list **, unsigned int *); static int futex_copyin_timeout(int op, struct l_timespec *luts, int clockrt, struct timespec *ts) { struct l_timespec lts; struct timespec kts; int error; error = copyin(luts, <s, sizeof(lts)); if (error) return (error); error = linux_to_native_timespec(ts, <s); if (error) return (error); if (clockrt) { nanotime(&kts); timespecsub(ts, &kts, ts); } else if (op == LINUX_FUTEX_WAIT_BITSET) { nanouptime(&kts); timespecsub(ts, &kts, ts); } return (error); } static void futex_put(struct futex *f, struct waiting_proc *wp) { LIN_SDT_PROBE2(futex, futex_put, entry, f, wp); if (wp != NULL) { if ((wp->wp_flags & FUTEX_WP_REMOVED) == 0) TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); free(wp, M_FUTEX_WP); } FUTEXES_LOCK; if (--f->f_refcount == 0) { LIST_REMOVE(f, f_list); FUTEXES_UNLOCK; if (FUTEX_LOCKED(f)) futex_unlock(f); LIN_SDT_PROBE3(futex, futex_put, destroy, f->f_uaddr, 
f->f_refcount, f->f_key.shared);
		LINUX_CTR3(sys_futex, "futex_put destroy uaddr %p ref %d "
		    "shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared);
		umtx_key_release(&f->f_key);
		FUTEX_DESTROY(f);
		free(f, M_FUTEX);

		LIN_SDT_PROBE0(futex, futex_put, return);
		return;
	}

	/* Other references remain: just give up the lock, if held. */
	LIN_SDT_PROBE3(futex, futex_put, unlock, f->f_uaddr, f->f_refcount,
	    f->f_key.shared);
	LINUX_CTR3(sys_futex, "futex_put uaddr %p ref %d shared %d",
	    f->f_uaddr, f->f_refcount, f->f_key.shared);
	if (FUTEX_LOCKED(f))
		futex_unlock(f);
	FUTEXES_UNLOCK;

	LIN_SDT_PROBE0(futex, futex_put, return);
}

/*
 * Look up, and optionally create, the futex for user address uaddr.
 *
 * flags is a combination of:
 *   FUTEX_SHARED	use an AUTO_SHARE key instead of THREAD_SHARE;
 *   FUTEX_DONTCREATE	return 0 with *newf == NULL if no futex exists;
 *   FUTEX_DONTEXISTS	return EINVAL if the futex already exists;
 *   FUTEX_DONTLOCK	do not lock the returned futex.
 *
 * On success *newf holds a referenced futex, locked unless FUTEX_DONTLOCK
 * was given (or NULL, see FUTEX_DONTCREATE).  Returns 0, EINVAL, or the
 * error of umtx_key_get().
 */
static int
futex_get0(uint32_t *uaddr, struct futex **newf, uint32_t flags)
{
	struct futex *f, *tmpf;
	struct umtx_key key;
	int error;

	LIN_SDT_PROBE3(futex, futex_get0, entry, uaddr, newf, flags);

	*newf = tmpf = NULL;

	error = umtx_key_get(uaddr, TYPE_FUTEX, (flags & FUTEX_SHARED) ?
	    AUTO_SHARE : THREAD_SHARE, &key);
	if (error) {
		LIN_SDT_PROBE1(futex, futex_get0, umtx_key_get_error, error);
		LIN_SDT_PROBE1(futex, futex_get0, return, error);
		return (error);
	}
retry:
	FUTEXES_LOCK;
	LIST_FOREACH(f, &futex_list, f_list) {
		if (umtx_key_match(&f->f_key, &key)) {
			/*
			 * Found an existing futex.  If a candidate was
			 * allocated on a previous pass (the list lock was
			 * dropped meanwhile), it is no longer needed.
			 */
			if (tmpf != NULL) {
				if (FUTEX_LOCKED(tmpf))
					futex_unlock(tmpf);
				FUTEX_DESTROY(tmpf);
				free(tmpf, M_FUTEX);
			}
			if (flags & FUTEX_DONTEXISTS) {
				FUTEXES_UNLOCK;
				umtx_key_release(&key);

				LIN_SDT_PROBE1(futex, futex_get0, return,
				    EINVAL);
				return (EINVAL);
			}

			/*
			 * Increment refcount of the found futex to
			 * prevent it from deallocation before FUTEX_LOCK()
			 */
			++f->f_refcount;
			FUTEXES_UNLOCK;
			/* The existing futex keeps its own key. */
			umtx_key_release(&key);

			if ((flags & FUTEX_DONTLOCK) == 0)
				futex_lock(f);
			*newf = f;
			LIN_SDT_PROBE3(futex, futex_get0, shared, uaddr,
			    f->f_refcount, f->f_key.shared);
			LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d",
			    uaddr, f->f_refcount, f->f_key.shared);

			LIN_SDT_PROBE1(futex, futex_get0, return, 0);
			return (0);
		}
	}

	/* No futex for this key and the caller does not want a new one. */
	if (flags & FUTEX_DONTCREATE) {
		FUTEXES_UNLOCK;
		umtx_key_release(&key);
		LIN_SDT_PROBE1(futex, futex_get0, null, uaddr);
		LINUX_CTR1(sys_futex, "futex_get uaddr %p null",
uaddr);

		LIN_SDT_PROBE1(futex, futex_get0, return, 0);
		return (0);
	}

	if (tmpf == NULL) {
		/*
		 * First pass: drop the list lock for the M_WAITOK
		 * allocation, build a candidate and search again, since the
		 * futex may have been created by someone else meanwhile.
		 */
		FUTEXES_UNLOCK;
		tmpf = malloc(sizeof(*tmpf), M_FUTEX, M_WAITOK | M_ZERO);
		tmpf->f_uaddr = uaddr;
		tmpf->f_key = key;
		tmpf->f_refcount = 1;
		tmpf->f_bitset = FUTEX_BITSET_MATCH_ANY;
		FUTEX_INIT(tmpf);
		TAILQ_INIT(&tmpf->f_waiting_proc);

		/*
		 * Lock the new futex before an insert into the futex_list
		 * to prevent futex usage by other.
		 */
		if ((flags & FUTEX_DONTLOCK) == 0)
			futex_lock(tmpf);
		goto retry;
	}

	/* Second pass and still not found: publish the candidate. */
	LIST_INSERT_HEAD(&futex_list, tmpf, f_list);
	FUTEXES_UNLOCK;

	LIN_SDT_PROBE3(futex, futex_get0, new, uaddr, tmpf->f_refcount,
	    tmpf->f_key.shared);
	LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d new",
	    uaddr, tmpf->f_refcount, tmpf->f_key.shared);
	*newf = tmpf;

	LIN_SDT_PROBE1(futex, futex_get0, return, 0);
	return (0);
}

/*
 * Wrapper around futex_get0() that can also allocate a waiter.
 *
 * With FUTEX_CREATE_WP a struct waiting_proc is allocated, returned in
 * *wp and queued at the head of the futex's waiting list; it is freed
 * again if futex_get0() fails.  Without that flag wp is not touched and
 * may be NULL.  All other flags are passed on to futex_get0().
 */
static int
futex_get(uint32_t *uaddr, struct waiting_proc **wp, struct futex **f,
    uint32_t flags)
{
	int error;

	LIN_SDT_PROBE3(futex, futex_get, entry, uaddr, wp, f);

	/* Allocate up front, before any futex lock is taken. */
	if (flags & FUTEX_CREATE_WP) {
		*wp = malloc(sizeof(struct waiting_proc), M_FUTEX_WP, M_WAITOK);
		(*wp)->wp_flags = 0;
	}
	error = futex_get0(uaddr, f, flags);
	if (error) {
		LIN_SDT_PROBE0(futex, futex_get, error);

		if (flags & FUTEX_CREATE_WP)
			free(*wp, M_FUTEX_WP);

		LIN_SDT_PROBE1(futex, futex_get, return, error);
		return (error);
	}
	if (flags & FUTEX_CREATE_WP) {
		TAILQ_INSERT_HEAD(&(*f)->f_waiting_proc, *wp, wp_list);
		(*wp)->wp_futex = *f;
	}

	LIN_SDT_PROBE1(futex, futex_get, return, error);
	return (error);
}

/* Lock futex f; asserts that the caller does not already own the lock. */
static inline void
futex_lock(struct futex *f)
{

	LINUX_CTR3(sys_futex, "futex_lock uaddr %p ref %d shared %d",
	    f->f_uaddr, f->f_refcount, f->f_key.shared);
	FUTEX_ASSERT_UNLOCKED(f);
	FUTEX_LOCK(f);
}

/* Unlock futex f; asserts that the caller owns the lock. */
static inline void
futex_unlock(struct futex *f)
{

	LINUX_CTR3(sys_futex, "futex_unlock uaddr %p ref %d shared %d",
	    f->f_uaddr, f->f_refcount, f->f_key.shared);
	FUTEX_ASSERT_LOCKED(f);
	FUTEX_UNLOCK(f);
}

/*
 * Sleep on waiter wp of the locked futex f, for at most the relative
 * timeout *ts (no timeout if ts is NULL).
 *
 * The function always consumes the caller's futex reference and wp: both
 * are released with futex_put() before returning.  If the waiter was
 * requeued while asleep, the reference on the original futex is dropped
 * and the futex it was moved to is used instead.
 *
 * Returns the result of msleep_sbt().
 */
static int
futex_sleep(struct futex *f, struct waiting_proc *wp, struct timespec *ts)
{
	struct timespec
uts;
	sbintime_t sbt, prec, tmp;
	time_t over;
	int error;

	FUTEX_ASSERT_LOCKED(f);
	if (ts != NULL) {
		uts = *ts;
		/* Clamp the seconds to INT32_MAX / 2 before converting. */
		if (uts.tv_sec > INT32_MAX / 2) {
			over = uts.tv_sec - INT32_MAX / 2;
			uts.tv_sec -= over;
		}
		tmp = tstosbt(uts);
		/*
		 * TIMESEL() stores the current time in sbt; adding the
		 * relative timeout yields the absolute deadline passed to
		 * msleep_sbt() with C_ABSOLUTE below.
		 * NOTE(review): tc_tick_sbt is added when TIMESEL() returns
		 * non-zero, presumably to compensate for a coarser time
		 * source - confirm against its definition.
		 */
		if (TIMESEL(&sbt, tmp))
			sbt += tc_tick_sbt;
		sbt += tmp;
		/* Allowed imprecision, scaled from the timeout length. */
		prec = tmp;
		prec >>= tc_precexp;
	} else {
		sbt = 0;
		prec = 0;
	}
	LIN_SDT_PROBE3(futex, futex_sleep, entry, f, wp, sbt);
	LINUX_CTR4(sys_futex, "futex_sleep enter uaddr %p wp %p timo %ld ref %d",
	    f->f_uaddr, wp, sbt, f->f_refcount);

	/* wp is the sleep channel; wakers call wakeup_one(wp). */
	error = msleep_sbt(wp, &f->f_lck, PCATCH, "futex", sbt, prec, C_ABSOLUTE);
	if (wp->wp_flags & FUTEX_WP_REQUEUED) {
		KASSERT(f != wp->wp_futex, ("futex != wp_futex"));

		if (error) {
			LIN_SDT_PROBE5(futex, futex_sleep, requeue_error, error,
			    f->f_uaddr, wp, wp->wp_futex->f_uaddr,
			    wp->wp_futex->f_refcount);
		}

		LINUX_CTR5(sys_futex, "futex_sleep out error %d uaddr %p wp"
		    " %p requeued uaddr %p ref %d",
		    error, f->f_uaddr, wp, wp->wp_futex->f_uaddr,
		    wp->wp_futex->f_refcount);
		/*
		 * We were moved to another futex while asleep: drop the
		 * original one and continue with the new one, which
		 * futex_requeue() referenced on our behalf.
		 */
		futex_put(f, NULL);
		f = wp->wp_futex;
		futex_lock(f);
	} else {
		if (error) {
			LIN_SDT_PROBE3(futex, futex_sleep, sleep_error, error,
			    f->f_uaddr, wp);
		}
		LINUX_CTR3(sys_futex, "futex_sleep out error %d uaddr %p wp %p",
		    error, f->f_uaddr, wp);
	}

	/* Release the futex reference and free the waiter. */
	futex_put(f, wp);

	LIN_SDT_PROBE1(futex, futex_sleep, return, error);
	return (error);
}

/*
 * Wake up to n threads waiting on the locked futex f.
 *
 * A waiter is only woken if the f_bitset of the futex it is queued on has
 * a bit in common with bitset.  Woken waiters are unlinked and flagged
 * FUTEX_WP_REMOVED.
 *
 * Returns the number of threads woken, or EINVAL if bitset is 0.
 * NOTE(review): the EINVAL result shares the return channel with the
 * count, so callers cannot tell it from 22 woken threads.
 */
static int
futex_wake(struct futex *f, int n, uint32_t bitset)
{
	struct waiting_proc *wp, *wpt;
	int count = 0;

	LIN_SDT_PROBE3(futex, futex_wake, entry, f, n, bitset);

	if (bitset == 0) {
		LIN_SDT_PROBE1(futex, futex_wake, return, EINVAL);
		return (EINVAL);
	}

	FUTEX_ASSERT_LOCKED(f);
	/* _SAFE variant: entries are removed while iterating. */
	TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) {
		LIN_SDT_PROBE3(futex, futex_wake, iterate, f->f_uaddr, wp,
		    f->f_refcount);
		LINUX_CTR3(sys_futex, "futex_wake uaddr %p wp %p ref %d",
		    f->f_uaddr, wp, f->f_refcount);
		/*
		 * Unless we find a matching bit in
		 * the bitset, continue searching.
*/
		if (!(wp->wp_futex->f_bitset & bitset))
			continue;

		/* Mark as removed so futex_put() does not unlink it again. */
		wp->wp_flags |= FUTEX_WP_REMOVED;
		TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
		LIN_SDT_PROBE1(futex, futex_wake, wakeup, wp);
		wakeup_one(wp);
		if (++count == n)
			break;
	}

	LIN_SDT_PROBE1(futex, futex_wake, return, count);
	return (count);
}

/*
 * Wake the first n waiters of futex f and move up to n2 of the following
 * waiters to futex f2.  Both futexes must be locked by the caller.
 *
 * Requeued waiters are flagged FUTEX_WP_REQUEUED, queued at the head of
 * f2's waiting list, and f2 gains one reference per moved waiter.
 *
 * Returns the number of waiters visited, i.e. woken plus requeued.
 */
static int
futex_requeue(struct futex *f, int n, struct futex *f2, int n2)
{
	struct waiting_proc *wp, *wpt;
	int count = 0;

	LIN_SDT_PROBE4(futex, futex_requeue, entry, f, n, f2, n2);

	FUTEX_ASSERT_LOCKED(f);
	FUTEX_ASSERT_LOCKED(f2);

	/* _SAFE variant: entries are removed while iterating. */
	TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) {
		if (++count <= n) {
			/* Within the first n waiters: wake it up. */
			LINUX_CTR2(sys_futex, "futex_req_wake uaddr %p wp %p",
			    f->f_uaddr, wp);
			wp->wp_flags |= FUTEX_WP_REMOVED;
			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
			LIN_SDT_PROBE1(futex, futex_requeue, wakeup, wp);
			wakeup_one(wp);
		} else {
			LIN_SDT_PROBE3(futex, futex_requeue, requeue,
			    f->f_uaddr, wp, f2->f_uaddr);
			LINUX_CTR3(sys_futex, "futex_requeue uaddr %p wp %p to %p",
			    f->f_uaddr, wp, f2->f_uaddr);
			wp->wp_flags |= FUTEX_WP_REQUEUED;
			/* Move wp to wp_list of f2 futex */
			TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
			TAILQ_INSERT_HEAD(&f2->f_waiting_proc, wp, wp_list);

			/*
			 * Thread which sleeps on wp after waking should
			 * acquire f2 lock, so increment refcount of f2 to
			 * prevent it from premature deallocation.
*/ wp->wp_futex = f2; FUTEXES_LOCK; ++f2->f_refcount; FUTEXES_UNLOCK; if (count - n >= n2) break; } } LIN_SDT_PROBE1(futex, futex_requeue, return, count); return (count); } static int futex_wait(struct futex *f, struct waiting_proc *wp, struct timespec *ts, uint32_t bitset) { int error; LIN_SDT_PROBE4(futex, futex_wait, entry, f, wp, ts, bitset); if (bitset == 0) { LIN_SDT_PROBE1(futex, futex_wait, return, EINVAL); return (EINVAL); } f->f_bitset = bitset; error = futex_sleep(f, wp, ts); if (error) LIN_SDT_PROBE1(futex, futex_wait, sleep_error, error); if (error == EWOULDBLOCK) error = ETIMEDOUT; LIN_SDT_PROBE1(futex, futex_wait, return, error); return (error); } static int futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; int oparg = (encoded_op << 8) >> 20; int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; LIN_SDT_PROBE3(futex, futex_atomic_op, entry, td, encoded_op, uaddr); if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; LIN_SDT_PROBE4(futex, futex_atomic_op, decoded_op, op, cmp, oparg, cmparg); /* XXX: Linux verifies access here and returns EFAULT */ LIN_SDT_PROBE0(futex, futex_atomic_op, missing_access_check); switch (op) { case FUTEX_OP_SET: ret = futex_xchgl(oparg, uaddr, &oldval); break; case FUTEX_OP_ADD: ret = futex_addl(oparg, uaddr, &oldval); break; case FUTEX_OP_OR: ret = futex_orl(oparg, uaddr, &oldval); break; case FUTEX_OP_ANDN: ret = futex_andl(~oparg, uaddr, &oldval); break; case FUTEX_OP_XOR: ret = futex_xorl(oparg, uaddr, &oldval); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_op, op); ret = -ENOSYS; break; } if (ret) { LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret); return (ret); } switch (cmp) { case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; case FUTEX_OP_CMP_GE: ret = (oldval >= 
cmparg); break; case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_cmp, cmp); ret = -ENOSYS; } LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret); return (ret); } int linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) { int clockrt, nrwake, op_ret, ret; struct linux_pemuldata *pem; struct waiting_proc *wp; struct futex *f, *f2; struct timespec uts, *ts; int error, save; uint32_t flags, val; LIN_SDT_PROBE2(futex, linux_sys_futex, entry, td, args); if (args->op & LINUX_FUTEX_PRIVATE_FLAG) { flags = 0; args->op &= ~LINUX_FUTEX_PRIVATE_FLAG; } else flags = FUTEX_SHARED; /* * Currently support for switching between CLOCK_MONOTONIC and * CLOCK_REALTIME is not present. However Linux forbids the use of * FUTEX_CLOCK_REALTIME with any op except FUTEX_WAIT_BITSET and * FUTEX_WAIT_REQUEUE_PI. */ clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME; args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME; if (clockrt && args->op != LINUX_FUTEX_WAIT_BITSET && args->op != LINUX_FUTEX_WAIT_REQUEUE_PI) { LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_clockswitch); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); } error = 0; f = f2 = NULL; switch (args->op) { case LINUX_FUTEX_WAIT: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAIT_BITSET: LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wait, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); if (args->timeout != NULL) { error = futex_copyin_timeout(args->op, args->timeout, clockrt, &uts); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error, error); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } ts = &uts; } else ts = NULL; retry0: error = futex_get(args->uaddr, &wp, &f, flags | FUTEX_CREATE_WP); if (error) { LIN_SDT_PROBE1(futex, 
linux_sys_futex, return, error); return (error); } error = copyin_nofault(args->uaddr, &val, sizeof(val)); if (error) { futex_put(f, wp); error = copyin(args->uaddr, &val, sizeof(val)); if (error == 0) goto retry0; LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error, error); LINUX_CTR1(sys_futex, "WAIT copyin failed %d", error); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (val != args->val) { LIN_SDT_PROBE4(futex, linux_sys_futex, debug_wait_value_neq, args->uaddr, args->val, val, args->val3); LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x != uval 0x%x", args->uaddr, args->val, val); futex_put(f, wp); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EWOULDBLOCK); return (EWOULDBLOCK); } error = futex_wait(f, wp, ts, args->val3); break; case LINUX_FUTEX_WAKE: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAKE_BITSET: LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wake, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); error = futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTCREATE); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (f == NULL) { td->td_retval[0] = 0; LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } td->td_retval[0] = futex_wake(f, args->val, args->val3); futex_put(f, NULL); break; case LINUX_FUTEX_CMP_REQUEUE: LIN_SDT_PROBE5(futex, linux_sys_futex, debug_cmp_requeue, args->uaddr, args->val, args->val3, args->uaddr2, args->timeout); LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p " "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x", args->uaddr, args->val, args->val3, args->uaddr2, args->timeout); /* * Linux allows this, we would not, it is an incorrect * usage of declared ABI, so return EINVAL. 
*/ if (args->uaddr == args->uaddr2) { LIN_SDT_PROBE0(futex, linux_sys_futex, invalid_cmp_requeue_use); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL); return (EINVAL); } retry1: error = futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTLOCK); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } /* * To avoid deadlocks return EINVAL if second futex * exists at this time. * * Glibc fall back to FUTEX_WAKE in case of any error * returned by FUTEX_CMP_REQUEUE. */ error = futex_get(args->uaddr2, NULL, &f2, flags | FUTEX_DONTEXISTS | FUTEX_DONTLOCK); if (error) { futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } futex_lock(f); futex_lock(f2); error = copyin_nofault(args->uaddr, &val, sizeof(val)); if (error) { futex_put(f2, NULL); futex_put(f, NULL); error = copyin(args->uaddr, &val, sizeof(val)); if (error == 0) goto retry1; LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error, error); LINUX_CTR1(sys_futex, "CMP_REQUEUE copyin failed %d", error); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (val != args->val3) { LIN_SDT_PROBE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq, args->val, val); LINUX_CTR2(sys_futex, "CMP_REQUEUE val 0x%x != uval 0x%x", args->val, val); futex_put(f2, NULL); futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EAGAIN); return (EAGAIN); } nrwake = (int)(unsigned long)args->timeout; td->td_retval[0] = futex_requeue(f, args->val, f2, nrwake); futex_put(f2, NULL); futex_put(f, NULL); break; case LINUX_FUTEX_WAKE_OP: LIN_SDT_PROBE5(futex, linux_sys_futex, debug_wake_op, args->uaddr, args->op, args->val, args->uaddr2, args->val3); LINUX_CTR5(sys_futex, "WAKE_OP " "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x", args->uaddr, args->val, args->uaddr2, args->val3, args->timeout); if (args->uaddr == args->uaddr2) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL); return (EINVAL); } retry2: error = 
futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTLOCK); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } error = futex_get(args->uaddr2, NULL, &f2, flags | FUTEX_DONTLOCK); if (error) { futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } futex_lock(f); futex_lock(f2); /* * This function returns positive number as results and * negative as errors */ save = vm_fault_disable_pagefaults(); op_ret = futex_atomic_op(td, args->val3, args->uaddr2); vm_fault_enable_pagefaults(save); LINUX_CTR2(sys_futex, "WAKE_OP atomic_op uaddr %p ret 0x%x", args->uaddr, op_ret); if (op_ret < 0) { if (f2 != NULL) futex_put(f2, NULL); futex_put(f, NULL); error = copyin(args->uaddr2, &val, sizeof(val)); if (error == 0) goto retry2; LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } ret = futex_wake(f, args->val, args->val3); if (op_ret > 0) { op_ret = 0; nrwake = (int)(unsigned long)args->timeout; if (f2 != NULL) op_ret += futex_wake(f2, nrwake, args->val3); else op_ret += futex_wake(f, nrwake, args->val3); ret += op_ret; } if (f2 != NULL) futex_put(f2, NULL); futex_put(f, NULL); td->td_retval[0] = ret; break; case LINUX_FUTEX_LOCK_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_LOCK_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_lock_pi); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_UNLOCK_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_UNLOCK_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_unlock_pi); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_TRYLOCK_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if 
((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_TRYLOCK_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_trylock_pi); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_REQUEUE: /* * Glibc does not use this operation since version 2.3.3, * as it is racy and replaced by FUTEX_CMP_REQUEUE operation. * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when * FUTEX_REQUEUE returned EINVAL. */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) { linux_msg(td, "unsupported FUTEX_REQUEUE"); pem->flags |= LINUX_XDEPR_REQUEUEOP; LIN_SDT_PROBE0(futex, linux_sys_futex, deprecated_requeue); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL); return (EINVAL); case LINUX_FUTEX_WAIT_REQUEUE_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_CMP_REQUEUE_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); default: - linux_msg(td, "unsupported futex op %d\n", args->op); + linux_msg(td, "unsupported futex op %d", args->op); LIN_SDT_PROBE1(futex, linux_sys_futex, unknown_operation, args->op); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } int linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) { struct 
linux_emuldata *em; LIN_SDT_PROBE2(futex, linux_set_robust_list, entry, td, args); if (args->len != sizeof(struct linux_robust_list_head)) { LIN_SDT_PROBE0(futex, linux_set_robust_list, size_error); LIN_SDT_PROBE1(futex, linux_set_robust_list, return, EINVAL); return (EINVAL); } em = em_find(td); em->robust_futexes = args->head; LIN_SDT_PROBE1(futex, linux_set_robust_list, return, 0); return (0); } int linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) { struct linux_emuldata *em; struct linux_robust_list_head *head; l_size_t len = sizeof(struct linux_robust_list_head); struct thread *td2; int error = 0; LIN_SDT_PROBE2(futex, linux_get_robust_list, entry, td, args); if (!args->pid) { em = em_find(td); KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); head = em->robust_futexes; } else { td2 = tdfind(args->pid, -1); if (td2 == NULL) { LIN_SDT_PROBE1(futex, linux_get_robust_list, return, ESRCH); return (ESRCH); } if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) { LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EPERM); PROC_UNLOCK(td2->td_proc); return (EPERM); } em = em_find(td2); KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); /* XXX: ptrace? 
*/ if (priv_check(td, PRIV_CRED_SETUID) || priv_check(td, PRIV_CRED_SETEUID) || p_candebug(td, td2->td_proc)) { PROC_UNLOCK(td2->td_proc); LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EPERM); return (EPERM); } head = em->robust_futexes; PROC_UNLOCK(td2->td_proc); } error = copyout(&len, args->len, sizeof(l_size_t)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EFAULT); return (EFAULT); } error = copyout(&head, args->head, sizeof(head)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); } LIN_SDT_PROBE1(futex, linux_get_robust_list, return, error); return (error); } static int handle_futex_death(struct linux_emuldata *em, uint32_t *uaddr, unsigned int pi) { uint32_t uval, nval, mval; struct futex *f; int error; LIN_SDT_PROBE3(futex, handle_futex_death, entry, em, uaddr, pi); retry: error = copyin(uaddr, &uval, 4); if (error) { LIN_SDT_PROBE1(futex, handle_futex_death, copyin_error, error); LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT); return (EFAULT); } if ((uval & FUTEX_TID_MASK) == em->em_tid) { mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; nval = casuword32(uaddr, uval, mval); if (nval == -1) { LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT); return (EFAULT); } if (nval != uval) goto retry; if (!pi && (uval & FUTEX_WAITERS)) { error = futex_get(uaddr, NULL, &f, FUTEX_DONTCREATE | FUTEX_SHARED); if (error) { LIN_SDT_PROBE1(futex, handle_futex_death, return, error); return (error); } if (f != NULL) { futex_wake(f, 1, FUTEX_BITSET_MATCH_ANY); futex_put(f, NULL); } } } LIN_SDT_PROBE1(futex, handle_futex_death, return, 0); return (0); } static int fetch_robust_entry(struct linux_robust_list **entry, struct linux_robust_list **head, unsigned int *pi) { l_ulong uentry; int error; LIN_SDT_PROBE3(futex, fetch_robust_entry, entry, entry, head, pi); error = copyin((const void *)head, &uentry, sizeof(l_ulong)); if (error) 
{ LIN_SDT_PROBE1(futex, fetch_robust_entry, copyin_error, error); LIN_SDT_PROBE1(futex, fetch_robust_entry, return, EFAULT); return (EFAULT); } *entry = (void *)(uentry & ~1UL); *pi = uentry & 1; LIN_SDT_PROBE1(futex, fetch_robust_entry, return, 0); return (0); } /* This walks the list of robust futexes releasing them. */ void release_futexes(struct thread *td, struct linux_emuldata *em) { struct linux_robust_list_head *head = NULL; struct linux_robust_list *entry, *next_entry, *pending; unsigned int limit = 2048, pi, next_pi, pip; l_long futex_offset; int rc, error; LIN_SDT_PROBE2(futex, release_futexes, entry, td, em); head = em->robust_futexes; if (head == NULL) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } error = copyin(&head->futex_offset, &futex_offset, sizeof(futex_offset)); if (error) { LIN_SDT_PROBE1(futex, release_futexes, copyin_error, error); LIN_SDT_PROBE0(futex, release_futexes, return); return; } if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } while (entry != &head->list) { rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi); if (entry != pending) if (handle_futex_death(em, (uint32_t *)((caddr_t)entry + futex_offset), pi)) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } if (rc) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } entry = next_entry; pi = next_pi; if (!--limit) break; sched_relinquish(curthread); } if (pending) handle_futex_death(em, (uint32_t *)((caddr_t)pending + futex_offset), pip); LIN_SDT_PROBE0(futex, release_futexes, return); }