Index: head/sys/amd64/linux/linux_dummy.c =================================================================== --- head/sys/amd64/linux/linux_dummy.c +++ head/sys/amd64/linux/linux_dummy.c @@ -69,13 +69,10 @@ DUMMY(security); DUMMY(set_thread_area); DUMMY(lookup_dcookie); -DUMMY(epoll_create); DUMMY(epoll_ctl_old); DUMMY(epoll_wait_old); DUMMY(remap_file_pages); DUMMY(semtimedop); -DUMMY(epoll_ctl); -DUMMY(epoll_wait); DUMMY(mbind); DUMMY(get_mempolicy); DUMMY(set_mempolicy); @@ -112,7 +109,6 @@ DUMMY(timerfd_gettime); DUMMY(signalfd4); DUMMY(eventfd2); -DUMMY(epoll_create1); DUMMY(inotify_init1); DUMMY(preadv); DUMMY(pwritev); Index: head/sys/amd64/linux/syscalls.master =================================================================== --- head/sys/amd64/linux/syscalls.master +++ head/sys/amd64/linux/syscalls.master @@ -373,7 +373,7 @@ 210 AUE_NULL UNIMPL linux_io_cancel 211 AUE_NULL UNIMPL linux_get_thread_area 212 AUE_NULL STD { int linux_lookup_dcookie(void); } -213 AUE_NULL STD { int linux_epoll_create(void); } +213 AUE_NULL STD { int linux_epoll_create(l_int size); } 214 AUE_NULL STD { int linux_epoll_ctl_old(void); } 215 AUE_NULL STD { int linux_epoll_wait_old(void); } 216 AUE_NULL STD { int linux_remap_file_pages(void); } @@ -397,8 +397,10 @@ 230 AUE_NULL STD { int linux_clock_nanosleep(clockid_t which, int flags, \ struct l_timespec *rqtp, struct l_timespec *rmtp); } 231 AUE_EXIT STD { int linux_exit_group(int error_code); } -232 AUE_NULL STD { int linux_epoll_wait(void); } -233 AUE_NULL STD { int linux_epoll_ctl(void); } +232 AUE_NULL STD { int linux_epoll_wait(l_int epfd, struct epoll_event *events, \ + l_int maxevents, l_int timeout); } +233 AUE_NULL STD { int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \ + struct epoll_event *event); } 234 AUE_NULL STD { int linux_tgkill(int tgid, int pid, int sig); } 235 AUE_UTIMES STD { int linux_utimes(char *fname, \ struct l_timeval *tptr); } @@ -466,7 +468,8 @@ 278 AUE_NULL STD { int linux_vmsplice(void); } 279 AUE_NULL STD { int linux_move_pages(void); } 280 AUE_NULL STD { int linux_utimensat(void); } -281 AUE_NULL STD { int linux_epoll_pwait(void); } +281 AUE_NULL STD { int linux_epoll_pwait(l_int epfd, struct epoll_event *events, \ + l_int maxevents, l_int timeout, l_sigset_t *mask); } 282 AUE_NULL STD { int linux_signalfd(void); } 283 AUE_NULL STD { int linux_timerfd(void); } 284 AUE_NULL STD { int linux_eventfd(void); } @@ -477,7 +480,7 @@ l_uintptr_t namelen, int flags); } 289 AUE_NULL STD { int linux_signalfd4(void); } 290 AUE_NULL STD { int linux_eventfd2(void); } -291 AUE_NULL STD { int linux_epoll_create1(void); } +291 AUE_NULL STD { int linux_epoll_create1(l_int flags); } 292 AUE_NULL STD { int linux_dup3(l_int oldfd, \ l_int newfd, l_int flags); } 293 AUE_NULL STD { int linux_pipe2(l_int *pipefds, l_int flags); } Index: head/sys/amd64/linux32/linux32_dummy.c =================================================================== --- head/sys/amd64/linux32/linux32_dummy.c +++ head/sys/amd64/linux32/linux32_dummy.c @@ -68,9 +68,6 @@ DUMMY(mincore); DUMMY(ptrace); DUMMY(lookup_dcookie); -DUMMY(epoll_create); -DUMMY(epoll_ctl); -DUMMY(epoll_wait); DUMMY(remap_file_pages); DUMMY(fstatfs64); DUMMY(mbind); @@ -120,7 +117,6 @@ /* linux 2.6.27: */ DUMMY(signalfd4); DUMMY(eventfd2); -DUMMY(epoll_create1); DUMMY(inotify_init1); /* linux 2.6.30: */ DUMMY(preadv); Index: head/sys/amd64/linux32/syscalls.master =================================================================== --- head/sys/amd64/linux32/syscalls.master +++ head/sys/amd64/linux32/syscalls.master @@ -430,9 +430,11 @@ 251 AUE_NULL UNIMPL 252 AUE_EXIT STD { int linux_exit_group(int error_code); } 253 AUE_NULL STD { int linux_lookup_dcookie(void); } -254 AUE_NULL STD { int linux_epoll_create(void); } -255 AUE_NULL STD { int linux_epoll_ctl(void); } -256 AUE_NULL STD { int linux_epoll_wait(void); } +254 AUE_NULL STD { int linux_epoll_create(l_int size); } +255 AUE_NULL STD { int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \ + struct epoll_event *event); } +256 AUE_NULL STD { int linux_epoll_wait(l_int epfd, struct epoll_event *events, \ + l_int maxevents, l_int timeout); } 257 AUE_NULL STD { int linux_remap_file_pages(void); } 258 AUE_NULL STD { int linux_set_tid_address(int *tidptr); } 259 AUE_NULL STD { int linux_timer_create(clockid_t clock_id, \ @@ -527,7 +529,8 @@ 317 AUE_NULL STD { int linux_move_pages(void); } ; linux 2.6.19: 318 AUE_NULL STD { int linux_getcpu(void); } -319 AUE_NULL STD { int linux_epoll_pwait(void); } +319 AUE_NULL STD { int linux_epoll_pwait(l_int epfd, struct epoll_event *events, \ + l_int maxevents, l_int timeout, l_osigset_t *mask); } ; linux 2.6.22: 320 AUE_NULL STD { int linux_utimensat(void); } 321 AUE_NULL STD { int linux_signalfd(void); } @@ -541,7 +544,7 @@ ; linux 2.6.27: 327 AUE_NULL STD { int linux_signalfd4(void); } 328 AUE_NULL STD { int linux_eventfd2(void); } -329 AUE_NULL STD { int linux_epoll_create1(void); } +329 AUE_NULL STD { int linux_epoll_create1(l_int flags); } 330 AUE_NULL STD { int linux_dup3(l_int oldfd, \ l_int newfd, l_int flags); } 331 AUE_NULL STD { int linux_pipe2(l_int *pipefds, l_int flags); } Index: head/sys/compat/linux/linux_emul.h =================================================================== --- head/sys/compat/linux/linux_emul.h +++ head/sys/compat/linux/linux_emul.h @@ -60,9 +60,12 @@ /* process emuldata flags */ #define LINUX_XDEPR_REQUEUEOP 0x00000001 /* uses deprecated futex REQUEUE op*/ +#define LINUX_XUNSUP_EPOLL 0x00000002 /* unsupported epoll events */ + struct linux_pemuldata { uint32_t flags; /* process emuldata flags */ struct sx pem_sx; /* lock for this struct */ + void *epoll; /* epoll data */ }; #define LINUX_PEM_XLOCK(p) sx_xlock(&(p)->pem_sx) Index: head/sys/compat/linux/linux_emul.c =================================================================== --- head/sys/compat/linux/linux_emul.c +++ head/sys/compat/linux/linux_emul.c @@ -42,8 +42,6 @@ #include #include #include -#include -#include #include #include @@ -86,6 +84,7 @@ { struct linux_emuldata *em; struct linux_pemuldata *pem; + struct epoll_emuldata *emd; if (newtd != NULL) { /* non-exec call */ @@ -93,8 +92,13 @@ em->pdeath_signal = 0; em->robust_futexes = NULL; if (flags & LINUX_CLONE_THREAD) { + LINUX_CTR1(proc_init, "thread newtd(%d)", + newtd->td_tid); + em->em_tid = newtd->td_tid; } else { + LINUX_CTR1(proc_init, "fork newtd(%d)", + newtd->td_proc->p_pid); em->em_tid = newtd->td_proc->p_pid; @@ -105,12 +109,24 @@ newtd->td_emuldata = em; } else { /* exec */ + LINUX_CTR1(proc_init, "exec newtd(%d)", + td->td_proc->p_pid); /* lookup the old one */ em = em_find(td); KASSERT(em != NULL, ("proc_init: emuldata not found in exec case.\n")); em->em_tid = td->td_proc->p_pid; + + /* epoll should be destroyed in a case of exec. */ + pem = pem_find(td->td_proc); + KASSERT(pem != NULL, ("proc_exit: proc emuldata not found.\n")); + + if (pem->epoll != NULL) { + emd = pem->epoll; + pem->epoll = NULL; + free(emd, M_EPOLL); + } } em->child_clear_tid = NULL; @@ -121,6 +137,7 @@ linux_proc_exit(void *arg __unused, struct proc *p) { struct linux_pemuldata *pem; + struct epoll_emuldata *emd; struct thread *td = curthread; if (__predict_false(SV_CURPROC_ABI() != SV_ABI_LINUX)) @@ -133,6 +150,12 @@ p->p_emuldata = NULL; + if (pem->epoll != NULL) { + emd = pem->epoll; + pem->epoll = NULL; + free(emd, M_EPOLL); + } + sx_destroy(&pem->pem_sx); free(pem, M_LINUX); } @@ -141,6 +164,7 @@ linux_common_execve(struct thread *td, struct image_args *eargs) { struct linux_pemuldata *pem; + struct epoll_emuldata *emd; struct linux_emuldata *em; struct proc *p; int error; @@ -180,6 +204,12 @@ p->p_emuldata = NULL; PROC_UNLOCK(p); + if (pem->epoll != NULL) { + emd = pem->epoll; + pem->epoll = NULL; + free(emd, M_EPOLL); + } + free(em, M_TEMP); free(pem, M_LINUX); } @@ -197,6 +227,7 @@ */ if (__predict_false((imgp->sysent->sv_flags & SV_ABI_MASK) == SV_ABI_LINUX)) { + if (SV_PROC_ABI(p) == SV_ABI_LINUX) linux_proc_init(td, NULL, 0); else Index: head/sys/compat/linux/linux_event.h =================================================================== --- head/sys/compat/linux/linux_event.h +++ head/sys/compat/linux/linux_event.h @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 2007 Roman Divacky + * Copyright (c) 2014 Dmitry Chagin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _LINUX_EVENT_H_ +#define _LINUX_EVENT_H_ + +#define LINUX_EPOLLIN 0x001 +#define LINUX_EPOLLPRI 0x002 +#define LINUX_EPOLLOUT 0x004 +#define LINUX_EPOLLRDNORM 0x040 +#define LINUX_EPOLLRDBAND 0x080 +#define LINUX_EPOLLWRNORM 0x100 +#define LINUX_EPOLLWRBAND 0x200 +#define LINUX_EPOLLMSG 0x400 +#define LINUX_EPOLLERR 0x008 +#define LINUX_EPOLLHUP 0x010 +#define LINUX_EPOLLRDHUP 0x2000 +#define LINUX_EPOLLWAKEUP 1u<<29 +#define LINUX_EPOLLONESHOT 1u<<30 +#define LINUX_EPOLLET 1u<<31 + +#define LINUX_EPOLL_EVRD (LINUX_EPOLLIN|LINUX_EPOLLRDNORM \ + |LINUX_EPOLLHUP|LINUX_EPOLLPRI) +#define LINUX_EPOLL_EVWR (LINUX_EPOLLOUT|LINUX_EPOLLWRNORM) +#define LINUX_EPOLL_EVSUP (LINUX_EPOLLET|LINUX_EPOLLONESHOT \ + |LINUX_EPOLL_EVRD|LINUX_EPOLL_EVWR) + +#define LINUX_EPOLL_CTL_ADD 1 +#define LINUX_EPOLL_CTL_DEL 2 +#define LINUX_EPOLL_CTL_MOD 3 + +#endif /* !_LINUX_EVENT_H_ */ Index: head/sys/compat/linux/linux_event.c =================================================================== --- head/sys/compat/linux/linux_event.c +++ head/sys/compat/linux/linux_event.c @@ -0,0 +1,500 @@ +/*- + * Copyright (c) 2007 Roman Divacky + * Copyright (c) 2014 Dmitry Chagin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_compat.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef COMPAT_LINUX32 +#include +#include +#else +#include +#include +#endif + +#include +#include +#include +#include + +/* + * epoll defines 'struct epoll_event' with the field 'data' as 64 bits + * on all architectures. But on 32 bit architectures BSD 'struct kevent' only + * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied + * data verbatuim. Therefore we allocate 64-bit memory block to pass + * user supplied data for every file descriptor. + */ + +typedef uint64_t epoll_udata_t; + +struct epoll_emuldata { + uint32_t fdc; /* epoll udata max index */ + epoll_udata_t udata[1]; /* epoll user data vector */ +}; + +#define EPOLL_DEF_SZ 16 +#define EPOLL_SIZE(fdn) \ + (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t)) + +struct epoll_event { + uint32_t events; + epoll_udata_t data; +} +#if defined(__amd64__) +__attribute__((packed)) +#endif +; + +#define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) + +static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata); +static int epoll_to_kevent(struct thread *td, struct file *epfp, + int fd, struct epoll_event *l_event, int *kev_flags, + struct kevent *kevent, int *nkevents); +static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); +static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); +static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); +static int epoll_delete_event(struct thread *td, struct file *epfp, + int fd, int filter); +static int epoll_delete_all_events(struct thread *td, struct file *epfp, + int fd); + +struct epoll_copyin_args { + struct kevent *changelist; +}; + +struct epoll_copyout_args { + struct epoll_event *leventlist; + struct proc *p; + uint32_t count; + int error; +}; + + +static void +epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata) +{ + struct linux_pemuldata *pem; + struct epoll_emuldata *emd; + struct proc *p; + + p = td->td_proc; + + pem = pem_find(p); + KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); + + LINUX_PEM_XLOCK(pem); + if (pem->epoll == NULL) { + emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); + emd->fdc = fd; + pem->epoll = emd; + } else { + emd = pem->epoll; + if (fd > emd->fdc) { + emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); + emd->fdc = fd; + pem->epoll = emd; + } + } + emd->udata[fd] = udata; + LINUX_PEM_XUNLOCK(pem); +} + +static int +epoll_create_common(struct thread *td, int flags) +{ + int error; + + error = kern_kqueue(td, flags); + if (error) + return (error); + + epoll_fd_install(td, EPOLL_DEF_SZ, 0); + + return (0); +} + +int +linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) +{ + + /* + * args->size is unused. Linux just tests it + * and then forgets it as well. + */ + if (args->size <= 0) + return (EINVAL); + + return (epoll_create_common(td, 0)); +} + +int +linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) +{ + int flags; + + if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) + return (EINVAL); + + flags = 0; + if ((args->flags & LINUX_O_CLOEXEC) != 0) + flags |= O_CLOEXEC; + + return (epoll_create_common(td, flags)); +} + +/* Structure converting function from epoll to kevent. */ +static int +epoll_to_kevent(struct thread *td, struct file *epfp, + int fd, struct epoll_event *l_event, int *kev_flags, + struct kevent *kevent, int *nkevents) +{ + uint32_t levents = l_event->events; + struct linux_pemuldata *pem; + struct proc *p; + + /* flags related to how event is registered */ + if ((levents & LINUX_EPOLLONESHOT) != 0) + *kev_flags |= EV_ONESHOT; + if ((levents & LINUX_EPOLLET) != 0) + *kev_flags |= EV_CLEAR; + + /* flags related to what event is registered */ + if ((levents & LINUX_EPOLL_EVRD) != 0) { + EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0); + ++(*nkevents); + } + if ((levents & LINUX_EPOLL_EVWR) != 0) { + EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0); + ++(*nkevents); + } + + if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) { + p = td->td_proc; + + pem = pem_find(p); + KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); + KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n")); + + LINUX_PEM_XLOCK(pem); + if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) { + pem->flags |= LINUX_XUNSUP_EPOLL; + LINUX_PEM_XUNLOCK(pem); + linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n", + levents); + } else + LINUX_PEM_XUNLOCK(pem); + return (EINVAL); + } + + return (0); +} + +/* + * Structure converting function from kevent to epoll. In a case + * this is called on error in registration we store the error in + * event->data and pick it up later in linux_epoll_ctl(). + */ +static void +kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) +{ + + if ((kevent->flags & EV_ERROR) != 0) + return; + + switch (kevent->filter) { + case EVFILT_READ: + l_event->events = LINUX_EPOLLIN|LINUX_EPOLLRDNORM|LINUX_EPOLLPRI; + break; + case EVFILT_WRITE: + l_event->events = LINUX_EPOLLOUT|LINUX_EPOLLWRNORM; + break; + } +} + +/* + * Copyout callback used by kevent. This converts kevent + * events to epoll events and copies them back to the + * userspace. This is also called on error on registering + * of the filter. + */ +static int +epoll_kev_copyout(void *arg, struct kevent *kevp, int count) +{ + struct epoll_copyout_args *args; + struct linux_pemuldata *pem; + struct epoll_emuldata *emd; + struct epoll_event *eep; + int error, fd, i; + + args = (struct epoll_copyout_args*) arg; + eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); + + pem = pem_find(args->p); + KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); + LINUX_PEM_SLOCK(pem); + emd = pem->epoll; + KASSERT(emd != NULL, ("epoll proc epolldata not found.\n")); + + for (i = 0; i < count; i++) { + kevent_to_epoll(&kevp[i], &eep[i]); + + fd = kevp[i].ident; + KASSERT(fd <= emd->fdc, ("epoll user data vector" + " is too small.\n")); + eep[i].data = emd->udata[fd]; + } + LINUX_PEM_SUNLOCK(pem); + + error = copyout(eep, args->leventlist, count * sizeof(*eep)); + if (error == 0) { + args->leventlist += count; + args->count += count; + } else if (args->error == 0) + args->error = error; + + free(eep, M_EPOLL); + return (error); +} + +/* + * Copyin callback used by kevent. This copies already + * converted filters from kernel memory to the kevent + * internal kernel memory. Hence the memcpy instead of + * copyin. + */ +static int +epoll_kev_copyin(void *arg, struct kevent *kevp, int count) +{ + struct epoll_copyin_args *args; + + args = (struct epoll_copyin_args*) arg; + + memcpy(kevp, args->changelist, count * sizeof(*kevp)); + args->changelist += count; + + return (0); +} + +/* + * Load epoll filter, convert it to kevent filter + * and load it into kevent subsystem. + */ +int +linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) +{ + struct file *epfp, *fp; + struct epoll_copyin_args ciargs; + struct kevent kev[2]; + struct kevent_copyops k_ops = { &ciargs, + NULL, + epoll_kev_copyin}; + struct epoll_event le; + cap_rights_t rights; + int kev_flags; + int nchanges = 0; + int error; + + if (args->op != LINUX_EPOLL_CTL_DEL) { + error = copyin(args->event, &le, sizeof(le)); + if (error != 0) + return (error); + } + + error = fget(td, args->epfd, + cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp); + if (error != 0) + return (error); + if (epfp->f_type != DTYPE_KQUEUE) + goto leave1; + + /* Protect user data vector from incorrectly supplied fd. */ + error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp); + if (error != 0) + goto leave1; + + /* Linux disallows spying on himself */ + if (epfp == fp) { + error = EINVAL; + goto leave0; + } + + ciargs.changelist = kev; + + switch (args->op) { + case LINUX_EPOLL_CTL_MOD: + /* + * We don't memorize which events were set for this FD + * on this level, so just delete all we could have set: + * EVFILT_READ and EVFILT_WRITE, ignoring any errors + */ + error = epoll_delete_all_events(td, epfp, args->fd); + if (error) + goto leave0; + /* FALLTHROUGH */ + + case LINUX_EPOLL_CTL_ADD: + kev_flags = EV_ADD | EV_ENABLE; + break; + + case LINUX_EPOLL_CTL_DEL: + /* CTL_DEL means unregister this fd with this epoll */ + error = epoll_delete_all_events(td, epfp, args->fd); + goto leave0; + + default: + error = EINVAL; + goto leave0; + } + + error = epoll_to_kevent(td, epfp, args->fd, &le, &kev_flags, + kev, &nchanges); + if (error) + goto leave0; + + epoll_fd_install(td, args->fd, le.data); + + error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); + +leave0: + fdrop(fp, td); + +leave1: + fdrop(epfp, td); + return (error); +} + +/* + * Wait for a filter to be triggered on the epoll file descriptor. + */ +int +linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) +{ + struct file *epfp; + struct timespec ts, *tsp; + cap_rights_t rights; + struct epoll_copyout_args coargs; + struct kevent_copyops k_ops = { &coargs, + epoll_kev_copyout, + NULL}; + int error; + + if (args->maxevents <= 0 || args->maxevents > LINUX_MAX_EVENTS) + return (EINVAL); + + error = fget(td, args->epfd, + cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp); + if (error != 0) + return (error); + + coargs.leventlist = args->events; + coargs.p = td->td_proc; + coargs.count = 0; + coargs.error = 0; + + if (args->timeout != -1) { + if (args->timeout < 0) { + error = EINVAL; + goto leave; + } + /* Convert from milliseconds to timespec. */ + ts.tv_sec = args->timeout / 1000; + ts.tv_nsec = (args->timeout % 1000) * 1000000; + tsp = &ts; + } else { + tsp = NULL; + } + + error = kern_kevent_fp(td, epfp, 0, args->maxevents, &k_ops, tsp); + if (error == 0 && coargs.error != 0) + error = coargs.error; + + /* + * kern_kevent might return ENOMEM which is not expected from epoll_wait. + * Maybe we should translate that but I don't think it matters at all. + */ + if (error == 0) + td->td_retval[0] = coargs.count; +leave: + fdrop(epfp, td); + return (error); +} + +static int +epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter) +{ + struct epoll_copyin_args ciargs; + struct kevent kev; + struct kevent_copyops k_ops = { &ciargs, + NULL, + epoll_kev_copyin}; + int error; + + ciargs.changelist = &kev; + EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0); + + error = kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL); + + /* + * here we ignore ENONT, because we don't keep track of events here + */ + if (error == ENOENT) + error = 0; + return (error); +} + +static int +epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) +{ + int error1, error2; + + error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ); + error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE); + + /* report any errors we got */ + return (error1 == 0 ? error2 : error1); +} Index: head/sys/compat/linux/linux_util.h =================================================================== --- head/sys/compat/linux/linux_util.h +++ head/sys/compat/linux/linux_util.h @@ -45,6 +45,7 @@ #include MALLOC_DECLARE(M_LINUX); +MALLOC_DECLARE(M_EPOLL); MALLOC_DECLARE(M_FUTEX); MALLOC_DECLARE(M_FUTEX_WP); Index: head/sys/compat/linux/linux_util.c =================================================================== --- head/sys/compat/linux/linux_util.c +++ head/sys/compat/linux/linux_util.c @@ -54,6 +54,7 @@ #include MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures"); +MALLOC_DEFINE(M_EPOLL, "lepoll", "Linux events structures"); MALLOC_DEFINE(M_FUTEX, "futex", "Linux futexes"); MALLOC_DEFINE(M_FUTEX_WP, "futex wp", "Linux futex waiting proc"); Index: head/sys/conf/files.amd64 =================================================================== --- head/sys/conf/files.amd64 +++ head/sys/conf/files.amd64 @@ -509,6 +509,7 @@ compat/linux/linux_util.c optional compat_linux32 compat/linux/linux_vdso.c optional compat_linux32 compat/linux/linux_common.c optional compat_linux32 +compat/linux/linux_event.c optional compat_linux32 dev/amr/amr_linux.c optional compat_linux32 amr dev/mfi/mfi_linux.c optional compat_linux32 mfi # Index: head/sys/conf/files.i386 =================================================================== --- head/sys/conf/files.i386 +++ head/sys/conf/files.i386 @@ -81,6 +81,7 @@ cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S optional zfs compile-with "${ZFS_S}" compat/linprocfs/linprocfs.c optional linprocfs compat/linsysfs/linsysfs.c optional linsysfs +compat/linux/linux_event.c optional compat_linux compat/linux/linux_emul.c optional compat_linux compat/linux/linux_file.c optional compat_linux compat/linux/linux_fork.c optional compat_linux Index: head/sys/conf/files.pc98 =================================================================== --- head/sys/conf/files.pc98 +++ head/sys/conf/files.pc98 @@ -41,6 +41,7 @@ cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S optional zfs compile-with "${ZFS_S}" compat/linprocfs/linprocfs.c optional linprocfs compat/linsysfs/linsysfs.c optional linsysfs +compat/linux/linux_event.c optional compat_linux compat/linux/linux_emul.c optional compat_linux compat/linux/linux_file.c optional compat_linux compat/linux/linux_fork.c optional compat_linux Index: head/sys/i386/linux/linux_dummy.c =================================================================== --- head/sys/i386/linux/linux_dummy.c +++ head/sys/i386/linux/linux_dummy.c @@ -70,9 +70,6 @@ DUMMY(pivot_root); DUMMY(mincore); DUMMY(lookup_dcookie); -DUMMY(epoll_create); -DUMMY(epoll_ctl); -DUMMY(epoll_wait); DUMMY(remap_file_pages); DUMMY(fstatfs64); DUMMY(mbind); @@ -116,7 +113,6 @@ /* linux 2.6.27: */ DUMMY(signalfd4); DUMMY(eventfd2); -DUMMY(epoll_create1); DUMMY(inotify_init1); /* linux 2.6.30: */ DUMMY(preadv); Index: head/sys/i386/linux/syscalls.master =================================================================== --- head/sys/i386/linux/syscalls.master +++ head/sys/i386/linux/syscalls.master @@ -432,9 +432,11 @@ 251 AUE_NULL UNIMPL 252 AUE_EXIT STD { int linux_exit_group(int error_code); } 253 AUE_NULL STD { int linux_lookup_dcookie(void); } -254 AUE_NULL STD { int linux_epoll_create(void); } -255 AUE_NULL STD { int linux_epoll_ctl(void); } -256 AUE_NULL STD { int linux_epoll_wait(void); } +254 AUE_NULL STD { int linux_epoll_create(l_int size); } +255 AUE_NULL STD { int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \ + struct epoll_event *event); } +256 AUE_NULL STD { int linux_epoll_wait(l_int epfd, struct epoll_event *events, \ + l_int maxevents, l_int timeout); } 257 AUE_NULL STD { int linux_remap_file_pages(void); } 258 AUE_NULL STD { int linux_set_tid_address(int *tidptr); } 259 AUE_NULL STD { int linux_timer_create(clockid_t clock_id, \ @@ -535,7 +537,8 @@ 317 AUE_NULL STD { int linux_move_pages(void); } ; linux 2.6.19: 318 AUE_NULL STD { int linux_getcpu(void); } -319 AUE_NULL STD { int linux_epoll_pwait(void); } +319 AUE_NULL STD { int linux_epoll_pwait(l_int epfd, struct epoll_event *events, \ + l_int maxevents, l_int timeout, l_osigset_t *mask); } ; linux 2.6.22: 320 AUE_NULL STD { int linux_utimensat(void); } 321 AUE_NULL STD { int linux_signalfd(void); } @@ -549,7 +552,7 @@ ; linux 2.6.27: 327 AUE_NULL STD { int linux_signalfd4(void); } 328 AUE_NULL STD { int linux_eventfd2(void); } -329 AUE_NULL STD { int linux_epoll_create1(void); } +329 AUE_NULL STD { int linux_epoll_create1(l_int flags); } 330 AUE_NULL STD { int linux_dup3(l_int oldfd, \ l_int newfd, l_int flags); } 331 AUE_NULL STD { int linux_pipe2(l_int *pipefds, l_int flags); } Index: head/sys/modules/linux/Makefile =================================================================== --- head/sys/modules/linux/Makefile +++ head/sys/modules/linux/Makefile @@ -10,7 +10,7 @@ VDSO= linux${SFX}_vdso KMOD= linux -SRCS= linux_fork.c linux${SFX}_dummy.c linux_file.c \ +SRCS= linux_fork.c linux${SFX}_dummy.c linux_file.c linux_event.c \ linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \ linux${SFX}_machdep.c linux_misc.c linux_signal.c \ linux_socket.c linux_stats.c linux_sysctl.c linux${SFX}_sysent.c \ Index: head/sys/modules/linux64/Makefile =================================================================== --- head/sys/modules/linux64/Makefile +++ head/sys/modules/linux64/Makefile @@ -5,7 +5,7 @@ VDSO= linux_vdso KMOD= linux64 -SRCS= linux_fork.c linux_dummy.c linux_file.c \ +SRCS= linux_fork.c linux_dummy.c linux_file.c linux_event.c \ linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \ linux_machdep.c linux_misc.c linux_signal.c \ linux_socket.c linux_stats.c linux_sysctl.c linux_sysent.c \