Index: sys/compat/linuxkpi/common/include/linux/cdev.h =================================================================== --- sys/compat/linuxkpi/common/include/linux/cdev.h +++ sys/compat/linuxkpi/common/include/linux/cdev.h @@ -52,7 +52,8 @@ struct cdev *cdev; dev_t dev; const struct file_operations *ops; - atomic_long_t refs; + u_int refs; + u_int siref; }; static inline void @@ -61,7 +62,7 @@ kobject_init(&cdev->kobj, &linux_cdev_static_ktype); cdev->ops = ops; - atomic_long_set(&cdev->refs, 0); + cdev->refs = 1; } static inline struct linux_cdev * @@ -70,8 +71,9 @@ struct linux_cdev *cdev; cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK); - if (cdev) + if (cdev != NULL) kobject_init(&cdev->kobj, &linux_cdev_ktype); + cdev->refs = 1; return (cdev); } Index: sys/compat/linuxkpi/common/include/linux/mm.h =================================================================== --- sys/compat/linuxkpi/common/include/linux/mm.h +++ sys/compat/linuxkpi/common/include/linux/mm.h @@ -180,12 +180,8 @@ return (-ENOTSUP); } -static inline int -zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, - unsigned long size) -{ - return (-ENOTSUP); -} +int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, + unsigned long size); static inline int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, Index: sys/compat/linuxkpi/common/src/linux_compat.c =================================================================== --- sys/compat/linuxkpi/common/src/linux_compat.c +++ sys/compat/linuxkpi/common/src/linux_compat.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -100,6 +101,7 @@ #undef cdev #define RB_ROOT(head) (head)->rbh_root +static void linux_cdev_deref(struct linux_cdev *ldev); static struct vm_area_struct *linux_cdev_handle_find(void *handle); struct kobject linux_class_root; @@ -538,7 +540,7 @@ struct vm_fault vmf; /* fill out VM fault structure */ - vmf.virtual_address = (void *)((uintptr_t)pidx << PAGE_SHIFT); + vmf.virtual_address = (void *)IDX_TO_OFF(pidx); vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0; vmf.pgoff = 0; vmf.page = NULL; @@ -672,6 +674,71 @@ }, }; +int +zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, + unsigned long size) +{ + vm_object_t obj; + vm_page_t m; + + obj = vma->vm_obj; + if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0) + return (-ENOTSUP); + VM_OBJECT_RLOCK(obj); + for (m = vm_page_find_least(obj, OFF_TO_IDX(address)); + m != NULL && m->pindex < OFF_TO_IDX(address + size); + m = TAILQ_NEXT(m, listq)) + pmap_remove_all(m); + VM_OBJECT_RUNLOCK(obj); + return (0); +} + +static struct file_operations dummy_ldev_ops = { + /* XXXKIB */ +}; + +static struct linux_cdev dummy_ldev = { + .ops = &dummy_ldev_ops, +}; + +#define LDEV_SI_DTR 0x0001 +#define LDEV_SI_REF 0x0002 + +static void +linux_get_fop(struct linux_file *filp, const struct file_operations **fop, + struct linux_cdev **dev) +{ + struct linux_cdev *ldev; + u_int siref; + + ldev = filp->f_cdev; + *fop = filp->f_op; + if (ldev != NULL) { + for (siref = ldev->siref;;) { + if ((siref & LDEV_SI_DTR) != 0) { + ldev = &dummy_ldev; + siref = ldev->siref; + *fop = ldev->ops; + MPASS((ldev->siref & LDEV_SI_DTR) == 0); + } else if (atomic_fcmpset_int(&ldev->siref, &siref, + siref + LDEV_SI_REF)) { + break; + } + } + } + *dev = ldev; +} + +static void +linux_drop_fop(struct linux_cdev *ldev) +{ + + if (ldev == NULL) + return; + MPASS((ldev->siref & ~LDEV_SI_DTR) != 0); + atomic_subtract_int(&ldev->siref, LDEV_SI_REF); +} + #define OPW(fp,td,code) ({ \ struct file *__fpop; \ __typeof(code) __retval; \ @@ -684,10 +751,12 @@ }) static int -linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, struct file *file) +linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, + struct file *file) { struct linux_cdev *ldev; struct linux_file *filp; + const struct file_operations *fop; int error; ldev = dev->si_drv1; @@ -699,20 +768,17 @@ filp->f_flags = file->f_flag; filp->f_vnode = file->f_vnode; filp->_file = file; + refcount_acquire(&ldev->refs); filp->f_cdev = ldev; linux_set_current(td); + linux_get_fop(filp, &fop, &ldev); - /* get a reference on the Linux character device */ - if (atomic_long_add_unless(&ldev->refs, 1, -1L) == 0) { - kfree(filp); - return (EINVAL); - } - - if (filp->f_op->open) { - error = -filp->f_op->open(file->f_vnode, filp); - if (error) { - atomic_long_dec(&ldev->refs); + if (fop->open != NULL) { + error = -fop->open(file->f_vnode, filp); + if (error != 0) { + linux_drop_fop(ldev); + linux_cdev_deref(filp->f_cdev); kfree(filp); return (error); } @@ -723,6 +789,7 @@ /* release the file from devfs */ finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops); + linux_drop_fop(ldev); return (ENXIO); } @@ -858,7 +925,8 @@ static int linux_file_ioctl_sub(struct file *fp, struct linux_file *filp, - u_long cmd, caddr_t data, struct thread *td) + const struct file_operations *fop, u_long cmd, caddr_t data, + struct thread *td) { struct task_struct *task = current; unsigned size; @@ -883,20 +951,28 @@ #if defined(__amd64__) if (td->td_proc->p_elf_machine == EM_386) { /* try the compat IOCTL handler first */ - if (filp->f_op->compat_ioctl != NULL) - error = -OPW(fp, td, filp->f_op->compat_ioctl(filp, cmd, (u_long)data)); - else + if (fop->compat_ioctl != NULL) { + error = -OPW(fp, td, fop->compat_ioctl(filp, + cmd, (u_long)data)); + } else { error = ENOTTY; + } /* fallback to the regular IOCTL handler, if any */ - if (error == ENOTTY && filp->f_op->unlocked_ioctl != NULL) - error = -OPW(fp, td, filp->f_op->unlocked_ioctl(filp, cmd, (u_long)data)); + if (error == ENOTTY && fop->unlocked_ioctl != NULL) { + error = -OPW(fp, td, fop->unlocked_ioctl(filp, + cmd, (u_long)data)); + } } else #endif - if (filp->f_op->unlocked_ioctl != NULL) - error = -OPW(fp, td, filp->f_op->unlocked_ioctl(filp, cmd, (u_long)data)); - else - error = ENOTTY; + { + if (fop->unlocked_ioctl != NULL) { + error = -OPW(fp, td, fop->unlocked_ioctl(filp, + cmd, (u_long)data)); + } else { + error = ENOTTY; + } + } if (size > 0) { task->bsd_ioctl_data = NULL; task->bsd_ioctl_len = 0; @@ -1065,30 +1141,36 @@ static void linux_file_kqfilter_poll(struct linux_file *filp, int kqflags) { + struct thread *td; + const struct file_operations *fop; + struct linux_cdev *ldev; int temp; - if (filp->f_kqflags & kqflags) { - struct thread *td = curthread; - - /* get the latest polling state */ - temp = OPW(filp->_file, td, filp->f_op->poll(filp, NULL)); - - spin_lock(&filp->f_kqlock); - /* clear kqflags */ - filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ | - LINUX_KQ_FLAG_NEED_WRITE); - /* update kqflags */ - if (temp & (POLLIN | POLLOUT)) { - if (temp & POLLIN) - filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ; - if (temp & POLLOUT) - filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE; - - /* make sure the "knote" gets woken up */ - KNOTE_LOCKED(&filp->f_selinfo.si_note, 0); - } - spin_unlock(&filp->f_kqlock); + if ((filp->f_kqflags & kqflags) == 0) + return; + + td = curthread; + + linux_get_fop(filp, &fop, &ldev); + /* get the latest polling state */ + temp = OPW(filp->_file, td, fop->poll(filp, NULL)); + linux_drop_fop(ldev); + + spin_lock(&filp->f_kqlock); + /* clear kqflags */ + filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ | + LINUX_KQ_FLAG_NEED_WRITE); + /* update kqflags */ + if ((temp & (POLLIN | POLLOUT)) != 0) { + if ((temp & POLLIN) != 0) + filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ; + if ((temp & POLLOUT) != 0) + filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE; + + /* make sure the "knote" gets woken up */ + KNOTE_LOCKED(&filp->f_selinfo.si_note, 0); } + spin_unlock(&filp->f_kqlock); } static int @@ -1137,9 +1219,9 @@ } static int -linux_file_mmap_single(struct file *fp, vm_ooffset_t *offset, - vm_size_t size, struct vm_object **object, int nprot, - struct thread *td) +linux_file_mmap_single(struct file *fp, const struct file_operations *fop, + vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, + int nprot, struct thread *td) { struct task_struct *task; struct vm_area_struct *vmap; @@ -1151,7 +1233,7 @@ filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; - if (filp->f_op->mmap == NULL) + if (fop->mmap == NULL) return (EOPNOTSUPP); linux_set_current(td); @@ -1181,7 +1263,7 @@ if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) { error = linux_get_error(task, EINTR); } else { - error = -OPW(fp, td, filp->f_op->mmap(filp, vmap)); + error = -OPW(fp, td, fop->mmap(filp, vmap)); error = linux_get_error(task, error); up_write(&vmap->vm_mm->mmap_sem); } @@ -1300,6 +1382,8 @@ int flags, struct thread *td) { struct linux_file *filp; + const struct file_operations *fop; + struct linux_cdev *ldev; ssize_t bytes; int error; @@ -1312,8 +1396,10 @@ if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); linux_set_current(td); - if (filp->f_op->read) { - bytes = OPW(file, td, filp->f_op->read(filp, uio->uio_iov->iov_base, + linux_get_fop(filp, &fop, &ldev); + if (fop->read != NULL) { + bytes = OPW(file, td, fop->read(filp, + uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset)); if (bytes >= 0) { uio->uio_iov->iov_base = @@ -1328,6 +1414,7 @@ /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ); + linux_drop_fop(ldev); return (error); } @@ -1337,10 +1424,11 @@ int flags, struct thread *td) { struct linux_file *filp; + const struct file_operations *fop; + struct linux_cdev *ldev; ssize_t bytes; int error; - error = 0; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; /* XXX no support for I/O vectors currently */ @@ -1349,14 +1437,17 @@ if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); linux_set_current(td); - if (filp->f_op->write) { - bytes = OPW(file, td, filp->f_op->write(filp, uio->uio_iov->iov_base, + linux_get_fop(filp, &fop, &ldev); + if (fop->write != NULL) { + bytes = OPW(file, td, fop->write(filp, + uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset)); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; + error = 0; } else { error = linux_get_error(current, -bytes); } @@ -1366,6 +1457,8 @@ /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE); + linux_drop_fop(ldev); + return (error); } @@ -1374,16 +1467,21 @@ struct thread *td) { struct linux_file *filp; + const struct file_operations *fop; + struct linux_cdev *ldev; int revents; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; linux_set_current(td); - if (filp->f_op->poll != NULL) - revents = OPW(file, td, filp->f_op->poll(filp, LINUX_POLL_TABLE_NORMAL)) & events; - else + linux_get_fop(filp, &fop, &ldev); + if (fop->poll != NULL) { + revents = OPW(file, td, fop->poll(filp, + LINUX_POLL_TABLE_NORMAL)) & events; + } else { revents = 0; - + } + linux_drop_fop(ldev); return (revents); } @@ -1391,23 +1489,28 @@ linux_file_close(struct file *file, struct thread *td) { struct linux_file *filp; + const struct file_operations *fop; + struct linux_cdev *ldev; int error; filp = (struct linux_file *)file->f_data; - KASSERT(file_count(filp) == 0, ("File refcount(%d) is not zero", file_count(filp))); + KASSERT(file_count(filp) == 0, + ("File refcount(%d) is not zero", file_count(filp))); + error = 0; filp->f_flags = file->f_flag; linux_set_current(td); linux_poll_wait_dequeue(filp); - error = -OPW(file, td, filp->f_op->release(filp->f_vnode, filp)); + linux_get_fop(filp, &fop, &ldev); + if (fop->release != NULL) + error = -OPW(file, td, fop->release(filp->f_vnode, filp)); funsetown(&filp->f_sigio); if (filp->f_vnode != NULL) vdrop(filp->f_vnode); - if (filp->f_cdev != NULL) { - /* put a reference on the Linux character device */ - atomic_long_dec(&filp->f_cdev->refs); - } + linux_drop_fop(ldev); + if (filp->f_cdev != NULL) + linux_cdev_deref(filp->f_cdev); kfree(filp); return (error); @@ -1418,27 +1521,30 @@ struct thread *td) { struct linux_file *filp; + const struct file_operations *fop; + struct linux_cdev *ldev; int error; + error = 0; filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; - error = 0; + linux_get_fop(filp, &fop, &ldev); linux_set_current(td); switch (cmd) { case FIONBIO: break; case FIOASYNC: - if (filp->f_op->fasync == NULL) + if (fop->fasync == NULL) break; - error = -OPW(fp, td, filp->f_op->fasync(0, filp, fp->f_flag & FASYNC)); + error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); break; case FIOSETOWN: error = fsetown(*(int *)data, &filp->f_sigio); if (error == 0) { - if (filp->f_op->fasync == NULL) + if (fop->fasync == NULL) break; - error = -OPW(fp, td, filp->f_op->fasync(0, filp, + error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); } break; @@ -1446,16 +1552,17 @@ *(int *)data = fgetown(&filp->f_sigio); break; default: - error = linux_file_ioctl_sub(fp, filp, cmd, data, td); + error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td); break; } + linux_drop_fop(ldev); return (error); } static int linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct file *fp, - vm_ooffset_t *foff, vm_object_t *objp) + vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp) { /* * Character devices do not provide private mappings @@ -1467,7 +1574,8 @@ if ((*flagsp & (MAP_PRIVATE | MAP_COPY)) != 0) return (EINVAL); - return (linux_file_mmap_single(fp, foff, objsize, objp, (int)prot, td)); + return (linux_file_mmap_single(fp, fop, foff, objsize, objp, + (int)prot, td)); } static int @@ -1476,6 +1584,8 @@ struct thread *td) { struct linux_file *filp; + const struct file_operations *fop; + struct linux_cdev *ldev; struct mount *mp; struct vnode *vp; vm_object_t object; @@ -1522,15 +1632,18 @@ } maxprot &= cap_maxprot; - error = linux_file_mmap_sub(td, size, prot, &maxprot, &flags, fp, &foff, - &object); + linux_get_fop(filp, &fop, &ldev); + error = linux_file_mmap_sub(td, size, prot, &maxprot, &flags, fp, + &foff, fop, &object); if (error != 0) - return (error); + goto out; error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); +out: + linux_drop_fop(ldev); return (error); } @@ -1951,6 +2064,14 @@ return (isdone); } +static void +linux_cdev_deref(struct linux_cdev *ldev) +{ + + if (refcount_release(&ldev->refs)) + kfree(ldev); +} + static void linux_cdev_release(struct kobject *kobj) { @@ -1960,7 +2081,7 @@ cdev = container_of(kobj, struct linux_cdev, kobj); parent = kobj->parent; linux_destroy_dev(cdev); - kfree(cdev); + linux_cdev_deref(cdev); kobject_put(parent); } @@ -1977,20 +2098,19 @@ } void -linux_destroy_dev(struct linux_cdev *cdev) +linux_destroy_dev(struct linux_cdev *ldev) { - if (cdev->cdev == NULL) + if (ldev->cdev == NULL) return; - atomic_long_dec(&cdev->refs); - - /* wait for all open files to be closed */ - while (atomic_long_read(&cdev->refs) != -1L) - pause("ldevdrn", hz); + MPASS((ldev->siref & LDEV_SI_DTR) == 0); + atomic_set_int(&ldev->siref, LDEV_SI_DTR); + while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0) + pause("ldevdtr", hz / 4); - destroy_dev(cdev->cdev); - cdev->cdev = NULL; + destroy_dev(ldev->cdev); + ldev->cdev = NULL; } const struct kobj_type linux_cdev_ktype = { Index: sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c =================================================================== --- sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c +++ sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c @@ -1335,6 +1335,70 @@ return 0; } +static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + int ret; + struct vm_area_struct *vma; + struct mlx5_ib_vma_private_data *vma_private, *n; + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); + if (!owning_process) + return; + + owning_mm = get_task_mm(owning_process); + if (!owning_mm) { + pr_info("no mm, disassociate ucontext is pending task termination\n"); + while (1) { + put_task_struct(owning_process); + usleep_range(1000, 2000); + owning_process = get_pid_task(ibcontext->tgid, + PIDTYPE_PID); + if (!owning_process || +#if 0 + owning_process->state == TASK_DEAD +#else + owning_process->task_thread->td_proc->p_state == PRS_ZOMBIE +#endif + ) { + pr_info("disassociate ucontext done, task was terminated\n"); + /* in case task was dead need to release the + * task struct. + */ + if (owning_process) + put_task_struct(owning_process); + return; + } + } + } + + /* need to protect from a race on closing the vma as part of + * mlx5_ib_vma_close. + */ + down_write(&owning_mm->mmap_sem); + list_for_each_entry_safe(vma_private, n, &context->vma_private_list, + list) { + vma = vma_private->vma; + ret = zap_vma_ptes(vma, vma->vm_start, + PAGE_SIZE); + WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__); + /* context going to be destroyed, should + * not access ops any more. + */ +#if 0 + vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); +#endif + vma->vm_ops = NULL; + list_del(&vma_private->list); + kfree(vma_private); + } + up_write(&owning_mm->mmap_sem); + mmput(owning_mm); + put_task_struct(owning_process); +} + static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) { switch (cmd) { @@ -3084,6 +3148,8 @@ dev->ib_dev.set_vf_guid = mlx5_ib_set_vf_guid; } + dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext; + mlx5_ib_internal_fill_odp_caps(dev); if (MLX5_CAP_GEN(mdev, imaicl)) {