Index: sys/compat/linux/linux_mmap.c
===================================================================
--- sys/compat/linux/linux_mmap.c
+++ sys/compat/linux/linux_mmap.c
@@ -38,9 +38,11 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -48,6 +50,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -242,6 +245,111 @@
 	return (kern_mprotect(td, addr, len, prot));
 }
 
+/*
+ * Implement Linux madvise(MADV_DONTNEED), which has unusual semantics: for
+ * anonymous memory, pages in the range are immediately discarded.
+ *
+ * The range [start, end) is validated against the calling process's map and
+ * then widened to page boundaries. Submap entries and entries with no backing
+ * object are skipped. Returns 0 on success, or EINVAL if the range is invalid
+ * or a wired entry is encountered; entries preceding the wired one have
+ * already been processed by then.
+ */
+static int
+linux_madvise_dontneed(struct thread *td, vm_offset_t start, vm_offset_t end)
+{
+	vm_map_t map;
+	vm_map_entry_t entry;
+	vm_object_t object;
+	vm_offset_t estart, eend;
+	vm_pindex_t pstart, pend;
+	int error;
+
+	map = &td->td_proc->p_vmspace->vm_map;
+
+	if (!vm_map_check_range(map, start, end))
+		return (EINVAL);
+	start = trunc_page(start);
+	end = round_page(end);
+
+	error = 0;
+	vm_map_lock_read(map);
+	/*
+	 * If the lookup fails, start is not within an entry; begin the walk at
+	 * the successor of the entry it returned.
+	 */
+	if (!vm_map_lookup_entry(map, start, &entry))
+		entry = vm_map_entry_succ(entry);
+	for (; entry->start < end; entry = vm_map_entry_succ(entry)) {
+		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
+			continue;
+
+		if (entry->wired_count != 0) {
+			error = EINVAL;
+			break;
+		}
+
+		object = entry->object.vm_object;
+		if (object == NULL)
+			continue;
+
+		/*
+		 * Clip the request to this entry: [estart, eend) in addresses
+		 * and [pstart, pend) in page indices relative to the object.
+		 */
+		pstart = OFF_TO_IDX(entry->offset);
+		if (start > entry->start) {
+			pstart += atop(start - entry->start);
+			estart = start;
+		} else {
+			estart = entry->start;
+		}
+		pend = OFF_TO_IDX(entry->offset) +
+		    atop(entry->end - entry->start);
+		if (entry->end > end) {
+			pend -= atop(entry->end - end);
+			eend = end;
+		} else {
+			eend = entry->end;
+		}
+
+		if ((object->flags & (OBJ_ANON | OBJ_ONEMAPPING)) !=
+		    (OBJ_ANON | OBJ_ONEMAPPING)) {
+			/*
+			 * Handle shared mappings the same way as native FreeBSD
+			 * madvise(MADV_DONTNEED).
+			 */
+			pmap_advise(map->pmap, estart, eend, MADV_DONTNEED);
+			vm_object_madvise(object, pstart, pend, MADV_DONTNEED);
+		} else {
+			/*
+			 * Singly-mapped anonymous memory is discarded. This
+			 * does not match Linux's semantics when the object
+			 * belongs to a shadow chain of length > 1, since
+			 * subsequent faults may retrieve pages from an
+			 * intermediate anonymous object. However, handling
+			 * this case correctly introduces a fair bit of
+			 * complexity.
+			 */
+			VM_OBJECT_WLOCK(object);
+			vm_object_collapse(object);
+			vm_object_page_remove(object, pstart, pend, 0);
+			/*
+			 * An anonymous backing object survived the collapse,
+			 * so this is the imprecise case described above.
+			 */
+			if (object->backing_object != NULL &&
+			    (object->backing_object->flags & OBJ_ANON) != 0)
+				linux_msg(td,
+				    "possibly incorrect MADV_DONTNEED");
+			VM_OBJECT_WUNLOCK(object);
+		}
+	}
+	vm_map_unlock_read(map);
+
+	return (error);
+}
+
 int
 linux_madvise_common(struct thread *td, uintptr_t addr, size_t len, int l_behav)
 {
@@ -261,12 +369,7 @@
 		behav = MADV_WILLNEED;
 		break;
 	case LINUX_MADV_DONTNEED:
-		/*
-		 * Note that the semantics of Linux MADV_DONTNEED is very
-		 * different from the FreeBSD's madvise(2) flag of the same name.
-		 */
-		behav = MADV_DONTNEED;
-		break;
+		return (linux_madvise_dontneed(td, addr, addr + len));
 	case LINUX_MADV_FREE:
 		behav = MADV_FREE;
 		break;