diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -35,6 +35,7 @@
 #include

 struct vm_snapshot_meta;
+struct vm_get_dirty_page_list;

 #ifdef _KERNEL
 SDT_PROVIDER_DECLARE(vmm);
@@ -287,6 +288,7 @@
 void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);
 int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta);
 int vm_restore_time(struct vm *vm);
+int vm_get_dirty_page_list(struct vm *vm, struct vm_get_dirty_page_list *list);

 #ifdef _SYS__CPUSET_H_
 /*
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -31,6 +31,8 @@
 #ifndef	_VMM_DEV_H_
 #define	_VMM_DEV_H_

+#include "vmm_migration.h"
+
 struct vm_snapshot_meta;

 #ifdef _KERNEL
@@ -257,6 +259,13 @@
 };
 _Static_assert(sizeof(struct vm_readwrite_kernemu_device) == 24, "ABI");

+struct vm_get_dirty_page_list {
+	uint8_t *page_list;
+	size_t num_pages;
+	struct vmm_migration_segment lowmem;
+	struct vmm_migration_segment highmem;
+};
+
 enum {
 	/* general routines */
 	IOCNUM_ABIVERS = 0,
@@ -345,7 +354,8 @@

 	/* checkpoint */
 	IOCNUM_SNAPSHOT_REQ = 113,
-	IOCNUM_RESTORE_TIME = 115
+	IOCNUM_RESTORE_TIME = 115,
+	IOCNUM_VM_GET_DIRTY_PAGE_LIST = 117,
 };

 #define	VM_RUN \
@@ -476,4 +486,6 @@
 	_IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta)
 #define	VM_RESTORE_TIME \
 	_IOWR('v', IOCNUM_RESTORE_TIME, int)
+#define	VM_GET_DIRTY_PAGE_LIST \
+	_IOWR('v', IOCNUM_VM_GET_DIRTY_PAGE_LIST, struct vm_get_dirty_page_list)
 #endif
diff --git a/sys/amd64/include/vmm_migration.h b/sys/amd64/include/vmm_migration.h
new file mode 100644
--- /dev/null
+++ b/sys/amd64/include/vmm_migration.h
@@ -0,0 +1,48 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VMM_MIGRATION_H_
+#define _VMM_MIGRATION_H_
+
+#include <sys/types.h>
+
+/*
+ * A bhyve guest has two memory segments:
+ * - lowmem segment: mapped from 0GB to 3GB (the lowmem limit)
+ * - highmem segment: mapped starting from 4GB
+ * The object that represents a segment is identified by its start and
+ * end addresses.
+ */
+struct vmm_migration_segment {
+	vm_offset_t start;
+	vm_offset_t end;
+};
+
+#endif
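The new ioctl is driven from userspace with one byte in page_list per guest page: lowmem pages first, highmem pages packed immediately after them (that is what the offset computed from list->lowmem.end accomplishes in vm_get_dirty_page_list() below). A minimal consumer could look like the following sketch; the vm_fd descriptor, the 4GB highmem base, and the fetch_dirty_map() helper are illustrative assumptions, not part of this patch.

/*
 * Sketch: retrieve the dirty-page map for a guest. Assumes "vm_fd" is
 * an open /dev/vmm/<name> descriptor, that lowmem starts at guest
 * physical address 0, and that highmem starts at 4GB, mirroring the
 * kernel-side layout described in vmm_migration.h.
 */
#include <sys/types.h>
#include <sys/ioctl.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include <stdint.h>
#include <stdlib.h>

#define	GUEST_PAGE_SIZE	4096UL
#define	HIGHMEM_BASE	(4UL * 1024 * 1024 * 1024)

static uint8_t *
fetch_dirty_map(int vm_fd, size_t lowmem_size, size_t highmem_size)
{
	struct vm_get_dirty_page_list list;
	uint8_t *map;

	list.num_pages = (lowmem_size + highmem_size) / GUEST_PAGE_SIZE;
	map = calloc(list.num_pages, sizeof(uint8_t));
	if (map == NULL)
		return (NULL);

	list.page_list = map;
	list.lowmem.start = 0;
	list.lowmem.end = lowmem_size;
	list.highmem.start = HIGHMEM_BASE;
	list.highmem.end = HIGHMEM_BASE + highmem_size;

	if (ioctl(vm_fd, VM_GET_DIRTY_PAGE_LIST, &list) != 0) {
		free(map);
		return (NULL);
	}
	return (map);
}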
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -65,6 +65,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -73,6 +74,7 @@
 #include
 #include
 #include
+#include

 #include "vmm_ioport.h"
 #include "vmm_ktr.h"
@@ -146,6 +148,9 @@
 };

 #define	VM_MAX_MEMMAPS	8

+#define	MB	(1024UL * 1024)
+#define	GB	(1024UL * MB)
+
 /*
  * Initialization:
  * (o) initialized the first time the VM is created
@@ -2950,4 +2955,90 @@
 	return (0);
 }

-#endif
+
+static inline void
+vm_search_dirty_pages_in_object(vm_object_t object, size_t start, size_t end,
+    size_t offset, uint8_t *page_list)
+{
+	vm_pindex_t pindex;
+	vm_page_t m;
+	uint8_t result;
+
+	for (pindex = start / PAGE_SIZE; pindex < end / PAGE_SIZE; pindex++) {
+		VM_OBJECT_WLOCK(object);
+		m = vm_page_lookup(object, pindex);
+		VM_OBJECT_WUNLOCK(object);
+		if (m != NULL) {
+			result = vm_page_test_vmm_dirty(m);
+			copyout(&result, page_list + pindex - offset,
+			    sizeof(result));
+		}
+	}
+}
+
+int
+vm_get_dirty_page_list(struct vm *vm, struct vm_get_dirty_page_list *list)
+{
+	struct vmspace *vm_vmspace;
+	struct vm_map *vmmap;
+	struct vm_map_entry *entry;
+	struct vm_object *object;
+	uint8_t *page_list;
+	size_t offset;
+
+	page_list = list->page_list;
+	if (page_list == NULL)
+		return (-1);
+
+	vm_vmspace = vm->vmspace;
+	if (vm_vmspace == NULL) {
+		printf("%s: vm_vmspace is null\n", __func__);
+		return (-1);
+	}
+
+	vmmap = &vm_vmspace->vm_map;
+
+	vm_map_lock(vmmap);
+	if (vmmap->busy)
+		vm_map_wait_busy(vmmap);
+
+	for (entry = vmmap->header.right; entry != &vmmap->header;
+	    entry = entry->right) {
+		object = entry->object.vm_object;
+		if (object == NULL)
+			continue;
+
+		if (entry->start == list->lowmem.start &&
+		    entry->end == list->lowmem.end) {
+			/* The entry maps the guest's lowmem segment. */
+			vm_search_dirty_pages_in_object(object,
+			    list->lowmem.start, list->lowmem.end, 0,
+			    page_list);
+		}
+
+		if (entry->start == list->highmem.start &&
+		    entry->end == list->highmem.end) {
+			/*
+			 * The entry maps the guest's highmem segment.
+			 * Skip the hole between the lowmem limit and 4GB
+			 * so that highmem pages land immediately after
+			 * the lowmem pages in the page list.
+			 */
+			offset = (list->highmem.start - list->lowmem.end) /
+			    PAGE_SIZE;
+			vm_search_dirty_pages_in_object(object,
+			    list->highmem.start, list->highmem.end, offset,
+			    page_list);
+		}
+	}
+
+	vm_map_unlock(vmmap);
+
+	return (0);
+}
+
+#endif /* BHYVE_SNAPSHOT */
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -59,6 +59,7 @@
 #include
 #include
 #include
+#include

 #include "vmm_lapic.h"
 #include "vmm_stat.h"
@@ -410,6 +411,7 @@
 	int *regnums;
 #ifdef BHYVE_SNAPSHOT
 	struct vm_snapshot_meta *snapshot_meta;
+	struct vm_get_dirty_page_list *page_list;
 #endif

 	error = vmm_priv_check(curthread->td_ucred);
@@ -903,6 +905,10 @@
 	case VM_RESTORE_TIME:
 		error = vm_restore_time(sc->vm);
 		break;
+	case VM_GET_DIRTY_PAGE_LIST:
+		page_list = (struct vm_get_dirty_page_list *)data;
+		error = vm_get_dirty_page_list(sc->vm, page_list);
+		break;
 #endif
 	default:
 		error = ENOTTY;
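Building on the earlier sketch, an iterative pre-copy loop would repeatedly fetch the map and resend only the pages marked dirty, stopping once the dirty set stabilizes. Everything here is hypothetical scaffolding (send_page(), MAX_PRECOPY_ROUNDS, DIRTY_THRESHOLD); the patch provides only the dirty-page query itself.

/*
 * Sketch of an iterative pre-copy loop. send_page() stands in for a
 * hypothetical transport that reads the page from the mapped guest
 * memory and ships it to the destination host.
 */
#define	MAX_PRECOPY_ROUNDS	8
#define	DIRTY_THRESHOLD		64	/* pages */

extern int send_page(int vm_fd, size_t pindex);	/* hypothetical */

static int
precopy_memory(int vm_fd, size_t lowmem_size, size_t highmem_size)
{
	uint8_t *map;
	size_t i, ndirty, npages;
	int round;

	npages = (lowmem_size + highmem_size) / GUEST_PAGE_SIZE;
	for (round = 0; round < MAX_PRECOPY_ROUNDS; round++) {
		map = fetch_dirty_map(vm_fd, lowmem_size, highmem_size);
		if (map == NULL)
			return (-1);

		ndirty = 0;
		for (i = 0; i < npages; i++) {
			if (map[i] != 0) {
				if (send_page(vm_fd, i) != 0) {
					free(map);
					return (-1);
				}
				ndirty++;
			}
		}
		free(map);

		/* Stop once a round touches few enough pages. */
		if (ndirty < DIRTY_THRESHOLD)
			break;
	}
	return (0);
}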
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -392,6 +392,8 @@
     vm_size_t length, uint8_t queue);
 struct vnode *vm_object_vnode(vm_object_t object);
 bool vm_object_is_active(vm_object_t obj);
+int vm_object_get_page(vm_object_t object, vm_pindex_t pindex, void *dst);
+int vm_object_set_page(vm_object_t object, vm_pindex_t pindex, void *src);

 #endif /* _KERNEL */

 #endif /* _VM_OBJECT_ */
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -2686,6 +2686,51 @@
     CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_vm_object_list_swap, "S,kinfo_vmobject",
     "List of swap VM objects");

+int
+vm_object_get_page(vm_object_t object, vm_pindex_t pindex, void *dst)
+{
+	vm_page_t page;
+	vm_offset_t page_src;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	page = vm_page_lookup(object, pindex);
+	if (page == NULL) {
+		/* The page is not resident. */
+		return (-1);
+	}
+
+	if (vm_page_tryxbusy(page) == 0)
+		return (-1);
+
+	/* Clear the migration-dirty state before copying the page out. */
+	page->oflags &= ~VPO_VMM_DIRTY;
+	pmap_clear_modify(page);
+
+	page_src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(page));
+	copyout((void *)page_src, dst, PAGE_SIZE);
+	vm_page_xunbusy(page);
+
+	return (0);
+}
+
+int
+vm_object_set_page(vm_object_t object, vm_pindex_t pindex, void *src)
+{
+	vm_page_t page;
+	vm_offset_t page_dst;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	page = vm_page_lookup(object, pindex);
+	if (page == NULL) {
+		/* The page is not resident. */
+		return (-1);
+	}
+
+	page_dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(page));
+	copyin(src, (void *)page_dst, PAGE_SIZE);
+
+	return (0);
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -295,6 +295,7 @@
 #define	VPO_SWAPSLEEP	0x02		/* waiting for swap to finish */
 #define	VPO_UNMANAGED	0x04		/* no PV management for page */
 #define	VPO_SWAPINPROG	0x08		/* swap I/O in progress on page */
+#define	VPO_VMM_DIRTY	0x80		/* dirty bit used for bhyve migration */

 /*
  * Busy page implementation details.
@@ -705,6 +706,7 @@
 void vm_page_valid(vm_page_t m);
 int vm_page_is_valid(vm_page_t, int, int);
 void vm_page_test_dirty(vm_page_t);
+uint8_t vm_page_test_vmm_dirty(vm_page_t m);
 vm_page_bits_t vm_page_bits(int base, int size);
 void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid);
 void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count);
@@ -890,6 +892,7 @@
 	vm_page_dirty_KBI(m);
 #else
 	m->dirty = VM_PAGE_BITS_ALL;
+	m->oflags |= VPO_VMM_DIRTY;
 #endif
 }
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1416,6 +1416,25 @@
 	/* Refer to this operation by its public name. */
 	KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!"));
 	m->dirty = VM_PAGE_BITS_ALL;
+	m->oflags |= VPO_VMM_DIRTY;
+}
+
+uint8_t
+vm_page_test_vmm_dirty(vm_page_t m)
+{
+	uint8_t value;
+
+	/* Refresh m->dirty from the pmap before testing the flag. */
+	if (vm_page_tryxbusy(m)) {
+		vm_page_test_dirty(m);
+		vm_page_xunbusy(m);
+	}
+
+	value = m->oflags & VPO_VMM_DIRTY;
+	if (value == 0 && pmap_is_modified(m))
+		value = 1;
+
+	return (value != 0);
 }

 /*
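Note how the pieces fit together: vm_page_dirty() now latches VPO_VMM_DIRTY whenever a page is dirtied, vm_page_test_vmm_dirty() reports a page as dirty if either that flag or the pmap modified bit is set, and vm_object_get_page() clears both while copying the page out, so the next VM_GET_DIRTY_PAGE_LIST pass reports only pages re-dirtied since the last copy. Because vm_object_get_page() asserts the object write lock, a kernel-side caller would look roughly like the sketch below; copy_guest_page() is illustrative, not part of the patch.

/*
 * Sketch: copy one resident guest page into a userspace buffer and
 * reset its migration-dirty state, holding the object write lock as
 * vm_object_get_page() requires.
 */
static int
copy_guest_page(vm_object_t obj, vm_pindex_t pindex, void *udst)
{
	int error;

	VM_OBJECT_WLOCK(obj);
	error = vm_object_get_page(obj, pindex, udst);
	VM_OBJECT_WUNLOCK(obj);

	return (error);
}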