Index: head/emulators/xen-kernel/Makefile =================================================================== --- head/emulators/xen-kernel/Makefile +++ head/emulators/xen-kernel/Makefile @@ -2,7 +2,7 @@ PORTNAME= xen PORTVERSION= 4.11.1 -PORTREVISION= 0 +PORTREVISION= 1 CATEGORIES= emulators MASTER_SITES= http://downloads.xenproject.org/release/xen/${PORTVERSION}/ PKGNAMESUFFIX= -kernel @@ -45,6 +45,29 @@ EXTRA_PATCHES+= ${FILESDIR}/0001-x86-replace-usage-in-the-linker-script.patch:-p1 # Fix PVH Dom0 build with shadow paging EXTRA_PATCHES+= ${FILESDIR}/0001-x86-pvh-change-the-order-of-the-iommu-initialization.patch:-p1 +# Forward dom0 lapic EOIs to underlying hardware +EXTRA_PATCHES+= ${FILESDIR}/0001-x86-dom0-propagate-PVH-vlapic-EOIs-to-hardware.patch:-p1 +# Fix deadlock in IO-APIC gsi mapping +EXTRA_PATCHES+= ${FILESDIR}/0001-pvh-dom0-fix-deadlock-in-GSI-mapping.patch:-p1 +# Fix for migration/save +EXTRA_PATCHES+= ${FILESDIR}/0001-x86-mm-locks-remove-trailing-whitespace.patch:-p1 \ + ${FILESDIR}/0002-x86-mm-locks-convert-some-macros-to-inline-functions.patch:-p1 \ + ${FILESDIR}/0003-x86-mm-locks-apply-a-bias-to-lock-levels-for-control.patch:-p1 + +# XSA-284 +EXTRA_PATCHES+= ${FILESDIR}/xsa284.patch:-p1 +# XSA-287 +EXTRA_PATCHES+= ${FILESDIR}/xsa287-4.11.patch:-p1 +# XSA-290 +EXTRA_PATCHES+= ${FILESDIR}/xsa290-4.11-1.patch:-p1 \ + ${FILESDIR}/xsa290-4.11-2.patch:-p1 +# XSA-292 +EXTRA_PATCHES+= ${FILESDIR}/xsa292.patch:-p1 +# XSA-293 +EXTRA_PATCHES+= ${FILESDIR}/xsa293-4.11-1.patch:-p1 \ + ${FILESDIR}/xsa293-4.11-2.patch:-p1 +# XSA-294 +EXTRA_PATCHES+= ${FILESDIR}/xsa294-4.11.patch:-p1 .include Index: head/emulators/xen-kernel/files/0001-pvh-dom0-fix-deadlock-in-GSI-mapping.patch =================================================================== --- head/emulators/xen-kernel/files/0001-pvh-dom0-fix-deadlock-in-GSI-mapping.patch +++ head/emulators/xen-kernel/files/0001-pvh-dom0-fix-deadlock-in-GSI-mapping.patch @@ -0,0 +1,115 @@ +From 603ad88fe8a681a2c5408c3f432d7083dd1c41b1 Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Mon, 28 Jan 2019 15:22:45 +0100 +Subject: [PATCH] pvh/dom0: fix deadlock in GSI mapping +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The current GSI mapping code can cause the following deadlock: + +(XEN) *** Dumping CPU0 host state: *** +(XEN) ----[ Xen-4.12.0-rc x86_64 debug=y Tainted: C ]---- +[...] +(XEN) Xen call trace: +(XEN) [] vmac.c#_spin_lock_cb+0x32/0x70 +(XEN) [] vmac.c#hvm_gsi_assert+0x2f/0x60 <- pick hvm.irq_lock +(XEN) [] io.c#hvm_dirq_assist+0xd9/0x130 <- pick event_lock +(XEN) [] io.c#dpci_softirq+0xdb/0x120 +(XEN) [] softirq.c#__do_softirq+0x46/0xa0 +(XEN) [] domain.c#idle_loop+0x35/0x90 +(XEN) +[...] +(XEN) *** Dumping CPU3 host state: *** +(XEN) ----[ Xen-4.12.0-rc x86_64 debug=y Tainted: C ]---- +[...] 
+(XEN) Xen call trace: +(XEN) [] vmac.c#_spin_lock_cb+0x3d/0x70 +(XEN) [] vmac.c#allocate_and_map_gsi_pirq+0xc8/0x130 <- pick event_lock +(XEN) [] vioapic.c#vioapic_hwdom_map_gsi+0x80/0x130 +(XEN) [] vioapic.c#vioapic_write_redirent+0x119/0x1c0 <- pick hvm.irq_lock +(XEN) [] vioapic.c#vioapic_write+0x35/0x40 +(XEN) [] vmac.c#hvm_process_io_intercept+0xd2/0x230 +(XEN) [] vmac.c#hvm_io_intercept+0x22/0x50 +(XEN) [] emulate.c#hvmemul_do_io+0x21b/0x3c0 +(XEN) [] emulate.c#hvmemul_do_io_buffer+0x32/0x70 +(XEN) [] emulate.c#hvmemul_do_mmio_buffer+0x29/0x30 +(XEN) [] emulate.c#hvmemul_phys_mmio_access+0xf9/0x1b0 +(XEN) [] emulate.c#hvmemul_linear_mmio_access+0xf0/0x180 +(XEN) [] emulate.c#hvmemul_linear_mmio_write+0x21/0x30 +(XEN) [] emulate.c#linear_write+0xa2/0x100 +(XEN) [] emulate.c#hvmemul_write+0xb5/0x120 +(XEN) [] vmac.c#x86_emulate+0x132aa/0x149a0 +(XEN) [] vmac.c#x86_emulate_wrapper+0x29/0x70 +(XEN) [] emulate.c#_hvm_emulate_one+0x50/0x140 +(XEN) [] vmac.c#hvm_emulate_one_insn+0x41/0x100 +(XEN) [] guest_4.o#sh_page_fault__guest_4+0x976/0xd30 +(XEN) [] vmac.c#vmx_vmexit_handler+0x949/0xea0 +(XEN) [] vmac.c#vmx_asm_vmexit_handler+0xfa/0x270 + +In order to solve it move the vioapic_hwdom_map_gsi outside of the +locked region in vioapic_write_redirent. vioapic_hwdom_map_gsi will +not access any of the vioapic fields, so there's no need to call the +function holding the hvm.irq_lock. + +Signed-off-by: Roger Pau Monné +Reviewed-by: Wei Liu +Reviewed-by: Jan Beulich +Release-acked-by: Juergen Gross +--- + xen/arch/x86/hvm/vioapic.c | 32 ++++++++++++++++++-------------- + 1 file changed, 18 insertions(+), 14 deletions(-) + +diff --git a/xen/arch/x86/hvm/vioapic.c b/xen/arch/x86/hvm/vioapic.c +index 2b74f92d51..2d71c33c1c 100644 +--- a/xen/arch/x86/hvm/vioapic.c ++++ b/xen/arch/x86/hvm/vioapic.c +@@ -236,20 +236,6 @@ static void vioapic_write_redirent( + + *pent = ent; + +- if ( is_hardware_domain(d) && unmasked ) +- { +- int ret; +- +- ret = vioapic_hwdom_map_gsi(gsi, ent.fields.trig_mode, +- ent.fields.polarity); +- if ( ret ) +- { +- /* Mask the entry again. */ +- pent->fields.mask = 1; +- unmasked = 0; +- } +- } +- + if ( gsi == 0 ) + { + vlapic_adjust_i8259_target(d); +@@ -266,6 +252,24 @@ static void vioapic_write_redirent( + + spin_unlock(&d->arch.hvm.irq_lock); + ++ if ( is_hardware_domain(d) && unmasked ) ++ { ++ /* ++ * NB: don't call vioapic_hwdom_map_gsi while holding hvm.irq_lock ++ * since it can cause deadlocks as event_lock is taken by ++ * allocate_and_map_gsi_pirq, and that will invert the locking order ++ * used by other parts of the code. 
++ */ ++ int ret = vioapic_hwdom_map_gsi(gsi, ent.fields.trig_mode, ++ ent.fields.polarity); ++ if ( ret ) ++ { ++ gprintk(XENLOG_ERR, ++ "unable to bind gsi %u to hardware domain: %d\n", gsi, ret); ++ unmasked = 0; ++ } ++ } ++ + if ( gsi == 0 || unmasked ) + pt_may_unmask_irq(d, NULL); + } +-- +2.17.2 (Apple Git-113) + Index: head/emulators/xen-kernel/files/0001-x86-dom0-propagate-PVH-vlapic-EOIs-to-hardware.patch =================================================================== --- head/emulators/xen-kernel/files/0001-x86-dom0-propagate-PVH-vlapic-EOIs-to-hardware.patch +++ head/emulators/xen-kernel/files/0001-x86-dom0-propagate-PVH-vlapic-EOIs-to-hardware.patch @@ -0,0 +1,39 @@ +From 19d2bce1c3cbfdc636c142cdf0ae38795f2202dd Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Thu, 14 Feb 2019 14:41:03 +0100 +Subject: [PATCH for-4.12] x86/dom0: propagate PVH vlapic EOIs to hardware +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Current check for MSI EIO is missing a special case for PVH Dom0, +which doesn't have a hvm_irq_dpci struct but requires EIOs to be +forwarded to the physical lapic for passed-through devices. + +Add a short-circuit to allow EOIs from PVH Dom0 to be propagated. + +Signed-off-by: Roger Pau Monné +--- +Cc: Jan Beulich +Cc: Juergen Gross +--- + xen/drivers/passthrough/io.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c +index a6eb8a4336..4290c7c710 100644 +--- a/xen/drivers/passthrough/io.c ++++ b/xen/drivers/passthrough/io.c +@@ -869,7 +869,8 @@ static int _hvm_dpci_msi_eoi(struct domain *d, + + void hvm_dpci_msi_eoi(struct domain *d, int vector) + { +- if ( !iommu_enabled || !hvm_domain_irq(d)->dpci ) ++ if ( !iommu_enabled || ++ (!hvm_domain_irq(d)->dpci && !is_hardware_domain(d)) ) + return; + + spin_lock(&d->event_lock); +-- +2.17.2 (Apple Git-113) + Index: head/emulators/xen-kernel/files/0001-x86-mm-locks-remove-trailing-whitespace.patch =================================================================== --- head/emulators/xen-kernel/files/0001-x86-mm-locks-remove-trailing-whitespace.patch +++ head/emulators/xen-kernel/files/0001-x86-mm-locks-remove-trailing-whitespace.patch @@ -0,0 +1,101 @@ +From 468937da985661e5cd1d6b2df6d6ab2d1fb1e5e4 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 12 Mar 2019 12:21:03 +0100 +Subject: [PATCH 1/3] x86/mm-locks: remove trailing whitespace +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +No functional change. + +Signed-off-by: Roger Pau Monné +Reviewed-by: George Dunlap +--- + xen/arch/x86/mm/mm-locks.h | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h +index e5fceb2d2e..6c15b9a4cc 100644 +--- a/xen/arch/x86/mm/mm-locks.h ++++ b/xen/arch/x86/mm/mm-locks.h +@@ -3,11 +3,11 @@ + * + * Spinlocks used by the code in arch/x86/mm. + * +- * Copyright (c) 2011 Citrix Systems, inc. ++ * Copyright (c) 2011 Citrix Systems, inc. + * Copyright (c) 2007 Advanced Micro Devices (Wei Huang) + * Copyright (c) 2006-2007 XenSource Inc. 
+ * Copyright (c) 2006 Michael A Fetterman +- * ++ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or +@@ -41,7 +41,7 @@ static inline void mm_lock_init(mm_lock_t *l) + l->unlock_level = 0; + } + +-static inline int mm_locked_by_me(mm_lock_t *l) ++static inline int mm_locked_by_me(mm_lock_t *l) + { + return (l->lock.recurse_cpu == current->processor); + } +@@ -67,7 +67,7 @@ do { \ + + static inline void _mm_lock(mm_lock_t *l, const char *func, int level, int rec) + { +- if ( !((mm_locked_by_me(l)) && rec) ) ++ if ( !((mm_locked_by_me(l)) && rec) ) + __check_lock_level(level); + spin_lock_recursive(&l->lock); + if ( l->lock.recurse_cnt == 1 ) +@@ -186,7 +186,7 @@ static inline void mm_unlock(mm_lock_t *l) + spin_unlock_recursive(&l->lock); + } + +-static inline void mm_enforce_order_unlock(int unlock_level, ++static inline void mm_enforce_order_unlock(int unlock_level, + unsigned short *recurse_count) + { + if ( recurse_count ) +@@ -310,7 +310,7 @@ declare_mm_rwlock(altp2m); + #define gfn_locked_by_me(p,g) p2m_locked_by_me(p) + + /* PoD lock (per-p2m-table) +- * ++ * + * Protects private PoD data structs: entry and cache + * counts, page lists, sweep parameters. */ + +@@ -322,7 +322,7 @@ declare_mm_lock(pod) + + /* Page alloc lock (per-domain) + * +- * This is an external lock, not represented by an mm_lock_t. However, ++ * This is an external lock, not represented by an mm_lock_t. However, + * pod code uses it in conjunction with the p2m lock, and expecting + * the ordering which we enforce here. + * The lock is not recursive. */ +@@ -338,13 +338,13 @@ declare_mm_order_constraint(page_alloc) + * For shadow pagetables, this lock protects + * - all changes to shadow page table pages + * - the shadow hash table +- * - the shadow page allocator ++ * - the shadow page allocator + * - all changes to guest page table pages + * - all changes to the page_info->tlbflush_timestamp +- * - the page_info->count fields on shadow pages +- * +- * For HAP, it protects the NPT/EPT tables and mode changes. +- * ++ * - the page_info->count fields on shadow pages ++ * ++ * For HAP, it protects the NPT/EPT tables and mode changes. ++ * + * It also protects the log-dirty bitmap from concurrent accesses (and + * teardowns, etc). */ + +-- +2.17.2 (Apple Git-113) + Index: head/emulators/xen-kernel/files/0002-x86-mm-locks-convert-some-macros-to-inline-functions.patch =================================================================== --- head/emulators/xen-kernel/files/0002-x86-mm-locks-convert-some-macros-to-inline-functions.patch +++ head/emulators/xen-kernel/files/0002-x86-mm-locks-convert-some-macros-to-inline-functions.patch @@ -0,0 +1,210 @@ +From 45e260afe7ee0e6b18a7e64173a081eec6e056aa Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Tue, 12 Mar 2019 12:24:37 +0100 +Subject: [PATCH 2/3] x86/mm-locks: convert some macros to inline functions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +And rename to have only one prefix underscore where applicable. + +No functional change. 
+ +Signed-off-by: Roger Pau Monné +Reviewed-by: George Dunlap +--- + xen/arch/x86/mm/mm-locks.h | 98 ++++++++++++++++++++------------------ + 1 file changed, 52 insertions(+), 46 deletions(-) + +diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h +index 6c15b9a4cc..d3497713e9 100644 +--- a/xen/arch/x86/mm/mm-locks.h ++++ b/xen/arch/x86/mm/mm-locks.h +@@ -29,7 +29,6 @@ + + /* Per-CPU variable for enforcing the lock ordering */ + DECLARE_PER_CPU(int, mm_lock_level); +-#define __get_lock_level() (this_cpu(mm_lock_level)) + + DECLARE_PERCPU_RWLOCK_GLOBAL(p2m_percpu_rwlock); + +@@ -46,43 +45,47 @@ static inline int mm_locked_by_me(mm_lock_t *l) + return (l->lock.recurse_cpu == current->processor); + } + ++static inline int _get_lock_level(void) ++{ ++ return this_cpu(mm_lock_level); ++} ++ + /* + * If you see this crash, the numbers printed are order levels defined + * in this file. + */ +-#define __check_lock_level(l) \ +-do { \ +- if ( unlikely(__get_lock_level() > (l)) ) \ +- { \ +- printk("mm locking order violation: %i > %i\n", \ +- __get_lock_level(), (l)); \ +- BUG(); \ +- } \ +-} while(0) +- +-#define __set_lock_level(l) \ +-do { \ +- __get_lock_level() = (l); \ +-} while(0) ++static inline void _check_lock_level(int l) ++{ ++ if ( unlikely(_get_lock_level() > l) ) ++ { ++ printk("mm locking order violation: %i > %i\n", _get_lock_level(), l); ++ BUG(); ++ } ++} ++ ++static inline void _set_lock_level(int l) ++{ ++ this_cpu(mm_lock_level) = l; ++} + + static inline void _mm_lock(mm_lock_t *l, const char *func, int level, int rec) + { + if ( !((mm_locked_by_me(l)) && rec) ) +- __check_lock_level(level); ++ _check_lock_level(level); + spin_lock_recursive(&l->lock); + if ( l->lock.recurse_cnt == 1 ) + { + l->locker_function = func; +- l->unlock_level = __get_lock_level(); ++ l->unlock_level = _get_lock_level(); + } + else if ( (unlikely(!rec)) ) +- panic("mm lock already held by %s", l->locker_function); +- __set_lock_level(level); ++ panic("mm lock already held by %s\n", l->locker_function); ++ _set_lock_level(level); + } + + static inline void _mm_enforce_order_lock_pre(int level) + { +- __check_lock_level(level); ++ _check_lock_level(level); + } + + static inline void _mm_enforce_order_lock_post(int level, int *unlock_level, +@@ -92,12 +95,12 @@ static inline void _mm_enforce_order_lock_post(int level, int *unlock_level, + { + if ( (*recurse_count)++ == 0 ) + { +- *unlock_level = __get_lock_level(); ++ *unlock_level = _get_lock_level(); + } + } else { +- *unlock_level = __get_lock_level(); ++ *unlock_level = _get_lock_level(); + } +- __set_lock_level(level); ++ _set_lock_level(level); + } + + +@@ -118,12 +121,12 @@ static inline void _mm_write_lock(mm_rwlock_t *l, const char *func, int level) + { + if ( !mm_write_locked_by_me(l) ) + { +- __check_lock_level(level); ++ _check_lock_level(level); + percpu_write_lock(p2m_percpu_rwlock, &l->lock); + l->locker = get_processor_id(); + l->locker_function = func; +- l->unlock_level = __get_lock_level(); +- __set_lock_level(level); ++ l->unlock_level = _get_lock_level(); ++ _set_lock_level(level); + } + l->recurse_count++; + } +@@ -134,13 +137,13 @@ static inline void mm_write_unlock(mm_rwlock_t *l) + return; + l->locker = -1; + l->locker_function = "nobody"; +- __set_lock_level(l->unlock_level); ++ _set_lock_level(l->unlock_level); + percpu_write_unlock(p2m_percpu_rwlock, &l->lock); + } + + static inline void _mm_read_lock(mm_rwlock_t *l, int level) + { +- __check_lock_level(level); ++ _check_lock_level(level); + 
percpu_read_lock(p2m_percpu_rwlock, &l->lock); + /* There's nowhere to store the per-CPU unlock level so we can't + * set the lock level. */ +@@ -181,7 +184,7 @@ static inline void mm_unlock(mm_lock_t *l) + if ( l->lock.recurse_cnt == 1 ) + { + l->locker_function = "nobody"; +- __set_lock_level(l->unlock_level); ++ _set_lock_level(l->unlock_level); + } + spin_unlock_recursive(&l->lock); + } +@@ -194,10 +197,10 @@ static inline void mm_enforce_order_unlock(int unlock_level, + BUG_ON(*recurse_count == 0); + if ( (*recurse_count)-- == 1 ) + { +- __set_lock_level(unlock_level); ++ _set_lock_level(unlock_level); + } + } else { +- __set_lock_level(unlock_level); ++ _set_lock_level(unlock_level); + } + } + +@@ -287,21 +290,24 @@ declare_mm_lock(altp2mlist) + + #define MM_LOCK_ORDER_altp2m 40 + declare_mm_rwlock(altp2m); +-#define p2m_lock(p) \ +- do { \ +- if ( p2m_is_altp2m(p) ) \ +- mm_write_lock(altp2m, &(p)->lock); \ +- else \ +- mm_write_lock(p2m, &(p)->lock); \ +- (p)->defer_flush++; \ +- } while (0) +-#define p2m_unlock(p) \ +- do { \ +- if ( --(p)->defer_flush == 0 ) \ +- p2m_unlock_and_tlb_flush(p); \ +- else \ +- mm_write_unlock(&(p)->lock); \ +- } while (0) ++ ++static inline void p2m_lock(struct p2m_domain *p) ++{ ++ if ( p2m_is_altp2m(p) ) ++ mm_write_lock(altp2m, &p->lock); ++ else ++ mm_write_lock(p2m, &p->lock); ++ p->defer_flush++; ++} ++ ++static inline void p2m_unlock(struct p2m_domain *p) ++{ ++ if ( --p->defer_flush == 0 ) ++ p2m_unlock_and_tlb_flush(p); ++ else ++ mm_write_unlock(&p->lock); ++} ++ + #define gfn_lock(p,g,o) p2m_lock(p) + #define gfn_unlock(p,g,o) p2m_unlock(p) + #define p2m_read_lock(p) mm_read_lock(p2m, &(p)->lock) +-- +2.17.2 (Apple Git-113) + Index: head/emulators/xen-kernel/files/0003-x86-mm-locks-apply-a-bias-to-lock-levels-for-control.patch =================================================================== --- head/emulators/xen-kernel/files/0003-x86-mm-locks-apply-a-bias-to-lock-levels-for-control.patch +++ head/emulators/xen-kernel/files/0003-x86-mm-locks-apply-a-bias-to-lock-levels-for-control.patch @@ -0,0 +1,319 @@ +From efce89c1df5969486bef82eec05223a4a6522d2d Mon Sep 17 00:00:00 2001 +From: Roger Pau Monne +Date: Tue, 12 Mar 2019 12:25:21 +0100 +Subject: [PATCH 3/3] x86/mm-locks: apply a bias to lock levels for control + domain +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +paging_log_dirty_op function takes mm locks from a subject domain and +then attempts to perform copy to operations against the caller domain +in order to copy the result of the hypercall into the caller provided +buffer. + +This works fine when the caller is a non-paging domain, but triggers a +lock order panic when the caller is a paging domain due to the fact +that at the point where the copy to operation is performed the subject +domain paging lock is locked, and the copy operation requires +locking the caller p2m lock which has a lower level. + +Fix this limitation by adding a bias to the level of control domain mm +locks, so that the lower control domain mm lock always has a level +greater than the higher unprivileged domain lock level. This allows +locking the subject domain mm locks and then locking the control +domain mm locks, while keeping the same lock ordering and the changes +mostly confined to mm-locks.h. + +Note that so far only this flow (locking a subject domain locks and +then the control domain ones) has been identified, but not all +possible code paths have been inspected. 
Hence this solution attempts +to be a non-intrusive fix for the problem at hand, without discarding +further changes in the future if other valid code paths are found that +require more complex lock level ordering. + +Signed-off-by: Roger Pau Monné +Reviewed-by: George Dunlap +--- + xen/arch/x86/mm/mm-locks.h | 119 +++++++++++++++++++++++-------------- + xen/arch/x86/mm/p2m-pod.c | 5 +- + 2 files changed, 78 insertions(+), 46 deletions(-) + +diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h +index d3497713e9..d6c073dc5c 100644 +--- a/xen/arch/x86/mm/mm-locks.h ++++ b/xen/arch/x86/mm/mm-locks.h +@@ -50,15 +50,35 @@ static inline int _get_lock_level(void) + return this_cpu(mm_lock_level); + } + ++#define MM_LOCK_ORDER_MAX 64 ++/* ++ * Return the lock level taking the domain bias into account. If the domain is ++ * privileged a bias of MM_LOCK_ORDER_MAX is applied to the lock level, so that ++ * mm locks that belong to a control domain can be acquired after having ++ * acquired mm locks of an unprivileged domain. ++ * ++ * This is required in order to use some hypercalls from a paging domain that ++ * take locks of a subject domain and then attempt to copy data to/from the ++ * caller domain. ++ */ ++static inline int _lock_level(const struct domain *d, int l) ++{ ++ ASSERT(l <= MM_LOCK_ORDER_MAX); ++ ++ return l + (d && is_control_domain(d) ? MM_LOCK_ORDER_MAX : 0); ++} ++ + /* + * If you see this crash, the numbers printed are order levels defined + * in this file. + */ +-static inline void _check_lock_level(int l) ++static inline void _check_lock_level(const struct domain *d, int l) + { +- if ( unlikely(_get_lock_level() > l) ) ++ int lvl = _lock_level(d, l); ++ ++ if ( unlikely(_get_lock_level() > lvl) ) + { +- printk("mm locking order violation: %i > %i\n", _get_lock_level(), l); ++ printk("mm locking order violation: %i > %i\n", _get_lock_level(), lvl); + BUG(); + } + } +@@ -68,10 +88,11 @@ static inline void _set_lock_level(int l) + this_cpu(mm_lock_level) = l; + } + +-static inline void _mm_lock(mm_lock_t *l, const char *func, int level, int rec) ++static inline void _mm_lock(const struct domain *d, mm_lock_t *l, ++ const char *func, int level, int rec) + { + if ( !((mm_locked_by_me(l)) && rec) ) +- _check_lock_level(level); ++ _check_lock_level(d, level); + spin_lock_recursive(&l->lock); + if ( l->lock.recurse_cnt == 1 ) + { +@@ -80,16 +101,17 @@ static inline void _mm_lock(mm_lock_t *l, const char *func, int level, int rec) + } + else if ( (unlikely(!rec)) ) + panic("mm lock already held by %s\n", l->locker_function); +- _set_lock_level(level); ++ _set_lock_level(_lock_level(d, level)); + } + +-static inline void _mm_enforce_order_lock_pre(int level) ++static inline void _mm_enforce_order_lock_pre(const struct domain *d, int level) + { +- _check_lock_level(level); ++ _check_lock_level(d, level); + } + +-static inline void _mm_enforce_order_lock_post(int level, int *unlock_level, +- unsigned short *recurse_count) ++static inline void _mm_enforce_order_lock_post(const struct domain *d, int level, ++ int *unlock_level, ++ unsigned short *recurse_count) + { + if ( recurse_count ) + { +@@ -100,7 +122,7 @@ static inline void _mm_enforce_order_lock_post(int level, int *unlock_level, + } else { + *unlock_level = _get_lock_level(); + } +- _set_lock_level(level); ++ _set_lock_level(_lock_level(d, level)); + } + + +@@ -117,16 +139,17 @@ static inline int mm_write_locked_by_me(mm_rwlock_t *l) + return (l->locker == get_processor_id()); + } + +-static inline void 
_mm_write_lock(mm_rwlock_t *l, const char *func, int level) ++static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, ++ const char *func, int level) + { + if ( !mm_write_locked_by_me(l) ) + { +- _check_lock_level(level); ++ _check_lock_level(d, level); + percpu_write_lock(p2m_percpu_rwlock, &l->lock); + l->locker = get_processor_id(); + l->locker_function = func; + l->unlock_level = _get_lock_level(); +- _set_lock_level(level); ++ _set_lock_level(_lock_level(d, level)); + } + l->recurse_count++; + } +@@ -141,9 +164,10 @@ static inline void mm_write_unlock(mm_rwlock_t *l) + percpu_write_unlock(p2m_percpu_rwlock, &l->lock); + } + +-static inline void _mm_read_lock(mm_rwlock_t *l, int level) ++static inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, ++ int level) + { +- _check_lock_level(level); ++ _check_lock_level(d, level); + percpu_read_lock(p2m_percpu_rwlock, &l->lock); + /* There's nowhere to store the per-CPU unlock level so we can't + * set the lock level. */ +@@ -156,28 +180,32 @@ static inline void mm_read_unlock(mm_rwlock_t *l) + + /* This wrapper uses the line number to express the locking order below */ + #define declare_mm_lock(name) \ +- static inline void mm_lock_##name(mm_lock_t *l, const char *func, int rec)\ +- { _mm_lock(l, func, MM_LOCK_ORDER_##name, rec); } ++ static inline void mm_lock_##name(const struct domain *d, mm_lock_t *l, \ ++ const char *func, int rec) \ ++ { _mm_lock(d, l, func, MM_LOCK_ORDER_##name, rec); } + #define declare_mm_rwlock(name) \ +- static inline void mm_write_lock_##name(mm_rwlock_t *l, const char *func) \ +- { _mm_write_lock(l, func, MM_LOCK_ORDER_##name); } \ +- static inline void mm_read_lock_##name(mm_rwlock_t *l) \ +- { _mm_read_lock(l, MM_LOCK_ORDER_##name); } ++ static inline void mm_write_lock_##name(const struct domain *d, \ ++ mm_rwlock_t *l, const char *func) \ ++ { _mm_write_lock(d, l, func, MM_LOCK_ORDER_##name); } \ ++ static inline void mm_read_lock_##name(const struct domain *d, \ ++ mm_rwlock_t *l) \ ++ { _mm_read_lock(d, l, MM_LOCK_ORDER_##name); } + /* These capture the name of the calling function */ +-#define mm_lock(name, l) mm_lock_##name(l, __func__, 0) +-#define mm_lock_recursive(name, l) mm_lock_##name(l, __func__, 1) +-#define mm_write_lock(name, l) mm_write_lock_##name(l, __func__) +-#define mm_read_lock(name, l) mm_read_lock_##name(l) ++#define mm_lock(name, d, l) mm_lock_##name(d, l, __func__, 0) ++#define mm_lock_recursive(name, d, l) mm_lock_##name(d, l, __func__, 1) ++#define mm_write_lock(name, d, l) mm_write_lock_##name(d, l, __func__) ++#define mm_read_lock(name, d, l) mm_read_lock_##name(d, l) + + /* This wrapper is intended for "external" locks which do not use + * the mm_lock_t types. Such locks inside the mm code are also subject + * to ordering constraints. 
*/ +-#define declare_mm_order_constraint(name) \ +- static inline void mm_enforce_order_lock_pre_##name(void) \ +- { _mm_enforce_order_lock_pre(MM_LOCK_ORDER_##name); } \ +- static inline void mm_enforce_order_lock_post_##name( \ +- int *unlock_level, unsigned short *recurse_count) \ +- { _mm_enforce_order_lock_post(MM_LOCK_ORDER_##name, unlock_level, recurse_count); } \ ++#define declare_mm_order_constraint(name) \ ++ static inline void mm_enforce_order_lock_pre_##name(const struct domain *d) \ ++ { _mm_enforce_order_lock_pre(d, MM_LOCK_ORDER_##name); } \ ++ static inline void mm_enforce_order_lock_post_##name(const struct domain *d,\ ++ int *unlock_level, unsigned short *recurse_count) \ ++ { _mm_enforce_order_lock_post(d, MM_LOCK_ORDER_##name, unlock_level, \ ++ recurse_count); } + + static inline void mm_unlock(mm_lock_t *l) + { +@@ -221,7 +249,7 @@ static inline void mm_enforce_order_unlock(int unlock_level, + + #define MM_LOCK_ORDER_nestedp2m 8 + declare_mm_lock(nestedp2m) +-#define nestedp2m_lock(d) mm_lock(nestedp2m, &(d)->arch.nested_p2m_lock) ++#define nestedp2m_lock(d) mm_lock(nestedp2m, d, &(d)->arch.nested_p2m_lock) + #define nestedp2m_unlock(d) mm_unlock(&(d)->arch.nested_p2m_lock) + + /* P2M lock (per-non-alt-p2m-table) +@@ -260,9 +288,10 @@ declare_mm_rwlock(p2m); + + #define MM_LOCK_ORDER_per_page_sharing 24 + declare_mm_order_constraint(per_page_sharing) +-#define page_sharing_mm_pre_lock() mm_enforce_order_lock_pre_per_page_sharing() ++#define page_sharing_mm_pre_lock() \ ++ mm_enforce_order_lock_pre_per_page_sharing(NULL) + #define page_sharing_mm_post_lock(l, r) \ +- mm_enforce_order_lock_post_per_page_sharing((l), (r)) ++ mm_enforce_order_lock_post_per_page_sharing(NULL, (l), (r)) + #define page_sharing_mm_unlock(l, r) mm_enforce_order_unlock((l), (r)) + + /* Alternate P2M list lock (per-domain) +@@ -275,7 +304,8 @@ declare_mm_order_constraint(per_page_sharing) + + #define MM_LOCK_ORDER_altp2mlist 32 + declare_mm_lock(altp2mlist) +-#define altp2m_list_lock(d) mm_lock(altp2mlist, &(d)->arch.altp2m_list_lock) ++#define altp2m_list_lock(d) mm_lock(altp2mlist, d, \ ++ &(d)->arch.altp2m_list_lock) + #define altp2m_list_unlock(d) mm_unlock(&(d)->arch.altp2m_list_lock) + + /* P2M lock (per-altp2m-table) +@@ -294,9 +324,9 @@ declare_mm_rwlock(altp2m); + static inline void p2m_lock(struct p2m_domain *p) + { + if ( p2m_is_altp2m(p) ) +- mm_write_lock(altp2m, &p->lock); ++ mm_write_lock(altp2m, p->domain, &p->lock); + else +- mm_write_lock(p2m, &p->lock); ++ mm_write_lock(p2m, p->domain, &p->lock); + p->defer_flush++; + } + +@@ -310,7 +340,7 @@ static inline void p2m_unlock(struct p2m_domain *p) + + #define gfn_lock(p,g,o) p2m_lock(p) + #define gfn_unlock(p,g,o) p2m_unlock(p) +-#define p2m_read_lock(p) mm_read_lock(p2m, &(p)->lock) ++#define p2m_read_lock(p) mm_read_lock(p2m, (p)->domain, &(p)->lock) + #define p2m_read_unlock(p) mm_read_unlock(&(p)->lock) + #define p2m_locked_by_me(p) mm_write_locked_by_me(&(p)->lock) + #define gfn_locked_by_me(p,g) p2m_locked_by_me(p) +@@ -322,7 +352,7 @@ static inline void p2m_unlock(struct p2m_domain *p) + + #define MM_LOCK_ORDER_pod 48 + declare_mm_lock(pod) +-#define pod_lock(p) mm_lock(pod, &(p)->pod.lock) ++#define pod_lock(p) mm_lock(pod, (p)->domain, &(p)->pod.lock) + #define pod_unlock(p) mm_unlock(&(p)->pod.lock) + #define pod_locked_by_me(p) mm_locked_by_me(&(p)->pod.lock) + +@@ -335,8 +365,9 @@ declare_mm_lock(pod) + + #define MM_LOCK_ORDER_page_alloc 56 + declare_mm_order_constraint(page_alloc) +-#define page_alloc_mm_pre_lock() 
mm_enforce_order_lock_pre_page_alloc() +-#define page_alloc_mm_post_lock(l) mm_enforce_order_lock_post_page_alloc(&(l), NULL) ++#define page_alloc_mm_pre_lock(d) mm_enforce_order_lock_pre_page_alloc(d) ++#define page_alloc_mm_post_lock(d, l) \ ++ mm_enforce_order_lock_post_page_alloc(d, &(l), NULL) + #define page_alloc_mm_unlock(l) mm_enforce_order_unlock((l), NULL) + + /* Paging lock (per-domain) +@@ -356,9 +387,9 @@ declare_mm_order_constraint(page_alloc) + + #define MM_LOCK_ORDER_paging 64 + declare_mm_lock(paging) +-#define paging_lock(d) mm_lock(paging, &(d)->arch.paging.lock) ++#define paging_lock(d) mm_lock(paging, d, &(d)->arch.paging.lock) + #define paging_lock_recursive(d) \ +- mm_lock_recursive(paging, &(d)->arch.paging.lock) ++ mm_lock_recursive(paging, d, &(d)->arch.paging.lock) + #define paging_unlock(d) mm_unlock(&(d)->arch.paging.lock) + #define paging_locked_by_me(d) mm_locked_by_me(&(d)->arch.paging.lock) + +diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c +index 631e9aec33..725a2921d9 100644 +--- a/xen/arch/x86/mm/p2m-pod.c ++++ b/xen/arch/x86/mm/p2m-pod.c +@@ -34,9 +34,10 @@ + /* Enforce lock ordering when grabbing the "external" page_alloc lock */ + static inline void lock_page_alloc(struct p2m_domain *p2m) + { +- page_alloc_mm_pre_lock(); ++ page_alloc_mm_pre_lock(p2m->domain); + spin_lock(&(p2m->domain->page_alloc_lock)); +- page_alloc_mm_post_lock(p2m->domain->arch.page_alloc_unlock_level); ++ page_alloc_mm_post_lock(p2m->domain, ++ p2m->domain->arch.page_alloc_unlock_level); + } + + static inline void unlock_page_alloc(struct p2m_domain *p2m) +-- +2.17.2 (Apple Git-113) + Index: head/emulators/xen-kernel/files/xsa284.patch =================================================================== --- head/emulators/xen-kernel/files/xsa284.patch +++ head/emulators/xen-kernel/files/xsa284.patch @@ -0,0 +1,31 @@ +From: Jan Beulich +Subject: gnttab: set page refcount for copy-on-grant-transfer + +Commit 5cc77f9098 ("32-on-64: Fix domain address-size clamping, +implement"), which introduced this functionality, took care of clearing +the old page's PGC_allocated, but failed to set the bit (and install the +associated reference) on the newly allocated one. Furthermore the "mfn" +local variable was never updated, and hence the wrong MFN was passed to +guest_physmap_add_page() (and back to the destination domain) in this +case, leading to an IOMMU mapping into an unowned page. + +Ideally the code would use assign_pages(), but the call to +gnttab_prepare_for_transfer() sits in the middle of the actions +mirroring that function. + +This is XSA-284. 
+ +Signed-off-by: Jan Beulich +Acked-by: George Dunlap + +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -2183,6 +2183,8 @@ gnttab_transfer( + page->count_info &= ~(PGC_count_mask|PGC_allocated); + free_domheap_page(page); + page = new_page; ++ page->count_info = PGC_allocated | 1; ++ mfn = page_to_mfn(page); + } + + spin_lock(&e->page_alloc_lock); Index: head/emulators/xen-kernel/files/xsa287-4.11.patch =================================================================== --- head/emulators/xen-kernel/files/xsa287-4.11.patch +++ head/emulators/xen-kernel/files/xsa287-4.11.patch @@ -0,0 +1,328 @@ +From 67620c1ccb13f7b58645f48248ba1f408b021fdc Mon Sep 17 00:00:00 2001 +From: George Dunlap +Date: Fri, 18 Jan 2019 15:00:34 +0000 +Subject: [PATCH] steal_page: Get rid of bogus struct page states + +The original rules for `struct page` required the following invariants +at all times: + +- refcount > 0 implies owner != NULL +- PGC_allocated implies refcount > 0 + +steal_page, in a misguided attempt to protect against unknown races, +violates both of these rules, thus introducing other races: + +- Temporarily, the count_info has the refcount go to 0 while + PGC_allocated is set + +- It explicitly returns the page PGC_allocated set, but owner == NULL + and page not on the page_list. + +The second one meant that page_get_owner_and_reference() could return +NULL even after having successfully grabbed a reference on the page, +leading the caller to leak the reference (since "couldn't get ref" and +"got ref but no owner" look the same). + +Furthermore, rather than grabbing a page reference to ensure that the +owner doesn't change under its feet, it appears to rely on holding +d->page_alloc lock to prevent this. + +Unfortunately, this is ineffective: page->owner remains non-NULL for +some time after the count has been set to 0; meaning that it would be +entirely possible for the page to be freed and re-allocated to a +different domain between the page_get_owner() check and the count_info +check. + +Modify steal_page to instead follow the appropriate access discipline, +taking the page through series of states similar to being freed and +then re-allocated with MEMF_no_owner: + +- Grab an extra reference to make sure we don't race with anyone else + freeing the page + +- Drop both references and PGC_allocated atomically, so that (if +successful), anyone else trying to grab a reference will fail + +- Attempt to reset Xen's mappings + +- Reset the rest of the state. + +Then, modify the two callers appropriately: + +- Leave count_info alone (it's already been cleared) +- Call free_domheap_page() directly if appropriate +- Call assign_pages() rather than open-coding a partial assign + +With all callers to assign_pages() now passing in pages with the +type_info field clear, tighten the respective assertion there. + +This is XSA-287. + +Signed-off-by: George Dunlap +Signed-off-by: Jan Beulich +--- + xen/arch/x86/mm.c | 84 ++++++++++++++++++++++++++++------------ + xen/common/grant_table.c | 20 +++++----- + xen/common/memory.c | 19 +++++---- + xen/common/page_alloc.c | 2 +- + 4 files changed, 83 insertions(+), 42 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 6509035a5c..d8ff58c901 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -3966,70 +3966,106 @@ int donate_page( + return -EINVAL; + } + ++/* ++ * Steal page will attempt to remove `page` from domain `d`. 
Upon ++ * return, `page` will be in a state similar to the state of a page ++ * returned from alloc_domheap_page() with MEMF_no_owner set: ++ * - refcount 0 ++ * - type count cleared ++ * - owner NULL ++ * - page caching attributes cleaned up ++ * - removed from the domain's page_list ++ * ++ * If MEMF_no_refcount is not set, the domain's tot_pages will be ++ * adjusted. If this results in the page count falling to 0, ++ * put_domain() will be called. ++ * ++ * The caller should either call free_domheap_page() to free the ++ * page, or assign_pages() to put it back on some domain's page list. ++ */ + int steal_page( + struct domain *d, struct page_info *page, unsigned int memflags) + { + unsigned long x, y; + bool drop_dom_ref = false; +- const struct domain *owner = dom_xen; ++ const struct domain *owner; ++ int rc; + + if ( paging_mode_external(d) ) + return -EOPNOTSUPP; + +- spin_lock(&d->page_alloc_lock); +- +- if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != d) ) ++ /* Grab a reference to make sure the page doesn't change under our feet */ ++ rc = -EINVAL; ++ if ( !(owner = page_get_owner_and_reference(page)) ) + goto fail; + ++ if ( owner != d || is_xen_heap_page(page) ) ++ goto fail_put; ++ + /* +- * We require there is just one reference (PGC_allocated). We temporarily +- * drop this reference now so that we can safely swizzle the owner. ++ * We require there are exactly two references -- the one we just ++ * took, and PGC_allocated. We temporarily drop both these ++ * references so that the page becomes effectively non-"live" for ++ * the domain. + */ + y = page->count_info; + do { + x = y; +- if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) ) +- goto fail; +- y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask); ++ if ( (x & (PGC_count_mask|PGC_allocated)) != (2 | PGC_allocated) ) ++ goto fail_put; ++ y = cmpxchg(&page->count_info, x, x & ~(PGC_count_mask|PGC_allocated)); + } while ( y != x ); + + /* +- * With the sole reference dropped temporarily, no-one can update type +- * information. Type count also needs to be zero in this case, but e.g. +- * PGT_seg_desc_page may still have PGT_validated set, which we need to +- * clear before transferring ownership (as validation criteria vary +- * depending on domain type). ++ * NB this is safe even if the page ends up being given back to ++ * the domain, because the count is zero: subsequent mappings will ++ * cause the cache attributes to be re-instated inside ++ * get_page_from_l1e(). ++ */ ++ if ( (rc = cleanup_page_cacheattr(page)) ) ++ { ++ /* ++ * Couldn't fixup Xen's mappings; put things the way we found ++ * it and return an error ++ */ ++ page->count_info |= PGC_allocated | 1; ++ goto fail; ++ } ++ ++ /* ++ * With the reference count now zero, nobody can grab references ++ * to do anything else with the page. Return the page to a state ++ * that it might be upon return from alloc_domheap_pages with ++ * MEMF_no_owner set. + */ ++ spin_lock(&d->page_alloc_lock); ++ + BUG_ON(page->u.inuse.type_info & (PGT_count_mask | PGT_locked | + PGT_pinned)); + page->u.inuse.type_info = 0; +- +- /* Swizzle the owner then reinstate the PGC_allocated reference. */ + page_set_owner(page, NULL); +- y = page->count_info; +- do { +- x = y; +- BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated); +- } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x ); ++ page_list_del(page, &d->page_list); + + /* Unlink from original owner. 
*/ + if ( !(memflags & MEMF_no_refcount) && !domain_adjust_tot_pages(d, -1) ) + drop_dom_ref = true; +- page_list_del(page, &d->page_list); + + spin_unlock(&d->page_alloc_lock); ++ + if ( unlikely(drop_dom_ref) ) + put_domain(d); ++ + return 0; + ++ fail_put: ++ put_page(page); + fail: +- spin_unlock(&d->page_alloc_lock); + gdprintk(XENLOG_WARNING, "Bad steal mfn %" PRI_mfn + " from d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n", + mfn_x(page_to_mfn(page)), d->domain_id, + owner ? owner->domain_id : DOMID_INVALID, + page->count_info, page->u.inuse.type_info); +- return -EINVAL; ++ return rc; + } + + static int __do_update_va_mapping( +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index c0585d33f4..656fad1b42 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -2179,7 +2179,7 @@ gnttab_transfer( + rcu_unlock_domain(e); + put_gfn_and_copyback: + put_gfn(d, gop.mfn); +- page->count_info &= ~(PGC_count_mask|PGC_allocated); ++ /* The count_info has already been cleaned */ + free_domheap_page(page); + goto copyback; + } +@@ -2202,10 +2202,9 @@ gnttab_transfer( + + copy_domain_page(page_to_mfn(new_page), mfn); + +- page->count_info &= ~(PGC_count_mask|PGC_allocated); ++ /* The count_info has already been cleared */ + free_domheap_page(page); + page = new_page; +- page->count_info = PGC_allocated | 1; + mfn = page_to_mfn(page); + } + +@@ -2245,12 +2244,17 @@ gnttab_transfer( + */ + spin_unlock(&e->page_alloc_lock); + okay = gnttab_prepare_for_transfer(e, d, gop.ref); +- spin_lock(&e->page_alloc_lock); + +- if ( unlikely(!okay) || unlikely(e->is_dying) ) ++ if ( unlikely(!okay || assign_pages(e, page, 0, MEMF_no_refcount)) ) + { +- bool_t drop_dom_ref = !domain_adjust_tot_pages(e, -1); ++ bool drop_dom_ref; + ++ /* ++ * Need to grab this again to safely free our "reserved" ++ * page in the page total ++ */ ++ spin_lock(&e->page_alloc_lock); ++ drop_dom_ref = !domain_adjust_tot_pages(e, -1); + spin_unlock(&e->page_alloc_lock); + + if ( okay /* i.e. e->is_dying due to the surrounding if() */ ) +@@ -2263,10 +2267,6 @@ gnttab_transfer( + goto unlock_and_copyback; + } + +- page_list_add_tail(page, &e->page_list); +- page_set_owner(page, e); +- +- spin_unlock(&e->page_alloc_lock); + put_gfn(d, gop.mfn); + + TRACE_1D(TRC_MEM_PAGE_GRANT_TRANSFER, e->domain_id); +diff --git a/xen/common/memory.c b/xen/common/memory.c +index 4fb7962c79..f71163221f 100644 +--- a/xen/common/memory.c ++++ b/xen/common/memory.c +@@ -675,20 +675,22 @@ static long memory_exchange(XEN_GUEST_HANDLE_PARAM(xen_memory_exchange_t) arg) + * Success! Beyond this point we cannot fail for this chunk. + */ + +- /* Destroy final reference to each input page. */ ++ /* ++ * These pages have already had owner and reference cleared. ++ * Do the final two steps: Remove from the physmap, and free ++ * them. ++ */ + while ( (page = page_list_remove_head(&in_chunk_list)) ) + { + unsigned long gfn; + +- if ( !test_and_clear_bit(_PGC_allocated, &page->count_info) ) +- BUG(); + mfn = page_to_mfn(page); + gfn = mfn_to_gmfn(d, mfn_x(mfn)); + /* Pages were unshared above */ + BUG_ON(SHARED_M2P(gfn)); + if ( guest_physmap_remove_page(d, _gfn(gfn), mfn, 0) ) + domain_crash(d); +- put_page(page); ++ free_domheap_page(page); + } + + /* Assign each output page to the domain. */ +@@ -761,13 +763,16 @@ static long memory_exchange(XEN_GUEST_HANDLE_PARAM(xen_memory_exchange_t) arg) + * chunks succeeded. + */ + fail: +- /* Reassign any input pages we managed to steal. 
*/ ++ /* ++ * Reassign any input pages we managed to steal. NB that if the assign ++ * fails again, we're on the hook for freeing the page, since we've already ++ * cleared PGC_allocated. ++ */ + while ( (page = page_list_remove_head(&in_chunk_list)) ) + if ( assign_pages(d, page, 0, MEMF_no_refcount) ) + { + BUG_ON(!d->is_dying); +- if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) +- put_page(page); ++ free_domheap_page(page); + } + + dying: +diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c +index 482f0988f7..52da7762e3 100644 +--- a/xen/common/page_alloc.c ++++ b/xen/common/page_alloc.c +@@ -2221,7 +2221,7 @@ int assign_pages( + for ( i = 0; i < (1 << order); i++ ) + { + ASSERT(page_get_owner(&pg[i]) == NULL); +- ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0); ++ ASSERT(!pg[i].count_info); + page_set_owner(&pg[i], d); + smp_wmb(); /* Domain pointer must be visible before updating refcnt. */ + pg[i].count_info = PGC_allocated | 1; +-- +2.20.1 + Index: head/emulators/xen-kernel/files/xsa290-4.11-1.patch =================================================================== --- head/emulators/xen-kernel/files/xsa290-4.11-1.patch +++ head/emulators/xen-kernel/files/xsa290-4.11-1.patch @@ -0,0 +1,237 @@ +From: Jan Beulich +Subject: x86/mm: also allow L2 (un)validation to be preemptible + +Commit c612481d1c ("x86/mm: Plumbing to allow any PTE update to fail +with -ERESTART") added assertions next to the {alloc,free}_l2_table() +invocations to document (and validate in debug builds) that L2 +(un)validations are always preemptible. + +The assertion in free_page_type() was now observed to trigger when +recursive L2 page tables get cleaned up. + +In particular put_page_from_l2e()'s assumption that _put_page_type() +would always succeed is now wrong, resulting in a partially un-validated +page left in a domain, which has no other means of getting cleaned up +later on. If not causing any problems earlier, this would ultimately +trigger the check for ->u.inuse.type_info having a zero count when +freeing the page during cleanup after the domain has died. + +As a result it should be considered a mistake to not have extended +preemption fully to L2 when it was added to L3/L4 table handling, which +this change aims to correct. + +The validation side additions are done just for symmetry. + +This is part of XSA-290. + +Reported-by: Manuel Bouyer +Tested-by: Manuel Bouyer +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1126,7 +1126,7 @@ get_page_from_l1e( + define_get_linear_pagetable(l2); + static int + get_page_from_l2e( +- l2_pgentry_t l2e, unsigned long pfn, struct domain *d) ++ l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial) + { + unsigned long mfn = l2e_get_pfn(l2e); + int rc; +@@ -1141,7 +1141,8 @@ get_page_from_l2e( + return -EINVAL; + } + +- rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, 0, 0); ++ rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, ++ partial, false); + if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) + rc = 0; + +@@ -1295,8 +1296,11 @@ void put_page_from_l1e(l1_pgentry_t l1e, + * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. + * Note also that this automatically deals correctly with linear p.t.'s. 
+ */ +-static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) ++static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, ++ int partial, bool defer) + { ++ int rc = 0; ++ + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) ) + return 1; + +@@ -1311,13 +1315,27 @@ static int put_page_from_l2e(l2_pgentry_ + else + { + struct page_info *pg = l2e_get_page(l2e); +- int rc = _put_page_type(pg, false, mfn_to_page(_mfn(pfn))); ++ struct page_info *ptpg = mfn_to_page(_mfn(pfn)); + +- ASSERT(!rc); +- put_page(pg); ++ if ( unlikely(partial > 0) ) ++ { ++ ASSERT(!defer); ++ rc = _put_page_type(pg, true, ptpg); ++ } ++ else if ( defer ) ++ { ++ current->arch.old_guest_ptpg = ptpg; ++ current->arch.old_guest_table = pg; ++ } ++ else ++ { ++ rc = _put_page_type(pg, true, ptpg); ++ if ( likely(!rc) ) ++ put_page(pg); ++ } + } + +- return 0; ++ return rc; + } + + static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, +@@ -1487,11 +1505,12 @@ static int alloc_l2_table(struct page_in + unsigned long pfn = mfn_x(page_to_mfn(page)); + l2_pgentry_t *pl2e; + unsigned int i; +- int rc = 0; ++ int rc = 0, partial = page->partial_pte; + + pl2e = map_domain_page(_mfn(pfn)); + +- for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ ) ++ for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; ++ i++, partial = 0 ) + { + if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) + { +@@ -1501,23 +1520,33 @@ static int alloc_l2_table(struct page_in + } + + if ( !is_guest_l2_slot(d, type, i) || +- (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 ) ++ (rc = get_page_from_l2e(pl2e[i], pfn, d, partial)) > 0 ) + continue; + +- if ( unlikely(rc == -ERESTART) ) ++ if ( rc == -ERESTART ) + { + page->nr_validated_ptes = i; +- break; ++ page->partial_pte = partial ?: 1; + } +- +- if ( rc < 0 ) ++ else if ( rc == -EINTR && i ) ++ { ++ page->nr_validated_ptes = i; ++ page->partial_pte = 0; ++ rc = -ERESTART; ++ } ++ else if ( rc < 0 && rc != -EINTR ) + { + gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i); +- while ( i-- > 0 ) +- if ( is_guest_l2_slot(d, type, i) ) +- put_page_from_l2e(pl2e[i], pfn); +- break; ++ if ( i ) ++ { ++ page->nr_validated_ptes = i; ++ page->partial_pte = 0; ++ current->arch.old_guest_ptpg = NULL; ++ current->arch.old_guest_table = page; ++ } + } ++ if ( rc < 0 ) ++ break; + + pl2e[i] = adjust_guest_l2e(pl2e[i], d); + } +@@ -1797,28 +1826,50 @@ static int free_l2_table(struct page_inf + struct domain *d = page_get_owner(page); + unsigned long pfn = mfn_x(page_to_mfn(page)); + l2_pgentry_t *pl2e; +- unsigned int i = page->nr_validated_ptes - 1; +- int err = 0; ++ int rc = 0, partial = page->partial_pte; ++ unsigned int i = page->nr_validated_ptes - !partial; + + pl2e = map_domain_page(_mfn(pfn)); + +- ASSERT(page->nr_validated_ptes); +- do { +- if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) && +- put_page_from_l2e(pl2e[i], pfn) == 0 && +- i && hypercall_preempt_check() ) ++ for ( ; ; ) ++ { ++ if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) ) ++ rc = put_page_from_l2e(pl2e[i], pfn, partial, false); ++ if ( rc < 0 ) ++ break; ++ ++ partial = 0; ++ ++ if ( !i-- ) ++ break; ++ ++ if ( hypercall_preempt_check() ) + { +- page->nr_validated_ptes = i; +- err = -ERESTART; ++ rc = -EINTR; ++ break; + } +- } while ( !err && i-- ); ++ } + + unmap_domain_page(pl2e); + +- if ( !err ) ++ if ( rc >= 0 ) ++ { + page->u.inuse.type_info &= ~PGT_pae_xen_l2; ++ rc = 0; ++ } ++ else if ( rc == -ERESTART ) ++ { ++ 
page->nr_validated_ptes = i; ++ page->partial_pte = partial ?: -1; ++ } ++ else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 ) ++ { ++ page->nr_validated_ptes = i + 1; ++ page->partial_pte = 0; ++ rc = -ERESTART; ++ } + +- return err; ++ return rc; + } + + static int free_l3_table(struct page_info *page) +@@ -2138,7 +2189,7 @@ static int mod_l2_entry(l2_pgentry_t *pl + return -EBUSY; + } + +- if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d)) < 0) ) ++ if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d, 0)) < 0) ) + return rc; + + nl2e = adjust_guest_l2e(nl2e, d); +@@ -2157,7 +2208,8 @@ static int mod_l2_entry(l2_pgentry_t *pl + return -EBUSY; + } + +- put_page_from_l2e(ol2e, pfn); ++ put_page_from_l2e(ol2e, pfn, 0, true); ++ + return rc; + } + Index: head/emulators/xen-kernel/files/xsa290-4.11-2.patch =================================================================== --- head/emulators/xen-kernel/files/xsa290-4.11-2.patch +++ head/emulators/xen-kernel/files/xsa290-4.11-2.patch @@ -0,0 +1,71 @@ +From: Jan Beulich +Subject: x86/mm: add explicit preemption checks to L3 (un)validation + +When recursive page tables are used at the L3 level, unvalidation of a +single L4 table may incur unvalidation of two levels of L3 tables, i.e. +a maximum iteration count of 512^3 for unvalidating an L4 table. The +preemption check in free_l2_table() as well as the one in +_put_page_type() may never be reached, so explicit checking is needed in +free_l3_table(). + +When recursive page tables are used at the L4 level, the iteration count +at L4 alone is capped at 512^2. As soon as a present L3 entry is hit +which itself needs unvalidation (and hence requiring another nested loop +with 512 iterations), the preemption checks added here kick in, so no +further preemption checking is needed at L4 (until we decide to permit +5-level paging for PV guests). + +The validation side additions are done just for symmetry. + +This is part of XSA-290. + +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper + +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -1581,6 +1581,13 @@ static int alloc_l3_table(struct page_in + for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; + i++, partial = 0 ) + { ++ if ( i > page->nr_validated_ptes && hypercall_preempt_check() ) ++ { ++ page->nr_validated_ptes = i; ++ rc = -ERESTART; ++ break; ++ } ++ + if ( is_pv_32bit_domain(d) && (i == 3) ) + { + if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) || +@@ -1882,15 +1889,25 @@ static int free_l3_table(struct page_inf + + pl3e = map_domain_page(_mfn(pfn)); + +- do { ++ for ( ; ; ) ++ { + rc = put_page_from_l3e(pl3e[i], pfn, partial, 0); + if ( rc < 0 ) + break; ++ + partial = 0; +- if ( rc > 0 ) +- continue; +- pl3e[i] = unadjust_guest_l3e(pl3e[i], d); +- } while ( i-- ); ++ if ( rc == 0 ) ++ pl3e[i] = unadjust_guest_l3e(pl3e[i], d); ++ ++ if ( !i-- ) ++ break; ++ ++ if ( hypercall_preempt_check() ) ++ { ++ rc = -EINTR; ++ break; ++ } ++ } + + unmap_domain_page(pl3e); + Index: head/emulators/xen-kernel/files/xsa292.patch =================================================================== --- head/emulators/xen-kernel/files/xsa292.patch +++ head/emulators/xen-kernel/files/xsa292.patch @@ -0,0 +1,95 @@ +From: Jan Beulich +Subject: x86/mm: properly flush TLB in switch_cr3_cr4() + +The CR3 values used for contexts run with PCID enabled uniformly have +CR3.NOFLUSH set, resulting in the CR3 write itself to not cause any +flushing at all. 
When the second CR4 write is skipped or doesn't do any +flushing, there's nothing so far which would purge TLB entries which may +have accumulated again if the PCID doesn't change; the "just in case" +flush only affects the case where the PCID actually changes. (There may +be particularly many TLB entries re-accumulated in case of a watchdog +NMI kicking in during the critical time window.) + +Suppress the no-flush behavior of the CR3 write in this particular case. + +Similarly the second CR4 write may not cause any flushing of TLB entries +established again while the original PCID was still in use - it may get +performed because of unrelated bits changing. The flush of the old PCID +needs to happen nevertheless. + +At the same time also eliminate a possible race with lazy context +switch: Just like for CR4, CR3 may change at any time while interrupts +are enabled, due to the __sync_local_execstate() invocation from the +flush IPI handler. It is for that reason that the CR3 read, just like +the CR4 one, must happen only after interrupts have been turned off. + +This is XSA-292. + +Reported-by: Sergey Dyasli +Reported-by: Andrew Cooper +Tested-by: Sergey Dyasli +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +--- +v3: Adjust comments. Drop old_cr4 from the PGE check in the expression + controlling the invocation of invpcid_flush_single_context(), as PGE + is always clear there. +v2: Decouple invpcid_flush_single_context() from 2nd CR4 write. + +--- a/xen/arch/x86/flushtlb.c ++++ b/xen/arch/x86/flushtlb.c +@@ -103,9 +103,8 @@ static void do_tlb_flush(void) + + void switch_cr3_cr4(unsigned long cr3, unsigned long cr4) + { +- unsigned long flags, old_cr4; ++ unsigned long flags, old_cr4, old_pcid; + u32 t; +- unsigned long old_pcid = cr3_pcid(read_cr3()); + + /* This non-reentrant function is sometimes called in interrupt context. */ + local_irq_save(flags); +@@ -133,15 +132,38 @@ void switch_cr3_cr4(unsigned long cr3, u + */ + invpcid_flush_all_nonglobals(); + ++ /* ++ * If we don't change PCIDs, the CR3 write below needs to flush this very ++ * PCID, even when a full flush was performed above, as we are currently ++ * accumulating TLB entries again from the old address space. ++ * NB: Clearing the bit when we don't use PCID is benign (as it is clear ++ * already in that case), but allows the if() to be more simple. ++ */ ++ old_pcid = cr3_pcid(read_cr3()); ++ if ( old_pcid == cr3_pcid(cr3) ) ++ cr3 &= ~X86_CR3_NOFLUSH; ++ + write_cr3(cr3); + + if ( old_cr4 != cr4 ) + write_cr4(cr4); +- else if ( old_pcid != cr3_pcid(cr3) ) +- /* +- * Make sure no TLB entries related to the old PCID created between +- * flushing the TLB and writing the new %cr3 value remain in the TLB. +- */ ++ ++ /* ++ * Make sure no TLB entries related to the old PCID created between ++ * flushing the TLB and writing the new %cr3 value remain in the TLB. ++ * ++ * The write to CR4 just above has performed a wider flush in certain ++ * cases, which therefore get excluded here. Since that write is ++ * conditional, note in particular that it won't be skipped if PCIDE ++ * transitions from 1 to 0. This is because the CR4 write further up will ++ * have been skipped in this case, as PCIDE and PGE won't both be set at ++ * the same time. ++ * ++ * Note also that PGE is always clear in old_cr4. 
++ */ ++ if ( old_pcid != cr3_pcid(cr3) && ++ !(cr4 & X86_CR4_PGE) && ++ (old_cr4 & X86_CR4_PCIDE) <= (cr4 & X86_CR4_PCIDE) ) + invpcid_flush_single_context(old_pcid); + + post_flush(t); Index: head/emulators/xen-kernel/files/xsa293-4.11-1.patch =================================================================== --- head/emulators/xen-kernel/files/xsa293-4.11-1.patch +++ head/emulators/xen-kernel/files/xsa293-4.11-1.patch @@ -0,0 +1,317 @@ +From: Andrew Cooper +Subject: x86/pv: Rewrite guest %cr4 handling from scratch + +The PV cr4 logic is almost impossible to follow, and leaks bits into guest +context which definitely shouldn't be visible (in particular, VMXE). + +The biggest problem however, and source of the complexity, is that it derives +new real and guest cr4 values from the current value in hardware - this is +context dependent and an inappropriate source of information. + +Rewrite the cr4 logic to be invariant of the current value in hardware. + +First of all, modify write_ptbase() to always use mmu_cr4_features for IDLE +and HVM contexts. mmu_cr4_features *is* the correct value to use, and makes +the ASSERT() obviously redundant. + +For PV guests, curr->arch.pv.ctrlreg[4] remains the guests view of cr4, but +all logic gets reworked in terms of this and mmu_cr4_features only. + +Two masks are introduced; bits which the guest has control over, and bits +which are forwarded from Xen's settings. One guest-visible change here is +that Xen's VMXE setting is no longer visible at all. + +pv_make_cr4() follows fairly closely from pv_guest_cr4_to_real_cr4(), but +deliberately starts with mmu_cr4_features, and only alters the minimal subset +of bits. + +The boot-time {compat_,}pv_cr4_mask variables are removed, as they are a +remnant of the pre-CPUID policy days. pv_fixup_guest_cr4() gains a related +derivation from the policy. + +Another guest visible change here is that a 32bit PV guest can now flip +FSGSBASE in its view of CR4. While the {RD,WR}{FS,GS}BASE instructions are +unusable outside of a 64bit code segment, the ability to modify FSGSBASE +matches real hardware behaviour, and avoids the need for any 32bit/64bit +differences in the logic. + +Overall, this patch shouldn't have a practical change in guest behaviour. +VMXE will disappear from view, and an inquisitive 32bit kernel can now see +FSGSBASE changing, but this new logic is otherwise bug-compatible with before. + +This is part of XSA-293 + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index b1e50d1..675152a 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -733,49 +733,6 @@ int arch_domain_soft_reset(struct domain *d) + return ret; + } + +-/* +- * These are the masks of CR4 bits (subject to hardware availability) which a +- * PV guest may not legitimiately attempt to modify. +- */ +-static unsigned long __read_mostly pv_cr4_mask, compat_pv_cr4_mask; +- +-static int __init init_pv_cr4_masks(void) +-{ +- unsigned long common_mask = ~X86_CR4_TSD; +- +- /* +- * All PV guests may attempt to modify TSD, DE and OSXSAVE. +- */ +- if ( cpu_has_de ) +- common_mask &= ~X86_CR4_DE; +- if ( cpu_has_xsave ) +- common_mask &= ~X86_CR4_OSXSAVE; +- +- pv_cr4_mask = compat_pv_cr4_mask = common_mask; +- +- /* +- * 64bit PV guests may attempt to modify FSGSBASE. 
+- */ +- if ( cpu_has_fsgsbase ) +- pv_cr4_mask &= ~X86_CR4_FSGSBASE; +- +- return 0; +-} +-__initcall(init_pv_cr4_masks); +- +-unsigned long pv_guest_cr4_fixup(const struct vcpu *v, unsigned long guest_cr4) +-{ +- unsigned long hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4()); +- unsigned long mask = is_pv_32bit_vcpu(v) ? compat_pv_cr4_mask : pv_cr4_mask; +- +- if ( (guest_cr4 & mask) != (hv_cr4 & mask) ) +- printk(XENLOG_G_WARNING +- "d%d attempted to change %pv's CR4 flags %08lx -> %08lx\n", +- current->domain->domain_id, v, hv_cr4, guest_cr4); +- +- return (hv_cr4 & mask) | (guest_cr4 & ~mask); +-} +- + #define xen_vcpu_guest_context vcpu_guest_context + #define fpu_ctxt fpu_ctxt.x + CHECK_FIELD_(struct, vcpu_guest_context, fpu_ctxt); +@@ -789,7 +746,7 @@ int arch_set_info_guest( + struct domain *d = v->domain; + unsigned long cr3_gfn; + struct page_info *cr3_page; +- unsigned long flags, cr4; ++ unsigned long flags; + unsigned int i; + int rc = 0, compat; + +@@ -978,9 +935,8 @@ int arch_set_info_guest( + v->arch.pv_vcpu.ctrlreg[0] &= X86_CR0_TS; + v->arch.pv_vcpu.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS; + +- cr4 = v->arch.pv_vcpu.ctrlreg[4]; +- v->arch.pv_vcpu.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(v, cr4) : +- real_cr4_to_pv_guest_cr4(mmu_cr4_features); ++ v->arch.pv_vcpu.ctrlreg[4] = ++ pv_fixup_guest_cr4(v, v->arch.pv_vcpu.ctrlreg[4]); + + memset(v->arch.debugreg, 0, sizeof(v->arch.debugreg)); + for ( i = 0; i < 8; i++ ) +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 6509035..08634b7 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -505,33 +505,13 @@ void make_cr3(struct vcpu *v, mfn_t mfn) + v->arch.cr3 |= get_pcid_bits(v, false); + } + +-unsigned long pv_guest_cr4_to_real_cr4(const struct vcpu *v) +-{ +- const struct domain *d = v->domain; +- unsigned long cr4; +- +- cr4 = v->arch.pv_vcpu.ctrlreg[4] & ~X86_CR4_DE; +- cr4 |= mmu_cr4_features & (X86_CR4_PSE | X86_CR4_SMEP | X86_CR4_SMAP | +- X86_CR4_OSXSAVE | X86_CR4_FSGSBASE); +- +- if ( d->arch.pv_domain.pcid ) +- cr4 |= X86_CR4_PCIDE; +- else if ( !d->arch.pv_domain.xpti ) +- cr4 |= X86_CR4_PGE; +- +- cr4 |= d->arch.vtsc ? X86_CR4_TSD : 0; +- +- return cr4; +-} +- + void write_ptbase(struct vcpu *v) + { + struct cpu_info *cpu_info = get_cpu_info(); + unsigned long new_cr4; + + new_cr4 = (is_pv_vcpu(v) && !is_idle_vcpu(v)) +- ? pv_guest_cr4_to_real_cr4(v) +- : ((read_cr4() & ~(X86_CR4_PCIDE | X86_CR4_TSD)) | X86_CR4_PGE); ++ ? pv_make_cr4(v) : mmu_cr4_features; + + if ( is_pv_vcpu(v) && v->domain->arch.pv_domain.xpti ) + { +@@ -550,8 +530,6 @@ void write_ptbase(struct vcpu *v) + switch_cr3_cr4(v->arch.cr3, new_cr4); + cpu_info->pv_cr3 = 0; + } +- +- ASSERT(is_pv_vcpu(v) || read_cr4() == mmu_cr4_features); + } + + /* +diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c +index b75ff6b..3965959 100644 +--- a/xen/arch/x86/pv/domain.c ++++ b/xen/arch/x86/pv/domain.c +@@ -97,6 +97,52 @@ static void release_compat_l4(struct vcpu *v) + v->arch.guest_table_user = pagetable_null(); + } + ++unsigned long pv_fixup_guest_cr4(const struct vcpu *v, unsigned long cr4) ++{ ++ const struct cpuid_policy *p = v->domain->arch.cpuid; ++ ++ /* Discard attempts to set guest controllable bits outside of the policy. */ ++ cr4 &= ~((p->basic.tsc ? 0 : X86_CR4_TSD) | ++ (p->basic.de ? 0 : X86_CR4_DE) | ++ (p->feat.fsgsbase ? 0 : X86_CR4_FSGSBASE) | ++ (p->basic.xsave ? 0 : X86_CR4_OSXSAVE)); ++ ++ /* Masks expected to be disjoint sets. 
*/ ++ BUILD_BUG_ON(PV_CR4_GUEST_MASK & PV_CR4_GUEST_VISIBLE_MASK); ++ ++ /* ++ * A guest sees the policy subset of its own choice of guest controllable ++ * bits, and a subset of Xen's choice of certain hardware settings. ++ */ ++ return ((cr4 & PV_CR4_GUEST_MASK) | ++ (mmu_cr4_features & PV_CR4_GUEST_VISIBLE_MASK)); ++} ++ ++unsigned long pv_make_cr4(const struct vcpu *v) ++{ ++ const struct domain *d = v->domain; ++ unsigned long cr4 = mmu_cr4_features & ++ ~(X86_CR4_PCIDE | X86_CR4_PGE | X86_CR4_TSD); ++ ++ /* ++ * PCIDE or PGE depends on the PCID/XPTI settings, but must not both be ++ * set, as it impacts the safety of TLB flushing. ++ */ ++ if ( d->arch.pv_domain.pcid ) ++ cr4 |= X86_CR4_PCIDE; ++ else if ( !d->arch.pv_domain.xpti ) ++ cr4 |= X86_CR4_PGE; ++ ++ /* ++ * TSD is needed if either the guest has elected to use it, or Xen is ++ * virtualising the TSC value the guest sees. ++ */ ++ if ( d->arch.vtsc || (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) ) ++ cr4 |= X86_CR4_TSD; ++ ++ return cr4; ++} ++ + int switch_compat(struct domain *d) + { + struct vcpu *v; +@@ -191,7 +237,7 @@ int pv_vcpu_initialise(struct vcpu *v) + /* PV guests by default have a 100Hz ticker. */ + v->periodic_period = MILLISECS(10); + +- v->arch.pv_vcpu.ctrlreg[4] = real_cr4_to_pv_guest_cr4(mmu_cr4_features); ++ v->arch.pv_vcpu.ctrlreg[4] = pv_fixup_guest_cr4(v, 0); + + if ( is_pv_32bit_domain(d) ) + { +diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c +index ce2ec76..4abbc14 100644 +--- a/xen/arch/x86/pv/emul-priv-op.c ++++ b/xen/arch/x86/pv/emul-priv-op.c +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -785,8 +786,8 @@ static int write_cr(unsigned int reg, unsigned long val, + } + + case 4: /* Write CR4 */ +- curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val); +- write_cr4(pv_guest_cr4_to_real_cr4(curr)); ++ curr->arch.pv_vcpu.ctrlreg[4] = pv_fixup_guest_cr4(curr, val); ++ write_cr4(pv_make_cr4(curr)); + ctxt_switch_levelling(curr); + return X86EMUL_OKAY; + } +diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h +index ec81d78..c8aa8a5 100644 +--- a/xen/include/asm-x86/domain.h ++++ b/xen/include/asm-x86/domain.h +@@ -610,17 +610,6 @@ bool update_secondary_system_time(struct vcpu *, + void vcpu_show_execution_state(struct vcpu *); + void vcpu_show_registers(const struct vcpu *); + +-/* Clean up CR4 bits that are not under guest control. */ +-unsigned long pv_guest_cr4_fixup(const struct vcpu *, unsigned long guest_cr4); +- +-/* Convert between guest-visible and real CR4 values. */ +-unsigned long pv_guest_cr4_to_real_cr4(const struct vcpu *v); +- +-#define real_cr4_to_pv_guest_cr4(c) \ +- ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD | \ +- X86_CR4_OSXSAVE | X86_CR4_SMEP | \ +- X86_CR4_FSGSBASE | X86_CR4_SMAP | X86_CR4_PCIDE)) +- + #define domain_max_vcpus(d) (is_hvm_domain(d) ? HVM_MAX_VCPUS : MAX_VIRT_CPUS) + + static inline struct vcpu_guest_context *alloc_vcpu_guest_context(void) +diff --git a/xen/include/asm-x86/pv/domain.h b/xen/include/asm-x86/pv/domain.h +index 4fea764..4e4710c 100644 +--- a/xen/include/asm-x86/pv/domain.h ++++ b/xen/include/asm-x86/pv/domain.h +@@ -59,6 +59,23 @@ int pv_vcpu_initialise(struct vcpu *v); + void pv_domain_destroy(struct domain *d); + int pv_domain_initialise(struct domain *d); + ++/* ++ * Bits which a PV guest can toggle in its view of cr4. Some are loaded into ++ * hardware, while some are fully emulated. 
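The composition in pv_fixup_guest_cr4() above, together with the two disjoint masks defined just below, fully determines what a PV guest sees in CR4. The following self-contained C illustration is not part of the patch: it hard-codes the architectural bit positions and omits the CPUID-policy filtering step that the real function performs first.

    #include <stdio.h>

    #define X86_CR4_TSD        (1UL << 2)
    #define X86_CR4_DE         (1UL << 3)
    #define X86_CR4_PAE        (1UL << 5)
    #define X86_CR4_MCE        (1UL << 6)
    #define X86_CR4_OSFXSR     (1UL << 9)
    #define X86_CR4_OSXMMEXCPT (1UL << 10)
    #define X86_CR4_FSGSBASE   (1UL << 16)
    #define X86_CR4_OSXSAVE    (1UL << 18)

    /* Mirrors PV_CR4_GUEST_MASK / PV_CR4_GUEST_VISIBLE_MASK from the patch. */
    #define GUEST_MASK   (X86_CR4_TSD | X86_CR4_DE | X86_CR4_FSGSBASE | \
                          X86_CR4_OSXSAVE)
    #define VISIBLE_MASK (X86_CR4_PAE | X86_CR4_MCE | X86_CR4_OSFXSR | \
                          X86_CR4_OSXMMEXCPT)

    /* Guest's choice of controllable bits, plus Xen's choice of visible bits. */
    static unsigned long guest_view(unsigned long guest_cr4, unsigned long xen_cr4)
    {
        return (guest_cr4 & GUEST_MASK) | (xen_cr4 & VISIBLE_MASK);
    }

    int main(void)
    {
        /* Hypothetical inputs: the guest asks for TSD and PAE; Xen runs with
         * PAE, OSFXSR and FSGSBASE set in mmu_cr4_features. */
        unsigned long guest = X86_CR4_TSD | X86_CR4_PAE;
        unsigned long xen   = X86_CR4_PAE | X86_CR4_OSFXSR | X86_CR4_FSGSBASE;

        /* PAE comes from Xen's side regardless of the guest's request. */
        printf("guest-visible cr4 = %#lx\n", guest_view(guest, xen)); /* 0x224 */
        return 0;
    }

Because FSGSBASE sits in the guest-controllable mask rather than the visible one, only the guest's own choice of that bit is reflected back to it.
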
++ */ ++#define PV_CR4_GUEST_MASK \ ++ (X86_CR4_TSD | X86_CR4_DE | X86_CR4_FSGSBASE | X86_CR4_OSXSAVE) ++ ++/* Bits which a PV guest may observe from the real hardware settings. */ ++#define PV_CR4_GUEST_VISIBLE_MASK \ ++ (X86_CR4_PAE | X86_CR4_MCE | X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT) ++ ++/* Given a new cr4 value, construct the resulting guest-visible cr4 value. */ ++unsigned long pv_fixup_guest_cr4(const struct vcpu *v, unsigned long cr4); ++ ++/* Create a cr4 value to load into hardware, based on vcpu settings. */ ++unsigned long pv_make_cr4(const struct vcpu *v); ++ + #else /* !CONFIG_PV */ + + #include +@@ -68,6 +85,8 @@ static inline int pv_vcpu_initialise(struct vcpu *v) { return -EOPNOTSUPP; } + static inline void pv_domain_destroy(struct domain *d) {} + static inline int pv_domain_initialise(struct domain *d) { return -EOPNOTSUPP; } + ++static inline unsigned long pv_make_cr4(const struct vcpu *v) { return ~0ul; } ++ + #endif /* CONFIG_PV */ + + void paravirt_ctxt_switch_from(struct vcpu *v); Index: head/emulators/xen-kernel/files/xsa293-4.11-2.patch =================================================================== --- head/emulators/xen-kernel/files/xsa293-4.11-2.patch +++ head/emulators/xen-kernel/files/xsa293-4.11-2.patch @@ -0,0 +1,260 @@ +From: Andrew Cooper +Subject: x86/pv: Don't have %cr4.fsgsbase active behind a guest kernels back + +Currently, a 64bit PV guest can appear to set and clear FSGSBASE in %cr4, but +the bit remains set in hardware. Therefore, the {RD,WR}{FS,GS}BASE are usable +even when the guest kernel believes that they are disabled. + +The FSGSBASE feature isn't currently supported in Linux, and its context +switch path has some optimisations which rely on userspace being unable to use +the WR{FS,GS}BASE instructions. Xen's current behaviour undermines this +expectation. + +In 64bit PV guest context, always load the guest kernels setting of FSGSBASE +into %cr4. This requires adjusting how Xen uses the {RD,WR}{FS,GS}BASE +instructions. + + * Delete the cpu_has_fsgsbase helper. It is no longer safe, as users need to + check %cr4 directly. + * The raw __rd{fs,gs}base() helpers are only safe to use when %cr4.fsgsbase + is set. Comment this property. + * The {rd,wr}{fs,gs}{base,shadow}() and read_msr() helpers are updated to use + the current %cr4 value to determine which mechanism to use. + * toggle_guest_mode() and save_segments() are update to avoid reading + fs/gsbase if the values in hardware cannot be stale WRT struct vcpu. A + consequence of this is that the write_cr() path needs to cache the current + bases, as subsequent context switches will skip saving the values. + * write_cr4() is updated to ensure that the shadow %cr4.fsgsbase value is + observed in a safe way WRT the hardware setting, if an interrupt happens to + hit in the middle. + * pv_make_cr4() is updated for 64bit PV guests to use the guest kernels + choice of FSGSBASE. + +This is part of XSA-293 + +Reported-by: Andy Lutomirski +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 675152a..29f892c 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -1433,7 +1433,8 @@ static void save_segments(struct vcpu *v) + regs->fs = read_sreg(fs); + regs->gs = read_sreg(gs); + +- if ( cpu_has_fsgsbase && !is_pv_32bit_vcpu(v) ) ++ /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. 
*/ ++ if ( (read_cr4() & X86_CR4_FSGSBASE) && !is_pv_32bit_vcpu(v) ) + { + v->arch.pv_vcpu.fs_base = __rdfsbase(); + if ( v->arch.flags & TF_kernel_mode ) +diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c +index 3965959..228a174 100644 +--- a/xen/arch/x86/pv/domain.c ++++ b/xen/arch/x86/pv/domain.c +@@ -140,6 +140,16 @@ unsigned long pv_make_cr4(const struct vcpu *v) + if ( d->arch.vtsc || (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) ) + cr4 |= X86_CR4_TSD; + ++ /* ++ * The {RD,WR}{FS,GS}BASE are only useable in 64bit code segments. While ++ * we must not have CR4.FSGSBASE set behind the back of a 64bit PV kernel, ++ * we do leave it set in 32bit PV context to speed up Xen's context switch ++ * path. ++ */ ++ if ( !is_pv_32bit_domain(d) && ++ !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_FSGSBASE) ) ++ cr4 &= ~X86_CR4_FSGSBASE; ++ + return cr4; + } + +@@ -375,7 +385,8 @@ void toggle_guest_mode(struct vcpu *v) + { + ASSERT(!is_pv_32bit_vcpu(v)); + +- if ( cpu_has_fsgsbase ) ++ /* %fs/%gs bases can only be stale if WR{FS,GS}BASE are usable. */ ++ if ( read_cr4() & X86_CR4_FSGSBASE ) + { + if ( v->arch.flags & TF_kernel_mode ) + v->arch.pv_vcpu.gs_base_kernel = __rdgsbase(); +diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c +index 4abbc14..312c1ee 100644 +--- a/xen/arch/x86/pv/emul-priv-op.c ++++ b/xen/arch/x86/pv/emul-priv-op.c +@@ -786,6 +786,17 @@ static int write_cr(unsigned int reg, unsigned long val, + } + + case 4: /* Write CR4 */ ++ /* ++ * If this write will disable FSGSBASE, refresh Xen's idea of the ++ * guest bases now that they can no longer change. ++ */ ++ if ( (curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_FSGSBASE) && ++ !(val & X86_CR4_FSGSBASE) ) ++ { ++ curr->arch.pv_vcpu.fs_base = __rdfsbase(); ++ curr->arch.pv_vcpu.gs_base_kernel = __rdgsbase(); ++ } ++ + curr->arch.pv_vcpu.ctrlreg[4] = pv_fixup_guest_cr4(curr, val); + write_cr4(pv_make_cr4(curr)); + ctxt_switch_levelling(curr); +@@ -835,14 +846,15 @@ static int read_msr(unsigned int reg, uint64_t *val, + case MSR_FS_BASE: + if ( is_pv_32bit_domain(currd) ) + break; +- *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base; ++ *val = (read_cr4() & X86_CR4_FSGSBASE) ? __rdfsbase() ++ : curr->arch.pv_vcpu.fs_base; + return X86EMUL_OKAY; + + case MSR_GS_BASE: + if ( is_pv_32bit_domain(currd) ) + break; +- *val = cpu_has_fsgsbase ? __rdgsbase() +- : curr->arch.pv_vcpu.gs_base_kernel; ++ *val = (read_cr4() & X86_CR4_FSGSBASE) ? 
__rdgsbase() ++ : curr->arch.pv_vcpu.gs_base_kernel; + return X86EMUL_OKAY; + + case MSR_SHADOW_GS_BASE: +diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c +index ecb0149..a353d76 100644 +--- a/xen/arch/x86/setup.c ++++ b/xen/arch/x86/setup.c +@@ -1567,7 +1567,7 @@ void __init noreturn __start_xen(unsigned long mbi_p) + + cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS; + +- if ( cpu_has_fsgsbase ) ++ if ( boot_cpu_has(X86_FEATURE_FSGSBASE) ) + set_in_cr4(X86_CR4_FSGSBASE); + + if ( opt_invpcid && cpu_has_invpcid ) +diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h +index b237da1..861cb0a 100644 +--- a/xen/include/asm-x86/cpufeature.h ++++ b/xen/include/asm-x86/cpufeature.h +@@ -90,7 +90,6 @@ + #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) + + /* CPUID level 0x00000007:0.ebx */ +-#define cpu_has_fsgsbase boot_cpu_has(X86_FEATURE_FSGSBASE) + #define cpu_has_bmi1 boot_cpu_has(X86_FEATURE_BMI1) + #define cpu_has_hle boot_cpu_has(X86_FEATURE_HLE) + #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) +diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h +index afbeb7f..1ba6ee3 100644 +--- a/xen/include/asm-x86/msr.h ++++ b/xen/include/asm-x86/msr.h +@@ -120,6 +120,14 @@ static inline uint64_t rdtsc_ordered(void) + : "=a" (low), "=d" (high) \ + : "c" (counter)) + ++/* ++ * On hardware supporting FSGSBASE, the value loaded into hardware is the ++ * guest kernel's choice for 64bit PV guests (Xen's choice for Idle, HVM and ++ * 32bit PV). ++ * ++ * Therefore, the {RD,WR}{FS,GS}BASE instructions are only safe to use if ++ * %cr4.fsgsbase is set. ++ */ + static inline unsigned long __rdfsbase(void) + { + unsigned long base; +@@ -150,7 +158,7 @@ static inline unsigned long rdfsbase(void) + { + unsigned long base; + +- if ( cpu_has_fsgsbase ) ++ if ( read_cr4() & X86_CR4_FSGSBASE ) + return __rdfsbase(); + + rdmsrl(MSR_FS_BASE, base); +@@ -162,7 +170,7 @@ static inline unsigned long rdgsbase(void) + { + unsigned long base; + +- if ( cpu_has_fsgsbase ) ++ if ( read_cr4() & X86_CR4_FSGSBASE ) + return __rdgsbase(); + + rdmsrl(MSR_GS_BASE, base); +@@ -174,7 +182,7 @@ static inline unsigned long rdgsshadow(void) + { + unsigned long base; + +- if ( cpu_has_fsgsbase ) ++ if ( read_cr4() & X86_CR4_FSGSBASE ) + { + asm volatile ( "swapgs" ); + base = __rdgsbase(); +@@ -188,7 +196,7 @@ static inline unsigned long rdgsshadow(void) + + static inline void wrfsbase(unsigned long base) + { +- if ( cpu_has_fsgsbase ) ++ if ( read_cr4() & X86_CR4_FSGSBASE ) + #ifdef HAVE_AS_FSGSBASE + asm volatile ( "wrfsbase %0" :: "r" (base) ); + #else +@@ -200,7 +208,7 @@ static inline void wrfsbase(unsigned long base) + + static inline void wrgsbase(unsigned long base) + { +- if ( cpu_has_fsgsbase ) ++ if ( read_cr4() & X86_CR4_FSGSBASE ) + #ifdef HAVE_AS_FSGSBASE + asm volatile ( "wrgsbase %0" :: "r" (base) ); + #else +@@ -212,7 +220,7 @@ static inline void wrgsbase(unsigned long base) + + static inline void wrgsshadow(unsigned long base) + { +- if ( cpu_has_fsgsbase ) ++ if ( read_cr4() & X86_CR4_FSGSBASE ) + { + asm volatile ( "swapgs\n\t" + #ifdef HAVE_AS_FSGSBASE +diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h +index 2bd9e69..8e253dc 100644 +--- a/xen/include/asm-x86/processor.h ++++ b/xen/include/asm-x86/processor.h +@@ -305,11 +305,31 @@ static inline unsigned long read_cr4(void) + + static inline void write_cr4(unsigned long val) + { ++ struct cpu_info *info = get_cpu_info(); ++ + /* No global pages in case of PCIDs enabled! 
*/ + ASSERT(!(val & X86_CR4_PGE) || !(val & X86_CR4_PCIDE)); + +- get_cpu_info()->cr4 = val; +- asm volatile ( "mov %0,%%cr4" : : "r" (val) ); ++ /* ++ * On hardware supporting FSGSBASE, the value in %cr4 is the kernel's ++ * choice for 64bit PV guests, which impacts whether Xen can use the ++ * instructions. ++ * ++ * The {rd,wr}{fs,gs}base() helpers use info->cr4 to work out whether it ++ * is safe to execute the {RD,WR}{FS,GS}BASE instruction, falling back to ++ * the MSR path if not. Some users require interrupt safety. ++ * ++ * If FSGSBASE is currently or about to become clear, reflect this in ++ * info->cr4 before updating %cr4, so an interrupt which hits in the ++ * middle won't observe FSGSBASE set in info->cr4 but clear in %cr4. ++ */ ++ info->cr4 = val & (info->cr4 | ~X86_CR4_FSGSBASE); ++ ++ asm volatile ( "mov %[val], %%cr4" ++ : "+m" (info->cr4) /* Force ordering without a barrier. */ ++ : [val] "r" (val) ); ++ ++ info->cr4 = val; + } + + /* Clear and set 'TS' bit respectively */ Index: head/emulators/xen-kernel/files/xsa294-4.11.patch =================================================================== --- head/emulators/xen-kernel/files/xsa294-4.11.patch +++ head/emulators/xen-kernel/files/xsa294-4.11.patch @@ -0,0 +1,71 @@ +From: Jan Beulich +Subject: x86/pv: _toggle_guest_pt() may not skip TLB flush for shadow mode guests + +For shadow mode guests (e.g. PV ones forced into that mode as L1TF +mitigation, or during migration) update_cr3() -> sh_update_cr3() may +result in a change to the (shadow) root page table (compared to the +previous one when running the same vCPU with the same PCID). This can, +first and foremost, be a result of memory pressure on the shadow memory +pool of the domain. Shadow code legitimately relies on the original +(prior to commit 5c81d260c2 ["xen/x86: use PCID feature"]) behavior of +the subsequent CR3 write to flush the TLB of entries still left from +walks with an earlier, different (shadow) root page table. + +Restore the flushing behavior, also for the second CR3 write on the exit +path to guest context when XPTI is active. For the moment accept that +this will introduce more flushes than are strictly necessary - no flush +would be needed when the (shadow) root page table doesn't actually +change, but this information isn't readily (i.e. without introducing a +layering violation) available here. + +This is XSA-294. + +Reported-by: XXX PERSON +Signed-off-by: Jan Beulich +Tested-by: Juergen Gross +Reviewed-by: Andrew Cooper + +diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c +index b75ff6b..528413a 100644 +--- a/xen/arch/x86/pv/domain.c ++++ b/xen/arch/x86/pv/domain.c +@@ -296,21 +296,35 @@ int pv_domain_initialise(struct domain *d) + static void _toggle_guest_pt(struct vcpu *v) + { + const struct domain *d = v->domain; ++ struct cpu_info *cpu_info = get_cpu_info(); ++ unsigned long cr3; + + v->arch.flags ^= TF_kernel_mode; + update_cr3(v); + if ( d->arch.pv_domain.xpti ) + { +- struct cpu_info *cpu_info = get_cpu_info(); +- + cpu_info->root_pgt_changed = true; + cpu_info->pv_cr3 = __pa(this_cpu(root_pgt)) | + (d->arch.pv_domain.pcid + ? get_pcid_bits(v, true) : 0); + } + +- /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */ +- write_cr3(v->arch.cr3); ++ /* ++ * Don't flush user global mappings from the TLB. Don't tick TLB clock. 
++ * ++ * In shadow mode, though, update_cr3() may need to be accompanied by a ++ * TLB flush (for just the incoming PCID), as the top level page table may ++ * have changed behind our backs. To be on the safe side, suppress the ++ * no-flush unconditionally in this case. The XPTI CR3 write, if enabled, ++ * will then need to be a flushing one too. ++ */ ++ cr3 = v->arch.cr3; ++ if ( shadow_mode_enabled(d) ) ++ { ++ cr3 &= ~X86_CR3_NOFLUSH; ++ cpu_info->pv_cr3 &= ~X86_CR3_NOFLUSH; ++ } ++ write_cr3(cr3); + + if ( !(v->arch.flags & TF_kernel_mode) ) + return;
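Stepping back to the write_cr4() change in the second XSA-293 patch above: the single expression it uses to update the cached CR4 copy does two jobs at once, and the argument is easier to check as a standalone model. The sketch below is illustrative only; it ignores the compiler-ordering concern that the real code handles with the "+m" asm constraint, and the two variables merely stand in for get_cpu_info()->cr4 and the real %cr4.

    #define X86_CR4_FSGSBASE (1UL << 16)

    static unsigned long cached_cr4;  /* stands in for get_cpu_info()->cr4 */
    static unsigned long hw_cr4;      /* stands in for the real %cr4 */

    /*
     * Invariant: an interrupt handler consulting cached_cr4 must never see
     * FSGSBASE set there while the bit is clear in hw_cr4, or it would
     * wrongly conclude that {RD,WR}{FS,GS}BASE are usable.
     */
    static void model_write_cr4(unsigned long val)
    {
        /*
         * If val clears FSGSBASE, drop it from the cache before touching
         * hardware; if val sets it, keep the cache clear until hardware has
         * it.  "val & (cached | ~FSGSBASE)" covers both cases in one go.
         */
        cached_cr4 = val & (cached_cr4 | ~X86_CR4_FSGSBASE);

        hw_cr4 = val;          /* the actual CR4 write */

        cached_cr4 = val;      /* cache and hardware now agree */
    }

In either direction, an interrupt landing between any two of these steps at worst sees FSGSBASE clear in the cache and falls back to the MSR path, which is slower but always safe.
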