summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/virtual/kvm/mmu.txt14
-rw-r--r--arch/x86/kvm/mmu.c20
-rw-r--r--include/linux/kvm_host.h2
-rw-r--r--virt/kvm/kvm_main.c21
4 files changed, 42 insertions, 15 deletions
diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index 290894176142..53838d9c6295 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -425,6 +425,20 @@ fault through the slow path.
Since only 19 bits are used to store generation-number on mmio spte, all
pages are zapped when there is an overflow.
+Unfortunately, a single memory access might access kvm_memslots(kvm) multiple
+times, the last one happening when the generation number is retrieved and
+stored into the MMIO spte. Thus, the MMIO spte might be created based on
+out-of-date information, but with an up-to-date generation number.
+
+To avoid this, the generation number is incremented again after synchronize_srcu
+returns; thus, the low bit of kvm_memslots(kvm)->generation is only 1 during a
+memslot update, while some SRCU readers might be using the old copy. We do not
+want to use an MMIO sptes created with an odd generation number, and we can do
+this without losing a bit in the MMIO spte. The low bit of the generation
+is not stored in MMIO spte, and presumed zero when it is extracted out of the
+spte. If KVM is unlucky and creates an MMIO spte while the low bit is 1,
+the next access to the spte will always be a cache miss.
+
Further reading
===============
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c1871b182929..073b39d13696 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -198,16 +198,20 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
/*
- * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
- * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
- * number.
+ * the low bit of the generation number is always presumed to be zero.
+ * This disables mmio caching during memslot updates. The concept is
+ * similar to a seqcount but instead of retrying the access we just punt
+ * and ignore the cache.
+ *
+ * spte bits 3-11 are used as bits 1-9 of the generation number,
+ * the bits 52-61 are used as bits 10-19 of the generation number.
*/
-#define MMIO_SPTE_GEN_LOW_SHIFT 3
+#define MMIO_SPTE_GEN_LOW_SHIFT 2
#define MMIO_SPTE_GEN_HIGH_SHIFT 52
-#define MMIO_GEN_SHIFT 19
-#define MMIO_GEN_LOW_SHIFT 9
-#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_SHIFT 20
+#define MMIO_GEN_LOW_SHIFT 10
+#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
@@ -4424,7 +4428,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
* The very rare case: if the generation-number is round,
* zap all shadow pages.
*/
- if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
+ if (unlikely(kvm_current_mmio_generation(kvm) == 0)) {
printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_invalidate_zap_all_pages(kvm);
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0fbbc7aa02cb..e47c7e2f4d04 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -464,8 +464,6 @@ void kvm_exit(void);
void kvm_get_kvm(struct kvm *kvm);
void kvm_put_kvm(struct kvm *kvm);
-void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
- u64 last_generation);
static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
{
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index aac732d17c17..c803795f4acd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -673,8 +673,7 @@ static void sort_memslots(struct kvm_memslots *slots)
slots->id_to_index[slots->memslots[i].id] = i;
}
-void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
- u64 last_generation)
+void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
{
if (new) {
int id = new->id;
@@ -685,8 +684,6 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new,
if (new->npages != npages)
sort_memslots(slots);
}
-
- slots->generation = last_generation + 1;
}
static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
@@ -708,10 +705,24 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
{
struct kvm_memslots *old_memslots = kvm->memslots;
- update_memslots(slots, new, kvm->memslots->generation);
+ /*
+ * Set the low bit in the generation, which disables SPTE caching
+ * until the end of synchronize_srcu_expedited.
+ */
+ WARN_ON(old_memslots->generation & 1);
+ slots->generation = old_memslots->generation + 1;
+
+ update_memslots(slots, new);
rcu_assign_pointer(kvm->memslots, slots);
synchronize_srcu_expedited(&kvm->srcu);
+ /*
+ * Increment the new memslot generation a second time. This prevents
+ * vm exits that race with memslot updates from caching a memslot
+ * generation that will (potentially) be valid forever.
+ */
+ slots->generation++;
+
kvm_arch_memslots_updated(kvm);
return old_memslots;