From 980c41f554c3029ce4f99678c0cd95296212775f Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 16 Aug 2024 14:28:19 +0100 Subject: [PATCH 01/12] KVM: arm64: Make the exposed feature bits in AA64DFR0_EL1 writable from userspace

KVM exposes the OS double lock feature bit to Guests but returns RAZ/WI on Guest OSDLR_EL1 access. This breaks Guest migration between systems where this feature differs.

Add support to make this feature writable from userspace by setting the mask bit. While at it, set the mask bits for the exposed WRPs (number of watchpoints) as well. Also update the selftest to cover these fields.

However, we still can't make the BRPs and CTX_CMPs fields writable because, as per ARM ARM DDI 0487K.a, section D2.8.3 "Breakpoint types and linking of breakpoints", the highest numbered breakpoints (BRPs) must be context-aware breakpoints (CTX_CMPs). KVM does not trap + emulate the breakpoint registers, and as such cannot support a layout that misaligns with the underlying hardware.

Reviewed-by: Oliver Upton Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20240816132819.34316-1-shameerali.kolothum.thodi@huawei.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 16 +++++++++++++++- .../testing/selftests/kvm/aarch64/set_id_regs.c | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c90324060436..5a49e8331fbf 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2376,7 +2376,21 @@ static const struct sys_reg_desc sys_reg_descs[] = { .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, .reset = read_sanitised_id_aa64dfr0_el1, - .val = ID_AA64DFR0_EL1_PMUVer_MASK | + /* + * Prior to FEAT_Debugv8.9, the architecture defines context-aware + * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs). + * KVM does not trap + emulate the breakpoint registers, and as such + * cannot support a layout that misaligns with the underlying hardware. + * While it may be possible to describe a subset that aligns with + * hardware, just prevent changes to BRPs and CTX_CMPs altogether for + * simplicity. + * + * See DDI0487K.a, section D2.8.3 Breakpoint types and linking + * of breakpoints for more details. + */ + .val = ID_AA64DFR0_EL1_DoubleLock_MASK | + ID_AA64DFR0_EL1_WRPs_MASK | + ID_AA64DFR0_EL1_PMUVer_MASK | ID_AA64DFR0_EL1_DebugVer_MASK, }, ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index d20981663831..6edc5412abe8 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -68,6 +68,8 @@ struct test_feature_reg { } static const struct reg_ftr_bits ftr_id_aa64dfr0_el1[] = { + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DoubleLock, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, WRPs, 0), S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, PMUVer, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DebugVer, ID_AA64DFR0_EL1_DebugVer_IMP), REG_FTR_END,

From ffe68b2d19a5a84440fea99a732cfc3b157559eb Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Tue, 23 Jul 2024 03:20:00 -0400 Subject: [PATCH 02/12] KVM: arm64: Disable fields that KVM doesn't know how to handle in ID_AA64PFR1_EL1

For some of the fields in the ID_AA64PFR1_EL1 register, KVM doesn't know how to handle them right now.
So explicitly disable them in the register accessor; those fields will then be masked to 0 even if the field value on the hardware is 1. This is safe because, from a UAPI point of view, read_sanitised_ftr_reg() doesn't yet return a nonzero value for any of those fields. This will benefit migration in cases where the host and the VM have different values when restoring a VM. Those fields include RNDR_trap, NMI, MTE_frac, GCS, THE, MTEX, DF2, PFAR.

Signed-off-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240723072004.1470688-2-shahuang@redhat.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 8 ++++++++ 1 file changed, 8 insertions(+)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 5a49e8331fbf..0d983c1b859b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1538,6 +1538,14 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu))

From e8d164974cfa46fe5ec87869c8a7113641f322d5 Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Tue, 23 Jul 2024 03:20:01 -0400 Subject: [PATCH 03/12] KVM: arm64: Use kvm_has_feat() to check if FEAT_SSBS is advertised to the guest

Currently KVM uses cpus_have_final_cap() to check if FEAT_SSBS is advertised to the guest. But this is wrong when FEAT_SSBS is writable and isn't actually advertised to the guest. Update it to use kvm_has_feat() to check if FEAT_SSBS is advertised to the guest, so that KVM can do the right thing when FEAT_SSBS isn't advertised to the guest.

Signed-off-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240723072004.1470688-3-shahuang@redhat.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hypercalls.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 5763d979d8ca..ee6573befb81 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -317,7 +317,7 @@ int kvm_smccc_call_handler(struct kvm_vcpu *vcpu) * to the guest, and hide SSBS so that the * guest stays protected. */ - if (cpus_have_final_cap(ARM64_SSBS)) + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) break; fallthrough; case SPECTRE_UNAFFECTED: @@ -428,7 +428,7 @@ int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) * Convert the workaround level into an easy-to-compare number, where higher * values mean better protection. */ -static int get_kernel_wa_level(u64 regid) +static int get_kernel_wa_level(struct kvm_vcpu *vcpu, u64 regid) { switch (regid) { case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: @@ -449,7 +449,7 @@ static int get_kernel_wa_level(u64 regid) * don't have any FW mitigation if SSBS is there at * all times.
*/ - if (cpus_have_final_cap(ARM64_SSBS)) + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; fallthrough; case SPECTRE_UNAFFECTED: @@ -486,7 +486,7 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: - val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; + val = get_kernel_wa_level(vcpu, reg->id) & KVM_REG_FEATURE_LEVEL_MASK; break; case KVM_REG_ARM_STD_BMAP: val = READ_ONCE(smccc_feat->std_bmap); @@ -588,7 +588,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (val & ~KVM_REG_FEATURE_LEVEL_MASK) return -EINVAL; - if (get_kernel_wa_level(reg->id) < val) + if (get_kernel_wa_level(vcpu, reg->id) < val) return -EINVAL; return 0; @@ -624,7 +624,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the * other way around. */ - if (get_kernel_wa_level(reg->id) < wa_level) + if (get_kernel_wa_level(vcpu, reg->id) < wa_level) return -EINVAL; return 0;

From 78c4446b5f957fb14737582e503b1b25f66edc45 Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Tue, 23 Jul 2024 03:20:02 -0400 Subject: [PATCH 04/12] KVM: arm64: Allow userspace to change ID_AA64PFR1_EL1

Allow userspace to change the guest-visible value of the register, with different handling for each group of fields:

- Since RAS and MPAM are not writable in the ID_AA64PFR0_EL1 register, RAS_frac and MPAM_frac are not writable in the ID_AA64PFR1_EL1 register either.

- MTE is controlled by a separate UAPI (KVM_CAP_ARM_MTE) with an internal flag (KVM_ARCH_FLAG_MTE_ENABLED), so it is not writable.

- The fields which KVM doesn't know how to handle are not exposed to the guest (they are disabled in the register read accessor), so their value always reads as 0. These fields don't have a defined behaviour yet, so don't advertise them to userspace either; they remain non-writable. These fields are SME, RNDR_trap, NMI, GCS, THE, DF2, PFAR, MTE_frac and MTEX.

- BT, SSBS and CSV2_frac don't introduce any new registers which KVM doesn't know how to handle, so they can be written without ill effect. Make them writable. Note that KVM doesn't cross-check CSV2_frac against CSV2 even though the former depends on the latter; keeping them consistent is left to the VMM rather than KVM.
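For illustration only (not part of this patch; the helper name, error handling and vcpu_fd are made up for the example), the userspace flow this enables might look like reading ID_AA64PFR1_EL1, clearing a now-writable field such as SSBS, and writing it back before the first KVM_RUN:

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>
	#include <asm/kvm.h>

	/* ID_AA64PFR1_EL1 is op0=3, op1=0, CRn=0, CRm=4, op2=1 */
	#define ID_AA64PFR1_EL1_ID	ARM64_SYS_REG(3, 0, 0, 4, 1)
	#define SSBS_MASK		(0xfULL << 4)	/* ID_AA64PFR1_EL1.SSBS, bits [7:4] */

	static int hide_ssbs(int vcpu_fd)
	{
		uint64_t val;
		struct kvm_one_reg reg = {
			.id   = ID_AA64PFR1_EL1_ID,
			.addr = (uint64_t)&val,
		};

		if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg))
			return -1;

		/* Zeroing a writable field is always accepted */
		val &= ~SSBS_MASK;
		return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
	}

Writes that change any of the fields kept non-writable above are still expected to fail with -EINVAL.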
Signed-off-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240723072004.1470688-4-shahuang@redhat.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 0d983c1b859b..da7b8e078b20 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2370,7 +2370,19 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64PFR0_EL1_GIC | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP), }, - ID_SANITISED(ID_AA64PFR1_EL1), + ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_PFAR | + ID_AA64PFR1_EL1_DF2 | + ID_AA64PFR1_EL1_MTEX | + ID_AA64PFR1_EL1_THE | + ID_AA64PFR1_EL1_GCS | + ID_AA64PFR1_EL1_MTE_frac | + ID_AA64PFR1_EL1_NMI | + ID_AA64PFR1_EL1_RNDR_trap | + ID_AA64PFR1_EL1_SME | + ID_AA64PFR1_EL1_RES0 | + ID_AA64PFR1_EL1_MPAM_frac | + ID_AA64PFR1_EL1_RAS_frac | + ID_AA64PFR1_EL1_MTE)), ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), From dc9b5d7e0bd40e68a94013766b27be3dda10c006 Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Tue, 23 Jul 2024 03:20:03 -0400 Subject: [PATCH 05/12] KVM: selftests: aarch64: Add writable test for ID_AA64PFR1_EL1 Add writable test for the ID_AA64PFR1_EL1 register. Signed-off-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240723072004.1470688-5-shahuang@redhat.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index 6edc5412abe8..01522567a8c2 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -135,6 +135,13 @@ static const struct reg_ftr_bits ftr_id_aa64pfr0_el1[] = { REG_FTR_END, }; +static const struct reg_ftr_bits ftr_id_aa64pfr1_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, CSV2_frac, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, SSBS, ID_AA64PFR1_EL1_SSBS_NI), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR1_EL1, BT, 0), + REG_FTR_END, +}; + static const struct reg_ftr_bits ftr_id_aa64mmfr0_el1[] = { REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, ECV, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, EXS, 0), @@ -201,6 +208,7 @@ static struct test_feature_reg test_regs[] = { TEST_REG(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1_el1), TEST_REG(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2_el1), TEST_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0_el1), + TEST_REG(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1_el1), TEST_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0_el1), TEST_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1_el1), TEST_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2_el1), @@ -570,9 +578,9 @@ int main(void) test_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + - ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + - ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - - ARRAY_SIZE(test_regs) + 2; + ARRAY_SIZE(ftr_id_aa64pfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + 2; ksft_set_plan(test_cnt); From ae8f8b37610269009326f4318df161206c59843e Mon Sep 17 00:00:00 2001 
From: Oliver Upton Date: Mon, 7 Oct 2024 22:39:09 +0000 Subject: [PATCH 06/12] KVM: arm64: Unregister redistributor for failed vCPU creation Alex reports that syzkaller has managed to trigger a use-after-free when tearing down a VM: BUG: KASAN: slab-use-after-free in kvm_put_kvm+0x300/0xe68 virt/kvm/kvm_main.c:5769 Read of size 8 at addr ffffff801c6890d0 by task syz.3.2219/10758 CPU: 3 UID: 0 PID: 10758 Comm: syz.3.2219 Not tainted 6.11.0-rc6-dirty #64 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x17c/0x1a8 arch/arm64/kernel/stacktrace.c:317 show_stack+0x2c/0x3c arch/arm64/kernel/stacktrace.c:324 __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x94/0xc0 lib/dump_stack.c:119 print_report+0x144/0x7a4 mm/kasan/report.c:377 kasan_report+0xcc/0x128 mm/kasan/report.c:601 __asan_report_load8_noabort+0x20/0x2c mm/kasan/report_generic.c:381 kvm_put_kvm+0x300/0xe68 virt/kvm/kvm_main.c:5769 kvm_vm_release+0x4c/0x60 virt/kvm/kvm_main.c:1409 __fput+0x198/0x71c fs/file_table.c:422 ____fput+0x20/0x30 fs/file_table.c:450 task_work_run+0x1cc/0x23c kernel/task_work.c:228 do_notify_resume+0x144/0x1a0 include/linux/resume_user_mode.h:50 el0_svc+0x64/0x68 arch/arm64/kernel/entry-common.c:169 el0t_64_sync_handler+0x90/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 Upon closer inspection, it appears that we do not properly tear down the MMIO registration for a vCPU that fails creation late in the game, e.g. a vCPU w/ the same ID already exists in the VM. It is important to consider the context of commit that introduced this bug by moving the unregistration out of __kvm_vgic_vcpu_destroy(). That change correctly sought to avoid an srcu v. config_lock inversion by breaking up the vCPU teardown into two parts, one guarded by the config_lock. Fix the use-after-free while avoiding lock inversion by adding a special-cased unregistration to __kvm_vgic_vcpu_destroy(). This is safe because failed vCPUs are torn down outside of the config_lock. Cc: stable@vger.kernel.org Fixes: f616506754d3 ("KVM: arm64: vgic: Don't hold config_lock while unregistering redistributors") Reported-by: Alexander Potapenko Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241007223909.2157336-1-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-init.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index e7c53e8af3d1..57aedf7b2b76 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -417,8 +417,28 @@ static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) kfree(vgic_cpu->private_irqs); vgic_cpu->private_irqs = NULL; - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + /* + * If this vCPU is being destroyed because of a failed creation + * then unregister the redistributor to avoid leaving behind a + * dangling pointer to the vCPU struct. + * + * vCPUs that have been successfully created (i.e. added to + * kvm->vcpu_array) get unregistered in kvm_vgic_destroy(), as + * this function gets called while holding kvm->arch.config_lock + * in the VM teardown path and would otherwise introduce a lock + * inversion w.r.t. kvm->srcu. 
+ * + * vCPUs that failed creation are torn down outside of the + * kvm->arch.config_lock and do not get unregistered in + * kvm_vgic_destroy(), meaning it is both safe and necessary to + * do so here. + */ + if (kvm_get_vcpu_by_id(vcpu->kvm, vcpu->vcpu_id) != vcpu) + vgic_unregister_redist_iodev(vcpu); + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + } } void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) From 6ded46b5a4fd7fc9c6104b770627043aaf996abf Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Oct 2024 23:30:25 +0000 Subject: [PATCH 07/12] KVM: arm64: nv: Keep reference on stage-2 MMU when scheduled out If a vCPU is scheduling out and not in WFI emulation, it is highly likely it will get scheduled again soon and reuse the MMU it had before. Dropping the MMU at vcpu_put() can have some unfortunate consequences, as the MMU could get reclaimed and used in a different context, forcing another 'cold start' on an otherwise active MMU. Avoid that altogether by keeping a reference on the MMU if the vCPU is scheduling out, ensuring that another vCPU cannot reclaim it while the current vCPU is away. Since there are more MMUs than vCPUs, this does not affect the guarantee that an unused MMU is available at any time. Furthermore, this makes the vcpu->arch.hw_mmu ~stable in preemptible code, at least for where it matters in the stage-2 abort path. Yes, the MMU can change across WFI emulation, but there isn't even a use case where this would matter. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241007233028.2236133-2-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index f9e30dd34c7a..df670c14e1c6 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -663,6 +663,13 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu) void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) { + /* + * The vCPU kept its reference on the MMU after the last put, keep + * rolling with it. + */ + if (vcpu->arch.hw_mmu) + return; + if (is_hyp_ctxt(vcpu)) { vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; } else { @@ -674,10 +681,18 @@ void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) { - if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu)) { + /* + * Keep a reference on the associated stage-2 MMU if the vCPU is + * scheduling out and not in WFI emulation, suggesting it is likely to + * reuse the MMU sometime soon. + */ + if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI)) + return; + + if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu)) atomic_dec(&vcpu->arch.hw_mmu->refcnt); - vcpu->arch.hw_mmu = NULL; - } + + vcpu->arch.hw_mmu = NULL; } /* From 3c164eb9464d39ba339c1487dcac0dc9508e03f0 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Oct 2024 23:30:26 +0000 Subject: [PATCH 08/12] KVM: arm64: nv: Do not block when unmapping stage-2 if disallowed Right now the nested code allows unmap operations on a shadow stage-2 to block unconditionally. This is wrong in a couple places, such as a non-blocking MMU notifier or on the back of a sched_in() notifier as part of shadow MMU recycling. Carry through whether or not blocking is allowed to kvm_pgtable_stage2_unmap(). This 'fixes' an issue where stage-2 MMU reclaim would precipitate a stack overflow from a pile of kvm_sched_in() callbacks, all trying to recycle a stage-2 MMU. 
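In code terms, condensed from the hunks below, the convention after this change is that the caller states whether sleeping is allowed; only paths that can sleep pass true:

	/* MMU notifier path: honour the notifier's may_block hint */
	kvm_nested_s2_unmap(kvm, range->may_block);

	/* Shadow MMU recycling from vcpu_load()/sched_in(): never block */
	kvm_stage2_unmap_range(s2_mmu, 0, kvm_phys_size(s2_mmu), false);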
Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241007233028.2236133-3-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_mmu.h | 3 ++- arch/arm64/include/asm/kvm_nested.h | 2 +- arch/arm64/kvm/mmu.c | 15 ++++++++------- arch/arm64/kvm/nested.c | 6 +++--- arch/arm64/kvm/sys_regs.c | 6 +++--- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index cd4087fbda9a..66d93e320ec8 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -166,7 +166,8 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr); void __init free_hyp_pgds(void); -void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size); +void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, + u64 size, bool may_block); void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end); void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end); diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index e8bc6d67aba2..e74b90dcfac4 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -124,7 +124,7 @@ extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans); extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2); extern void kvm_nested_s2_wp(struct kvm *kvm); -extern void kvm_nested_s2_unmap(struct kvm *kvm); +extern void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block); extern void kvm_nested_s2_flush(struct kvm *kvm); unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a509b63bd4dd..0f7658aefa1a 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -328,9 +328,10 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 may_block)); } -void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) +void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, + u64 size, bool may_block) { - __unmap_stage2_range(mmu, start, size, true); + __unmap_stage2_range(mmu, start, size, may_block); } void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) @@ -1015,7 +1016,7 @@ static void stage2_unmap_memslot(struct kvm *kvm, if (!(vma->vm_flags & VM_PFNMAP)) { gpa_t gpa = addr + (vm_start - memslot->userspace_addr); - kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); } hva = vm_end; } while (hva < reg_end); @@ -1042,7 +1043,7 @@ void stage2_unmap_vm(struct kvm *kvm) kvm_for_each_memslot(memslot, bkt, slots) stage2_unmap_memslot(kvm, memslot); - kvm_nested_s2_unmap(kvm); + kvm_nested_s2_unmap(kvm, true); write_unlock(&kvm->mmu_lock); mmap_read_unlock(current->mm); @@ -1912,7 +1913,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) (range->end - range->start) << PAGE_SHIFT, range->may_block); - kvm_nested_s2_unmap(kvm); + kvm_nested_s2_unmap(kvm, range->may_block); return false; } @@ -2179,8 +2180,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; write_lock(&kvm->mmu_lock); - kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size); - kvm_nested_s2_unmap(kvm); + 
kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); + kvm_nested_s2_unmap(kvm, true); write_unlock(&kvm->mmu_lock); } diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index df670c14e1c6..58d3b998793a 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -634,7 +634,7 @@ static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) /* Clear the old state */ if (kvm_s2_mmu_valid(s2_mmu)) - kvm_stage2_unmap_range(s2_mmu, 0, kvm_phys_size(s2_mmu)); + kvm_stage2_unmap_range(s2_mmu, 0, kvm_phys_size(s2_mmu), false); /* * The virtual VMID (modulo CnP) will be used as a key when matching @@ -745,7 +745,7 @@ void kvm_nested_s2_wp(struct kvm *kvm) } } -void kvm_nested_s2_unmap(struct kvm *kvm) +void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block) { int i; @@ -755,7 +755,7 @@ void kvm_nested_s2_unmap(struct kvm *kvm) struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; if (kvm_s2_mmu_valid(mmu)) - kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu)); + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block); } } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c75dfc702a5c..edd385baf270 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2937,7 +2937,7 @@ static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the * corresponding VMIDs. */ - kvm_nested_s2_unmap(vcpu->kvm); + kvm_nested_s2_unmap(vcpu->kvm, true); write_unlock(&vcpu->kvm->mmu_lock); @@ -2989,7 +2989,7 @@ union tlbi_info { static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu, const union tlbi_info *info) { - kvm_stage2_unmap_range(mmu, info->range.start, info->range.size); + kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true); } static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -3084,7 +3084,7 @@ static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu, max_size = compute_tlb_inval_range(mmu, info->ipa.addr); base_addr &= ~(max_size - 1); - kvm_stage2_unmap_range(mmu, base_addr, max_size); + kvm_stage2_unmap_range(mmu, base_addr, max_size, true); } static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, From c268f204f7c5784e84583c1c44d427bac09f517a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Oct 2024 23:30:27 +0000 Subject: [PATCH 09/12] KVM: arm64: nv: Punt stage-2 recycling to a vCPU request Currently, when a nested MMU is repurposed for some other MMU context, KVM unmaps everything during vcpu_load() while holding the MMU lock for write. This is quite a performance bottleneck for large nested VMs, as all vCPU scheduling will spin until the unmap completes. Start punting the MMU cleanup to a vCPU request, where it is then possible to periodically release the MMU lock and CPU in the presence of contention. Ensure that no vCPU winds up using a stale MMU by tracking the pending unmap on the S2 MMU itself and requesting an unmap on every vCPU that finds it. 
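The resulting flow, condensed from the diff below: the recycled MMU is only flagged at vcpu_load() time, and the actual unmap happens later from the run loop, where the MMU lock can be dropped under contention:

	/* get_s2_mmu_nested(): flag the MMU instead of unmapping it inline */
	if (kvm_s2_mmu_valid(s2_mmu))
		s2_mmu->pending_unmap = true;
	...
	if (s2_mmu->pending_unmap)
		kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu);

	/* check_nested_vcpu_requests(), called from the vCPU run loop */
	if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) {
		write_lock(&vcpu->kvm->mmu_lock);
		if (mmu->pending_unmap) {
			kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true);
			mmu->pending_unmap = false;
		}
		write_unlock(&vcpu->kvm->mmu_lock);
	}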
Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241007233028.2236133-4-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 7 +++++++ arch/arm64/include/asm/kvm_nested.h | 2 ++ arch/arm64/kvm/arm.c | 2 ++ arch/arm64/kvm/nested.c | 28 ++++++++++++++++++++++++++-- 4 files changed, 37 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 94cff508874b..bf64fed9820e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -51,6 +51,7 @@ #define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) #define KVM_REQ_SUSPEND KVM_ARCH_REQ(6) #define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7) +#define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) @@ -211,6 +212,12 @@ struct kvm_s2_mmu { */ bool nested_stage2_enabled; + /* + * true when this MMU needs to be unmapped before being used for a new + * purpose. + */ + bool pending_unmap; + /* * 0: Nobody is currently using this, check vttbr for validity * >0: Somebody is actively using this. diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index e74b90dcfac4..233e65522716 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -78,6 +78,8 @@ extern void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid, extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu); extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu); +extern void check_nested_vcpu_requests(struct kvm_vcpu *vcpu); + struct kvm_s2_trans { phys_addr_t output; unsigned long block_size; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a0d01c46e408..c7b659604bae 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1031,6 +1031,8 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_dirty_ring_check_request(vcpu)) return 0; + + check_nested_vcpu_requests(vcpu); } return 1; diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 58d3b998793a..c4b17d90fc49 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -632,9 +632,9 @@ static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) /* Set the scene for the next search */ kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size; - /* Clear the old state */ + /* Make sure we don't forget to do the laundry */ if (kvm_s2_mmu_valid(s2_mmu)) - kvm_stage2_unmap_range(s2_mmu, 0, kvm_phys_size(s2_mmu), false); + s2_mmu->pending_unmap = true; /* * The virtual VMID (modulo CnP) will be used as a key when matching @@ -650,6 +650,16 @@ static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) out: atomic_inc(&s2_mmu->refcnt); + + /* + * Set the vCPU request to perform an unmap, even if the pending unmap + * originates from another vCPU. This guarantees that the MMU has been + * completely unmapped before any vCPU actually uses it, and allows + * multiple vCPUs to lend a hand with completing the unmap. 
+ */ + if (s2_mmu->pending_unmap) + kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu); + return s2_mmu; } @@ -1199,3 +1209,17 @@ int kvm_init_nv_sysregs(struct kvm *kvm) return 0; } + +void check_nested_vcpu_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) { + struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu; + + write_lock(&vcpu->kvm->mmu_lock); + if (mmu->pending_unmap) { + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true); + mmu->pending_unmap = false; + } + write_unlock(&vcpu->kvm->mmu_lock); + } +} From 79cc6cdb932a5cf1a1ee05f6de12a7d102818d21 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 7 Oct 2024 23:30:28 +0000 Subject: [PATCH 10/12] KVM: arm64: nv: Clarify safety of allowing TLBI unmaps to reschedule There's been a decent amount of attention around unmaps of nested MMUs, and TLBI handling is no exception to this. Add a comment clarifying why it is safe to reschedule during a TLBI unmap, even without a reference on the MMU in progress. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241007233028.2236133-5-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index edd385baf270..1d84bc3396cb 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2989,6 +2989,29 @@ union tlbi_info { static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu, const union tlbi_info *info) { + /* + * The unmap operation is allowed to drop the MMU lock and block, which + * means that @mmu could be used for a different context than the one + * currently being invalidated. + * + * This behavior is still safe, as: + * + * 1) The vCPU(s) that recycled the MMU are responsible for invalidating + * the entire MMU before reusing it, which still honors the intent + * of a TLBI. + * + * 2) Until the guest TLBI instruction is 'retired' (i.e. increment PC + * and ERET to the guest), other vCPUs are allowed to use stale + * translations. + * + * 3) Accidentally unmapping an unrelated MMU context is nonfatal, and + * at worst may cause more aborts for shadow stage-2 fills. + * + * Dropping the MMU lock also implies that shadow stage-2 fills could + * happen behind the back of the TLBI. This is still safe, though, as + * the L1 needs to put its stage-2 in a consistent state before doing + * the TLBI. + */ kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true); } @@ -3084,6 +3107,10 @@ static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu, max_size = compute_tlb_inval_range(mmu, info->ipa.addr); base_addr &= ~(max_size - 1); + /* + * See comment in s2_mmu_unmap_range() for why this is allowed to + * reschedule. + */ kvm_stage2_unmap_range(mmu, base_addr, max_size, true); } From d4a89e5aee23eaebdc45f63cb3d6d5917ff6acf4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 5 Oct 2024 00:19:37 +0100 Subject: [PATCH 11/12] KVM: arm64: Expose S1PIE to guests Prior to commit 70ed7238297f ("KVM: arm64: Sanitise ID_AA64MMFR3_EL1") we just exposed the santised view of ID_AA64MMFR3_EL1 to guests, meaning that they saw both TCRX and S1PIE if present on the host machine. That commit added VMM control over the contents of the register and exposed S1POE but removed S1PIE, meaning that the extension is no longer visible to guests. Reenable support for S1PIE with VMM control. 
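For completeness, a VMM or test can confirm the field is visible again with a plain KVM_GET_ONE_REG. This is an illustrative fragment only, reusing the vcpu_fd and headers from the ID_AA64PFR1_EL1 sketch earlier in the series; the S1PIE field position (bits [11:8]) is taken from the architecture, not from this patch:

	/* ID_AA64MMFR3_EL1 is op0=3, op1=0, CRn=0, CRm=7, op2=3 */
	#define ID_AA64MMFR3_EL1_ID	ARM64_SYS_REG(3, 0, 0, 7, 3)

	uint64_t mmfr3 = 0;
	struct kvm_one_reg reg = {
		.id   = ID_AA64MMFR3_EL1_ID,
		.addr = (uint64_t)&mmfr3,
	};

	if (!ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) && ((mmfr3 >> 8) & 0xf))
		printf("FEAT_S1PIE visible to the guest\n");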
Fixes: 70ed7238297f ("KVM: arm64: Sanitise ID_AA64MMFR3_EL1") Signed-off-by: Mark Brown Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20241005-kvm-arm64-fix-s1pie-v1-1-5901f02de749@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1d84bc3396cb..375052d8cd22 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1558,7 +1558,8 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; break; case SYS_ID_AA64MMFR3_EL1: - val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE; + val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE | + ID_AA64MMFR3_EL1_S1PIE; break; case SYS_ID_MMFR4_EL1: val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX); @@ -2467,6 +2468,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64MMFR2_EL1_NV | ID_AA64MMFR2_EL1_CCIDX)), ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | + ID_AA64MMFR3_EL1_S1PIE | ID_AA64MMFR3_EL1_S1POE)), ID_SANITISED(ID_AA64MMFR4_EL1), ID_UNALLOCATED(7,5),

From df5fd75ee305cb5927e0b1a0b46cc988ad8db2b1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 9 Oct 2024 19:36:03 +0100 Subject: [PATCH 12/12] KVM: arm64: Don't eagerly teardown the vgic on init error

As there is very little ordering in the KVM API, userspace can instantiate a half-baked GIC (missing its memory map, for example) at almost any time. This means that, with the right timing, a thread running vcpu-0 can enter the kernel without a GIC configured and get a GIC created behind its back by another thread. Amusingly, it will pick up that GIC and start messing with the data structures without the GIC having been fully initialised.

Similarly, a thread running vcpu-1 can enter the kernel and try to init the GIC that was previously created. Since this GIC isn't properly configured (no memory map), it fails to correctly initialise. And that's the point where we decide to tear down the GIC, freeing all its resources. Behind vcpu-0's back. Things stop pretty abruptly, with a variety of symptoms.

Clearly, this isn't good; we should be a bit more careful about this. It is obvious that this guest is not viable, as it is missing some important part of its configuration. So instead of trying to tear bits of it down, let's just mark it as *dead*. It means that any further interaction from userspace will result in -EIO. The memory will be released on the "normal" path, when userspace gives up.
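Condensed, the new behaviour looks like the following (taken from the diff below; the -EIO on later VM/vCPU ioctls comes from the generic vm_dead handling in virt/kvm/kvm_main.c rather than from this patch):

	/* kvm_vgic_map_resources() error path: mark the VM dead instead
	 * of tearing the half-initialised vgic down */
	if (ret)
		kvm_vm_dead(kvm);

	/* arm.c: a vCPU already inside its run loop notices the request */
	if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
		return -EIO;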
Cc: stable@vger.kernel.org Reported-by: Alexander Potapenko Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20241009183603.3221824-1-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 3 +++ arch/arm64/kvm/vgic/vgic-init.c | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index c7b659604bae..48cafb65d6ac 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -997,6 +997,9 @@ static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu) static int check_vcpu_requests(struct kvm_vcpu *vcpu) { if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) kvm_vcpu_sleep(vcpu); diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 57aedf7b2b76..d88fdeaf6144 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -556,10 +556,10 @@ int kvm_vgic_map_resources(struct kvm *kvm) out: mutex_unlock(&kvm->arch.config_lock); out_slots: - mutex_unlock(&kvm->slots_lock); - if (ret) - kvm_vgic_destroy(kvm); + kvm_vm_dead(kvm); + + mutex_unlock(&kvm->slots_lock); return ret; }