| // SPDX-License-Identifier: GPL-2.0-only |
| /* |
| * Kernel-based Virtual Machine driver for Linux |
| * |
| * derived from drivers/kvm/kvm_main.c |
| * |
| * Copyright (C) 2006 Qumranet, Inc. |
| * Copyright (C) 2008 Qumranet, Inc. |
| * Copyright IBM Corporation, 2008 |
| * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
| * |
| * Authors: |
| * Avi Kivity <avi@qumranet.com> |
| * Yaniv Kamay <yaniv@qumranet.com> |
| * Amit Shah <amit.shah@qumranet.com> |
| * Ben-Ami Yassour <benami@il.ibm.com> |
| */ |
| #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| |
| #include <linux/kvm_host.h> |
| #include "irq.h" |
| #include "ioapic.h" |
| #include "mmu.h" |
| #include "i8254.h" |
| #include "tss.h" |
| #include "kvm_cache_regs.h" |
| #include "kvm_emulate.h" |
| #include "mmu/page_track.h" |
| #include "x86.h" |
| #include "cpuid.h" |
| #include "pmu.h" |
| #include "hyperv.h" |
| #include "lapic.h" |
| #include "xen.h" |
| #include "smm.h" |
| |
| #include <linux/clocksource.h> |
| #include <linux/interrupt.h> |
| #include <linux/kvm.h> |
| #include <linux/fs.h> |
| #include <linux/vmalloc.h> |
| #include <linux/export.h> |
| #include <linux/moduleparam.h> |
| #include <linux/mman.h> |
| #include <linux/highmem.h> |
| #include <linux/iommu.h> |
| #include <linux/cpufreq.h> |
| #include <linux/user-return-notifier.h> |
| #include <linux/srcu.h> |
| #include <linux/slab.h> |
| #include <linux/perf_event.h> |
| #include <linux/uaccess.h> |
| #include <linux/hash.h> |
| #include <linux/pci.h> |
| #include <linux/timekeeper_internal.h> |
| #include <linux/pvclock_gtod.h> |
| #include <linux/kvm_irqfd.h> |
| #include <linux/irqbypass.h> |
| #include <linux/sched/stat.h> |
| #include <linux/sched/isolation.h> |
| #include <linux/mem_encrypt.h> |
| #include <linux/entry-kvm.h> |
| #include <linux/suspend.h> |
| #include <linux/smp.h> |
| |
| #include <trace/events/ipi.h> |
| #include <trace/events/kvm.h> |
| |
| #include <asm/debugreg.h> |
| #include <asm/msr.h> |
| #include <asm/desc.h> |
| #include <asm/mce.h> |
| #include <asm/pkru.h> |
| #include <linux/kernel_stat.h> |
| #include <asm/fpu/api.h> |
| #include <asm/fpu/xcr.h> |
| #include <asm/fpu/xstate.h> |
| #include <asm/pvclock.h> |
| #include <asm/div64.h> |
| #include <asm/irq_remapping.h> |
| #include <asm/mshyperv.h> |
| #include <asm/hypervisor.h> |
| #include <asm/tlbflush.h> |
| #include <asm/intel_pt.h> |
| #include <asm/emulate_prefix.h> |
| #include <asm/sgx.h> |
| #include <clocksource/hyperv_timer.h> |
| |
| #define CREATE_TRACE_POINTS |
| #include "trace.h" |
| |
| #define MAX_IO_MSRS 256 |
| #define KVM_MAX_MCE_BANKS 32 |
| |
| struct kvm_caps kvm_caps __read_mostly = { |
| .supported_mce_cap = MCG_CTL_P | MCG_SER_P, |
| }; |
| EXPORT_SYMBOL_GPL(kvm_caps); |
| |
/* Wrap an errno in an ERR_PTR() and cast it to a __user pointer. */
#define ERR_PTR_USR(e)		((void __user *)ERR_PTR(e))

/* Recover the vCPU pointer stored in an emulator context's ->vcpu field. */
#define emul_to_vcpu(ctxt)	((struct kvm_vcpu *)(ctxt)->vcpu)
| |
| /* EFER defaults: |
| * - enable syscall per default because its emulated by KVM |
| * - enable LME and LMA per default on 64 bit KVM |
| */ |
| #ifdef CONFIG_X86_64 |
| static |
| u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); |
| #else |
| static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); |
| #endif |
| |
| static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; |
| |
/* Masks of the bits/flags accepted for the corresponding settings. */
#define KVM_EXIT_HYPERCALL_VALID_MASK	(1 << KVM_HC_MAP_GPA_RANGE)

#define KVM_CAP_PMU_VALID_MASK		KVM_PMU_CAP_DISABLE

#define KVM_X2APIC_API_VALID_FLAGS				\
	(KVM_X2APIC_API_USE_32BIT_IDS |				\
	 KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
| |
/*
 * Forward declarations of file-local (static) helpers that are referenced
 * before their definitions.
 */
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);

static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
| |
| static DEFINE_MUTEX(vendor_module_lock); |
| struct kvm_x86_ops kvm_x86_ops __read_mostly; |
| |
| #define KVM_X86_OP(func) \ |
| DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ |
| *(((struct kvm_x86_ops *)0)->func)); |
| #define KVM_X86_OP_OPTIONAL KVM_X86_OP |
| #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP |
| #include <asm/kvm-x86-ops.h> |
| EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); |
| EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); |
| |
| static bool __read_mostly ignore_msrs = 0; |
| module_param(ignore_msrs, bool, 0644); |
| |
| bool __read_mostly report_ignored_msrs = true; |
| module_param(report_ignored_msrs, bool, 0644); |
| EXPORT_SYMBOL_GPL(report_ignored_msrs); |
| |
| unsigned int min_timer_period_us = 200; |
| module_param(min_timer_period_us, uint, 0644); |
| |
| static bool __read_mostly kvmclock_periodic_sync = true; |
| module_param(kvmclock_periodic_sync, bool, 0444); |
| |
| /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ |
| static u32 __read_mostly tsc_tolerance_ppm = 250; |
| module_param(tsc_tolerance_ppm, uint, 0644); |
| |
| /* |
| * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables |
| * adaptive tuning starting from default advancement of 1000ns. '0' disables |
| * advancement entirely. Any other value is used as-is and disables adaptive |
| * tuning, i.e. allows privileged userspace to set an exact advancement time. |
| */ |
| static int __read_mostly lapic_timer_advance_ns = -1; |
| module_param(lapic_timer_advance_ns, int, 0644); |
| |
| static bool __read_mostly vector_hashing = true; |
| module_param(vector_hashing, bool, 0444); |
| |
| bool __read_mostly enable_vmware_backdoor = false; |
| module_param(enable_vmware_backdoor, bool, 0444); |
| EXPORT_SYMBOL_GPL(enable_vmware_backdoor); |
| |
| /* |
| * Flags to manipulate forced emulation behavior (any non-zero value will |
| * enable forced emulation). |
| */ |
| #define KVM_FEP_CLEAR_RFLAGS_RF BIT(1) |
| static int __read_mostly force_emulation_prefix; |
| module_param(force_emulation_prefix, int, 0644); |
| |
| int __read_mostly pi_inject_timer = -1; |
| module_param(pi_inject_timer, bint, 0644); |
| |
| /* Enable/disable PMU virtualization */ |
| bool __read_mostly enable_pmu = true; |
| EXPORT_SYMBOL_GPL(enable_pmu); |
| module_param(enable_pmu, bool, 0444); |
| |
| bool __read_mostly eager_page_split = true; |
| module_param(eager_page_split, bool, 0644); |
| |
| /* Enable/disable SMT_RSB bug mitigation */ |
| static bool __read_mostly mitigate_smt_rsb; |
| module_param(mitigate_smt_rsb, bool, 0444); |
| |
| /* |
| * Restoring the host value for MSRs that are only consumed when running in |
| * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU |
| * returns to userspace, i.e. the kernel can run with the guest's value. |
| */ |
| #define KVM_MAX_NR_USER_RETURN_MSRS 16 |
| |
| struct kvm_user_return_msrs { |
| struct user_return_notifier urn; |
| bool registered; |
| struct kvm_user_return_msr_values { |
| u64 host; |
| u64 curr; |
| } values[KVM_MAX_NR_USER_RETURN_MSRS]; |
| }; |
| |
| u32 __read_mostly kvm_nr_uret_msrs; |
| EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs); |
| static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; |
| static struct kvm_user_return_msrs __percpu *user_return_msrs; |
| |
| #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ |
| | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ |
| | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ |
| | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) |
| |
| u64 __read_mostly host_efer; |
| EXPORT_SYMBOL_GPL(host_efer); |
| |
| bool __read_mostly allow_smaller_maxphyaddr = 0; |
| EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); |
| |
| bool __read_mostly enable_apicv = true; |
| EXPORT_SYMBOL_GPL(enable_apicv); |
| |
| u64 __read_mostly host_xss; |
| EXPORT_SYMBOL_GPL(host_xss); |
| |
| u64 __read_mostly host_arch_capabilities; |
| EXPORT_SYMBOL_GPL(host_arch_capabilities); |
| |
| const struct _kvm_stats_desc kvm_vm_stats_desc[] = { |
| KVM_GENERIC_VM_STATS(), |
| STATS_DESC_COUNTER(VM, mmu_shadow_zapped), |
| STATS_DESC_COUNTER(VM, mmu_pte_write), |
| STATS_DESC_COUNTER(VM, mmu_pde_zapped), |
| STATS_DESC_COUNTER(VM, mmu_flooded), |
| STATS_DESC_COUNTER(VM, mmu_recycled), |
| STATS_DESC_COUNTER(VM, mmu_cache_miss), |
| STATS_DESC_ICOUNTER(VM, mmu_unsync), |
| STATS_DESC_ICOUNTER(VM, pages_4k), |
| STATS_DESC_ICOUNTER(VM, pages_2m), |
| STATS_DESC_ICOUNTER(VM, pages_1g), |
| STATS_DESC_ICOUNTER(VM, nx_lpage_splits), |
| STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), |
| STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) |
| }; |
| |
| const struct kvm_stats_header kvm_vm_stats_header = { |
| .name_size = KVM_STATS_NAME_SIZE, |
| .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), |
| .id_offset = sizeof(struct kvm_stats_header), |
| .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, |
| .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + |
| sizeof(kvm_vm_stats_desc), |
| }; |
| |
| const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { |
| KVM_GENERIC_VCPU_STATS(), |
| STATS_DESC_COUNTER(VCPU, pf_taken), |
| STATS_DESC_COUNTER(VCPU, pf_fixed), |
| STATS_DESC_COUNTER(VCPU, pf_emulate), |
| STATS_DESC_COUNTER(VCPU, pf_spurious), |
| STATS_DESC_COUNTER(VCPU, pf_fast), |
| STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created), |
| STATS_DESC_COUNTER(VCPU, pf_guest), |
| STATS_DESC_COUNTER(VCPU, tlb_flush), |
| STATS_DESC_COUNTER(VCPU, invlpg), |
| STATS_DESC_COUNTER(VCPU, exits), |
| STATS_DESC_COUNTER(VCPU, io_exits), |
| STATS_DESC_COUNTER(VCPU, mmio_exits), |
| STATS_DESC_COUNTER(VCPU, signal_exits), |
| STATS_DESC_COUNTER(VCPU, irq_window_exits), |
| STATS_DESC_COUNTER(VCPU, nmi_window_exits), |
| STATS_DESC_COUNTER(VCPU, l1d_flush), |
| STATS_DESC_COUNTER(VCPU, halt_exits), |
| STATS_DESC_COUNTER(VCPU, request_irq_exits), |
| STATS_DESC_COUNTER(VCPU, irq_exits), |
| STATS_DESC_COUNTER(VCPU, host_state_reload), |
| STATS_DESC_COUNTER(VCPU, fpu_reload), |
| STATS_DESC_COUNTER(VCPU, insn_emulation), |
| STATS_DESC_COUNTER(VCPU, insn_emulation_fail), |
| STATS_DESC_COUNTER(VCPU, hypercalls), |
| STATS_DESC_COUNTER(VCPU, irq_injections), |
| STATS_DESC_COUNTER(VCPU, nmi_injections), |
| STATS_DESC_COUNTER(VCPU, req_event), |
| STATS_DESC_COUNTER(VCPU, nested_run), |
| STATS_DESC_COUNTER(VCPU, directed_yield_attempted), |
| STATS_DESC_COUNTER(VCPU, directed_yield_successful), |
| STATS_DESC_COUNTER(VCPU, preemption_reported), |
| STATS_DESC_COUNTER(VCPU, preemption_other), |
| STATS_DESC_IBOOLEAN(VCPU, guest_mode), |
| STATS_DESC_COUNTER(VCPU, notify_window_exits), |
| }; |
| |
| const struct kvm_stats_header kvm_vcpu_stats_header = { |
| .name_size = KVM_STATS_NAME_SIZE, |
| .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), |
| .id_offset = sizeof(struct kvm_stats_header), |
| .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, |
| .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + |
| sizeof(kvm_vcpu_stats_desc), |
| }; |
| |
| u64 __read_mostly host_xcr0; |
| |
| static struct kmem_cache *x86_emulator_cache; |
| |
| /* |
| * When called, it means the previous get/set msr reached an invalid msr. |
| * Return true if we want to ignore/silent this failed msr access. |
| */ |
| static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) |
| { |
| const char *op = write ? "wrmsr" : "rdmsr"; |
| |
| if (ignore_msrs) { |
| if (report_ignored_msrs) |
| kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", |
| op, msr, data); |
| /* Mask the error */ |
| return true; |
| } else { |
| kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", |
| op, msr, data); |
| return false; |
| } |
| } |
| |
| static struct kmem_cache *kvm_alloc_emulator_cache(void) |
| { |
| unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); |
| unsigned int size = sizeof(struct x86_emulate_ctxt); |
| |
| return kmem_cache_create_usercopy("x86_emulator", size, |
| __alignof__(struct x86_emulate_ctxt), |
| SLAB_ACCOUNT, useroffset, |
| size - useroffset, NULL); |
| } |
| |
| static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); |
| |
| static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) |
| { |
| int i; |
| for (i = 0; i < ASYNC_PF_PER_VCPU; i++) |
| vcpu->arch.apf.gfns[i] = ~0; |
| } |
| |
| static void kvm_on_user_return(struct user_return_notifier *urn) |
| { |
| unsigned slot; |
| struct kvm_user_return_msrs *msrs |
| = container_of(urn, struct kvm_user_return_msrs, urn); |
| struct kvm_user_return_msr_values *values; |
| unsigned long flags; |
| |
| /* |
| * Disabling irqs at this point since the following code could be |
| * interrupted and executed through kvm_arch_hardware_disable() |
| */ |
| local_irq_save(flags); |
| if (msrs->registered) { |
| msrs->registered = false; |
| user_return_notifier_unregister(urn); |
| } |
| local_irq_restore(flags); |
| for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { |
| values = &msrs->values[slot]; |
| if (values->host != values->curr) { |
| wrmsrl(kvm_uret_msrs_list[slot], values->host); |
| values->curr = values->host; |
| } |
| } |
| } |
| |
| static int kvm_probe_user_return_msr(u32 msr) |
| { |
| u64 val; |
| int ret; |
| |
| preempt_disable(); |
| ret = rdmsrl_safe(msr, &val); |
| if (ret) |
| goto out; |
| ret = wrmsrl_safe(msr, val); |
| out: |
| preempt_enable(); |
| return ret; |
| } |
| |
| int kvm_add_user_return_msr(u32 msr) |
| { |
| BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS); |
| |
| if (kvm_probe_user_return_msr(msr)) |
| return -1; |
| |
| kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; |
| return kvm_nr_uret_msrs++; |
| } |
| EXPORT_SYMBOL_GPL(kvm_add_user_return_msr); |
| |
| int kvm_find_user_return_msr(u32 msr) |
| { |
| int i; |
| |
| for (i = 0; i < kvm_nr_uret_msrs; ++i) { |
| if (kvm_uret_msrs_list[i] == msr) |
| return i; |
| } |
| return -1; |
| } |
| EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); |
| |
| static void kvm_user_return_msr_cpu_online(void) |
| { |
| unsigned int cpu = smp_processor_id(); |
| struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); |
| u64 value; |
| int i; |
| |
| for (i = 0; i < kvm_nr_uret_msrs; ++i) { |
| rdmsrl_safe(kvm_uret_msrs_list[i], &value); |
| msrs->values[i].host = value; |
| msrs->values[i].curr = value; |
| } |
| } |
| |
| int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) |
| { |
| unsigned int cpu = smp_processor_id(); |
| struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); |
| int err; |
| |
| value = (value & mask) | (msrs->values[slot].host & ~mask); |
| if (value == msrs->values[slot].curr) |
| return 0; |
| err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); |
| if (err) |
| return 1; |
| |
| msrs->values[slot].curr = value; |
| if (!msrs->registered) { |
| msrs->urn.on_user_return = kvm_on_user_return; |
| user_return_notifier_register(&msrs->urn); |
| msrs->registered = true; |
| } |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); |
| |
| static void drop_user_return_notifiers(void) |
| { |
| unsigned int cpu = smp_processor_id(); |
| struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); |
| |
| if (msrs->registered) |
| kvm_on_user_return(&msrs->urn); |
| } |
| |
| u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) |
| { |
| return vcpu->arch.apic_base; |
| } |
| |
| enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) |
| { |
| return kvm_apic_mode(kvm_get_apic_base(vcpu)); |
| } |
| EXPORT_SYMBOL_GPL(kvm_get_apic_mode); |
| |
| int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
| { |
| enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); |
| enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); |
| u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | |
| (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); |
| |
| if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) |
| return 1; |
| if (!msr_info->host_initiated) { |
| if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) |
| return 1; |
| if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) |
| return 1; |
| } |
| |
| kvm_lapic_set_base(vcpu, msr_info->data); |
| kvm_recalculate_apic_map(vcpu->kvm); |
| return 0; |
| } |
| |
| /* |
| * Handle a fault on a hardware virtualization (VMX or SVM) instruction. |
| * |
| * Hardware virtualization extension instructions may fault if a reboot turns |
| * off virtualization while processes are running. Usually after catching the |
| * fault we just panic; during reboot instead the instruction is ignored. |
| */ |
| noinstr void kvm_spurious_fault(void) |
| { |
| /* Fault while not rebooting. We want the trace. */ |
| BUG_ON(!kvm_rebooting); |
| } |
| EXPORT_SYMBOL_GPL(kvm_spurious_fault); |
| |
| #define EXCPT_BENIGN 0 |
| #define EXCPT_CONTRIBUTORY 1 |
| #define EXCPT_PF 2 |
| |
| static int exception_class(int vector) |
| { |
| switch (vector) { |
| case PF_VECTOR: |
| return EXCPT_PF; |
| case DE_VECTOR: |
| case TS_VECTOR: |
| case NP_VECTOR: |
| case SS_VECTOR: |
| case GP_VECTOR: |
| return EXCPT_CONTRIBUTORY; |
| default: |
| break; |
| } |
| return EXCPT_BENIGN; |
| } |
| |
| #define EXCPT_FAULT 0 |
| #define EXCPT_TRAP 1 |
| #define EXCPT_ABORT 2 |
| #define EXCPT_INTERRUPT 3 |
| #define EXCPT_DB 4 |
| |
| static int exception_type(int vector) |
| { |
| unsigned int mask; |
| |
| if (WARN_ON(vector > 31 || vector == NMI_VECTOR)) |
| return EXCPT_INTERRUPT; |
| |
| mask = 1 << vector; |
| |
| /* |
| * #DBs can be trap-like or fault-like, the caller must check other CPU |
| * state, e.g. DR6, to determine whether a #DB is a trap or fault. |
| */ |
| if (mask & (1 << DB_VECTOR)) |
| return EXCPT_DB; |
| |
| if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR))) |
| return EXCPT_TRAP; |
| |
| if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) |
| return EXCPT_ABORT; |
| |
| /* Reserved exceptions will result in fault */ |
| return EXCPT_FAULT; |
| } |
| |
| void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, |
| struct kvm_queued_exception *ex) |
| { |
| if (!ex->has_payload) |
| return; |
| |
| switch (ex->vector) { |
| case DB_VECTOR: |
| /* |
| * "Certain debug exceptions may clear bit 0-3. The |
| * remaining contents of the DR6 register are never |
| * cleared by the processor". |
| */ |
| vcpu->arch.dr6 &= ~DR_TRAP_BITS; |
| /* |
| * In order to reflect the #DB exception payload in guest |
| * dr6, three components need to be considered: active low |
| * bit, FIXED_1 bits and active high bits (e.g. DR6_BD, |
| * DR6_BS and DR6_BT) |
| * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits. |
| * In the target guest dr6: |
| * FIXED_1 bits should always be set. |
| * Active low bits should be cleared if 1-setting in payload. |
| * Active high bits should be set if 1-setting in payload. |
| * |
| * Note, the payload is compatible with the pending debug |
| * exceptions/exit qualification under VMX, that active_low bits |
| * are active high in payload. |
| * So they need to be flipped for DR6. |
| */ |
| vcpu->arch.dr6 |= DR6_ACTIVE_LOW; |
| vcpu->arch.dr6 |= ex->payload; |
| vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW; |
| |
| /* |
| * The #DB payload is defined as compatible with the 'pending |
| * debug exceptions' field under VMX, not DR6. While bit 12 is |
| * defined in the 'pending debug exceptions' field (enabled |
| * breakpoint), it is reserved and must be zero in DR6. |
| */ |
| vcpu->arch.dr6 &= ~BIT(12); |
| break; |
| case PF_VECTOR: |
| vcpu->arch.cr2 = ex->payload; |
| break; |
| } |
| |
| ex->has_payload = false; |
| ex->payload = 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); |
| |
| static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector, |
| bool has_error_code, u32 error_code, |
| bool has_payload, unsigned long payload) |
| { |
| struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; |
| |
| ex->vector = vector; |
| ex->injected = false; |
| ex->pending = true; |
| ex->has_error_code = has_error_code; |
| ex->error_code = error_code; |
| ex->has_payload = has_payload; |
| ex->payload = payload; |
| } |
| |
| /* Forcibly leave the nested mode in cases like a vCPU reset */ |
| static void kvm_leave_nested(struct kvm_vcpu *vcpu) |
| { |
| kvm_x86_ops.nested_ops->leave_nested(vcpu); |
| } |
| |
| static void kvm_multiple_exception(struct kvm_vcpu *vcpu, |
| unsigned nr, bool has_error, u32 error_code, |
| bool has_payload, unsigned long payload, bool reinject) |
| { |
| u32 prev_nr; |
| int class1, class2; |
| |
| kvm_make_request(KVM_REQ_EVENT, vcpu); |
| |
| /* |
| * If the exception is destined for L2 and isn't being reinjected, |
| * morph it to a VM-Exit if L1 wants to intercept the exception. A |
| * previously injected exception is not checked because it was checked |
| * when it was original queued, and re-checking is incorrect if _L1_ |
| * injected the exception, in which case it's exempt from interception. |
| */ |
| if (!reinject && is_guest_mode(vcpu) && |
| kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) { |
| kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code, |
| has_payload, payload); |
| return; |
| } |
| |
| if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { |
| queue: |
| if (reinject) { |
| /* |
| * On VM-Entry, an exception can be pending if and only |
| * if event injection was blocked by nested_run_pending. |
| * In that case, however, vcpu_enter_guest() requests an |
| * immediate exit, and the guest shouldn't proceed far |
| * enough to need reinjection. |
| */ |
| WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); |
| vcpu->arch.exception.injected = true; |
| if (WARN_ON_ONCE(has_payload)) { |
| /* |
| * A reinjected event has already |
| * delivered its payload. |
| */ |
| has_payload = false; |
| payload = 0; |
| } |
| } else { |
| vcpu->arch.exception.pending = true; |
| vcpu->arch.exception.injected = false; |
| } |
| vcpu->arch.exception.has_error_code = has_error; |
| vcpu->arch.exception.vector = nr; |
| vcpu->arch.exception.error_code = error_code; |
| vcpu->arch.exception.has_payload = has_payload; |
| vcpu->arch.exception.payload = payload; |
| if (!is_guest_mode(vcpu)) |
| kvm_deliver_exception_payload(vcpu, |
| &vcpu->arch.exception); |
| return; |
| } |
| |
| /* to check exception */ |
| prev_nr = vcpu->arch.exception.vector; |
| if (prev_nr == DF_VECTOR) { |
| /* triple fault -> shutdown */ |
| kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
| return; |
| } |
| class1 = exception_class(prev_nr); |
| class2 = exception_class(nr); |
| if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) || |
| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { |
| /* |
| * Synthesize #DF. Clear the previously injected or pending |
| * exception so as not to incorrectly trigger shutdown. |
| */ |
| vcpu->arch.exception.injected = false; |
| vcpu->arch.exception.pending = false; |
| |
| kvm_queue_exception_e(vcpu, DF_VECTOR, 0); |
| } else { |
| /* replace previous exception with a new one in a hope |
| that instruction re-execution will regenerate lost |
| exception */ |
| goto queue; |
| } |
| } |
| |
| void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) |
| { |
| kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); |
| } |
| EXPORT_SYMBOL_GPL(kvm_queue_exception); |
| |
| void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) |
| { |
| kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); |
| } |
| EXPORT_SYMBOL_GPL(kvm_requeue_exception); |
| |
| void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, |
| unsigned long payload) |
| { |
| kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); |
| } |
| EXPORT_SYMBOL_GPL(kvm_queue_exception_p); |
| |
| static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, |
| u32 error_code, unsigned long payload) |
| { |
| kvm_multiple_exception(vcpu, nr, true, error_code, |
| true, payload, false); |
| } |
| |
/*
 * Complete an instruction: if @err is zero the instruction is skipped and
 * the result of kvm_skip_emulated_instruction() is returned, otherwise
 * #GP(0) is injected and 1 is returned.
 */
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (!err)
		return kvm_skip_emulated_instruction(vcpu);

	kvm_inject_gp(vcpu, 0);
	return 1;
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
| |
| static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err) |
| { |
| if (err) { |
| kvm_inject_gp(vcpu, 0); |
| return 1; |
| } |
| |
| return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP | |
| EMULTYPE_COMPLETE_USER_EXIT); |
| } |
| |
| void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) |
| { |
| ++vcpu->stat.pf_guest; |
| |
| /* |
| * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of |
| * whether or not L1 wants to intercept "regular" #PF. |
| */ |
| if (is_guest_mode(vcpu) && fault->async_page_fault) |
| kvm_queue_exception_vmexit(vcpu, PF_VECTOR, |
| true, fault->error_code, |
| true, fault->address); |
| else |
| kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, |
| fault->address); |
| } |
| |
| void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, |
| struct x86_exception *fault) |
| { |
| struct kvm_mmu *fault_mmu; |
| WARN_ON_ONCE(fault->vector != PF_VECTOR); |
| |
| fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu : |
| vcpu->arch.walk_mmu; |
| |
| /* |
| * Invalidate the TLB entry for the faulting address, if it exists, |
| * else the access will fault indefinitely (and to emulate hardware). |
| */ |
| if ((fault->error_code & PFERR_PRESENT_MASK) && |
| !(fault->error_code & PFERR_RSVD_MASK)) |
| kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address, |
| KVM_MMU_ROOT_CURRENT); |
| |
| fault_mmu->inject_page_fault(vcpu, fault); |
| } |
| EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); |
| |
| void kvm_inject_nmi(struct kvm_vcpu *vcpu) |
| { |
| atomic_inc(&vcpu->arch.nmi_queued); |
| kvm_make_request(KVM_REQ_NMI, vcpu); |
| } |
| |
| void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) |
| { |
| kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); |
| } |
| EXPORT_SYMBOL_GPL(kvm_queue_exception_e); |
| |
| void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) |
| { |
| kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); |
| } |
| EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); |
| |
| /* |
| * Checks if cpl <= required_cpl; if true, return true. Otherwise queue |
| * a #GP and return false. |
| */ |
| bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) |
| { |
| if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl) |
| return true; |
| kvm_queue_exception_e(vcpu, GP_VECTOR, 0); |
| return false; |
| } |
| |
| bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) |
| { |
| if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE)) |
| return true; |
| |
| kvm_queue_exception(vcpu, UD_VECTOR); |
| return false; |
| } |
| EXPORT_SYMBOL_GPL(kvm_require_dr); |
| |
| static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) |
| { |
| return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); |
| } |
| |
| /* |
| * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. |
| */ |
| int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) |
| { |
| struct kvm_mmu *mmu = vcpu->arch.walk_mmu; |
| gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; |
| gpa_t real_gpa; |
| int i; |
| int ret; |
| u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; |
| |
| /* |
| * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated |
| * to an L1 GPA. |
| */ |
| real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), |
| PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); |
| if (real_gpa == INVALID_GPA) |
| return 0; |
| |
| /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ |
| ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte, |
| cr3 & GENMASK(11, 5), sizeof(pdpte)); |
| if (ret < 0) |
| return 0; |
| |
| for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { |
| if ((pdpte[i] & PT_PRESENT_MASK) && |
| (pdpte[i] & pdptr_rsvd_bits(vcpu))) { |
| return 0; |
| } |
| } |
| |
| /* |
| * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled. |
| * Shadow page roots need to be reconstructed instead. |
| */ |
| if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs))) |
| kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT); |
| |
| memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); |
| kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); |
| kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); |
| vcpu->arch.pdptrs_from_userspace = false; |
| |
| return 1; |
| } |
| EXPORT_SYMBOL_GPL(load_pdptrs); |
| |
| static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
| { |
| #ifdef CONFIG_X86_64 |
| if (cr0 & 0xffffffff00000000UL) |
| return false; |
| #endif |
| |
| if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) |
| return false; |
| |
| if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) |
| return false; |
| |
| return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0); |
| } |
| |
| void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) |
| { |
| /* |
| * CR0.WP is incorporated into the MMU role, but only for non-nested, |
| * indirect shadow MMUs. If paging is disabled, no updates are needed |
| * as there are no permission bits to emulate. If TDP is enabled, the |
| * MMU's metadata needs to be updated, e.g. so that emulating guest |
| * translations does the right thing, but there's no need to unload the |
| * root as CR0.WP doesn't affect SPTEs. |
| */ |
| if ((cr0 ^ old_cr0) == X86_CR0_WP) { |
| if (!(cr0 & X86_CR0_PG)) |
| return; |
| |
| if (tdp_enabled) { |
| kvm_init_mmu(vcpu); |
| return; |
| } |
| } |
| |
| if ((cr0 ^ old_cr0) & X86_CR0_PG) { |
| kvm_clear_async_pf_completion_queue(vcpu); |
| kvm_async_pf_hash_reset(vcpu); |
| |
| /* |
| * Clearing CR0.PG is defined to flush the TLB from the guest's |
| * perspective. |
| */ |
| if (!(cr0 & X86_CR0_PG)) |
| kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); |
| } |
| |
| if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) |
| kvm_mmu_reset_context(vcpu); |
| |
| if (((cr0 ^ old_cr0) & X86_CR0_CD) && |
| kvm_mmu_honors_guest_mtrrs(vcpu->kvm) && |
| !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) |
| kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); |
| } |
| EXPORT_SYMBOL_GPL(kvm_post_set_cr0); |
| |
| int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
| { |
| unsigned long old_cr0 = kvm_read_cr0(vcpu); |
| |
| if (!kvm_is_valid_cr0(vcpu, cr0)) |
| return 1; |
| |
| cr0 |= X86_CR0_ET; |
| |
| /* Write to CR0 reserved bits are ignored, even on Intel. */ |
| cr0 &= ~CR0_RESERVED_BITS; |
| |
| #ifdef CONFIG_X86_64 |
| if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && |
| (cr0 & X86_CR0_PG)) { |
| int cs_db, cs_l; |
| |
| if (!is_pae(vcpu)) |
| return 1; |
| static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); |
| if (cs_l) |
| return 1; |
| } |
| #endif |
| if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && |
| is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) && |
| !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) |
| return 1; |
| |
| if (!(cr0 & X86_CR0_PG) && |
| (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) |
| return 1; |
| |
| static_call(kvm_x86_set_cr0)(vcpu, cr0); |
| |
| kvm_post_set_cr0(vcpu, old_cr0, cr0); |
| |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_cr0); |
| |
/*
 * Emulate LMSW: bits 3:1 of CR0 are replaced by the corresponding bits of
 * @msw, and bit 0 can be set but not cleared.  The result is written through
 * kvm_set_cr0(), whose return value is deliberately ignored.
 */
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	const unsigned long preserved = kvm_read_cr0_bits(vcpu, ~0x0eul);
	const unsigned long from_msw = msw & 0x0f;

	(void)kvm_set_cr0(vcpu, preserved | from_msw);
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
| |
| void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) |
| { |
| if (vcpu->arch.guest_state_protected) |
| return; |
| |
| if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { |
| |
| if (vcpu->arch.xcr0 != host_xcr0) |
| xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); |
| |
| if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && |
| vcpu->arch.ia32_xss != host_xss) |
| wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); |
| } |
| |
| if (cpu_feature_enabled(X86_FEATURE_PKU) && |
| vcpu->arch.pkru != vcpu->arch.host_pkru && |
| ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || |
| kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) |
| write_pkru(vcpu->arch.pkru); |
| } |
| EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); |
| |
| void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) |
| { |
| if (vcpu->arch.guest_state_protected) |
| return; |
| |
| if (cpu_feature_enabled(X86_FEATURE_PKU) && |
| ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || |
| kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) { |
| vcpu->arch.pkru = rdpkru(); |
| if (vcpu->arch.pkru != vcpu->arch.host_pkru) |
| write_pkru(vcpu->arch.host_pkru); |
| } |
| |
| if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { |
| |
| if (vcpu->arch.xcr0 != host_xcr0) |
| xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); |
| |
| if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && |
| vcpu->arch.ia32_xss != host_xss) |
| wrmsrl(MSR_IA32_XSS, host_xss); |
| } |
| |
| } |
| EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); |
| |
#ifdef CONFIG_X86_64
/* Return the guest-supported XCR0 bits that are in XFEATURE_MASK_USER_DYNAMIC. */
static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
{
	u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0;

	return supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
}
#endif
| |
| static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) |
| { |
| u64 xcr0 = xcr; |
| u64 old_xcr0 = vcpu->arch.xcr0; |
| u64 valid_bits; |
| |
| /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ |
| if (index != XCR_XFEATURE_ENABLED_MASK) |
| return 1; |
| if (!(xcr0 & XFEATURE_MASK_FP)) |
| return 1; |
| if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE)) |
| return 1; |
| |
| /* |
| * Do not allow the guest to set bits that we do not support |
| * saving. However, xcr0 bit 0 is always set, even if the |
| * emulated CPU does not support XSAVE (see kvm_vcpu_reset()). |
| */ |
| valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; |
| if (xcr0 & ~valid_bits) |
| return 1; |
| |
| if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) != |
| (!(xcr0 & XFEATURE_MASK_BNDCSR))) |
| return 1; |
| |
| if (xcr0 & XFEATURE_MASK_AVX512) { |
| if (!(xcr0 & XFEATURE_MASK_YMM)) |
| return 1; |
| if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) |
| return 1; |
| } |
| |
| if ((xcr0 & XFEATURE_MASK_XTILE) && |
| ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE)) |
| return 1; |
| |
| vcpu->arch.xcr0 = xcr0; |
| |
| if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) |
| kvm_update_cpuid_runtime(vcpu); |
| return 0; |
| } |
| |
| int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) |
| { |
| /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */ |
| if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || |
| __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) { |
| kvm_inject_gp(vcpu, 0); |
| return 1; |
| } |
| |
| return kvm_skip_emulated_instruction(vcpu); |
| } |
| EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); |
| |
| bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
| { |
| if (cr4 & cr4_reserved_bits) |
| return false; |
| |
| if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) |
| return false; |
| |
| return true; |
| } |
| EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); |
| |
| static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
| { |
| return __kvm_is_valid_cr4(vcpu, cr4) && |
| static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); |
| } |
| |
| void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) |
| { |
| if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) |
| kvm_mmu_reset_context(vcpu); |
| |
| /* |
| * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB |
| * according to the SDM; however, stale prev_roots could be reused |
| * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we |
| * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST |
| * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed, |
| * so fall through. |
| */ |
| if (!tdp_enabled && |
| (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) |
| kvm_mmu_unload(vcpu); |
| |
| /* |
| * The TLB has to be flushed for all PCIDs if any of the following |
| * (architecturally required) changes happen: |
| * - CR4.PCIDE is changed from 1 to 0 |
| * - CR4.PGE is toggled |
| * |
| * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT. |
| */ |
| if (((cr4 ^ old_cr4) & X86_CR4_PGE) || |
| (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) |
| kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); |
| |
| /* |
| * The TLB has to be flushed for the current PCID if any of the |
| * following (architecturally required) changes happen: |
| * - CR4.SMEP is changed from 0 to 1 |
| * - CR4.PAE is toggled |
| */ |
| else if (((cr4 ^ old_cr4) & X86_CR4_PAE) || |
| ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP))) |
| kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); |
| |
| } |
| EXPORT_SYMBOL_GPL(kvm_post_set_cr4); |
| |
| int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
| { |
| unsigned long old_cr4 = kvm_read_cr4(vcpu); |
| |
| if (!kvm_is_valid_cr4(vcpu, cr4)) |
| return 1; |
| |
| if (is_long_mode(vcpu)) { |
| if (!(cr4 & X86_CR4_PAE)) |
| return 1; |
| if ((cr4 ^ old_cr4) & X86_CR4_LA57) |
| return 1; |
| } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) |
| && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS) |
| && !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) |
| return 1; |
| |
| if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { |
| /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ |
| if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) |
| return 1; |
| } |
| |
| static_call(kvm_x86_set_cr4)(vcpu, cr4); |
| |
| kvm_post_set_cr4(vcpu, old_cr4, cr4); |
| |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_cr4); |
| |
| static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) |
| { |
| struct kvm_mmu *mmu = vcpu->arch.mmu; |
| unsigned long roots_to_free = 0; |
| int i; |
| |
| /* |
| * MOV CR3 and INVPCID are usually not intercepted when using TDP, but |
| * this is reachable when running EPT=1 and unrestricted_guest=0, and |
| * also via the emulator. KVM's TDP page tables are not in the scope of |
| * the invalidation, but the guest's TLB entries need to be flushed as |
| * the CPU may have cached entries in its TLB for the target PCID. |
| */ |
| if (unlikely(tdp_enabled)) { |
| kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); |
| return; |
| } |
| |
| /* |
| * If neither the current CR3 nor any of the prev_roots use the given |
| * PCID, then nothing needs to be done here because a resync will |
| * happen anyway before switching to any other CR3. |
| */ |
| if (kvm_get_active_pcid(vcpu) == pcid) { |
| kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); |
| kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); |
| } |
| |
| /* |
| * If PCID is disabled, there is no need to free prev_roots even if the |
| * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB |
| * with PCIDE=0. |
| */ |
| if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) |
| return; |
| |
| for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) |
| if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid) |
| roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); |
| |
| kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); |
| } |
| |
| int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
| { |
| bool skip_tlb_flush = false; |
| unsigned long pcid = 0; |
| #ifdef CONFIG_X86_64 |
| if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) { |
| skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; |
| cr3 &= ~X86_CR3_PCID_NOFLUSH; |
| pcid = cr3 & X86_CR3_PCID_MASK; |
| } |
| #endif |
| |
| /* PDPTRs are always reloaded for PAE paging. */ |
| if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu)) |
| goto handle_tlb_flush; |
| |
| /* |
| * Do not condition the GPA check on long mode, this helper is used to |
| * stuff CR3, e.g. for RSM emulation, and there is no guarantee that |
| * the current vCPU mode is accurate. |
| */ |
| if (!kvm_vcpu_is_legal_cr3(vcpu, cr3)) |
| return 1; |
| |
| if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3)) |
| return 1; |
| |
| if (cr3 != kvm_read_cr3(vcpu)) |
| kvm_mmu_new_pgd(vcpu, cr3); |
| |
| vcpu->arch.cr3 = cr3; |
| kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); |
| /* Do not call post_set_cr3, we do not get here for confidential guests. */ |
| |
| handle_tlb_flush: |
| /* |
| * A load of CR3 that flushes the TLB flushes only the current PCID, |
| * even if PCID is disabled, in which case PCID=0 is flushed. It's a |
| * moot point in the end because _disabling_ PCID will flush all PCIDs, |
| * and it's impossible to use a non-zero PCID when PCID is disabled, |
| * i.e. only PCID=0 can be relevant. |
| */ |
| if (!skip_tlb_flush) |
| kvm_invalidate_pcid(vcpu, pcid); |
| |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_cr3); |
| |
| int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) |
| { |
| if (cr8 & CR8_RESERVED_BITS) |
| return 1; |
| if (lapic_in_kernel(vcpu)) |
| kvm_lapic_set_tpr(vcpu, cr8); |
| else |
| vcpu->arch.cr8 = cr8; |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_cr8); |
| |
| unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) |
| { |
| if (lapic_in_kernel(vcpu)) |
| return kvm_lapic_get_cr8(vcpu); |
| else |
| return vcpu->arch.cr8; |
| } |
| EXPORT_SYMBOL_GPL(kvm_get_cr8); |
| |
| static void kvm_update_dr0123(struct kvm_vcpu *vcpu) |
| { |
| int i; |
| |
| if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { |
| for (i = 0; i < KVM_NR_DB_REGS; i++) |
| vcpu->arch.eff_db[i] = vcpu->arch.db[i]; |
| } |
| } |
| |
| void kvm_update_dr7(struct kvm_vcpu *vcpu) |
| { |
| unsigned long dr7; |
| |
| if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) |
| dr7 = vcpu->arch.guest_debug_dr7; |
| else |
| dr7 = vcpu->arch.dr7; |
| static_call(kvm_x86_set_dr7)(vcpu, dr7); |
| vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; |
| if (dr7 & DR7_BP_EN_MASK) |
| vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; |
| } |
| EXPORT_SYMBOL_GPL(kvm_update_dr7); |
| |
| static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) |
| { |
| u64 fixed = DR6_FIXED_1; |
| |
| if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) |
| fixed |= DR6_RTM; |
| |
| if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) |
| fixed |= DR6_BUS_LOCK; |
| return fixed; |
| } |
| |
| int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) |
| { |
| size_t size = ARRAY_SIZE(vcpu->arch.db); |
| |
| switch (dr) { |
| case 0 ... 3: |
| vcpu->arch.db[array_index_nospec(dr, size)] = val; |
| if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) |
| vcpu->arch.eff_db[dr] = val; |
| break; |
| case 4: |
| case 6: |
| if (!kvm_dr6_valid(val)) |
| return 1; /* #GP */ |
| vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); |
| break; |
| case 5: |
| default: /* 7 */ |
| if (!kvm_dr7_valid(val)) |
| return 1; /* #GP */ |
| vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; |
| kvm_update_dr7(vcpu); |
| break; |
| } |
| |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_dr); |
| |
| unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr) |
| { |
| size_t size = ARRAY_SIZE(vcpu->arch.db); |
| |
| switch (dr) { |
| case 0 ... 3: |
| return vcpu->arch.db[array_index_nospec(dr, size)]; |
| case 4: |
| case 6: |
| return vcpu->arch.dr6; |
| case 5: |
| default: /* 7 */ |
| return vcpu->arch.dr7; |
| } |
| } |
| EXPORT_SYMBOL_GPL(kvm_get_dr); |
| |
| int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) |
| { |
| u32 ecx = kvm_rcx_read(vcpu); |
| u64 data; |
| |
| if (kvm_pmu_rdpmc(vcpu, ecx, &data)) { |
| kvm_inject_gp(vcpu, 0); |
| return 1; |
| } |
| |
| kvm_rax_write(vcpu, (u32)data); |
| kvm_rdx_write(vcpu, data >> 32); |
| return kvm_skip_emulated_instruction(vcpu); |
| } |
| EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); |
| |
/*
 * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features) track
 * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
 * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.  msrs_to_save holds MSRs that
 * require host support, i.e. should be probed via RDMSR.  emulated_msrs holds
 * MSRs that KVM emulates without strictly requiring host support.
 * msr_based_features holds MSRs that enumerate features, i.e. are effectively
 * CPUID leaves.  Note, msr_based_features isn't mutually exclusive with
 * msrs_to_save and emulated_msrs.
 */

| static const u32 msrs_to_save_base[] = { |
| MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, |
| MSR_STAR, |
| #ifdef CONFIG_X86_64 |
| MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, |
| #endif |
| MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, |
| MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, |
| MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, |
| MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, |
| MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, |
| MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, |
| MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, |
| MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, |
| MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, |
| MSR_IA32_UMWAIT_CONTROL, |
| |
| MSR_IA32_XFD, MSR_IA32_XFD_ERR, |
| }; |
| |
| static const u32 msrs_to_save_pmu[] = { |
| MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, |
| MSR_ARCH_PERFMON_FIXED_CTR0 + 2, |
| MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, |
| MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, |
| MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, |
| |
| /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */ |
| MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, |
| MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, |
| MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, |
| MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, |
| MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, |
| MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, |
| MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, |
| MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, |
| |
| MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, |
| MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, |
| |
| /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */ |
| MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, |
| MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, |
| MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, |
| MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, |
| |
| MSR_AMD64_PERF_CNTR_GLOBAL_CTL, |
| MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, |
| MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, |
| }; |
| |
| static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + |
| ARRAY_SIZE(msrs_to_save_pmu)]; |
| static unsigned num_msrs_to_save; |
| |
| static const u32 emulated_msrs_all[] = { |
| MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, |
| MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, |
| |
| #ifdef CONFIG_KVM_HYPERV |
| HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, |
| HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, |
| HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, |
| HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, |
| HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, |
| HV_X64_MSR_RESET, |
| HV_X64_MSR_VP_INDEX, |
| HV_X64_MSR_VP_RUNTIME, |
| HV_X64_MSR_SCONTROL, |
| HV_X64_MSR_STIMER0_CONFIG, |
| HV_X64_MSR_VP_ASSIST_PAGE, |
| HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, |
| HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, |
| HV_X64_MSR_SYNDBG_OPTIONS, |
| HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, |
| HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, |
| HV_X64_MSR_SYNDBG_PENDING_BUFFER, |
| #endif |
| |
| MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, |
| MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, |
| |
| MSR_IA32_TSC_ADJUST, |
| MSR_IA32_TSC_DEADLINE, |
| MSR_IA32_ARCH_CAPABILITIES, |
| MSR_IA32_PERF_CAPABILITIES, |
| MSR_IA32_MISC_ENABLE, |
| MSR_IA32_MCG_STATUS, |
| MSR_IA32_MCG_CTL, |
| MSR_IA32_MCG_EXT_CTL, |
| MSR_IA32_SMBASE, |
| MSR_SMI_COUNT, |
| MSR_PLATFORM_INFO, |
| MSR_MISC_FEATURES_ENABLES, |
| MSR_AMD64_VIRT_SPEC_CTRL, |
| MSR_AMD64_TSC_RATIO, |
| MSR_IA32_POWER_CTL, |
| MSR_IA32_UCODE_REV, |
| |
| /* |
| * KVM always supports the "true" VMX control MSRs, even if the host |
| * does not. The VMX MSRs as a whole are considered "emulated" as KVM |
| * doesn't strictly require them to exist in the host (ignoring that |
| * KVM would refuse to load in the first place if the core set of MSRs |
| * aren't supported). |
| */ |
| MSR_IA32_VMX_BASIC, |
| MSR_IA32_VMX_TRUE_PINBASED_CTLS, |
| MSR_IA32_VMX_TRUE_PROCBASED_CTLS, |
| MSR_IA32_VMX_TRUE_EXIT_CTLS, |
| MSR_IA32_VMX_TRUE_ENTRY_CTLS, |
| MSR_IA32_VMX_MISC, |
| MSR_IA32_VMX_CR0_FIXED0, |
| MSR_IA32_VMX_CR4_FIXED0, |
| MSR_IA32_VMX_VMCS_ENUM, |
| MSR_IA32_VMX_PROCBASED_CTLS2, |
| MSR_IA32_VMX_EPT_VPID_CAP, |
| MSR_IA32_VMX_VMFUNC, |
| |
| MSR_K7_HWCR, |
| MSR_KVM_POLL_CONTROL, |
| }; |
| |
| static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; |
| static unsigned num_emulated_msrs; |
| |
| /* |
| * List of MSRs that control the existence of MSR-based features, i.e. MSRs |
| * that are effectively CPUID leafs. VMX MSRs are also included in the set of |
| * feature MSRs, but are handled separately to allow expedited lookups. |
| */ |
| static const u32 msr_based_features_all_except_vmx[] = { |
| MSR_AMD64_DE_CFG, |
| MSR_IA32_UCODE_REV, |
| MSR_IA32_ARCH_CAPABILITIES, |
| MSR_IA32_PERF_CAPABILITIES, |
| }; |
| |
| static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + |
| (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; |
| static unsigned int num_msr_based_features; |
| |
| /* |
| * All feature MSRs except uCode revID, which tracks the currently loaded uCode |
| * patch, are immutable once the vCPU model is defined. |
| */ |
| static bool kvm_is_immutable_feature_msr(u32 msr) |
| { |
| int i; |
| |
| if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) |
| return true; |
| |
| for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { |
| if (msr == msr_based_features_all_except_vmx[i]) |
| return msr != MSR_IA32_UCODE_REV; |
| } |
| |
| return false; |
| } |
| |
/*
 * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
 * does not yet virtualize. These include:
 *   10 - MISC_PACKAGE_CTRLS
 *   11 - ENERGY_FILTERING_CTL
 *   12 - DOITM
 *   18 - FB_CLEAR_CTRL
 *   21 - XAPIC_DISABLE_STATUS
 *   23 - OVERCLOCKING_STATUS
 *
 * KVM_SUPPORTED_ARCH_CAP is the mask of host IA32_ARCH_CAPABILITIES bits that
 * kvm_get_arch_capabilities() passes through.
 */
#define KVM_SUPPORTED_ARCH_CAP \
	(ARCH_CAP_RDCL_NO | \
	 ARCH_CAP_IBRS_ALL | \
	 ARCH_CAP_RSBA | \
	 ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | \
	 ARCH_CAP_SSB_NO | \
	 ARCH_CAP_MDS_NO | \
	 ARCH_CAP_PSCHANGE_MC_NO | \
	 ARCH_CAP_TSX_CTRL_MSR | \
	 ARCH_CAP_TAA_NO | \
	 ARCH_CAP_SBDR_SSDP_NO | \
	 ARCH_CAP_FBSDP_NO | \
	 ARCH_CAP_PSDP_NO | \
	 ARCH_CAP_FB_CLEAR | \
	 ARCH_CAP_RRSBA | \
	 ARCH_CAP_PBRSB_NO | \
	 ARCH_CAP_GDS_NO | \
	 ARCH_CAP_RFDS_NO | \
	 ARCH_CAP_RFDS_CLEAR | \
	 ARCH_CAP_BHI_NO)
| |
| static u64 kvm_get_arch_capabilities(void) |
| { |
| u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP; |
| |
| /* |
| * If nx_huge_pages is enabled, KVM's shadow paging will ensure that |
| * the nested hypervisor runs with NX huge pages. If it is not, |
| * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other |
| * L1 guests, so it need not worry about its own (L2) guests. |
| */ |
| data |= ARCH_CAP_PSCHANGE_MC_NO; |
| |
| /* |
| * If we're doing cache flushes (either "always" or "cond") |
| * we will do one whenever the guest does a vmlaunch/vmresume. |
| * If an outer hypervisor is doing the cache flush for us |
| * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that |
| * capability to the guest too, and if EPT is disabled we're not |
| * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will |
| * require a nested hypervisor to do a flush of its own. |
| */ |
| if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) |
| data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; |
| |
| if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) |
| data |= ARCH_CAP_RDCL_NO; |
| if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) |
| data |= ARCH_CAP_SSB_NO; |
| if (!boot_cpu_has_bug(X86_BUG_MDS)) |
| data |= ARCH_CAP_MDS_NO; |
| if (!boot_cpu_has_bug(X86_BUG_RFDS)) |
| data |= ARCH_CAP_RFDS_NO; |
| |
| if (!boot_cpu_has(X86_FEATURE_RTM)) { |
| /* |
| * If RTM=0 because the kernel has disabled TSX, the host might |
| * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 |
| * and therefore knows that there cannot be TAA) but keep |
| * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, |
| * and we want to allow migrating those guests to tsx=off hosts. |
| */ |
| data &= ~ARCH_CAP_TAA_NO; |
| } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { |
| data |= ARCH_CAP_TAA_NO; |
| } else { |
| /* |
| * Nothing to do here; we emulate TSX_CTRL if present on the |
| * host so the guest can choose between disabling TSX or |
| * using VERW to clear CPU buffers. |
| */ |
| } |
| |
| if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) |
| data |= ARCH_CAP_GDS_NO; |
| |
| return data; |
| } |
| |
| static int kvm_get_msr_feature(struct kvm_msr_entry *msr) |
| { |
| switch (msr->index) { |
| case MSR_IA32_ARCH_CAPABILITIES: |
| msr->data = kvm_get_arch_capabilities(); |
| break; |
| case MSR_IA32_PERF_CAPABILITIES: |
| msr->data = kvm_caps.supported_perf_cap; |
| break; |
| case MSR_IA32_UCODE_REV: |
| rdmsrl_safe(msr->index, &msr->data); |
| break; |
| default: |
| return static_call(kvm_x86_get_msr_feature)(msr); |
| } |
| return 0; |
| } |
| |
| static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) |
| { |
| struct kvm_msr_entry msr; |
| int r; |
| |
| /* Unconditionally clear the output for simplicity */ |
| msr.data = 0; |
| msr.index = index; |
| r = kvm_get_msr_feature(&msr); |
| |
| if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false)) |
| r = 0; |
| |
| *data = msr.data; |
| |
| return r; |
| } |
| |
| static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) |
| { |
| if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS)) |
| return false; |
| |
| if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) |
| return false; |
| |
| if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) |
| return false; |
| |
| if (efer & (EFER_LME | EFER_LMA) && |
| !guest_cpuid_has(vcpu, X86_FEATURE_LM)) |
| return false; |
| |
| if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) |
| return false; |
| |
| return true; |
| |
| } |
| bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) |
| { |
| if (efer & efer_reserved_bits) |
| return false; |
| |
| return __kvm_valid_efer(vcpu, efer); |
| } |
| EXPORT_SYMBOL_GPL(kvm_valid_efer); |
| |
| static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
| { |
| u64 old_efer = vcpu->arch.efer; |
| u64 efer = msr_info->data; |
| int r; |
| |
| if (efer & efer_reserved_bits) |
| return 1; |
| |
| if (!msr_info->host_initiated) { |
| if (!__kvm_valid_efer(vcpu, efer)) |
| return 1; |
| |
| if (is_paging(vcpu) && |
| (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) |
| return 1; |
| } |
| |
| efer &= ~EFER_LMA; |
| efer |= vcpu->arch.efer & EFER_LMA; |
| |
| r = static_call(kvm_x86_set_efer)(vcpu, efer); |
| if (r) { |
| WARN_ON(r > 0); |
| return r; |
| } |
| |
| if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS) |
| kvm_mmu_reset_context(vcpu); |
| |
| if (!static_cpu_has(X86_FEATURE_XSAVES) && |
| (efer & EFER_SVME)) |
| kvm_hv_xsaves_xsavec_maybe_warn(vcpu); |
| |
| return 0; |
| } |
| |
| void kvm_enable_efer_bits(u64 mask) |
| { |
| efer_reserved_bits &= ~mask; |
| } |
| EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); |
| |
| bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) |
| { |
| struct kvm_x86_msr_filter *msr_filter; |
| struct msr_bitmap_range *ranges; |
| struct kvm *kvm = vcpu->kvm; |
| bool allowed; |
| int idx; |
| u32 i; |
| |
| /* x2APIC MSRs do not support filtering. */ |
| if (index >= 0x800 && index <= 0x8ff) |
| return true; |
| |
| idx = srcu_read_lock(&kvm->srcu); |
| |
| msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); |
| if (!msr_filter) { |
| allowed = true; |
| goto out; |
| } |
| |
| allowed = msr_filter->default_allow; |
| ranges = msr_filter->ranges; |
| |
| for (i = 0; i < msr_filter->count; i++) { |
| u32 start = ranges[i].base; |
| u32 end = start + ranges[i].nmsrs; |
| u32 flags = ranges[i].flags; |
| unsigned long *bitmap = ranges[i].bitmap; |
| |
| if ((index >= start) && (index < end) && (flags & type)) { |
| allowed = test_bit(index - start, bitmap); |
| break; |
| } |
| } |
| |
| out: |
| srcu_read_unlock(&kvm->srcu, idx); |
| |
| return allowed; |
| } |
| EXPORT_SYMBOL_GPL(kvm_msr_allowed); |
| |
| /* |
| * Write @data into the MSR specified by @index. Select MSR specific fault |
| * checks are bypassed if @host_initiated is %true. |
| * Returns 0 on success, non-0 otherwise. |
| * Assumes vcpu_load() was already called. |
| */ |
| static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, |
| bool host_initiated) |
| { |
| struct msr_data msr; |
| |
| switch (index) { |
| case MSR_FS_BASE: |
| case MSR_GS_BASE: |
| case MSR_KERNEL_GS_BASE: |
| case MSR_CSTAR: |
| case MSR_LSTAR: |
| if (is_noncanonical_address(data, vcpu)) |
| return 1; |
| break; |
| case MSR_IA32_SYSENTER_EIP: |
| case MSR_IA32_SYSENTER_ESP: |
| /* |
| * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if |
| * non-canonical address is written on Intel but not on |
| * AMD (which ignores the top 32-bits, because it does |
| * not implement 64-bit SYSENTER). |
| * |
| * 64-bit code should hence be able to write a non-canonical |
| * value on AMD. Making the address canonical ensures that |
| * vmentry does not fail on Intel after writing a non-canonical |
| * value, and that something deterministic happens if the guest |
| * invokes 64-bit SYSENTER. |
| */ |
| data = __canonical_address(data, vcpu_virt_addr_bits(vcpu)); |
| break; |
| case MSR_TSC_AUX: |
| if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) |
| return 1; |
| |
| if (!host_initiated && |
| !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && |
| !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) |
| return 1; |
| |
| /* |
| * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has |
| * incomplete and conflicting architectural behavior. Current |
| * AMD CPUs completely ignore bits 63:32, i.e. they aren't |
| * reserved and always read as zeros. Enforce Intel's reserved |
| * bits check if and only if the guest CPU is Intel, and clear |
| * the bits in all other cases. This ensures cross-vendor |
| * migration will provide consistent behavior for the guest. |
| */ |
| if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0) |
| return 1; |
| |
| data = (u32)data; |
| break; |
| } |
| |
| msr.data = data; |
| msr.index = index; |
| msr.host_initiated = host_initiated; |
| |
| return static_call(kvm_x86_set_msr)(vcpu, &msr); |
| } |
| |
| static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, |
| u32 index, u64 data, bool host_initiated) |
| { |
| int ret = __kvm_set_msr(vcpu, index, data, host_initiated); |
| |
| if (ret == KVM_MSR_RET_INVALID) |
| if (kvm_msr_ignored_check(index, data, true)) |
| ret = 0; |
| |
| return ret; |
| } |
| |
| /* |
| * Read the MSR specified by @index into @data. Select MSR specific fault |
| * checks are bypassed if @host_initiated is %true. |
| * Returns 0 on success, non-0 otherwise. |
| * Assumes vcpu_load() was already called. |
| */ |
| int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, |
| bool host_initiated) |
| { |
| struct msr_data msr; |
| int ret; |
| |
| switch (index) { |
| case MSR_TSC_AUX: |
| if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) |
| return 1; |
| |
| if (!host_initiated && |
| !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && |
| !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) |
| return 1; |
| break; |
| } |
| |
| msr.index = index; |
| msr.host_initiated = host_initiated; |
| |
| ret = static_call(kvm_x86_get_msr)(vcpu, &msr); |
| if (!ret) |
| *data = msr.data; |
| return ret; |
| } |
| |
| static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, |
| u32 index, u64 *data, bool host_initiated) |
| { |
| int ret = __kvm_get_msr(vcpu, index, data, host_initiated); |
| |
| if (ret == KVM_MSR_RET_INVALID) { |
| /* Unconditionally clear *data for simplicity */ |
| *data = 0; |
| if (kvm_msr_ignored_check(index, 0, false)) |
| ret = 0; |
| } |
| |
| return ret; |
| } |
| |
| static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) |
| { |
| if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) |
| return KVM_MSR_RET_FILTERED; |
| return kvm_get_msr_ignored_check(vcpu, index, data, false); |
| } |
| |
| static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) |
| { |
| if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) |
| return KVM_MSR_RET_FILTERED; |
| return kvm_set_msr_ignored_check(vcpu, index, data, false); |
| } |
| |
| int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) |
| { |
| return kvm_get_msr_ignored_check(vcpu, index, data, false); |
| } |
| EXPORT_SYMBOL_GPL(kvm_get_msr); |
| |
| int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) |
| { |
| return kvm_set_msr_ignored_check(vcpu, index, data, false); |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_msr); |
| |
| static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) |
| { |
| if (!vcpu->run->msr.error) { |
| kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); |
| kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); |
| } |
| } |
| |
| static int complete_emulated_msr_access(struct kvm_vcpu *vcpu) |
| { |
| return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error); |
| } |
| |
/* Finish an RDMSR on the emulated-instruction path: load EDX:EAX, then complete. */
static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
{
	int ret;

	complete_userspace_rdmsr(vcpu);
	ret = complete_emulated_msr_access(vcpu);

	return ret;
}
| |
| static int complete_fast_msr_access(struct kvm_vcpu *vcpu) |
| { |
| return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); |
| } |
| |
/* Finish an RDMSR on the fast path: load EDX:EAX, then complete. */
static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
{
	int ret;

	complete_userspace_rdmsr(vcpu);
	ret = complete_fast_msr_access(vcpu);

	return ret;
}
| |
| static u64 kvm_msr_reason(int r) |
| { |
| switch (r) { |
| case KVM_MSR_RET_INVALID: |
| return KVM_MSR_EXIT_REASON_UNKNOWN; |
| case KVM_MSR_RET_FILTERED: |
| return KVM_MSR_EXIT_REASON_FILTER; |
| default: |
| return KVM_MSR_EXIT_REASON_INVAL; |
| } |
| } |
| |
| static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, |
| u32 exit_reason, u64 data, |
| int (*completion)(struct kvm_vcpu *vcpu), |
| int r) |
| { |
| u64 msr_reason = kvm_msr_reason(r); |
| |
| /* Check if the user wanted to know about this MSR fault */ |
| if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) |
| return 0; |
| |
| vcpu->run->exit_reason = exit_reason; |
| vcpu->run->msr.error = 0; |
| memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); |
| vcpu->run->msr.reason = msr_reason; |
| vcpu->run->msr.index = index; |
| vcpu->run->msr.data = data; |
| vcpu->arch.complete_userspace_io = completion; |
| |
| return 1; |
| } |
| |
| int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) |
| { |
| u32 ecx = kvm_rcx_read(vcpu); |
| u64 data; |
| int r; |
| |
| r = kvm_get_msr_with_filter(vcpu, ecx, &data); |
| |
| if (!r) { |
| trace_kvm_msr_read(ecx, data); |
| |
| kvm_rax_write(vcpu, data & -1u); |
| kvm_rdx_write(vcpu, (data >> 32) & -1u); |
| } else { |
| /* MSR read failed? See if we should ask user space */ |
| if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0, |
| complete_fast_rdmsr, r)) |
| return 0; |
| trace_kvm_msr_read_ex(ecx); |
| } |
| |
| return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); |
| } |
| EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); |
| |
| int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) |
| { |
| u32 ecx = kvm_rcx_read(vcpu); |
| u64 data = kvm_read_edx_eax(vcpu); |
| int r; |
| |
| r = kvm_set_msr_with_filter(vcpu, ecx, data); |
| |
| if (!r) { |
| trace_kvm_msr_write(ecx, data); |
| } else { |
| /* MSR write failed? See if we should ask user space */ |
| if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data, |
| complete_fast_msr_access, r)) |
| return 0; |
| /* Signal all other negative errors to userspace */ |
| if (r < 0) |
| return r; |
| trace_kvm_msr_write_ex(ecx, data); |
| } |
| |
| return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); |
| } |
| EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); |
| |
/* Emulate an instruction as a NOP by merely skipping over it. */
int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
{
	int ret = kvm_skip_emulated_instruction(vcpu);

	return ret;
}
| |
/* INVD is not performed for the guest; it is handled exactly like a NOP. */
int kvm_emulate_invd(struct kvm_vcpu *vcpu)
{
	int ret = kvm_emulate_as_nop(vcpu);

	return ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_invd);
| |
| int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) |
| { |
| kvm_queue_exception(vcpu, UD_VECTOR); |
| return 1; |
| } |
| EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); |
| |
| |
| static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) |
| { |
| if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && |
| !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) |
| return kvm_handle_invalid_op(vcpu); |
| |
| pr_warn_once("%s instruction emulated as NOP!\n", insn); |
| return kvm_emulate_as_nop(vcpu); |
| } |
/* MWAIT entry point; defers to the shared MONITOR/MWAIT handler. */
int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
{
	const char *insn = "MWAIT";

	return kvm_emulate_monitor_mwait(vcpu, insn);
}
EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
| |
/* MONITOR entry point; defers to the shared MONITOR/MWAIT handler. */
int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
{
	const char *insn = "MONITOR";

	return kvm_emulate_monitor_mwait(vcpu, insn);
}
EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
| |
| static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) |
| { |
| xfer_to_guest_mode_prepare(); |
| return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || |
| xfer_to_guest_mode_work_pending(); |
| } |
| |
| /* |
| * The fast path for frequent and performance sensitive wrmsr emulation, |
| * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces |
| * the latency of virtual IPI by avoiding the expensive bits of transitioning |
| * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the |
| * other cases which must be called after interrupts are enabled on the host. |
| */ |
| static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) |
| { |
| if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) |
| return 1; |
| |
| if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && |
| ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && |
| ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && |
| ((u32)(data >> 32) != X2APIC_BROADCAST)) |
| return kvm_x2apic_icr_write(vcpu->arch.apic, data); |
| |
| return 1; |
| } |
| |
| static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) |
| { |
| if (!kvm_can_use_hv_timer(vcpu)) |
| return 1; |
| |
| kvm_set_lapic_tscdeadline_msr(vcpu, data); |
| return 0; |
| } |
| |
| fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) |
| { |
| u32 msr = kvm_rcx_read(vcpu); |
| u64 data; |
| fastpath_t ret = EXIT_FASTPATH_NONE; |
| |
| kvm_vcpu_srcu_read_lock(vcpu); |
| |
| switch (msr) { |
| case APIC_BASE_MSR + (APIC_ICR >> 4): |
| data = kvm_read_edx_eax(vcpu); |
| if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { |
| kvm_skip_emulated_instruction(vcpu); |
| ret = EXIT_FASTPATH_EXIT_HANDLED; |
| } |
| break; |
| case MSR_IA32_TSC_DEADLINE: |
| data = kvm_read_edx_eax(vcpu); |
| if (!handle_fastpath_set_tscdeadline(vcpu, data)) { |
| kvm_skip_emulated_instruction(vcpu); |
| ret = EXIT_FASTPATH_REENTER_GUEST; |
| } |
| break; |
| default: |
| break; |
| } |
| |
| if (ret != EXIT_FASTPATH_NONE) |
| trace_kvm_msr_write(msr, data); |
| |
| kvm_vcpu_srcu_read_unlock(vcpu); |
| |
| return ret; |
| } |
| EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); |
| |
| /* |
| * Adapt set_msr() to msr_io()'s calling convention |
| */ |
| static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) |
| { |
| return kvm_get_msr_ignored_check(vcpu, index, data, true); |
| } |
| |
| static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) |
| { |
| u64 val; |
| |
| /* |
| * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does |
| * not support modifying the guest vCPU model on the fly, e.g. changing |
| * the nVMX capabilities while L2 is running is nonsensical. Ignore |
| * writes of the same value, e.g. to allow userspace to blindly stuff |
| * all MSRs when emulating RESET. |
| */ |
| if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) { |
| if (do_get_msr(vcpu, index, &val) || *data != val) |
| return -EINVAL; |
| |
| return 0; |
| } |
| |
| return kvm_set_msr_ignored_check(vcpu, index, *data, true); |
| } |
| |
| #ifdef CONFIG_X86_64 |
| struct pvclock_clock { |
| int vclock_mode; |
| u64 cycle_last; |
| u64 mask; |
| u32 mult; |
| u32 shift; |
| u64 base_cycles; |
| u64 offset; |
| }; |
| |
| struct pvclock_gtod_data { |
| seqcount_t seq; |
| |
| struct pvclock_clock clock; /* extract of a clocksource struct */ |
| struct pvclock_clock raw_clock; /* extract of a clocksource struct */ |
| |
| ktime_t offs_boot; |
| u64 wall_time_sec; |
| }; |
| |
| static struct pvclock_gtod_data pvclock_gtod_data; |
| |
| static void update_pvclock_gtod(struct timekeeper *tk) |
| { |
| struct pvclock_gtod_data *vdata = &pvclock_gtod_data; |
| |
| write_seqcount_begin(&vdata->seq); |
| |
| /* copy pvclock gtod data */ |
| vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode; |
| vdata->clock.cycle_last = tk->tkr_mono.cycle_last; |
| vdata->clock.mask = tk->tkr_mono.mask; |
| vdata->clock.mult = tk->tkr_mono.mult; |
| vdata->clock.shift = tk->tkr_mono.shift; |
| vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; |
| vdata->clock.offset = tk->tkr_mono.base; |
| |
| vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode; |
| vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; |
| vdata->raw_clock.mask = tk->tkr_raw.mask; |
| vdata->raw_clock.mult = tk->tkr_raw.mult; |
| vdata->raw_clock.shift = tk->tkr_raw.shift; |
| vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec; |
| vdata->raw_clock.offset = tk->tkr_raw.base; |
| |
| vdata->wall_time_sec = tk->xtime_sec; |
| |
| vdata->offs_boot = tk->offs_boot; |
| |
| write_seqcount_end(&vdata->seq); |
| } |
| |
| static s64 get_kvmclock_base_ns(void) |
| { |
| /* Count up from boot time, but with the frequency of the raw clock. */ |
| return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot)); |
| } |
| #else |
| static s64 get_kvmclock_base_ns(void) |
| { |
| /* Master clock not used, so we can just use CLOCK_BOOTTIME. */ |
| return ktime_get_boottime_ns(); |
| } |
| #endif |
| |
| static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) |
| { |
| int version; |
| int r; |
| struct pvclock_wall_clock wc; |
| u32 wc_sec_hi; |
| u64 wall_nsec; |
| |
| if (!wall_clock) |
| return; |
| |
| r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); |
| if (r) |
| return; |
| |
| if (version & 1) |
| ++version; /* first time write, random junk */ |
| |
| ++version; |
| |
| if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) |
| return; |
| |
| wall_nsec = kvm_get_wall_clock_epoch(kvm); |
| |
| wc.nsec = do_div(wall_nsec, NSEC_PER_SEC); |
| wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ |
| wc.version = version; |
| |
| kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); |
| |
| if (sec_hi_ofs) { |
| wc_sec_hi = wall_nsec >> 32; |
| kvm_write_guest(kvm, wall_clock + sec_hi_ofs, |
| &wc_sec_hi, sizeof(wc_sec_hi)); |
| } |
| |
| version++; |
| kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); |
| } |
| |
| static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, |
| bool old_msr, bool host_initiated) |
| { |
| struct kvm_arch *ka = &vcpu->kvm->arch; |
| |
| if (vcpu->vcpu_id == 0 && !host_initiated) { |
| if (ka->boot_vcpu_runs_old_kvmclock != old_msr) |
| kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); |
| |
| ka->boot_vcpu_runs_old_kvmclock = old_msr; |
| } |
| |
| vcpu->arch.time = system_time; |
| kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); |
| |
| /* we verify if the enable bit is set... */ |
| if (system_time & 1) |
| kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL, |
| sizeof(struct pvclock_vcpu_time_info)); |
| else |
| kvm_gpc_deactivate(&vcpu->arch.pv_time); |
| |
| return; |
| } |
| |
| static uint32_t div_frac(uint32_t dividend, uint32_t divisor) |
| { |
| do_shl32_div32(dividend, divisor); |
| return dividend; |
| } |
| |
| static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, |
| s8 *pshift, u32 *pmultiplier) |
| { |
| uint64_t scaled64; |
| int32_t shift = 0; |
| uint64_t tps64; |
| uint32_t tps32; |
| |
| tps64 = base_hz; |
| scaled64 = scaled_hz; |
| while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { |
| tps64 >>= 1; |
| shift--; |
| } |
| |
| tps32 = (uint32_t)tps64; |
| while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { |
| if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) |
| scaled64 >>= 1; |
| else |
| tps32 <<= 1; |
| shift++; |
| } |
| |
| *pshift = shift; |
| *pmultiplier = div_frac(scaled64, tps32); |
| } |
| |
| #ifdef CONFIG_X86_64 |
| static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); |
| #endif |
| |
| static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); |
| static unsigned long max_tsc_khz; |
| |
| static u32 adjust_tsc_khz(u32 khz, s32 ppm) |
| { |
| u64 v = (u64)khz * (1000000 + ppm); |
| do_div(v, 1000000); |
| return v; |
| } |
| |
| static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier); |
| |
| static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) |
| { |
| u64 ratio; |
| |
| /* Guest TSC same frequency as host TSC? */ |
| if (!scale) { |
| kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); |
| return 0; |
| } |
| |
| /* TSC scaling supported? */ |
| if (!kvm_caps.has_tsc_control) { |
| if (user_tsc_khz > tsc_khz) { |
| vcpu->arch.tsc_catchup = 1; |
| vcpu->arch.tsc_always_catchup = 1; |
| return 0; |
| } else { |
| pr_warn_ratelimited("user requested TSC rate below hardware speed\n"); |
| return -1; |
| } |
| } |
| |
| /* TSC scaling required - calculate ratio */ |
| ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits, |
| user_tsc_khz, tsc_khz); |
| |
| if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) { |
| pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", |
| user_tsc_khz); |
| return -1; |
| } |
| |
| kvm_vcpu_write_tsc_multiplier(vcpu, ratio); |
| return 0; |
| } |
| |
| static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) |
| { |
| u32 thresh_lo, thresh_hi; |
| int use_scaling = 0; |
| |
| /* tsc_khz can be zero if TSC calibration fails */ |
| if (user_tsc_khz == 0) { |
| /* set tsc_scaling_ratio to a safe value */ |
| kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); |
| return -1; |
| } |
| |
| /* Compute a scale to convert nanoseconds in TSC cycles */ |
| kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC, |
| &vcpu->arch.virtual_tsc_shift, |
| &vcpu->arch.virtual_tsc_mult); |
| vcpu->arch.virtual_tsc_khz = user_tsc_khz; |
| |
| /* |
| * Compute the variation in TSC rate which is acceptable |
| * within the range of tolerance and decide if the |
| * rate being applied is within that bounds of the hardware |
| * rate. If so, no scaling or compensation need be done. |
| */ |
| thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); |
| thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); |
| if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { |
| pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n", |
| user_tsc_khz, thresh_lo, thresh_hi); |
| use_scaling = 1; |
| } |
| return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); |
| } |
| |
| static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) |
| { |
| u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, |
| vcpu->arch.virtual_tsc_mult, |
| vcpu->arch.virtual_tsc_shift); |
| tsc += vcpu->arch.this_tsc_write; |
| return tsc; |
| } |
| |
| #ifdef CONFIG_X86_64 |
| static inline bool gtod_is_based_on_tsc(int mode) |
| { |
| return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; |
| } |
| #endif |
| |
| static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu, bool new_generation) |
| { |
| #ifdef CONFIG_X86_64 |
| struct kvm_arch *ka = &vcpu->kvm->arch; |
| struct pvclock_gtod_data *gtod = &pvclock_gtod_data; |
| |
| /* |
| * To use the masterclock, the host clocksource must be based on TSC |
| * and all vCPUs must have matching TSCs. Note, the count for matching |
| * vCPUs doesn't include the reference vCPU, hence "+1". |
| */ |
| bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 == |
| atomic_read(&vcpu->kvm->online_vcpus)) && |
| gtod_is_based_on_tsc(gtod->clock.vclock_mode); |
| |
| /* |
| * Request a masterclock update if the masterclock needs to be toggled |
| * on/off, or when starting a new generation and the masterclock is |
| * enabled (compute_guest_tsc() requires the masterclock snapshot to be |
| * taken _after_ the new generation is created). |
| */ |
| if ((ka->use_master_clock && new_generation) || |
| (ka->use_master_clock != use_master_clock)) |
| kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); |
| |
| trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, |
| atomic_read(&vcpu->kvm->online_vcpus), |
| ka->use_master_clock, gtod->clock.vclock_mode); |
| #endif |
| } |
| |
| /* |
| * Multiply tsc by a fixed point number represented by ratio. |
| * |
| * The most significant 64-N bits (mult) of ratio represent the |
| * integral part of the fixed point number; the remaining N bits |
| * (frac) represent the fractional part, ie. ratio represents a fixed |
| * point number (mult + frac * 2^(-N)). |
| * |
| * N equals to kvm_caps.tsc_scaling_ratio_frac_bits. |
| */ |
| static inline u64 __scale_tsc(u64 ratio, u64 tsc) |
| { |
| return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits); |
| } |
| |
| u64 kvm_scale_tsc(u64 tsc, u64 ratio) |
| { |
| u64 _tsc = tsc; |
| |
| if (ratio != kvm_caps.default_tsc_scaling_ratio) |
| _tsc = __scale_tsc(ratio, tsc); |
| |
| return _tsc; |
| } |
| |
| static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) |
| { |
| u64 tsc; |
| |
| tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); |
| |
| return target_tsc - tsc; |
| } |
| |
| u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) |
| { |
| return vcpu->arch.l1_tsc_offset + |
| kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio); |
| } |
| EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); |
| |
| u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) |
| { |
| u64 nested_offset; |
| |
| if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio) |
| nested_offset = l1_offset; |
| else |
| nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, |
| kvm_caps.tsc_scaling_ratio_frac_bits); |
| |
| nested_offset += l2_offset; |
| return nested_offset; |
| } |
| EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); |
| |
| u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) |
| { |
| if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio) |
| return mul_u64_u64_shr(l1_multiplier, l2_multiplier, |
| kvm_caps.tsc_scaling_ratio_frac_bits); |
| |
| return l1_multiplier; |
| } |
| EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); |
| |
| static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) |
| { |
| trace_kvm_write_tsc_offset(vcpu->vcpu_id, |
| vcpu->arch.l1_tsc_offset, |
| l1_offset); |
| |
| vcpu->arch.l1_tsc_offset = l1_offset; |
| |
| /* |
| * If we are here because L1 chose not to trap WRMSR to TSC then |
| * according to the spec this should set L1's TSC (as opposed to |
| * setting L1's offset for L2). |
| */ |
| if (is_guest_mode(vcpu)) |
| vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( |
| l1_offset, |
| static_call(kvm_x86_get_l2_tsc_offset)(vcpu), |
| static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); |
| else |
| vcpu->arch.tsc_offset = l1_offset; |
| |
| static_call(kvm_x86_write_tsc_offset)(vcpu); |
| } |
| |
| static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) |
| { |
| vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier; |
| |
| /* Userspace is changing the multiplier while L2 is active */ |
| if (is_guest_mode(vcpu)) |
| vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( |
| l1_multiplier, |
| static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); |
| else |
| vcpu->arch.tsc_scaling_ratio = l1_multiplier; |
| |
| if (kvm_caps.has_tsc_control) |
| static_call(kvm_x86_write_tsc_multiplier)(vcpu); |
| } |
| |
| static inline bool kvm_check_tsc_unstable(void) |
| { |
| #ifdef CONFIG_X86_64 |
| /* |
| * TSC is marked unstable when we're running on Hyper-V, |
| * 'TSC page' clocksource is good. |
| */ |
| if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK) |
| return false; |
| #endif |
| return check_tsc_unstable(); |
| } |
| |
| /* |
| * Infers attempts to synchronize the guest's tsc from host writes. Sets the |
| * offset for the vcpu and tracks the TSC matching generation that the vcpu |
| * participates in. |
| */ |
| static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, |
| u64 ns, bool matched) |
| { |
| struct kvm *kvm = vcpu->kvm; |
| |
| lockdep_assert_held(&kvm->arch.tsc_write_lock); |
| |
| /* |
| * We also track th most recent recorded KHZ, write and time to |
| * allow the matching interval to be extended at each write. |
| */ |
| kvm->arch.last_tsc_nsec = ns; |
| kvm->arch.last_tsc_write = tsc; |
| kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; |
| kvm->arch.last_tsc_offset = offset; |
| |
| vcpu->arch.last_guest_tsc = tsc; |
| |
| kvm_vcpu_write_tsc_offset(vcpu, offset); |
| |
| if (!matched) { |
| /* |
| * We split periods of matched TSC writes into generations. |
| * For each generation, we track the original measured |
| * nanosecond time, offset, and write, so if TSCs are in |
| * sync, we can match exact offset, and if not, we can match |
| * exact software computation in compute_guest_tsc() |
| * |
| * These values are tracked in kvm->arch.cur_xxx variables. |
| */ |
| kvm->arch.cur_tsc_generation++; |
| kvm->arch.cur_tsc_nsec = ns; |
| kvm->arch.cur_tsc_write = tsc; |
| kvm->arch.cur_tsc_offset = offset; |
| kvm->arch.nr_vcpus_matched_tsc = 0; |
| } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) { |
| kvm->arch.nr_vcpus_matched_tsc++; |
| } |
| |
| /* Keep track of which generation this VCPU has synchronized to */ |
| vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; |
| vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; |
| vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; |
| |
| kvm_track_tsc_matching(vcpu, !matched); |
| } |
| |
| static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) |
| { |
| u64 data = user_value ? *user_value : 0; |
| struct kvm *kvm = vcpu->kvm; |
| u64 offset, ns, elapsed; |
| unsigned long flags; |
| bool matched = false; |
| bool synchronizing = false; |
| |
| raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); |
| offset = kvm_compute_l1_tsc_offset(vcpu, data); |
| ns = get_kvmclock_base_ns(); |
| elapsed = ns - kvm->arch.last_tsc_nsec; |
| |
| if (vcpu->arch.virtual_tsc_khz) { |
| if (data == 0) { |
| /* |
| * Force synchronization when creating a vCPU, or when |
| * userspace explicitly writes a zero value. |
| */ |
| synchronizing = true; |
| } else if (kvm->arch.user_set_tsc) { |
| u64 tsc_exp = kvm->arch.last_tsc_write + |
| nsec_to_cycles(vcpu, elapsed); |
| u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; |
| /* |
| * Here lies UAPI baggage: when a user-initiated TSC write has |
| * a small delta (1 second) of virtual cycle time against the |
| * previously set vCPU, we assume that they were intended to be |
| * in sync and the delta was only due to the racy nature of the |
| * legacy API. |
| * |
| * This trick falls down when restoring a guest which genuinely |
| * has been running for less time than the 1 second of imprecision |
| * which we allow for in the legacy API. In this case, the first |
| * value written by userspace (on any vCPU) should not be subject |
| * to this 'correction' to make it sync up with values that only |
| * come from the kernel's default vCPU creation. Make the 1-second |
| * slop hack only trigger if the user_set_tsc flag is already set. |
| */ |
| synchronizing = data < tsc_exp + tsc_hz && |
| data + tsc_hz > tsc_exp; |
| } |
| } |
| |
| if (user_value) |
| kvm->arch.user_set_tsc = true; |
| |
| /* |
| * For a reliable TSC, we can match TSC offsets, and for an unstable |
| * TSC, we add elapsed time in this computation. We could let the |
| * compensation code attempt to catch up if we fall behind, but |
| * it's better to try to match offsets from the beginning. |
| */ |
| if (synchronizing && |
| vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { |
| if (!kvm_check_tsc_unstable()) { |
| offset = kvm->arch.cur_tsc_offset; |
| } else { |
| u64 delta = nsec_to_cycles(vcpu, elapsed); |
| data += delta; |
| offset = kvm_compute_l1_tsc_offset(vcpu, data); |
| } |
| matched = true; |
| } |
| |
| __kvm_synchronize_tsc(vcpu, offset, data, ns, matched); |
| raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); |
| } |
| |
| static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, |
| s64 adjustment) |
| { |
| u64 tsc_offset = vcpu->arch.l1_tsc_offset; |
| kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); |
| } |
| |
| static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) |
| { |
| if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio) |
| WARN_ON(adjustment < 0); |
| adjustment = kvm_scale_tsc((u64) adjustment, |
| vcpu->arch.l1_tsc_scaling_ratio); |
| adjust_tsc_offset_guest(vcpu, adjustment); |
| } |
| |
| #ifdef CONFIG_X86_64 |
| |
| static u64 read_tsc(void) |
| { |
| u64 ret = (u64)rdtsc_ordered(); |
| u64 last = pvclock_gtod_data.clock.cycle_last; |
| |
| if (likely(ret >= last)) |
| return ret; |
| |
| /* |
| * GCC likes to generate cmov here, but this branch is extremely |
| * predictable (it's just a function of time and the likely is |
| * very likely) and there's a data dependence, so force GCC |
| * to generate a branch instead. I don't barrier() because |
| * we don't actually need a barrier, and if this function |
| * ever gets inlined it will generate worse code. |
| */ |
| asm volatile (""); |
| return last; |
| } |
| |
| static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, |
| int *mode) |
| { |
| u64 tsc_pg_val; |
| long v; |
| |
| switch (clock->vclock_mode) { |
| case VDSO_CLOCKMODE_HVCLOCK: |
| if (hv_read_tsc_page_tsc(hv_get_tsc_page(), |
| tsc_timestamp, &tsc_pg_val)) { |
| /* TSC page valid */ |
| *mode = VDSO_CLOCKMODE_HVCLOCK; |
| v = (tsc_pg_val - clock->cycle_last) & |
| clock->mask; |
| } else { |
| /* TSC page invalid */ |
| *mode = VDSO_CLOCKMODE_NONE; |
| } |
| break; |
| case VDSO_CLOCKMODE_TSC: |
| *mode = VDSO_CLOCKMODE_TSC; |
| *tsc_timestamp = read_tsc(); |
| v = (*tsc_timestamp - clock->cycle_last) & |
| clock->mask; |
| break; |
| default: |
| *mode = VDSO_CLOCKMODE_NONE; |
| } |
| |
| if (*mode == VDSO_CLOCKMODE_NONE) |
| *tsc_timestamp = v = 0; |
| |
| return v * clock->mult; |
| } |
| |
| /* |
| * As with get_kvmclock_base_ns(), this counts from boot time, at the |
| * frequency of CLOCK_MONOTONIC_RAW (hence adding gtos->offs_boot). |
| */ |
| static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp) |
| { |
| struct pvclock_gtod_data *gtod = &pvclock_gtod_data; |
| unsigned long seq; |
| int mode; |
| u64 ns; |
| |
| do { |
| seq = read_seqcount_begin(>od->seq); |
| ns = gtod->raw_clock.base_cycles; |
| ns += vgettsc(>od->raw_clock, tsc_timestamp, &mode); |
| ns >>= gtod->raw_clock.shift; |
| ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot)); |
| } while (unlikely(read_seqcount_retry(>od->seq, seq))); |
| *t = ns; |
| |
| return mode; |
| } |
| |
| /* |
| * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with |
| * no boot time offset. |
| */ |
| static int do_monotonic(s64 *t, u64 *tsc_timestamp) |
| { |
| struct pvclock_gtod_data *gtod = &pvclock_gtod_data; |
| unsigned long seq; |
| int mode; |
| u64 ns; |
| |
| do { |
| seq = read_seqcount_begin(>od->seq); |
| ns = gtod->clock.base_cycles; |
| ns += vgettsc(>od->clock, tsc_timestamp, &mode); |
| ns >>= gtod->clock.shift; |
| ns += ktime_to_ns(gtod->clock.offset); |
| } while (unlikely(read_seqcount_retry(>od->seq, seq))); |
| *t = ns; |
| |
| return mode; |
| } |
| |
| static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) |
| { |
| struct pvclock_gtod_data *gtod = &pvclock_gtod_data; |
| unsigned long seq; |
| int mode; |
| u64 ns; |
| |
| do { |
| seq = read_seqcount_begin(>od->seq); |
| ts->tv_sec = gtod->wall_time_sec; |
| ns = gtod->clock.base_cycles; |
| ns += vgettsc(>od->clock, tsc_timestamp, &mode); |
| ns >>= gtod->clock.shift; |
| } while (unlikely(read_seqcount_retry(>od->seq, seq))); |
| |
| ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); |
| ts->tv_nsec = ns; |
| |
| return mode; |
| } |
| |
| /* |
| * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and |
| * reports the TSC value from which it do so. Returns true if host is |
| * using TSC based clocksource. |
| */ |
| static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) |
| { |
| /* checked again under seqlock below */ |
| if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) |
| return false; |
| |
| return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns, |
| tsc_timestamp)); |
| } |
| |
| /* |
| * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did |
| * so. Returns true if host is using TSC based clocksource. |
| */ |
| bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) |
| { |
| /* checked again under seqlock below */ |
| if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) |
| return false; |
| |
| return gtod_is_based_on_tsc(do_monotonic(kernel_ns, |
| tsc_timestamp)); |
| } |
| |
| /* |
| * Calculates CLOCK_REALTIME and reports the TSC value from which it did |
| * so. Returns true if host is using TSC based clocksource. |
| * |
| * DO NOT USE this for anything related to migration. You want CLOCK_TAI |
| * for that. |
| */ |
| static bool kvm_get_walltime_and_clockread(struct timespec64 *ts, |
| u64 *tsc_timestamp) |
| { |
| /* checked again under seqlock below */ |
| if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) |
| return false; |
| |
| return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp)); |
| } |
| #endif |
| |
| /* |
| * |
| * Assuming a stable TSC across physical CPUS, and a stable TSC |
| * across virtual CPUs, the following condition is possible. |
| * Each numbered line represents an event visible to both |
| * CPUs at the next numbered event. |
| * |
| * "timespecX" represents host monotonic time. "tscX" represents |
| * RDTSC value. |
| * |
| * VCPU0 on CPU0 | VCPU1 on CPU1 |
| * |
| * 1. read timespec0,tsc0 |
| * 2. | timespec1 = timespec0 + N |
| * | tsc1 = tsc0 + M |
| * 3. transition to guest | transition to guest |
| * 4. ret0 = timespec0 + (rdtsc - tsc0) | |
| * 5. | ret1 = timespec1 + (rdtsc - tsc1) |
| * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) |
| * |
| * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: |
| * |
| * - ret0 < ret1 |
| * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) |
| * ... |
| * - 0 < N - M => M < N |
| * |
| * That is, when timespec0 != timespec1, M < N. Unfortunately that is not |
| * always the case (the difference between two distinct xtime instances |
| * might be smaller than the difference between corresponding TSC reads, |
| * when updating guest vcpus pvclock areas). |
| * |
| * To avoid that problem, do not allow visibility of distinct |
| * system_timestamp/tsc_timestamp values simultaneously: use a master |
| * copy of host monotonic time values. Update that master copy |
| * in lockstep. |
| * |
| * Rely on synchronization of host TSCs and guest TSCs for monotonicity. |
| * |
| */ |
| |
/*
 * Refresh the VM's master clock snapshot (master_kernel_ns and
 * master_cycle_now) and recompute whether the master clock may be used.
 * Caller must hold kvm->arch.tsc_write_lock.  No-op without CONFIG_X86_64.
 */
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	struct kvm_arch *ka = &kvm->arch;
	bool host_tsc_clocksource, vcpus_matched;

	lockdep_assert_held(&ka->tsc_write_lock);

	/* The matched count excludes the reference vCPU, hence the "+ 1". */
	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
			 atomic_read(&kvm->online_vcpus));

	/*
	 * If the host uses the TSC as its clock, pass the TSC through to the
	 * guest as stable.
	 */
	host_tsc_clocksource = kvm_get_time_and_clockread(
					&ka->master_kernel_ns,
					&ka->master_cycle_now);

	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
				&& !ka->backwards_tsc_observed
				&& !ka->boot_vcpu_runs_old_kvmclock;

	if (ka->use_master_clock)
		atomic_set(&kvm_guest_has_master_clock, 1);

	trace_kvm_update_master_clock(ka->use_master_clock,
				      pvclock_gtod_data.clock.vclock_mode,
				      vcpus_matched);
#endif
}
| |
| static void kvm_make_mclock_inprogress_request(struct kvm *kvm) |
| { |
| kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); |
| } |
| |
| static void __kvm_start_pvclock_update(struct kvm *kvm) |
| { |
| raw_spin_lock_irq(&kvm->arch.tsc_write_lock); |
| write_seqcount_begin(&kvm->arch.pvclock_sc); |
| } |
| |
/*
 * Begin a pvclock update that must keep vCPUs out of the guest: raise
 * KVM_REQ_MCLOCK_INPROGRESS on every vCPU first, then start the locked
 * update section.
 */
static void kvm_start_pvclock_update(struct kvm *kvm)
{
	kvm_make_mclock_inprogress_request(kvm);

	/* From here on there are no guest entries. */
	__kvm_start_pvclock_update(kvm);
}
| |
| static void kvm_end_pvclock_update(struct kvm *kvm) |
| { |
| struct kvm_arch *ka = &kvm->arch; |
| struct kvm_vcpu *vcpu; |
| unsigned long i; |
| |
| write_seqcount_end(&ka->pvclock_sc); |
| raw_spin_unlock_irq(&ka->tsc_write_lock); |
| kvm_for_each_vcpu(i, vcpu, kvm) |
| kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
| |
| /* guest entries allowed */ |
| kvm_for_each_vcpu(i, vcpu, kvm) |
| kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); |
| } |
| |
/*
 * Full masterclock refresh for @kvm: request a Hyper-V TSC page update, then
 * recompute the VM's gtod copy inside a start/end pvclock update section.
 */
static void kvm_update_masterclock(struct kvm *kvm)
{
	kvm_hv_request_tsc_page_update(kvm);

	kvm_start_pvclock_update(kvm);
	pvclock_update_vm_gtod_copy(kvm);
	kvm_end_pvclock_update(kvm);
}
| |
| /* |
| * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's |
| * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz |
| * can change during boot even if the TSC is constant, as it's possible for KVM |
| * to be loaded before TSC calibration completes. Ideally, KVM would get a |
| * notification when calibration completes, but practically speaking calibration |
| * will complete before userspace is alive enough to create VMs. |
| */ |
| static unsigned long get_cpu_tsc_khz(void) |
| { |
| if (static_cpu_has(X86_FEATURE_CONSTANT_TSC)) |
| return tsc_khz; |
| else |
| return __this_cpu_read(cpu_tsc_khz); |
| } |
| |
| /* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */ |
| static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) |
| { |
| struct kvm_arch *ka = &kvm->arch; |
| struct pvclock_vcpu_time_info hv_clock; |
| |
| /* both __this_cpu_read() and rdtsc() should be on the same cpu */ |
| get_cpu(); |
| |
| data->flags = 0; |
| if (ka->use_master_clock && |
| (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) { |
| #ifdef CONFIG_X86_64 |
| struct timespec64 ts; |
| |
| if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) { |
| data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; |
| data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC; |
| } else |
| #endif |
| data->host_tsc = rdtsc(); |
| |
| data->flags |= KVM_CLOCK_TSC_STABLE; |
| hv_clock.tsc_timestamp = ka->master_cycle_now; |
| hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; |
| kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL, |
| &hv_clock.tsc_shift, |
| &hv_clock.tsc_to_system_mul); |
| data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); |
| } else { |
| data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; |
| } |
| |
| put_cpu(); |
| } |
| |
| static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) |
| { |
| struct kvm_arch *ka = &kvm->arch; |
| unsigned seq; |
| |
| do { |
| seq = read_seqcount_begin(&ka->pvclock_sc); |
| __get_kvmclock(kvm, data); |
| } while (read_seqcount_retry(&ka->pvclock_sc, seq)); |
| } |
| |
| u64 get_kvmclock_ns(struct kvm *kvm) |
| { |
| struct kvm_clock_data data; |
| |
| get_kvmclock(kvm, &data); |
| return data.clock; |
| } |
| |
| static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, |
| struct gfn_to_pfn_cache *gpc, |
| unsigned int offset, |
| bool force_tsc_unstable) |
| { |
| struct kvm_vcpu_arch *vcpu = &v->arch; |
| struct pvclock_vcpu_time_info *guest_hv_clock; |
| unsigned long flags; |
| |
| read_lock_irqsave(&gpc->lock, flags); |
| while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { |
| read_unlock_irqrestore(&gpc->lock, flags); |
| |
| if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv
|