KVM: lapic: Track lapic timer advance per vCPU
authorSean Christopherson <sean.j.christopherson@intel.com>
Wed, 17 Apr 2019 17:15:32 +0000 (10:15 -0700)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 8 May 2019 05:22:46 +0000 (07:22 +0200)
commit 39497d7660d9866a47a2dc9055672358da57ad3d upstream.

Automatically adjusting the globally-shared timer advancement could
corrupt the timer, e.g. if multiple vCPUs are concurrently adjusting
the advancement value.  That could be partially fixed by using a local
variable for the arithmetic, but it would still be susceptible to a
race when setting timer_advance_adjust_done.

And because virtual_tsc_khz and tsc_scaling_ratio are per-vCPU, the
correct calibration for a given vCPU may not apply to all vCPUs.

Furthermore, lapic_timer_advance_ns is marked __read_mostly, which is
effectively violated when finding a stable advancement takes an extended
amount of timer.

Opportunistically change the definition of lapic_timer_advance_ns to
a u32 so that it matches the style of struct kvm_timer.  Explicitly
pass the param to kvm_create_lapic() so that it doesn't have to be
exposed to lapic.c, thus reducing the probability of unintentionally
using the global value instead of the per-vCPU value.

Cc: Liran Alon <liran.alon@oracle.com>
Cc: Wanpeng Li <wanpengli@tencent.com>
Reviewed-by: Liran Alon <liran.alon@oracle.com>
Cc: stable@vger.kernel.org
Fixes: 3b8a5df6c4dc6 ("KVM: LAPIC: Tune lapic_timer_advance_ns automatically")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h

index 34678ea56b4b4c8e3a9923a8e51377247bd97040..b6b4b44940f06c30d310077f104709acffe0984f 100644 (file)
@@ -70,7 +70,6 @@
 #define APIC_BROADCAST                 0xFF
 #define X2APIC_BROADCAST               0xFFFFFFFFul
 
-static bool lapic_timer_advance_adjust_done = false;
 #define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100
 /* step-by-step approximation to mitigate fluctuation */
 #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
@@ -1482,6 +1481,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
 void wait_lapic_expire(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
+       u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
        u64 guest_tsc, tsc_deadline, ns;
 
        if (!lapic_in_kernel(vcpu))
@@ -1501,34 +1501,36 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
        /* __delay is delay_tsc whenever the hardware has TSC, thus always.  */
        if (guest_tsc < tsc_deadline)
                __delay(min(tsc_deadline - guest_tsc,
-                       nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
+                       nsec_to_cycles(vcpu, timer_advance_ns)));
 
-       if (!lapic_timer_advance_adjust_done) {
+       if (!apic->lapic_timer.timer_advance_adjust_done) {
                /* too early */
                if (guest_tsc < tsc_deadline) {
                        ns = (tsc_deadline - guest_tsc) * 1000000ULL;
                        do_div(ns, vcpu->arch.virtual_tsc_khz);
-                       lapic_timer_advance_ns -= min((unsigned int)ns,
-                               lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+                       timer_advance_ns -= min((u32)ns,
+                               timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
                } else {
                /* too late */
                        ns = (guest_tsc - tsc_deadline) * 1000000ULL;
                        do_div(ns, vcpu->arch.virtual_tsc_khz);
-                       lapic_timer_advance_ns += min((unsigned int)ns,
-                               lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+                       timer_advance_ns += min((u32)ns,
+                               timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
                }
                if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
-                       lapic_timer_advance_adjust_done = true;
-               if (unlikely(lapic_timer_advance_ns > 5000)) {
-                       lapic_timer_advance_ns = 0;
-                       lapic_timer_advance_adjust_done = true;
+                       apic->lapic_timer.timer_advance_adjust_done = true;
+               if (unlikely(timer_advance_ns > 5000)) {
+                       timer_advance_ns = 0;
+                       apic->lapic_timer.timer_advance_adjust_done = true;
                }
+               apic->lapic_timer.timer_advance_ns = timer_advance_ns;
        }
 }
 
 static void start_sw_tscdeadline(struct kvm_lapic *apic)
 {
-       u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+       struct kvm_timer *ktimer = &apic->lapic_timer;
+       u64 guest_tsc, tscdeadline = ktimer->tscdeadline;
        u64 ns = 0;
        ktime_t expire;
        struct kvm_vcpu *vcpu = apic->vcpu;
@@ -1548,11 +1550,10 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
        do_div(ns, this_tsc_khz);
 
        if (likely(tscdeadline > guest_tsc) &&
-           likely(ns > lapic_timer_advance_ns)) {
+           likely(ns > apic->lapic_timer.timer_advance_ns)) {
                expire = ktime_add_ns(now, ns);
-               expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
-               hrtimer_start(&apic->lapic_timer.timer,
-                               expire, HRTIMER_MODE_ABS_PINNED);
+               expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
+               hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED);
        } else
                apic_timer_expired(apic);
 
@@ -2259,7 +2260,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
                return HRTIMER_NORESTART;
 }
 
-int kvm_create_lapic(struct kvm_vcpu *vcpu)
+int kvm_create_lapic(struct kvm_vcpu *vcpu, u32 timer_advance_ns)
 {
        struct kvm_lapic *apic;
 
@@ -2283,6 +2284,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
        hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
                     HRTIMER_MODE_ABS_PINNED);
        apic->lapic_timer.timer.function = apic_timer_fn;
+       apic->lapic_timer.timer_advance_ns = timer_advance_ns;
 
        /*
         * APIC is created enabled. This will prevent kvm_lapic_set_base from
index ff6ef9c3d760c7d6db6d5ee86d1a0bb21c1c63b5..3e97f8a68967b2b437c03cdaafedf761ea5a651a 100644 (file)
@@ -31,8 +31,10 @@ struct kvm_timer {
        u32 timer_mode_mask;
        u64 tscdeadline;
        u64 expired_tscdeadline;
+       u32 timer_advance_ns;
        atomic_t pending;                       /* accumulated triggered timers */
        bool hv_timer_in_use;
+       bool timer_advance_adjust_done;
 };
 
 struct kvm_lapic {
@@ -62,7 +64,7 @@ struct kvm_lapic {
 
 struct dest_map;
 
-int kvm_create_lapic(struct kvm_vcpu *vcpu);
+int kvm_create_lapic(struct kvm_vcpu *vcpu, u32 timer_advance_ns);
 void kvm_free_lapic(struct kvm_vcpu *vcpu);
 
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
index e7fe8c6923625e8de13e4674efdbf5c68b434e70..72f1d17711f24f398bf60a4ae044baa463cb4cc3 100644 (file)
@@ -7133,6 +7133,7 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
 {
        struct vcpu_vmx *vmx;
        u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
+       struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
 
        if (kvm_mwait_in_guest(vcpu->kvm))
                return -EOPNOTSUPP;
@@ -7141,7 +7142,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
        tscl = rdtsc();
        guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
        delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
-       lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
+       lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
+                                                   ktimer->timer_advance_ns);
 
        if (delta_tsc > lapic_timer_advance_cycles)
                delta_tsc -= lapic_timer_advance_cycles;
index 7e413ea19a9a24058bd3ef00213999827cd672ac..3244994b111d29fe86aad205361f9377f6cb89a3 100644 (file)
@@ -137,9 +137,8 @@ static u32 __read_mostly tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 
 /* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 1000;
+static u32 __read_mostly lapic_timer_advance_ns = 1000;
 module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
 
 static bool __read_mostly vector_hashing = true;
 module_param(vector_hashing, bool, S_IRUGO);
@@ -7882,7 +7881,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        }
 
        trace_kvm_entry(vcpu->vcpu_id);
-       if (lapic_timer_advance_ns)
+       if (vcpu->arch.apic->lapic_timer.timer_advance_ns)
                wait_lapic_expire(vcpu);
        guest_enter_irqoff();
 
@@ -9070,7 +9069,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                goto fail_free_pio_data;
 
        if (irqchip_in_kernel(vcpu->kvm)) {
-               r = kvm_create_lapic(vcpu);
+               r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
                if (r < 0)
                        goto fail_mmu_destroy;
        } else
index de3d46769ee3972b2357e99fd3b4fa6f2db36337..b457160dc7ba1081540151228d07e5f7b44312da 100644 (file)
@@ -294,8 +294,6 @@ extern u64 kvm_supported_xcr0(void);
 
 extern unsigned int min_timer_period_us;
 
-extern unsigned int lapic_timer_advance_ns;
-
 extern bool enable_vmware_backdoor;
 
 extern struct static_key kvm_no_apic_vcpu;