[RFC PATCH v3 09/21] KVM: x86: Fix KVM clock precision in __get_kvmclock()

From: David Woodhouse
Date: Tue May 21 2024 - 20:22:28 EST


From: David Woodhouse <dwmw@xxxxxxxxxxxx>

When in 'master clock mode' (i.e. when host and guest TSCs are behaving
sanely and in sync), the KVM clock is defined in terms of the guest TSC.

When TSC scaling is used, calculating the KVM clock directly from *host*
TSC cycles leads to a systematic drift from the values calculated by the
guest from its TSC.

Commit 451a707813ae ("KVM: x86/xen: improve accuracy of Xen timers")
had a simple workaround for the specific case of Xen timers, as it had an
actual vCPU to hand and could use its scaling information. That commit
noted that it was broken for the general case of get_kvmclock_ns(), and
said "I'll come back to that".

Since __get_kvmclock() is invoked without a specific vCPU, it needs to
be able to find or generate the scaling values required to perform the
correct calculation.

Thankfully, TSC scaling can only happen with X86_FEATURE_CONSTANT_TSC,
so it isn't as complex as it might have been.

In __kvm_synchronize_tsc(), note the current vCPU's scaling ratio in
kvm->arch.last_tsc_scaling_ratio. That is only protected by the
tsc_write_lock, so in pvclock_update_vm_gtod_copy(), copy it into a
separate kvm->arch.master_tsc_scaling_ratio so that it can be accessed
using the kvm->arch.pvclock_sc seqcount lock. Also generate the mul and
shift factors to convert to nanoseconds for the corresponding KVM clock,
just as kvm_guest_time_update() would.

In __get_kvmclock(), which runs within a seqcount retry loop, use those
values to convert host to guest TSC and then to nanoseconds. Only fall
back to using get_kvmclock_base_ns() when not in master clock mode.

There was previously a code path in __get_kvmclock() which looked like
it could set KVM_CLOCK_TSC_STABLE without KVM_CLOCK_REALTIME, perhaps
even on 32-bit hosts. In practice that could never happen as the
ka->use_master_clock flag couldn't be set on 32-bit, and even on 64-bit
hosts it would never be set when the system clock isn't TSC-based. So
that code path is now removed.

The kvm_get_wall_clock_epoch() function had the same problem; make it
just call get_kvmclock() and subtract the KVM clock from the wallclock
time returned with it, falling back to ktime_get_real_ns() for the
wallclock when KVM_CLOCK_REALTIME is not set, as before.

Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
---
arch/x86/include/asm/kvm_host.h | 4 +
arch/x86/kvm/x86.c | 151 ++++++++++++++++----------------
2 files changed, 79 insertions(+), 76 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8440c4081727..b01c1d000fff 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1353,6 +1353,7 @@ struct kvm_arch {
u64 last_tsc_write;
u32 last_tsc_khz;
u64 last_tsc_offset;
+ u64 last_tsc_scaling_ratio;
u64 cur_tsc_nsec;
u64 cur_tsc_write;
u64 cur_tsc_offset;
@@ -1364,6 +1365,9 @@ struct kvm_arch {

seqcount_raw_spinlock_t pvclock_sc;
bool use_master_clock;
+ s8 master_tsc_shift;
+ u32 master_tsc_mul;
+ u64 master_tsc_scaling_ratio;
u64 master_kernel_ns;
u64 master_cycle_now;
struct delayed_work kvmclock_update_work;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 44b3d2a0da5b..89918ba266cd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2674,6 +2674,7 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
kvm->arch.last_tsc_nsec = ns;
kvm->arch.last_tsc_write = tsc;
kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ kvm->arch.last_tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
kvm->arch.last_tsc_offset = offset;

vcpu->arch.last_guest_tsc = tsc;
@@ -3009,6 +3010,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
struct kvm_arch *ka = &kvm->arch;
+ uint64_t last_tsc_hz;
int vclock_mode;
bool host_tsc_clocksource, vcpus_matched;

@@ -3028,6 +3030,35 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
&& !ka->backwards_tsc_observed
&& !ka->boot_vcpu_runs_old_kvmclock;

+ /*
+ * When TSC scaling is in use (which can thankfully only happen
+ * with X86_FEATURE_CONSTANT_TSC), the host must calculate the
+ * KVM clock precisely as the guest would, by scaling through
+ * the guest TSC frequency. Otherwise, differences in arithmetic
+ * precision lead to systemic drift between the guest's and the
+ * host's idea of the time.
+ */
+ if (kvm_caps.has_tsc_control) {
+ /*
+ * Copy from the field protected solely by ka->tsc_write_lock,
+ * to the field protected by the ka->pvclock_sc seqlock.
+ */
+ ka->master_tsc_scaling_ratio = ka->last_tsc_scaling_ratio ? :
+ kvm_caps.default_tsc_scaling_ratio;
+
+ /*
+ * Calculate the scaling factors precisely the same way
+ * that kvm_guest_time_update() does.
+ */
+ last_tsc_hz = kvm_scale_tsc(tsc_khz * 1000LL,
+ ka->master_tsc_scaling_ratio);
+ kvm_get_time_scale(NSEC_PER_SEC, last_tsc_hz,
+ &ka->master_tsc_shift, &ka->master_tsc_mul);
+ } else if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ kvm_get_time_scale(NSEC_PER_SEC, tsc_khz * 1000LL,
+ &ka->master_tsc_shift, &ka->master_tsc_mul);
+ }
+
if (ka->use_master_clock)
atomic_set(&kvm_guest_has_master_clock, 1);

@@ -3100,36 +3131,49 @@ static unsigned long get_cpu_tsc_khz(void)
static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
{
struct kvm_arch *ka = &kvm->arch;
- struct pvclock_vcpu_time_info hv_clock;
+
+#ifdef CONFIG_X86_64
+ uint64_t cur_tsc_khz = 0;
+ struct timespec64 ts;

/* both __this_cpu_read() and rdtsc() should be on the same cpu */
get_cpu();

- data->flags = 0;
if (ka->use_master_clock &&
- (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {
-#ifdef CONFIG_X86_64
- struct timespec64 ts;
+ (cur_tsc_khz = get_cpu_tsc_khz()) &&
+ !kvm_get_walltime_and_clockread(&ts, &data->host_tsc))
+ cur_tsc_khz = 0;

- if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
- data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
- data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
- } else
-#endif
- data->host_tsc = rdtsc();
-
- data->flags |= KVM_CLOCK_TSC_STABLE;
- hv_clock.tsc_timestamp = ka->master_cycle_now;
- hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
- kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
- &hv_clock.tsc_shift,
- &hv_clock.tsc_to_system_mul);
- data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
- } else {
- data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+ put_cpu();
+
+ if (cur_tsc_khz) {
+ uint64_t tsc_cycles;
+ uint32_t mul;
+ int8_t shift;
+
+ tsc_cycles = data->host_tsc - ka->master_cycle_now;
+
+ if (kvm_caps.has_tsc_control)
+ tsc_cycles = kvm_scale_tsc(tsc_cycles,
+ ka->master_tsc_scaling_ratio);
+
+ if (static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ mul = ka->master_tsc_mul;
+ shift = ka->master_tsc_shift;
+ } else {
+ kvm_get_time_scale(NSEC_PER_SEC, cur_tsc_khz * 1000LL,
+ &shift, &mul);
+ }
+ data->clock = ka->master_kernel_ns + ka->kvmclock_offset +
+ pvclock_scale_delta(tsc_cycles, mul, shift);
+ data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+ data->flags = KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC | KVM_CLOCK_TSC_STABLE;
+ return;
}
+#endif

- put_cpu();
+ data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+ data->flags = 0;
}

static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
@@ -3330,68 +3374,23 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* that and kvmclock, but even that would be subject to change over
* time.
*
- * Attempt to calculate the epoch at a given moment using the *same*
- * TSC reading via kvm_get_walltime_and_clockread() to obtain both
- * wallclock and kvmclock times, and subtracting one from the other.
+ * Use get_kvmclock() to obtain a simultaneous reading of wallclock
+ * and kvmclock times from the *same* TSC reading, and subtract one
+ * from the other.
*
* Fall back to using their values at slightly different moments by
* calling ktime_get_real_ns() and get_kvmclock_ns() separately.
*/
uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
{
-#ifdef CONFIG_X86_64
- struct pvclock_vcpu_time_info hv_clock;
- struct kvm_arch *ka = &kvm->arch;
- unsigned long seq, local_tsc_khz;
- struct timespec64 ts;
- uint64_t host_tsc;
-
- do {
- seq = read_seqcount_begin(&ka->pvclock_sc);
-
- local_tsc_khz = 0;
- if (!ka->use_master_clock)
- break;
-
- /*
- * The TSC read and the call to get_cpu_tsc_khz() must happen
- * on the same CPU.
- */
- get_cpu();
-
- local_tsc_khz = get_cpu_tsc_khz();
-
- if (local_tsc_khz &&
- !kvm_get_walltime_and_clockread(&ts, &host_tsc))
- local_tsc_khz = 0; /* Fall back to old method */
-
- put_cpu();
-
- /*
- * These values must be snapshotted within the seqcount loop.
- * After that, it's just mathematics which can happen on any
- * CPU at any time.
- */
- hv_clock.tsc_timestamp = ka->master_cycle_now;
- hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+ struct kvm_clock_data data;

- } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+ get_kvmclock(kvm, &data);

- /*
- * If the conditions were right, and obtaining the wallclock+TSC was
- * successful, calculate the KVM clock at the corresponding time and
- * subtract one from the other to get the guest's epoch in nanoseconds
- * since 1970-01-01.
- */
- if (local_tsc_khz) {
- kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
- &hv_clock.tsc_shift,
- &hv_clock.tsc_to_system_mul);
- return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
- __pvclock_read_cycles(&hv_clock, host_tsc);
- }
-#endif
- return ktime_get_real_ns() - get_kvmclock_ns(kvm);
+ if (data.flags & KVM_CLOCK_REALTIME)
+ return data.realtime - data.clock;
+ else
+ return ktime_get_real_ns() - data.clock;
}

/*
--
2.44.0