From 64d816cba06c67eeee455b8c78ebcda349d49c24 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Nov 2015 19:09:21 -0500 Subject: locking/qspinlock: Use _acquire/_release() versions of cmpxchg() & xchg() This patch replaces the cmpxchg() and xchg() calls in the native qspinlock code with the more relaxed _acquire or _release versions of those calls to enable other architectures to adopt queued spinlocks with less memory barrier performance overhead. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447114167-47185-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 87e9ce6a63c5..7868418ea586 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -14,8 +14,9 @@ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2014 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * - * Authors: Waiman Long + * Authors: Waiman Long * Peter Zijlstra */ @@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) { struct __qspinlock *l = (void *)lock; - return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; + /* + * Use release semantics to make sure that the MCS node is properly + * initialized before changing the tail code. + */ + return (u32)xchg_release(&l->tail, + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; } #else /* _Q_PENDING_BITS == 8 */ @@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) for (;;) { new = (val & _Q_LOCKED_PENDING_MASK) | tail; - old = atomic_cmpxchg(&lock->val, val, new); + /* + * Use release semantics to make sure that the MCS node is + * properly initialized before changing the tail code. + */ + old = atomic_cmpxchg_release(&lock->val, val, new); if (old == val) break; @@ -319,7 +329,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) if (val == new) new |= _Q_PENDING_VAL; - old = atomic_cmpxchg(&lock->val, val, new); + /* + * Acquire semantic is required here as the function may + * return immediately if the lock was free. + */ + old = atomic_cmpxchg_acquire(&lock->val, val, new); if (old == val) break; @@ -426,7 +440,12 @@ queue: set_locked(lock); break; } - old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL); + /* + * The smp_load_acquire() call above has provided the necessary + * acquire semantics required for locking. At most two + * iterations of this loop may be ran. + */ + old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); if (old == val) goto release; /* No contention */ -- cgit From 81b5598665a24083dd889fbd8cb08b0d8de4b8ad Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Nov 2015 19:09:22 -0500 Subject: locking/qspinlock: Prefetch the next node cacheline A queue head CPU, after acquiring the lock, will have to notify the next CPU in the wait queue that it has became the new queue head. This involves loading a new cacheline from the MCS node of the next CPU. That operation can be expensive and add to the latency of locking operation. This patch addes code to optmistically prefetch the next MCS node cacheline if the next pointer is defined and it has been spinning for the MCS lock for a while. This reduces the locking latency and improves the system throughput. The performance change will depend on whether the prefetch overhead can be hidden within the latency of the lock spin loop. On really short critical section, there may not be performance gain at all. With longer critical section, however, it was found to have a performance boost of 5-10% over a range of different queue depths with a spinlock loop microbenchmark. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447114167-47185-3-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 7868418ea586..365b2033f55e 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -407,6 +407,16 @@ queue: pv_wait_node(node); arch_mcs_spin_lock_contended(&node->locked); + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. We optimistically load + * the next pointer & prefetch the cacheline for writing + * to reduce latency in the upcoming MCS unlock operation. + */ + next = READ_ONCE(node->next); + if (next) + prefetchw(next); } /* -- cgit From aa68744f80bfb6f26fbe7f10e42876066f7dac1b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Nov 2015 19:09:23 -0500 Subject: locking/qspinlock: Avoid redundant read of next pointer With optimistic prefetch of the next node cacheline, the next pointer may have been properly inititalized. As a result, the reading of node->next in the contended path may be redundant. This patch eliminates the redundant read if the next pointer value is not NULL. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447114167-47185-4-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 365b2033f55e..986207887def 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -396,6 +396,7 @@ queue: * p,*,* -> n,*,* */ old = xchg_tail(lock, tail); + next = NULL; /* * if there was a previous node; link it and wait until reaching the @@ -463,10 +464,12 @@ queue: } /* - * contended path; wait for next, release. + * contended path; wait for next if not observed yet, release. */ - while (!(next = READ_ONCE(node->next))) - cpu_relax(); + if (!next) { + while (!(next = READ_ONCE(node->next))) + cpu_relax(); + } arch_mcs_spin_unlock_contended(&next->locked); pv_kick_node(lock, next); -- cgit From d78045306c41bd9334b956e4e7fa77cc72f06a40 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Nov 2015 19:09:24 -0500 Subject: locking/pvqspinlock, x86: Optimize the PV unlock code path The unlock function in queued spinlocks was optimized for better performance on bare metal systems at the expense of virtualized guests. For x86-64 systems, the unlock call needs to go through a PV_CALLEE_SAVE_REGS_THUNK() which saves and restores 8 64-bit registers before calling the real __pv_queued_spin_unlock() function. The thunk code may also be in a separate cacheline from __pv_queued_spin_unlock(). This patch optimizes the PV unlock code path by: 1) Moving the unlock slowpath code from the fastpath into a separate __pv_queued_spin_unlock_slowpath() function to make the fastpath as simple as possible.. 2) For x86-64, hand-coded an assembly function to combine the register saving thunk code with the fastpath code. Only registers that are used in the fastpath will be saved and restored. If the fastpath fails, the slowpath function will be called via another PV_CALLEE_SAVE_REGS_THUNK(). For 32-bit, it falls back to the C __pv_queued_spin_unlock() code as the thunk saves and restores only one 32-bit register. With a microbenchmark of 5M lock-unlock loop, the table below shows the execution times before and after the patch with different number of threads in a VM running on a 32-core Westmere-EX box with x86-64 4.2-rc1 based kernels: Threads Before patch After patch % Change ------- ------------ ----------- -------- 1 134.1 ms 119.3 ms -11% 2 1286 ms 953 ms -26% 3 3715 ms 3480 ms -6.3% 4 4092 ms 3764 ms -8.0% Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447114167-47185-5-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 43 +++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index f0450ff4829b..4bd323d38c60 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -308,23 +308,14 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) } /* - * PV version of the unlock function to be used in stead of - * queued_spin_unlock(). + * PV versions of the unlock fastpath and slowpath functions to be used + * instead of queued_spin_unlock(). */ -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible void +__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { struct __qspinlock *l = (void *)lock; struct pv_node *node; - u8 locked; - - /* - * We must not unlock if SLOW, because in that case we must first - * unhash. Otherwise it would be possible to have multiple @lock - * entries, which would be BAD. - */ - locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); - if (likely(locked == _Q_LOCKED_VAL)) - return; if (unlikely(locked != _Q_SLOW_VAL)) { WARN(!debug_locks_silent, @@ -363,12 +354,32 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) */ pv_kick(node->cpu); } + /* * Include the architecture specific callee-save thunk of the * __pv_queued_spin_unlock(). This thunk is put together with - * __pv_queued_spin_unlock() near the top of the file to make sure - * that the callee-save thunk and the real unlock function are close - * to each other sharing consecutive instruction cachelines. + * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock + * function close to each other sharing consecutive instruction cachelines. + * Alternatively, architecture specific version of __pv_queued_spin_unlock() + * can be defined. */ #include +#ifndef __pv_queued_spin_unlock +__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + u8 locked; + + /* + * We must not unlock if SLOW, because in that case we must first + * unhash. Otherwise it would be possible to have multiple @lock + * entries, which would be BAD. + */ + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + if (likely(locked == _Q_LOCKED_VAL)) + return; + + __pv_queued_spin_unlock_slowpath(lock, locked); +} +#endif /* __pv_queued_spin_unlock */ -- cgit From b3e0b1b6d841a4b2f64fc09ea728913da8218424 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 16 Oct 2015 14:39:38 +0200 Subject: locking, sched: Introduce smp_cond_acquire() and use it Introduce smp_cond_acquire() which combines a control dependency and a read barrier to form acquire semantics. This primitive has two benefits: - it documents control dependencies, - its typically cheaper than using smp_load_acquire() in a loop. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 986207887def..ed9d96708f93 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -433,8 +433,7 @@ queue: * */ pv_wait_head(lock, node); - while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) - cpu_relax(); + smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); /* * claim the lock: -- cgit From 45e898b735620f426eddf105fc886d2966593a58 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Nov 2015 19:09:25 -0500 Subject: locking/pvqspinlock: Collect slowpath lock statistics This patch enables the accumulation of kicking and waiting related PV qspinlock statistics when the new QUEUED_LOCK_STAT configuration option is selected. It also enables the collection of data which enable us to calculate the kicking and wakeup latencies which have a heavy dependency on the CPUs being used. The statistical counters are per-cpu variables to minimize the performance overhead in their updates. These counters are exported via the debugfs filesystem under the qlockstat directory. When the corresponding debugfs files are read, summation and computing of the required data are then performed. The measured latencies for different CPUs are: CPU Wakeup Kicking --- ------ ------- Haswell-EX 63.6us 7.4us Westmere-EX 67.6us 9.3us The measured latencies varied a bit from run-to-run. The wakeup latency is much higher than the kicking latency. A sample of statistical counters after system bootup (with vCPU overcommit) was: pv_hash_hops=1.00 pv_kick_unlock=1148 pv_kick_wake=1146 pv_latency_kick=11040 pv_latency_wake=194840 pv_spurious_wakeup=7 pv_wait_again=4 pv_wait_head=23 pv_wait_node=1129 Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447114167-47185-6-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 32 +++- kernel/locking/qspinlock_stat.h | 281 ++++++++++++++++++++++++++++++++++++ 2 files changed, 308 insertions(+), 5 deletions(-) create mode 100644 kernel/locking/qspinlock_stat.h (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 4bd323d38c60..aaeeefb791f8 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -40,6 +40,11 @@ struct pv_node { u8 state; }; +/* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + /* * Lock and MCS node addresses hash table for fast lookup * @@ -100,10 +105,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) { unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); struct pv_hash_entry *he; + int hopcnt = 0; for_each_hash_entry(he, offset, hash) { + hopcnt++; if (!cmpxchg(&he->lock, NULL, lock)) { WRITE_ONCE(he->node, node); + qstat_hop(hopcnt); return &he->lock; } } @@ -164,9 +172,11 @@ static void pv_init_node(struct mcs_spinlock *node) static void pv_wait_node(struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; + int waitcnt = 0; int loop; - for (;;) { + /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ + for (;; waitcnt++) { for (loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; @@ -184,12 +194,16 @@ static void pv_wait_node(struct mcs_spinlock *node) */ smp_store_mb(pn->state, vcpu_halted); - if (!READ_ONCE(node->locked)) + if (!READ_ONCE(node->locked)) { + qstat_inc(qstat_pv_wait_node, true); + qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&pn->state, vcpu_halted); + } /* - * If pv_kick_node() changed us to vcpu_hashed, retain that value - * so that pv_wait_head() knows to not also try to hash this lock. + * If pv_kick_node() changed us to vcpu_hashed, retain that + * value so that pv_wait_head() knows to not also try to hash + * this lock. */ cmpxchg(&pn->state, vcpu_halted, vcpu_running); @@ -200,6 +214,7 @@ static void pv_wait_node(struct mcs_spinlock *node) * So it is better to spin for a while in the hope that the * MCS lock will be released soon. */ + qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); } /* @@ -250,6 +265,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) struct pv_node *pn = (struct pv_node *)node; struct __qspinlock *l = (void *)lock; struct qspinlock **lp = NULL; + int waitcnt = 0; int loop; /* @@ -259,7 +275,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) if (READ_ONCE(pn->state) == vcpu_hashed) lp = (struct qspinlock **)1; - for (;;) { + for (;; waitcnt++) { for (loop = SPIN_THRESHOLD; loop; loop--) { if (!READ_ONCE(l->locked)) return; @@ -290,14 +306,19 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) return; } } + qstat_inc(qstat_pv_wait_head, true); + qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); + if (!READ_ONCE(l->locked)) + return; /* * The unlocker should have freed the lock before kicking the * CPU. So if the lock is still not free, it is a spurious * wakeup and so the vCPU should wait again after spinning for * a while. */ + qstat_inc(qstat_pv_spurious_wakeup, true); } /* @@ -352,6 +373,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * vCPU is harmless other than the additional latency in completing * the unlock. */ + qstat_inc(qstat_pv_kick_unlock, true); pv_kick(node->cpu); } diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h new file mode 100644 index 000000000000..b1553adec2e7 --- /dev/null +++ b/kernel/locking/qspinlock_stat.h @@ -0,0 +1,281 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long + */ + +/* + * When queued spinlock statistical counters are enabled, the following + * debugfs files will be created for reporting the counter values: + * + * /qlockstat/ + * pv_hash_hops - average # of hops per hashing operation + * pv_kick_unlock - # of vCPU kicks issued at unlock time + * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake + * pv_latency_kick - average latency (ns) of vCPU kick operation + * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_spurious_wakeup - # of spurious wakeups + * pv_wait_again - # of vCPU wait's that happened after a vCPU kick + * pv_wait_head - # of vCPU wait's at the queue head + * pv_wait_node - # of vCPU wait's at a non-head queue node + * + * Writing to the "reset_counters" file will reset all the above counter + * values. + * + * These statistical counters are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counters usable even in a production + * environment. + * + * There may be slight difference between pv_kick_wake and pv_kick_unlock. + */ +enum qlock_stats { + qstat_pv_hash_hops, + qstat_pv_kick_unlock, + qstat_pv_kick_wake, + qstat_pv_latency_kick, + qstat_pv_latency_wake, + qstat_pv_spurious_wakeup, + qstat_pv_wait_again, + qstat_pv_wait_head, + qstat_pv_wait_node, + qstat_num, /* Total number of statistical counters */ + qstat_reset_cnts = qstat_num, +}; + +#ifdef CONFIG_QUEUED_LOCK_STAT +/* + * Collect pvqspinlock statistics + */ +#include +#include +#include + +static const char * const qstat_names[qstat_num + 1] = { + [qstat_pv_hash_hops] = "pv_hash_hops", + [qstat_pv_kick_unlock] = "pv_kick_unlock", + [qstat_pv_kick_wake] = "pv_kick_wake", + [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", + [qstat_pv_latency_kick] = "pv_latency_kick", + [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_wait_again] = "pv_wait_again", + [qstat_pv_wait_head] = "pv_wait_head", + [qstat_pv_wait_node] = "pv_wait_node", + [qstat_reset_cnts] = "reset_counters", +}; + +/* + * Per-cpu counters + */ +static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); +static DEFINE_PER_CPU(u64, pv_kick_time); + +/* + * Function to read and return the qlock statistical counter values + * + * The following counters are handled specially: + * 1. qstat_pv_latency_kick + * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock + * 2. qstat_pv_latency_wake + * Average wake latency (ns) = pv_latency_wake/pv_kick_wake + * 3. qstat_pv_hash_hops + * Average hops/hash = pv_hash_hops/pv_kick_unlock + */ +static ssize_t qstat_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[64]; + int cpu, counter, len; + u64 stat = 0, kicks = 0; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if (!file->f_inode) { + WARN_ON_ONCE(1); + return -EBADF; + } + counter = (long)(file->f_inode->i_private); + + if (counter >= qstat_num) + return -EBADF; + + for_each_possible_cpu(cpu) { + stat += per_cpu(qstats[counter], cpu); + /* + * Need to sum additional counter for some of them + */ + switch (counter) { + + case qstat_pv_latency_kick: + case qstat_pv_hash_hops: + kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); + break; + + case qstat_pv_latency_wake: + kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); + break; + } + } + + if (counter == qstat_pv_hash_hops) { + u64 frac; + + frac = 100ULL * do_div(stat, kicks); + frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); + + /* + * Return a X.XX decimal number + */ + len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); + } else { + /* + * Round to the nearest ns + */ + if ((counter == qstat_pv_latency_kick) || + (counter == qstat_pv_latency_wake)) { + stat = 0; + if (kicks) + stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); + } + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); + } + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When counter = reset_cnts, reset all the counter values. + * Since the counter updates aren't atomic, the resetting is done twice + * to make sure that the counters are very likely to be all cleared. + */ +static ssize_t qstat_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int cpu; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if (!file->f_inode) { + WARN_ON_ONCE(1); + return -EBADF; + } + if ((long)(file->f_inode->i_private) != qstat_reset_cnts) + return count; + + for_each_possible_cpu(cpu) { + int i; + unsigned long *ptr = per_cpu_ptr(qstats, cpu); + + for (i = 0 ; i < qstat_num; i++) + WRITE_ONCE(ptr[i], 0); + for (i = 0 ; i < qstat_num; i++) + WRITE_ONCE(ptr[i], 0); + } + return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_qstat = { + .read = qstat_read, + .write = qstat_write, + .llseek = default_llseek, +}; + +/* + * Initialize debugfs for the qspinlock statistical counters + */ +static int __init init_qspinlock_stat(void) +{ + struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); + int i; + + if (!d_qstat) { + pr_warn("Could not create 'qlockstat' debugfs directory\n"); + return 0; + } + + /* + * Create the debugfs files + * + * As reading from and writing to the stat files can be slow, only + * root is allowed to do the read/write to limit impact to system + * performance. + */ + for (i = 0; i < qstat_num; i++) + debugfs_create_file(qstat_names[i], 0400, d_qstat, + (void *)(long)i, &fops_qstat); + + debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, + (void *)(long)qstat_reset_cnts, &fops_qstat); + return 0; +} +fs_initcall(init_qspinlock_stat); + +/* + * Increment the PV qspinlock statistical counters + */ +static inline void qstat_inc(enum qlock_stats stat, bool cond) +{ + if (cond) + this_cpu_inc(qstats[stat]); +} + +/* + * PV hash hop count + */ +static inline void qstat_hop(int hopcnt) +{ + this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); +} + +/* + * Replacement function for pv_kick() + */ +static inline void __pv_kick(int cpu) +{ + u64 start = sched_clock(); + + per_cpu(pv_kick_time, cpu) = start; + pv_kick(cpu); + this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); +} + +/* + * Replacement function for pv_wait() + */ +static inline void __pv_wait(u8 *ptr, u8 val) +{ + u64 *pkick_time = this_cpu_ptr(&pv_kick_time); + + *pkick_time = 0; + pv_wait(ptr, val); + if (*pkick_time) { + this_cpu_add(qstats[qstat_pv_latency_wake], + sched_clock() - *pkick_time); + qstat_inc(qstat_pv_kick_wake, true); + } +} + +#define pv_kick(c) __pv_kick(c) +#define pv_wait(p, v) __pv_wait(p, v) + +#else /* CONFIG_QUEUED_LOCK_STAT */ + +static inline void qstat_inc(enum qlock_stats stat, bool cond) { } +static inline void qstat_hop(int hopcnt) { } + +#endif /* CONFIG_QUEUED_LOCK_STAT */ -- cgit From 1c4941fd53afb46ab15826628e4819866d008a28 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 10 Nov 2015 16:18:56 -0500 Subject: locking/pvqspinlock: Allow limited lock stealing This patch allows one attempt for the lock waiter to steal the lock when entering the PV slowpath. To prevent lock starvation, the pending bit will be set by the queue head vCPU when it is in the active lock spinning loop to disable any lock stealing attempt. This helps to reduce the performance penalty caused by lock waiter preemption while not having much of the downsides of a real unfair lock. The pv_wait_head() function was renamed as pv_wait_head_or_lock() as it was modified to acquire the lock before returning. This is necessary because of possible lock stealing attempts from other tasks. Linux kernel builds were run in KVM guest on an 8-socket, 4 cores/socket Westmere-EX system and a 4-socket, 8 cores/socket Haswell-EX system. Both systems are configured to have 32 physical CPUs. The kernel build times before and after the patch were: Westmere Haswell Patch 32 vCPUs 48 vCPUs 32 vCPUs 48 vCPUs ----- -------- -------- -------- -------- Before patch 3m15.6s 10m56.1s 1m44.1s 5m29.1s After patch 3m02.3s 5m00.2s 1m43.7s 3m03.5s For the overcommited case (48 vCPUs), this patch is able to reduce kernel build time by more than 54% for Westmere and 44% for Haswell. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447190336-53317-1-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 26 +++++-- kernel/locking/qspinlock_paravirt.h | 141 ++++++++++++++++++++++++++++++------ kernel/locking/qspinlock_stat.h | 16 ++++ 3 files changed, 155 insertions(+), 28 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ed9d96708f93..2ea42999d2d8 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -251,15 +251,16 @@ static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } static __always_inline void __pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_head(struct qspinlock *lock, - struct mcs_spinlock *node) { } +static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, + struct mcs_spinlock *node) + { return 0; } #define pv_enabled() false #define pv_init_node __pv_init_node #define pv_wait_node __pv_wait_node #define pv_kick_node __pv_kick_node -#define pv_wait_head __pv_wait_head +#define pv_wait_head_or_lock __pv_wait_head_or_lock #ifdef CONFIG_PARAVIRT_SPINLOCKS #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath @@ -431,10 +432,22 @@ queue: * sequentiality; this is because the set_locked() function below * does not imply a full barrier. * + * The PV pv_wait_head_or_lock function, if active, will acquire + * the lock and return a non-zero value. So we have to skip the + * smp_load_acquire() call. As the next PV queue head hasn't been + * designated yet, there is no way for the locked value to become + * _Q_SLOW_VAL. So both the set_locked() and the + * atomic_cmpxchg_relaxed() calls will be safe. + * + * If PV isn't active, 0 will be returned instead. + * */ - pv_wait_head(lock, node); + if ((val = pv_wait_head_or_lock(lock, node))) + goto locked; + smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); +locked: /* * claim the lock: * @@ -446,7 +459,8 @@ queue: * to grab the lock. */ for (;;) { - if (val != tail) { + /* In the PV case we might already have _Q_LOCKED_VAL set */ + if ((val & _Q_TAIL_MASK) != tail) { set_locked(lock); break; } @@ -493,7 +507,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #undef pv_init_node #undef pv_wait_node #undef pv_kick_node -#undef pv_wait_head +#undef pv_wait_head_or_lock #undef queued_spin_lock_slowpath #define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index aaeeefb791f8..ace60a451b4f 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -40,6 +40,89 @@ struct pv_node { u8 state; }; +/* + * By replacing the regular queued_spin_trylock() with the function below, + * it will be called once when a lock waiter enter the PV slowpath before + * being queued. By allowing one lock stealing attempt here when the pending + * bit is off, it helps to reduce the performance impact of lock waiter + * preemption without the drawback of lock starvation. + */ +#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l) +static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); +} + +/* + * The pending bit is used by the queue head vCPU to indicate that it + * is actively spinning on the lock and no lock stealing is allowed. + */ +#if _Q_PENDING_BITS == 8 +static __always_inline void set_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 1); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 0); +} + +/* + * The pending bit check in pv_queued_spin_steal_lock() isn't a memory + * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock + * just to be sure that it will get it. + */ +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !READ_ONCE(l->locked) && + (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) + == _Q_PENDING_VAL); +} +#else /* _Q_PENDING_BITS == 8 */ +static __always_inline void set_pending(struct qspinlock *lock) +{ + atomic_set_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_clear_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + int val = atomic_read(&lock->val); + + for (;;) { + int old, new; + + if (val & _Q_LOCKED_MASK) + break; + + /* + * Try to clear pending bit & set locked bit + */ + old = val; + new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; + val = atomic_cmpxchg(&lock->val, old, new); + + if (val == old) + return 1; + } + return 0; +} +#endif /* _Q_PENDING_BITS == 8 */ + /* * Include queued spinlock statistics code */ @@ -202,8 +285,8 @@ static void pv_wait_node(struct mcs_spinlock *node) /* * If pv_kick_node() changed us to vcpu_hashed, retain that - * value so that pv_wait_head() knows to not also try to hash - * this lock. + * value so that pv_wait_head_or_lock() knows to not also try + * to hash this lock. */ cmpxchg(&pn->state, vcpu_halted, vcpu_running); @@ -227,8 +310,9 @@ static void pv_wait_node(struct mcs_spinlock *node) /* * Called after setting next->locked = 1 when we're the lock owner. * - * Instead of waking the waiters stuck in pv_wait_node() advance their state such - * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. + * Instead of waking the waiters stuck in pv_wait_node() advance their state + * such that they're waiting in pv_wait_head_or_lock(), this avoids a + * wake/sleep cycle. */ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { @@ -257,10 +341,14 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) } /* - * Wait for l->locked to become clear; halt the vcpu after a short spin. + * Wait for l->locked to become clear and acquire the lock; + * halt the vcpu after a short spin. * __pv_queued_spin_unlock() will wake us. + * + * The current value of the lock will be returned for additional processing. */ -static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) +static u32 +pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; struct __qspinlock *l = (void *)lock; @@ -276,11 +364,18 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) lp = (struct qspinlock **)1; for (;; waitcnt++) { + /* + * Set the pending bit in the active lock spinning loop to + * disable lock stealing before attempting to acquire the lock. + */ + set_pending(lock); for (loop = SPIN_THRESHOLD; loop; loop--) { - if (!READ_ONCE(l->locked)) - return; + if (trylock_clear_pending(lock)) + goto gotlock; cpu_relax(); } + clear_pending(lock); + if (!lp) { /* ONCE */ lp = pv_hash(lock, pn); @@ -296,36 +391,38 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ - if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { + if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { /* - * The lock is free and _Q_SLOW_VAL has never - * been set. Therefore we need to unhash before - * getting the lock. + * The lock was free and now we own the lock. + * Change the lock value back to _Q_LOCKED_VAL + * and unhash the table. */ + WRITE_ONCE(l->locked, _Q_LOCKED_VAL); WRITE_ONCE(*lp, NULL); - return; + goto gotlock; } } qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); - if (!READ_ONCE(l->locked)) - return; /* * The unlocker should have freed the lock before kicking the * CPU. So if the lock is still not free, it is a spurious - * wakeup and so the vCPU should wait again after spinning for - * a while. + * wakeup or another vCPU has stolen the lock. The current + * vCPU should spin again. */ - qstat_inc(qstat_pv_spurious_wakeup, true); + qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); } /* - * Lock is unlocked now; the caller will acquire it without waiting. - * As with pv_wait_node() we rely on the caller to do a load-acquire - * for us. + * The cmpxchg() or xchg() call before coming here provides the + * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL + * here is to indicate to the compiler that the value will always + * be nozero to enable better code optimization. */ +gotlock: + return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL); } /* @@ -350,7 +447,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * so we need a barrier to order the read of the node data in * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. * - * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. + * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL. */ smp_rmb(); diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index b1553adec2e7..94d4533fe984 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -22,6 +22,7 @@ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake * pv_latency_kick - average latency (ns) of vCPU kick operation * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups * pv_wait_again - # of vCPU wait's that happened after a vCPU kick * pv_wait_head - # of vCPU wait's at the queue head @@ -43,6 +44,7 @@ enum qlock_stats { qstat_pv_kick_wake, qstat_pv_latency_kick, qstat_pv_latency_wake, + qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, qstat_pv_wait_head, @@ -66,6 +68,7 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", [qstat_pv_latency_kick] = "pv_latency_kick", [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", [qstat_pv_wait_head] = "pv_wait_head", [qstat_pv_wait_node] = "pv_wait_node", @@ -273,6 +276,19 @@ static inline void __pv_wait(u8 *ptr, u8 val) #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) +/* + * PV unfair trylock count tracking function + */ +static inline int qstat_spin_steal_lock(struct qspinlock *lock) +{ + int ret = pv_queued_spin_steal_lock(lock); + + qstat_inc(qstat_pv_lock_stealing, ret); + return ret; +} +#undef queued_spin_trylock +#define queued_spin_trylock(l) qstat_spin_steal_lock(l) + #else /* CONFIG_QUEUED_LOCK_STAT */ static inline void qstat_inc(enum qlock_stats stat, bool cond) { } -- cgit From cd0272fab785077c121aa91ec2401090965bbc37 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 9 Nov 2015 19:09:27 -0500 Subject: locking/pvqspinlock: Queue node adaptive spinning In an overcommitted guest where some vCPUs have to be halted to make forward progress in other areas, it is highly likely that a vCPU later in the spinlock queue will be spinning while the ones earlier in the queue would have been halted. The spinning in the later vCPUs is then just a waste of precious CPU cycles because they are not going to get the lock soon as the earlier ones have to be woken up and take their turn to get the lock. This patch implements an adaptive spinning mechanism where the vCPU will call pv_wait() if the previous vCPU is not running. Linux kernel builds were run in KVM guest on an 8-socket, 4 cores/socket Westmere-EX system and a 4-socket, 8 cores/socket Haswell-EX system. Both systems are configured to have 32 physical CPUs. The kernel build times before and after the patch were: Westmere Haswell Patch 32 vCPUs 48 vCPUs 32 vCPUs 48 vCPUs ----- -------- -------- -------- -------- Before patch 3m02.3s 5m00.2s 1m43.7s 3m03.5s After patch 3m03.0s 4m37.5s 1m43.0s 2m47.2s For 32 vCPUs, this patch doesn't cause any noticeable change in performance. For 48 vCPUs (over-committed), there is about 8% performance improvement. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447114167-47185-8-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 5 ++-- kernel/locking/qspinlock_paravirt.h | 46 +++++++++++++++++++++++++++++++++++-- kernel/locking/qspinlock_stat.h | 3 +++ 3 files changed, 50 insertions(+), 4 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 2ea42999d2d8..393d1874b9e0 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock) */ static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } +static __always_inline void __pv_wait_node(struct mcs_spinlock *node, + struct mcs_spinlock *prev) { } static __always_inline void __pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { } static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, @@ -407,7 +408,7 @@ queue: prev = decode_tail(old); WRITE_ONCE(prev->next, node); - pv_wait_node(node); + pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); /* diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index ace60a451b4f..87bb235c3448 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -22,6 +22,20 @@ #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) +/* + * Queue Node Adaptive Spinning + * + * A queue node vCPU will stop spinning if the vCPU in the previous node is + * not running. The one lock stealing attempt allowed at slowpath entry + * mitigates the slight slowdown for non-overcommitted guest with this + * aggressive wait-early mechanism. + * + * The status of the previous node will be checked at fixed interval + * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't + * pound on the cacheline of the previous node too heavily. + */ +#define PV_PREV_CHECK_MASK 0xff + /* * Queue node uses: vcpu_running & vcpu_halted. * Queue head uses: vcpu_running & vcpu_hashed. @@ -234,6 +248,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) BUG(); } +/* + * Return true if when it is time to check the previous node which is not + * in a running state. + */ +static inline bool +pv_wait_early(struct pv_node *prev, int loop) +{ + + if ((loop & PV_PREV_CHECK_MASK) != 0) + return false; + + return READ_ONCE(prev->state) != vcpu_running; +} + /* * Initialize the PV part of the mcs_spinlock node. */ @@ -252,17 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node) * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its * behalf. */ -static void pv_wait_node(struct mcs_spinlock *node) +static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; + struct pv_node *pp = (struct pv_node *)prev; int waitcnt = 0; int loop; + bool wait_early; /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ for (;; waitcnt++) { - for (loop = SPIN_THRESHOLD; loop; loop--) { + for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; + if (pv_wait_early(pp, loop)) { + wait_early = true; + break; + } cpu_relax(); } @@ -280,6 +314,7 @@ static void pv_wait_node(struct mcs_spinlock *node) if (!READ_ONCE(node->locked)) { qstat_inc(qstat_pv_wait_node, true); qstat_inc(qstat_pv_wait_again, waitcnt); + qstat_inc(qstat_pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); } @@ -364,6 +399,12 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) lp = (struct qspinlock **)1; for (;; waitcnt++) { + /* + * Set correct vCPU state to be used by queue node wait-early + * mechanism. + */ + WRITE_ONCE(pn->state, vcpu_running); + /* * Set the pending bit in the active lock spinning loop to * disable lock stealing before attempting to acquire the lock. @@ -402,6 +443,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) goto gotlock; } } + WRITE_ONCE(pn->state, vcpu_halted); qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 94d4533fe984..640dcecdd1df 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -25,6 +25,7 @@ * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups * pv_wait_again - # of vCPU wait's that happened after a vCPU kick + * pv_wait_early - # of early vCPU wait's * pv_wait_head - # of vCPU wait's at the queue head * pv_wait_node - # of vCPU wait's at a non-head queue node * @@ -47,6 +48,7 @@ enum qlock_stats { qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, + qstat_pv_wait_early, qstat_pv_wait_head, qstat_pv_wait_node, qstat_num, /* Total number of statistical counters */ @@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_latency_wake] = "pv_latency_wake", [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", + [qstat_pv_wait_early] = "pv_wait_early", [qstat_pv_wait_head] = "pv_wait_head", [qstat_pv_wait_node] = "pv_wait_node", [qstat_reset_cnts] = "reset_counters", -- cgit