Mirror of https://github.com/Fishwaldo/Star64_linux.git, synced 2025-06-22 06:32:08 +00:00
Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking updates from Ingo Molnar:
 "So we have a laundry list of locking subsystem changes:

   - continuing barrier API and code improvements

   - futex enhancements

   - atomics API improvements

   - pvqspinlock enhancements: in particular lock stealing and adaptive
     spinning

   - qspinlock micro-enhancements"

* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op
  futex: Cleanup the goto confusion in requeue_pi()
  futex: Remove pointless put_pi_state calls in requeue()
  futex: Document pi_state refcounting in requeue code
  futex: Rename free_pi_state() to put_pi_state()
  futex: Drop refcount if requeue_pi() acquired the rtmutex
  locking/barriers, arch: Remove ambiguous statement in the smp_store_mb() documentation
  lcoking/barriers, arch: Use smp barriers in smp_store_release()
  locking/cmpxchg, arch: Remove tas() definitions
  locking/pvqspinlock: Queue node adaptive spinning
  locking/pvqspinlock: Allow limited lock stealing
  locking/pvqspinlock: Collect slowpath lock statistics
  sched/core, locking: Document Program-Order guarantees
  locking, sched: Introduce smp_cond_acquire() and use it
  locking/pvqspinlock, x86: Optimize the PV unlock code path
  locking/qspinlock: Avoid redundant read of next pointer
  locking/qspinlock: Prefetch the next node cacheline
  locking/qspinlock: Use _acquire/_release() versions of cmpxchg() & xchg()
  atomics: Add test for atomic operations with _relaxed variants
This commit is contained in: commit 24af98c4cf
20 changed files with 904 additions and 146 deletions (the kernel/locking qspinlock files are shown below)
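Several of the qspinlock changes below lean on the smp_cond_acquire() helper named in the shortlog ("locking, sched: Introduce smp_cond_acquire() and use it"). As introduced by this series, the helper is essentially:

#define smp_cond_acquire(cond)	do {		\
	while (!(cond))				\
		cpu_relax();			\
	smp_rmb(); /* ctrl + rmb := acquire */	\
} while (0)

The control dependency from the spinning load, combined with the trailing smp_rmb(), gives the loop exit acquire semantics without paying for a barrier on every iteration.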
kernel/locking/qspinlock.c
@@ -14,8 +14,9 @@
  * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
  * (C) Copyright 2013-2014 Red Hat, Inc.
  * (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
  *
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
  *          Peter Zijlstra <peterz@infradead.org>
  */
 
@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 {
	struct __qspinlock *l = (void *)lock;
 
-	return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+	/*
+	 * Use release semantics to make sure that the MCS node is properly
+	 * initialized before changing the tail code.
+	 */
+	return (u32)xchg_release(&l->tail,
+				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
 }
 
 #else /* _Q_PENDING_BITS == 8 */
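The hunk above swaps a fully ordered xchg() for xchg_release(): the MCS node must be completely initialized before it is published through the tail word. A minimal user-space sketch of the same publish pattern, using C11 atomics and illustrative types rather than the kernel's:

#include <stdatomic.h>
#include <stddef.h>

struct node {
	_Atomic(struct node *) next;
	int locked;
};

/* Publish a freshly initialized node as the new queue tail. The release
 * ordering on the exchange guarantees that any thread which observes the
 * new tail also observes the initialized fields written above it. */
static struct node *push_tail(_Atomic(struct node *) *tail, struct node *n)
{
	n->locked = 0;
	atomic_store_explicit(&n->next, NULL, memory_order_relaxed);
	return atomic_exchange_explicit(tail, n, memory_order_release);
}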
@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 
	for (;;) {
		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
-		old = atomic_cmpxchg(&lock->val, val, new);
+		/*
+		 * Use release semantics to make sure that the MCS node is
+		 * properly initialized before changing the tail code.
+		 */
+		old = atomic_cmpxchg_release(&lock->val, val, new);
		if (old == val)
			break;
 
@@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)
  */
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+					   struct mcs_spinlock *prev) { }
 static __always_inline void __pv_kick_node(struct qspinlock *lock,
					   struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
-					   struct mcs_spinlock *node) { }
+static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
+						   struct mcs_spinlock *node)
+						   { return 0; }
 
 #define pv_enabled()		false
 
 #define pv_init_node		__pv_init_node
 #define pv_wait_node		__pv_wait_node
 #define pv_kick_node		__pv_kick_node
-#define pv_wait_head		__pv_wait_head
+#define pv_wait_head_or_lock	__pv_wait_head_or_lock
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
@@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
		if (val == new)
			new |= _Q_PENDING_VAL;
 
-		old = atomic_cmpxchg(&lock->val, val, new);
+		/*
+		 * Acquire semantic is required here as the function may
+		 * return immediately if the lock was free.
+		 */
+		old = atomic_cmpxchg_acquire(&lock->val, val, new);
		if (old == val)
			break;
 
@@ -382,6 +398,7 @@ queue:
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
+	next = NULL;
 
	/*
	 * if there was a previous node; link it and wait until reaching the
@@ -391,8 +408,18 @@ queue:
		prev = decode_tail(old);
		WRITE_ONCE(prev->next, node);
 
-		pv_wait_node(node);
+		pv_wait_node(node, prev);
		arch_mcs_spin_lock_contended(&node->locked);
+
+		/*
+		 * While waiting for the MCS lock, the next pointer may have
+		 * been set by another lock waiter. We optimistically load
+		 * the next pointer & prefetch the cacheline for writing
+		 * to reduce latency in the upcoming MCS unlock operation.
+		 */
+		next = READ_ONCE(node->next);
+		if (next)
+			prefetchw(next);
	}
 
	/*
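The prefetchw() added above pulls the successor node's cacheline in exclusive state ahead of the unlock-time store to next->locked, hiding the read-for-ownership latency. Outside the kernel, roughly the same hint is available as a compiler builtin (GCC/Clang; the wrapper name here is ours):

/* Prefetch for write: rw = 1 requests the line in a writable state,
 * 3 asks for high temporal locality. Purely a performance hint. */
static inline void prefetch_for_write(void *p)
{
	__builtin_prefetch(p, 1, 3);
}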
@@ -406,11 +433,22 @@ queue:
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 *
+	 * The PV pv_wait_head_or_lock function, if active, will acquire
+	 * the lock and return a non-zero value. So we have to skip the
+	 * smp_load_acquire() call. As the next PV queue head hasn't been
+	 * designated yet, there is no way for the locked value to become
+	 * _Q_SLOW_VAL. So both the set_locked() and the
+	 * atomic_cmpxchg_relaxed() calls will be safe.
+	 *
+	 * If PV isn't active, 0 will be returned instead.
+	 *
	 */
-	pv_wait_head(lock, node);
-	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
-		cpu_relax();
+	if ((val = pv_wait_head_or_lock(lock, node)))
+		goto locked;
 
+	smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+
+locked:
	/*
	 * claim the lock:
	 *
@@ -422,11 +460,17 @@ queue:
	 * to grab the lock.
	 */
	for (;;) {
-		if (val != tail) {
+		/* In the PV case we might already have _Q_LOCKED_VAL set */
+		if ((val & _Q_TAIL_MASK) != tail) {
			set_locked(lock);
			break;
		}
-		old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+		/*
+		 * The smp_load_acquire() call above has provided the necessary
+		 * acquire semantics required for locking. At most two
+		 * iterations of this loop may be ran.
+		 */
+		old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
		if (old == val)
			goto release;	/* No contention */
 
@@ -434,10 +478,12 @@ queue:
	}
 
	/*
-	 * contended path; wait for next, release.
+	 * contended path; wait for next if not observed yet, release.
	 */
-	while (!(next = READ_ONCE(node->next)))
-		cpu_relax();
+	if (!next) {
+		while (!(next = READ_ONCE(node->next)))
+			cpu_relax();
+	}
 
	arch_mcs_spin_unlock_contended(&next->locked);
	pv_kick_node(lock, next);
@@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
 #undef pv_init_node
 #undef pv_wait_node
 #undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
 
 #undef queued_spin_lock_slowpath
 #define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath
kernel/locking/qspinlock_paravirt.h
@@ -22,6 +22,20 @@
 
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
+/*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guest with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK	0xff
+
 /*
  * Queue node uses: vcpu_running & vcpu_halted.
  * Queue head uses: vcpu_running & vcpu_hashed.
@@ -40,6 +54,94 @@ struct pv_node {
	u8			state;
 };
 
+/*
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enter the PV slowpath before
+ * being queued. By allowing one lock stealing attempt here when the pending
+ * bit is off, it helps to reduce the performance impact of lock waiter
+ * preemption without the drawback of lock starvation.
+ */
+#define queued_spin_trylock(l)	pv_queued_spin_steal_lock(l)
+static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+		(cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->pending, 1);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->pending, 0);
+}
+
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
+ * just to be sure that it will get it.
+ */
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	return !READ_ONCE(l->locked) &&
+	       (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
+			== _Q_PENDING_VAL);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+	atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+	atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+	int val = atomic_read(&lock->val);
+
+	for (;;) {
+		int old, new;
+
+		if (val & _Q_LOCKED_MASK)
+			break;
+
+		/*
+		 * Try to clear pending bit & set locked bit
+		 */
+		old = val;
+		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+		val = atomic_cmpxchg(&lock->val, old, new);
+
+		if (val == old)
+			return 1;
+	}
+	return 0;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
 /*
  * Lock and MCS node addresses hash table for fast lookup
  *
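The helpers above implement the "one steal attempt at slowpath entry" policy described in the comment: stealing is only tried while the pending bit is clear, so the queue head can shut it off. Reduced to portable C11 with invented flag values, the steal itself is a guarded compare-and-swap that also provides acquire ordering on success:

#include <stdatomic.h>
#include <stdbool.h>

#define LOCKED	0x01	/* illustrative, not the kernel's constants */
#define PENDING	0x02

/* Single steal attempt: bail out if the lock is held or stealing is
 * disabled (pending set); otherwise try to grab the whole word. */
static bool try_steal_lock(_Atomic unsigned *val)
{
	unsigned expected = 0;

	if (atomic_load_explicit(val, memory_order_relaxed) & (LOCKED | PENDING))
		return false;
	return atomic_compare_exchange_strong_explicit(val, &expected, LOCKED,
						       memory_order_acquire,
						       memory_order_relaxed);
}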
@@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 {
	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
	struct pv_hash_entry *he;
+	int hopcnt = 0;
 
	for_each_hash_entry(he, offset, hash) {
+		hopcnt++;
		if (!cmpxchg(&he->lock, NULL, lock)) {
			WRITE_ONCE(he->node, node);
+			qstat_hop(hopcnt);
			return &he->lock;
		}
	}
@@ -143,6 +248,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
	BUG();
 }
 
+/*
+ * Return true if when it is time to check the previous node which is not
+ * in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+
+	if ((loop & PV_PREV_CHECK_MASK) != 0)
+		return false;
+
+	return READ_ONCE(prev->state) != vcpu_running;
+}
+
 /*
  * Initialize the PV part of the mcs_spinlock node.
  */
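pv_wait_early() above samples the previous node's state only when the loop counter hits a multiple of PV_PREV_CHECK_MASK + 1 (every 256 iterations), so the spinner does not hammer prev's cacheline. A self-contained sketch of that polling structure, with illustrative names and states:

#include <stdatomic.h>
#include <stdbool.h>

#define PREV_CHECK_MASK	0xff	/* look at prev every 256 iterations */

enum vcpu_state { VCPU_RUNNING, VCPU_HALTED, VCPU_HASHED };

/* Spin on our own MCS flag; bail out early if the previous waiter's
 * vCPU stops running. Returns true when the MCS lock was handed over,
 * false when we gave up (wait-early or threshold exhausted). */
static bool spin_with_wait_early(_Atomic int *locked,
				 _Atomic int *prev_state, int threshold)
{
	for (int loop = threshold; loop; loop--) {
		if (atomic_load_explicit(locked, memory_order_acquire))
			return true;
		if ((loop & PREV_CHECK_MASK) == 0 &&
		    atomic_load_explicit(prev_state,
					 memory_order_relaxed) != VCPU_RUNNING)
			return false;
	}
	return false;
}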
@@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)
  * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
  * behalf.
  */
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 {
	struct pv_node *pn = (struct pv_node *)node;
+	struct pv_node *pp = (struct pv_node *)prev;
+	int waitcnt = 0;
	int loop;
+	bool wait_early;
 
-	for (;;) {
-		for (loop = SPIN_THRESHOLD; loop; loop--) {
+	/* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
+	for (;; waitcnt++) {
+		for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
			if (READ_ONCE(node->locked))
				return;
+			if (pv_wait_early(pp, loop)) {
+				wait_early = true;
+				break;
+			}
			cpu_relax();
		}
 
@@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node)
		 */
		smp_store_mb(pn->state, vcpu_halted);
 
-		if (!READ_ONCE(node->locked))
+		if (!READ_ONCE(node->locked)) {
+			qstat_inc(qstat_pv_wait_node, true);
+			qstat_inc(qstat_pv_wait_again, waitcnt);
+			qstat_inc(qstat_pv_wait_early, wait_early);
			pv_wait(&pn->state, vcpu_halted);
+		}
 
		/*
-		 * If pv_kick_node() changed us to vcpu_hashed, retain that value
-		 * so that pv_wait_head() knows to not also try to hash this lock.
+		 * If pv_kick_node() changed us to vcpu_hashed, retain that
+		 * value so that pv_wait_head_or_lock() knows to not also try
+		 * to hash this lock.
		 */
		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
@@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
		 * So it is better to spin for a while in the hope that the
		 * MCS lock will be released soon.
		 */
+		qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
	}
 
	/*
@@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
 /*
  * Called after setting next->locked = 1 when we're the lock owner.
  *
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
  */
 static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
@@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 }
 
 /*
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
  * __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
  */
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 {
	struct pv_node *pn = (struct pv_node *)node;
	struct __qspinlock *l = (void *)lock;
	struct qspinlock **lp = NULL;
+	int waitcnt = 0;
	int loop;
 
	/*
@@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
	if (READ_ONCE(pn->state) == vcpu_hashed)
		lp = (struct qspinlock **)1;
 
-	for (;;) {
+	for (;; waitcnt++) {
+		/*
+		 * Set correct vCPU state to be used by queue node wait-early
+		 * mechanism.
+		 */
+		WRITE_ONCE(pn->state, vcpu_running);
+
+		/*
+		 * Set the pending bit in the active lock spinning loop to
+		 * disable lock stealing before attempting to acquire the lock.
+		 */
+		set_pending(lock);
		for (loop = SPIN_THRESHOLD; loop; loop--) {
-			if (!READ_ONCE(l->locked))
-				return;
+			if (trylock_clear_pending(lock))
+				goto gotlock;
			cpu_relax();
		}
+		clear_pending(lock);
+
 
		if (!lp) { /* ONCE */
			lp = pv_hash(lock, pn);
@@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
			 *
			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
			 */
-			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+			if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
				/*
-				 * The lock is free and _Q_SLOW_VAL has never
-				 * been set. Therefore we need to unhash before
-				 * getting the lock.
+				 * The lock was free and now we own the lock.
+				 * Change the lock value back to _Q_LOCKED_VAL
+				 * and unhash the table.
				 */
+				WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
				WRITE_ONCE(*lp, NULL);
-				return;
+				goto gotlock;
			}
		}
+		WRITE_ONCE(pn->state, vcpu_halted);
+		qstat_inc(qstat_pv_wait_head, true);
+		qstat_inc(qstat_pv_wait_again, waitcnt);
		pv_wait(&l->locked, _Q_SLOW_VAL);
 
		/*
		 * The unlocker should have freed the lock before kicking the
		 * CPU. So if the lock is still not free, it is a spurious
-		 * wakeup and so the vCPU should wait again after spinning for
-		 * a while.
+		 * wakeup or another vCPU has stolen the lock. The current
+		 * vCPU should spin again.
		 */
+		qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
	}
 
	/*
-	 * Lock is unlocked now; the caller will acquire it without waiting.
-	 * As with pv_wait_node() we rely on the caller to do a load-acquire
-	 * for us.
+	 * The cmpxchg() or xchg() call before coming here provides the
+	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+	 * here is to indicate to the compiler that the value will always
+	 * be nozero to enable better code optimization.
	 */
+gotlock:
+	return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
 }
 
 /*
- * PV version of the unlock function to be used in stead of
- * queued_spin_unlock().
+ * PV versions of the unlock fastpath and slowpath functions to be used
+ * instead of queued_spin_unlock().
  */
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 {
	struct __qspinlock *l = (void *)lock;
	struct pv_node *node;
-	u8 locked;
-
-	/*
-	 * We must not unlock if SLOW, because in that case we must first
-	 * unhash. Otherwise it would be possible to have multiple @lock
-	 * entries, which would be BAD.
-	 */
-	locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
-	if (likely(locked == _Q_LOCKED_VAL))
-		return;
 
	if (unlikely(locked != _Q_SLOW_VAL)) {
		WARN(!debug_locks_silent,
@@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
	 * so we need a barrier to order the read of the node data in
	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
	 *
-	 * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+	 * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
	 */
	smp_rmb();
 
@@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
	 * vCPU is harmless other than the additional latency in completing
	 * the unlock.
	 */
+	qstat_inc(qstat_pv_kick_unlock, true);
	pv_kick(node->cpu);
 }
 
 /*
  * Include the architecture specific callee-save thunk of the
  * __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() near the top of the file to make sure
- * that the callee-save thunk and the real unlock function are close
- * to each other sharing consecutive instruction cachelines.
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
  */
 #include <asm/qspinlock_paravirt.h>
 
+#ifndef __pv_queued_spin_unlock
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+	u8 locked;
+
+	/*
+	 * We must not unlock if SLOW, because in that case we must first
+	 * unhash. Otherwise it would be possible to have multiple @lock
+	 * entries, which would be BAD.
+	 */
+	locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+	if (likely(locked == _Q_LOCKED_VAL))
+		return;
+
+	__pv_queued_spin_unlock_slowpath(lock, locked);
+}
+#endif /* __pv_queued_spin_unlock */
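The restructuring above splits the PV unlock into a tiny cmpxchg fastpath and an out-of-line slowpath that receives the lock byte it observed; keeping the fastpath minimal is what makes the x86 callee-save thunk (or a hand-written asm version) worthwhile. The shape of the split as a stand-alone C11 sketch, with invented constants:

#include <stdatomic.h>

#define LOCKED_VAL	1	/* illustrative values only */
#define SLOW_VAL	3

/* Out-of-line slow part: in the kernel this unhashes the lock and
 * kicks the queue-head vCPU. */
static void unlock_slowpath(_Atomic unsigned char *locked, unsigned char old)
{
	(void)locked;
	(void)old;
}

static void unlock(_Atomic unsigned char *locked)
{
	unsigned char expected = LOCKED_VAL;

	/* Common case: one release cmpxchg from LOCKED_VAL to 0. */
	if (atomic_compare_exchange_strong_explicit(locked, &expected, 0,
						    memory_order_release,
						    memory_order_relaxed))
		return;
	/* Rare case: the byte held SLOW_VAL; go slow with the value we
	 * observed, mirroring __pv_queued_spin_unlock(). */
	unlock_slowpath(locked, expected);
}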
kernel/locking/qspinlock_stat.h (new file, 300 lines)
@@ -0,0 +1,300 @@
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Authors: Waiman Long <waiman.long@hpe.com>
 */

/*
 * When queued spinlock statistical counters are enabled, the following
 * debugfs files will be created for reporting the counter values:
 *
 * <debugfs>/qlockstat/
 *   pv_hash_hops	- average # of hops per hashing operation
 *   pv_kick_unlock	- # of vCPU kicks issued at unlock time
 *   pv_kick_wake	- # of vCPU kicks used for computing pv_latency_wake
 *   pv_latency_kick	- average latency (ns) of vCPU kick operation
 *   pv_latency_wake	- average latency (ns) from vCPU kick to wakeup
 *   pv_lock_stealing	- # of lock stealing operations
 *   pv_spurious_wakeup	- # of spurious wakeups
 *   pv_wait_again	- # of vCPU wait's that happened after a vCPU kick
 *   pv_wait_early	- # of early vCPU wait's
 *   pv_wait_head	- # of vCPU wait's at the queue head
 *   pv_wait_node	- # of vCPU wait's at a non-head queue node
 *
 * Writing to the "reset_counters" file will reset all the above counter
 * values.
 *
 * These statistical counters are implemented as per-cpu variables which are
 * summed and computed whenever the corresponding debugfs files are read. This
 * minimizes added overhead making the counters usable even in a production
 * environment.
 *
 * There may be slight difference between pv_kick_wake and pv_kick_unlock.
 */
enum qlock_stats {
	qstat_pv_hash_hops,
	qstat_pv_kick_unlock,
	qstat_pv_kick_wake,
	qstat_pv_latency_kick,
	qstat_pv_latency_wake,
	qstat_pv_lock_stealing,
	qstat_pv_spurious_wakeup,
	qstat_pv_wait_again,
	qstat_pv_wait_early,
	qstat_pv_wait_head,
	qstat_pv_wait_node,
	qstat_num,	/* Total number of statistical counters */
	qstat_reset_cnts = qstat_num,
};

#ifdef CONFIG_QUEUED_LOCK_STAT
/*
 * Collect pvqspinlock statistics
 */
#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/fs.h>

static const char * const qstat_names[qstat_num + 1] = {
	[qstat_pv_hash_hops]	   = "pv_hash_hops",
	[qstat_pv_kick_unlock]     = "pv_kick_unlock",
	[qstat_pv_kick_wake]       = "pv_kick_wake",
	[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
	[qstat_pv_latency_kick]	   = "pv_latency_kick",
	[qstat_pv_latency_wake]    = "pv_latency_wake",
	[qstat_pv_lock_stealing]   = "pv_lock_stealing",
	[qstat_pv_wait_again]      = "pv_wait_again",
	[qstat_pv_wait_early]      = "pv_wait_early",
	[qstat_pv_wait_head]       = "pv_wait_head",
	[qstat_pv_wait_node]       = "pv_wait_node",
	[qstat_reset_cnts]         = "reset_counters",
};

/*
 * Per-cpu counters
 */
static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
static DEFINE_PER_CPU(u64, pv_kick_time);

/*
 * Function to read and return the qlock statistical counter values
 *
 * The following counters are handled specially:
 * 1. qstat_pv_latency_kick
 *    Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
 * 2. qstat_pv_latency_wake
 *    Average wake latency (ns) = pv_latency_wake/pv_kick_wake
 * 3. qstat_pv_hash_hops
 *    Average hops/hash = pv_hash_hops/pv_kick_unlock
 */
static ssize_t qstat_read(struct file *file, char __user *user_buf,
			  size_t count, loff_t *ppos)
{
	char buf[64];
	int cpu, counter, len;
	u64 stat = 0, kicks = 0;

	/*
	 * Get the counter ID stored in file->f_inode->i_private
	 */
	if (!file->f_inode) {
		WARN_ON_ONCE(1);
		return -EBADF;
	}
	counter = (long)(file->f_inode->i_private);

	if (counter >= qstat_num)
		return -EBADF;

	for_each_possible_cpu(cpu) {
		stat += per_cpu(qstats[counter], cpu);
		/*
		 * Need to sum additional counter for some of them
		 */
		switch (counter) {

		case qstat_pv_latency_kick:
		case qstat_pv_hash_hops:
			kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
			break;

		case qstat_pv_latency_wake:
			kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
			break;
		}
	}

	if (counter == qstat_pv_hash_hops) {
		u64 frac;

		frac = 100ULL * do_div(stat, kicks);
		frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);

		/*
		 * Return a X.XX decimal number
		 */
		len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
	} else {
		/*
		 * Round to the nearest ns
		 */
		if ((counter == qstat_pv_latency_kick) ||
		    (counter == qstat_pv_latency_wake)) {
			stat = 0;
			if (kicks)
				stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
		}
		len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
	}

	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

/*
 * Function to handle write request
 *
 * When counter = reset_cnts, reset all the counter values.
 * Since the counter updates aren't atomic, the resetting is done twice
 * to make sure that the counters are very likely to be all cleared.
 */
static ssize_t qstat_write(struct file *file, const char __user *user_buf,
			   size_t count, loff_t *ppos)
{
	int cpu;

	/*
	 * Get the counter ID stored in file->f_inode->i_private
	 */
	if (!file->f_inode) {
		WARN_ON_ONCE(1);
		return -EBADF;
	}
	if ((long)(file->f_inode->i_private) != qstat_reset_cnts)
		return count;

	for_each_possible_cpu(cpu) {
		int i;
		unsigned long *ptr = per_cpu_ptr(qstats, cpu);

		for (i = 0 ; i < qstat_num; i++)
			WRITE_ONCE(ptr[i], 0);
		for (i = 0 ; i < qstat_num; i++)
			WRITE_ONCE(ptr[i], 0);
	}
	return count;
}

/*
 * Debugfs data structures
 */
static const struct file_operations fops_qstat = {
	.read = qstat_read,
	.write = qstat_write,
	.llseek = default_llseek,
};

/*
 * Initialize debugfs for the qspinlock statistical counters
 */
static int __init init_qspinlock_stat(void)
{
	struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
	int i;

	if (!d_qstat) {
		pr_warn("Could not create 'qlockstat' debugfs directory\n");
		return 0;
	}

	/*
	 * Create the debugfs files
	 *
	 * As reading from and writing to the stat files can be slow, only
	 * root is allowed to do the read/write to limit impact to system
	 * performance.
	 */
	for (i = 0; i < qstat_num; i++)
		debugfs_create_file(qstat_names[i], 0400, d_qstat,
				    (void *)(long)i, &fops_qstat);

	debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
			    (void *)(long)qstat_reset_cnts, &fops_qstat);
	return 0;
}
fs_initcall(init_qspinlock_stat);

/*
 * Increment the PV qspinlock statistical counters
 */
static inline void qstat_inc(enum qlock_stats stat, bool cond)
{
	if (cond)
		this_cpu_inc(qstats[stat]);
}

/*
 * PV hash hop count
 */
static inline void qstat_hop(int hopcnt)
{
	this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
}

/*
 * Replacement function for pv_kick()
 */
static inline void __pv_kick(int cpu)
{
	u64 start = sched_clock();

	per_cpu(pv_kick_time, cpu) = start;
	pv_kick(cpu);
	this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
}

/*
 * Replacement function for pv_wait()
 */
static inline void __pv_wait(u8 *ptr, u8 val)
{
	u64 *pkick_time = this_cpu_ptr(&pv_kick_time);

	*pkick_time = 0;
	pv_wait(ptr, val);
	if (*pkick_time) {
		this_cpu_add(qstats[qstat_pv_latency_wake],
			     sched_clock() - *pkick_time);
		qstat_inc(qstat_pv_kick_wake, true);
	}
}

#define pv_kick(c)	__pv_kick(c)
#define pv_wait(p, v)	__pv_wait(p, v)

/*
 * PV unfair trylock count tracking function
 */
static inline int qstat_spin_steal_lock(struct qspinlock *lock)
{
	int ret = pv_queued_spin_steal_lock(lock);

	qstat_inc(qstat_pv_lock_stealing, ret);
	return ret;
}
#undef  queued_spin_trylock
#define queued_spin_trylock(l)	qstat_spin_steal_lock(l)

#else /* CONFIG_QUEUED_LOCK_STAT */

static inline void qstat_inc(enum qlock_stats stat, bool cond)	{ }
static inline void qstat_hop(int hopcnt)			{ }

#endif /* CONFIG_QUEUED_LOCK_STAT */