Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking updates from Ingo Molnar:
 "So we have a laundry list of locking subsystem changes:

   - continuing barrier API and code improvements

   - futex enhancements

   - atomics API improvements

   - pvqspinlock enhancements: in particular lock stealing and adaptive
     spinning

   - qspinlock micro-enhancements"

* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op
  futex: Cleanup the goto confusion in requeue_pi()
  futex: Remove pointless put_pi_state calls in requeue()
  futex: Document pi_state refcounting in requeue code
  futex: Rename free_pi_state() to put_pi_state()
  futex: Drop refcount if requeue_pi() acquired the rtmutex
  locking/barriers, arch: Remove ambiguous statement in the smp_store_mb() documentation
  locking/barriers, arch: Use smp barriers in smp_store_release()
  locking/cmpxchg, arch: Remove tas() definitions
  locking/pvqspinlock: Queue node adaptive spinning
  locking/pvqspinlock: Allow limited lock stealing
  locking/pvqspinlock: Collect slowpath lock statistics
  sched/core, locking: Document Program-Order guarantees
  locking, sched: Introduce smp_cond_acquire() and use it
  locking/pvqspinlock, x86: Optimize the PV unlock code path
  locking/qspinlock: Avoid redundant read of next pointer
  locking/qspinlock: Prefetch the next node cacheline
  locking/qspinlock: Use _acquire/_release() versions of cmpxchg() & xchg()
  atomics: Add test for atomic operations with _relaxed variants
This commit is contained in:
Linus Torvalds 2016-01-11 14:18:38 -08:00
commit 24af98c4cf
20 changed files with 904 additions and 146 deletions

View file

@ -14,8 +14,9 @@
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
* (C) Copyright 2013-2014 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp.
* (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
*
* Authors: Waiman Long <waiman.long@hp.com>
* Authors: Waiman Long <waiman.long@hpe.com>
* Peter Zijlstra <peterz@infradead.org>
*/
@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
struct __qspinlock *l = (void *)lock;
return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
/*
* Use release semantics to make sure that the MCS node is properly
* initialized before changing the tail code.
*/
return (u32)xchg_release(&l->tail,
tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
#else /* _Q_PENDING_BITS == 8 */
@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
for (;;) {
new = (val & _Q_LOCKED_PENDING_MASK) | tail;
old = atomic_cmpxchg(&lock->val, val, new);
/*
* Use release semantics to make sure that the MCS node is
* properly initialized before changing the tail code.
*/
old = atomic_cmpxchg_release(&lock->val, val, new);
if (old == val)
break;
@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)
*/
static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_head(struct qspinlock *lock,
struct mcs_spinlock *node) { }
static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
struct mcs_spinlock *node)
{ return 0; }
#define pv_enabled() false
#define pv_init_node __pv_init_node
#define pv_wait_node __pv_wait_node
#define pv_kick_node __pv_kick_node
#define pv_wait_head __pv_wait_head
#define pv_wait_head_or_lock __pv_wait_head_or_lock
#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
if (val == new)
new |= _Q_PENDING_VAL;
old = atomic_cmpxchg(&lock->val, val, new);
/*
* Acquire semantic is required here as the function may
* return immediately if the lock was free.
*/
old = atomic_cmpxchg_acquire(&lock->val, val, new);
if (old == val)
break;
@ -382,6 +398,7 @@ queue:
* p,*,* -> n,*,*
*/
old = xchg_tail(lock, tail);
next = NULL;
/*
* if there was a previous node; link it and wait until reaching the
@ -391,8 +408,18 @@ queue:
prev = decode_tail(old);
WRITE_ONCE(prev->next, node);
pv_wait_node(node);
pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(&node->locked);
/*
* While waiting for the MCS lock, the next pointer may have
* been set by another lock waiter. We optimistically load
* the next pointer & prefetch the cacheline for writing
* to reduce latency in the upcoming MCS unlock operation.
*/
next = READ_ONCE(node->next);
if (next)
prefetchw(next);
}
/*
@ -406,11 +433,22 @@ queue:
* sequentiality; this is because the set_locked() function below
* does not imply a full barrier.
*
* The PV pv_wait_head_or_lock function, if active, will acquire
* the lock and return a non-zero value. So we have to skip the
* smp_load_acquire() call. As the next PV queue head hasn't been
* designated yet, there is no way for the locked value to become
* _Q_SLOW_VAL. So both the set_locked() and the
* atomic_cmpxchg_relaxed() calls will be safe.
*
* If PV isn't active, 0 will be returned instead.
*
*/
pv_wait_head(lock, node);
while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
cpu_relax();
if ((val = pv_wait_head_or_lock(lock, node)))
goto locked;
smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
locked:
/*
* claim the lock:
*
@ -422,11 +460,17 @@ queue:
* to grab the lock.
*/
for (;;) {
if (val != tail) {
/* In the PV case we might already have _Q_LOCKED_VAL set */
if ((val & _Q_TAIL_MASK) != tail) {
set_locked(lock);
break;
}
old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
/*
* The smp_load_acquire() call above has provided the necessary
* acquire semantics required for locking. At most two
* iterations of this loop may be run.
*/
old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
if (old == val)
goto release; /* No contention */
@ -434,10 +478,12 @@ queue:
}
/*
* contended path; wait for next, release.
* contended path; wait for next if not observed yet, release.
*/
while (!(next = READ_ONCE(node->next)))
cpu_relax();
if (!next) {
while (!(next = READ_ONCE(node->next)))
cpu_relax();
}
arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next);
@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
#undef pv_wait_head
#undef pv_wait_head_or_lock
#undef queued_spin_lock_slowpath
#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath

View file

@ -22,6 +22,20 @@
#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
/*
* Queue Node Adaptive Spinning
*
* A queue node vCPU will stop spinning if the vCPU in the previous node is
* not running. The one lock stealing attempt allowed at slowpath entry
* mitigates the slight slowdown for non-overcommitted guest with this
* aggressive wait-early mechanism.
*
* The status of the previous node will be checked at fixed interval
* controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
* pound on the cacheline of the previous node too heavily.
*/
#define PV_PREV_CHECK_MASK 0xff
/*
* Queue node uses: vcpu_running & vcpu_halted.
* Queue head uses: vcpu_running & vcpu_hashed.
@ -40,6 +54,94 @@ struct pv_node {
u8 state;
};
/*
* By replacing the regular queued_spin_trylock() with the function below,
* it will be called once when a lock waiter enter the PV slowpath before
* being queued. By allowing one lock stealing attempt here when the pending
* bit is off, it helps to reduce the performance impact of lock waiter
* preemption without the drawback of lock starvation.
*/
#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
(cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
}
/*
* The pending bit is used by the queue head vCPU to indicate that it
* is actively spinning on the lock and no lock stealing is allowed.
*/
#if _Q_PENDING_BITS == 8
/*
 * Store 1 to the pending field of the struct __qspinlock view of @lock
 * (variant used when _Q_PENDING_BITS == 8).
 */
static __always_inline void set_pending(struct qspinlock *lock)
{
	WRITE_ONCE(((struct __qspinlock *)lock)->pending, 1);
}
/*
 * Store 0 to the pending field of the struct __qspinlock view of @lock
 * (variant used when _Q_PENDING_BITS == 8).
 */
static __always_inline void clear_pending(struct qspinlock *lock)
{
	WRITE_ONCE(((struct __qspinlock *)lock)->pending, 0);
}
/*
 * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
 * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
 * just to be sure that it will get it.
 *
 * The cmpxchg() moves locked_pending from _Q_PENDING_VAL to _Q_LOCKED_VAL,
 * i.e. it clears pending and sets locked in one step. It is skipped when
 * the locked field is already non-zero.
 *
 * Returns 1 if the lock was acquired, 0 otherwise.
 */
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
	struct __qspinlock *l = (void *)lock;

	if (READ_ONCE(l->locked))
		return 0;

	return cmpxchg(&l->locked_pending, _Q_PENDING_VAL,
		       _Q_LOCKED_VAL) == _Q_PENDING_VAL;
}
#else /* _Q_PENDING_BITS == 8 */
/*
 * Atomically OR _Q_PENDING_VAL into the lock word (variant used when
 * _Q_PENDING_BITS != 8).
 *
 * atomic_or() is called directly instead of going through the legacy
 * atomic_set_mask() wrapper.
 */
static __always_inline void set_pending(struct qspinlock *lock)
{
	atomic_or(_Q_PENDING_VAL, &lock->val);
}
/*
 * Atomically clear the _Q_PENDING_VAL bit in the lock word (variant used
 * when _Q_PENDING_BITS != 8).
 *
 * atomic_andnot() is called directly instead of going through the legacy
 * atomic_clear_mask() wrapper.
 */
static __always_inline void clear_pending(struct qspinlock *lock)
{
	atomic_andnot(_Q_PENDING_VAL, &lock->val);
}
/*
 * Try to take the lock by clearing the pending bit and setting the locked
 * bit in a single atomic_cmpxchg() on the whole lock word (variant used
 * when _Q_PENDING_BITS != 8).
 *
 * The cmpxchg is retried with the freshly observed value for as long as
 * the locked bits of that value are clear.
 *
 * Returns 1 if the lock was acquired, 0 if the locked bits were seen set.
 */
static __always_inline int trylock_clear_pending(struct qspinlock *lock)
{
	int cur = atomic_read(&lock->val);

	while (!(cur & _Q_LOCKED_MASK)) {
		int expected = cur;
		int desired = (cur & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;

		cur = atomic_cmpxchg(&lock->val, expected, desired);
		if (cur == expected)
			return 1;
	}

	return 0;
}
#endif /* _Q_PENDING_BITS == 8 */
/*
* Include queued spinlock statistics code
*/
#include "qspinlock_stat.h"
/*
* Lock and MCS node addresses hash table for fast lookup
*
@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
{
unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
struct pv_hash_entry *he;
int hopcnt = 0;
for_each_hash_entry(he, offset, hash) {
hopcnt++;
if (!cmpxchg(&he->lock, NULL, lock)) {
WRITE_ONCE(he->node, node);
qstat_hop(hopcnt);
return &he->lock;
}
}
@ -143,6 +248,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
BUG();
}
/*
 * Return true when it is time to check the previous node (i.e. the low
 * bits of @loop selected by PV_PREV_CHECK_MASK are all zero) and that
 * node is not in the vcpu_running state.
 *
 * The previous node's state is only read on iterations where the mask
 * test passes.
 */
static inline bool
pv_wait_early(struct pv_node *prev, int loop)
{
	return !(loop & PV_PREV_CHECK_MASK) &&
	       READ_ONCE(prev->state) != vcpu_running;
}
/*
* Initialize the PV part of the mcs_spinlock node.
*/
@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)
* pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
* behalf.
*/
static void pv_wait_node(struct mcs_spinlock *node)
static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
struct pv_node *pp = (struct pv_node *)prev;
int waitcnt = 0;
int loop;
bool wait_early;
for (;;) {
for (loop = SPIN_THRESHOLD; loop; loop--) {
/* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
for (;; waitcnt++) {
for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
if (READ_ONCE(node->locked))
return;
if (pv_wait_early(pp, loop)) {
wait_early = true;
break;
}
cpu_relax();
}
@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node)
*/
smp_store_mb(pn->state, vcpu_halted);
if (!READ_ONCE(node->locked))
if (!READ_ONCE(node->locked)) {
qstat_inc(qstat_pv_wait_node, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
qstat_inc(qstat_pv_wait_early, wait_early);
pv_wait(&pn->state, vcpu_halted);
}
/*
* If pv_kick_node() changed us to vcpu_hashed, retain that value
* so that pv_wait_head() knows to not also try to hash this lock.
* If pv_kick_node() changed us to vcpu_hashed, retain that
* value so that pv_wait_head_or_lock() knows to not also try
* to hash this lock.
*/
cmpxchg(&pn->state, vcpu_halted, vcpu_running);
@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
* So it is better to spin for a while in the hope that the
* MCS lock will be released soon.
*/
qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
}
/*
@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
/*
* Called after setting next->locked = 1 when we're the lock owner.
*
* Instead of waking the waiters stuck in pv_wait_node() advance their state such
* that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
* Instead of waking the waiters stuck in pv_wait_node() advance their state
* such that they're waiting in pv_wait_head_or_lock(), this avoids a
* wake/sleep cycle.
*/
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
}
/*
* Wait for l->locked to become clear; halt the vcpu after a short spin.
* Wait for l->locked to become clear and acquire the lock;
* halt the vcpu after a short spin.
* __pv_queued_spin_unlock() will wake us.
*
* The current value of the lock will be returned for additional processing.
*/
static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
static u32
pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
struct qspinlock **lp = NULL;
int waitcnt = 0;
int loop;
/*
@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
if (READ_ONCE(pn->state) == vcpu_hashed)
lp = (struct qspinlock **)1;
for (;;) {
for (;; waitcnt++) {
/*
* Set correct vCPU state to be used by queue node wait-early
* mechanism.
*/
WRITE_ONCE(pn->state, vcpu_running);
/*
* Set the pending bit in the active lock spinning loop to
* disable lock stealing before attempting to acquire the lock.
*/
set_pending(lock);
for (loop = SPIN_THRESHOLD; loop; loop--) {
if (!READ_ONCE(l->locked))
return;
if (trylock_clear_pending(lock))
goto gotlock;
cpu_relax();
}
clear_pending(lock);
if (!lp) { /* ONCE */
lp = pv_hash(lock, pn);
@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
*
* Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
/*
* The lock is free and _Q_SLOW_VAL has never
* been set. Therefore we need to unhash before
* getting the lock.
* The lock was free and now we own the lock.
* Change the lock value back to _Q_LOCKED_VAL
* and unhash the table.
*/
WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL);
return;
goto gotlock;
}
}
WRITE_ONCE(pn->state, vcpu_halted);
qstat_inc(qstat_pv_wait_head, true);
qstat_inc(qstat_pv_wait_again, waitcnt);
pv_wait(&l->locked, _Q_SLOW_VAL);
/*
* The unlocker should have freed the lock before kicking the
* CPU. So if the lock is still not free, it is a spurious
* wakeup and so the vCPU should wait again after spinning for
* a while.
* wakeup or another vCPU has stolen the lock. The current
* vCPU should spin again.
*/
qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
}
/*
* Lock is unlocked now; the caller will acquire it without waiting.
* As with pv_wait_node() we rely on the caller to do a load-acquire
* for us.
* The cmpxchg() or xchg() call before coming here provides the
* acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
* here is to indicate to the compiler that the value will always
* be nonzero to enable better code optimization.
*/
gotlock:
return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
}
/*
* PV version of the unlock function to be used in stead of
* queued_spin_unlock().
* PV versions of the unlock fastpath and slowpath functions to be used
* instead of queued_spin_unlock().
*/
__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
__visible void
__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
struct __qspinlock *l = (void *)lock;
struct pv_node *node;
u8 locked;
/*
* We must not unlock if SLOW, because in that case we must first
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
if (likely(locked == _Q_LOCKED_VAL))
return;
if (unlikely(locked != _Q_SLOW_VAL)) {
WARN(!debug_locks_silent,
@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* so we need a barrier to order the read of the node data in
* pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
*
* Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
* Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
*/
smp_rmb();
@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
* vCPU is harmless other than the additional latency in completing
* the unlock.
*/
qstat_inc(qstat_pv_kick_unlock, true);
pv_kick(node->cpu);
}
/*
* Include the architecture specific callee-save thunk of the
* __pv_queued_spin_unlock(). This thunk is put together with
* __pv_queued_spin_unlock() near the top of the file to make sure
* that the callee-save thunk and the real unlock function are close
* to each other sharing consecutive instruction cachelines.
* __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
* function close to each other sharing consecutive instruction cachelines.
* Alternatively, architecture specific version of __pv_queued_spin_unlock()
* can be defined.
*/
#include <asm/qspinlock_paravirt.h>
#ifndef __pv_queued_spin_unlock
/*
 * Generic PV unlock fastpath: try to release the lock with one cmpxchg()
 * and hand over to __pv_queued_spin_unlock_slowpath() when the locked
 * field did not hold _Q_LOCKED_VAL.
 */
__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
	struct __qspinlock *l = (void *)lock;
	u8 prev;

	/*
	 * We must not unlock if SLOW, because in that case we must first
	 * unhash. Otherwise it would be possible to have multiple @lock
	 * entries, which would be BAD.
	 */
	prev = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
	if (unlikely(prev != _Q_LOCKED_VAL))
		__pv_queued_spin_unlock_slowpath(lock, prev);
}
#endif /* __pv_queued_spin_unlock */

View file

@ -0,0 +1,300 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* Authors: Waiman Long <waiman.long@hpe.com>
*/
/*
* When queued spinlock statistical counters are enabled, the following
* debugfs files will be created for reporting the counter values:
*
* <debugfs>/qlockstat/
* pv_hash_hops - average # of hops per hashing operation
* pv_kick_unlock - # of vCPU kicks issued at unlock time
* pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
* pv_latency_kick - average latency (ns) of vCPU kick operation
* pv_latency_wake - average latency (ns) from vCPU kick to wakeup
* pv_lock_stealing - # of lock stealing operations
* pv_spurious_wakeup - # of spurious wakeups
* pv_wait_again - # of vCPU wait's that happened after a vCPU kick
* pv_wait_early - # of early vCPU wait's
* pv_wait_head - # of vCPU wait's at the queue head
* pv_wait_node - # of vCPU wait's at a non-head queue node
*
* Writing to the "reset_counters" file will reset all the above counter
* values.
*
* These statistical counters are implemented as per-cpu variables which are
* summed and computed whenever the corresponding debugfs files are read. This
* minimizes added overhead making the counters usable even in a production
* environment.
*
* There may be a slight difference between pv_kick_wake and pv_kick_unlock.
*/
/*
 * Identifiers of the statistical counters. Each value is used as an index
 * into the per-cpu qstats[] array and into the qstat_names[] table, so the
 * enumerators must stay dense and start at 0.
 */
enum qlock_stats {
qstat_pv_hash_hops,		/* accumulated hops of hashing operations */
qstat_pv_kick_unlock,		/* # of vCPU kicks issued at unlock time */
qstat_pv_kick_wake,		/* # of kicks used for wake latency */
qstat_pv_latency_kick,		/* accumulated time spent in kick operations */
qstat_pv_latency_wake,		/* accumulated kick-to-wakeup time */
qstat_pv_lock_stealing,		/* # of lock stealing operations */
qstat_pv_spurious_wakeup,	/* # of spurious wakeups */
qstat_pv_wait_again,		/* # of waits that happened after a kick */
qstat_pv_wait_early,		/* # of early vCPU waits */
qstat_pv_wait_head,		/* # of waits at the queue head */
qstat_pv_wait_node,		/* # of waits at a non-head queue node */
qstat_num, /* Total number of statistical counters */
/* Pseudo counter ID of the write-only reset file; aliases qstat_num */
qstat_reset_cnts = qstat_num,
};
#ifdef CONFIG_QUEUED_LOCK_STAT
/*
* Collect pvqspinlock statistics
*/
#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/fs.h>
/*
 * debugfs file name of every counter, indexed by enum qlock_stats. The
 * extra slot at qstat_reset_cnts names the reset control file.
 */
static const char * const qstat_names[qstat_num + 1] = {
	[qstat_pv_hash_hops]		= "pv_hash_hops",
	[qstat_pv_kick_unlock]		= "pv_kick_unlock",
	[qstat_pv_kick_wake]		= "pv_kick_wake",
	[qstat_pv_latency_kick]		= "pv_latency_kick",
	[qstat_pv_latency_wake]		= "pv_latency_wake",
	[qstat_pv_lock_stealing]	= "pv_lock_stealing",
	[qstat_pv_spurious_wakeup]	= "pv_spurious_wakeup",
	[qstat_pv_wait_again]		= "pv_wait_again",
	[qstat_pv_wait_early]		= "pv_wait_early",
	[qstat_pv_wait_head]		= "pv_wait_head",
	[qstat_pv_wait_node]		= "pv_wait_node",
	[qstat_reset_cnts]		= "reset_counters",
};
/*
 * Per-cpu counters
 *
 * qstats[] holds one counter per enum qlock_stats value for each CPU.
 * pv_kick_time is set by __pv_kick() for the CPU being kicked and is
 * cleared and then read by __pv_wait() on the waiting CPU to compute the
 * kick-to-wakeup latency.
 */
static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
static DEFINE_PER_CPU(u64, pv_kick_time);
/*
 * Function to read and return the qlock statistical counter values
 *
 * The per-cpu values of the counter selected by the file's i_private are
 * summed over all possible CPUs and formatted as a decimal string.
 *
 * The following counters are handled specially:
 * 1. qstat_pv_latency_kick
 *	Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
 * 2. qstat_pv_latency_wake
 *	Average wake latency (ns) = pv_latency_wake/pv_kick_wake
 * 3. qstat_pv_hash_hops
 *	Average hops/hash = pv_hash_hops/pv_kick_unlock
 *
 * A zero divisor (no kicks recorded yet) is never divided by: the latency
 * files then report 0 and pv_hash_hops reports the raw hop total with a
 * ".00" fraction.
 *
 * Returns -EBADF if the file has no inode or carries an out-of-range
 * counter ID, otherwise the result of simple_read_from_buffer().
 */
static ssize_t qstat_read(struct file *file, char __user *user_buf,
			  size_t count, loff_t *ppos)
{
	char buf[64];
	int cpu, counter, len;
	u64 stat = 0, kicks = 0;

	/*
	 * Get the counter ID stored in file->f_inode->i_private
	 */
	if (!file->f_inode) {
		WARN_ON_ONCE(1);
		return -EBADF;
	}
	counter = (long)(file->f_inode->i_private);

	/* The ID indexes qstats[]; reject anything outside the array */
	if (counter < 0 || counter >= qstat_num)
		return -EBADF;

	for_each_possible_cpu(cpu) {
		stat += per_cpu(qstats[counter], cpu);
		/*
		 * Need to sum additional counter for some of them
		 */
		switch (counter) {

		case qstat_pv_latency_kick:
		case qstat_pv_hash_hops:
			kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
			break;

		case qstat_pv_latency_wake:
			kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
			break;
		}
	}

	if (counter == qstat_pv_hash_hops) {
		u64 frac = 0;

		if (kicks) {
			/*
			 * do_div() leaves the quotient in stat and returns
			 * the remainder, which is scaled to hundredths.
			 */
			frac = 100ULL * do_div(stat, kicks);
			frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);

			/*
			 * Rounding can yield 100 hundredths; carry into the
			 * integer part so that "X.100" is never printed.
			 */
			if (frac >= 100) {
				stat++;
				frac -= 100;
			}
		}

		/*
		 * Return a X.XX decimal number
		 */
		len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
	} else {
		/*
		 * Round to the nearest ns. The accumulated latency in stat
		 * must be kept as the dividend; only a zero divisor forces
		 * the result to 0.
		 */
		if ((counter == qstat_pv_latency_kick) ||
		    (counter == qstat_pv_latency_wake))
			stat = kicks ? DIV_ROUND_CLOSEST_ULL(stat, kicks) : 0;

		len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
	}

	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
/*
 * Write handler of the qlockstat debugfs files
 *
 * Only a write to the file whose ID is qstat_reset_cnts has an effect:
 * all counters of all possible CPUs are zeroed. Since the counter updates
 * aren't atomic, each CPU's array is cleared in two passes to make it
 * very likely that the counters all end up cleared. The written data is
 * never looked at.
 *
 * Returns -EBADF if the file has no inode, otherwise @count.
 */
static ssize_t qstat_write(struct file *file, const char __user *user_buf,
			   size_t count, loff_t *ppos)
{
	int cpu;

	/*
	 * Get the counter ID stored in file->f_inode->i_private
	 */
	if (!file->f_inode) {
		WARN_ON_ONCE(1);
		return -EBADF;
	}

	if ((long)(file->f_inode->i_private) == qstat_reset_cnts) {
		for_each_possible_cpu(cpu) {
			unsigned long *counters = per_cpu_ptr(qstats, cpu);
			int pass, idx;

			for (pass = 0; pass < 2; pass++)
				for (idx = 0; idx < qstat_num; idx++)
					WRITE_ONCE(counters[idx], 0);
		}
	}

	return count;
}
/*
 * File operations shared by every qlockstat debugfs file; the handlers
 * tell the files apart through the counter ID stored in i_private.
 */
static const struct file_operations fops_qstat = {
	.llseek		= default_llseek,
	.read		= qstat_read,
	.write		= qstat_write,
};
/*
 * Create the "qlockstat" debugfs directory and one file per counter
 *
 * Always returns 0; a failure to create the directory is only reported
 * with a warning.
 */
static int __init init_qspinlock_stat(void)
{
	struct dentry *dir;
	int id;

	dir = debugfs_create_dir("qlockstat", NULL);
	if (!dir) {
		pr_warn("Could not create 'qlockstat' debugfs directory\n");
		return 0;
	}

	/*
	 * As reading from and writing to the stat files can be slow, only
	 * root is allowed to do the read/write to limit impact to system
	 * performance.
	 *
	 * The counter files are read-only (0400). The final iteration,
	 * id == qstat_reset_cnts == qstat_num, creates the write-only
	 * (0200) reset file.
	 */
	for (id = 0; id <= qstat_num; id++)
		debugfs_create_file(qstat_names[id],
				    id == qstat_reset_cnts ? 0200 : 0400,
				    dir, (void *)(long)id, &fops_qstat);

	return 0;
}
fs_initcall(init_qspinlock_stat);
/*
 * Conditionally bump a PV qspinlock statistical counter
 *
 * This CPU's instance of counter @stat is incremented by one when @cond is
 * true; nothing happens otherwise.
 */
static inline void qstat_inc(enum qlock_stats stat, bool cond)
{
	if (!cond)
		return;

	this_cpu_inc(qstats[stat]);
}
/*
 * PV hash hop count
 *
 * Adds @hopcnt to this CPU's qstat_pv_hash_hops counter. The counter holds
 * the running total of hops; qstat_read() turns it into an average.
 */
static inline void qstat_hop(int hopcnt)
{
this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
}
/*
 * Replacement function for pv_kick()
 *
 * The timestamp taken before the kick is stored in the pv_kick_time slot
 * of the kicked @cpu, where __pv_wait() picks it up. The time spent in
 * pv_kick() itself is added to this CPU's qstat_pv_latency_kick counter.
 */
static inline void __pv_kick(int cpu)
{
	u64 kick_start = sched_clock();

	per_cpu(pv_kick_time, cpu) = kick_start;
	pv_kick(cpu);
	this_cpu_add(qstats[qstat_pv_latency_kick],
		     sched_clock() - kick_start);
}
/*
 * Replacement function for pv_wait()
 *
 * This CPU's pv_kick_time is cleared before waiting. If it is non-zero
 * once pv_wait() returns, the time elapsed since that timestamp is added
 * to qstat_pv_latency_wake and qstat_pv_kick_wake is incremented.
 */
static inline void __pv_wait(u8 *ptr, u8 val)
{
	u64 *kicked_at = this_cpu_ptr(&pv_kick_time);

	*kicked_at = 0;
	pv_wait(ptr, val);

	/* No timestamp recorded while waiting: nothing to account */
	if (!*kicked_at)
		return;

	this_cpu_add(qstats[qstat_pv_latency_wake],
		     sched_clock() - *kicked_at);
	qstat_inc(qstat_pv_kick_wake, true);
}
/*
 * From this point on, pv_kick() and pv_wait() expand to the instrumented
 * __pv_kick() and __pv_wait() replacements.
 */
#define pv_kick(c) __pv_kick(c)
#define pv_wait(p, v) __pv_wait(p, v)
/*
 * PV unfair trylock count tracking function
 *
 * Forwards to pv_queued_spin_steal_lock() and counts every successful
 * steal in qstat_pv_lock_stealing. The result of the steal attempt is
 * returned unchanged. queued_spin_trylock() is re-pointed at this wrapper.
 */
static inline int qstat_spin_steal_lock(struct qspinlock *lock)
{
	int stolen = pv_queued_spin_steal_lock(lock);

	qstat_inc(qstat_pv_lock_stealing, stolen);
	return stolen;
}
#undef queued_spin_trylock
#define queued_spin_trylock(l)	qstat_spin_steal_lock(l)
#else /* CONFIG_QUEUED_LOCK_STAT */
/*
 * Empty stubs used when CONFIG_QUEUED_LOCK_STAT is not set, so callers can
 * invoke the statistics hooks unconditionally.
 */
static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
static inline void qstat_hop(int hopcnt) { }
#endif /* CONFIG_QUEUED_LOCK_STAT */