Merge branches 'doc.2019.12.10a', 'exp.2019.12.09a', 'fixes.2020.01.24a', 'kfree_rcu.2020.01.24a', 'list.2020.01.10a', 'preempt.2020.01.24a' and 'torture.2019.12.09a' into HEAD

doc.2019.12.10a: Documentations updates
exp.2019.12.09a: Expedited grace-period updates
fixes.2020.01.24a: Miscellaneous fixes
kfree_rcu.2020.01.24a: Batch kfree_rcu() work
list.2020.01.10a: RCU-protected-list updates
preempt.2020.01.24a: Preemptible RCU updates
torture.2019.12.09a: Torture-test updates
This commit is contained in:
Paul E. McKenney 2020-01-24 10:37:27 -08:00
39 changed files with 1070 additions and 575 deletions

View file

@ -43,7 +43,6 @@
#include <uapi/linux/sched/types.h>
#include <linux/prefetch.h>
#include <linux/delay.h>
#include <linux/stop_machine.h>
#include <linux/random.h>
#include <linux/trace_events.h>
#include <linux/suspend.h>
@ -55,6 +54,7 @@
#include <linux/oom.h>
#include <linux/smpboot.h>
#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/sched/isolation.h>
#include <linux/sched/clock.h>
#include "../time/tick-internal.h"
@ -84,7 +84,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
};
struct rcu_state rcu_state = {
static struct rcu_state rcu_state = {
.level = { &rcu_state.node[0] },
.gp_state = RCU_GP_IDLE,
.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
* held, but the bit corresponding to the current CPU will be stable
* in most contexts.
*/
unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
{
return READ_ONCE(rnp->qsmaskinitnext);
}
@ -294,7 +294,7 @@ static void rcu_dynticks_eqs_online(void)
*
* No ordering, as we are sampling CPU-local information.
*/
bool rcu_dynticks_curr_cpu_in_eqs(void)
static bool rcu_dynticks_curr_cpu_in_eqs(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@ -305,7 +305,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
* Snapshot the ->dynticks counter with full ordering so as to allow
* stable comparison of this counter with past and future snapshots.
*/
int rcu_dynticks_snap(struct rcu_data *rdp)
static int rcu_dynticks_snap(struct rcu_data *rdp)
{
int snap = atomic_add_return(0, &rdp->dynticks);
@ -528,16 +528,6 @@ static struct rcu_node *rcu_get_root(void)
return &rcu_state.node[0];
}
/*
* Convert a ->gp_state value to a character string.
*/
static const char *gp_state_getname(short gs)
{
if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
return "???";
return gp_state_names[gs];
}
/*
* Send along grace-period-related data for rcutorture diagnostics.
*/
@ -577,7 +567,7 @@ static void rcu_eqs_enter(bool user)
}
lockdep_assert_irqs_disabled();
trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks);
trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rdp = this_cpu_ptr(&rcu_data);
do_nocb_deferred_wakeup(rdp);
@ -650,14 +640,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
* leave it in non-RCU-idle state.
*/
if (rdp->dynticks_nmi_nesting != 1) {
trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
rdp->dynticks_nmi_nesting - 2);
return;
}
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks);
trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
if (irq)
@ -744,7 +735,7 @@ static void rcu_eqs_exit(bool user)
rcu_dynticks_task_exit();
rcu_dynticks_eqs_exit();
rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks);
trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
WRITE_ONCE(rdp->dynticks_nesting, 1);
WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
@ -800,8 +791,8 @@ void rcu_user_exit(void)
*/
static __always_inline void rcu_nmi_enter_common(bool irq)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
long incby = 2;
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
/* Complain about underflow. */
WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
@ -828,12 +819,17 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
} else if (tick_nohz_full_cpu(rdp->cpu) &&
rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE &&
READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
rdp->rcu_forced_tick = true;
tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
raw_spin_lock_rcu_node(rdp->mynode);
// Recheck under lock.
if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
rdp->rcu_forced_tick = true;
tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
}
raw_spin_unlock_rcu_node(rdp->mynode);
}
trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
rdp->dynticks_nmi_nesting,
rdp->dynticks_nmi_nesting + incby, rdp->dynticks);
rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
rdp->dynticks_nmi_nesting + incby);
barrier();
@ -898,6 +894,7 @@ void rcu_irq_enter_irqson(void)
*/
static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
{
raw_lockdep_assert_held_rcu_node(rdp->mynode);
WRITE_ONCE(rdp->rcu_urgent_qs, false);
WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
@ -1934,7 +1931,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
struct rcu_node *rnp_p;
raw_lockdep_assert_held_rcu_node(rnp);
if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
rnp->qsmask != 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@ -2146,7 +2143,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* If no callbacks are ready, just return. */
if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
trace_rcu_batch_start(rcu_state.name,
rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist), 0);
trace_rcu_batch_end(rcu_state.name, 0,
!rcu_segcblist_empty(&rdp->cblist),
@ -2168,7 +2164,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
if (unlikely(bl > 100))
tlimit = local_clock() + rcu_resched_ns;
trace_rcu_batch_start(rcu_state.name,
rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist), bl);
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
if (offloaded)
@ -2179,9 +2174,19 @@ static void rcu_do_batch(struct rcu_data *rdp)
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
rhp = rcu_cblist_dequeue(&rcl);
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
rcu_callback_t f;
debug_rcu_head_unqueue(rhp);
if (__rcu_reclaim(rcu_state.name, rhp))
rcu_cblist_dequeued_lazy(&rcl);
rcu_lock_acquire(&rcu_callback_map);
trace_rcu_invoke_callback(rcu_state.name, rhp);
f = rhp->func;
WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
f(rhp);
rcu_lock_release(&rcu_callback_map);
/*
* Stop only if limit reached and CPU has something to do.
* Note: The rcl structure counts down from zero.
@ -2294,7 +2299,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask == 0) {
if (!IS_ENABLED(CONFIG_PREEMPTION) ||
if (!IS_ENABLED(CONFIG_PREEMPT_RCU) ||
rcu_preempt_blocked_readers_cgp(rnp)) {
/*
* No point in scanning bits because they
@ -2308,14 +2313,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
continue;
}
for_each_leaf_node_possible_cpu(rnp, cpu) {
unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
if ((rnp->qsmask & bit) != 0) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (f(rdp)) {
mask |= bit;
rcu_disable_urgency_upon_qs(rdp);
}
for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (f(rdp)) {
mask |= rdp->grpmask;
rcu_disable_urgency_upon_qs(rdp);
}
}
if (mask != 0) {
@ -2474,8 +2476,8 @@ static void rcu_cpu_kthread(unsigned int cpu)
char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
int spincnt;
trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
for (spincnt = 0; spincnt < 10; spincnt++) {
trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
local_irq_disable();
@ -2583,7 +2585,7 @@ static void rcu_leak_callback(struct rcu_head *rhp)
* is expected to specify a CPU.
*/
static void
__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
__call_rcu(struct rcu_head *head, rcu_callback_t func)
{
unsigned long flags;
struct rcu_data *rdp;
@ -2618,18 +2620,17 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
}
if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
return; // Enqueued onto ->nocb_bypass, so just leave.
/* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
rcu_segcblist_enqueue(&rdp->cblist, head);
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rcu_state.name, head,
(unsigned long)func,
rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist));
else
trace_rcu_callback(rcu_state.name, head,
rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist));
/* Go handle any RCU core processing required. */
@ -2679,28 +2680,230 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, 0);
__call_rcu(head, func);
}
EXPORT_SYMBOL_GPL(call_rcu);
/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (HZ / 50)
#define KFREE_N_BATCHES 2
/**
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
* @monitor_todo: Tracks whether a @monitor_work delayed work is pending
* @initialized: The @lock and @rcu_work fields have been initialized
*
* This is a per-CPU structure. The reason that it is not included in
* the rcu_data structure is to permit this code to be extracted from
* the RCU files. Such extraction could allow further optimization of
* the interactions with the slab allocators.
*/
struct kfree_rcu_cpu {
struct rcu_head *head;
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
spinlock_t lock;
struct delayed_work monitor_work;
bool monitor_todo;
bool initialized;
};
static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
/*
* Queue an RCU callback for lazy invocation after a grace period.
* This will likely be later named something like "call_rcu_lazy()",
* but this change will require some way of tagging the lazy RCU
* callbacks in the list of pending callbacks. Until then, this
* function may only be called from __kfree_rcu().
* This function is invoked in workqueue context after a grace period.
* It frees all the objects queued on ->head_free.
*/
static void kfree_rcu_work(struct work_struct *work)
{
unsigned long flags;
struct rcu_head *head, *next;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
krwp = container_of(to_rcu_work(work),
struct kfree_rcu_cpu_work, rcu_work);
krcp = krwp->krcp;
spin_lock_irqsave(&krcp->lock, flags);
head = krwp->head_free;
krwp->head_free = NULL;
spin_unlock_irqrestore(&krcp->lock, flags);
// List "head" is now private, so traverse locklessly.
for (; head; head = next) {
unsigned long offset = (unsigned long)head->func;
next = head->next;
// Potentially optimize with kfree_bulk in future.
debug_rcu_head_unqueue(head);
rcu_lock_acquire(&rcu_callback_map);
trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
/* Could be optimized with kfree_bulk() in future. */
kfree((void *)head - offset);
}
rcu_lock_release(&rcu_callback_map);
cond_resched_tasks_rcu_qs();
}
}
/*
* Schedule the kfree batch RCU work to run in workqueue context after a GP.
*
* This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
* timeout has been reached.
*/
static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
{
int i;
struct kfree_rcu_cpu_work *krwp = NULL;
lockdep_assert_held(&krcp->lock);
for (i = 0; i < KFREE_N_BATCHES; i++)
if (!krcp->krw_arr[i].head_free) {
krwp = &(krcp->krw_arr[i]);
break;
}
// If a previous RCU batch is in progress, we cannot immediately
// queue another one, so return false to tell caller to retry.
if (!krwp)
return false;
krwp->head_free = krcp->head;
krcp->head = NULL;
INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
queue_rcu_work(system_wq, &krwp->rcu_work);
return true;
}
static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
unsigned long flags)
{
// Attempt to start a new batch.
krcp->monitor_todo = false;
if (queue_kfree_rcu_work(krcp)) {
// Success! Our job is done here.
spin_unlock_irqrestore(&krcp->lock, flags);
return;
}
// Previous RCU batch still in progress, try again later.
krcp->monitor_todo = true;
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
spin_unlock_irqrestore(&krcp->lock, flags);
}
/*
* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
* It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
*/
static void kfree_rcu_monitor(struct work_struct *work)
{
unsigned long flags;
struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
monitor_work.work);
spin_lock_irqsave(&krcp->lock, flags);
if (krcp->monitor_todo)
kfree_rcu_drain_unlock(krcp, flags);
else
spin_unlock_irqrestore(&krcp->lock, flags);
}
/*
* Queue a request for lazy invocation of kfree() after a grace period.
*
* Each kfree_call_rcu() request is added to a batch. The batch will be drained
* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch
* will be kfree'd in workqueue context. This allows us to:
*
* 1. Batch requests together to reduce the number of grace periods during
* heavy kfree_rcu() load.
*
* 2. It makes it possible to use kfree_bulk() on a large number of
* kfree_rcu() requests thus reducing cache misses and the per-object
* overhead of kfree().
*/
void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, 1);
unsigned long flags;
struct kfree_rcu_cpu *krcp;
local_irq_save(flags); // For safely calling this_cpu_ptr().
krcp = this_cpu_ptr(&krc);
if (krcp->initialized)
spin_lock(&krcp->lock);
// Queue the object but don't yet schedule the batch.
if (debug_rcu_head_queue(head)) {
// Probable double kfree_rcu(), just leak.
WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
__func__, head);
goto unlock_return;
}
head->func = func;
head->next = krcp->head;
krcp->head = head;
// Set timer to drain after KFREE_DRAIN_JIFFIES.
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!krcp->monitor_todo) {
krcp->monitor_todo = true;
schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
}
unlock_return:
if (krcp->initialized)
spin_unlock(&krcp->lock);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
void __init kfree_rcu_scheduler_running(void)
{
int cpu;
unsigned long flags;
for_each_online_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
spin_lock_irqsave(&krcp->lock, flags);
if (!krcp->head || krcp->monitor_todo) {
spin_unlock_irqrestore(&krcp->lock, flags);
continue;
}
krcp->monitor_todo = true;
schedule_delayed_work_on(cpu, &krcp->monitor_work,
KFREE_DRAIN_JIFFIES);
spin_unlock_irqrestore(&krcp->lock, flags);
}
}
/*
* During early boot, any blocking grace-period wait automatically
* implies a grace period. Later on, this is never the case for PREEMPT.
* implies a grace period. Later on, this is never the case for PREEMPTION.
*
* Howevr, because a context switch is a grace period for !PREEMPT, any
* Howevr, because a context switch is a grace period for !PREEMPTION, any
* blocking grace-period wait automatically implies a grace period if
* there is only one CPU online at any point time during execution of
* either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
@ -2896,7 +3099,7 @@ static void rcu_barrier_func(void *unused)
debug_rcu_head_queue(&rdp->barrier_head);
rcu_nocb_lock(rdp);
WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
atomic_inc(&rcu_state.barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
@ -3557,12 +3760,29 @@ static void __init rcu_dump_rcu_node_tree(void)
struct workqueue_struct *rcu_gp_wq;
struct workqueue_struct *rcu_par_gp_wq;
static void __init kfree_rcu_batch_init(void)
{
int cpu;
int i;
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
spin_lock_init(&krcp->lock);
for (i = 0; i < KFREE_N_BATCHES; i++)
krcp->krw_arr[i].krcp = krcp;
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
krcp->initialized = true;
}
}
void __init rcu_init(void)
{
int cpu;
rcu_early_boot_tests();
kfree_rcu_batch_init();
rcu_bootup_announce();
rcu_init_geometry();
rcu_init_one();