From 8d0968cc6b8ffd8496c2ebffdfdc801f949a85e5 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 1 Mar 2021 11:13:34 +0100 Subject: [PATCH 01/44] locking/csd_lock: Add boot parameter for controlling CSD lock debugging Currently CSD lock debugging can be switched on and off via a kernel config option only. Unfortunately there is at least one problem with CSD lock handling pending for about 2 years now, which has been seen in different environments (mostly when running virtualized under KVM or Xen, at least once on bare metal). Multiple attempts to catch this issue have finally led to introduction of CSD lock debug code, but this code is not in use in most distros as it has some impact on performance. In order to be able to ship kernels with CONFIG_CSD_LOCK_WAIT_DEBUG enabled even for production use, add a boot parameter for switching the debug functionality on. This will reduce any performance impact of the debug coding to a bare minimum when not being used. Signed-off-by: Juergen Gross [ Minor edits. ] Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210301101336.7797-2-jgross@suse.com --- .../admin-guide/kernel-parameters.txt | 6 +++ kernel/smp.c | 38 +++++++++++++++++-- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 04545725f187..98dbffa66d89 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -784,6 +784,12 @@ cs89x0_media= [HW,NET] Format: { rj45 | aui | bnc } + csdlock_debug= [KNL] Enable debug add-ons of cross-CPU function call + handling. When switched on, additional debug data is + printed to the console in case a hanging CPU is + detected, and that CPU is pinged again in order to try + to resolve the hang situation. + dasd= [HW,NET] See header of drivers/s390/block/dasd_devmap.c. diff --git a/kernel/smp.c b/kernel/smp.c index aeb0adfa0606..d5f0b21ab55e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "smpboot.h" #include "sched/smp.h" @@ -102,6 +103,20 @@ void __init call_function_init(void) #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG +static DEFINE_STATIC_KEY_FALSE(csdlock_debug_enabled); + +static int __init csdlock_debug(char *str) +{ + unsigned int val = 0; + + get_option(&str, &val); + if (val) + static_branch_enable(&csdlock_debug_enabled); + + return 0; +} +early_param("csdlock_debug", csdlock_debug); + static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); @@ -110,7 +125,7 @@ static DEFINE_PER_CPU(void *, cur_csd_info); static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ -static void csd_lock_record(call_single_data_t *csd) +static void __csd_lock_record(call_single_data_t *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ @@ -125,7 +140,13 @@ static void csd_lock_record(call_single_data_t *csd) /* Or before unlock, as the case may be. 
*/ } -static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd) +static __always_inline void csd_lock_record(call_single_data_t *csd) +{ + if (static_branch_unlikely(&csdlock_debug_enabled)) + __csd_lock_record(csd); +} + +static int csd_lock_wait_getcpu(call_single_data_t *csd) { unsigned int csd_type; @@ -140,7 +161,7 @@ static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd) * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ -static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) { int cpu = -1; int cpux; @@ -204,7 +225,7 @@ static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 t * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static __always_inline void csd_lock_wait(call_single_data_t *csd) +static void __csd_lock_wait(call_single_data_t *csd) { int bug_id = 0; u64 ts0, ts1; @@ -218,6 +239,15 @@ static __always_inline void csd_lock_wait(call_single_data_t *csd) smp_acquire__after_ctrl_dep(); } +static __always_inline void csd_lock_wait(call_single_data_t *csd) +{ + if (static_branch_unlikely(&csdlock_debug_enabled)) { + __csd_lock_wait(csd); + return; + } + + smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); +} #else static void csd_lock_record(call_single_data_t *csd) { From de7b09ef658d637eed0584eaba30884e409aef31 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 1 Mar 2021 11:13:35 +0100 Subject: [PATCH 02/44] locking/csd_lock: Prepare more CSD lock debugging In order to be able to easily add more CSD lock debugging data to struct call_function_data->csd move the call_single_data_t element into a sub-structure. 
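In short, the change below amounts to the following (a condensed sketch; the field names are the ones used in the diff):

    struct cfd_percpu {
            call_single_data_t csd;
            /* additional per-CPU CSD debug fields can now be added here */
    };

    struct call_function_data {
            struct cfd_percpu __percpu *pcpu;   /* was: call_single_data_t __percpu *csd */
            cpumask_var_t cpumask;
            cpumask_var_t cpumask_ipi;
    };

    /* Accessors change from per_cpu_ptr(cfd->csd, cpu) to: */
    call_single_data_t *csd = &per_cpu_ptr(cfd->pcpu, cpu)->csd;
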
Signed-off-by: Juergen Gross Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210301101336.7797-3-jgross@suse.com --- kernel/smp.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index d5f0b21ab55e..6d7e6dbe33dc 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -31,8 +31,12 @@ #define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) +struct cfd_percpu { + call_single_data_t csd; +}; + struct call_function_data { - call_single_data_t __percpu *csd; + struct cfd_percpu __percpu *pcpu; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; @@ -55,8 +59,8 @@ int smpcfd_prepare_cpu(unsigned int cpu) free_cpumask_var(cfd->cpumask); return -ENOMEM; } - cfd->csd = alloc_percpu(call_single_data_t); - if (!cfd->csd) { + cfd->pcpu = alloc_percpu(struct cfd_percpu); + if (!cfd->pcpu) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; @@ -71,7 +75,7 @@ int smpcfd_dead_cpu(unsigned int cpu) free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); - free_percpu(cfd->csd); + free_percpu(cfd->pcpu); return 0; } @@ -694,7 +698,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { - call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); + call_single_data_t *csd = &per_cpu_ptr(cfd->pcpu, cpu)->csd; if (cond_func && !cond_func(cpu, info)) continue; @@ -719,7 +723,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd; - csd = per_cpu_ptr(cfd->csd, cpu); + csd = &per_cpu_ptr(cfd->pcpu, cpu)->csd; csd_lock_wait(csd); } } From a5aabace5fb8abf2adcfcf0fe54c089b20d71755 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 1 Mar 2021 11:13:36 +0100 Subject: [PATCH 03/44] locking/csd_lock: Add more data to CSD lock debugging In order to help identifying problems with IPI handling and remote function execution add some more data to IPI debugging code. There have been multiple reports of CPUs looping long times (many seconds) in smp_call_function_many() waiting for another CPU executing a function like tlb flushing. Most of these reports have been for cases where the kernel was running as a guest on top of KVM or Xen (there are rumours of that happening under VMWare, too, and even on bare metal). Finding the root cause hasn't been successful yet, even after more than 2 years of chasing this bug by different developers. Commit: 35feb60474bf4f7 ("kernel/smp: Provide CSD lock timeout diagnostics") tried to address this by adding some debug code and by issuing another IPI when a hang was detected. This helped mitigating the problem (the repeated IPI unlocks the hang), but the root cause is still unknown. Current available data suggests that either an IPI wasn't sent when it should have been, or that the IPI didn't result in the target CPU executing the queued function (due to the IPI not reaching the CPU, the IPI handler not being called, or the handler not seeing the queued request). Try to add more diagnostic data by introducing a global atomic counter which is being incremented when doing critical operations (before and after queueing a new request, when sending an IPI, and when dequeueing a request). The counter value is stored in percpu variables which can be printed out when a hang is detected. The data of the last event (consisting of sequence counter, source CPU, target CPU, and event type) is stored in a global variable. 
When a new event is to be traced, the data of the last event is stored in the event related percpu location and the global data is updated with the new event's data. This allows to track two events in one data location: one by the value of the event data (the event before the current one), and one by the location itself (the current event). A typical printout with a detected hang will look like this: csd: Detected non-responsive CSD lock (#1) on CPU#1, waiting 5000000003 ns for CPU#06 scf_handler_1+0x0/0x50(0xffffa2a881bb1410). csd: CSD lock (#1) handling prior scf_handler_1+0x0/0x50(0xffffa2a8813823c0) request. csd: cnt(00008cc): ffff->0000 dequeue (src cpu 0 == empty) csd: cnt(00008cd): ffff->0006 idle csd: cnt(0003668): 0001->0006 queue csd: cnt(0003669): 0001->0006 ipi csd: cnt(0003e0f): 0007->000a queue csd: cnt(0003e10): 0001->ffff ping csd: cnt(0003e71): 0003->0000 ping csd: cnt(0003e72): ffff->0006 gotipi csd: cnt(0003e73): ffff->0006 handle csd: cnt(0003e74): ffff->0006 dequeue (src cpu 0 == empty) csd: cnt(0003e7f): 0004->0006 ping csd: cnt(0003e80): 0001->ffff pinged csd: cnt(0003eb2): 0005->0001 noipi csd: cnt(0003eb3): 0001->0006 queue csd: cnt(0003eb4): 0001->0006 noipi csd: cnt now: 0003f00 The idea is to print only relevant entries. Those are all events which are associated with the hang (so sender side events for the source CPU of the hanging request, and receiver side events for the target CPU), and the related events just before those (for adding data needed to identify a possible race). Printing all available data would be possible, but this would add large amounts of data printed on larger configurations. Signed-off-by: Juergen Gross [ Minor readability edits. Breaks col80 but is far more readable. ] Signed-off-by: Ingo Molnar Tested-by: Paul E. McKenney Link: https://lore.kernel.org/r/20210301101336.7797-4-jgross@suse.com --- .../admin-guide/kernel-parameters.txt | 4 + kernel/smp.c | 226 +++++++++++++++++- 2 files changed, 226 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 98dbffa66d89..1fe9d389cdaf 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -789,6 +789,10 @@ printed to the console in case a hanging CPU is detected, and that CPU is pinged again in order to try to resolve the hang situation. + 0: disable csdlock debugging (default) + 1: enable basic csdlock debugging (minor impact) + ext: enable extended csdlock debugging (more impact, + but more data) dasd= [HW,NET] See header of drivers/s390/block/dasd_devmap.c. 
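For reference, the two debug levels documented above are selected on the kernel command line as follows (illustrative fragments matching the values listed in the hunk above):

    csdlock_debug=1       enables the basic CSD lock debug code
    csdlock_debug=ext     enables the basic code plus the extended event tracking added below
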
diff --git a/kernel/smp.c b/kernel/smp.c index 6d7e6dbe33dc..f472ef623956 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -31,8 +31,59 @@ #define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG +union cfd_seq_cnt { + u64 val; + struct { + u64 src:16; + u64 dst:16; +#define CFD_SEQ_NOCPU 0xffff + u64 type:4; +#define CFD_SEQ_QUEUE 0 +#define CFD_SEQ_IPI 1 +#define CFD_SEQ_NOIPI 2 +#define CFD_SEQ_PING 3 +#define CFD_SEQ_PINGED 4 +#define CFD_SEQ_HANDLE 5 +#define CFD_SEQ_DEQUEUE 6 +#define CFD_SEQ_IDLE 7 +#define CFD_SEQ_GOTIPI 8 +#define CFD_SEQ_HDLEND 9 + u64 cnt:28; + } u; +}; + +static char *seq_type[] = { + [CFD_SEQ_QUEUE] = "queue", + [CFD_SEQ_IPI] = "ipi", + [CFD_SEQ_NOIPI] = "noipi", + [CFD_SEQ_PING] = "ping", + [CFD_SEQ_PINGED] = "pinged", + [CFD_SEQ_HANDLE] = "handle", + [CFD_SEQ_DEQUEUE] = "dequeue (src CPU 0 == empty)", + [CFD_SEQ_IDLE] = "idle", + [CFD_SEQ_GOTIPI] = "gotipi", + [CFD_SEQ_HDLEND] = "hdlend (src CPU 0 == early)", +}; + +struct cfd_seq_local { + u64 ping; + u64 pinged; + u64 handle; + u64 dequeue; + u64 idle; + u64 gotipi; + u64 hdlend; +}; +#endif + struct cfd_percpu { call_single_data_t csd; +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + u64 seq_queue; + u64 seq_ipi; + u64 seq_noipi; +#endif }; struct call_function_data { @@ -108,12 +159,18 @@ void __init call_function_init(void) #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG static DEFINE_STATIC_KEY_FALSE(csdlock_debug_enabled); +static DEFINE_STATIC_KEY_FALSE(csdlock_debug_extended); static int __init csdlock_debug(char *str) { unsigned int val = 0; - get_option(&str, &val); + if (str && !strcmp(str, "ext")) { + val = 1; + static_branch_enable(&csdlock_debug_extended); + } else + get_option(&str, &val); + if (val) static_branch_enable(&csdlock_debug_enabled); @@ -124,9 +181,34 @@ early_param("csdlock_debug", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); +static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local); #define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) static atomic_t csd_bug_count = ATOMIC_INIT(0); +static u64 cfd_seq; + +#define CFD_SEQ(s, d, t, c) \ + (union cfd_seq_cnt){ .u.src = s, .u.dst = d, .u.type = t, .u.cnt = c } + +static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type) +{ + union cfd_seq_cnt new, old; + + new = CFD_SEQ(src, dst, type, 0); + + do { + old.val = READ_ONCE(cfd_seq); + new.u.cnt = old.u.cnt + 1; + } while (cmpxchg(&cfd_seq, old.val, new.val) != old.val); + + return old.val; +} + +#define cfd_seq_store(var, src, dst, type) \ + do { \ + if (static_branch_unlikely(&csdlock_debug_extended)) \ + var = cfd_seq_inc(src, dst, type); \ + } while (0) /* Record current CSD work for current CPU, NULL to erase. */ static void __csd_lock_record(call_single_data_t *csd) @@ -160,6 +242,80 @@ static int csd_lock_wait_getcpu(call_single_data_t *csd) return -1; } +static void cfd_seq_data_add(u64 val, unsigned int src, unsigned int dst, + unsigned int type, union cfd_seq_cnt *data, + unsigned int *n_data, unsigned int now) +{ + union cfd_seq_cnt new[2]; + unsigned int i, j, k; + + new[0].val = val; + new[1] = CFD_SEQ(src, dst, type, new[0].u.cnt + 1); + + for (i = 0; i < 2; i++) { + if (new[i].u.cnt <= now) + new[i].u.cnt |= 0x80000000U; + for (j = 0; j < *n_data; j++) { + if (new[i].u.cnt == data[j].u.cnt) { + /* Direct read value trumps generated one. 
*/ + if (i == 0) + data[j].val = new[i].val; + break; + } + if (new[i].u.cnt < data[j].u.cnt) { + for (k = *n_data; k > j; k--) + data[k].val = data[k - 1].val; + data[j].val = new[i].val; + (*n_data)++; + break; + } + } + if (j == *n_data) { + data[j].val = new[i].val; + (*n_data)++; + } + } +} + +static const char *csd_lock_get_type(unsigned int type) +{ + return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type]; +} + +static void csd_lock_print_extended(call_single_data_t *csd, int cpu) +{ + struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu); + unsigned int srccpu = csd->node.src; + struct call_function_data *cfd = per_cpu_ptr(&cfd_data, srccpu); + struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); + unsigned int now; + union cfd_seq_cnt data[2 * ARRAY_SIZE(seq_type)]; + unsigned int n_data = 0, i; + + data[0].val = READ_ONCE(cfd_seq); + now = data[0].u.cnt; + + cfd_seq_data_add(pcpu->seq_queue, srccpu, cpu, CFD_SEQ_QUEUE, data, &n_data, now); + cfd_seq_data_add(pcpu->seq_ipi, srccpu, cpu, CFD_SEQ_IPI, data, &n_data, now); + cfd_seq_data_add(pcpu->seq_noipi, srccpu, cpu, CFD_SEQ_NOIPI, data, &n_data, now); + + cfd_seq_data_add(per_cpu(cfd_seq_local.ping, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PING, data, &n_data, now); + cfd_seq_data_add(per_cpu(cfd_seq_local.pinged, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED, data, &n_data, now); + + cfd_seq_data_add(seq->idle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_IDLE, data, &n_data, now); + cfd_seq_data_add(seq->gotipi, CFD_SEQ_NOCPU, cpu, CFD_SEQ_GOTIPI, data, &n_data, now); + cfd_seq_data_add(seq->handle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HANDLE, data, &n_data, now); + cfd_seq_data_add(seq->dequeue, CFD_SEQ_NOCPU, cpu, CFD_SEQ_DEQUEUE, data, &n_data, now); + cfd_seq_data_add(seq->hdlend, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HDLEND, data, &n_data, now); + + for (i = 0; i < n_data; i++) { + pr_alert("\tcsd: cnt(%07x): %04x->%04x %s\n", + data[i].u.cnt & ~0x80000000U, data[i].u.src, + data[i].u.dst, csd_lock_get_type(data[i].u.type)); + } + pr_alert("\tcsd: cnt now: %07x\n", now); +} + /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, @@ -209,6 +365,8 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in *bug_id, !cpu_cur_csd ? 
"unresponsive" : "handling this request"); } if (cpu >= 0) { + if (static_branch_unlikely(&csdlock_debug_extended)) + csd_lock_print_extended(csd, cpu); if (!trigger_single_cpu_backtrace(cpu)) dump_cpu_task(cpu); if (!cpu_cur_csd) { @@ -252,7 +410,27 @@ static __always_inline void csd_lock_wait(call_single_data_t *csd) smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } + +static void __smp_call_single_queue_debug(int cpu, struct llist_node *node) +{ + unsigned int this_cpu = smp_processor_id(); + struct cfd_seq_local *seq = this_cpu_ptr(&cfd_seq_local); + struct call_function_data *cfd = this_cpu_ptr(&cfd_data); + struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); + + cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE); + if (llist_add(node, &per_cpu(call_single_queue, cpu))) { + cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI); + cfd_seq_store(seq->ping, this_cpu, cpu, CFD_SEQ_PING); + send_call_function_single_ipi(cpu); + cfd_seq_store(seq->pinged, this_cpu, cpu, CFD_SEQ_PINGED); + } else { + cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI); + } +} #else +#define cfd_seq_store(var, src, dst, type) + static void csd_lock_record(call_single_data_t *csd) { } @@ -290,6 +468,19 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); void __smp_call_single_queue(int cpu, struct llist_node *node) { +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + if (static_branch_unlikely(&csdlock_debug_extended)) { + unsigned int type; + + type = CSD_TYPE(container_of(node, call_single_data_t, + node.llist)); + if (type == CSD_TYPE_SYNC || type == CSD_TYPE_ASYNC) { + __smp_call_single_queue_debug(cpu, node); + return; + } + } +#endif + /* * The list addition should be visible before sending the IPI * handler locks the list to pull the entry off it because of @@ -348,6 +539,8 @@ static int generic_exec_single(int cpu, call_single_data_t *csd) */ void generic_smp_call_function_single_interrupt(void) { + cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU, + smp_processor_id(), CFD_SEQ_GOTIPI); flush_smp_call_function_queue(true); } @@ -375,7 +568,13 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) lockdep_assert_irqs_disabled(); head = this_cpu_ptr(&call_single_queue); + cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->handle, CFD_SEQ_NOCPU, + smp_processor_id(), CFD_SEQ_HANDLE); entry = llist_del_all(head); + cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->dequeue, + /* Special meaning of source cpu: 0 == queue empty */ + entry ? CFD_SEQ_NOCPU : 0, + smp_processor_id(), CFD_SEQ_DEQUEUE); entry = llist_reverse_order(entry); /* There shouldn't be any pending callbacks on an offline CPU. */ @@ -434,8 +633,12 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) } } - if (!entry) + if (!entry) { + cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend, + 0, smp_processor_id(), + CFD_SEQ_HDLEND); return; + } /* * Second; run all !SYNC callbacks. 
@@ -473,6 +676,9 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) */ if (entry) sched_ttwu_pending(entry); + + cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend, CFD_SEQ_NOCPU, + smp_processor_id(), CFD_SEQ_HDLEND); } void flush_smp_call_function_from_idle(void) @@ -482,6 +688,8 @@ void flush_smp_call_function_from_idle(void) if (llist_empty(this_cpu_ptr(&call_single_queue))) return; + cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, + smp_processor_id(), CFD_SEQ_IDLE); local_irq_save(flags); flush_smp_call_function_queue(true); if (local_softirq_pending()) @@ -698,7 +906,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask, cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { - call_single_data_t *csd = &per_cpu_ptr(cfd->pcpu, cpu)->csd; + struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); + call_single_data_t *csd = &pcpu->csd; if (cond_func && !cond_func(cpu, info)) continue; @@ -712,12 +921,21 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif - if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) + cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE); + if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); + cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI); + } else { + cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI); + } } /* Send a message to all CPUs in the map */ + cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->ping, this_cpu, + CFD_SEQ_NOCPU, CFD_SEQ_PING); arch_send_call_function_ipi_mask(cfd->cpumask_ipi); + cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->pinged, this_cpu, + CFD_SEQ_NOCPU, CFD_SEQ_PINGED); if (wait) { for_each_cpu(cpu, cfd->cpumask) { From 864b435514b286c0be2a38a02f487aa28d990ef8 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Thu, 11 Feb 2021 13:48:48 -0800 Subject: [PATCH 04/44] x86/jump_label: Mark arguments as const to satisfy asm constraints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling an external kernel module with `-O0` or `-O1`, the following compile error may be reported: ./arch/x86/include/asm/jump_label.h:25:2: error: impossible constraint in ‘asm’ 25 | asm_volatile_goto("1:" | ^~~~~~~~~~~~~~~~~ It appears that these lower optimization levels prevent GCC from detecting that the key/branch arguments can be treated as constants and used as immediate operands. To work around this, explicitly add the `const` label. 
Signed-off-by: Jason Gerecke Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Steven Rostedt (VMware) Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20210211214848.536626-1-jason.gerecke@wacom.com --- arch/x86/include/asm/jump_label.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 06c3cc22a058..7f2006645d84 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -20,7 +20,7 @@ #include #include -static __always_inline bool arch_static_branch(struct static_key *key, bool branch) +static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { asm_volatile_goto("1:" ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" @@ -36,7 +36,7 @@ l_yes: return true; } -static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) +static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { asm_volatile_goto("1:" ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t" From 3e31f94752e454bdd0ca4a1d046ee21f80c166c5 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 26 Feb 2021 17:06:58 -0700 Subject: [PATCH 05/44] lockdep: Add lockdep_assert_not_held() Some kernel functions must be called without holding a specific lock. Add lockdep_assert_not_held() to be used in these functions to detect incorrect calls while holding a lock. lockdep_assert_not_held() provides the opposite functionality of lockdep_assert_held() which is used to assert calls that require holding a specific lock. Incorporates suggestions from Peter Zijlstra to avoid misfires when lockdep_off() is employed. The need for lockdep_assert_not_held() came up in a discussion on ath10k patch. ath10k_drain_tx() and i915_vma_pin_ww() are examples of functions that can use lockdep_assert_not_held(). Signed-off-by: Shuah Khan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/linux-wireless/871rdmu9z9.fsf@codeaurora.org/ --- include/linux/lockdep.h | 11 ++++++++--- kernel/locking/lockdep.c | 6 +++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 7b7ebf2e28ec..dbd9ea846b36 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -301,8 +301,12 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_depth(tsk) (debug_locks ? 
(tsk)->lockdep_depth : 0) -#define lockdep_assert_held(l) do { \ - WARN_ON(debug_locks && !lockdep_is_held(l)); \ +#define lockdep_assert_held(l) do { \ + WARN_ON(debug_locks && lockdep_is_held(l) == 0); \ + } while (0) + +#define lockdep_assert_not_held(l) do { \ + WARN_ON(debug_locks && lockdep_is_held(l) == 1); \ } while (0) #define lockdep_assert_held_write(l) do { \ @@ -393,7 +397,8 @@ extern int lockdep_is_held(const void *); #define lockdep_is_held_type(l, r) (1) #define lockdep_assert_held(l) do { (void)(l); } while (0) -#define lockdep_assert_held_write(l) do { (void)(l); } while (0) +#define lockdep_assert_not_held(l) do { (void)(l); } while (0) +#define lockdep_assert_held_write(l) do { (void)(l); } while (0) #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c6d0c1dc6253..969736b33185 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5539,8 +5539,12 @@ noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) unsigned long flags; int ret = 0; + /* + * Avoid false negative lockdep_assert_held() and + * lockdep_assert_not_held(). + */ if (unlikely(!lockdep_enabled())) - return 1; /* avoid false negative lockdep_assert_held() */ + return -1; raw_local_irq_save(flags); check_flags(flags); From f8cfa46608f8aa5ca5421ce281ab314129c15411 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 26 Feb 2021 17:06:59 -0700 Subject: [PATCH 06/44] lockdep: Add lockdep lock state defines Adds defines for lock state returns from lock_is_held_type() based on Johannes Berg's suggestions as it make it easier to read and maintain the lock states. These are defines and a enum to avoid changes to lock_is_held_type() and lockdep_is_held() return types. Updates to lock_is_held_type() and __lock_is_held() to use the new defines. Signed-off-by: Shuah Khan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/linux-wireless/871rdmu9z9.fsf@codeaurora.org/ --- include/linux/lockdep.h | 11 +++++++++-- kernel/locking/lockdep.c | 11 ++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index dbd9ea846b36..17805aac0e85 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -268,6 +268,11 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, extern void lock_release(struct lockdep_map *lock, unsigned long ip); +/* lock_is_held_type() returns */ +#define LOCK_STATE_UNKNOWN -1 +#define LOCK_STATE_NOT_HELD 0 +#define LOCK_STATE_HELD 1 + /* * Same "read" as for lock_acquire(), except -1 means any. */ @@ -302,11 +307,13 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_depth(tsk) (debug_locks ? 
(tsk)->lockdep_depth : 0) #define lockdep_assert_held(l) do { \ - WARN_ON(debug_locks && lockdep_is_held(l) == 0); \ + WARN_ON(debug_locks && \ + lockdep_is_held(l) == LOCK_STATE_NOT_HELD); \ } while (0) #define lockdep_assert_not_held(l) do { \ - WARN_ON(debug_locks && lockdep_is_held(l) == 1); \ + WARN_ON(debug_locks && \ + lockdep_is_held(l) == LOCK_STATE_HELD); \ } while (0) #define lockdep_assert_held_write(l) do { \ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 969736b33185..c0b8926a67f0 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -54,6 +54,7 @@ #include #include #include +#include #include @@ -5252,13 +5253,13 @@ int __lock_is_held(const struct lockdep_map *lock, int read) if (match_held_lock(hlock, lock)) { if (read == -1 || hlock->read == read) - return 1; + return LOCK_STATE_HELD; - return 0; + return LOCK_STATE_NOT_HELD; } } - return 0; + return LOCK_STATE_NOT_HELD; } static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) @@ -5537,14 +5538,14 @@ EXPORT_SYMBOL_GPL(lock_release); noinstr int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; - int ret = 0; + int ret = LOCK_STATE_NOT_HELD; /* * Avoid false negative lockdep_assert_held() and * lockdep_assert_not_held(). */ if (unlikely(!lockdep_enabled())) - return -1; + return LOCK_STATE_UNKNOWN; raw_local_irq_save(flags); check_flags(flags); From bdb1050ee1faaec1e78c15de8b1959176f26c655 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 26 Feb 2021 17:07:00 -0700 Subject: [PATCH 07/44] ath10k: Detect conf_mutex held ath10k_drain_tx() calls ath10k_drain_tx() must not be called with conf_mutex held as workers can use that also. Add call to lockdep_assert_not_held() on conf_mutex to detect if conf_mutex is held by the caller. The idea for this patch stemmed from coming across the comment block above the ath10k_drain_tx() while reviewing the conf_mutex holds during to debug the conf_mutex lock assert in ath10k_debug_fw_stats_request(). Adding detection to assert on conf_mutex hold will help detect incorrect usages that could lead to locking problems when async worker routines try to call this routine. Signed-off-by: Shuah Khan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Kalle Valo Link: https://lore.kernel.org/linux-wireless/871rdmu9z9.fsf@codeaurora.org/ --- drivers/net/wireless/ath/ath10k/mac.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index bb6c5ee43ac0..5ce4f8d038b9 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -4727,6 +4727,8 @@ out: /* Must not be called with conf_mutex held as workers can use that also. */ void ath10k_drain_tx(struct ath10k *ar) { + lockdep_assert_not_held(&ar->conf_mutex); + /* make sure rcu-protected mac80211 tx path itself is drained */ synchronize_net(); From e36299efe7d749976fbdaaf756dee6ef32543c2c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 3 Mar 2021 10:38:45 +0100 Subject: [PATCH 08/44] kcsan, debugfs: Move debugfs file creation out of early init Commit 56348560d495 ("debugfs: do not attempt to create a new file before the filesystem is initalized") forbids creating new debugfs files until debugfs is fully initialized. This means that KCSAN's debugfs file creation, which happened at the end of __init(), no longer works. And was apparently never supposed to work! 
However, there is no reason to create KCSAN's debugfs file so early. This commit therefore moves its creation to a late_initcall() callback. Cc: "Rafael J. Wysocki" Cc: stable Fixes: 56348560d495 ("debugfs: do not attempt to create a new file before the filesystem is initalized") Reviewed-by: Greg Kroah-Hartman Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 2 -- kernel/kcsan/debugfs.c | 4 +++- kernel/kcsan/kcsan.h | 5 ----- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 3bf98db9c702..23e7acb5c667 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -639,8 +639,6 @@ void __init kcsan_init(void) BUG_ON(!in_task()); - kcsan_debugfs_init(); - for_each_possible_cpu(cpu) per_cpu(kcsan_rand_state, cpu) = (u32)get_cycles(); diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 3c8093a371b1..209ad8dcfcec 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -261,7 +261,9 @@ static const struct file_operations debugfs_ops = .release = single_release }; -void __init kcsan_debugfs_init(void) +static void __init kcsan_debugfs_init(void) { debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops); } + +late_initcall(kcsan_debugfs_init); diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 8d4bf3431b3c..87ccdb3b051f 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -30,11 +30,6 @@ extern bool kcsan_enabled; void kcsan_save_irqtrace(struct task_struct *task); void kcsan_restore_irqtrace(struct task_struct *task); -/* - * Initialize debugfs file. - */ -void kcsan_debugfs_init(void); - /* * Statistics counters displayed via debugfs; should only be modified in * slow-paths. From a146fed56f8a06a6f17ac11ebdc7ca3f396bcb55 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 13 Jan 2021 17:05:56 +0100 Subject: [PATCH 09/44] kcsan: Make test follow KUnit style recommendations Per recently added KUnit style recommendations at Documentation/dev-tools/kunit/style.rst, make the following changes to the KCSAN test: 1. Rename 'kcsan-test.c' to 'kcsan_test.c'. 2. Rename suite name 'kcsan-test' to 'kcsan'. 3. Rename CONFIG_KCSAN_TEST to CONFIG_KCSAN_KUNIT_TEST and default to KUNIT_ALL_TESTS. Reviewed-by: David Gow Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- kernel/kcsan/Makefile | 4 ++-- kernel/kcsan/{kcsan-test.c => kcsan_test.c} | 2 +- lib/Kconfig.kcsan | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) rename kernel/kcsan/{kcsan-test.c => kcsan_test.c} (99%) diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index 65ca5539c470..c2bb07f5bcc7 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -13,5 +13,5 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ obj-y := core.o debugfs.o report.o obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o -CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer -obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o +CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer +obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan_test.c similarity index 99% rename from kernel/kcsan/kcsan-test.c rename to kernel/kcsan/kcsan_test.c index ebe7fd245104..f16f632eb416 100644 --- a/kernel/kcsan/kcsan-test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1156,7 +1156,7 @@ static void test_exit(struct kunit *test) } static struct kunit_suite kcsan_test_suite = { - .name = "kcsan-test", + .name = "kcsan", .test_cases = kcsan_test_cases, .init = test_init, .exit = test_exit, diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index f271ff5fbb5a..0440f373248e 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -69,8 +69,9 @@ config KCSAN_SELFTEST panic. Recommended to be enabled, ensuring critical functionality works as intended. -config KCSAN_TEST - tristate "KCSAN test for integrated runtime behaviour" +config KCSAN_KUNIT_TEST + tristate "KCSAN test for integrated runtime behaviour" if !KUNIT_ALL_TESTS + default KUNIT_ALL_TESTS depends on TRACEPOINTS && KUNIT select TORTURE_TEST help From f6a149140321274cbd955dee50798fe191841f94 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 13 Jan 2021 17:05:57 +0100 Subject: [PATCH 10/44] kcsan: Switch to KUNIT_CASE_PARAM for parameterized tests Since KUnit now support parameterized tests via KUNIT_CASE_PARAM, update KCSAN's test to switch to it for parameterized tests. This simplifies parameterized tests and gets rid of the "parameters in case name" workaround (hack). At the same time, we can increase the maximum number of threads used, because on systems with too few CPUs, KUnit allows us to now stop at the maximum useful threads and not unnecessarily execute redundant test cases with (the same) limited threads as had been the case before. Reviewed-by: David Gow Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan_test.c | 118 ++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 63 deletions(-) diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index f16f632eb416..b71751fc9f4f 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -13,6 +13,8 @@ * Author: Marco Elver */ +#define pr_fmt(fmt) "kcsan_test: " fmt + #include #include #include @@ -951,22 +953,53 @@ static void test_atomic_builtins(struct kunit *test) } /* - * Each test case is run with different numbers of threads. Until KUnit supports - * passing arguments for each test case, we encode #threads in the test case - * name (read by get_num_threads()). [The '-' was chosen as a stylistic - * preference to separate test name and #threads.] + * Generate thread counts for all test cases. Values generated are in interval + * [2, 5] followed by exponentially increasing thread counts from 8 to 32. 
* * The thread counts are chosen to cover potentially interesting boundaries and - * corner cases (range 2-5), and then stress the system with larger counts. + * corner cases (2 to 5), and then stress the system with larger counts. */ -#define KCSAN_KUNIT_CASE(test_name) \ - { .run_case = test_name, .name = #test_name "-02" }, \ - { .run_case = test_name, .name = #test_name "-03" }, \ - { .run_case = test_name, .name = #test_name "-04" }, \ - { .run_case = test_name, .name = #test_name "-05" }, \ - { .run_case = test_name, .name = #test_name "-08" }, \ - { .run_case = test_name, .name = #test_name "-16" } +static const void *nthreads_gen_params(const void *prev, char *desc) +{ + long nthreads = (long)prev; + if (nthreads < 0 || nthreads >= 32) + nthreads = 0; /* stop */ + else if (!nthreads) + nthreads = 2; /* initial value */ + else if (nthreads < 5) + nthreads++; + else if (nthreads == 5) + nthreads = 8; + else + nthreads *= 2; + + if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { + /* + * Without any preemption, keep 2 CPUs free for other tasks, one + * of which is the main test case function checking for + * completion or failure. + */ + const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0; + const long min_required_cpus = 2 + min_unused_cpus; + + if (num_online_cpus() < min_required_cpus) { + pr_err_once("Too few online CPUs (%u < %d) for test\n", + num_online_cpus(), min_required_cpus); + nthreads = 0; + } else if (nthreads >= num_online_cpus() - min_unused_cpus) { + /* Use negative value to indicate last param. */ + nthreads = -(num_online_cpus() - min_unused_cpus); + pr_warn_once("Limiting number of threads to %ld (only %d online CPUs)\n", + -nthreads, num_online_cpus()); + } + } + + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "threads=%ld", abs(nthreads)); + return (void *)nthreads; +} + +#define KCSAN_KUNIT_CASE(test_name) KUNIT_CASE_PARAM(test_name, nthreads_gen_params) static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_basic), KCSAN_KUNIT_CASE(test_concurrent_races), @@ -996,24 +1029,6 @@ static struct kunit_case kcsan_test_cases[] = { /* ===== End test cases ===== */ -/* Get number of threads encoded in test name. */ -static bool __no_kcsan -get_num_threads(const char *test, int *nthreads) -{ - int len = strlen(test); - - if (WARN_ON(len < 3)) - return false; - - *nthreads = test[len - 1] - '0'; - *nthreads += (test[len - 2] - '0') * 10; - - if (WARN_ON(*nthreads < 0)) - return false; - - return true; -} - /* Concurrent accesses from interrupts. */ __no_kcsan static void access_thread_timer(struct timer_list *timer) @@ -1076,9 +1091,6 @@ static int test_init(struct kunit *test) if (!torture_init_begin((char *)test->name, 1)) return -EBUSY; - if (!get_num_threads(test->name, &nthreads)) - goto err; - if (WARN_ON(threads)) goto err; @@ -1087,38 +1099,18 @@ static int test_init(struct kunit *test) goto err; } - if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { - /* - * Without any preemption, keep 2 CPUs free for other tasks, one - * of which is the main test case function checking for - * completion or failure. - */ - const int min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 
2 : 0; - const int min_required_cpus = 2 + min_unused_cpus; + nthreads = abs((long)test->param_value); + if (WARN_ON(!nthreads)) + goto err; - if (num_online_cpus() < min_required_cpus) { - pr_err("%s: too few online CPUs (%u < %d) for test", - test->name, num_online_cpus(), min_required_cpus); + threads = kcalloc(nthreads + 1, sizeof(struct task_struct *), GFP_KERNEL); + if (WARN_ON(!threads)) + goto err; + + threads[nthreads] = NULL; + for (i = 0; i < nthreads; ++i) { + if (torture_create_kthread(access_thread, NULL, threads[i])) goto err; - } else if (nthreads > num_online_cpus() - min_unused_cpus) { - nthreads = num_online_cpus() - min_unused_cpus; - pr_warn("%s: limiting number of threads to %d\n", - test->name, nthreads); - } - } - - if (nthreads) { - threads = kcalloc(nthreads + 1, sizeof(struct task_struct *), - GFP_KERNEL); - if (WARN_ON(!threads)) - goto err; - - threads[nthreads] = NULL; - for (i = 0; i < nthreads; ++i) { - if (torture_create_kthread(access_thread, NULL, - threads[i])) - goto err; - } } torture_init_end(); From bd0ccc4afca2d6ae0029cae35c4f1d2e2ade7579 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 15 Jan 2021 18:09:53 +0100 Subject: [PATCH 11/44] kcsan: Add missing license and copyright headers Adds missing license and/or copyright headers for KCSAN source files. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- Documentation/dev-tools/kcsan.rst | 3 +++ include/linux/kcsan-checks.h | 6 ++++++ include/linux/kcsan.h | 7 +++++++ kernel/kcsan/atomic.h | 5 +++++ kernel/kcsan/core.c | 5 +++++ kernel/kcsan/debugfs.c | 5 +++++ kernel/kcsan/encoding.h | 5 +++++ kernel/kcsan/kcsan.h | 3 ++- kernel/kcsan/report.c | 5 +++++ kernel/kcsan/selftest.c | 5 +++++ 10 files changed, 48 insertions(+), 1 deletion(-) diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst index be7a0b0e1f28..d85ce238ace7 100644 --- a/Documentation/dev-tools/kcsan.rst +++ b/Documentation/dev-tools/kcsan.rst @@ -1,3 +1,6 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. Copyright (C) 2019, Google LLC. + The Kernel Concurrency Sanitizer (KCSAN) ======================================== diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index cf14840609ce..9fd0ad80fef6 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -1,4 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * KCSAN access checks and modifiers. These can be used to explicitly check + * uninstrumented accesses, or change KCSAN checking behaviour of accesses. + * + * Copyright (C) 2019, Google LLC. + */ #ifndef _LINUX_KCSAN_CHECKS_H #define _LINUX_KCSAN_CHECKS_H diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h index 53340d8789f9..fc266ecb2a4d 100644 --- a/include/linux/kcsan.h +++ b/include/linux/kcsan.h @@ -1,4 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. Public interface and + * data structures to set up runtime. See kcsan-checks.h for explicit checks and + * modifiers. For more info please see Documentation/dev-tools/kcsan.rst. + * + * Copyright (C) 2019, Google LLC. + */ #ifndef _LINUX_KCSAN_H #define _LINUX_KCSAN_H diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h index 75fe701f4127..530ae1bda8e7 100644 --- a/kernel/kcsan/atomic.h +++ b/kernel/kcsan/atomic.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Rules for implicitly atomic memory accesses. + * + * Copyright (C) 2019, Google LLC. 
+ */ #ifndef _KERNEL_KCSAN_ATOMIC_H #define _KERNEL_KCSAN_ATOMIC_H diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 23e7acb5c667..45c821d4e8bd 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -1,4 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * KCSAN core runtime. + * + * Copyright (C) 2019, Google LLC. + */ #define pr_fmt(fmt) "kcsan: " fmt diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 209ad8dcfcec..c1dd02f3be8b 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -1,4 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * KCSAN debugfs interface. + * + * Copyright (C) 2019, Google LLC. + */ #define pr_fmt(fmt) "kcsan: " fmt diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index 7ee405524904..170a2bb22f53 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -1,4 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * KCSAN watchpoint encoding. + * + * Copyright (C) 2019, Google LLC. + */ #ifndef _KERNEL_KCSAN_ENCODING_H #define _KERNEL_KCSAN_ENCODING_H diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h index 87ccdb3b051f..9881099d4179 100644 --- a/kernel/kcsan/kcsan.h +++ b/kernel/kcsan/kcsan.h @@ -1,8 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ - /* * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. For more info please * see Documentation/dev-tools/kcsan.rst. + * + * Copyright (C) 2019, Google LLC. */ #ifndef _KERNEL_KCSAN_KCSAN_H diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index d3bf87e6007c..13dce3c664d6 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -1,4 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * KCSAN reporting. + * + * Copyright (C) 2019, Google LLC. + */ #include #include diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 9014a3a82cf9..7f29cb0f5e63 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -1,4 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * KCSAN short boot-time selftests. + * + * Copyright (C) 2019, Google LLC. + */ #define pr_fmt(fmt) "kcsan: " fmt From ba46b21bbdf8c1e8f054f44e7db7d6684720ef78 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 13 Jan 2021 11:59:20 +0100 Subject: [PATCH 12/44] doc: Update rcu_dereference.rst reference Changeset b00aedf978aa ("doc: Convert to rcu_dereference.txt to rcu_dereference.rst") renamed: Documentation/RCU/rcu_dereference.txt to: Documentation/RCU/rcu_dereference.rst. Update its cross-reference accordingly. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- tools/memory-model/Documentation/glossary.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/Documentation/glossary.txt b/tools/memory-model/Documentation/glossary.txt index b2da6365be63..6f3d16dbf467 100644 --- a/tools/memory-model/Documentation/glossary.txt +++ b/tools/memory-model/Documentation/glossary.txt @@ -19,7 +19,7 @@ Address Dependency: When the address of a later memory access is computed from the value returned by the rcu_dereference() on line 2, the address dependency extends from that rcu_dereference() to that "p->a". In rare cases, optimizing compilers can destroy address - dependencies. Please see Documentation/RCU/rcu_dereference.txt + dependencies. Please see Documentation/RCU/rcu_dereference.rst for more information. See also "Control Dependency" and "Data Dependency". 
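For context, the address dependency discussed in the glossary hunk above refers to a pattern of the following shape (an illustrative reconstruction of the glossary's own example, which precedes the quoted context; the line numbers correspond to the "line 2" reference in the text):

     1 rcu_read_lock();
     2 p = rcu_dereference(gp);
     3 do_something(p->a);
     4 rcu_read_unlock();
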
From 9146658cc49a1dbed5ece140f658be884e189ade Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Thu, 14 Jan 2021 23:09:07 +0900 Subject: [PATCH 13/44] tools/memory-model: Remove reference to atomic_ops.rst atomic_ops.rst was removed by commit f0400a77ebdc ("atomic: Delete obsolete documentation"). Remove the broken link in tools/memory-model/Documentation/simple.txt. Cc: Peter Zijlstra Signed-off-by: Akira Yokosawa Signed-off-by: Paul E. McKenney --- tools/memory-model/Documentation/simple.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/memory-model/Documentation/simple.txt b/tools/memory-model/Documentation/simple.txt index 81e1a0ec5342..4c789ec8334f 100644 --- a/tools/memory-model/Documentation/simple.txt +++ b/tools/memory-model/Documentation/simple.txt @@ -189,7 +189,6 @@ Additional information may be found in these files: Documentation/atomic_t.txt Documentation/atomic_bitops.txt -Documentation/core-api/atomic_ops.rst Documentation/core-api/refcount-vs-atomic.rst Reading code using these primitives is often also quite helpful. From 9a4b99fce659c03699f1cb5003ebe7c57c084d49 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 26 Feb 2021 09:50:26 -0800 Subject: [PATCH 14/44] kernel/futex: Kill rt_mutex_next_owner() Update wake_futex_pi() and kill the call altogether. This is possible because: (i) The case of fixup_owner() in which the pi_mutex was stolen from the signaled enqueued top-waiter which fails to trylock and doesn't see a current owner of the rtmutex but needs to acknowledge an non-enqueued higher priority waiter, which is the other alternative. This used to be handled by rt_mutex_next_owner(), which guaranteed fixup_pi_state_owner('newowner') never to be nil. Nowadays the logic is handled by an EAGAIN loop, without the need of rt_mutex_next_owner(). Specifically: c1e2f0eaf015 (futex: Avoid violating the 10th rule of futex) 9f5d1c336a10 (futex: Handle transient "ownerless" rtmutex state correctly) (ii) rt_mutex_next_owner() and rt_mutex_top_waiter() are semantically equivalent, as of: c28d62cf52d7 (locking/rtmutex: Handle non enqueued waiters gracefully in remove_waiter()) So instead of keeping the call around, just use the good ole rt_mutex_top_waiter(). No change in semantics. Signed-off-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210226175029.50335-1-dave@stgolabs.net --- kernel/futex.c | 7 +++++-- kernel/locking/rtmutex.c | 20 -------------------- kernel/locking/rtmutex_common.h | 1 - 3 files changed, 5 insertions(+), 23 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index e68db7745039..db8002dbca7a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1494,13 +1494,14 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { u32 curval, newval; + struct rt_mutex_waiter *top_waiter; struct task_struct *new_owner; bool postunlock = false; DEFINE_WAKE_Q(wake_q); int ret = 0; - new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); - if (WARN_ON_ONCE(!new_owner)) { + top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); + if (WARN_ON_ONCE(!top_waiter)) { /* * As per the comment in futex_unlock_pi() this should not happen. * @@ -1513,6 +1514,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ goto out_unlock; } + new_owner = top_waiter->task; + /* * We pass it to the next owner. 
The WAITERS bit is always kept * enabled while there is PI state around. We cleanup the owner diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 48fff6437901..29f09d0b8224 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1792,26 +1792,6 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, return ret; } -/** - * rt_mutex_next_owner - return the next owner of the lock - * - * @lock: the rt lock query - * - * Returns the next owner of the lock or NULL - * - * Caller has to serialize against other accessors to the lock - * itself. - * - * Special API call for PI-futex support - */ -struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) -{ - if (!rt_mutex_has_waiters(lock)) - return NULL; - - return rt_mutex_top_waiter(lock)->task; -} - /** * rt_mutex_wait_proxy_lock() - Wait for lock acquisition * @lock: the rt_mutex we were woken on diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index ca6fb489007b..a5007f00c1b7 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -130,7 +130,6 @@ enum rtmutex_chainwalk { /* * PI-futex support (proxy locking functions, etc.): */ -extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); From a1565aa4699847febfdfd6af3bf06ca17a9e16af Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 26 Feb 2021 09:50:27 -0800 Subject: [PATCH 15/44] kernel/futex: Make futex_wait_requeue_pi() only call fixup_owner() A small cleanup that allows for fixup_pi_state_owner() only to be called from fixup_owner(), and make requeue_pi uniformly call fixup_owner() regardless of the state in which the fixup is actually needed. Of course this makes the caller's first pi_state->owner != current check redundant, but that should't really matter. Signed-off-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210226175029.50335-2-dave@stgolabs.net --- kernel/futex.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index db8002dbca7a..ee09995d707b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3241,15 +3241,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * reference count. */ - /* Check if the requeue code acquired the second futex for us. */ + /* + * Check if the requeue code acquired the second futex for us and do + * any pertinent fixup. + */ if (!q.rt_waiter) { - /* - * Got the lock. We might not be the anticipated owner if we - * did a lock-steal - fix up the PI-state in that case. - */ if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); - ret = fixup_pi_state_owner(uaddr2, &q, current); + ret = fixup_owner(uaddr2, &q, true); /* * Drop the reference to the pi state which * the requeue_pi() code acquired for us. From a3f2428d2b9c9ca70f52818774a2f6e0e30a0f0b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 26 Feb 2021 09:50:28 -0800 Subject: [PATCH 16/44] kernel/futex: Move hb unlock out of unqueue_me_pi() This improves the code readability, and the locking more obvious as it becomes symmetric for the caller. 
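Reduced to a generic sketch (hypothetical helper names, not the futex code itself), the pattern is: instead of having the callee drop a lock the caller acquired, the caller releases it right after the call, so acquire and release pair up in one function:

    /* before: the callee drops the lock the caller took */
    spin_lock(&obj->lock);
    consume(obj);                   /* ends with spin_unlock(&obj->lock) */

    /* after: acquire and release pair up in the caller */
    spin_lock(&obj->lock);
    consume(obj);                   /* no longer touches obj->lock */
    spin_unlock(&obj->lock);
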
Signed-off-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210226175029.50335-3-dave@stgolabs.net --- kernel/futex.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index ee09995d707b..dcd6132485e1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2318,19 +2318,15 @@ retry: /* * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry - * and dropped here. + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. */ static void unqueue_me_pi(struct futex_q *q) - __releases(q->lock_ptr) { __unqueue_futex(q); BUG_ON(!q->pi_state); put_pi_state(q->pi_state); q->pi_state = NULL; - - spin_unlock(q->lock_ptr); } static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, @@ -2913,8 +2909,8 @@ no_block: if (res) ret = (res < 0) ? res : 0; - /* Unqueue and drop the lock */ unqueue_me_pi(&q); + spin_unlock(q.lock_ptr); goto out; out_unlock_put_key: @@ -3290,8 +3286,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (res) ret = (res < 0) ? res : 0; - /* Unqueue and drop the lock. */ unqueue_me_pi(&q); + spin_unlock(q.lock_ptr); } if (ret == -EINTR) { From c2e4bfe0eef313eeb1c4c9d921be7a9d91d5d71a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 26 Feb 2021 09:50:29 -0800 Subject: [PATCH 17/44] kernel/futex: Explicitly document pi_lock for pi_state owner fixup This seems to belong in the serialization and lifetime rules section. pi_state_update_owner() will take the pi_mutex's owner's pi_lock to do whatever fixup, successful or not. Signed-off-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210226175029.50335-4-dave@stgolabs.net --- kernel/futex.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/futex.c b/kernel/futex.c index dcd6132485e1..475055715371 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -981,6 +981,7 @@ static inline void exit_pi_state_list(struct task_struct *curr) { } * p->pi_lock: * * p->pi_state_list -> pi_state->list, relation + * pi_mutex->owner -> pi_state->owner, relation * * pi_state->refcount: * From 49ab51b01ec6fd837ae3efe2e0cdb41fcf5cf048 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Feb 2021 15:40:05 -0800 Subject: [PATCH 18/44] tools/memory-model: Add access-marking documentation This commit adapts the "Concurrency bugs should fear the big bad data-race detector (part 2)" LWN article (https://lwn.net/Articles/816854/) to kernel-documentation form. This allows more easily updating the material as needed. Suggested-by: Thomas Gleixner [ paulmck: Apply Marco Elver feedback. ] [ paulmck: Update per Akira Yokosawa feedback. ] Reviewed-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- .../Documentation/access-marking.txt | 479 ++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 tools/memory-model/Documentation/access-marking.txt diff --git a/tools/memory-model/Documentation/access-marking.txt b/tools/memory-model/Documentation/access-marking.txt new file mode 100644 index 000000000000..1ab189f51f55 --- /dev/null +++ b/tools/memory-model/Documentation/access-marking.txt @@ -0,0 +1,479 @@ +MARKING SHARED-MEMORY ACCESSES +============================== + +This document provides guidelines for marking intentionally concurrent +normal accesses to shared memory, that is "normal" as in accesses that do +not use read-modify-write atomic operations. It also describes how to +document these accesses, both with comments and with special assertions +processed by the Kernel Concurrency Sanitizer (KCSAN). This discussion +builds on an earlier LWN article [1]. + + +ACCESS-MARKING OPTIONS +====================== + +The Linux kernel provides the following access-marking options: + +1. Plain C-language accesses (unmarked), for example, "a = b;" + +2. Data-race marking, for example, "data_race(a = b);" + +3. READ_ONCE(), for example, "a = READ_ONCE(b);" + The various forms of atomic_read() also fit in here. + +4. WRITE_ONCE(), for example, "WRITE_ONCE(a, b);" + The various forms of atomic_set() also fit in here. + + +These may be used in combination, as shown in this admittedly improbable +example: + + WRITE_ONCE(a, b + data_race(c + d) + READ_ONCE(e)); + +Neither plain C-language accesses nor data_race() (#1 and #2 above) place +any sort of constraint on the compiler's choice of optimizations [2]. +In contrast, READ_ONCE() and WRITE_ONCE() (#3 and #4 above) restrict the +compiler's use of code-motion and common-subexpression optimizations. +Therefore, if a given access is involved in an intentional data race, +using READ_ONCE() for loads and WRITE_ONCE() for stores is usually +preferable to data_race(), which in turn is usually preferable to plain +C-language accesses. + +KCSAN will complain about many types of data races involving plain +C-language accesses, but marking all accesses involved in a given data +race with one of data_race(), READ_ONCE(), or WRITE_ONCE(), will prevent +KCSAN from complaining. Of course, lack of KCSAN complaints does not +imply correct code. Therefore, please take a thoughtful approach +when responding to KCSAN complaints. Churning the code base with +ill-considered additions of data_race(), READ_ONCE(), and WRITE_ONCE() +is unhelpful. + +In fact, the following sections describe situations where use of +data_race() and even plain C-language accesses is preferable to +READ_ONCE() and WRITE_ONCE(). + + +Use of the data_race() Macro +---------------------------- + +Here are some situations where data_race() should be used instead of +READ_ONCE() and WRITE_ONCE(): + +1. Data-racy loads from shared variables whose values are used only + for diagnostic purposes. + +2. Data-racy reads whose values are checked against marked reload. + +3. Reads whose values feed into error-tolerant heuristics. + +4. Writes setting values that feed into error-tolerant heuristics. + + +Data-Racy Reads for Approximate Diagnostics + +Approximate diagnostics include lockdep reports, monitoring/statistics +(including /proc and /sys output), WARN*()/BUG*() checks whose return +values are ignored, and other situations where reads from shared variables +are not an integral part of the core concurrency design. 
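+For example (an illustrative sketch only; the names are invented for this
+document), a /proc-style statistics read of a lock-protected variable
+might look as follows:
+
+	int foo;
+	DEFINE_SPINLOCK(foo_lock);
+
+	void update_foo(int newval)
+	{
+		spin_lock(&foo_lock);
+		foo = newval;	/* Plain write, so KCSAN checks other readers. */
+		spin_unlock(&foo_lock);
+	}
+
+	int show_foo_stat(struct seq_file *m, void *v)
+	{
+		/* Diagnostic-only read: an occasional bogus value is harmless. */
+		seq_printf(m, "foo: %d\n", data_race(foo));
+		return 0;
+	}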
+ +In fact, use of data_race() instead READ_ONCE() for these diagnostic +reads can enable better checking of the remaining accesses implementing +the core concurrency design. For example, suppose that the core design +prevents any non-diagnostic reads from shared variable x from running +concurrently with updates to x. Then using plain C-language writes +to x allows KCSAN to detect reads from x from within regions of code +that fail to exclude the updates. In this case, it is important to use +data_race() for the diagnostic reads because otherwise KCSAN would give +false-positive warnings about these diagnostic reads. + +In theory, plain C-language loads can also be used for this use case. +However, in practice this will have the disadvantage of causing KCSAN +to generate false positives because KCSAN will have no way of knowing +that the resulting data race was intentional. + + +Data-Racy Reads That Are Checked Against Marked Reload + +The values from some reads are not implicitly trusted. They are instead +fed into some operation that checks the full value against a later marked +load from memory, which means that the occasional arbitrarily bogus value +is not a problem. For example, if a bogus value is fed into cmpxchg(), +all that happens is that this cmpxchg() fails, which normally results +in a retry. Unless the race condition that resulted in the bogus value +recurs, this retry will with high probability succeed, so no harm done. + +However, please keep in mind that a data_race() load feeding into +a cmpxchg_relaxed() might still be subject to load fusing on some +architectures. Therefore, it is best to capture the return value from +the failing cmpxchg() for the next iteration of the loop, an approach +that provides the compiler much less scope for mischievous optimizations. +Capturing the return value from cmpxchg() also saves a memory reference +in many cases. + +In theory, plain C-language loads can also be used for this use case. +However, in practice this will have the disadvantage of causing KCSAN +to generate false positives because KCSAN will have no way of knowing +that the resulting data race was intentional. + + +Reads Feeding Into Error-Tolerant Heuristics + +Values from some reads feed into heuristics that can tolerate occasional +errors. Such reads can use data_race(), thus allowing KCSAN to focus on +the other accesses to the relevant shared variables. But please note +that data_race() loads are subject to load fusing, which can result in +consistent errors, which in turn are quite capable of breaking heuristics. +Therefore use of data_race() should be limited to cases where some other +code (such as a barrier() call) will force the occasional reload. + +In theory, plain C-language loads can also be used for this use case. +However, in practice this will have the disadvantage of causing KCSAN +to generate false positives because KCSAN will have no way of knowing +that the resulting data race was intentional. + + +Writes Setting Values Feeding Into Error-Tolerant Heuristics + +The values read into error-tolerant heuristics come from somewhere, +for example, from sysfs. This means that some code in sysfs writes +to this same variable, and these writes can also use data_race(). +After all, if the heuristic can tolerate the occasional bogus value +due to compiler-mangled reads, it can also tolerate the occasional +compiler-mangled write, at least assuming that the proper value is in +place once the write completes. 
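+As a minimal sketch (again with invented names), a sysfs store feeding
+such a heuristic might be written as:
+
+	int foo_threshold;	/* Read by heuristics via data_race(). */
+
+	ssize_t foo_threshold_store(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    const char *buf, size_t count)
+	{
+		int val;
+
+		if (kstrtoint(buf, 0, &val))
+			return -EINVAL;
+		/* Heuristic readers tolerate the occasional mangled value. */
+		data_race(foo_threshold = val);
+		return count;
+	}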
+ +Plain C-language stores can also be used for this use case. However, +in kernels built with CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n, this +will have the disadvantage of causing KCSAN to generate false positives +because KCSAN will have no way of knowing that the resulting data race +was intentional. + + +Use of Plain C-Language Accesses +-------------------------------- + +Here are some example situations where plain C-language accesses should +used instead of READ_ONCE(), WRITE_ONCE(), and data_race(): + +1. Accesses protected by mutual exclusion, including strict locking + and sequence locking. + +2. Initialization-time and cleanup-time accesses. This covers a + wide variety of situations, including the uniprocessor phase of + system boot, variables to be used by not-yet-spawned kthreads, + structures not yet published to reference-counted or RCU-protected + data structures, and the cleanup side of any of these situations. + +3. Per-CPU variables that are not accessed from other CPUs. + +4. Private per-task variables, including on-stack variables, some + fields in the task_struct structure, and task-private heap data. + +5. Any other loads for which there is not supposed to be a concurrent + store to that same variable. + +6. Any other stores for which there should be neither concurrent + loads nor concurrent stores to that same variable. + + But note that KCSAN makes two explicit exceptions to this rule + by default, refraining from flagging plain C-language stores: + + a. No matter what. You can override this default by building + with CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n. + + b. When the store writes the value already contained in + that variable. You can override this default by building + with CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n. + + c. When one of the stores is in an interrupt handler and + the other in the interrupted code. You can override this + default by building with CONFIG_KCSAN_INTERRUPT_WATCHER=y. + +Note that it is important to use plain C-language accesses in these cases, +because doing otherwise prevents KCSAN from detecting violations of your +code's synchronization rules. + + +ACCESS-DOCUMENTATION OPTIONS +============================ + +It is important to comment marked accesses so that people reading your +code, yourself included, are reminded of the synchronization design. +However, it is even more important to comment plain C-language accesses +that are intentionally involved in data races. Such comments are +needed to remind people reading your code, again, yourself included, +of how the compiler has been prevented from optimizing those accesses +into concurrency bugs. + +It is also possible to tell KCSAN about your synchronization design. +For example, ASSERT_EXCLUSIVE_ACCESS(foo) tells KCSAN that any +concurrent access to variable foo by any other CPU is an error, even +if that concurrent access is marked with READ_ONCE(). In addition, +ASSERT_EXCLUSIVE_WRITER(foo) tells KCSAN that although it is OK for there +to be concurrent reads from foo from other CPUs, it is an error for some +other CPU to be concurrently writing to foo, even if that concurrent +write is marked with data_race() or WRITE_ONCE(). + +Note that although KCSAN will call out data races involving either +ASSERT_EXCLUSIVE_ACCESS() or ASSERT_EXCLUSIVE_WRITER() on the one hand +and data_race() writes on the other, KCSAN will not report the location +of these data_race() writes. 
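+The assertions can thus serve as executable comments. As a compact
+illustration (sketch only, reusing the "foo" naming of the examples
+below):
+
+	/* Others may read foo concurrently, but only we may write it. */
+	ASSERT_EXCLUSIVE_WRITER(foo);
+	WRITE_ONCE(foo, newval);
+
+	/* Nobody else should be touching foo at all right now. */
+	ASSERT_EXCLUSIVE_ACCESS(foo);
+	foo = initval;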
+ + +EXAMPLES +======== + +As noted earlier, the goal is to prevent the compiler from destroying +your concurrent algorithm, to help the human reader, and to inform +KCSAN of aspects of your concurrency design. This section looks at a +few examples showing how this can be done. + + +Lock Protection With Lockless Diagnostic Access +----------------------------------------------- + +For example, suppose a shared variable "foo" is read only while a +reader-writer spinlock is read-held, written only while that same +spinlock is write-held, except that it is also read locklessly for +diagnostic purposes. The code might look as follows: + + int foo; + DEFINE_RWLOCK(foo_rwlock); + + void update_foo(int newval) + { + write_lock(&foo_rwlock); + foo = newval; + do_something(newval); + write_unlock(&foo_rwlock); + } + + int read_foo(void) + { + int ret; + + read_lock(&foo_rwlock); + do_something_else(); + ret = foo; + read_unlock(&foo_rwlock); + return ret; + } + + int read_foo_diagnostic(void) + { + return data_race(foo); + } + +The reader-writer lock prevents the compiler from introducing concurrency +bugs into any part of the main algorithm using foo, which means that +the accesses to foo within both update_foo() and read_foo() can (and +should) be plain C-language accesses. One benefit of making them be +plain C-language accesses is that KCSAN can detect any erroneous lockless +reads from or updates to foo. The data_race() in read_foo_diagnostic() +tells KCSAN that data races are expected, and should be silently +ignored. This data_race() also tells the human reading the code that +read_foo_diagnostic() might sometimes return a bogus value. + +However, please note that your kernel must be built with +CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n in order for KCSAN to +detect a buggy lockless write. If you need KCSAN to detect such a +write even if that write did not change the value of foo, you also +need CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n. If you need KCSAN to +detect such a write happening in an interrupt handler running on the +same CPU doing the legitimate lock-protected write, you also need +CONFIG_KCSAN_INTERRUPT_WATCHER=y. With some or all of these Kconfig +options set properly, KCSAN can be quite helpful, although it is not +necessarily a full replacement for hardware watchpoints. On the other +hand, neither are hardware watchpoints a full replacement for KCSAN +because it is not always easy to tell hardware watchpoint to conditionally +trap on accesses. + + +Lock-Protected Writes With Lockless Reads +----------------------------------------- + +For another example, suppose a shared variable "foo" is updated only +while holding a spinlock, but is read locklessly. The code might look +as follows: + + int foo; + DEFINE_SPINLOCK(foo_lock); + + void update_foo(int newval) + { + spin_lock(&foo_lock); + WRITE_ONCE(foo, newval); + ASSERT_EXCLUSIVE_WRITER(foo); + do_something(newval); + spin_unlock(&foo_wlock); + } + + int read_foo(void) + { + do_something_else(); + return READ_ONCE(foo); + } + +Because foo is read locklessly, all accesses are marked. The purpose +of the ASSERT_EXCLUSIVE_WRITER() is to allow KCSAN to check for a buggy +concurrent lockless write. + + +Lockless Reads and Writes +------------------------- + +For another example, suppose a shared variable "foo" is both read and +updated locklessly. 
The code might look as follows: + + int foo; + + int update_foo(int newval) + { + int ret; + + ret = xchg(&foo, newval); + do_something(newval); + return ret; + } + + int read_foo(void) + { + do_something_else(); + return READ_ONCE(foo); + } + +Because foo is accessed locklessly, all accesses are marked. It does +not make sense to use ASSERT_EXCLUSIVE_WRITER() in this case because +there really can be concurrent lockless writers. KCSAN would +flag any concurrent plain C-language reads from foo, and given +CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n, also any concurrent plain +C-language writes to foo. + + +Lockless Reads and Writes, But With Single-Threaded Initialization +------------------------------------------------------------------ + +For yet another example, suppose that foo is initialized in a +single-threaded manner, but that a number of kthreads are then created +that locklessly and concurrently access foo. Some snippets of this code +might look as follows: + + int foo; + + void initialize_foo(int initval, int nkthreads) + { + int i; + + foo = initval; + ASSERT_EXCLUSIVE_ACCESS(foo); + for (i = 0; i < nkthreads; i++) + kthread_run(access_foo_concurrently, ...); + } + + /* Called from access_foo_concurrently(). */ + int update_foo(int newval) + { + int ret; + + ret = xchg(&foo, newval); + do_something(newval); + return ret; + } + + /* Also called from access_foo_concurrently(). */ + int read_foo(void) + { + do_something_else(); + return READ_ONCE(foo); + } + +The initialize_foo() uses a plain C-language write to foo because there +are not supposed to be concurrent accesses during initialization. The +ASSERT_EXCLUSIVE_ACCESS() allows KCSAN to flag buggy concurrent unmarked +reads, and the ASSERT_EXCLUSIVE_ACCESS() call further allows KCSAN to +flag buggy concurrent writes, even if: (1) Those writes are marked or +(2) The kernel was built with CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=y. + + +Checking Stress-Test Race Coverage +---------------------------------- + +When designing stress tests it is important to ensure that race conditions +of interest really do occur. For example, consider the following code +fragment: + + int foo; + + int update_foo(int newval) + { + return xchg(&foo, newval); + } + + int xor_shift_foo(int shift, int mask) + { + int old, new, newold; + + newold = data_race(foo); /* Checked by cmpxchg(). */ + do { + old = newold; + new = (old << shift) ^ mask; + newold = cmpxchg(&foo, old, new); + } while (newold != old); + return old; + } + + int read_foo(void) + { + return READ_ONCE(foo); + } + +If it is possible for update_foo(), xor_shift_foo(), and read_foo() to be +invoked concurrently, the stress test should force this concurrency to +actually happen. KCSAN can evaluate the stress test when the above code +is modified to read as follows: + + int foo; + + int update_foo(int newval) + { + ASSERT_EXCLUSIVE_ACCESS(foo); + return xchg(&foo, newval); + } + + int xor_shift_foo(int shift, int mask) + { + int old, new, newold; + + newold = data_race(foo); /* Checked by cmpxchg(). */ + do { + old = newold; + new = (old << shift) ^ mask; + ASSERT_EXCLUSIVE_ACCESS(foo); + newold = cmpxchg(&foo, old, new); + } while (newold != old); + return old; + } + + + int read_foo(void) + { + ASSERT_EXCLUSIVE_ACCESS(foo); + return READ_ONCE(foo); + } + +If a given stress-test run does not result in KCSAN complaints from +each possible pair of ASSERT_EXCLUSIVE_ACCESS() invocations, the +stress test needs improvement. 
If the stress test was to be evaluated +on a regular basis, it would be wise to place the above instances of +ASSERT_EXCLUSIVE_ACCESS() under #ifdef so that they did not result in +false positives when not evaluating the stress test. + + +REFERENCES +========== + +[1] "Concurrency bugs should fear the big bad data-race detector (part 2)" + https://lwn.net/Articles/816854/ + +[2] "Who's afraid of a big bad optimizing compiler?" + https://lwn.net/Articles/793253/ From 4faf62b1ef1a9367f7dcf8b7ce509980dfdcee83 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Wed, 17 Mar 2021 09:48:06 +0530 Subject: [PATCH 19/44] locking/rwsem: Fix comment typo s/folowing/following/ Signed-off-by: Bhaskar Chowdhury Signed-off-by: Ingo Molnar Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20210317041806.4096156-1-unixbhaskar@gmail.com --- kernel/locking/rwsem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index abba5df50006..fe9cc65cd522 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -632,7 +632,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) } /* - * The rwsem_spin_on_owner() function returns the folowing 4 values + * The rwsem_spin_on_owner() function returns the following 4 values * depending on the lock owner state. * OWNER_NULL : owner is currently NULL * OWNER_WRITER: when owner changes and is a writer From 2ea55bbba23e9d36996299664d618393c8602646 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 18 Mar 2021 13:28:11 -0400 Subject: [PATCH 20/44] locking/locktorture: Fix false positive circular locking splat in ww_mutex test In order to avoid false positive circular locking lockdep splat when runnng the ww_mutex torture test, we need to make sure that the ww_mutexes have the same lock class as the acquire_ctx. This means the ww_mutexes must have the same lockdep key as the acquire_ctx. Unfortunately the current DEFINE_WW_MUTEX() macro fails to do that. As a result, we add an init method for the ww_mutex test to do explicit ww_mutex_init()'s of the ww_mutexes to avoid the false positive warning. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210318172814.4400-3-longman@redhat.com --- kernel/locking/locktorture.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 0ab94e1f1276..3c27f43ab3e8 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -357,10 +357,20 @@ static struct lock_torture_ops mutex_lock_ops = { }; #include +/* + * The torture ww_mutexes should belong to the same lock class as + * torture_ww_class to avoid lockdep problem. The ww_mutex_init() + * function is called for initialization to ensure that. 
+ */ static DEFINE_WD_CLASS(torture_ww_class); -static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class); -static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class); -static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class); +static struct ww_mutex torture_ww_mutex_0, torture_ww_mutex_1, torture_ww_mutex_2; + +static void torture_ww_mutex_init(void) +{ + ww_mutex_init(&torture_ww_mutex_0, &torture_ww_class); + ww_mutex_init(&torture_ww_mutex_1, &torture_ww_class); + ww_mutex_init(&torture_ww_mutex_2, &torture_ww_class); +} static int torture_ww_mutex_lock(void) __acquires(torture_ww_mutex_0) @@ -418,6 +428,7 @@ __releases(torture_ww_mutex_2) } static struct lock_torture_ops ww_mutex_lock_ops = { + .init = torture_ww_mutex_init, .writelock = torture_ww_mutex_lock, .write_delay = torture_mutex_delay, .task_boost = torture_boost_dummy, From 5261ced47f8e89173c3b015f6152a05f11a418c3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 18 Mar 2021 13:28:12 -0400 Subject: [PATCH 21/44] locking/ww_mutex: Remove DEFINE_WW_MUTEX() macro The current DEFINE_WW_MUTEX() macro fails to properly set up the lockdep key of the ww_mutexes causing potential circular locking dependency splat. Though it is possible to add more macro magic to make it work, but the result is rather ugly. Since locktorture was the only user of DEFINE_WW_MUTEX() and the previous commit has just removed its use. It is easier to just remove the macro to force future users of ww_mutexes to use ww_mutex_init() for initialization. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210318172814.4400-4-longman@redhat.com --- include/linux/ww_mutex.h | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 6ecf2a0220db..b77f39f319ad 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -48,39 +48,26 @@ struct ww_acquire_ctx { #endif }; -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class) \ - , .ww_class = class -#else -# define __WW_CLASS_MUTEX_INITIALIZER(lockname, class) -#endif - #define __WW_CLASS_INITIALIZER(ww_class, _is_wait_die) \ { .stamp = ATOMIC_LONG_INIT(0) \ , .acquire_name = #ww_class "_acquire" \ , .mutex_name = #ww_class "_mutex" \ , .is_wait_die = _is_wait_die } -#define __WW_MUTEX_INITIALIZER(lockname, class) \ - { .base = __MUTEX_INITIALIZER(lockname.base) \ - __WW_CLASS_MUTEX_INITIALIZER(lockname, class) } - #define DEFINE_WD_CLASS(classname) \ struct ww_class classname = __WW_CLASS_INITIALIZER(classname, 1) #define DEFINE_WW_CLASS(classname) \ struct ww_class classname = __WW_CLASS_INITIALIZER(classname, 0) -#define DEFINE_WW_MUTEX(mutexname, ww_class) \ - struct ww_mutex mutexname = __WW_MUTEX_INITIALIZER(mutexname, ww_class) - /** * ww_mutex_init - initialize the w/w mutex * @lock: the mutex to be initialized * @ww_class: the w/w class the mutex should belong to * * Initialize the w/w mutex to unlocked state and associate it with the given - * class. + * class. Static define macro for w/w mutex is not provided and this function + * is the only way to properly initialize the w/w mutex. * * It is not allowed to initialize an already locked mutex. 
*/ From aa3a5f31877e5dc131cc286ce707413d441c8375 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 18 Mar 2021 13:28:13 -0400 Subject: [PATCH 22/44] locking/locktorture: Pass thread id to lock/unlock functions To allow the lock and unlock functions in locktorture to access per-thread information, we need to pass some hint on how to locate those information. One way to do this is to pass in a unique thread id which can then be used to access a global array for thread specific information. Change the lock and unlock method to add a thread id parameter which can be determined by the offset of the lwsp/lrsp pointer from the global lwsa/lrsa array. There is no other functional change in this patch. Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210318172814.4400-5-longman@redhat.com --- kernel/locking/locktorture.c | 94 ++++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 36 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 3c27f43ab3e8..90a975a95a13 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -76,13 +76,13 @@ static void lock_torture_cleanup(void); struct lock_torture_ops { void (*init)(void); void (*exit)(void); - int (*writelock)(void); + int (*writelock)(int tid); void (*write_delay)(struct torture_random_state *trsp); void (*task_boost)(struct torture_random_state *trsp); - void (*writeunlock)(void); - int (*readlock)(void); + void (*writeunlock)(int tid); + int (*readlock)(int tid); void (*read_delay)(struct torture_random_state *trsp); - void (*readunlock)(void); + void (*readunlock)(int tid); unsigned long flags; /* for irq spinlocks */ const char *name; @@ -105,7 +105,7 @@ static struct lock_torture_cxt cxt = { 0, 0, false, false, * Definitions for lock torture testing. */ -static int torture_lock_busted_write_lock(void) +static int torture_lock_busted_write_lock(int tid __maybe_unused) { return 0; /* BUGGY, do not use in real life!!! */ } @@ -122,7 +122,7 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp) torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_lock_busted_write_unlock(void) +static void torture_lock_busted_write_unlock(int tid __maybe_unused) { /* BUGGY, do not use in real life!!! */ } @@ -145,7 +145,8 @@ static struct lock_torture_ops lock_busted_ops = { static DEFINE_SPINLOCK(torture_spinlock); -static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) +static int torture_spin_lock_write_lock(int tid __maybe_unused) +__acquires(torture_spinlock) { spin_lock(&torture_spinlock); return 0; @@ -169,7 +170,8 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp) torture_preempt_schedule(); /* Allow test to be preempted. 
*/ } -static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) +static void torture_spin_lock_write_unlock(int tid __maybe_unused) +__releases(torture_spinlock) { spin_unlock(&torture_spinlock); } @@ -185,7 +187,7 @@ static struct lock_torture_ops spin_lock_ops = { .name = "spin_lock" }; -static int torture_spin_lock_write_lock_irq(void) +static int torture_spin_lock_write_lock_irq(int tid __maybe_unused) __acquires(torture_spinlock) { unsigned long flags; @@ -195,7 +197,7 @@ __acquires(torture_spinlock) return 0; } -static void torture_lock_spin_write_unlock_irq(void) +static void torture_lock_spin_write_unlock_irq(int tid __maybe_unused) __releases(torture_spinlock) { spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags); @@ -214,7 +216,8 @@ static struct lock_torture_ops spin_lock_irq_ops = { static DEFINE_RWLOCK(torture_rwlock); -static int torture_rwlock_write_lock(void) __acquires(torture_rwlock) +static int torture_rwlock_write_lock(int tid __maybe_unused) +__acquires(torture_rwlock) { write_lock(&torture_rwlock); return 0; @@ -235,12 +238,14 @@ static void torture_rwlock_write_delay(struct torture_random_state *trsp) udelay(shortdelay_us); } -static void torture_rwlock_write_unlock(void) __releases(torture_rwlock) +static void torture_rwlock_write_unlock(int tid __maybe_unused) +__releases(torture_rwlock) { write_unlock(&torture_rwlock); } -static int torture_rwlock_read_lock(void) __acquires(torture_rwlock) +static int torture_rwlock_read_lock(int tid __maybe_unused) +__acquires(torture_rwlock) { read_lock(&torture_rwlock); return 0; @@ -261,7 +266,8 @@ static void torture_rwlock_read_delay(struct torture_random_state *trsp) udelay(shortdelay_us); } -static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) +static void torture_rwlock_read_unlock(int tid __maybe_unused) +__releases(torture_rwlock) { read_unlock(&torture_rwlock); } @@ -277,7 +283,8 @@ static struct lock_torture_ops rw_lock_ops = { .name = "rw_lock" }; -static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock) +static int torture_rwlock_write_lock_irq(int tid __maybe_unused) +__acquires(torture_rwlock) { unsigned long flags; @@ -286,13 +293,14 @@ static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock) return 0; } -static void torture_rwlock_write_unlock_irq(void) +static void torture_rwlock_write_unlock_irq(int tid __maybe_unused) __releases(torture_rwlock) { write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); } -static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) +static int torture_rwlock_read_lock_irq(int tid __maybe_unused) +__acquires(torture_rwlock) { unsigned long flags; @@ -301,7 +309,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) return 0; } -static void torture_rwlock_read_unlock_irq(void) +static void torture_rwlock_read_unlock_irq(int tid __maybe_unused) __releases(torture_rwlock) { read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); @@ -320,7 +328,8 @@ static struct lock_torture_ops rw_lock_irq_ops = { static DEFINE_MUTEX(torture_mutex); -static int torture_mutex_lock(void) __acquires(torture_mutex) +static int torture_mutex_lock(int tid __maybe_unused) +__acquires(torture_mutex) { mutex_lock(&torture_mutex); return 0; @@ -340,7 +349,8 @@ static void torture_mutex_delay(struct torture_random_state *trsp) torture_preempt_schedule(); /* Allow test to be preempted. 
*/ } -static void torture_mutex_unlock(void) __releases(torture_mutex) +static void torture_mutex_unlock(int tid __maybe_unused) +__releases(torture_mutex) { mutex_unlock(&torture_mutex); } @@ -372,7 +382,7 @@ static void torture_ww_mutex_init(void) ww_mutex_init(&torture_ww_mutex_2, &torture_ww_class); } -static int torture_ww_mutex_lock(void) +static int torture_ww_mutex_lock(int tid __maybe_unused) __acquires(torture_ww_mutex_0) __acquires(torture_ww_mutex_1) __acquires(torture_ww_mutex_2) @@ -417,7 +427,7 @@ __acquires(torture_ww_mutex_2) return 0; } -static void torture_ww_mutex_unlock(void) +static void torture_ww_mutex_unlock(int tid __maybe_unused) __releases(torture_ww_mutex_0) __releases(torture_ww_mutex_1) __releases(torture_ww_mutex_2) @@ -442,7 +452,8 @@ static struct lock_torture_ops ww_mutex_lock_ops = { #ifdef CONFIG_RT_MUTEXES static DEFINE_RT_MUTEX(torture_rtmutex); -static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) +static int torture_rtmutex_lock(int tid __maybe_unused) +__acquires(torture_rtmutex) { rt_mutex_lock(&torture_rtmutex); return 0; @@ -498,7 +509,8 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp) torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) +static void torture_rtmutex_unlock(int tid __maybe_unused) +__releases(torture_rtmutex) { rt_mutex_unlock(&torture_rtmutex); } @@ -516,7 +528,8 @@ static struct lock_torture_ops rtmutex_lock_ops = { #endif static DECLARE_RWSEM(torture_rwsem); -static int torture_rwsem_down_write(void) __acquires(torture_rwsem) +static int torture_rwsem_down_write(int tid __maybe_unused) +__acquires(torture_rwsem) { down_write(&torture_rwsem); return 0; @@ -536,12 +549,14 @@ static void torture_rwsem_write_delay(struct torture_random_state *trsp) torture_preempt_schedule(); /* Allow test to be preempted. */ } -static void torture_rwsem_up_write(void) __releases(torture_rwsem) +static void torture_rwsem_up_write(int tid __maybe_unused) +__releases(torture_rwsem) { up_write(&torture_rwsem); } -static int torture_rwsem_down_read(void) __acquires(torture_rwsem) +static int torture_rwsem_down_read(int tid __maybe_unused) +__acquires(torture_rwsem) { down_read(&torture_rwsem); return 0; @@ -561,7 +576,8 @@ static void torture_rwsem_read_delay(struct torture_random_state *trsp) torture_preempt_schedule(); /* Allow test to be preempted. 
*/ } -static void torture_rwsem_up_read(void) __releases(torture_rwsem) +static void torture_rwsem_up_read(int tid __maybe_unused) +__releases(torture_rwsem) { up_read(&torture_rwsem); } @@ -590,24 +606,28 @@ static void torture_percpu_rwsem_exit(void) percpu_free_rwsem(&pcpu_rwsem); } -static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem) +static int torture_percpu_rwsem_down_write(int tid __maybe_unused) +__acquires(pcpu_rwsem) { percpu_down_write(&pcpu_rwsem); return 0; } -static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem) +static void torture_percpu_rwsem_up_write(int tid __maybe_unused) +__releases(pcpu_rwsem) { percpu_up_write(&pcpu_rwsem); } -static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem) +static int torture_percpu_rwsem_down_read(int tid __maybe_unused) +__acquires(pcpu_rwsem) { percpu_down_read(&pcpu_rwsem); return 0; } -static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem) +static void torture_percpu_rwsem_up_read(int tid __maybe_unused) +__releases(pcpu_rwsem) { percpu_up_read(&pcpu_rwsem); } @@ -632,6 +652,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = { static int lock_torture_writer(void *arg) { struct lock_stress_stats *lwsp = arg; + int tid = lwsp - cxt.lwsa; DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_writer task started"); @@ -642,7 +663,7 @@ static int lock_torture_writer(void *arg) schedule_timeout_uninterruptible(1); cxt.cur_ops->task_boost(&rand); - cxt.cur_ops->writelock(); + cxt.cur_ops->writelock(tid); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; lock_is_write_held = true; @@ -653,7 +674,7 @@ static int lock_torture_writer(void *arg) cxt.cur_ops->write_delay(&rand); lock_is_write_held = false; WRITE_ONCE(last_lock_release, jiffies); - cxt.cur_ops->writeunlock(); + cxt.cur_ops->writeunlock(tid); stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); @@ -670,6 +691,7 @@ static int lock_torture_writer(void *arg) static int lock_torture_reader(void *arg) { struct lock_stress_stats *lrsp = arg; + int tid = lrsp - cxt.lrsa; DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_reader task started"); @@ -679,7 +701,7 @@ static int lock_torture_reader(void *arg) if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); - cxt.cur_ops->readlock(); + cxt.cur_ops->readlock(tid); lock_is_read_held = true; if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... */ @@ -687,7 +709,7 @@ static int lock_torture_reader(void *arg) lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); lock_is_read_held = false; - cxt.cur_ops->readunlock(); + cxt.cur_ops->readunlock(tid); stutter_wait("lock_torture_reader"); } while (!torture_must_stop()); From 8c52cca04f97a4c09ec2f0bd8fe6d0cdf49834e4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 18 Mar 2021 13:28:14 -0400 Subject: [PATCH 23/44] locking/locktorture: Fix incorrect use of ww_acquire_ctx in ww_mutex test The ww_acquire_ctx structure for ww_mutex needs to persist for a complete lock/unlock cycle. In the ww_mutex test in locktorture, however, both ww_acquire_init() and ww_acquire_fini() are called within the lock function only. This causes a lockdep splat of "WARNING: Nested lock was not taken" when lockdep is enabled in the kernel. To fix this problem, we need to move the ww_acquire_fini() after the ww_mutex_unlock() in torture_ww_mutex_unlock(). This is done by allocating a global array of ww_acquire_ctx structures. 
Each locking thread is associated with its own ww_acquire_ctx via the unique thread id it has so that both the lock and unlock functions can access the same ww_acquire_ctx structure. Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210318172814.4400-6-longman@redhat.com --- kernel/locking/locktorture.c | 39 +++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 90a975a95a13..b3adb40549bf 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -374,15 +374,27 @@ static struct lock_torture_ops mutex_lock_ops = { */ static DEFINE_WD_CLASS(torture_ww_class); static struct ww_mutex torture_ww_mutex_0, torture_ww_mutex_1, torture_ww_mutex_2; +static struct ww_acquire_ctx *ww_acquire_ctxs; static void torture_ww_mutex_init(void) { ww_mutex_init(&torture_ww_mutex_0, &torture_ww_class); ww_mutex_init(&torture_ww_mutex_1, &torture_ww_class); ww_mutex_init(&torture_ww_mutex_2, &torture_ww_class); + + ww_acquire_ctxs = kmalloc_array(cxt.nrealwriters_stress, + sizeof(*ww_acquire_ctxs), + GFP_KERNEL); + if (!ww_acquire_ctxs) + VERBOSE_TOROUT_STRING("ww_acquire_ctx: Out of memory"); } -static int torture_ww_mutex_lock(int tid __maybe_unused) +static void torture_ww_mutex_exit(void) +{ + kfree(ww_acquire_ctxs); +} + +static int torture_ww_mutex_lock(int tid) __acquires(torture_ww_mutex_0) __acquires(torture_ww_mutex_1) __acquires(torture_ww_mutex_2) @@ -392,7 +404,7 @@ __acquires(torture_ww_mutex_2) struct list_head link; struct ww_mutex *lock; } locks[3], *ll, *ln; - struct ww_acquire_ctx ctx; + struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid]; locks[0].lock = &torture_ww_mutex_0; list_add(&locks[0].link, &list); @@ -403,12 +415,12 @@ __acquires(torture_ww_mutex_2) locks[2].lock = &torture_ww_mutex_2; list_add(&locks[2].link, &list); - ww_acquire_init(&ctx, &torture_ww_class); + ww_acquire_init(ctx, &torture_ww_class); list_for_each_entry(ll, &list, link) { int err; - err = ww_mutex_lock(ll->lock, &ctx); + err = ww_mutex_lock(ll->lock, ctx); if (!err) continue; @@ -419,26 +431,29 @@ __acquires(torture_ww_mutex_2) if (err != -EDEADLK) return err; - ww_mutex_lock_slow(ll->lock, &ctx); + ww_mutex_lock_slow(ll->lock, ctx); list_move(&ll->link, &list); } - ww_acquire_fini(&ctx); return 0; } -static void torture_ww_mutex_unlock(int tid __maybe_unused) +static void torture_ww_mutex_unlock(int tid) __releases(torture_ww_mutex_0) __releases(torture_ww_mutex_1) __releases(torture_ww_mutex_2) { + struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid]; + ww_mutex_unlock(&torture_ww_mutex_0); ww_mutex_unlock(&torture_ww_mutex_1); ww_mutex_unlock(&torture_ww_mutex_2); + ww_acquire_fini(ctx); } static struct lock_torture_ops ww_mutex_lock_ops = { .init = torture_ww_mutex_init, + .exit = torture_ww_mutex_exit, .writelock = torture_ww_mutex_lock, .write_delay = torture_mutex_delay, .task_boost = torture_boost_dummy, @@ -924,16 +939,16 @@ static int __init lock_torture_init(void) goto unwind; } - if (cxt.cur_ops->init) { - cxt.cur_ops->init(); - cxt.init_called = true; - } - if (nwriters_stress >= 0) cxt.nrealwriters_stress = nwriters_stress; else cxt.nrealwriters_stress = 2 * num_online_cpus(); + if (cxt.cur_ops->init) { + cxt.cur_ops->init(); + cxt.init_called = true; + } + #ifdef CONFIG_DEBUG_MUTEXES if (str_has_prefix(torture_type, "mutex")) cxt.debug_lock = true; From e2db7592be8e83df47519116621411e1056b21c7 Mon Sep 17 00:00:00 2001 From: Ingo 
Molnar Date: Mon, 22 Mar 2021 02:35:05 +0100 Subject: [PATCH 24/44] locking: Fix typos in comments Fix ~16 single-word typos in locking code comments. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Paul E. McKenney Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/include/asm/spinlock.h | 2 +- include/linux/lockdep.h | 2 +- include/linux/rwsem.h | 2 +- kernel/locking/lockdep.c | 4 ++-- kernel/locking/lockdep_proc.c | 2 +- kernel/locking/mcs_spinlock.h | 2 +- kernel/locking/mutex.c | 4 ++-- kernel/locking/osq_lock.c | 4 ++-- kernel/locking/rtmutex.c | 4 ++-- kernel/locking/rwsem.c | 2 +- kernel/locking/spinlock.c | 4 ++-- 11 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index 8f009e788ad4..f610a773f2be 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -22,7 +22,7 @@ * assembler to insert a extra (16-bit) IT instruction, depending on the * presence or absence of neighbouring conditional instructions. * - * To avoid this unpredictableness, an approprite IT is inserted explicitly: + * To avoid this unpredictability, an appropriate IT is inserted explicitly: * the assembler won't change IT instructions which are explicitly present * in the input. */ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 17805aac0e85..09ac2e8348d2 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -155,7 +155,7 @@ extern void lockdep_set_selftest_task(struct task_struct *task); extern void lockdep_init_task(struct task_struct *task); /* - * Split the recrursion counter in two to readily detect 'off' vs recursion. + * Split the recursion counter in two to readily detect 'off' vs recursion. */ #define LOCKDEP_RECURSION_BITS 16 #define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 4c715be48717..a66038d88878 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -110,7 +110,7 @@ do { \ /* * This is the same regardless of which rwsem implementation that is being used. - * It is just a heuristic meant to be called by somebody alreadying holding the + * It is just a heuristic meant to be called by somebody already holding the * rwsem to see if somebody from an incompatible type is wanting access to the * lock. */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c0b8926a67f0..0e97287891db 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1747,7 +1747,7 @@ static enum bfs_result __bfs(struct lock_list *source_entry, /* * Step 4: if not match, expand the path by adding the - * forward or backwards dependencis in the search + * forward or backwards dependencies in the search * */ first = true; @@ -1916,7 +1916,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the * dependency graph, as any strong path ..-> A -> B ->.. we can get with * having dependency A -> B, we could already get a equivalent path ..-> A -> - * .. -> B -> .. with A -> .. -> B. Therefore A -> B is reduntant. + * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. * * We need to make sure both the start and the end of A -> .. -> B is not * weaker than A -> B. 
For the start part, please see the comment in diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 02ef87f50df2..806978314496 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -348,7 +348,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) debug_locks); /* - * Zappped classes and lockdep data buffers reuse statistics. + * Zapped classes and lockdep data buffers reuse statistics. */ seq_puts(m, "\n"); seq_printf(m, " zapped classes: %11lu\n", diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 5e10153b4d3c..85251d8771d9 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -7,7 +7,7 @@ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock * with the desirable properties of being fair, and with each cpu trying * to acquire the lock spinning on a local variable. - * It avoids expensive cache bouncings that common test-and-set spin-lock + * It avoids expensive cache bounces that common test-and-set spin-lock * implementations incur. */ #ifndef __LINUX_MCS_SPINLOCK_H diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 622ebdfcd083..cb6b112ce155 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -92,7 +92,7 @@ static inline unsigned long __owner_flags(unsigned long owner) } /* - * Trylock variant that retuns the owning task on failure. + * Trylock variant that returns the owning task on failure. */ static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock) { @@ -207,7 +207,7 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, /* * Give up ownership to a specific task, when @task = NULL, this is equivalent - * to a regular unlock. Sets PICKUP on a handoff, clears HANDOF, preserves + * to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves * WAITERS. Provides RELEASE semantics like a regular unlock, the * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff. */ diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 1de006ed3aa8..d5610ad52b92 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -135,7 +135,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) */ /* - * Wait to acquire the lock or cancelation. Note that need_resched() + * Wait to acquire the lock or cancellation. Note that need_resched() * will come with an IPI, which will wake smp_cond_load_relaxed() if it * is implemented with a monitor-wait. vcpu_is_preempted() relies on * polling, be careful. @@ -164,7 +164,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) /* * We can only fail the cmpxchg() racing against an unlock(), - * in which case we should observe @node->locked becomming + * in which case we should observe @node->locked becoming * true. */ if (smp_load_acquire(&node->locked)) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 29f09d0b8224..db31bce114f8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -706,7 +706,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, } else if (prerequeue_top_waiter == waiter) { /* * The waiter was the top waiter on the lock, but is - * no longer the top prority waiter. Replace waiter in + * no longer the top priority waiter. Replace waiter in * the owner tasks pi waiters tree with the new top * (highest priority) waiter and adjust the priority * of the owner. 
@@ -1194,7 +1194,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, return; /* - * Yell lowdly and stop the task right here. + * Yell loudly and stop the task right here. */ rt_mutex_print_deadlock(w); while (1) { diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index fe9cc65cd522..809b0016d344 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -819,7 +819,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * we try to get it. The new owner may be a spinnable * writer. * - * To take advantage of two scenarios listed agove, the RT + * To take advantage of two scenarios listed above, the RT * task is made to retry one more time to see if it can * acquire the lock or continue spinning on the new owning * writer. Of course, if the time lag is long enough or the diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 0ff08380f531..c8d7ad9fb9b2 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -58,10 +58,10 @@ EXPORT_PER_CPU_SYMBOL(__mmiowb_state); /* * We build the __lock_function inlines here. They are too large for * inlining all over the place, but here is only one user per function - * which embedds them into the calling _lock_function below. + * which embeds them into the calling _lock_function below. * * This could be a long-held lock. We both prepare to spin for a long - * time (making _this_ CPU preemptable if possible), and we also signal + * time (making _this_ CPU preemptible if possible), and we also signal * towards that other CPU that it should break the lock ASAP. */ #define BUILD_LOCK_OPS(op, locktype) \ From 8af856d18bfbe89676ade38caa2a5d06f75f211d Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Wed, 24 Mar 2021 13:40:40 +0800 Subject: [PATCH 25/44] locking/mutex: Remove repeated declaration Commit 0cd39f4600ed ("locking/seqlock, headers: Untangle the spaghetti monster") introduces 'struct ww_acquire_ctx' again, remove the repeated declaration and move the pre-declarations to the top. Signed-off-by: Shaokun Zhang Signed-off-by: Ingo Molnar Acked-by: Waiman Long Link: https://lore.kernel.org/r/1616564440-61318-1-git-send-email-zhangshaokun@hisilicon.com --- include/linux/mutex.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 0cd631a19727..e7a126796937 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -20,6 +20,7 @@ #include #include +struct ww_class; struct ww_acquire_ctx; /* @@ -65,9 +66,6 @@ struct mutex { #endif }; -struct ww_class; -struct ww_acquire_ctx; - struct ww_mutex { struct mutex base; struct ww_acquire_ctx *ctx; From bd9a5fc2edb0bdcb0756298daa31ddd6a02f0634 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 22 Jan 2021 09:11:01 -0800 Subject: [PATCH 26/44] MAINTAINERS: Add myself as futex reviewer I'm volunteering to help review some of the pain. 
Signed-off-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210122171101.15991-1-dave@stgolabs.net --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index aa84121c5611..8c31c77d4cc2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7363,6 +7363,7 @@ M: Thomas Gleixner M: Ingo Molnar R: Peter Zijlstra R: Darren Hart +R: Davidlohr Bueso L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core From c15380b72d7ae821ee090ba5a56fc6310828dbda Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Mar 2021 16:29:30 +0100 Subject: [PATCH 27/44] locking/rtmutex: Remove rt_mutex_timed_lock() rt_mutex_timed_lock() has no callers since: c051b21f71d1f ("rtmutex: Confine deadlock logic to futex") Remove it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153943.061103415@linutronix.de --- include/linux/rtmutex.h | 3 --- kernel/locking/rtmutex.c | 46 ---------------------------------------- 2 files changed, 49 deletions(-) diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 6fd615a0eea9..32f4a3538c3c 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -115,9 +115,6 @@ extern void rt_mutex_lock(struct rt_mutex *lock); #endif extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); -extern int rt_mutex_timed_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *timeout); - extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index db31bce114f8..ca93e5d7b026 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1394,21 +1394,6 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); } -static inline int -rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk)) -{ - if (chwalk == RT_MUTEX_MIN_CHAINWALK && - likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 0; - - return slowfn(lock, state, timeout, chwalk); -} - static inline int rt_mutex_fasttrylock(struct rt_mutex *lock, int (*slowfn)(struct rt_mutex *lock)) @@ -1516,37 +1501,6 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) return __rt_mutex_slowtrylock(lock); } -/** - * rt_mutex_timed_lock - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller - * - * @lock: the rt_mutex to be locked - * @timeout: timeout structure or NULL (no timeout) - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -ETIMEDOUT when the timeout expired - */ -int -rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) -{ - int ret; - - might_sleep(); - - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - RT_MUTEX_MIN_CHAINWALK, - rt_mutex_slowlock); - if (ret) - mutex_release(&lock->dep_map, _RET_IP_); - - return ret; -} -EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); - /** * rt_mutex_trylock - try to lock a rt_mutex * From 2d445c3e4a8216cfa9703998124c13250cc13e5e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 
Mar 2021 16:29:31 +0100 Subject: [PATCH 28/44] locking/rtmutex: Remove rtmutex deadlock tester leftovers The following debug members of 'struct rtmutex' are unused: - save_state: No users - file,line: Printed if ::name is NULL. This is only used for non-futex locks so ::name is never NULL - magic: Assigned to NULL by rt_mutex_destroy(), no further usage Remove them along with unused inline and macro leftovers related to the long gone deadlock tester. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153943.195064296@linutronix.de --- include/linux/rtmutex.h | 7 ++----- kernel/locking/rtmutex-debug.c | 7 +------ kernel/locking/rtmutex-debug.h | 2 -- kernel/locking/rtmutex.c | 3 --- kernel/locking/rtmutex.h | 2 -- kernel/locking/rtmutex_common.h | 1 - 6 files changed, 3 insertions(+), 19 deletions(-) diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 32f4a3538c3c..48b334b9eb87 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -32,10 +32,7 @@ struct rt_mutex { struct rb_root_cached waiters; struct task_struct *owner; #ifdef CONFIG_DEBUG_RT_MUTEXES - int save_state; - const char *name, *file; - int line; - void *magic; + const char *name; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -60,7 +57,7 @@ struct hrtimer_sleeper; #ifdef CONFIG_DEBUG_RT_MUTEXES # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ - , .name = #mutexname, .file = __FILE__, .line = __LINE__ + , .name = #mutexname # define rt_mutex_init(mutex) \ do { \ diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 36e69100e8e0..7e411b946d4c 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -42,12 +42,7 @@ static void printk_task(struct task_struct *p) static void printk_lock(struct rt_mutex *lock, int print_owner) { - if (lock->name) - printk(" [%p] {%s}\n", - lock, lock->name); - else - printk(" [%p] {%s:%d}\n", - lock, lock->file, lock->line); + printk(" [%p] {%s}\n", lock, lock->name); if (print_owner && rt_mutex_owner(lock)) { printk(".. ->owner: %p\n", lock->owner); diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index fc549713bba3..772c9b012b62 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -22,8 +22,6 @@ extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, struct rt_mutex_waiter *waiter, struct rt_mutex *lock); extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); -# define debug_rt_mutex_reset_waiter(w) \ - do { (w)->deadlock_lock = NULL; } while (0) static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, enum rtmutex_chainwalk walk) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index ca93e5d7b026..11abc60d1478 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1594,9 +1594,6 @@ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) void rt_mutex_destroy(struct rt_mutex *lock) { WARN_ON(rt_mutex_is_locked(lock)); -#ifdef CONFIG_DEBUG_RT_MUTEXES - lock->magic = NULL; -#endif } EXPORT_SYMBOL_GPL(rt_mutex_destroy); diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index 732f96abf462..4dbdec15f1a0 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -11,7 +11,6 @@ * Non-debug version. 
*/ -#define rt_mutex_deadlock_check(l) (0) #define debug_rt_mutex_init_waiter(w) do { } while (0) #define debug_rt_mutex_free_waiter(w) do { } while (0) #define debug_rt_mutex_lock(l) do { } while (0) @@ -21,7 +20,6 @@ #define debug_rt_mutex_init(m, n, k) do { } while (0) #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) #define debug_rt_mutex_print_deadlock(w) do { } while (0) -#define debug_rt_mutex_reset_waiter(w) do { } while (0) static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) { diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index a5007f00c1b7..aa047436dadf 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -30,7 +30,6 @@ struct rt_mutex_waiter { struct task_struct *task; struct rt_mutex *lock; #ifdef CONFIG_DEBUG_RT_MUTEXES - unsigned long ip; struct pid *deadlock_task_pid; struct rt_mutex *deadlock_lock; #endif From 6d41c675a5394057f6fb1dc97cc0a0e360f2c2f8 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Mar 2021 16:29:32 +0100 Subject: [PATCH 29/44] locking/rtmutex: Remove output from deadlock detector The rtmutex specific deadlock detector predates lockdep coverage of rtmutex and since commit f5694788ad8da ("rt_mutex: Add lockdep annotations") it contains a lot of redundant functionality: - lockdep will detect an potential deadlock before rtmutex-debug has a chance to do so - the deadlock debugging is restricted to rtmutexes which are not associated to futexes and have an active waiter, which is covered by lockdep already Remove the redundant functionality and move actual deadlock WARN() into the deadlock code path. The latter needs a seperate cleanup. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153943.320398604@linutronix.de --- include/linux/rtmutex.h | 7 --- kernel/locking/rtmutex-debug.c | 97 --------------------------------- kernel/locking/rtmutex-debug.h | 9 --- kernel/locking/rtmutex.c | 7 +-- kernel/locking/rtmutex.h | 7 --- kernel/locking/rtmutex_common.h | 4 -- 6 files changed, 1 insertion(+), 130 deletions(-) diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 48b334b9eb87..0725c4b45749 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -31,9 +31,6 @@ struct rt_mutex { raw_spinlock_t wait_lock; struct rb_root_cached waiters; struct task_struct *owner; -#ifdef CONFIG_DEBUG_RT_MUTEXES - const char *name; -#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -56,8 +53,6 @@ struct hrtimer_sleeper; #endif #ifdef CONFIG_DEBUG_RT_MUTEXES -# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ - , .name = #mutexname # define rt_mutex_init(mutex) \ do { \ @@ -67,7 +62,6 @@ do { \ extern void rt_mutex_debug_task_free(struct task_struct *tsk); #else -# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) # define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL, NULL) # define rt_mutex_debug_task_free(t) do { } while (0) #endif @@ -83,7 +77,6 @@ do { \ { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ , .waiters = RB_ROOT_CACHED \ , .owner = NULL \ - __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)} #define DEFINE_RT_MUTEX(mutexname) \ diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 7e411b946d4c..fb150100335f 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -32,105 +32,12 @@ 
#include "rtmutex_common.h" -static void printk_task(struct task_struct *p) -{ - if (p) - printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio); - else - printk(""); -} - -static void printk_lock(struct rt_mutex *lock, int print_owner) -{ - printk(" [%p] {%s}\n", lock, lock->name); - - if (print_owner && rt_mutex_owner(lock)) { - printk(".. ->owner: %p\n", lock->owner); - printk(".. held by: "); - printk_task(rt_mutex_owner(lock)); - printk("\n"); - } -} - void rt_mutex_debug_task_free(struct task_struct *task) { DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } -/* - * We fill out the fields in the waiter to store the information about - * the deadlock. We print when we return. act_waiter can be NULL in - * case of a remove waiter operation. - */ -void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, - struct rt_mutex_waiter *act_waiter, - struct rt_mutex *lock) -{ - struct task_struct *task; - - if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter) - return; - - task = rt_mutex_owner(act_waiter->lock); - if (task && task != current) { - act_waiter->deadlock_task_pid = get_pid(task_pid(task)); - act_waiter->deadlock_lock = lock; - } -} - -void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) -{ - struct task_struct *task; - - if (!waiter->deadlock_lock || !debug_locks) - return; - - rcu_read_lock(); - task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); - if (!task) { - rcu_read_unlock(); - return; - } - - if (!debug_locks_off()) { - rcu_read_unlock(); - return; - } - - pr_warn("\n"); - pr_warn("============================================\n"); - pr_warn("WARNING: circular locking deadlock detected!\n"); - pr_warn("%s\n", print_tainted()); - pr_warn("--------------------------------------------\n"); - printk("%s/%d is deadlocking current task %s/%d\n\n", - task->comm, task_pid_nr(task), - current->comm, task_pid_nr(current)); - - printk("\n1) %s/%d is trying to acquire this lock:\n", - current->comm, task_pid_nr(current)); - printk_lock(waiter->lock, 1); - - printk("\n2) %s/%d is blocked on this lock:\n", - task->comm, task_pid_nr(task)); - printk_lock(waiter->deadlock_lock, 1); - - debug_show_held_locks(current); - debug_show_held_locks(task); - - printk("\n%s/%d's [blocked] stackdump:\n\n", - task->comm, task_pid_nr(task)); - show_stack(task, NULL, KERN_DEFAULT); - printk("\n%s/%d's [current] stackdump:\n\n", - current->comm, task_pid_nr(current)); - dump_stack(); - debug_show_all_locks(); - rcu_read_unlock(); - - printk("[ turning off deadlock detection." - "Please report this trace. 
]\n\n"); -} - void debug_rt_mutex_lock(struct rt_mutex *lock) { } @@ -153,12 +60,10 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) { memset(waiter, 0x11, sizeof(*waiter)); - waiter->deadlock_task_pid = NULL; } void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { - put_pid(waiter->deadlock_task_pid); memset(waiter, 0x22, sizeof(*waiter)); } @@ -168,10 +73,8 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_cl * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lock->name = name; #ifdef CONFIG_DEBUG_LOCK_ALLOC lockdep_init_map(&lock->dep_map, name, key, 0); #endif } - diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 772c9b012b62..659e93e256c6 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -18,18 +18,9 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, - struct rt_mutex_waiter *waiter, - struct rt_mutex *lock); -extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, enum rtmutex_chainwalk walk) { return (waiter != NULL); } - -static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) -{ - debug_rt_mutex_print_deadlock(w); -} diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 11abc60d1478..4beca549aeeb 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -579,7 +579,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * walk, we detected a deadlock. */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { - debug_rt_mutex_deadlock(chwalk, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); ret = -EDEADLK; goto out_unlock_pi; @@ -1171,8 +1170,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, raw_spin_unlock_irq(&lock->wait_lock); - debug_rt_mutex_print_deadlock(waiter); - schedule(); raw_spin_lock_irq(&lock->wait_lock); @@ -1196,7 +1193,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, /* * Yell loudly and stop the task right here. 
*/ - rt_mutex_print_deadlock(w); + WARN(1, "rtmutex deadlock detected\n"); while (1) { set_current_state(TASK_INTERRUPTIBLE); schedule(); @@ -1704,8 +1701,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, ret = 0; } - debug_rt_mutex_print_deadlock(waiter); - return ret; } diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index 4dbdec15f1a0..d77cb8280aa6 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -18,13 +18,6 @@ #define debug_rt_mutex_proxy_unlock(l) do { } while (0) #define debug_rt_mutex_unlock(l) do { } while (0) #define debug_rt_mutex_init(m, n, k) do { } while (0) -#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) -#define debug_rt_mutex_print_deadlock(w) do { } while (0) - -static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) -{ - WARN(1, "rtmutex deadlock detected\n"); -} static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w, enum rtmutex_chainwalk walk) diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index aa047436dadf..badb2a2803aa 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -29,10 +29,6 @@ struct rt_mutex_waiter { struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex *lock; -#ifdef CONFIG_DEBUG_RT_MUTEXES - struct pid *deadlock_task_pid; - struct rt_mutex *deadlock_lock; -#endif int prio; u64 deadline; }; From 199cacd1a625cfc499d624b98b10dc763062f7dd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Mar 2021 16:29:33 +0100 Subject: [PATCH 30/44] locking/rtmutex: Consolidate rt_mutex_init() rt_mutex_init() only initializes lockdep if CONFIG_DEBUG_RT_MUTEXES is enabled, which is fine because all lockdep variants select it, but there is no reason to do so. Move the function outside of the CONFIG_DEBUG_RT_MUTEXES block which removes #ifdeffery. 
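From the caller's side nothing changes: every rt_mutex_init() invocation keeps getting its own static lock_class_key, with or without CONFIG_DEBUG_RT_MUTEXES. A minimal illustration (the lock and the init function are made-up names):

    static struct rt_mutex example_lock;

    static int __init example_init(void)
    {
            /*
             * Expands to __rt_mutex_init(&example_lock, __func__, &__key),
             * where __key is a static lock_class_key private to this call
             * site, so lockdep gets one class per init location.
             */
            rt_mutex_init(&example_lock);
            return 0;
    }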
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153943.437405350@linutronix.de --- include/linux/rtmutex.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 0725c4b45749..243fabc2c85f 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -43,6 +43,7 @@ struct hrtimer_sleeper; extern int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len); extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task); + extern void rt_mutex_debug_task_free(struct task_struct *tsk); #else static inline int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len) @@ -50,22 +51,15 @@ struct hrtimer_sleeper; return 0; } # define rt_mutex_debug_check_no_locks_held(task) do { } while (0) +# define rt_mutex_debug_task_free(t) do { } while (0) #endif -#ifdef CONFIG_DEBUG_RT_MUTEXES - -# define rt_mutex_init(mutex) \ +#define rt_mutex_init(mutex) \ do { \ static struct lock_class_key __key; \ __rt_mutex_init(mutex, __func__, &__key); \ } while (0) - extern void rt_mutex_debug_task_free(struct task_struct *tsk); -#else -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL, NULL) -# define rt_mutex_debug_task_free(t) do { } while (0) -#endif - #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \ , .dep_map = { .name = #mutexname } From 8188d74e68174b11ff7c4a635ffc8fd31eacc6b9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Mar 2021 16:29:34 +0100 Subject: [PATCH 31/44] locking/rtmutex: Remove empty and unused debug stubs No users or useless and therefore just ballast. 
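The one stub that survives on the !CONFIG_DEBUG_RT_MUTEXES side is turned from an empty macro into an empty static inline (both forms are visible in the diff below):

    static inline void rt_mutex_debug_task_free(struct task_struct *tsk) { }

    /* old form, which accepted any argument without type checking: */
    #define rt_mutex_debug_task_free(t)    do { } while (0)

An empty inline still type checks its argument and compiles away completely, so the conversion costs nothing.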
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153943.549192485@linutronix.de --- include/linux/rtmutex.h | 14 ++------------ kernel/locking/rtmutex-debug.c | 9 --------- kernel/locking/rtmutex-debug.h | 3 --- kernel/locking/rtmutex.c | 18 ------------------ kernel/locking/rtmutex.h | 2 -- 5 files changed, 2 insertions(+), 44 deletions(-) diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 243fabc2c85f..d1672de9ca89 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -40,18 +40,9 @@ struct rt_mutex_waiter; struct hrtimer_sleeper; #ifdef CONFIG_DEBUG_RT_MUTEXES - extern int rt_mutex_debug_check_no_locks_freed(const void *from, - unsigned long len); - extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task); - extern void rt_mutex_debug_task_free(struct task_struct *tsk); +extern void rt_mutex_debug_task_free(struct task_struct *tsk); #else - static inline int rt_mutex_debug_check_no_locks_freed(const void *from, - unsigned long len) - { - return 0; - } -# define rt_mutex_debug_check_no_locks_held(task) do { } while (0) -# define rt_mutex_debug_task_free(t) do { } while (0) +static inline void rt_mutex_debug_task_free(struct task_struct *tsk) { } #endif #define rt_mutex_init(mutex) \ @@ -88,7 +79,6 @@ static inline int rt_mutex_is_locked(struct rt_mutex *lock) } extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); -extern void rt_mutex_destroy(struct rt_mutex *lock); #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index fb150100335f..df584c91710b 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -38,20 +38,11 @@ void rt_mutex_debug_task_free(struct task_struct *task) DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } -void debug_rt_mutex_lock(struct rt_mutex *lock) -{ -} - void debug_rt_mutex_unlock(struct rt_mutex *lock) { DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } -void -debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) -{ -} - void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 659e93e256c6..392468d7253f 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -13,10 +13,7 @@ extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); -extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4beca549aeeb..96c7c537eab4 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -885,9 +885,6 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, raw_spin_unlock(&task->pi_lock); takeit: - /* We got the lock. 
*/ - debug_rt_mutex_lock(lock); - /* * This either preserves the RT_MUTEX_HAS_WAITERS bit if there * are still waiters or clears it. @@ -1580,20 +1577,6 @@ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) rt_mutex_postunlock(&wake_q); } -/** - * rt_mutex_destroy - mark a mutex unusable - * @lock: the mutex to be destroyed - * - * This function marks the mutex uninitialized, and any subsequent - * use of the mutex is forbidden. The mutex must not be locked when - * this function is called. - */ -void rt_mutex_destroy(struct rt_mutex *lock) -{ - WARN_ON(rt_mutex_is_locked(lock)); -} -EXPORT_SYMBOL_GPL(rt_mutex_destroy); - /** * __rt_mutex_init - initialize the rt_mutex * @@ -1635,7 +1618,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { __rt_mutex_init(lock, NULL, NULL); - debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); } diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index d77cb8280aa6..1e484abc94ae 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -13,8 +13,6 @@ #define debug_rt_mutex_init_waiter(w) do { } while (0) #define debug_rt_mutex_free_waiter(w) do { } while (0) -#define debug_rt_mutex_lock(l) do { } while (0) -#define debug_rt_mutex_proxy_lock(l,p) do { } while (0) #define debug_rt_mutex_proxy_unlock(l) do { } while (0) #define debug_rt_mutex_unlock(l) do { } while (0) #define debug_rt_mutex_init(m, n, k) do { } while (0) From fae37feee096bd3c85f6453713131a471404c6f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Mar 2021 16:29:35 +0100 Subject: [PATCH 32/44] locking/rtmutex: Move rt_mutex_debug_task_free() to rtmutex.c Prepare for removing the header maze. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153943.646359691@linutronix.de --- kernel/locking/rtmutex-debug.c | 6 ------ kernel/locking/rtmutex.c | 8 ++++++++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index df584c91710b..f1a83ec9c9b0 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -32,12 +32,6 @@ #include "rtmutex_common.h" -void rt_mutex_debug_task_free(struct task_struct *task) -{ - DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); - DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); -} - void debug_rt_mutex_unlock(struct rt_mutex *lock) { DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 96c7c537eab4..8a934db4bb1a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1813,3 +1813,11 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, return cleanup; } + +#ifdef CONFIG_DEBUG_RT_MUTEXES +void rt_mutex_debug_task_free(struct task_struct *task) +{ + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); +} +#endif From f7efc4799f8114ba85b68584f83293f435009de4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Mar 2021 16:29:36 +0100 Subject: [PATCH 33/44] locking/rtmutex: Inline chainwalk depth check There is no point for this wrapper at all. 
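The replacement relies on IS_ENABLED() expanding to a compile-time constant 1 or 0, so the compiler discards the unwanted branch while both branches are still parsed and type checked. A generic sketch of the pattern, with a hypothetical config option and function:

    /* CONFIG_DEBUG_FOO and want_full_walk() are illustrative only. */
    static bool want_full_walk(bool debug_request, bool normal_request)
    {
            if (IS_ENABLED(CONFIG_DEBUG_FOO))   /* folds to 1 or 0 at compile time */
                    return debug_request;       /* dropped entirely when the option is off */
            return normal_request;
    }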
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153943.754254046@linutronix.de --- kernel/locking/rtmutex.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8a934db4bb1a..2c77e729b820 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -343,14 +343,9 @@ static void rt_mutex_adjust_prio(struct task_struct *p) static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, enum rtmutex_chainwalk chwalk) { - /* - * This is just a wrapper function for the following call, - * because debug_rt_mutex_detect_deadlock() smells like a magic - * debug feature and I wanted to keep the cond function in the - * main source file along with the comments instead of having - * two of the same in the headers. - */ - return debug_rt_mutex_detect_deadlock(waiter, chwalk); + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEX)) + return waiter != NULL; + return chwalk == RT_MUTEX_FULL_CHAINWALK; } /* From 37350e3b2655eb0fce4e0e6ce8ce631c781136fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Mar 2021 16:29:37 +0100 Subject: [PATCH 34/44] locking/rtmutex: Remove pointless CONFIG_RT_MUTEXES=n stubs None of these functions are used when CONFIG_RT_MUTEXES=n. Remove the gunk. Remove pointless comments and clean up the coding style mess while at it. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153943.863379182@linutronix.de --- kernel/locking/rtmutex_common.h | 62 +++++++++++---------------------- 1 file changed, 20 insertions(+), 42 deletions(-) diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index badb2a2803aa..8596a71774e0 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -23,29 +23,30 @@ * @tree_entry: pi node to enqueue into the mutex waiters tree * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task + * @lock: Pointer to the rt_mutex on which the waiter blocks + * @prio: Priority of the waiter + * @deadline: Deadline of the waiter if applicable */ struct rt_mutex_waiter { - struct rb_node tree_entry; - struct rb_node pi_tree_entry; + struct rb_node tree_entry; + struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex *lock; - int prio; - u64 deadline; + int prio; + u64 deadline; }; /* - * Various helpers to access the waiters-tree: + * Must be guarded because this header is included from rcu/tree_plugin.h + * unconditionally. 
*/ - #ifdef CONFIG_RT_MUTEXES - static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } -static inline struct rt_mutex_waiter * -rt_mutex_top_waiter(struct rt_mutex *lock) +static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex *lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; @@ -62,42 +63,12 @@ static inline int task_has_pi_waiters(struct task_struct *p) return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } -static inline struct rt_mutex_waiter * -task_top_pi_waiter(struct task_struct *p) +static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) { - return rb_entry(p->pi_waiters.rb_leftmost, - struct rt_mutex_waiter, pi_tree_entry); + return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter, + pi_tree_entry); } -#else - -static inline int rt_mutex_has_waiters(struct rt_mutex *lock) -{ - return false; -} - -static inline struct rt_mutex_waiter * -rt_mutex_top_waiter(struct rt_mutex *lock) -{ - return NULL; -} - -static inline int task_has_pi_waiters(struct task_struct *p) -{ - return false; -} - -static inline struct rt_mutex_waiter * -task_top_pi_waiter(struct task_struct *p) -{ - return NULL; -} - -#endif - -/* - * lock->owner state tracking: - */ #define RT_MUTEX_HAS_WAITERS 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) @@ -106,6 +77,13 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } +#else /* CONFIG_RT_MUTEXES */ +/* Used in rcu/tree_plugin.h */ +static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +{ + return NULL; +} +#endif /* !CONFIG_RT_MUTEXES */ /* * Constants for rt mutex functions which have a selectable deadlock From f5a98866e506a816f6a855df1e7ed41e1891ec66 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Mar 2021 16:29:38 +0100 Subject: [PATCH 35/44] locking/rtmutex: Decrapify __rt_mutex_init() The conditional debug handling is just another layer of obfuscation. Split the function so rt_mutex_init_proxy_locked() can invoke the inner init and __rt_mutex_init() gets the full treatment. 
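Before the split the proxy path got the reduced initialization only indirectly, by calling __rt_mutex_init(lock, NULL, NULL) and relying on the 'if (name && key)' check to skip the debug part. The resulting shape, condensed from the diff below:

    void __rt_mutex_init(struct rt_mutex *lock, const char *name,
                         struct lock_class_key *key)
    {
            debug_check_no_locks_freed((void *)lock, sizeof(*lock));
            lockdep_init_map(&lock->dep_map, name, key, 0);
            __rt_mutex_basic_init(lock);    /* owner, wait_lock, waiters tree */
    }

    void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
                                    struct task_struct *proxy_owner)
    {
            __rt_mutex_basic_init(lock);    /* no debug/lockdep setup here */
            rt_mutex_set_owner(lock, proxy_owner);
    }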
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153943.955697588@linutronix.de --- kernel/locking/rtmutex.c | 10 ++++------ kernel/locking/rtmutex_common.h | 7 +++++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2c77e729b820..c9c2ab50c1d5 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1586,12 +1586,10 @@ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) { - lock->owner = NULL; - raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT_CACHED; + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); - if (name && key) - debug_rt_mutex_init(lock, name, key); + __rt_mutex_basic_init(lock); } EXPORT_SYMBOL_GPL(__rt_mutex_init); @@ -1612,7 +1610,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { - __rt_mutex_init(lock, NULL, NULL); + __rt_mutex_basic_init(lock); rt_mutex_set_owner(lock, proxy_owner); } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 8596a71774e0..41adf8be991b 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -100,6 +100,13 @@ enum rtmutex_chainwalk { RT_MUTEX_FULL_CHAINWALK, }; +static inline void __rt_mutex_basic_init(struct rt_mutex *lock) +{ + lock->owner = NULL; + raw_spin_lock_init(&lock->wait_lock); + lock->waiters = RB_ROOT_CACHED; +} + /* * PI-futex support (proxy locking functions, etc.): */ From f41dcc18698e660668a33cde8ab965e0298ac341 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Mar 2021 16:29:39 +0100 Subject: [PATCH 36/44] locking/rtmutex: Move debug functions as inlines into common header There is no value in having two header files providing just empty stubs and a C file which implements trivial debug functions which can just be inlined. 
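The waiter init/free helpers that move into the common header are, in effect, poison fills: 0x11 before use and 0x22 after use, so a read through a never-initialized or already-freed waiter shows up as an obviously bogus value instead of silently looking valid. Condensed from the diff below, with the same bodies:

    static inline void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
    {
            if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
                    memset(waiter, 0x11, sizeof(*waiter));  /* "not initialized" poison */
    }

    static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
    {
            if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
                    memset(waiter, 0x22, sizeof(*waiter));  /* "already freed" poison */
    }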
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153944.052454464@linutronix.de --- kernel/locking/Makefile | 2 - kernel/locking/rtmutex-debug.c | 65 --------------------------------- kernel/locking/rtmutex-debug.h | 23 ------------ kernel/locking/rtmutex.h | 24 ------------ kernel/locking/rtmutex_common.h | 30 ++++++++++++--- 5 files changed, 25 insertions(+), 119 deletions(-) delete mode 100644 kernel/locking/rtmutex-debug.c delete mode 100644 kernel/locking/rtmutex-debug.h delete mode 100644 kernel/locking/rtmutex.h diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8838f1d7c4a2..3572808223e4 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -12,7 +12,6 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) endif obj-$(CONFIG_DEBUG_IRQFLAGS) += irqflag-debug.o @@ -26,7 +25,6 @@ obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o -obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c deleted file mode 100644 index f1a83ec9c9b0..000000000000 --- a/kernel/locking/rtmutex-debug.c +++ /dev/null @@ -1,65 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2006 Timesys Corp., Thomas Gleixner - * - * This code is based on the rt.c implementation in the preempt-rt tree. 
- * Portions of said code are - * - * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey - * Copyright (C) 2006 Esben Nielsen - * Copyright (C) 2006 Kihon Technologies Inc., - * Steven Rostedt - * - * See rt.c in preempt-rt for proper credits and further information - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rtmutex_common.h" - -void debug_rt_mutex_unlock(struct rt_mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); -} - -void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); -} - -void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) -{ - memset(waiter, 0x11, sizeof(*waiter)); -} - -void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) -{ - memset(waiter, 0x22, sizeof(*waiter)); -} - -void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) -{ - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif -} diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h deleted file mode 100644 index 392468d7253f..000000000000 --- a/kernel/locking/rtmutex-debug.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner - * - * This file contains macros used solely by rtmutex.c. Debug version. - */ - -extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); -extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); -extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); -extern void debug_rt_mutex_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); - -static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - enum rtmutex_chainwalk walk) -{ - return (waiter != NULL); -} diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h deleted file mode 100644 index 1e484abc94ae..000000000000 --- a/kernel/locking/rtmutex.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner - * - * This file contains macros used solely by rtmutex.c. - * Non-debug version. 
- */ - -#define debug_rt_mutex_init_waiter(w) do { } while (0) -#define debug_rt_mutex_free_waiter(w) do { } while (0) -#define debug_rt_mutex_proxy_unlock(l) do { } while (0) -#define debug_rt_mutex_unlock(l) do { } while (0) -#define debug_rt_mutex_init(m, n, k) do { } while (0) - -static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w, - enum rtmutex_chainwalk walk) -{ - return walk == RT_MUTEX_FULL_CHAINWALK; -} diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 41adf8be991b..a90c22abdbca 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -13,6 +13,7 @@ #ifndef __KERNEL_RTMUTEX_COMMON_H #define __KERNEL_RTMUTEX_COMMON_H +#include #include #include @@ -135,10 +136,29 @@ extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, extern void rt_mutex_postunlock(struct wake_q_head *wake_q); -#ifdef CONFIG_DEBUG_RT_MUTEXES -# include "rtmutex-debug.h" -#else -# include "rtmutex.h" -#endif +/* Debug functions */ +static inline void debug_rt_mutex_unlock(struct rt_mutex *lock) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); +} + +static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); +} + +static inline void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + memset(waiter, 0x11, sizeof(*waiter)); +} + +static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) +{ + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) + memset(waiter, 0x22, sizeof(*waiter)); +} #endif From d7a2edb890c0bfe467140c0cd79fe7cf65249376 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Mar 2021 16:29:40 +0100 Subject: [PATCH 37/44] locking/rtmutex: Make text section and inlining consistent rtmutex is half __sched and the other half is not. If the compiler decides to not inline larger static functions then part of the code ends up in the regular text section. There are also quite some performance related small helpers which are either static or plain inline. Force inline those which make sense and mark the rest __sched. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153944.152977820@linutronix.de --- kernel/locking/rtmutex.c | 152 +++++++++++++++++++-------------------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c9c2ab50c1d5..36128211f589 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -49,7 +49,7 @@ * set this bit before looking at the lock. 
*/ -static void +static __always_inline void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) { unsigned long val = (unsigned long)owner; @@ -60,13 +60,13 @@ rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) WRITE_ONCE(lock->owner, (struct task_struct *)val); } -static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void clear_rt_mutex_waiters(struct rt_mutex *lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); } -static void fixup_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -149,7 +149,7 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) * all future threads that attempt to [Rmw] the lock to the slowpath. As such * relaxed semantics suffice. */ -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -165,8 +165,8 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - unsigned long flags) +static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); @@ -204,7 +204,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, # define rt_mutex_cmpxchg_acquire(l,c,n) (0) # define rt_mutex_cmpxchg_release(l,c,n) (0) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); @@ -213,8 +213,8 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. 
*/ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - unsigned long flags) +static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { lock->owner = NULL; @@ -229,9 +229,8 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, #define task_to_waiter(p) \ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } -static inline int -rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) { if (left->prio < right->prio) return 1; @@ -248,9 +247,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, return 0; } -static inline int -rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) { if (left->prio != right->prio) return 0; @@ -270,18 +268,18 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, #define __node_2_waiter(node) \ rb_entry((node), struct rt_mutex_waiter, tree_entry) -static inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) +static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) { return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b)); } -static void +static __always_inline void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less); } -static void +static __always_inline void rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { if (RB_EMPTY_NODE(&waiter->tree_entry)) @@ -294,18 +292,19 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) #define __node_2_pi_waiter(node) \ rb_entry((node), struct rt_mutex_waiter, pi_tree_entry) -static inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b) +static __always_inline bool +__pi_waiter_less(struct rb_node *a, const struct rb_node *b) { return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b)); } -static void +static __always_inline void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less); } -static void +static __always_inline void rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) @@ -315,7 +314,7 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) RB_CLEAR_NODE(&waiter->pi_tree_entry); } -static void rt_mutex_adjust_prio(struct task_struct *p) +static __always_inline void rt_mutex_adjust_prio(struct task_struct *p) { struct task_struct *pi_task = NULL; @@ -340,8 +339,9 @@ static void rt_mutex_adjust_prio(struct task_struct *p) * deadlock detection is disabled independent of the detect argument * and the config settings. 
*/ -static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, - enum rtmutex_chainwalk chwalk) +static __always_inline bool +rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, + enum rtmutex_chainwalk chwalk) { if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEX)) return waiter != NULL; @@ -353,7 +353,7 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, */ int max_lock_depth = 1024; -static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) +static __always_inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) { return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; } @@ -421,12 +421,12 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) * unlock(lock->wait_lock); release [L] * goto again; */ -static int rt_mutex_adjust_prio_chain(struct task_struct *task, - enum rtmutex_chainwalk chwalk, - struct rt_mutex *orig_lock, - struct rt_mutex *next_lock, - struct rt_mutex_waiter *orig_waiter, - struct task_struct *top_task) +static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, + enum rtmutex_chainwalk chwalk, + struct rt_mutex *orig_lock, + struct rt_mutex *next_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task) { struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; struct rt_mutex_waiter *prerequeue_top_waiter; @@ -778,8 +778,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * @waiter: The waiter that is queued to the lock's wait tree if the * callsite called task_blocked_on_lock(), otherwise NULL */ -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) +static int __sched +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) { lockdep_assert_held(&lock->wait_lock); @@ -896,10 +897,10 @@ takeit: * * This must be called with lock->wait_lock held and interrupts disabled */ -static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task, - enum rtmutex_chainwalk chwalk) +static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + enum rtmutex_chainwalk chwalk) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; @@ -985,8 +986,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * * Called with lock->wait_lock held and interrupts disabled. */ -static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, - struct rt_mutex *lock) +static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q, + struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; @@ -1035,8 +1036,8 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, * Must be called with lock->wait_lock held and interrupts disabled. I must * have just failed to try_to_take_rt_mutex(). 
*/ -static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) +static void __sched remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter) { bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); @@ -1093,7 +1094,7 @@ static void remove_waiter(struct rt_mutex *lock, * * Called from sched_setscheduler */ -void rt_mutex_adjust_pi(struct task_struct *task) +void __sched rt_mutex_adjust_pi(struct task_struct *task) { struct rt_mutex_waiter *waiter; struct rt_mutex *next_lock; @@ -1116,7 +1117,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) next_lock, NULL, task); } -void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +void __sched rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) { debug_rt_mutex_init_waiter(waiter); RB_CLEAR_NODE(&waiter->pi_tree_entry); @@ -1134,10 +1135,9 @@ void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) * * Must be called with lock->wait_lock held and interrupts disabled */ -static int __sched -__rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) +static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter) { int ret = 0; @@ -1172,8 +1172,8 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } -static void rt_mutex_handle_deadlock(int res, int detect_deadlock, - struct rt_mutex_waiter *w) +static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, + struct rt_mutex_waiter *w) { /* * If the result is not -EDEADLOCK or the caller requested @@ -1195,10 +1195,9 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, /* * Slow path lock function: */ -static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk) +static int __sched rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + enum rtmutex_chainwalk chwalk) { struct rt_mutex_waiter waiter; unsigned long flags; @@ -1257,7 +1256,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } -static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock) +static int __sched __rt_mutex_slowtrylock(struct rt_mutex *lock) { int ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1273,7 +1272,7 @@ static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock) /* * Slow path try-lock function: */ -static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) +static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock) { unsigned long flags; int ret; @@ -1371,7 +1370,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, * The atomic acquire/release ops are compiled away, when either the * architecture does not support cmpxchg or when debugging is enabled. */ -static inline int +static __always_inline int rt_mutex_fastlock(struct rt_mutex *lock, int state, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, @@ -1383,7 +1382,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); } -static inline int +static __always_inline int rt_mutex_fasttrylock(struct rt_mutex *lock, int (*slowfn)(struct rt_mutex *lock)) { @@ -1396,7 +1395,7 @@ rt_mutex_fasttrylock(struct rt_mutex *lock, /* * Performs the wakeup of the top-waiter and re-enables preemption. 
*/ -void rt_mutex_postunlock(struct wake_q_head *wake_q) +void __sched rt_mutex_postunlock(struct wake_q_head *wake_q) { wake_up_q(wake_q); @@ -1404,7 +1403,7 @@ void rt_mutex_postunlock(struct wake_q_head *wake_q) preempt_enable(); } -static inline void +static __always_inline void rt_mutex_fastunlock(struct rt_mutex *lock, bool (*slowfn)(struct rt_mutex *lock, struct wake_q_head *wqh)) @@ -1418,7 +1417,8 @@ rt_mutex_fastunlock(struct rt_mutex *lock, rt_mutex_postunlock(&wake_q); } -static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) +static __always_inline void __rt_mutex_lock(struct rt_mutex *lock, + unsigned int subclass) { might_sleep(); @@ -1536,7 +1536,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock); * @wake_q: The wake queue head from which to get the next lock waiter */ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wake_q) + struct wake_q_head *wake_q) { lockdep_assert_held(&lock->wait_lock); @@ -1583,7 +1583,7 @@ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) * * Initializing of a locked rt_mutex is not allowed */ -void __rt_mutex_init(struct rt_mutex *lock, const char *name, +void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) { debug_check_no_locks_freed((void *)lock, sizeof(*lock)); @@ -1607,8 +1607,8 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); * possible at this point because the pi_state which contains the rtmutex * is not yet visible to other tasks. */ -void rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner) +void __sched rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner) { __rt_mutex_basic_init(lock); rt_mutex_set_owner(lock, proxy_owner); @@ -1626,7 +1626,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, * possible because it belongs to the pi_state which is about to be freed * and it is not longer visible to other tasks. */ -void rt_mutex_proxy_unlock(struct rt_mutex *lock) +void __sched rt_mutex_proxy_unlock(struct rt_mutex *lock) { debug_rt_mutex_proxy_unlock(lock); rt_mutex_set_owner(lock, NULL); @@ -1651,9 +1651,9 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock) * * Special API call for PI-futex support. */ -int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task) +int __sched __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) { int ret; @@ -1698,9 +1698,9 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, * * Special API call for PI-futex support. 
*/ -int rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task) +int __sched rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) { int ret; @@ -1730,9 +1730,9 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, * * Special API call for PI-futex support */ -int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter) +int __sched rt_mutex_wait_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter) { int ret; @@ -1770,8 +1770,8 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, * * Special API call for PI-futex support */ -bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) +bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter) { bool cleanup = false; From 70c80103aafdeae99126694bc1cd54de016bc258 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Mar 2021 16:29:41 +0100 Subject: [PATCH 38/44] locking/rtmutex: Consolidate the fast/slowpath invocation The indirection via a function pointer (which is at least optimized into a tail call by the compiler) is making the code hard to read. Clean it up and move the futex related trylock functions down to the futex section. Move the wake_q wakeup into rt_mutex_slowunlock(). No point in handing it to the caller. The futex code uses a different function. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153944.247927548@linutronix.de --- kernel/locking/rtmutex.c | 146 ++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 86 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 36128211f589..7d0c16871033 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1298,14 +1298,25 @@ static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock) return ret; } +/* + * Performs the wakeup of the top-waiter and re-enables preemption. + */ +void __sched rt_mutex_postunlock(struct wake_q_head *wake_q) +{ + wake_up_q(wake_q); + + /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ + preempt_enable(); +} + /* * Slow path to release a rt-mutex. * * Return whether the current task needs to call rt_mutex_postunlock(). */ -static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, - struct wake_q_head *wake_q) +static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) { + DEFINE_WAKE_Q(wake_q); unsigned long flags; /* irqsave required to support early boot calls */ @@ -1347,7 +1358,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ if (unlock_rt_mutex_safe(lock, flags) == true) - return false; + return; /* Relock the rtmutex and try again */ raw_spin_lock_irqsave(&lock->wait_lock, flags); } @@ -1358,10 +1369,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, * * Queue the next waiter for wakeup once we release the wait_lock. 
*/ - mark_wakeup_next_waiter(wake_q, lock); + mark_wakeup_next_waiter(&wake_q, lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - return true; /* call rt_mutex_postunlock() */ + rt_mutex_postunlock(&wake_q); } /* @@ -1370,60 +1381,21 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, * The atomic acquire/release ops are compiled away, when either the * architecture does not support cmpxchg or when debugging is enabled. */ -static __always_inline int -rt_mutex_fastlock(struct rt_mutex *lock, int state, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk)) +static __always_inline int __rt_mutex_lock(struct rt_mutex *lock, long state, + unsigned int subclass) { + int ret; + + might_sleep(); + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 0; - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); -} - -static __always_inline int -rt_mutex_fasttrylock(struct rt_mutex *lock, - int (*slowfn)(struct rt_mutex *lock)) -{ - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 1; - - return slowfn(lock); -} - -/* - * Performs the wakeup of the top-waiter and re-enables preemption. - */ -void __sched rt_mutex_postunlock(struct wake_q_head *wake_q) -{ - wake_up_q(wake_q); - - /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ - preempt_enable(); -} - -static __always_inline void -rt_mutex_fastunlock(struct rt_mutex *lock, - bool (*slowfn)(struct rt_mutex *lock, - struct wake_q_head *wqh)) -{ - DEFINE_WAKE_Q(wake_q); - - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) - return; - - if (slowfn(lock, &wake_q)) - rt_mutex_postunlock(&wake_q); -} - -static __always_inline void __rt_mutex_lock(struct rt_mutex *lock, - unsigned int subclass) -{ - might_sleep(); - - mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); + ret = rt_mutex_slowlock(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -1435,7 +1407,7 @@ static __always_inline void __rt_mutex_lock(struct rt_mutex *lock, */ void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) { - __rt_mutex_lock(lock, subclass); + __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass); } EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); @@ -1448,7 +1420,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); */ void __sched rt_mutex_lock(struct rt_mutex *lock) { - __rt_mutex_lock(lock, 0); + __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock); #endif @@ -1464,42 +1436,21 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { - int ret; - - might_sleep(); - - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); - if (ret) - mutex_release(&lock->dep_map, _RET_IP_); - - return ret; + return __rt_mutex_lock(lock, TASK_INTERRUPTIBLE, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); -/* - * Futex variant, must not use fastpath. 
- */ -int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) -{ - return rt_mutex_slowtrylock(lock); -} - -int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) -{ - return __rt_mutex_slowtrylock(lock); -} - /** * rt_mutex_trylock - try to lock a rt_mutex * * @lock: the rt_mutex to be locked * - * This function can only be called in thread context. It's safe to - * call it from atomic regions, but not from hard interrupt or soft - * interrupt context. + * This function can only be called in thread context. It's safe to call it + * from atomic regions, but not from hard or soft interrupt context. * - * Returns 1 on success and 0 on contention + * Returns: + * 1 on success + * 0 on contention */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { @@ -1508,7 +1459,14 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) return 0; - ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); + /* + * No lockdep annotation required because lockdep disables the fast + * path. + */ + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + return 1; + + ret = rt_mutex_slowtrylock(lock); if (ret) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); @@ -1524,10 +1482,26 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); void __sched rt_mutex_unlock(struct rt_mutex *lock) { mutex_release(&lock->dep_map, _RET_IP_); - rt_mutex_fastunlock(lock, rt_mutex_slowunlock); + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + return; + + rt_mutex_slowunlock(lock); } EXPORT_SYMBOL_GPL(rt_mutex_unlock); +/* + * Futex variants, must not use fastpath. + */ +int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) +{ + return rt_mutex_slowtrylock(lock); +} + +int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) +{ + return __rt_mutex_slowtrylock(lock); +} + /** * __rt_mutex_futex_unlock - Futex variant, that since futex variants * do not use the fast-path, can be simple and will not need to retry. From 82cd5b1039e26b1b1254886464991e34de439ac5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Mar 2021 16:29:42 +0100 Subject: [PATCH 39/44] locking/rtmutex: Fix misleading comment in rt_mutex_postunlock() Preemption is disabled in mark_wakeup_next_waiter(,) not in rt_mutex_slowunlock(). Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153944.341734608@linutronix.de --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7d0c16871033..512b400bd961 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1305,7 +1305,7 @@ void __sched rt_mutex_postunlock(struct wake_q_head *wake_q) { wake_up_q(wake_q); - /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ + /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */ preempt_enable(); } From c2c360ed7f28fd6b7eb7e39e70af2d2ae405f466 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Mar 2021 16:29:43 +0100 Subject: [PATCH 40/44] locking/rtmutex: Restrict the trylock WARN_ON() to debug The warning as written is expensive and not really required for a production kernel. Make it depend on rt mutex debugging and use !in_task() for the condition which generates far better code and gives the same answer. 
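For reference, in_task() is a single preempt_count() mask test; simplified from include/linux/preempt.h (modulo the exact mask spelling there):

    #define in_task()    (!(preempt_count() & \
                            (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))

so !in_task() is true in NMI, hard interrupt and serving-softirq context, i.e. the same set of contexts as in_irq() || in_nmi() || in_serving_softirq(), but it needs only one test on the already loaded preempt_count.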
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153944.436565064@linutronix.de --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 512b400bd961..c68542dbbd0a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1456,7 +1456,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) { int ret; - if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) return 0; /* From a51a327f3bcdcb1a37ed9325ad07e1456cd4d426 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Mar 2021 16:29:44 +0100 Subject: [PATCH 41/44] locking/rtmutex: Clean up signal handling in __rt_mutex_slowlock() The signal handling in __rt_mutex_slowlock() is open coded. Use signal_pending_state() instead. Aside of the cleanup this also prepares for the RT lock substituions which require support for TASK_KILLABLE. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210326153944.533811987@linutronix.de --- kernel/locking/rtmutex.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c68542dbbd0a..406818196a9f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1146,18 +1146,13 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, if (try_to_take_rt_mutex(lock, current, waiter)) break; - /* - * TASK_INTERRUPTIBLE checks for signals and - * timeout. Ignored otherwise. - */ - if (likely(state == TASK_INTERRUPTIBLE)) { - /* Signal pending? */ - if (signal_pending(current)) - ret = -EINTR; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - if (ret) - break; + if (timeout && !timeout->task) { + ret = -ETIMEDOUT; + break; + } + if (signal_pending_state(state, current)) { + ret = -EINTR; + break; } raw_spin_unlock_irq(&lock->wait_lock); From 7d95f22798ecea513f37b792b39fec4bcf20fec3 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 26 Mar 2021 11:50:23 +0100 Subject: [PATCH 42/44] static_call: Fix unused variable warn w/o MODULE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Here is the warning converted as error and reported by GCC: kernel/static_call.c: In function ‘__static_call_update’: kernel/static_call.c:153:18: error: unused variable ‘mod’ [-Werror=unused-variable] 153 | struct module *mod = site_mod->mod; | ^~~ cc1: all warnings being treated as errors make[1]: *** [scripts/Makefile.build:271: kernel/static_call.o] Error 1 This is simply because since recently, we no longer use 'mod' variable elsewhere if MODULE is unset. When using 'make tinyconfig' to generate the default kconfig, MODULE is unset. There are different ways to fix this warning. Here I tried to minimised the number of modified lines and not add more #ifdef. We could also move the declaration of the 'mod' variable inside the if-statement or directly use site_mod->mod. 
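The same pattern in isolation (purely illustrative; the struct and use_mod() are made up):

    struct module;
    struct site { struct module *mod; };
    void use_mod(struct module *mod);

    static void before(struct site *s)
    {
            struct module *mod = s->mod;    /* -Wunused-variable when !CONFIG_MODULES */
    #ifdef CONFIG_MODULES
            if (mod)
                    use_mod(mod);
    #endif
    }

    static void after(struct site *s)
    {
            struct module *mod = s->mod;

            if (mod) {                      /* 'mod' is always read now */
    #ifdef CONFIG_MODULES
                    use_mod(mod);           /* only this part compiles away */
    #endif
            }
    }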
From 7d95f22798ecea513f37b792b39fec4bcf20fec3 Mon Sep 17 00:00:00 2001
From: Matthieu Baerts
Date: Fri, 26 Mar 2021 11:50:23 +0100
Subject: [PATCH 42/44] static_call: Fix unused variable warn w/o MODULE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Here is the warning, promoted to an error, as reported by GCC:

  kernel/static_call.c: In function ‘__static_call_update’:
  kernel/static_call.c:153:18: error: unused variable ‘mod’ [-Werror=unused-variable]
    153 |                 struct module *mod = site_mod->mod;
        |                                ^~~
  cc1: all warnings being treated as errors
  make[1]: *** [scripts/Makefile.build:271: kernel/static_call.o] Error 1

This is simply because, since a recent change, the 'mod' variable is no
longer used elsewhere when MODULE is unset. When using 'make tinyconfig'
to generate the default kconfig, MODULE is unset.

There are different ways to fix this warning. Here I tried to minimise the
number of modified lines and avoid adding more #ifdefs. We could also move
the declaration of the 'mod' variable inside the if-statement or directly
use site_mod->mod.

Fixes: 698bacefe993 ("static_call: Align static_call_is_init() patching condition")
Signed-off-by: Matthieu Baerts
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20210326105023.2058860-1-matthieu.baerts@tessares.net
---
 kernel/static_call.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/static_call.c b/kernel/static_call.c
index 2c5950b0b90e..723fcc9d20db 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -165,13 +165,13 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
 
 		stop = __stop_static_call_sites;
 
-#ifdef CONFIG_MODULES
 		if (mod) {
+#ifdef CONFIG_MODULES
 			stop = mod->static_call_sites +
 			       mod->num_static_call_sites;
 			init = mod->state == MODULE_STATE_COMING;
-		}
 #endif
+		}
 
 		for (site = site_mod->sites;
 		     site < stop && static_call_key(site) == key; site++) {

From 9432bbd969c667fc9c4b1c140c5a745ff2a7b540 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 23 Mar 2021 16:49:03 +0100
Subject: [PATCH 43/44] static_call: Relax static_call_update() function argument type

static_call_update() had stronger type requirements than regular C; relax
them to match. Instead of requiring that the @func argument have the exact
matching type, allow any type which C is willing to promote to the right
(function) pointer type. Specifically this allows (void *) arguments.

This cleans up a bunch of static_call_update() callers for
PREEMPT_DYNAMIC and should get around silly GCC11 warnings for free.

Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/YFoN7nCl8OfGtpeh@hirez.programming.kicks-ass.net
---
 include/linux/static_call.h |  4 ++--
 kernel/sched/core.c         | 18 +++++++++---------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index 85ecc789f4ff..8d50f62420ca 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -113,9 +113,9 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool
 
 #define static_call_update(name, func)					\
 ({									\
-	BUILD_BUG_ON(!__same_type(*(func), STATIC_CALL_TRAMP(name)));	\
+	typeof(&STATIC_CALL_TRAMP(name)) __F = (func);			\
 	__static_call_update(&STATIC_CALL_KEY(name),			\
-			     STATIC_CALL_TRAMP_ADDR(name), func);	\
+			     STATIC_CALL_TRAMP_ADDR(name), __F);	\
 })
 
 #ifdef CONFIG_HAVE_STATIC_CALL_INLINE
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 98191218d891..67f989001894 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5396,25 +5396,25 @@ static void sched_dynamic_update(int mode)
 	switch (mode) {
 	case preempt_dynamic_none:
 		static_call_update(cond_resched, __cond_resched);
-		static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
-		static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
-		static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
-		static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+		static_call_update(might_resched, (void *)&__static_call_return0);
+		static_call_update(preempt_schedule, NULL);
+		static_call_update(preempt_schedule_notrace, NULL);
+		static_call_update(irqentry_exit_cond_resched, NULL);
 		pr_info("Dynamic Preempt: none\n");
 		break;
 
 	case preempt_dynamic_voluntary:
 		static_call_update(cond_resched, __cond_resched);
 		static_call_update(might_resched, __cond_resched);
-		static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
-		static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
-		static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+		static_call_update(preempt_schedule, NULL);
+		static_call_update(preempt_schedule_notrace, NULL);
+		static_call_update(irqentry_exit_cond_resched, NULL);
 		pr_info("Dynamic Preempt: voluntary\n");
 		break;
 
 	case preempt_dynamic_full:
-		static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0);
-		static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+		static_call_update(cond_resched, (void *)&__static_call_return0);
+		static_call_update(might_resched, (void *)&__static_call_return0);
 		static_call_update(preempt_schedule, __preempt_schedule_func);
 		static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
 		static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
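The one-line typeof trick in the static_call.h hunk does the heavy lifting:
instead of a BUILD_BUG_ON() demanding the exact function type, the macro now
assigns @func to a correctly typed function-pointer temporary and lets the
compiler's ordinary assignment rules do the checking, which is why plain NULL
and (void *) casts become acceptable and the sched/core.c hunk above can drop
all the verbose typeof casts. A standalone sketch of the same idea in GNU C
(all names below are invented for illustration):

/* Stand-in for a static call trampoline's prototype. */
typedef int resched_fn_t(void);

static int my_cond_resched(void)
{
	return 0;
}

/*
 * Assignment-based type check: anything C will convert to the target
 * function-pointer type is accepted (an exact match, NULL, or - under the
 * compiler's relaxed GNU C rules - a (void *) cast), while an incompatible
 * prototype is diagnosed at the assignment.
 */
#define checked_take(fn)		\
({					\
	resched_fn_t *__F = (fn);	\
	__F;				\
})

int main(void)
{
	resched_fn_t *f;

	f = checked_take(my_cond_resched);		/* exact type: OK */
	f = checked_take(NULL);				/* null pointer: OK */
	f = checked_take((void *)my_cond_resched);	/* accepted as a GNU C extension */
	/* f = checked_take(1.5);			   rejected: not convertible */

	return f ? f() : 1;
}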
From f4abe9967c6fdb511ee567e129a014b60945ab93 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Wed, 21 Apr 2021 15:50:38 +0200
Subject: [PATCH 44/44] kcsan: Fix printk format string

Printing a 'long' variable using the '%d' format string is wrong and
causes a warning from gcc:

  kernel/kcsan/kcsan_test.c: In function 'nthreads_gen_params':
  include/linux/kern_levels.h:5:25: error: format '%d' expects argument of type 'int', but argument 3 has type 'long int' [-Werror=format=]

Use the appropriate format modifier.

Fixes: f6a149140321 ("kcsan: Switch to KUNIT_CASE_PARAM for parameterized tests")
Signed-off-by: Arnd Bergmann
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Marco Elver
Acked-by: Paul E. McKenney
Link: https://lkml.kernel.org/r/20210421135059.3371701-1-arnd@kernel.org
---
 kernel/kcsan/kcsan_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index b71751fc9f4f..8bcffbdef3d3 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -984,7 +984,7 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
 	const long min_required_cpus = 2 + min_unused_cpus;
 
 	if (num_online_cpus() < min_required_cpus) {
-		pr_err_once("Too few online CPUs (%u < %d) for test\n",
+		pr_err_once("Too few online CPUs (%u < %ld) for test\n",
 			    num_online_cpus(), min_required_cpus);
 		nthreads = 0;
 	} else if (nthreads >= num_online_cpus() - min_unused_cpus) {
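The same format-string rule applies outside the kernel: '%d' consumes an int
while a long needs '%ld', and passing the wrong width is undefined behaviour
that -Wformat flags. A minimal userspace analogue of the corrected call, with
made-up values standing in for the CPU counts:

#include <stdio.h>

int main(void)
{
	unsigned int online_cpus = 1;		/* stands in for num_online_cpus() */
	const long min_required_cpus = 3;

	/* '%u' matches unsigned int, '%ld' matches long. */
	printf("Too few online CPUs (%u < %ld) for test\n",
	       online_cpus, min_required_cpus);
	return 0;
}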