Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net

There were quite a few overlapping sets of changes here.

Daniel's bug fix for off-by-ones in the new BPF branch instructions,
along with the added allowances for "data_end > ptr + x" forms,
collided with the metadata additions.
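
For reference, here is a minimal sketch (not part of the merge; the
program and section names are illustrative only) of the two check shapes
involved: the long-standing "ptr + x > data_end" test and the
"data_end > ptr + x" form the verifier now also recognizes:

    #include <linux/bpf.h>
    #include <linux/if_ether.h>

    __attribute__((section("xdp"), used))
    int xdp_bounds_demo(struct xdp_md *ctx)
    {
        void *data     = (void *)(long)ctx->data;
        void *data_end = (void *)(long)ctx->data_end;
        struct ethhdr *eth = data;

        /* Classic shape: bail out unless sizeof(*eth) bytes are present. */
        if (data + sizeof(*eth) > data_end)
            return XDP_DROP;

        /* Strict shape now matched as well; the verifier tracks it as a
         * right-open range (the range_right_open handling below). */
        if (data_end > data + sizeof(*eth))
            return (eth->h_dest[0] & 1) ? XDP_DROP : XDP_PASS;

        return XDP_PASS;
    }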

Along with those three changes came verifier test cases, which in
their final form I tried to group together properly.  If I had just
trimmed GIT's conflict tags as-is, this would have split up the
meta tests unnecessarily.

In the sockmap code, a set of preemption-disabling changes
overlapped with the rename of bpf_compute_data_end() to
bpf_compute_data_pointers().
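
The merged fast path in smap_verdict_func() (see the sockmap hunk below)
keeps both sides, roughly:

    TCP_SKB_CB(skb)->bpf.map = NULL;
    skb->sk = psock->sock;
    bpf_compute_data_pointers(skb);  /* 'net-next' rename of bpf_compute_data_end() */
    preempt_disable();               /* preemption disabling from 'net' */
    rc = (*prog->bpf_func)(skb, prog->insnsi);
    preempt_enable();
    skb->sk = NULL;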

Changes were made to the mv88e6060.c driver's set_addr method,
which was removed in net-next.

The hyperv transport socket layer had a locking change in 'net'
which overlapped with a change of socket state macro usage
in 'net-next'.

Signed-off-by: David S. Miller <davem@davemloft.net>
commit f8ddadc4db by David S. Miller, 2017-10-22 13:36:53 +01:00
415 changed files with 4551 additions and 2007 deletions

kernel/bpf/arraymap.c

@ -102,7 +102,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
if (array_size >= U32_MAX - PAGE_SIZE ||
elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
bpf_array_alloc_percpu(array)) {
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}

kernel/bpf/devmap.c

@ -72,7 +72,7 @@ static LIST_HEAD(dev_map_list);
static u64 dev_map_bitmap_size(const union bpf_attr *attr)
{
return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
}
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@ -81,6 +81,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
int err = -EINVAL;
u64 cost;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
@ -114,8 +117,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
err = -ENOMEM;
/* A per cpu bitfield with a bit per possible net device */
dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
__alignof__(unsigned long));
dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
__alignof__(unsigned long),
GFP_KERNEL | __GFP_NOWARN);
if (!dtab->flush_needed)
goto free_dtab;

kernel/bpf/hashtab.c

@ -318,10 +318,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
goto free_htab;
if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
/* make sure the size for pcpu_alloc() is reasonable */
goto free_htab;
htab->elem_size = sizeof(struct htab_elem) +
round_up(htab->map.key_size, 8);
if (percpu)

kernel/bpf/sockmap.c

@ -39,6 +39,7 @@
#include <linux/workqueue.h>
#include <linux/list.h>
#include <net/strparser.h>
#include <net/tcp.h>
#define SOCK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@ -104,9 +105,16 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
return SK_DROP;
skb_orphan(skb);
/* We need to ensure that BPF metadata for maps is also cleared
* when we orphan the skb so that we don't have the possibility
* to reference a stale map.
*/
TCP_SKB_CB(skb)->bpf.map = NULL;
skb->sk = psock->sock;
bpf_compute_data_pointers(skb);
preempt_disable();
rc = (*prog->bpf_func)(skb, prog->insnsi);
preempt_enable();
skb->sk = NULL;
return rc;
@ -117,17 +125,10 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
struct sock *sk;
int rc;
/* Because we use per cpu values to feed input from sock redirect
* in BPF program to do_sk_redirect_map() call we need to ensure we
* are not preempted. RCU read lock is not sufficient in this case
* with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
*/
preempt_disable();
rc = smap_verdict_func(psock, skb);
switch (rc) {
case SK_REDIRECT:
sk = do_sk_redirect_map();
preempt_enable();
sk = do_sk_redirect_map(skb);
if (likely(sk)) {
struct smap_psock *peer = smap_psock_sk(sk);
@ -144,8 +145,6 @@ static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
/* Fall through and free skb otherwise */
case SK_DROP:
default:
if (rc != SK_REDIRECT)
preempt_enable();
kfree_skb(skb);
}
}
@ -490,6 +489,9 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
int err = -EINVAL;
u64 cost;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
@ -843,6 +845,12 @@ static int sock_map_update_elem(struct bpf_map *map,
return -EINVAL;
}
if (skops.sk->sk_type != SOCK_STREAM ||
skops.sk->sk_protocol != IPPROTO_TCP) {
fput(socket->file);
return -EOPNOTSUPP;
}
err = sock_map_ctx_update_elem(&skops, map, key, flags);
fput(socket->file);
return err;

kernel/bpf/verifier.c

@ -1006,7 +1006,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
/* ctx accesses must be at a fixed offset, so that we can
* determine what type of data were returned.
*/
if (!tnum_is_const(reg->var_off)) {
if (reg->off) {
verbose(env,
"dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
regno, reg->off, off - reg->off);
return -EACCES;
}
if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@ -1015,7 +1021,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
tn_buf, off, size);
return -EACCES;
}
off += reg->var_off.value;
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
/* ctx access returns either a scalar, or a
@ -2341,12 +2346,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
static void find_good_pkt_pointers(struct bpf_verifier_state *state,
struct bpf_reg_state *dst_reg,
enum bpf_reg_type type)
enum bpf_reg_type type,
bool range_right_open)
{
struct bpf_reg_state *regs = state->regs, *reg;
u16 new_range;
int i;
if (dst_reg->off < 0)
if (dst_reg->off < 0 ||
(dst_reg->off == 0 && range_right_open))
/* This doesn't give us any range */
return;
@ -2357,9 +2365,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
*/
return;
/* LLVM can generate four kind of checks:
new_range = dst_reg->off;
if (range_right_open)
new_range--;
/* Examples for register markings:
*
* Type 1/2:
* pkt_data in dst register:
*
* r2 = r3;
* r2 += 8;
@ -2376,7 +2388,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* r2=pkt(id=n,off=8,r=0)
* r3=pkt(id=n,off=0,r=0)
*
* Type 3/4:
* pkt_data in src register:
*
* r2 = r3;
* r2 += 8;
@ -2394,7 +2406,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* r3=pkt(id=n,off=0,r=0)
*
* Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
* so that range of bytes [r3, r3 + 8) is safe to access.
* or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8)
* and [r3, r3 + 8-1) respectively is safe to access depending on
* the check.
*/
/* If our ids match, then we must have the same max_value. And we
@ -2405,14 +2419,14 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == type && regs[i].id == dst_reg->id)
/* keep the maximum range already checked */
regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
regs[i].range = max(regs[i].range, new_range);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
if (reg->type == type && reg->id == dst_reg->id)
reg->range = max_t(u16, reg->range, dst_reg->off);
reg->range = max_t(u16, reg->range, new_range);
}
}
@ -2776,39 +2790,71 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET);
/* pkt_data' > pkt_end */
find_good_pkt_pointers(this_branch, dst_reg,
PTR_TO_PACKET, false);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET_END &&
regs[insn->src_reg].type == PTR_TO_PACKET) {
/* pkt_end > pkt_data' */
find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
PTR_TO_PACKET, true);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET);
/* pkt_data' < pkt_end */
find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET,
true);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
dst_reg->type == PTR_TO_PACKET_END &&
regs[insn->src_reg].type == PTR_TO_PACKET) {
/* pkt_end < pkt_data' */
find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
PTR_TO_PACKET, false);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
/* pkt_data' >= pkt_end */
find_good_pkt_pointers(this_branch, dst_reg,
PTR_TO_PACKET, true);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
dst_reg->type == PTR_TO_PACKET_END &&
regs[insn->src_reg].type == PTR_TO_PACKET) {
/* pkt_end >= pkt_data' */
find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
PTR_TO_PACKET);
PTR_TO_PACKET, false);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
/* pkt_data' <= pkt_end */
find_good_pkt_pointers(other_branch, dst_reg,
PTR_TO_PACKET, false);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
dst_reg->type == PTR_TO_PACKET_END &&
regs[insn->src_reg].type == PTR_TO_PACKET) {
/* pkt_end <= pkt_data' */
find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
PTR_TO_PACKET);
PTR_TO_PACKET, true);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET_META &&
reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META);
find_good_pkt_pointers(this_branch, dst_reg,
PTR_TO_PACKET_META, false);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
dst_reg->type == PTR_TO_PACKET_META &&
reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META);
find_good_pkt_pointers(other_branch, dst_reg,
PTR_TO_PACKET_META, false);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
regs[insn->src_reg].type == PTR_TO_PACKET_META) {
find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
PTR_TO_PACKET_META);
PTR_TO_PACKET_META, false);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
regs[insn->src_reg].type == PTR_TO_PACKET_META) {
find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
PTR_TO_PACKET_META);
PTR_TO_PACKET_META, false);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose(env, "R%d pointer comparison prohibited\n",
insn->dst_reg);

kernel/events/core.c

@ -662,7 +662,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
/*
* Do not update time when cgroup is not active
*/
if (cgrp == event->cgrp)
if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
__update_cgrp_time(event->cgrp);
}
@ -8966,6 +8966,14 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
static void free_pmu_context(struct pmu *pmu)
{
/*
* Static contexts such as perf_sw_context have a global lifetime
* and may be shared between different PMUs. Avoid freeing them
* when a single PMU is going away.
*/
if (pmu->task_ctx_nr > perf_invalid_context)
return;
mutex_lock(&pmus_lock);
free_percpu(pmu->pmu_cpu_context);
mutex_unlock(&pmus_lock);

kernel/exit.c

@ -1610,6 +1610,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
if (!infop)
return err;
if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
return -EFAULT;
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
@ -1735,6 +1738,9 @@ COMPAT_SYSCALL_DEFINE5(waitid,
if (!infop)
return err;
if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
return -EFAULT;
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);

kernel/fork.c

@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
#ifdef CONFIG_DEBUG_KMEMLEAK
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);
#endif
tsk->stack_vm_area = s;
return s->addr;
}

kernel/irq/chip.c

@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
irq_do_set_affinity(d, aff, false);
ret = __irq_startup(desc);
irq_set_affinity_locked(d, aff, false);
break;
case IRQ_STARTUP_ABORT:
return 0;

kernel/irq/cpuhotplug.c

@ -18,8 +18,34 @@
static inline bool irq_needs_fixup(struct irq_data *d)
{
const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
unsigned int cpu = smp_processor_id();
return cpumask_test_cpu(smp_processor_id(), m);
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
/*
* The cpumask_empty() check is a workaround for interrupt chips,
* which do not implement effective affinity, but the architecture has
* enabled the config switch. Use the general affinity mask instead.
*/
if (cpumask_empty(m))
m = irq_data_get_affinity_mask(d);
/*
* Sanity check. If the mask is not empty when excluding the outgoing
* CPU then it must contain at least one online CPU. The outgoing CPU
* has been removed from the online mask already.
*/
if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) {
/*
* If this happens then there was a missed IRQ fixup at some
* point. Warn about it and enforce fixup.
*/
pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n",
cpumask_pr_args(m), d->irq, cpu);
return true;
}
#endif
return cpumask_test_cpu(cpu, m);
}
static bool migrate_one_irq(struct irq_desc *desc)

kernel/irq/manage.c

@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_desc *desc)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
}
static void irq_validate_effective_affinity(struct irq_data *data)
{
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
struct irq_chip *chip = irq_data_get_irq_chip(data);
if (!cpumask_empty(m))
return;
pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
chip->name, data->irq);
#endif
}
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
struct irq_chip *chip = irq_data_get_irq_chip(data);
int ret;
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
ret = chip->irq_set_affinity(data, mask, force);
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
cpumask_copy(desc->irq_common_data.affinity, mask);
case IRQ_SET_MASK_OK_NOCOPY:
irq_validate_effective_affinity(data);
irq_set_thread_affinity(desc);
ret = 0;
}

kernel/livepatch/core.c

@ -830,6 +830,41 @@ int klp_register_patch(struct klp_patch *patch)
}
EXPORT_SYMBOL_GPL(klp_register_patch);
/*
* Remove parts of patches that touch a given kernel module. The list of
* patches processed might be limited. When limit is NULL, all patches
* will be handled.
*/
static void klp_cleanup_module_patches_limited(struct module *mod,
struct klp_patch *limit)
{
struct klp_patch *patch;
struct klp_object *obj;
list_for_each_entry(patch, &klp_patches, list) {
if (patch == limit)
break;
klp_for_each_object(patch, obj) {
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
/*
* Only unpatch the module if the patch is enabled or
* is in transition.
*/
if (patch->enabled || patch == klp_transition_patch) {
pr_notice("reverting patch '%s' on unloading module '%s'\n",
patch->mod->name, obj->mod->name);
klp_unpatch_object(obj);
}
klp_free_object_loaded(obj);
break;
}
}
}
int klp_module_coming(struct module *mod)
{
int ret;
@ -894,7 +929,7 @@ err:
pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
patch->mod->name, obj->mod->name, obj->mod->name);
mod->klp_alive = false;
klp_free_object_loaded(obj);
klp_cleanup_module_patches_limited(mod, patch);
mutex_unlock(&klp_mutex);
return ret;
@ -902,9 +937,6 @@ err:
void klp_module_going(struct module *mod)
{
struct klp_patch *patch;
struct klp_object *obj;
if (WARN_ON(mod->state != MODULE_STATE_GOING &&
mod->state != MODULE_STATE_COMING))
return;
@ -917,25 +949,7 @@ void klp_module_going(struct module *mod)
*/
mod->klp_alive = false;
list_for_each_entry(patch, &klp_patches, list) {
klp_for_each_object(patch, obj) {
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
/*
* Only unpatch the module if the patch is enabled or
* is in transition.
*/
if (patch->enabled || patch == klp_transition_patch) {
pr_notice("reverting patch '%s' on unloading module '%s'\n",
patch->mod->name, obj->mod->name);
klp_unpatch_object(obj);
}
klp_free_object_loaded(obj);
break;
}
}
klp_cleanup_module_patches_limited(mod, NULL);
mutex_unlock(&klp_mutex);
}

kernel/locking/lockdep.c

@ -1873,10 +1873,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next, int distance, struct stack_trace *trace,
int (*save)(struct stack_trace *trace))
{
struct lock_list *entry;
int ret;
struct lock_list this;
struct lock_list *uninitialized_var(target_entry);
struct lock_list *entry;
struct lock_list this;
int ret;
/*
* Prove that the new <prev> -> <next> dependency would not
@ -1890,8 +1890,17 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
this.class = hlock_class(next);
this.parent = NULL;
ret = check_noncircular(&this, hlock_class(prev), &target_entry);
if (unlikely(!ret))
if (unlikely(!ret)) {
if (!trace->entries) {
/*
* If @save fails here, the printing might trigger
* a WARN but because of the !nr_entries it should
* not do bad things.
*/
save(trace);
}
return print_circular_bug(&this, target_entry, next, prev, trace);
}
else if (unlikely(ret < 0))
return print_bfs_bug(ret);
@ -1938,7 +1947,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return print_bfs_bug(ret);
if (save && !save(trace))
if (!trace->entries && !save(trace))
return 0;
/*
@ -1958,20 +1967,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
if (!ret)
return 0;
/*
* Debugging printouts:
*/
if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
graph_unlock();
printk("\n new dependency: ");
print_lock_name(hlock_class(prev));
printk(KERN_CONT " => ");
print_lock_name(hlock_class(next));
printk(KERN_CONT "\n");
dump_stack();
if (!graph_lock())
return 0;
}
return 2;
}
@ -1986,8 +1981,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
int depth = curr->lockdep_depth;
struct held_lock *hlock;
struct stack_trace trace;
int (*save)(struct stack_trace *trace) = save_trace;
struct stack_trace trace = {
.nr_entries = 0,
.max_entries = 0,
.entries = NULL,
.skip = 0,
};
/*
* Debugging checks.
@ -2018,17 +2017,10 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
*/
if (hlock->read != 2 && hlock->check) {
int ret = check_prev_add(curr, hlock, next,
distance, &trace, save);
distance, &trace, save_trace);
if (!ret)
return 0;
/*
* Stop saving stack_trace if save_trace() was
* called at least once:
*/
if (save && ret == 2)
save = NULL;
/*
* Stop after the first non-trylock entry,
* as non-trylock entries have added their

kernel/rcu/srcutree.c

@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
/**
* call_srcu() - Queue a callback for invocation after an SRCU grace period
* @sp: srcu_struct in queue the callback
* @head: structure to be used for queueing the SRCU callback.
* @rhp: structure to be used for queueing the SRCU callback.
* @func: function to be invoked after the SRCU grace period
*
* The callback function will be invoked some time after a full SRCU

kernel/rcu/sync.c

@ -85,6 +85,9 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
}
/**
* rcu_sync_enter_start - Force readers onto slow path for multiple updates
* @rsp: Pointer to rcu_sync structure to use for synchronization
*
* Must be called after rcu_sync_init() and before first use.
*
* Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
@ -142,7 +145,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
/**
* rcu_sync_func() - Callback function managing reader access to fastpath
* @rsp: Pointer to rcu_sync structure to use for synchronization
* @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
*
* This function is passed to one of the call_rcu() functions by
* rcu_sync_exit(), so that it is invoked after a grace period following the
@ -158,9 +161,9 @@ void rcu_sync_enter(struct rcu_sync *rsp)
* rcu_sync_exit(). Otherwise, set all state back to idle so that readers
* can again use their fastpaths.
*/
static void rcu_sync_func(struct rcu_head *rcu)
static void rcu_sync_func(struct rcu_head *rhp)
{
struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
unsigned long flags;
BUG_ON(rsp->gp_state != GP_PASSED);

kernel/rcu/tree.c

@ -3097,9 +3097,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
* read-side critical sections have completed. call_rcu_sched() assumes
* that the read-side critical sections end on enabling of preemption
* or on voluntary preemption.
* RCU read-side critical sections are delimited by :
* - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
* - anything that disables preemption.
* RCU read-side critical sections are delimited by:
*
* - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
* - anything that disables preemption.
*
* These may be nested.
*
@ -3124,11 +3125,12 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
* handler. This means that read-side critical sections in process
* context must not be interrupted by softirqs. This interface is to be
* used when most of the read-side critical sections are in softirq context.
* RCU read-side critical sections are delimited by :
* - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
* OR
* - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
* These may be nested.
* RCU read-side critical sections are delimited by:
*
* - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR
* - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
*
* These may be nested.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.

kernel/sched/fair.c

@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
return 1;
}
struct llc_stats {
unsigned long nr_running;
unsigned long load;
unsigned long capacity;
int has_capacity;
};
/*
* The purpose of wake_affine() is to quickly determine on which CPU we can run
* soonest. For the purpose of speed we only consider the waking and previous
* CPU.
*
* wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
* will be) idle.
*
* wake_affine_weight() - considers the weight to reflect the average
* scheduling latency of the CPUs. This seems to work
* for the overloaded case.
*/
static bool get_llc_stats(struct llc_stats *stats, int cpu)
static bool
wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (idle_cpu(this_cpu))
return true;
if (!sds)
return false;
if (sync && cpu_rq(this_cpu)->nr_running == 1)
return true;
stats->nr_running = READ_ONCE(sds->nr_running);
stats->load = READ_ONCE(sds->load);
stats->capacity = READ_ONCE(sds->capacity);
stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
return true;
return false;
}
/*
* Can a task be moved from prev_cpu to this_cpu without causing a load
* imbalance that would trigger the load balancer?
*
* Since we're running on 'stale' values, we might in fact create an imbalance
* but recomputing these values is expensive, as that'd mean iteration 2 cache
* domains worth of CPUs.
*/
static bool
wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
struct llc_stats prev_stats, this_stats;
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
if (!get_llc_stats(&prev_stats, prev_cpu) ||
!get_llc_stats(&this_stats, this_cpu))
return false;
this_eff_load = target_load(this_cpu, sd->wake_idx);
prev_eff_load = source_load(prev_cpu, sd->wake_idx);
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current LLC.
*/
if (sync) {
unsigned long current_load = task_h_load(current);
/* in this case load hits 0 and this LLC is considered 'idle' */
if (current_load > this_stats.load)
if (current_load > this_eff_load)
return true;
this_stats.load -= current_load;
this_eff_load -= current_load;
}
/*
* The has_capacity stuff is not SMT aware, but by trying to balance
* the nr_running on both ends we try and fill the domain at equal
* rates, thereby first consuming cores before siblings.
*/
/* if the old cache has capacity, stay there */
if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
return false;
/* if this cache has capacity, come here */
if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
return true;
/*
* Check to see if we can move the load without causing too much
* imbalance.
*/
task_load = task_h_load(p);
this_eff_load = 100;
this_eff_load *= prev_stats.capacity;
this_eff_load += task_load;
if (sched_feat(WA_BIAS))
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= this_stats.capacity;
this_eff_load *= this_stats.load + task_load;
prev_eff_load *= prev_stats.load - task_load;
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= capacity_of(this_cpu);
return this_eff_load <= prev_eff_load;
}
@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
bool affine;
bool affine = false;
/*
* Default to no affine wakeups; wake_affine() should not effect a task
* placement the load-balancer feels inclined to undo. The conservative
* option is therefore to not move tasks when they wake up.
*/
affine = false;
if (sched_feat(WA_IDLE) && !affine)
affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
/*
* If the wakeup is across cache domains, try to evaluate if movement
* makes sense, otherwise rely on select_idle_siblings() to do
* placement inside the cache domain.
*/
if (!cpus_share_cache(prev_cpu, this_cpu))
affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
if (sched_feat(WA_WEIGHT) && !affine)
affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) {
@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain_shared *shared = env->sd->shared;
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
@ -7672,22 +7633,6 @@ next_group:
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
}
if (!shared)
return;
/*
* Since these are sums over groups they can contain some CPUs
* multiple times for the NUMA domains.
*
* Currently only wake_affine_llc() and find_busiest_group()
* uses these numbers, only the last is affected by this problem.
*
* XXX fix that.
*/
WRITE_ONCE(shared->nr_running, sds->total_running);
WRITE_ONCE(shared->load, sds->total_load);
WRITE_ONCE(shared->capacity, sds->total_capacity);
}
/**
@ -8097,6 +8042,13 @@ static int should_we_balance(struct lb_env *env)
struct sched_group *sg = env->sd->groups;
int cpu, balance_cpu = -1;
/*
* Ensure the balancing environment is consistent; can happen
* when the softirq triggers 'during' hotplug.
*/
if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
return 0;
/*
* In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.

kernel/sched/features.h

@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
SCHED_FEAT(WA_IDLE, true)
SCHED_FEAT(WA_WEIGHT, true)
SCHED_FEAT(WA_BIAS, true)

kernel/sched/membarrier.c

@ -18,6 +18,7 @@
#include <linux/membarrier.h>
#include <linux/tick.h>
#include <linux/cpumask.h>
#include <linux/atomic.h>
#include "sched.h" /* for cpu_rq(). */
@ -26,21 +27,26 @@
* except MEMBARRIER_CMD_QUERY.
*/
#define MEMBARRIER_CMD_BITMASK \
(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
static void ipi_mb(void *info)
{
smp_mb(); /* IPIs should be serializing but paranoid. */
}
static void membarrier_private_expedited(void)
static int membarrier_private_expedited(void)
{
int cpu;
bool fallback = false;
cpumask_var_t tmpmask;
if (!(atomic_read(&current->mm->membarrier_state)
& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
return -EPERM;
if (num_online_cpus() == 1)
return;
return 0;
/*
* Matches memory barriers around rq->curr modification in
@ -94,6 +100,24 @@ static void membarrier_private_expedited(void)
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
return 0;
}
static void membarrier_register_private_expedited(void)
{
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
/*
* We need to consider threads belonging to different thread
* groups, which use the same mm. (CLONE_VM but not
* CLONE_THREAD).
*/
if (atomic_read(&mm->membarrier_state)
& MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
return;
atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
&mm->membarrier_state);
}
/**
@ -144,7 +168,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
synchronize_sched();
return 0;
case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
membarrier_private_expedited();
return membarrier_private_expedited();
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
membarrier_register_private_expedited();
return 0;
default:
return -EINVAL;
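
A hedged userspace sketch (not part of this merge) of the registration
requirement visible in the hunk above: MEMBARRIER_CMD_PRIVATE_EXPEDITED
now returns -EPERM unless the process has first issued
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:

    #include <linux/membarrier.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    static int membarrier(int cmd, int flags)
    {
        return syscall(__NR_membarrier, cmd, flags);
    }

    int main(void)
    {
        /* Register intent once per mm; afterwards the expedited command works. */
        if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
            perror("MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED");

        /* Without the registration above, this would fail with EPERM. */
        if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
            perror("MEMBARRIER_CMD_PRIVATE_EXPEDITED");

        return 0;
    }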

kernel/seccomp.c

@ -473,7 +473,7 @@ static long seccomp_attach_filter(unsigned int flags,
return 0;
}
void __get_seccomp_filter(struct seccomp_filter *filter)
static void __get_seccomp_filter(struct seccomp_filter *filter)
{
/* Reference count is bounded by the number of total processes. */
refcount_inc(&filter->usage);