From 1689f25924ada8fe14a4a82c38925d04994c7142 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 28 Jun 2023 16:24:27 +0200 Subject: [PATCH 01/79] netfilter: nf_tables: report use refcount overflow Overflow use refcount checks are not complete. Add helper function to deal with object reference counter tracking. Report -EMFILE in case UINT_MAX is reached. nft_use_dec() splats in case that reference counter underflows, which should not ever happen. Add nft_use_inc_restore() and nft_use_dec_restore() which are used to restore reference counter from error and abort paths. Use u32 in nft_flowtable and nft_object since helper functions cannot work on bitfields. Remove the few early incomplete checks now that the helper functions are in place and used to check for refcount overflow. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 31 +++++- net/netfilter/nf_tables_api.c | 163 ++++++++++++++++++------------ net/netfilter/nft_flow_offload.c | 6 +- net/netfilter/nft_immediate.c | 8 +- net/netfilter/nft_objref.c | 8 +- 5 files changed, 141 insertions(+), 75 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 84f2fd85fd5a..640441a2f926 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1211,6 +1211,29 @@ int __nft_release_basechain(struct nft_ctx *ctx); unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); +static inline bool nft_use_inc(u32 *use) +{ + if (*use == UINT_MAX) + return false; + + (*use)++; + + return true; +} + +static inline void nft_use_dec(u32 *use) +{ + WARN_ON_ONCE((*use)-- == 0); +} + +/* For error and abort path: restore use counter to previous state. */ +static inline void nft_use_inc_restore(u32 *use) +{ + WARN_ON_ONCE(!nft_use_inc(use)); +} + +#define nft_use_dec_restore nft_use_dec + /** * struct nft_table - nf_tables table * @@ -1296,8 +1319,8 @@ struct nft_object { struct list_head list; struct rhlist_head rhlhead; struct nft_object_hash_key key; - u32 genmask:2, - use:30; + u32 genmask:2; + u32 use; u64 handle; u16 udlen; u8 *udata; @@ -1399,8 +1422,8 @@ struct nft_flowtable { char *name; int hooknum; int ops_len; - u32 genmask:2, - use:30; + u32 genmask:2; + u32 use; u64 handle; /* runtime data below here */ struct list_head hook_list ____cacheline_aligned; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9573a8fcad79..86b3c4de7f40 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -253,8 +253,10 @@ int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) if (chain->bound) return -EBUSY; + if (!nft_use_inc(&chain->use)) + return -EMFILE; + chain->bound = true; - chain->use++; nft_chain_trans_bind(ctx, chain); return 0; @@ -437,7 +439,7 @@ static int nft_delchain(struct nft_ctx *ctx) if (IS_ERR(trans)) return PTR_ERR(trans); - ctx->table->use--; + nft_use_dec(&ctx->table->use); nft_deactivate_next(ctx->net, ctx->chain); return 0; @@ -476,7 +478,7 @@ nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule) /* You cannot delete the same rule twice */ if (nft_is_active_next(ctx->net, rule)) { nft_deactivate_next(ctx->net, rule); - ctx->chain->use--; + nft_use_dec(&ctx->chain->use); return 0; } return -ENOENT; @@ -644,7 +646,7 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) nft_map_deactivate(ctx, set); nft_deactivate_next(ctx->net, set); - ctx->table->use--; + 
nft_use_dec(&ctx->table->use); return err; } @@ -676,7 +678,7 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj) return err; nft_deactivate_next(ctx->net, obj); - ctx->table->use--; + nft_use_dec(&ctx->table->use); return err; } @@ -711,7 +713,7 @@ static int nft_delflowtable(struct nft_ctx *ctx, return err; nft_deactivate_next(ctx->net, flowtable); - ctx->table->use--; + nft_use_dec(&ctx->table->use); return err; } @@ -2396,9 +2398,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_chain *chain; int err; - if (table->use == UINT_MAX) - return -EOVERFLOW; - if (nla[NFTA_CHAIN_HOOK]) { struct nft_stats __percpu *stats = NULL; struct nft_chain_hook hook = {}; @@ -2494,6 +2493,11 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (err < 0) goto err_destroy_chain; + if (!nft_use_inc(&table->use)) { + err = -EMFILE; + goto err_use; + } + trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (IS_ERR(trans)) { err = PTR_ERR(trans); @@ -2510,10 +2514,11 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err_unregister_hook; } - table->use++; - return 0; + err_unregister_hook: + nft_use_dec_restore(&table->use); +err_use: nf_tables_unregister_hook(net, table, chain); err_destroy_chain: nf_tables_chain_destroy(ctx); @@ -3840,9 +3845,6 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, return -EINVAL; handle = nf_tables_alloc_handle(table); - if (chain->use == UINT_MAX) - return -EOVERFLOW; - if (nla[NFTA_RULE_POSITION]) { pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); old_rule = __nft_rule_lookup(chain, pos_handle); @@ -3936,6 +3938,11 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } } + if (!nft_use_inc(&chain->use)) { + err = -EMFILE; + goto err_release_rule; + } + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { err = nft_delrule(&ctx, old_rule); if (err < 0) @@ -3967,7 +3974,6 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } } kvfree(expr_info); - chain->use++; if (flow) nft_trans_flow_rule(trans) = flow; @@ -3978,6 +3984,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, return 0; err_destroy_flow_rule: + nft_use_dec_restore(&chain->use); if (flow) nft_flow_rule_destroy(flow); err_release_rule: @@ -5014,9 +5021,15 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, alloc_size = sizeof(*set) + size + udlen; if (alloc_size < size || alloc_size > INT_MAX) return -ENOMEM; + + if (!nft_use_inc(&table->use)) + return -EMFILE; + set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT); - if (!set) - return -ENOMEM; + if (!set) { + err = -ENOMEM; + goto err_alloc; + } name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT); if (!name) { @@ -5074,7 +5087,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, goto err_set_expr_alloc; list_add_tail_rcu(&set->list, &table->sets); - table->use++; + return 0; err_set_expr_alloc: @@ -5086,6 +5099,9 @@ err_set_init: kfree(set->name); err_set_name: kvfree(set); +err_alloc: + nft_use_dec_restore(&table->use); + return err; } @@ -5224,9 +5240,6 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *i; struct nft_set_iter iter; - if (set->use == UINT_MAX) - return -EOVERFLOW; - if (!list_empty(&set->bindings) && nft_set_is_anonymous(set)) return -EBUSY; @@ -5254,10 +5267,12 @@ int nf_tables_bind_set(const 
struct nft_ctx *ctx, struct nft_set *set, return iter.err; } bind: + if (!nft_use_inc(&set->use)) + return -EMFILE; + binding->chain = ctx->chain; list_add_tail_rcu(&binding->list, &set->bindings); nft_set_trans_bind(ctx, set); - set->use++; return 0; } @@ -5331,7 +5346,7 @@ void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set) nft_clear(ctx->net, set); } - set->use++; + nft_use_inc_restore(&set->use); } EXPORT_SYMBOL_GPL(nf_tables_activate_set); @@ -5347,7 +5362,7 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, else list_del_rcu(&binding->list); - set->use--; + nft_use_dec(&set->use); break; case NFT_TRANS_PREPARE: if (nft_set_is_anonymous(set)) { @@ -5356,7 +5371,7 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, nft_deactivate_next(ctx->net, set); } - set->use--; + nft_use_dec(&set->use); return; case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: @@ -5364,7 +5379,7 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, set->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_deactivate(ctx, set); - set->use--; + nft_use_dec(&set->use); fallthrough; default: nf_tables_unbind_set(ctx, set, binding, @@ -6155,7 +6170,7 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext)); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) - (*nft_set_ext_obj(ext))->use--; + nft_use_dec(&(*nft_set_ext_obj(ext))->use); kfree(elem); } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); @@ -6657,8 +6672,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); + obj = NULL; goto err_parse_key_end; } + + if (!nft_use_inc(&obj->use)) { + err = -EMFILE; + obj = NULL; + goto err_parse_key_end; + } + err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF); if (err < 0) goto err_parse_key_end; @@ -6727,10 +6750,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (flags) *nft_set_ext_flags(ext) = flags; - if (obj) { + if (obj) *nft_set_ext_obj(ext) = obj; - obj->use++; - } + if (ulen > 0) { if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) { err = -EINVAL; @@ -6798,12 +6820,13 @@ err_element_clash: kfree(trans); err_elem_free: nf_tables_set_elem_destroy(ctx, set, elem.priv); - if (obj) - obj->use--; err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&elem.data.val, desc.type); err_parse_key_end: + if (obj) + nft_use_dec_restore(&obj->use); + nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); err_parse_key: nft_data_release(&elem.key.val, NFT_DATA_VALUE); @@ -6883,7 +6906,7 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; - chain->use++; + nft_use_inc_restore(&chain->use); break; } } @@ -6898,7 +6921,7 @@ static void nft_setelem_data_activate(const struct net *net, if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_hold(nft_set_ext_data(ext), set->dtype); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) - (*nft_set_ext_obj(ext))->use++; + nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); } static void nft_setelem_data_deactivate(const struct net *net, @@ -6910,7 +6933,7 @@ static void nft_setelem_data_deactivate(const struct net *net, if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) - (*nft_set_ext_obj(ext))->use--; + 
nft_use_dec(&(*nft_set_ext_obj(ext))->use); } static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, @@ -7453,9 +7476,14 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + if (!nft_use_inc(&table->use)) + return -EMFILE; + type = nft_obj_type_get(net, objtype); - if (IS_ERR(type)) - return PTR_ERR(type); + if (IS_ERR(type)) { + err = PTR_ERR(type); + goto err_type; + } obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]); if (IS_ERR(obj)) { @@ -7489,7 +7517,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, goto err_obj_ht; list_add_tail_rcu(&obj->list, &table->objects); - table->use++; + return 0; err_obj_ht: /* queued in transaction log */ @@ -7505,6 +7533,9 @@ err_strdup: kfree(obj); err_init: module_put(type->owner); +err_type: + nft_use_dec_restore(&table->use); + return err; } @@ -7906,7 +7937,7 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, case NFT_TRANS_PREPARE: case NFT_TRANS_ABORT: case NFT_TRANS_RELEASE: - flowtable->use--; + nft_use_dec(&flowtable->use); fallthrough; default: return; @@ -8260,9 +8291,14 @@ static int nf_tables_newflowtable(struct sk_buff *skb, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + if (!nft_use_inc(&table->use)) + return -EMFILE; + flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT); - if (!flowtable) - return -ENOMEM; + if (!flowtable) { + err = -ENOMEM; + goto flowtable_alloc; + } flowtable->table = table; flowtable->handle = nf_tables_alloc_handle(table); @@ -8317,7 +8353,6 @@ static int nf_tables_newflowtable(struct sk_buff *skb, goto err5; list_add_tail_rcu(&flowtable->list, &table->flowtables); - table->use++; return 0; err5: @@ -8334,6 +8369,9 @@ err2: kfree(flowtable->name); err1: kfree(flowtable); +flowtable_alloc: + nft_use_dec_restore(&table->use); + return err; } @@ -9713,7 +9751,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) */ if (nft_set_is_anonymous(nft_trans_set(trans)) && !list_empty(&nft_trans_set(trans)->bindings)) - trans->ctx.table->use--; + nft_use_dec(&trans->ctx.table->use); } nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_NEWSET, GFP_KERNEL); @@ -9943,7 +9981,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; } - trans->ctx.table->use--; + nft_use_dec_restore(&trans->ctx.table->use); nft_chain_del(trans->ctx.chain); nf_tables_unregister_hook(trans->ctx.net, trans->ctx.table, @@ -9956,7 +9994,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_splice(&nft_trans_chain_hooks(trans), &nft_trans_basechain(trans)->hook_list); } else { - trans->ctx.table->use++; + nft_use_inc_restore(&trans->ctx.table->use); nft_clear(trans->ctx.net, trans->ctx.chain); } nft_trans_destroy(trans); @@ -9966,7 +10004,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; } - trans->ctx.chain->use--; + nft_use_dec_restore(&trans->ctx.chain->use); list_del_rcu(&nft_trans_rule(trans)->list); nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans), @@ -9976,7 +10014,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_DELRULE: case NFT_MSG_DESTROYRULE: - trans->ctx.chain->use++; + nft_use_inc_restore(&trans->ctx.chain->use); nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_rule_expr_activate(&trans->ctx, 
nft_trans_rule(trans)); if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) @@ -9989,7 +10027,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; } - trans->ctx.table->use--; + nft_use_dec_restore(&trans->ctx.table->use); if (nft_trans_set_bound(trans)) { nft_trans_destroy(trans); break; @@ -9998,7 +10036,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_DELSET: case NFT_MSG_DESTROYSET: - trans->ctx.table->use++; + nft_use_inc_restore(&trans->ctx.table->use); nft_clear(trans->ctx.net, nft_trans_set(trans)); if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_activate(&trans->ctx, nft_trans_set(trans)); @@ -10042,13 +10080,13 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_obj_destroy(&trans->ctx, nft_trans_obj_newobj(trans)); nft_trans_destroy(trans); } else { - trans->ctx.table->use--; + nft_use_dec_restore(&trans->ctx.table->use); nft_obj_del(nft_trans_obj(trans)); } break; case NFT_MSG_DELOBJ: case NFT_MSG_DESTROYOBJ: - trans->ctx.table->use++; + nft_use_inc_restore(&trans->ctx.table->use); nft_clear(trans->ctx.net, nft_trans_obj(trans)); nft_trans_destroy(trans); break; @@ -10057,7 +10095,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable_hooks(trans)); } else { - trans->ctx.table->use--; + nft_use_dec_restore(&trans->ctx.table->use); list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, &nft_trans_flowtable(trans)->hook_list); @@ -10069,7 +10107,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) list_splice(&nft_trans_flowtable_hooks(trans), &nft_trans_flowtable(trans)->hook_list); } else { - trans->ctx.table->use++; + nft_use_inc_restore(&trans->ctx.table->use); nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); } nft_trans_destroy(trans); @@ -10518,8 +10556,9 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, if (desc->flags & NFT_DATA_DESC_SETELEM && chain->flags & NFT_CHAIN_BINDING) return -EINVAL; + if (!nft_use_inc(&chain->use)) + return -EMFILE; - chain->use++; data->verdict.chain = chain; break; } @@ -10537,7 +10576,7 @@ static void nft_verdict_uninit(const struct nft_data *data) case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; - chain->use--; + nft_use_dec(&chain->use); break; } } @@ -10706,11 +10745,11 @@ int __nft_release_basechain(struct nft_ctx *ctx) nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { list_del(&rule->list); - ctx->chain->use--; + nft_use_dec(&ctx->chain->use); nf_tables_rule_release(ctx, rule); } nft_chain_del(ctx->chain); - ctx->table->use--; + nft_use_dec(&ctx->table->use); nf_tables_chain_destroy(ctx); return 0; @@ -10760,18 +10799,18 @@ static void __nft_release_table(struct net *net, struct nft_table *table) ctx.chain = chain; list_for_each_entry_safe(rule, nr, &chain->rules, list) { list_del(&rule->list); - chain->use--; + nft_use_dec(&chain->use); nf_tables_rule_release(&ctx, rule); } } list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) { list_del(&flowtable->list); - table->use--; + nft_use_dec(&table->use); nf_tables_flowtable_destroy(flowtable); } list_for_each_entry_safe(set, ns, &table->sets, list) { list_del(&set->list); - table->use--; + nft_use_dec(&table->use); if (set->flags & 
(NFT_SET_MAP | NFT_SET_OBJECT)) nft_map_deactivate(&ctx, set); @@ -10779,13 +10818,13 @@ static void __nft_release_table(struct net *net, struct nft_table *table) } list_for_each_entry_safe(obj, ne, &table->objects, list) { nft_obj_del(obj); - table->use--; + nft_use_dec(&table->use); nft_obj_destroy(&ctx, obj); } list_for_each_entry_safe(chain, nc, &table->chains, list) { ctx.chain = chain; nft_chain_del(chain); - table->use--; + nft_use_dec(&table->use); nf_tables_chain_destroy(&ctx); } nf_tables_table_destroy(&ctx); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 5ef9146e74ad..ab3362c483b4 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -408,8 +408,10 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (IS_ERR(flowtable)) return PTR_ERR(flowtable); + if (!nft_use_inc(&flowtable->use)) + return -EMFILE; + priv->flowtable = flowtable; - flowtable->use++; return nf_ct_netns_get(ctx->net, ctx->family); } @@ -428,7 +430,7 @@ static void nft_flow_offload_activate(const struct nft_ctx *ctx, { struct nft_flow_offload *priv = nft_expr_priv(expr); - priv->flowtable->use++; + nft_use_inc_restore(&priv->flowtable->use); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 3d76ebfe8939..407d7197f75b 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -159,7 +159,7 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, default: nft_chain_del(chain); chain->bound = false; - chain->table->use--; + nft_use_dec(&chain->table->use); break; } break; @@ -198,7 +198,7 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, * let the transaction records release this chain and its rules. */ if (chain->bound) { - chain->use--; + nft_use_dec(&chain->use); break; } @@ -206,9 +206,9 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, chain_ctx = *ctx; chain_ctx.chain = chain; - chain->use--; + nft_use_dec(&chain->use); list_for_each_entry_safe(rule, n, &chain->rules, list) { - chain->use--; + nft_use_dec(&chain->use); list_del(&rule->list); nf_tables_rule_destroy(&chain_ctx, rule); } diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index a48dd5b5d45b..509011b1ef59 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -41,8 +41,10 @@ static int nft_objref_init(const struct nft_ctx *ctx, if (IS_ERR(obj)) return -ENOENT; + if (!nft_use_inc(&obj->use)) + return -EMFILE; + nft_objref_priv(expr) = obj; - obj->use++; return 0; } @@ -72,7 +74,7 @@ static void nft_objref_deactivate(const struct nft_ctx *ctx, if (phase == NFT_TRANS_COMMIT) return; - obj->use--; + nft_use_dec(&obj->use); } static void nft_objref_activate(const struct nft_ctx *ctx, { struct nft_object *obj = nft_objref_priv(expr); - obj->use++; + nft_use_inc_restore(&obj->use); } static const struct nft_expr_ops nft_objref_ops = { From 8a9dc07ba92497a81f1ff65d25c2ba7b6f9a8bdc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Jul 2023 13:43:18 +0200 Subject: [PATCH 02/79] netfilter: conntrack: gre: don't set assured flag for clash entries Now that the conntrack core is allowed to insert clashing entries, make sure GRE won't set the assured flag on NAT_CLASH entries, just like UDP. Doing so prevents early_drop logic for these entries.
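For illustration only, not part of the patch: a minimal user-space sketch of the snapshot-then-decide pattern the fix uses. The names (STATUS_*, maybe_set_assured) are hypothetical stand-ins for the kernel's ct->status bits and READ_ONCE().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define STATUS_SEEN_REPLY (1UL << 0)
#define STATUS_NAT_CLASH  (1UL << 1)
#define STATUS_ASSURED    (1UL << 2)

static bool maybe_set_assured(atomic_ulong *ct_status)
{
	/* One snapshot, like READ_ONCE(ct->status): all later checks see
	 * the same value even if another CPU updates the bits meanwhile. */
	unsigned long status = atomic_load(ct_status);

	if (!(status & STATUS_SEEN_REPLY))
		return false;

	/* Clash entries time out soon; never promote them to assured. */
	if (status & STATUS_NAT_CLASH)
		return false;

	atomic_fetch_or(ct_status, STATUS_ASSURED);
	return true;
}

int main(void)
{
	atomic_ulong st = STATUS_SEEN_REPLY | STATUS_NAT_CLASH;

	printf("assured: %d\n", (int)maybe_set_assured(&st)); /* prints 0 */
	return 0;
}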
Fixes: d671fd82eaa9 ("netfilter: conntrack: allow insertion clash of gre protocol") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_gre.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index ad6f0ca40cd2..af369e686fc5 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -205,6 +205,8 @@ int nf_conntrack_gre_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { + unsigned long status; + if (!nf_ct_is_confirmed(ct)) { unsigned int *timeouts = nf_ct_timeout_lookup(ct); @@ -217,11 +219,17 @@ int nf_conntrack_gre_packet(struct nf_conn *ct, ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED]; } + status = READ_ONCE(ct->status); /* If we've seen traffic both ways, this is a GRE connection. * Extend timeout. */ - if (ct->status & IPS_SEEN_REPLY) { + if (status & IPS_SEEN_REPLY) { nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.stream_timeout); + + /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ + if (unlikely((status & IPS_NAT_CLASH))) + return NF_ACCEPT; + /* Also, more likely to be important, and not a probe. */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); From 6eef7a2b933885a17679eb8ed0796ddf0ee5309b Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 3 Jul 2023 16:52:16 +0200 Subject: [PATCH 03/79] netfilter: conntrack: Avoid nf_ct_helper_hash uses after free If nf_conntrack_init_start() fails (for example due to a register_nf_conntrack_bpf() failure), the nf_conntrack_helper_fini() clean-up path frees the nf_ct_helper_hash map. When built with NF_CONNTRACK=y, further netfilter modules (e.g. nf_conntrack_ftp) can still be loaded and call nf_conntrack_helpers_register(), independently of whether nf_conntrack initialized correctly. This accesses the nf_ct_helper_hash dangling pointer and causes a use-after-free (UAF), possibly leading to random memory corruption. This patch guards nf_conntrack_helper_register() from accessing a freed or uninitialized nf_ct_helper_hash pointer and fixes possible use-after-free bugs when loading a conntrack module.
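For illustration only, not part of the patch: a minimal user-space sketch of the free-and-NULL idiom the fix applies, with hypothetical names standing in for nf_ct_helper_hash and its init/fini/register functions.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static struct helper_entry { int dummy; } *helper_hash;

static int helper_hash_init(void)
{
	helper_hash = calloc(64, sizeof(*helper_hash));
	return helper_hash ? 0 : -ENOMEM;
}

static void helper_hash_fini(void)
{
	free(helper_hash);
	helper_hash = NULL;	/* key step: no dangling pointer left behind */
}

static int helper_register(void)
{
	if (!helper_hash)	/* guard: init failed or fini already ran */
		return -ENOENT;
	/* ... insert into the hash ... */
	return 0;
}

int main(void)
{
	helper_hash_init();
	helper_hash_fini();
	printf("register after fini: %d\n", helper_register()); /* -ENOENT */
	return 0;
}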
Cc: stable@vger.kernel.org Fixes: 12f7a505331e ("netfilter: add user-space connection tracking helper infrastructure") Signed-off-by: Florent Revest Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_helper.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 0c4db2f2ac43..f22691f83853 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -360,6 +360,9 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); + if (!nf_ct_helper_hash) + return -ENOENT; + if (me->expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) return -EINVAL; @@ -515,4 +518,5 @@ int nf_conntrack_helper_init(void) void nf_conntrack_helper_fini(void) { kvfree(nf_ct_helper_hash); + nf_ct_helper_hash = NULL; } From eaf9e7192ec9af2fbf1b6eb2299dd0feca6c5f7e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 4 Jul 2023 12:25:23 +0200 Subject: [PATCH 04/79] netfilter: conntrack: don't fold port numbers into addresses before hashing Originally this used jhash2() over the tuple and folded the zone id, the pernet hash value, destination port and l4 protocol number into the 32bit seed value. When the switch to siphash was done, I used an on-stack temporary buffer to build a suitable key to be hashed via siphash(). But this showed up as a performance regression, so I got rid of the temporary copy and collected the to-be-hashed data in 4 u64 variables. This makes it easy to build tuples that produce the same hash, which isn't desirable even though chain lengths are limited. Switch back to plain siphash, but just like with jhash2(), take advantage of the fact that most of the to-be-hashed data is already in a suitable order. Use an empty struct as annotation in 'struct nf_conntrack_tuple' to mark the last member that can be used as hash input. The only remaining data that isn't present in the tuple structure are the zone identifier and the pernet hash: fold those into the key. Fixes: d2c806abcf0b ("netfilter: conntrack: use siphash_4u64") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_tuple.h | 3 +++ net/netfilter/nf_conntrack_core.c | 20 +++++++------------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h index 9334371c94e2..f7dd950ff250 100644 --- a/include/net/netfilter/nf_conntrack_tuple.h +++ b/include/net/netfilter/nf_conntrack_tuple.h @@ -67,6 +67,9 @@ struct nf_conntrack_tuple { /* The protocol.
*/ u_int8_t protonum; + /* The direction must be ignored for the tuplehash */ + struct { } __nfct_hash_offsetend; + /* The direction (for tuplehash) */ u_int8_t dir; } dst; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index d119f1d4c2fc..992393102d5f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -211,24 +211,18 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, unsigned int zoneid, const struct net *net) { - u64 a, b, c, d; + siphash_key_t key; get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); - /* The direction must be ignored, handle usable tuplehash members manually */ - a = (u64)tuple->src.u3.all[0] << 32 | tuple->src.u3.all[3]; - b = (u64)tuple->dst.u3.all[0] << 32 | tuple->dst.u3.all[3]; + key = nf_conntrack_hash_rnd; - c = (__force u64)tuple->src.u.all << 32 | (__force u64)tuple->dst.u.all << 16; - c |= tuple->dst.protonum; + key.key[0] ^= zoneid; + key.key[1] ^= net_hash_mix(net); - d = (u64)zoneid << 32 | net_hash_mix(net); - - /* IPv4: u3.all[1,2,3] == 0 */ - c ^= (u64)tuple->src.u3.all[1] << 32 | tuple->src.u3.all[2]; - d += (u64)tuple->dst.u3.all[1] << 32 | tuple->dst.u3.all[2]; - - return (u32)siphash_4u64(a, b, c, d, &nf_conntrack_hash_rnd); + return siphash((void *)tuple, + offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend), + &key); } static u32 scale_hash(u32 hash) From 515ad530795c118f012539ed76d02bacfd426d89 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 5 Jul 2023 09:12:55 -0300 Subject: [PATCH 05/79] netfilter: nf_tables: do not ignore genmask when looking up chain by id When adding a rule to a chain referring to its ID, if that chain had been deleted on the same batch, the rule might end up referring to a deleted chain. This will lead to a WARNING like following: [ 33.098431] ------------[ cut here ]------------ [ 33.098678] WARNING: CPU: 5 PID: 69 at net/netfilter/nf_tables_api.c:2037 nf_tables_chain_destroy+0x23d/0x260 [ 33.099217] Modules linked in: [ 33.099388] CPU: 5 PID: 69 Comm: kworker/5:1 Not tainted 6.4.0+ #409 [ 33.099726] Workqueue: events nf_tables_trans_destroy_work [ 33.100018] RIP: 0010:nf_tables_chain_destroy+0x23d/0x260 [ 33.100306] Code: 8b 7c 24 68 e8 64 9c ed fe 4c 89 e7 e8 5c 9c ed fe 48 83 c4 08 5b 41 5c 41 5d 41 5e 41 5f 5d 31 c0 89 c6 89 c7 c3 cc cc cc cc <0f> 0b 48 83 c4 08 5b 41 5c 41 5d 41 5e 41 5f 5d 31 c0 89 c6 89 c7 [ 33.101271] RSP: 0018:ffffc900004ffc48 EFLAGS: 00010202 [ 33.101546] RAX: 0000000000000001 RBX: ffff888006fc0a28 RCX: 0000000000000000 [ 33.101920] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 33.102649] RBP: ffffc900004ffc78 R08: 0000000000000000 R09: 0000000000000000 [ 33.103018] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8880135ef500 [ 33.103385] R13: 0000000000000000 R14: dead000000000122 R15: ffff888006fc0a10 [ 33.103762] FS: 0000000000000000(0000) GS:ffff888024c80000(0000) knlGS:0000000000000000 [ 33.104184] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 33.104493] CR2: 00007fe863b56a50 CR3: 00000000124b0001 CR4: 0000000000770ee0 [ 33.104872] PKRU: 55555554 [ 33.104999] Call Trace: [ 33.105113] [ 33.105214] ? show_regs+0x72/0x90 [ 33.105371] ? __warn+0xa5/0x210 [ 33.105520] ? nf_tables_chain_destroy+0x23d/0x260 [ 33.105732] ? report_bug+0x1f2/0x200 [ 33.105902] ? handle_bug+0x46/0x90 [ 33.106546] ? exc_invalid_op+0x19/0x50 [ 33.106762] ? asm_exc_invalid_op+0x1b/0x20 [ 33.106995] ? 
nf_tables_chain_destroy+0x23d/0x260 [ 33.107249] ? nf_tables_chain_destroy+0x30/0x260 [ 33.107506] nf_tables_trans_destroy_work+0x669/0x680 [ 33.107782] ? mark_held_locks+0x28/0xa0 [ 33.107996] ? __pfx_nf_tables_trans_destroy_work+0x10/0x10 [ 33.108294] ? _raw_spin_unlock_irq+0x28/0x70 [ 33.108538] process_one_work+0x68c/0xb70 [ 33.108755] ? lock_acquire+0x17f/0x420 [ 33.108977] ? __pfx_process_one_work+0x10/0x10 [ 33.109218] ? do_raw_spin_lock+0x128/0x1d0 [ 33.109435] ? _raw_spin_lock_irq+0x71/0x80 [ 33.109634] worker_thread+0x2bd/0x700 [ 33.109817] ? __pfx_worker_thread+0x10/0x10 [ 33.110254] kthread+0x18b/0x1d0 [ 33.110410] ? __pfx_kthread+0x10/0x10 [ 33.110581] ret_from_fork+0x29/0x50 [ 33.110757] [ 33.110866] irq event stamp: 1651 [ 33.111017] hardirqs last enabled at (1659): [] __up_console_sem+0x79/0xa0 [ 33.111379] hardirqs last disabled at (1666): [] __up_console_sem+0x5e/0xa0 [ 33.111740] softirqs last enabled at (1616): [] __irq_exit_rcu+0x9e/0xe0 [ 33.112094] softirqs last disabled at (1367): [] __irq_exit_rcu+0x9e/0xe0 [ 33.112453] ---[ end trace 0000000000000000 ]--- This is due to nft_chain_lookup_byid() ignoring the genmask. After this change, adding the new rule will fail as it will not find the chain. Fixes: 837830a4b439 ("netfilter: nf_tables: add NFTA_RULE_CHAIN_ID attribute") Cc: stable@vger.kernel.org Reported-by: Mingi Cho of Theori working with ZDI Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 86b3c4de7f40..237f739da3ca 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2699,7 +2699,7 @@ err_hooks: static struct nft_chain *nft_chain_lookup_byid(const struct net *net, const struct nft_table *table, - const struct nlattr *nla) + const struct nlattr *nla, u8 genmask) { struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); @@ -2710,7 +2710,8 @@ static struct nft_chain *nft_chain_lookup_byid(const struct net *net, if (trans->msg_type == NFT_MSG_NEWCHAIN && chain->table == table && id == nft_trans_chain_id(trans) && + nft_active_genmask(chain, genmask)) return chain; } return ERR_PTR(-ENOENT); @@ -3814,7 +3815,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, return -EOPNOTSUPP; } else if (nla[NFTA_RULE_CHAIN_ID]) { - chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID]); + chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]); return PTR_ERR(chain); @@ -10540,7 +10542,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, genmask); } else if (tb[NFTA_VERDICT_CHAIN_ID]) { chain = nft_chain_lookup_byid(ctx->net, ctx->table, - tb[NFTA_VERDICT_CHAIN_ID]); + tb[NFTA_VERDICT_CHAIN_ID], + genmask); if (IS_ERR(chain)) return PTR_ERR(chain); } else { From 5f16da6ee6ac32e6c8098bc4cfcc4f170694f9da Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 9 Jun 2023 17:40:23 -0700 Subject: [PATCH 06/79] ice: Fix max_rate check while configuring TX rate limits Remove the incorrect check in ice_validate_mqprio_qopt() that limits filter configuration when the sum of the max_rates of all TCs exceeds the link speed.
The max rate of each TC is unrelated to the values used by other TCs and is valid as long as it is less than the link speed. Fixes: fbc7b27af0f9 ("ice: enable ndo_setup_tc support for mqprio_qdisc") Signed-off-by: Sridhar Samudrala Signed-off-by: Sudheer Mogilappagari Tested-by: Bharathi Sreenivas Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 93979ab18bc1..64efe4c83a3e 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -7872,10 +7872,10 @@ static int ice_validate_mqprio_qopt(struct ice_vsi *vsi, struct tc_mqprio_qopt_offload *mqprio_qopt) { - u64 sum_max_rate = 0, sum_min_rate = 0; int non_power_of_2_qcount = 0; struct ice_pf *pf = vsi->back; int max_rss_q_cnt = 0; + u64 sum_min_rate = 0; struct device *dev; int i, speed; u8 num_tc; @@ -7891,6 +7891,7 @@ ice_validate_mqprio_qopt(struct ice_vsi *vsi, dev = ice_pf_to_dev(pf); vsi->ch_rss_size = 0; num_tc = mqprio_qopt->qopt.num_tc; + speed = ice_get_link_speed_kbps(vsi); for (i = 0; num_tc; i++) { int qcount = mqprio_qopt->qopt.count[i]; @@ -7931,7 +7932,6 @@ ice_validate_mqprio_qopt(struct ice_vsi *vsi, */ max_rate = mqprio_qopt->max_rate[i]; max_rate = div_u64(max_rate, ICE_BW_KBPS_DIVISOR); - sum_max_rate += max_rate; /* min_rate is minimum guaranteed rate and it can't be zero */ min_rate = mqprio_qopt->min_rate[i]; @@ -7944,6 +7944,12 @@ ice_validate_mqprio_qopt(struct ice_vsi *vsi, return -EINVAL; } + if (max_rate && max_rate > speed) { + dev_err(dev, "TC%d: max_rate(%llu Kbps) > link speed of %u Kbps\n", + i, max_rate, speed); + return -EINVAL; + } + iter_div_u64_rem(min_rate, ICE_MIN_BW_LIMIT, &rem); if (rem) { dev_err(dev, "TC%d: Min Rate not multiple of %u Kbps", @@ -7981,12 +7987,6 @@ ice_validate_mqprio_qopt(struct ice_vsi *vsi, (mqprio_qopt->qopt.offset[i] + mqprio_qopt->qopt.count[i])) return -EINVAL; - speed = ice_get_link_speed_kbps(vsi); - if (sum_max_rate && sum_max_rate > (u64)speed) { - dev_err(dev, "Invalid max Tx rate(%llu) Kbps > speed(%u) Kbps specified\n", - sum_max_rate, speed); - return -EINVAL; - } if (sum_min_rate && sum_min_rate > (u64)speed) { dev_err(dev, "Invalid min Tx rate(%llu) Kbps > speed (%u) Kbps specified\n", sum_min_rate, speed); From 479cdfe388a04a16fdd127f3e9e9e019e45e5573 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 9 Jun 2023 17:40:24 -0700 Subject: [PATCH 07/79] ice: Fix tx queue rate limit when TCs are configured Configuring tx_maxrate via sysfs interface /sys/class/net/eth0/queues/tx-1/tx_maxrate was not working when TCs are configured because the main VSI was always being used. Fix by using the correct VSI in ice_set_tx_maxrate() when TCs are configured.
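For illustration only, not part of the patch: a minimal user-space sketch of the queue-to-TC lookup that ice_locate_vsi_using_queue() performs, where each TC owns the half-open queue range [offset[tc], offset[tc] + count[tc]). The layout values here are made up.

#include <stdio.h>

static int tc_of_queue(const int *offset, const int *count, int num_tc, int queue)
{
	for (int tc = 0; tc < num_tc; tc++) {
		if (queue >= offset[tc] && queue < offset[tc] + count[tc])
			return tc;	/* this TC's channel VSI owns the queue */
	}
	return -1;	/* queue not covered by any TC */
}

int main(void)
{
	int offset[] = { 0, 4, 8 };	/* hypothetical mqprio layout */
	int count[]  = { 4, 4, 8 };

	printf("queue 5 -> TC %d\n", tc_of_queue(offset, count, 3, 5)); /* TC 1 */
	return 0;
}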
Fixes: 1ddef455f4a8 ("ice: Add NDO callback to set the maximum per-queue bitrate") Signed-off-by: Sridhar Samudrala Signed-off-by: Sudheer Mogilappagari Tested-by: Bharathi Sreenivas Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_main.c | 7 +++++++ drivers/net/ethernet/intel/ice/ice_tc_lib.c | 22 ++++++++++----------- drivers/net/ethernet/intel/ice/ice_tc_lib.h | 1 + 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 64efe4c83a3e..19a5e7f3a075 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5739,6 +5739,13 @@ ice_set_tx_maxrate(struct net_device *netdev, int queue_index, u32 maxrate) q_handle = vsi->tx_rings[queue_index]->q_handle; tc = ice_dcb_get_tc(vsi, queue_index); + vsi = ice_locate_vsi_using_queue(vsi, queue_index); + if (!vsi) { + netdev_err(netdev, "Invalid VSI for given queue %d\n", + queue_index); + return -EINVAL; + } + /* Set BW back to default, when user set maxrate to 0 */ if (!maxrate) status = ice_cfg_q_bw_dflt_lmt(vsi->port_info, vsi->idx, tc, diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index b54052ef6050..4a34ef5f58d3 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -750,17 +750,16 @@ exit: /** * ice_locate_vsi_using_queue - locate VSI using queue (forward to queue action) * @vsi: Pointer to VSI - * @tc_fltr: Pointer to tc_flower_filter + * @queue: Queue index * - * Locate the VSI using specified queue. When ADQ is not enabled, always - * return input VSI, otherwise locate corresponding VSI based on per channel - * offset and qcount + * Locate the VSI using specified "queue". 
When ADQ is not enabled, + * always return input VSI, otherwise locate corresponding + * VSI based on per channel "offset" and "qcount" */ -static struct ice_vsi * -ice_locate_vsi_using_queue(struct ice_vsi *vsi, - struct ice_tc_flower_fltr *tc_fltr) +struct ice_vsi * +ice_locate_vsi_using_queue(struct ice_vsi *vsi, int queue) { - int num_tc, tc, queue; + int num_tc, tc; /* if ADQ is not active, passed VSI is the candidate VSI */ if (!ice_is_adq_active(vsi->back)) @@ -770,7 +769,6 @@ ice_locate_vsi_using_queue(struct ice_vsi *vsi, * upon queue number) */ num_tc = vsi->mqprio_qopt.qopt.num_tc; - queue = tc_fltr->action.fwd.q.queue; for (tc = 0; tc < num_tc; tc++) { int qcount = vsi->mqprio_qopt.qopt.count[tc]; @@ -812,6 +810,7 @@ ice_tc_forward_action(struct ice_vsi *vsi, struct ice_tc_flower_fltr *tc_fltr) struct ice_pf *pf = vsi->back; struct device *dev; u32 tc_class; + int q; dev = ice_pf_to_dev(pf); @@ -840,7 +839,8 @@ ice_tc_forward_action(struct ice_vsi *vsi, struct ice_tc_flower_fltr *tc_fltr) /* Determine destination VSI even though the action is * FWD_TO_QUEUE, because QUEUE is associated with VSI */ - dest_vsi = tc_fltr->dest_vsi; + q = tc_fltr->action.fwd.q.queue; + dest_vsi = ice_locate_vsi_using_queue(vsi, q); break; default: dev_err(dev, @@ -1716,7 +1716,7 @@ ice_tc_forward_to_queue(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr, /* If ADQ is configured, and the queue belongs to ADQ VSI, then prepare * ADQ switch filter */ - ch_vsi = ice_locate_vsi_using_queue(vsi, fltr); + ch_vsi = ice_locate_vsi_using_queue(vsi, fltr->action.fwd.q.queue); if (!ch_vsi) return -EINVAL; fltr->dest_vsi = ch_vsi; diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.h b/drivers/net/ethernet/intel/ice/ice_tc_lib.h index 8bbc1a62bdb1..65d387163a46 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.h @@ -204,6 +204,7 @@ static inline int ice_chnl_dmac_fltr_cnt(struct ice_pf *pf) return pf->num_dmac_chnl_fltrs; } +struct ice_vsi *ice_locate_vsi_using_queue(struct ice_vsi *vsi, int queue); int ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, struct flow_cls_offload *cls_flower); From ed89b74d2dc920cb61d3094e0e97ec8775b13086 Mon Sep 17 00:00:00 2001 From: Muhammad Husaini Zulkifli Date: Mon, 15 May 2023 14:03:36 +0800 Subject: [PATCH 08/79] igc: Add condition for qbv_config_change_errors counter Add a condition so that the qbv_config_change_errors counter is increased during taprio qbv configuration only. There might be a case where TC has already been set up and the user then configures an ETF/CBS qdisc; without this condition, the counter would increase in that case as well.
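For illustration only, not part of the patch: a minimal sketch of the added guard, with hypothetical types and values; the counter is bumped only when a GCL is already running and the remembered setup type really is taprio.

#include <stdio.h>

enum tc_setup { SETUP_QDISC_TAPRIO, SETUP_QDISC_ETF, SETUP_QDISC_CBS };

struct adapter {
	enum tc_setup tc_setup_type;	/* recorded when TC setup runs */
	int gcl_running;		/* stands in for BASET_H/BASET_L != 0 */
	int qbv_config_change_errors;
};

static void account_qbv_change(struct adapter *a)
{
	/* ETF/CBS setups reaching this path no longer inflate the counter. */
	if (a->gcl_running && a->tc_setup_type == SETUP_QDISC_TAPRIO)
		a->qbv_config_change_errors++;
}

int main(void)
{
	struct adapter a = { SETUP_QDISC_ETF, 1, 0 };

	account_qbv_change(&a);
	printf("errors: %d\n", a.qbv_config_change_errors); /* prints 0 */
	return 0;
}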
Fixes: ae4fe4698300 ("igc: Add qbv_config_change_errors counter") Signed-off-by: Muhammad Husaini Zulkifli Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 1 + drivers/net/ethernet/intel/igc/igc_main.c | 2 ++ drivers/net/ethernet/intel/igc/igc_tsn.c | 1 + 3 files changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 00a5ee487812..aa5ceab0d371 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -184,6 +184,7 @@ struct igc_adapter { u32 max_frame_size; u32 min_frame_size; + int tc_setup_type; ktime_t base_time; ktime_t cycle_time; bool qbv_enable; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 019ce91c45aa..b90f94511005 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6327,6 +6327,8 @@ static int igc_setup_tc(struct net_device *dev, enum tc_setup_type type, { struct igc_adapter *adapter = netdev_priv(dev); + adapter->tc_setup_type = type; + switch (type) { case TC_QUERY_CAPS: return igc_tc_query_caps(adapter, type_data); diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 94a2b0dfb54d..6b299b83e7ef 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -249,6 +249,7 @@ skip_cbs: * Gate Control List (GCL) is running. */ if ((rd32(IGC_BASET_H) || rd32(IGC_BASET_L)) && + (adapter->tc_setup_type == TC_SETUP_QDISC_TAPRIO) && tsn_mode_reconfig) adapter->qbv_config_change_errors++; } else { From cca28ceac7c7857bc2d313777017585aef00bcc4 Mon Sep 17 00:00:00 2001 From: Muhammad Husaini Zulkifli Date: Wed, 17 May 2023 08:18:12 +0800 Subject: [PATCH 09/79] igc: Remove delay during TX ring configuration Remove an unnecessary delay during the TX ring configuration. It adds latency, especially during link down and link up activity. Furthermore, old SKUs such as I225 call reset_adapter to reset the controller during TSN mode Gate Control List (GCL) setup, which adds even more time to the configuration of real-time use cases. The Software User Manual does not mention this delay; it might have been ported from legacy I210 code in the past. Fixes: 13b5b7fd6a4a ("igc: Add support for Tx/Rx rings") Signed-off-by: Muhammad Husaini Zulkifli Acked-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index b90f94511005..7e22204822ea 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -711,7 +711,6 @@ static void igc_configure_tx_ring(struct igc_adapter *adapter, /* disable the queue */ wr32(IGC_TXDCTL(reg_idx), 0); wrfl(); - mdelay(10); wr32(IGC_TDLEN(reg_idx), ring->count * sizeof(union igc_adv_tx_desc)); From 175c241288c09f81eb7b44d65c1ef6045efa4d1a Mon Sep 17 00:00:00 2001 From: Muhammad Husaini Zulkifli Date: Sat, 3 Jun 2023 20:59:34 +0800 Subject: [PATCH 10/79] igc: Fix TX Hang issue when QBV Gate is closed If a user schedules a Gate Control List (GCL) to close one of the QBV gates while also transmitting a packet to that closed gate, a TX hang will happen. HW does not drop packets when the gate is closed; they keep queuing up in the HW TX FIFO until the gate is re-opened.
This patch implements the solution of dropping packets destined for a closed gate. It also resets the adapter to perform SW initialization for the first Gate Control List (GCL) configuration to avoid a hang. This is due to the HW design, where changing to TSN transmit mode requires SW initialization. Intel Discrete I225/6 transmit mode cannot be changed when in dynamic mode, according to Software User Manual Section 7.5.2.1. Subsequent Gate Control List (GCL) operations will proceed without a reset, as they already are in TSN mode. Steps to reproduce: DUT: 1) Configure the GCL with a certain gate closed. BASE=$(date +%s%N) tc qdisc replace dev $IFACE parent root handle 100 taprio \ num_tc 4 \ map 0 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 \ queues 1@0 1@1 1@2 1@3 \ base-time $BASE \ sched-entry S 0x8 500000 \ sched-entry S 0x4 500000 \ flags 0x2 2) Transmit a packet to the closed gate. You may use the udp_tai application to transmit a UDP packet to any of the closed gates. ./udp_tai -i -P 100000 -p 90 -c 1 -t <0/1> -u 30004 Fixes: ec50a9d437f0 ("igc: Add support for taprio offloading") Co-developed-by: Tan Tee Min Signed-off-by: Tan Tee Min Tested-by: Chwee Lin Choong Signed-off-by: Muhammad Husaini Zulkifli Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 6 +++ drivers/net/ethernet/intel/igc/igc_main.c | 58 +++++++++++++++++++++-- drivers/net/ethernet/intel/igc/igc_tsn.c | 41 ++++++++++------ 3 files changed, 87 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index aa5ceab0d371..639a50c02537 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -14,6 +14,7 @@ #include #include #include +#include #include "igc_hw.h" @@ -101,6 +102,8 @@ struct igc_ring { u32 start_time; u32 end_time; u32 max_sdu; + bool oper_gate_closed; /* Operating gate. True if the TX Queue is closed */ + bool admin_gate_closed; /* Future gate.
True if the TX Queue will be closed */ /* CBS parameters */ bool cbs_enable; /* indicates if CBS is enabled */ @@ -160,6 +163,7 @@ struct igc_adapter { struct timer_list watchdog_timer; struct timer_list dma_err_timer; struct timer_list phy_info_timer; + struct hrtimer hrtimer; u32 wol; u32 en_mng_pt; @@ -189,6 +193,8 @@ struct igc_adapter { ktime_t cycle_time; bool qbv_enable; u32 qbv_config_change_errors; + bool qbv_transition; + unsigned int qbv_count; /* OS defined structs */ struct pci_dev *pdev; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 7e22204822ea..e5bfc4000658 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -1572,6 +1572,9 @@ done: first->bytecount = skb->len; first->gso_segs = 1; + if (adapter->qbv_transition || tx_ring->oper_gate_closed) + goto out_drop; + if (tx_ring->max_sdu > 0) { u32 max_sdu = 0; @@ -3011,8 +3014,8 @@ static bool igc_clean_tx_irq(struct igc_q_vector *q_vector, int napi_budget) time_after(jiffies, tx_buffer->time_stamp + (adapter->tx_timeout_factor * HZ)) && !(rd32(IGC_STATUS) & IGC_STATUS_TXOFF) && - (rd32(IGC_TDH(tx_ring->reg_idx)) != - readl(tx_ring->tail))) { + (rd32(IGC_TDH(tx_ring->reg_idx)) != readl(tx_ring->tail)) && + !tx_ring->oper_gate_closed) { /* detected Tx unit hang */ netdev_err(tx_ring->netdev, "Detected Tx Unit Hang\n" @@ -6102,6 +6105,8 @@ static int igc_tsn_clear_schedule(struct igc_adapter *adapter) adapter->base_time = 0; adapter->cycle_time = NSEC_PER_SEC; adapter->qbv_config_change_errors = 0; + adapter->qbv_transition = false; + adapter->qbv_count = 0; for (i = 0; i < adapter->num_tx_queues; i++) { struct igc_ring *ring = adapter->tx_ring[i]; @@ -6109,6 +6114,8 @@ static int igc_tsn_clear_schedule(struct igc_adapter *adapter) ring->start_time = 0; ring->end_time = NSEC_PER_SEC; ring->max_sdu = 0; + ring->oper_gate_closed = false; + ring->admin_gate_closed = false; } return 0; @@ -6120,6 +6127,7 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, bool queue_configured[IGC_MAX_TX_QUEUES] = { }; struct igc_hw *hw = &adapter->hw; u32 start_time = 0, end_time = 0; + struct timespec64 now; size_t n; int i; @@ -6149,6 +6157,8 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, adapter->cycle_time = qopt->cycle_time; adapter->base_time = qopt->base_time; + igc_ptp_read(adapter, &now); + for (n = 0; n < qopt->num_entries; n++) { struct tc_taprio_sched_entry *e = &qopt->entries[n]; @@ -6183,7 +6193,10 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, ring->start_time = start_time; ring->end_time = end_time; - queue_configured[i] = true; + if (ring->start_time >= adapter->cycle_time) + queue_configured[i] = false; + else + queue_configured[i] = true; } start_time += e->interval; @@ -6193,8 +6206,20 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, * If not, set the start and end time to be end time. 
*/ for (i = 0; i < adapter->num_tx_queues; i++) { + struct igc_ring *ring = adapter->tx_ring[i]; + + if (!is_base_time_past(qopt->base_time, &now)) { + ring->admin_gate_closed = false; + } else { + ring->oper_gate_closed = false; + ring->admin_gate_closed = false; + } + if (!queue_configured[i]) { - struct igc_ring *ring = adapter->tx_ring[i]; + if (!is_base_time_past(qopt->base_time, &now)) + ring->admin_gate_closed = true; + else + ring->oper_gate_closed = true; ring->start_time = end_time; ring->end_time = end_time; @@ -6575,6 +6600,27 @@ static const struct xdp_metadata_ops igc_xdp_metadata_ops = { .xmo_rx_timestamp = igc_xdp_rx_timestamp, }; +static enum hrtimer_restart igc_qbv_scheduling_timer(struct hrtimer *timer) +{ + struct igc_adapter *adapter = container_of(timer, struct igc_adapter, + hrtimer); + unsigned int i; + + adapter->qbv_transition = true; + for (i = 0; i < adapter->num_tx_queues; i++) { + struct igc_ring *tx_ring = adapter->tx_ring[i]; + + if (tx_ring->admin_gate_closed) { + tx_ring->admin_gate_closed = false; + tx_ring->oper_gate_closed = true; + } else { + tx_ring->oper_gate_closed = false; + } + } + adapter->qbv_transition = false; + return HRTIMER_NORESTART; +} + /** * igc_probe - Device Initialization Routine * @pdev: PCI device information struct @@ -6753,6 +6799,9 @@ static int igc_probe(struct pci_dev *pdev, INIT_WORK(&adapter->reset_task, igc_reset_task); INIT_WORK(&adapter->watchdog_task, igc_watchdog_task); + hrtimer_init(&adapter->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + adapter->hrtimer.function = &igc_qbv_scheduling_timer; + /* Initialize link properties that are user-changeable */ adapter->fc_autoneg = true; hw->mac.autoneg = true; @@ -6856,6 +6905,7 @@ static void igc_remove(struct pci_dev *pdev) cancel_work_sync(&adapter->reset_task); cancel_work_sync(&adapter->watchdog_task); + hrtimer_cancel(&adapter->hrtimer); /* Release control of h/w to f/w. If f/w is AMT enabled, this * would have already happened in close and is redundant. diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 6b299b83e7ef..3cdb0c988728 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -114,7 +114,6 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) static int igc_tsn_enable_offload(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; - bool tsn_mode_reconfig = false; u32 tqavctrl, baset_l, baset_h; u32 sec, nsec, cycle; ktime_t base_time, systim; @@ -228,11 +227,10 @@ skip_cbs: tqavctrl = rd32(IGC_TQAVCTRL) & ~IGC_TQAVCTRL_FUTSCDDIS; - if (tqavctrl & IGC_TQAVCTRL_TRANSMIT_MODE_TSN) - tsn_mode_reconfig = true; - tqavctrl |= IGC_TQAVCTRL_TRANSMIT_MODE_TSN | IGC_TQAVCTRL_ENHANCED_QAV; + adapter->qbv_count++; + cycle = adapter->cycle_time; base_time = adapter->base_time; @@ -250,17 +248,28 @@ skip_cbs: */ if ((rd32(IGC_BASET_H) || rd32(IGC_BASET_L)) && (adapter->tc_setup_type == TC_SETUP_QDISC_TAPRIO) && - tsn_mode_reconfig) + (adapter->qbv_count > 1)) adapter->qbv_config_change_errors++; } else { - /* According to datasheet section 7.5.2.9.3.3, FutScdDis bit - * has to be configured before the cycle time and base time. - * Tx won't hang if there is a GCL is already running, - * so in this case we don't need to set FutScdDis. 
- */ - if (igc_is_device_id_i226(hw) && - !(rd32(IGC_BASET_H) || rd32(IGC_BASET_L))) - tqavctrl |= IGC_TQAVCTRL_FUTSCDDIS; + if (igc_is_device_id_i226(hw)) { + ktime_t adjust_time, expires_time; + + /* According to datasheet section 7.5.2.9.3.3, FutScdDis bit + * has to be configured before the cycle time and base time. + * Tx won't hang if a GCL is already running, + * so in this case we don't need to set FutScdDis. + */ + if (!(rd32(IGC_BASET_H) || rd32(IGC_BASET_L))) + tqavctrl |= IGC_TQAVCTRL_FUTSCDDIS; + + nsec = rd32(IGC_SYSTIML); + sec = rd32(IGC_SYSTIMH); + systim = ktime_set(sec, nsec); + + adjust_time = adapter->base_time; + expires_time = ktime_sub_ns(adjust_time, systim); + hrtimer_start(&adapter->hrtimer, expires_time, HRTIMER_MODE_REL); + } } wr32(IGC_TQAVCTRL, tqavctrl); @@ -306,7 +315,11 @@ int igc_tsn_offload_apply(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; + /* Per I225/6 HW Design Section 7.5.2.1, transmit mode + * cannot be changed dynamically. Require reset the adapter. + */ - if (netif_running(adapter->netdev) && igc_is_device_id_i225(hw)) { + if (netif_running(adapter->netdev) && + (igc_is_device_id_i225(hw) || !adapter->qbv_count)) { schedule_work(&adapter->reset_task); return 0; } From 884abe45a9014d0de2e6edb0630dfd64f23f1d1b Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Wed, 28 Jun 2023 08:59:34 +0800 Subject: [PATCH 11/79] net/mlx5e: fix double free in mlx5e_destroy_flow_table In accel_fs_tcp_create_groups(), when the ft->g memory is successfully allocated but the 'in' memory fails to be allocated, the memory pointed to by ft->g is released once. Then, in accel_fs_tcp_create_table(), mlx5e_destroy_flow_table() is called and releases the memory pointed to by ft->g again. This causes a double free problem. Fixes: c062d52ac24c ("net/mlx5e: Receive flow steering framework for accelerated TCP flows") Signed-off-by: Zhengchao Shao Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c index 88a5aed9d678..c7d191f66ad1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c @@ -190,6 +190,7 @@ static int accel_fs_tcp_create_groups(struct mlx5e_flow_table *ft, in = kvzalloc(inlen, GFP_KERNEL); if (!in || !ft->g) { kfree(ft->g); + ft->g = NULL; kvfree(in); return -ENOMEM; } From 3250affdc658557a41df9c5fb567723e421f8bf2 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Fri, 30 Jun 2023 09:49:02 +0800 Subject: [PATCH 12/79] net/mlx5e: fix memory leak in mlx5e_fs_tt_redirect_any_create The memory pointed to by the fs->any pointer is not freed in the error path of mlx5e_fs_tt_redirect_any_create, which can lead to a memory leak. Fix by freeing the memory in the error path, thereby making the error path identical to mlx5e_fs_tt_redirect_any_destroy().
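For illustration only, not part of either patch: a minimal user-space sketch of the goto-unwind shape both mlx5e fixes converge on, with hypothetical helpers; each failure point releases exactly what was set up so far and clears the published pointer so nothing is leaked or freed twice.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct fs { void *any; };

static int create_table(void) { return -ENOMEM; }	/* pretend failure */
static void destroy_table(void) { }
static int enable(void) { return 0; }

static int fs_any_create(struct fs *fs)
{
	int err;

	fs->any = malloc(128);
	if (!fs->any)
		return -ENOMEM;

	err = create_table();
	if (err)
		goto err_free_any;

	err = enable();
	if (err)
		goto err_destroy_table;

	return 0;

err_destroy_table:
	destroy_table();
err_free_any:
	free(fs->any);
	fs->any = NULL;		/* a later destroy path won't free it again */
	return err;
}

int main(void)
{
	struct fs fs = { 0 };

	printf("create: %d, any: %p\n", fs_any_create(&fs), fs.any);
	return 0;
}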
Fixes: 0f575c20bf06 ("net/mlx5e: Introduce Flow Steering ANY API") Signed-off-by: Zhengchao Shao Reviewed-by: Simon Horman Reviewed-by: Rahul Rameshbabu Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c index 03cb79adf912..be83ad9db82a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c @@ -594,7 +594,7 @@ int mlx5e_fs_tt_redirect_any_create(struct mlx5e_flow_steering *fs) err = fs_any_create_table(fs); if (err) - return err; + goto err_free_any; err = fs_any_enable(fs); if (err) @@ -606,8 +606,8 @@ int mlx5e_fs_tt_redirect_any_create(struct mlx5e_flow_steering *fs) err_destroy_table: fs_any_destroy_table(fs_any); - - kfree(fs_any); +err_free_any: mlx5e_fs_set_any(fs, NULL); + kfree(fs_any); return err; } From d543b649ffe58a0cb4b6948b3305069c5980a1fa Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Fri, 30 Jun 2023 09:49:03 +0800 Subject: [PATCH 13/79] net/mlx5e: fix memory leak in mlx5e_ptp_open When kvzalloc_node() or kvzalloc() fails in mlx5e_ptp_open(), the memory pointed to by "c" or "cparams" is not freed, which can lead to a memory leak. Fix by freeing the memory in the error path. Fixes: 145e5637d941 ("net/mlx5e: Add TX PTP port object support") Signed-off-by: Zhengchao Shao Reviewed-by: Rahul Rameshbabu Reviewed-by: Gal Pressman Reviewed-by: Simon Horman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index 3cbebfba582b..b0b429a0321e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -729,8 +729,10 @@ int mlx5e_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params, c = kvzalloc_node(sizeof(*c), GFP_KERNEL, dev_to_node(mlx5_core_dma_dev(mdev))); cparams = kvzalloc(sizeof(*cparams), GFP_KERNEL); - if (!c || !cparams) - return -ENOMEM; + if (!c || !cparams) { + err = -ENOMEM; + goto err_free; + } c->priv = priv; c->mdev = priv->mdev; From 2e2d1965794d22fbe86df45bf4f933216743577d Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 22 May 2023 21:18:53 +0300 Subject: [PATCH 14/79] net/mlx5e: RX, Fix flush and close release flow of regular rq for legacy rq Regular (non-XSK) RQs get flushed on XSK setup and re-activated on XSK close. If the same regular RQ is closed (a config change, for example) soon after the XSK close, a double release occurs because the missing wqes get released a second time.
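For illustration only, not part of the patch: a minimal user-space sketch of the skip-release marking the RX fix uses, with hypothetical names; once a frag has been released, a second dealloc pass (flush followed by close) becomes a no-op.

#include <stdio.h>

#define FRAG_SKIP_RELEASE (1u << 0)

struct frag { unsigned int flags; int released; };

static void release_frag(struct frag *f)
{
	if (f->flags & FRAG_SKIP_RELEASE)
		return;			/* already handed back, nothing to do */
	f->released++;			/* stands in for the page release */
	f->flags |= FRAG_SKIP_RELEASE;
}

int main(void)
{
	struct frag f = { 0, 0 };

	release_frag(&f);		/* RQ flush on XSK setup */
	release_frag(&f);		/* later regular RQ close */
	printf("released %d time(s)\n", f.released);	/* prints 1 */
	return 0;
}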
Fixes: 3f93f82988bc ("net/mlx5e: RX, Defer page release in legacy rq for better recycling") Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 704b022cd1f0..a9575219e455 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -390,10 +390,18 @@ static void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix) { struct mlx5e_wqe_frag_info *wi = get_frag(rq, ix); - if (rq->xsk_pool) + if (rq->xsk_pool) { mlx5e_xsk_free_rx_wqe(wi); - else + } else { mlx5e_free_rx_wqe(rq, wi); + + /* Avoid a second release of the wqe pages: dealloc is called + * for the same missing wqes on regular RQ flush and on regular + * RQ close. This happens when XSK RQs come into play. + */ + for (int i = 0; i < rq->wqe.info.num_frags; i++, wi++) + wi->flags |= BIT(MLX5E_WQE_FRAG_SKIP_RELEASE); + } } static void mlx5e_xsk_free_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk) From 631079e08aa4a20b73e70de4cf457886194f029f Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 26 Jun 2023 20:36:41 -0700 Subject: [PATCH 15/79] net/mlx5: Register a unique thermal zone per device Prior to this patch, only one "mlx5" thermal zone could be registered regardless of the number of individual mlx5 devices in the system. To fix this, set up a unique name per device and register its own thermal zone. In order not to register a thermal zone for a virtual device (VF/SF), add a check for the PF device type. The new name is a concatenation of "mlx5_" and the device's PCI name, which will also help associate a thermal zone with its PCI device.
$ lspci | grep ConnectX 00:04.0 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx] 00:05.0 Ethernet controller: Mellanox Technologies MT2892 Family [ConnectX-6 Dx] $ cat /sys/devices/virtual/thermal/thermal_zone0/type mlx5_0000:00:04.0 $ cat /sys/devices/virtual/thermal/thermal_zone1/type mlx5_0000:00:05.0 Fixes: c1fef618d611 ("net/mlx5: Implement thermal zone") CC: Sandipan Patra Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/thermal.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/thermal.c b/drivers/net/ethernet/mellanox/mlx5/core/thermal.c index 20bb5eb266c1..52199d39657e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/thermal.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/thermal.c @@ -68,14 +68,19 @@ static struct thermal_zone_device_ops mlx5_thermal_ops = { int mlx5_thermal_init(struct mlx5_core_dev *mdev) { + char data[THERMAL_NAME_LENGTH]; struct mlx5_thermal *thermal; - struct thermal_zone_device *tzd; - const char *data = "mlx5"; + int err; - tzd = thermal_zone_get_zone_by_name(data); - if (!IS_ERR(tzd)) + if (!mlx5_core_is_pf(mdev) && !mlx5_core_is_ecpf(mdev)) return 0; + err = snprintf(data, sizeof(data), "mlx5_%s", dev_name(mdev->device)); + if (err < 0 || err >= sizeof(data)) { + mlx5_core_err(mdev, "Failed to setup thermal zone name, %d\n", err); + return -EINVAL; + } + thermal = kzalloc(sizeof(*thermal), GFP_KERNEL); if (!thermal) return -ENOMEM; @@ -89,10 +94,10 @@ int mlx5_thermal_init(struct mlx5_core_dev *mdev) &mlx5_thermal_ops, NULL, 0, MLX5_THERMAL_POLL_INT_MSEC); if (IS_ERR(thermal->tzdev)) { - dev_err(mdev->device, "Failed to register thermal zone device (%s) %ld\n", - data, PTR_ERR(thermal->tzdev)); + err = PTR_ERR(thermal->tzdev); + mlx5_core_err(mdev, "Failed to register thermal zone device (%s) %d\n", data, err); kfree(thermal); - return -EINVAL; + return err; } mdev->thermal = thermal; From 65e64640e97c0f223e77f9ea69b5a46186b93470 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 8 Jun 2023 09:32:10 +0200 Subject: [PATCH 16/79] net/mlx5e: Check for NOT_READY flag state after locking Currently the check for the NOT_READY flag is performed before obtaining the necessary lock. This opens the possibility of a race condition in which the flow is concurrently removed from the unready_flows list by the workqueue task, causing a double removal from the list and a crash [0]. Fix the issue by moving the flag check inside the section protected by the uplink_priv->unready_flows_lock mutex.
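In pattern form, the race and the fix look like this (names from the diff below; surrounding code elided):

	/* Before: the flag is tested outside the lock, so the workqueue
	 * task can delete the flow from the list in between, and the
	 * second list deletion corrupts the list.
	 */
	if (flow_flag_test(flow, NOT_READY))
		remove_unready_flow(flow);

	/* After: test and delete form one critical section. */
	mutex_lock(&uplink_priv->unready_flows_lock);
	if (flow_flag_test(flow, NOT_READY))
		unready_flow_del(flow);
	mutex_unlock(&uplink_priv->unready_flows_lock);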
[0]: [44376.389654] general protection fault, probably for non-canonical address 0xdead000000000108: 0000 [#1] SMP [44376.391665] CPU: 7 PID: 59123 Comm: tc Not tainted 6.4.0-rc4+ #1 [44376.392984] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [44376.395342] RIP: 0010:mlx5e_tc_del_fdb_flow+0xb3/0x340 [mlx5_core] [44376.396857] Code: 00 48 8b b8 68 ce 02 00 e8 8a 4d 02 00 4c 8d a8 a8 01 00 00 4c 89 ef e8 8b 79 88 e1 48 8b 83 98 06 00 00 48 8b 93 90 06 00 00 <48> 89 42 08 48 89 10 48 b8 00 01 00 00 00 00 ad de 48 89 83 90 06 [44376.399167] RSP: 0018:ffff88812cc97570 EFLAGS: 00010246 [44376.399680] RAX: dead000000000122 RBX: ffff8881088e3800 RCX: ffff8881881bac00 [44376.400337] RDX: dead000000000100 RSI: ffff88812cc97500 RDI: ffff8881242f71b0 [44376.401001] RBP: ffff88811cbb0940 R08: 0000000000000400 R09: 0000000000000001 [44376.401663] R10: 0000000000000001 R11: 0000000000000000 R12: ffff88812c944000 [44376.402342] R13: ffff8881242f71a8 R14: ffff8881222b4000 R15: 0000000000000000 [44376.402999] FS: 00007f0451104800(0000) GS:ffff88852cb80000(0000) knlGS:0000000000000000 [44376.403787] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [44376.404343] CR2: 0000000000489108 CR3: 0000000123a79003 CR4: 0000000000370ea0 [44376.405004] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [44376.405665] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [44376.406339] Call Trace: [44376.406651] [44376.406939] ? die_addr+0x33/0x90 [44376.407311] ? exc_general_protection+0x192/0x390 [44376.407795] ? asm_exc_general_protection+0x22/0x30 [44376.408292] ? mlx5e_tc_del_fdb_flow+0xb3/0x340 [mlx5_core] [44376.408876] __mlx5e_tc_del_fdb_peer_flow+0xbc/0xe0 [mlx5_core] [44376.409482] mlx5e_tc_del_flow+0x42/0x210 [mlx5_core] [44376.410055] mlx5e_flow_put+0x25/0x50 [mlx5_core] [44376.410529] mlx5e_delete_flower+0x24b/0x350 [mlx5_core] [44376.411043] tc_setup_cb_reoffload+0x22/0x80 [44376.411462] fl_reoffload+0x261/0x2f0 [cls_flower] [44376.411907] ? mlx5e_rep_indr_setup_ft_cb+0x160/0x160 [mlx5_core] [44376.412481] ? mlx5e_rep_indr_setup_ft_cb+0x160/0x160 [mlx5_core] [44376.413044] tcf_block_playback_offloads+0x76/0x170 [44376.413497] tcf_block_unbind+0x7b/0xd0 [44376.413881] tcf_block_setup+0x17d/0x1c0 [44376.414269] tcf_block_offload_cmd.isra.0+0xf1/0x130 [44376.414725] tcf_block_offload_unbind+0x43/0x70 [44376.415153] __tcf_block_put+0x82/0x150 [44376.415532] ingress_destroy+0x22/0x30 [sch_ingress] [44376.415986] qdisc_destroy+0x3b/0xd0 [44376.416343] qdisc_graft+0x4d0/0x620 [44376.416706] tc_get_qdisc+0x1c9/0x3b0 [44376.417074] rtnetlink_rcv_msg+0x29c/0x390 [44376.419978] ? rep_movs_alternative+0x3a/0xa0 [44376.420399] ? rtnl_calcit.isra.0+0x120/0x120 [44376.420813] netlink_rcv_skb+0x54/0x100 [44376.421192] netlink_unicast+0x1f6/0x2c0 [44376.421573] netlink_sendmsg+0x232/0x4a0 [44376.421980] sock_sendmsg+0x38/0x60 [44376.422328] ____sys_sendmsg+0x1d0/0x1e0 [44376.422709] ? copy_msghdr_from_user+0x6d/0xa0 [44376.423127] ___sys_sendmsg+0x80/0xc0 [44376.423495] ? 
___sys_recvmsg+0x8b/0xc0 [44376.423869] __sys_sendmsg+0x51/0x90 [44376.424226] do_syscall_64+0x3d/0x90 [44376.424587] entry_SYSCALL_64_after_hwframe+0x46/0xb0 [44376.425046] RIP: 0033:0x7f045134f887 [44376.425403] Code: 0a 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 [44376.426914] RSP: 002b:00007ffd63a82b98 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [44376.427592] RAX: ffffffffffffffda RBX: 000000006481955f RCX: 00007f045134f887 [44376.428195] RDX: 0000000000000000 RSI: 00007ffd63a82c00 RDI: 0000000000000003 [44376.428796] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 [44376.429404] R10: 00007f0451208708 R11: 0000000000000246 R12: 0000000000000001 [44376.430039] R13: 0000000000409980 R14: 000000000047e538 R15: 0000000000485400 [44376.430644] [44376.430907] Modules linked in: mlx5_ib mlx5_core act_mirred act_tunnel_key cls_flower vxlan dummy sch_ingress openvswitch nsh rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_uverbs ib_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_g ss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc fuse [last unloaded: mlx5_core] [44376.433936] ---[ end trace 0000000000000000 ]--- [44376.434373] RIP: 0010:mlx5e_tc_del_fdb_flow+0xb3/0x340 [mlx5_core] [44376.434951] Code: 00 48 8b b8 68 ce 02 00 e8 8a 4d 02 00 4c 8d a8 a8 01 00 00 4c 89 ef e8 8b 79 88 e1 48 8b 83 98 06 00 00 48 8b 93 90 06 00 00 <48> 89 42 08 48 89 10 48 b8 00 01 00 00 00 00 ad de 48 89 83 90 06 [44376.436452] RSP: 0018:ffff88812cc97570 EFLAGS: 00010246 [44376.436924] RAX: dead000000000122 RBX: ffff8881088e3800 RCX: ffff8881881bac00 [44376.437530] RDX: dead000000000100 RSI: ffff88812cc97500 RDI: ffff8881242f71b0 [44376.438179] RBP: ffff88811cbb0940 R08: 0000000000000400 R09: 0000000000000001 [44376.438786] R10: 0000000000000001 R11: 0000000000000000 R12: ffff88812c944000 [44376.439393] R13: ffff8881242f71a8 R14: ffff8881222b4000 R15: 0000000000000000 [44376.439998] FS: 00007f0451104800(0000) GS:ffff88852cb80000(0000) knlGS:0000000000000000 [44376.440714] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [44376.441225] CR2: 0000000000489108 CR3: 0000000123a79003 CR4: 0000000000370ea0 [44376.441843] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [44376.442471] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: ad86755b18d5 ("net/mlx5e: Protect unready flows with dedicated lock") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 41dc26800f48..8d0a3f69693e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1639,7 +1639,8 @@ static void remove_unready_flow(struct mlx5e_tc_flow *flow) uplink_priv = &rpriv->uplink_priv; mutex_lock(&uplink_priv->unready_flows_lock); - unready_flow_del(flow); + if (flow_flag_test(flow, NOT_READY)) + unready_flow_del(flow); mutex_unlock(&uplink_priv->unready_flows_lock); } @@ -1932,8 +1933,7 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, esw_attr = attr->esw_attr; mlx5e_put_flow_tunnel_id(flow); - if (flow_flag_test(flow, 
NOT_READY)) - remove_unready_flow(flow); + remove_unready_flow(flow); if (mlx5e_is_offloaded_flow(flow)) { if (flow_flag_test(flow, SLOW)) From f7a485115ad4cfc560833942014bf791abf1f827 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 4 Jun 2023 12:45:38 +0300 Subject: [PATCH 17/79] net/mlx5e: TC, CT: Offload ct clear only once A non-clear CT action causes a flow rule split, while a CT clear action doesn't and is just a header-rewrite to the current flow rule. But ct offload is done in post_parse and is per ct action instance, so ct clear offload is parsed multiple times, while it is deleted only once. Fix this by post-parsing the ct action only once per flow attribute (which is per flow rule) by using an 'offloaded' ct_attr flag. Fixes: 08fe94ec5f77 ("net/mlx5e: TC, Remove special handling of CT action") Signed-off-by: Paul Blakey Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 14 +++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index a254e728ac95..fadfa8b50beb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -1545,7 +1545,8 @@ mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv, attr->ct_attr.ct_action |= act->ct.action; /* So we can have clear + ct */ attr->ct_attr.zone = act->ct.zone; - attr->ct_attr.nf_ft = act->ct.flow_table; + if (!(act->ct.action & TCA_CT_ACT_CLEAR)) + attr->ct_attr.nf_ft = act->ct.flow_table; attr->ct_attr.act_miss_cookie = act->miss_cookie; return 0; @@ -1990,6 +1991,9 @@ mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *priv, struct mlx5_flow_attr *att if (!priv) return -EOPNOTSUPP; + if (attr->ct_attr.offloaded) + return 0; + if (attr->ct_attr.ct_action & TCA_CT_ACT_CLEAR) { err = mlx5_tc_ct_entry_set_registers(priv, &attr->parse_attr->mod_hdr_acts, 0, 0, 0, 0); @@ -1999,11 +2003,15 @@ mlx5_tc_ct_flow_offload(struct mlx5_tc_ct_priv *priv, struct mlx5_flow_attr *att attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; } - if (!attr->ct_attr.nf_ft) /* means only ct clear action, and not ct_clear,ct() */ + if (!attr->ct_attr.nf_ft) { /* means only ct clear action, and not ct_clear,ct() */ + attr->ct_attr.offloaded = true; return 0; + } mutex_lock(&priv->control_lock); err = __mlx5_tc_ct_flow_offload(priv, attr); + if (!err) + attr->ct_attr.offloaded = true; mutex_unlock(&priv->control_lock); return err; @@ -2021,7 +2029,7 @@ void mlx5_tc_ct_delete_flow(struct mlx5_tc_ct_priv *priv, struct mlx5_flow_attr *attr) { - if (!attr->ct_attr.ft) /* no ct action, return */ + if (!attr->ct_attr.offloaded) /* no ct action, return */ return; if (!attr->ct_attr.nf_ft) /* means only ct clear action, and not ct_clear,ct() */ return; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h index 8e9316fa46d4..b66c5f98067f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h @@ -29,6 +29,7 @@ struct mlx5_ct_attr { u32 ct_labels_id; u32 act_miss_mapping; u64 act_miss_cookie; + bool offloaded; struct mlx5_ct_ft *ft; }; From 6496357aa5f710eec96f91345b9da1b37c3231f6 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Tue, 20 Jun 2023 14:07:03 +0300 Subject: [PATCH 18/79] net/mlx5: Query hca_cap_2 only when supported On vport enable, where
fw's hca caps are queried, the driver queries hca_caps_2 without checking if fw truly supports them, causing a false failure of vfs vport load and blocking SRIOV enablement on old devices such as CX4 where hca_caps_2 support is missing. Thus, add a check for the said caps support before accessing them. Fixes: e5b9642a33be ("net/mlx5: E-Switch, Implement devlink port function cmds to control migratable") Signed-off-by: Maher Sanalla Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index faec7d7a4400..243c455f1029 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -807,6 +807,9 @@ static int mlx5_esw_vport_caps_get(struct mlx5_eswitch *esw, struct mlx5_vport * hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability); vport->info.roce_enabled = MLX5_GET(cmd_hca_cap, hca_caps, roce); + if (!MLX5_CAP_GEN_MAX(esw->dev, hca_cap_2)) + goto out_free; + memset(query_ctx, 0, query_out_sz); err = mlx5_vport_get_other_func_cap(esw->dev, vport->vport, query_ctx, MLX5_CAP_GENERAL_2); From 7abd955a58fb0fcd4e756fa2065c03ae488fcfa7 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Wed, 31 May 2023 21:18:49 +0300 Subject: [PATCH 19/79] net/mlx5e: RX, Fix page_pool page fragment tracking for XDP Currently mlx5e releases pages directly to the page_pool for XDP_TX and does page fragment counting for XDP_REDIRECT. RX pages from the page_pool are leaking on XDP_REDIRECT because the xdp core will release only one fragment out of MLX5E_PAGECNT_BIAS_MAX and subsequently the page is marked as "skip release" which avoids the driver release. A fix would be to take an extra fragment for XDP_REDIRECT and not set the "skip release" bit so that the release on the driver side can handle the remaining bias fragments. But this would be a shortsighted solution. Instead, this patch converges the two XDP paths (XDP_TX and XDP_REDIRECT) to always do fragment tracking. The "skip release" bit is no longer necessary for XDP. Fixes: 6f5742846053 ("net/mlx5e: RX, Enable skb page recycling through the page_pool") Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 3 +- .../net/ethernet/mellanox/mlx5/core/en_rx.c | 32 +++++++------------ 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index f0e6095809fa..40589cebb773 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -662,8 +662,7 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) * as we know this is a page_pool page. 
*/ - page_pool_put_defragged_page(page->pp, - page, -1, true); + page_pool_recycle_direct(page->pp, page); } while (++n < num); break; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index a9575219e455..41d37159e027 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1751,11 +1751,11 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi prog = rcu_dereference(rq->xdp_prog); if (prog && mlx5e_xdp_handle(rq, prog, &mxbuf)) { - if (test_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { + if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { struct mlx5e_wqe_frag_info *pwi; for (pwi = head_wi; pwi < wi; pwi++) - pwi->flags |= BIT(MLX5E_WQE_FRAG_SKIP_RELEASE); + pwi->frag_page->frags++; } return NULL; /* page/packet was consumed by XDP */ } @@ -1825,12 +1825,8 @@ static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) rq, wi, cqe, cqe_bcnt); if (!skb) { /* probably for XDP */ - if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { - /* do not return page to cache, - * it will be returned on XDP_TX completion. - */ - wi->flags |= BIT(MLX5E_WQE_FRAG_SKIP_RELEASE); - } + if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) + wi->frag_page->frags++; goto wq_cyc_pop; } @@ -1876,12 +1872,8 @@ static void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) rq, wi, cqe, cqe_bcnt); if (!skb) { /* probably for XDP */ - if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { - /* do not return page to cache, - * it will be returned on XDP_TX completion. - */ - wi->flags |= BIT(MLX5E_WQE_FRAG_SKIP_RELEASE); - } + if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) + wi->frag_page->frags++; goto wq_cyc_pop; } @@ -2060,12 +2052,12 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w if (prog) { if (mlx5e_xdp_handle(rq, prog, &mxbuf)) { if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) { - int i; + struct mlx5e_frag_page *pfp; - for (i = 0; i < sinfo->nr_frags; i++) - /* non-atomic */ - __set_bit(page_idx + i, wi->skip_release_bitmap); - return NULL; + for (pfp = head_page; pfp < frag_page; pfp++) + pfp->frags++; + + wi->linear_page.frags++; } mlx5e_page_release_fragmented(rq, &wi->linear_page); return NULL; /* page/packet was consumed by XDP */ @@ -2163,7 +2155,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, cqe_bcnt, &mxbuf); if (mlx5e_xdp_handle(rq, prog, &mxbuf)) { if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) - __set_bit(page_idx, wi->skip_release_bitmap); /* non-atomic */ + frag_page->frags++; return NULL; /* page/packet was consumed by XDP */ } From 9ac3fc2f42e5ffa1e927dcbffb71b15fa81459e2 Mon Sep 17 00:00:00 2001 From: Prasad Koya Date: Mon, 5 Jun 2023 11:09:01 -0700 Subject: [PATCH 20/79] igc: set TP bit in 'supported' and 'advertising' fields of ethtool_link_ksettings set TP bit in the 'supported' and 'advertising' fields. i225/226 parts only support twisted pair copper. 
Fixes: 8c5ad0dae93c ("igc: Add ethtool support") Signed-off-by: Prasad Koya Acked-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_ethtool.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index 0e2cb00622d1..93bce729be76 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1708,6 +1708,8 @@ static int igc_ethtool_get_link_ksettings(struct net_device *netdev, /* twisted pair */ cmd->base.port = PORT_TP; cmd->base.phy_address = hw->phy.addr; + ethtool_link_ksettings_add_link_mode(cmd, supported, TP); + ethtool_link_ksettings_add_link_mode(cmd, advertising, TP); /* advertising link modes */ if (hw->phy.autoneg_advertised & ADVERTISE_10_HALF) From 25102893e409bc02761ab82dbcfa092006404790 Mon Sep 17 00:00:00 2001 From: Tan Tee Min Date: Fri, 9 Jun 2023 11:28:42 +0800 Subject: [PATCH 21/79] igc: Include the length/type field and VLAN tag in queueMaxSDU IEEE 802.1Q does not have clear definitions of what constitutes an SDU (Service Data Unit), but IEEE Std 802.3 clause 3.1.2 does define the MAC service primitives and clause 3.2.7 does define the MAC Client Data for Q-tagged frames. It shows that the mac_service_data_unit (MSDU) does NOT contain the preamble, destination and source address, or FCS. The MSDU does contain the length/type field, MAC client data, VLAN tag and any padding data (prior to the FCS). Thus, the maximum 802.3 frame size that is allowed to be transmitted should be QueueMaxSDU (MSDU) + 16 (6 byte SA + 6 byte DA + 4 byte FCS). Fixes: 92a0dcb8427d ("igc: offload queue max SDU from tc-taprio") Signed-off-by: Tan Tee Min Reviewed-by: Muhammad Husaini Zulkifli Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index e5bfc4000658..281a0e35b9d1 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -1575,16 +1575,9 @@ done: if (adapter->qbv_transition || tx_ring->oper_gate_closed) goto out_drop; - if (tx_ring->max_sdu > 0) { - u32 max_sdu = 0; - - max_sdu = tx_ring->max_sdu + - (skb_vlan_tagged(first->skb) ? VLAN_HLEN : 0); - - if (first->bytecount > max_sdu) { - adapter->stats.txdrop++; - goto out_drop; - } + if (tx_ring->max_sdu > 0 && first->bytecount > tx_ring->max_sdu) { + adapter->stats.txdrop++; + goto out_drop; } if (unlikely(test_bit(IGC_RING_FLAG_TX_HWTSTAMP, &tx_ring->flags) && @@ -6231,7 +6224,7 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, struct net_device *dev = adapter->netdev; if (qopt->max_sdu[i]) - ring->max_sdu = qopt->max_sdu[i] + dev->hard_header_len; + ring->max_sdu = qopt->max_sdu[i] + dev->hard_header_len - ETH_TLEN; else ring->max_sdu = 0; } From 84a192e46106355de1a314d709e657231d4b1026 Mon Sep 17 00:00:00 2001 From: Aravindhan Gunasekaran Date: Thu, 15 Jun 2023 12:00:43 +0530 Subject: [PATCH 22/79] igc: Handle PPS start time programming for past time values I225/6 hardware can be programmed to start PPS output once the time in the Target Time registers is reached. The time programmed in these registers should always be in the future; only then is PPS output triggered, when the SYSTIM register reaches the programmed value.
There are two modes in i225/6 hardware to program PPS: pulse and clock mode. Issues were reported where PPS is not generated when the start time is in the past. Example 1, "echo 0 0 0 2 0 > /sys/class/ptp/ptp0/period" In the current implementation, a value of '0' is programmed into the Target time registers and the PPS output is in pulse mode. The interrupt that should be triggered once the SYSTIM register reaches the Target time is never fired, so no PPS output is generated. Example 2, "echo 0 0 0 1 0 > /sys/class/ptp/ptp0/period" In this case, a value of '0' is programmed into the Target time registers and the PPS output is in clock mode. Here, the HW tries to catch up with the current time by incrementing the Target Time register. This catch-up time seems to vary with the programmed PPS period, as per the HW design. In my experiments, the delay ranged from a few tens of seconds to a few minutes. The PPS output is only generated after the Target time register reaches the current time. In my experiments, I also observed that PPS stopped working with the test below and could not recover until the module was removed and loaded again. 1) echo 0 0 1 0 > /sys/class/ptp/ptp1/period 2) echo 0 0 0 1 0 > /sys/class/ptp/ptp1/period 3) echo 0 0 0 1 0 > /sys/class/ptp/ptp1/period After this, PPS did not work even if I re-programmed proper values; I could only get it working again by reloading the driver. This patch calculates and programs an appropriate future time value into the Target Time registers. Fixes: 5e91c72e560c ("igc: Fix PPS delta between two synchronized end-points") Signed-off-by: Aravindhan Gunasekaran Reviewed-by: Muhammad Husaini Zulkifli Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_ptp.c | 25 +++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 32ef112f8291..f0b979a70655 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -356,16 +356,35 @@ static int igc_ptp_feature_enable_i225(struct ptp_clock_info *ptp, tsim &= ~IGC_TSICR_TT0; } if (on) { + struct timespec64 safe_start; int i = rq->perout.index; igc_pin_perout(igc, i, pin, use_freq); - igc->perout[i].start.tv_sec = rq->perout.start.sec; + igc_ptp_read(igc, &safe_start); + + /* PPS output start time is triggered by Target time(TT) + * register. Programming any past time value into TT + * register will cause PPS to never start. Need to make + * sure we program the TT register a time ahead in + * future. There isn't a stringent need to fire PPS out + * right away. Adding +2 seconds should take care of + * corner cases. Let's say if the SYSTIML is close to + * wrap up and the timer keeps ticking as we program the + * register, adding +2seconds is safe bet. + */ + safe_start.tv_sec += 2; + + if (rq->perout.start.sec < safe_start.tv_sec) + igc->perout[i].start.tv_sec = safe_start.tv_sec; + else + igc->perout[i].start.tv_sec = rq->perout.start.sec; igc->perout[i].start.tv_nsec = rq->perout.start.nsec; igc->perout[i].period.tv_sec = ts.tv_sec; igc->perout[i].period.tv_nsec = ts.tv_nsec; - wr32(trgttimh, rq->perout.start.sec); + wr32(trgttimh, (u32)igc->perout[i].start.tv_sec); /* For now, always select timer 0 as source.
*/ - wr32(trgttiml, rq->perout.start.nsec | IGC_TT_IO_TIMER_SEL_SYSTIM0); + wr32(trgttiml, (u32)(igc->perout[i].start.tv_nsec | + IGC_TT_IO_TIMER_SEL_SYSTIM0)); if (use_freq) wr32(freqout, ns); tsauxc |= tsauxc_mask; From caf3ef7468f7534771b5c44cd8dbd6f7f87c2cbd Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 5 Jul 2023 18:05:35 -0300 Subject: [PATCH 23/79] netfilter: nf_tables: prevent OOB access in nft_byteorder_eval When evaluating byteorder expressions with size 2, a union with 32-bit and 16-bit members is used. Since the 16-bit members are aligned to 32-bit, the array accesses will be out-of-bounds. It may lead to a stack-out-of-bounds access like the one below: [ 23.095215] ================================================================== [ 23.095625] BUG: KASAN: stack-out-of-bounds in nft_byteorder_eval+0x13c/0x320 [ 23.096020] Read of size 2 at addr ffffc90000007948 by task ping/115 [ 23.096358] [ 23.096456] CPU: 0 PID: 115 Comm: ping Not tainted 6.4.0+ #413 [ 23.096770] Call Trace: [ 23.096910] [ 23.097030] dump_stack_lvl+0x60/0xc0 [ 23.097218] print_report+0xcf/0x630 [ 23.097388] ? nft_byteorder_eval+0x13c/0x320 [ 23.097577] ? kasan_addr_to_slab+0xd/0xc0 [ 23.097760] ? nft_byteorder_eval+0x13c/0x320 [ 23.097949] kasan_report+0xc9/0x110 [ 23.098106] ? nft_byteorder_eval+0x13c/0x320 [ 23.098298] __asan_load2+0x83/0xd0 [ 23.098453] nft_byteorder_eval+0x13c/0x320 [ 23.098659] nft_do_chain+0x1c8/0xc50 [ 23.098852] ? __pfx_nft_do_chain+0x10/0x10 [ 23.099078] ? __kasan_check_read+0x11/0x20 [ 23.099295] ? __pfx___lock_acquire+0x10/0x10 [ 23.099535] ? __pfx___lock_acquire+0x10/0x10 [ 23.099745] ? __kasan_check_read+0x11/0x20 [ 23.099929] nft_do_chain_ipv4+0xfe/0x140 [ 23.100105] ? __pfx_nft_do_chain_ipv4+0x10/0x10 [ 23.100327] ? lock_release+0x204/0x400 [ 23.100515] ? nf_hook.constprop.0+0x340/0x550 [ 23.100779] nf_hook_slow+0x6c/0x100 [ 23.100977] ? __pfx_nft_do_chain_ipv4+0x10/0x10 [ 23.101223] nf_hook.constprop.0+0x334/0x550 [ 23.101443] ? __pfx_ip_local_deliver_finish+0x10/0x10 [ 23.101677] ? __pfx_nf_hook.constprop.0+0x10/0x10 [ 23.101882] ? __pfx_ip_rcv_finish+0x10/0x10 [ 23.102071] ? __pfx_ip_local_deliver_finish+0x10/0x10 [ 23.102291] ? rcu_read_lock_held+0x4b/0x70 [ 23.102481] ip_local_deliver+0xbb/0x110 [ 23.102665] ? __pfx_ip_rcv+0x10/0x10 [ 23.102839] ip_rcv+0x199/0x2a0 [ 23.102980] ? __pfx_ip_rcv+0x10/0x10 [ 23.103140] __netif_receive_skb_one_core+0x13e/0x150 [ 23.103362] ? __pfx___netif_receive_skb_one_core+0x10/0x10 [ 23.103647] ? mark_held_locks+0x48/0xa0 [ 23.103819] ? process_backlog+0x36c/0x380 [ 23.103999] __netif_receive_skb+0x23/0xc0 [ 23.104179] process_backlog+0x91/0x380 [ 23.104350] __napi_poll.constprop.0+0x66/0x360 [ 23.104589] ? net_rx_action+0x1cb/0x610 [ 23.104811] net_rx_action+0x33e/0x610 [ 23.105024] ? _raw_spin_unlock+0x23/0x50 [ 23.105257] ? __pfx_net_rx_action+0x10/0x10 [ 23.105485] ? mark_held_locks+0x48/0xa0 [ 23.105741] __do_softirq+0xfa/0x5ab [ 23.105956] ? __dev_queue_xmit+0x765/0x1c00 [ 23.106193] do_softirq.part.0+0x49/0xc0 [ 23.106423] [ 23.106547] [ 23.106670] __local_bh_enable_ip+0xf5/0x120 [ 23.106903] __dev_queue_xmit+0x789/0x1c00 [ 23.107131] ? __pfx___dev_queue_xmit+0x10/0x10 [ 23.107381] ? find_held_lock+0x8e/0xb0 [ 23.107585] ? lock_release+0x204/0x400 [ 23.107798] ? neigh_resolve_output+0x185/0x350 [ 23.108049] ? mark_held_locks+0x48/0xa0 [ 23.108265] ? neigh_resolve_output+0x185/0x350 [ 23.108514] neigh_resolve_output+0x246/0x350 [ 23.108753] ? 
neigh_resolve_output+0x246/0x350 [ 23.109003] ip_finish_output2+0x3c3/0x10b0 [ 23.109250] ? __pfx_ip_finish_output2+0x10/0x10 [ 23.109510] ? __pfx_nf_hook+0x10/0x10 [ 23.109732] __ip_finish_output+0x217/0x390 [ 23.109978] ip_finish_output+0x2f/0x130 [ 23.110207] ip_output+0xc9/0x170 [ 23.110404] ip_push_pending_frames+0x1a0/0x240 [ 23.110652] raw_sendmsg+0x102e/0x19e0 [ 23.110871] ? __pfx_raw_sendmsg+0x10/0x10 [ 23.111093] ? lock_release+0x204/0x400 [ 23.111304] ? __mod_lruvec_page_state+0x148/0x330 [ 23.111567] ? find_held_lock+0x8e/0xb0 [ 23.111777] ? find_held_lock+0x8e/0xb0 [ 23.111993] ? __rcu_read_unlock+0x7c/0x2f0 [ 23.112225] ? aa_sk_perm+0x18a/0x550 [ 23.112431] ? filemap_map_pages+0x4f1/0x900 [ 23.112665] ? __pfx_aa_sk_perm+0x10/0x10 [ 23.112880] ? find_held_lock+0x8e/0xb0 [ 23.113098] inet_sendmsg+0xa0/0xb0 [ 23.113297] ? inet_sendmsg+0xa0/0xb0 [ 23.113500] ? __pfx_inet_sendmsg+0x10/0x10 [ 23.113727] sock_sendmsg+0xf4/0x100 [ 23.113924] ? move_addr_to_kernel.part.0+0x4f/0xa0 [ 23.114190] __sys_sendto+0x1d4/0x290 [ 23.114391] ? __pfx___sys_sendto+0x10/0x10 [ 23.114621] ? __pfx_mark_lock.part.0+0x10/0x10 [ 23.114869] ? lock_release+0x204/0x400 [ 23.115076] ? find_held_lock+0x8e/0xb0 [ 23.115287] ? rcu_is_watching+0x23/0x60 [ 23.115503] ? __rseq_handle_notify_resume+0x6e2/0x860 [ 23.115778] ? __kasan_check_write+0x14/0x30 [ 23.116008] ? blkcg_maybe_throttle_current+0x8d/0x770 [ 23.116285] ? mark_held_locks+0x28/0xa0 [ 23.116503] ? do_syscall_64+0x37/0x90 [ 23.116713] __x64_sys_sendto+0x7f/0xb0 [ 23.116924] do_syscall_64+0x59/0x90 [ 23.117123] ? irqentry_exit_to_user_mode+0x25/0x30 [ 23.117387] ? irqentry_exit+0x77/0xb0 [ 23.117593] ? exc_page_fault+0x92/0x140 [ 23.117806] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 [ 23.118081] RIP: 0033:0x7f744aee2bba [ 23.118282] Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 7e c3 0f 1f 44 00 00 41 54 48 83 ec 30 44 89 [ 23.119237] RSP: 002b:00007ffd04a7c9f8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 23.119644] RAX: ffffffffffffffda RBX: 00007ffd04a7e0a0 RCX: 00007f744aee2bba [ 23.120023] RDX: 0000000000000040 RSI: 000056488e9e6300 RDI: 0000000000000003 [ 23.120413] RBP: 000056488e9e6300 R08: 00007ffd04a80320 R09: 0000000000000010 [ 23.120809] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000040 [ 23.121219] R13: 00007ffd04a7dc38 R14: 00007ffd04a7ca00 R15: 00007ffd04a7e0a0 [ 23.121617] [ 23.121749] [ 23.121845] The buggy address belongs to the virtual mapping at [ 23.121845] [ffffc90000000000, ffffc90000009000) created by: [ 23.121845] irq_init_percpu_irqstack+0x1cf/0x270 [ 23.122707] [ 23.122803] The buggy address belongs to the physical page: [ 23.123104] page:0000000072ac19f0 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x24a09 [ 23.123609] flags: 0xfffffc0001000(reserved|node=0|zone=1|lastcpupid=0x1fffff) [ 23.123998] page_type: 0xffffffff() [ 23.124194] raw: 000fffffc0001000 ffffea0000928248 ffffea0000928248 0000000000000000 [ 23.124610] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 [ 23.125023] page dumped because: kasan: bad access detected [ 23.125326] [ 23.125421] Memory state around the buggy address: [ 23.125682] ffffc90000007800: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 23.126072] ffffc90000007880: 00 00 00 00 00 f1 f1 f1 f1 f1 f1 00 00 f2 f2 00 [ 23.126455] >ffffc90000007900: 00 00 00 00 00 00 00 00 00 f2 f2 f2 f2 00 00 00 [ 23.126840] ^ [ 23.127138] 
ffffc90000007980: 00 00 00 00 00 00 00 00 00 00 00 00 00 f3 f3 f3 [ 23.127522] ffffc90000007a00: f3 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 [ 23.127906] ================================================================== [ 23.128324] Disabling lock debugging due to kernel taint Using simple s16 pointers for the 16-bit accesses fixes the problem. For the 32-bit accesses, src and dst can be used directly. Fixes: 96518518cc41 ("netfilter: add nftables") Cc: stable@vger.kernel.org Reported-by: Tanguy DUBROCA (@SidewayRE) from @Synacktiv working with ZDI Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_byteorder.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index 9a85e797ed58..e596d1a842f7 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -30,11 +30,11 @@ void nft_byteorder_eval(const struct nft_expr *expr, const struct nft_byteorder *priv = nft_expr_priv(expr); u32 *src = ®s->data[priv->sreg]; u32 *dst = ®s->data[priv->dreg]; - union { u32 u32; u16 u16; } *s, *d; + u16 *s16, *d16; unsigned int i; - s = (void *)src; - d = (void *)dst; + s16 = (void *)src; + d16 = (void *)dst; switch (priv->size) { case 8: { @@ -62,11 +62,11 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 4; i++) - d[i].u32 = ntohl((__force __be32)s[i].u32); + dst[i] = ntohl((__force __be32)src[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 4; i++) - d[i].u32 = (__force __u32)htonl(s[i].u32); + dst[i] = (__force __u32)htonl(src[i]); break; } break; @@ -74,11 +74,11 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 2; i++) - d[i].u16 = ntohs((__force __be16)s[i].u16); + d16[i] = ntohs((__force __be16)s16[i]); break; case NFT_BYTEORDER_HTON: for (i = 0; i < priv->len / 2; i++) - d[i].u16 = (__force __u16)htons(s[i].u16); + d16[i] = (__force __u16)htons(s16[i]); break; } break; From 5415ccd50a8620c8cbaa32d6f18c946c453566f5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 5 Jul 2023 20:17:29 +0530 Subject: [PATCH 24/79] bpf: Fix max stack depth check for async callbacks The check_max_stack_depth pass happens after the verifier's symbolic execution, and attempts to walk the call graph of the BPF program, ensuring that the stack usage stays within bounds for all possible call chains. There are two cases to consider: bpf_pseudo_func and bpf_pseudo_call. In the former case, the callback pointer is loaded into a register, and is assumed that it is passed to some helper later which calls it (however there is no way to be sure), but the check remains conservative and accounts the stack usage anyway. For this particular case, asynchronous callbacks are skipped as they execute asynchronously when their corresponding event fires. The case of bpf_pseudo_call is simpler and we know that the call is definitely made, hence the stack depth of the subprog is accounted for. However, the current check still skips an asynchronous callback even if a bpf_pseudo_call was made for it. This is erroneous, as it will miss accounting for the stack usage of the asynchronous callback, which can be used to breach the maximum stack depth limit. Fix this by only skipping asynchronous callbacks when the instruction is not a pseudo call to the subprog. 
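To make the breach concrete (a rough sketch; the verifier's exact frame accounting adds some per-call overhead): with MAX_BPF_STACK at 512 bytes, take a main program that uses a 256-byte buffer and also calls its async timer callback directly, the callback using another 256 bytes. The chain really consumes 256 + 256 = 512 bytes, but before this fix the callback's frame was skipped, so the verifier accounted only 256 bytes and accepted the program. With the fix, the direct call is walked like any other bpf_pseudo_call and the combined depth is rejected, which is precisely the "combined stack size of 2 calls" failure that the selftest in the next patch expects.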
Fixes: 7ddc80a476c2 ("bpf: Teach stack depth check about async callbacks.") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230705144730.235802-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 11e54dd8b6dd..930b5555cfd3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5642,8 +5642,9 @@ continue_func: verbose(env, "verifier bug. subprog has tail_call and async cb\n"); return -EFAULT; } - /* async callbacks don't increase bpf prog stack size */ - continue; + /* async callbacks don't increase bpf prog stack size unless called directly */ + if (!bpf_pseudo_call(insn + i)) + continue; } i = next_insn; From 906bd22a44c7c381ae92996129b075ea7beba8f6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 5 Jul 2023 20:17:30 +0530 Subject: [PATCH 25/79] selftests/bpf: Add selftest for check_stack_max_depth bug Use the bpf_timer_set_callback helper to mark timer_cb as an async callback, and put a direct call to timer_cb in the main subprog. As the check_stack_max_depth happens after the do_check pass, the order does not matter. Without the previous fix, the test passes successfully. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230705144730.235802-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/async_stack_depth.c | 9 +++++ .../selftests/bpf/progs/async_stack_depth.c | 40 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/async_stack_depth.c create mode 100644 tools/testing/selftests/bpf/progs/async_stack_depth.c diff --git a/tools/testing/selftests/bpf/prog_tests/async_stack_depth.c b/tools/testing/selftests/bpf/prog_tests/async_stack_depth.c new file mode 100644 index 000000000000..118abc29b236 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/async_stack_depth.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "async_stack_depth.skel.h" + +void test_async_stack_depth(void) +{ + RUN_TESTS(async_stack_depth); +} diff --git a/tools/testing/selftests/bpf/progs/async_stack_depth.c b/tools/testing/selftests/bpf/progs/async_stack_depth.c new file mode 100644 index 000000000000..477ba950bb43 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/async_stack_depth.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include "bpf_misc.h" + +struct hmap_elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 64); + __type(key, int); + __type(value, struct hmap_elem); +} hmap SEC(".maps"); + +__attribute__((noinline)) +static int timer_cb(void *map, int *key, struct bpf_timer *timer) +{ + volatile char buf[256] = {}; + return buf[69]; +} + +SEC("tc") +__failure __msg("combined stack size of 2 calls") +int prog(struct __sk_buff *ctx) +{ + struct hmap_elem *elem; + volatile char buf[256] = {}; + + elem = bpf_map_lookup_elem(&hmap, &(int){0}); + if (!elem) + return 0; + + timer_cb(NULL, NULL, NULL); + return bpf_timer_set_callback(&elem->timer, timer_cb) + buf[0]; +} + +char _license[] SEC("license") = "GPL"; From 21327f81db6337c8843ce755b01523c7d3df715b Mon Sep 17 00:00:00 2001 From: Klaus Kudielka Date: Wed, 5 Jul 2023 07:37:12 +0200 Subject: [PATCH 26/79] net: mvneta: fix txq_map in case of txq_number==1 If we boot with mvneta.txq_number=1, the txq_map is set incorrectly: 
MVNETA_CPU_TXQ_ACCESS(1) refers to TX queue 1, but only TX queue 0 is initialized. Fix this. Fixes: 50bf8cb6fc9c ("net: mvneta: Configure XPS support") Signed-off-by: Klaus Kudielka Reviewed-by: Michal Kubiak Link: https://lore.kernel.org/r/20230705053712.3914-1-klaus.kudielka@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/mvneta.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index ff5647bcdfca..acf4f6ba73a6 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -1511,7 +1511,7 @@ static void mvneta_defaults_set(struct mvneta_port *pp) */ if (txq_number == 1) txq_map = (cpu == pp->rxq_def) ? - MVNETA_CPU_TXQ_ACCESS(1) : 0; + MVNETA_CPU_TXQ_ACCESS(0) : 0; } else { txq_map = MVNETA_CPU_TXQ_ACCESS_ALL_MASK; @@ -4356,7 +4356,7 @@ static void mvneta_percpu_elect(struct mvneta_port *pp) */ if (txq_number == 1) txq_map = (cpu == elected_cpu) ? - MVNETA_CPU_TXQ_ACCESS(1) : 0; + MVNETA_CPU_TXQ_ACCESS(0) : 0; else txq_map = mvreg_read(pp, MVNETA_CPU_MAP(cpu)) & MVNETA_CPU_TXQ_ACCESS_ALL_MASK; From 009d30f1a77795014f151ba317fcbfc2f17153c6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 5 Jul 2023 13:44:20 +0300 Subject: [PATCH 27/79] net: mscc: ocelot: extend ocelot->fwd_domain_lock to cover ocelot->tas_lock In a future commit we will have to call vsc9959_tas_guard_bands_update() from ocelot_port_update_active_preemptible_tcs(), and that will be impossible due to the AB/BA locking dependencies between ocelot->tas_lock and ocelot->fwd_domain_lock. Just like we did in commit 3ff468ef987e ("net: mscc: ocelot: remove struct ocelot_mm_state :: lock"), the only solution is to expand the scope of ocelot->fwd_domain_lock for it to also serialize changes made to the Time-Aware Shaper, because those will have to result in a recalculation of cut-through TCs, which is something that depends on the forwarding domain. 
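For the record, the AB/BA dependency being removed looks like this (a schematic of the two lock orders, not actual call sites):

/*   Thread A (TAS path)                    Thread B (fwd domain path)
 *   mutex_lock(&ocelot->tas_lock);         mutex_lock(&ocelot->fwd_domain_lock);
 *   ...                                    ...
 *   mutex_lock(&ocelot->fwd_domain_lock);  mutex_lock(&ocelot->tas_lock);
 *
 * Each thread blocks on the mutex the other already holds, so neither
 * can make progress. With a single fwd_domain_lock covering both
 * serialization domains there is only one lock to take and the cycle
 * cannot form.
 */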
Signed-off-by: Vladimir Oltean Message-ID: <20230705104422.49025-2-vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix.c | 4 +-- drivers/net/dsa/ocelot/felix_vsc9959.c | 36 ++++++++++++++++---------- drivers/net/ethernet/mscc/ocelot.c | 1 - drivers/net/ethernet/mscc/ocelot_mm.c | 7 ++--- include/soc/mscc/ocelot.h | 8 +++--- 5 files changed, 30 insertions(+), 26 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 70c0e2b1936b..0c1207613aa4 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -1790,12 +1790,12 @@ static int felix_change_mtu(struct dsa_switch *ds, int port, int new_mtu) ocelot_port_set_maxlen(ocelot, port, new_mtu); - mutex_lock(&ocelot->tas_lock); + mutex_lock(&ocelot->fwd_domain_lock); if (ocelot_port->taprio && felix->info->tas_guard_bands_update) felix->info->tas_guard_bands_update(ocelot, port); - mutex_unlock(&ocelot->tas_lock); + mutex_unlock(&ocelot->fwd_domain_lock); return 0; } diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index bb39fedd46c7..56b8bcac9690 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1217,7 +1217,7 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) u8 tas_speed; int tc; - lockdep_assert_held(&ocelot->tas_lock); + lockdep_assert_held(&ocelot->fwd_domain_lock); taprio = ocelot_port->taprio; @@ -1259,8 +1259,6 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) vsc9959_tas_min_gate_lengths(taprio, min_gate_len); - mutex_lock(&ocelot->fwd_domain_lock); - for (tc = 0; tc < OCELOT_NUM_TC; tc++) { u32 requested_max_sdu = vsc9959_tas_tc_max_sdu(taprio, tc); u64 remaining_gate_len_ps; @@ -1323,8 +1321,6 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) ocelot_write_rix(ocelot, maxlen, QSYS_PORT_MAX_SDU, port); ocelot->ops->cut_through_fwd(ocelot); - - mutex_unlock(&ocelot->fwd_domain_lock); } static void vsc9959_sched_speed_set(struct ocelot *ocelot, int port, @@ -1351,7 +1347,7 @@ static void vsc9959_sched_speed_set(struct ocelot *ocelot, int port, break; } - mutex_lock(&ocelot->tas_lock); + mutex_lock(&ocelot->fwd_domain_lock); ocelot_rmw_rix(ocelot, QSYS_TAG_CONFIG_LINK_SPEED(tas_speed), @@ -1361,7 +1357,7 @@ static void vsc9959_sched_speed_set(struct ocelot *ocelot, int port, if (ocelot_port->taprio) vsc9959_tas_guard_bands_update(ocelot, port); - mutex_unlock(&ocelot->tas_lock); + mutex_unlock(&ocelot->fwd_domain_lock); } static void vsc9959_new_base_time(struct ocelot *ocelot, ktime_t base_time, @@ -1409,7 +1405,7 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port, int ret, i; u32 val; - mutex_lock(&ocelot->tas_lock); + mutex_lock(&ocelot->fwd_domain_lock); if (taprio->cmd == TAPRIO_CMD_DESTROY) { ocelot_port_mqprio(ocelot, port, &taprio->mqprio); @@ -1421,7 +1417,7 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port, vsc9959_tas_guard_bands_update(ocelot, port); - mutex_unlock(&ocelot->tas_lock); + mutex_unlock(&ocelot->fwd_domain_lock); return 0; } else if (taprio->cmd != TAPRIO_CMD_REPLACE) { ret = -EOPNOTSUPP; @@ -1504,7 +1500,7 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port, ocelot_port->taprio = taprio_offload_get(taprio); vsc9959_tas_guard_bands_update(ocelot, port); - mutex_unlock(&ocelot->tas_lock); + mutex_unlock(&ocelot->fwd_domain_lock); return 0; @@ -1512,7 +1508,7 @@ err_reset_tc: 
taprio->mqprio.qopt.num_tc = 0; ocelot_port_mqprio(ocelot, port, &taprio->mqprio); err_unlock: - mutex_unlock(&ocelot->tas_lock); + mutex_unlock(&ocelot->fwd_domain_lock); return ret; } @@ -1525,7 +1521,7 @@ static void vsc9959_tas_clock_adjust(struct ocelot *ocelot) int port; u32 val; - mutex_lock(&ocelot->tas_lock); + mutex_lock(&ocelot->fwd_domain_lock); for (port = 0; port < ocelot->num_phys_ports; port++) { ocelot_port = ocelot->ports[port]; @@ -1563,7 +1559,7 @@ static void vsc9959_tas_clock_adjust(struct ocelot *ocelot) QSYS_TAG_CONFIG_ENABLE, QSYS_TAG_CONFIG, port); } - mutex_unlock(&ocelot->tas_lock); + mutex_unlock(&ocelot->fwd_domain_lock); } static int vsc9959_qos_port_cbs_set(struct dsa_switch *ds, int port, @@ -1634,6 +1630,18 @@ static int vsc9959_qos_query_caps(struct tc_query_caps_base *base) } } +static int vsc9959_qos_port_mqprio(struct ocelot *ocelot, int port, + struct tc_mqprio_qopt_offload *mqprio) +{ + int ret; + + mutex_lock(&ocelot->fwd_domain_lock); + ret = ocelot_port_mqprio(ocelot, port, mqprio); + mutex_unlock(&ocelot->fwd_domain_lock); + + return ret; +} + static int vsc9959_port_setup_tc(struct dsa_switch *ds, int port, enum tc_setup_type type, void *type_data) @@ -1646,7 +1654,7 @@ static int vsc9959_port_setup_tc(struct dsa_switch *ds, int port, case TC_SETUP_QDISC_TAPRIO: return vsc9959_qos_port_tas_set(ocelot, port, type_data); case TC_SETUP_QDISC_MQPRIO: - return ocelot_port_mqprio(ocelot, port, type_data); + return vsc9959_qos_port_mqprio(ocelot, port, type_data); case TC_SETUP_QDISC_CBS: return vsc9959_qos_port_cbs_set(ds, port, type_data); default: diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 2fa833d041ba..56ccbd4c37fe 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -2927,7 +2927,6 @@ int ocelot_init(struct ocelot *ocelot) mutex_init(&ocelot->mact_lock); mutex_init(&ocelot->fwd_domain_lock); - mutex_init(&ocelot->tas_lock); spin_lock_init(&ocelot->ptp_clock_lock); spin_lock_init(&ocelot->ts_id_lock); diff --git a/drivers/net/ethernet/mscc/ocelot_mm.c b/drivers/net/ethernet/mscc/ocelot_mm.c index fb3145118d68..f3c0e6c32934 100644 --- a/drivers/net/ethernet/mscc/ocelot_mm.c +++ b/drivers/net/ethernet/mscc/ocelot_mm.c @@ -89,17 +89,14 @@ void ocelot_port_change_fp(struct ocelot *ocelot, int port, { struct ocelot_mm_state *mm = &ocelot->mm[port]; - mutex_lock(&ocelot->fwd_domain_lock); + lockdep_assert_held(&ocelot->fwd_domain_lock); if (mm->preemptible_tcs == preemptible_tcs) - goto out_unlock; + return; mm->preemptible_tcs = preemptible_tcs; ocelot_port_update_active_preemptible_tcs(ocelot, port); - -out_unlock: - mutex_unlock(&ocelot->fwd_domain_lock); } static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port) diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 22aae505c813..eb5f8914a66c 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -863,12 +863,12 @@ struct ocelot { struct mutex stat_view_lock; /* Lock for serializing access to the MAC table */ struct mutex mact_lock; - /* Lock for serializing forwarding domain changes */ + /* Lock for serializing forwarding domain changes, including the + * configuration of the Time-Aware Shaper, MAC Merge layer and + * cut-through forwarding, on which it depends + */ struct mutex fwd_domain_lock; - /* Lock for serializing Time-Aware Shaper changes */ - struct mutex tas_lock; - struct workqueue_struct *owq; u8 ptp:1; From c60819149b637d0f9f7f66e110d2a0d90a3993ea 
Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 5 Jul 2023 13:44:21 +0300 Subject: [PATCH 28/79] net: dsa: felix: make vsc9959_tas_guard_bands_update() visible to ocelot->ops In a future change we will need to make ocelot_port_update_active_preemptible_tcs() call vsc9959_tas_guard_bands_update(), but that is currently not possible, since the ocelot switch lib does not have access to functions private to the DSA wrapper. Move the pointer to vsc9959_tas_guard_bands_update() from felix->info (which is private to the DSA driver) to ocelot->ops (which is also visible to the ocelot switch lib). Signed-off-by: Vladimir Oltean Message-ID: <20230705104422.49025-3-vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix.c | 5 ++--- drivers/net/dsa/ocelot/felix.h | 1 - drivers/net/dsa/ocelot/felix_vsc9959.c | 2 +- include/soc/mscc/ocelot.h | 1 + 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 0c1207613aa4..dee43caee19e 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -1786,14 +1786,13 @@ static int felix_change_mtu(struct dsa_switch *ds, int port, int new_mtu) { struct ocelot *ocelot = ds->priv; struct ocelot_port *ocelot_port = ocelot->ports[port]; - struct felix *felix = ocelot_to_felix(ocelot); ocelot_port_set_maxlen(ocelot, port, new_mtu); mutex_lock(&ocelot->fwd_domain_lock); - if (ocelot_port->taprio && felix->info->tas_guard_bands_update) - felix->info->tas_guard_bands_update(ocelot, port); + if (ocelot_port->taprio && ocelot->ops->tas_guard_bands_update) + ocelot->ops->tas_guard_bands_update(ocelot, port); mutex_unlock(&ocelot->fwd_domain_lock); diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h index 96008c046da5..1d4befe7cfe8 100644 --- a/drivers/net/dsa/ocelot/felix.h +++ b/drivers/net/dsa/ocelot/felix.h @@ -57,7 +57,6 @@ struct felix_info { void (*mdio_bus_free)(struct ocelot *ocelot); int (*port_setup_tc)(struct dsa_switch *ds, int port, enum tc_setup_type type, void *type_data); - void (*tas_guard_bands_update)(struct ocelot *ocelot, int port); void (*port_sched_speed_set)(struct ocelot *ocelot, int port, u32 speed); void (*phylink_mac_config)(struct ocelot *ocelot, int port, diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index 56b8bcac9690..d7caadd13f83 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -2599,6 +2599,7 @@ static const struct ocelot_ops vsc9959_ops = { .cut_through_fwd = vsc9959_cut_through_fwd, .tas_clock_adjust = vsc9959_tas_clock_adjust, .update_stats = vsc9959_update_stats, + .tas_guard_bands_update = vsc9959_tas_guard_bands_update, }; static const struct felix_info felix_info_vsc9959 = { @@ -2624,7 +2625,6 @@ static const struct felix_info felix_info_vsc9959 = { .port_modes = vsc9959_port_modes, .port_setup_tc = vsc9959_port_setup_tc, .port_sched_speed_set = vsc9959_sched_speed_set, - .tas_guard_bands_update = vsc9959_tas_guard_bands_update, }; /* The INTB interrupt is shared between for PTP TX timestamp availability diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index eb5f8914a66c..a8c2817335b9 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -663,6 +663,7 @@ struct ocelot_ops { struct flow_stats *stats); void (*cut_through_fwd)(struct ocelot *ocelot); void (*tas_clock_adjust)(struct ocelot *ocelot); + void (*tas_guard_bands_update)(struct 
ocelot *ocelot, int port); void (*update_stats)(struct ocelot *ocelot); }; From c6efb4ae387c79bf0d4da286108c810b7b40de3c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 5 Jul 2023 13:44:22 +0300 Subject: [PATCH 29/79] net: mscc: ocelot: fix oversize frame dropping for preemptible TCs This switch implements Hold/Release in a strange way, with no control from the user as required by IEEE 802.1Q-2018 through Set-And-Hold-MAC and Set-And-Release-MAC, but rather, it emits HOLD requests implicitly based on the schedule. Namely, when the gate of a preemptible TC is about to close (actually QSYS::PREEMPTION_CFG.HOLD_ADVANCE octet times in advance of this event), the QSYS seems to emit a HOLD request pulse towards the MAC which preempts the currently transmitted packet, and further packets are held back in the queue system. This allows large frames to be squeezed through small time slots, because HOLD requests initiated by the gate events result in the frame being segmented in multiple fragments, the bit time of which is equal to the size of the time slot. It has been reported that the vsc9959_tas_guard_bands_update() logic breaks this, because it doesn't take preemptible TCs into account, and enables oversized frame dropping when the time slot doesn't allow a full MTU to be sent, but it does allow 2*minFragSize to be sent (128B). Packets larger than 128B are dropped instead of being sent in multiple fragments. Confusingly, the manual says: | For guard band, SDU calculation of a traffic class of a port, if | preemption is enabled (through 'QSYS::PREEMPTION_CFG.P_QUEUES') then | QSYS::PREEMPTION_CFG.HOLD_ADVANCE is used, otherwise | QSYS::QMAXSDU_CFG_*.QMAXSDU_* is used. but this only refers to the static guard band durations, and the QMAXSDU_CFG_* registers have dual purpose - the other being oversized frame dropping, which takes place irrespective of whether frames are preemptible or express. So, to fix the problem, we need to call vsc9959_tas_guard_bands_update() from ocelot_port_update_active_preemptible_tcs(), and modify the guard band logic to consider a different (lower) oversize limit for preemptible traffic classes. 
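To put rough numbers on the new check (an illustrative calculation, assuming a 1 Gbps link, i.e. about 8000 ps per byte, an addFragSize of 0, i.e. 64-byte minimum fragments, and a 1518-byte max frame): an express TC needs needed_bit_time_ps = (1518 + 24) * 8000, about 12.3 us of open gate, while a preemptible TC only needs needed_min_frag_time_ps = (24 + 2 * 64) * 8000, about 1.2 us. A 2 us time slot would therefore wrongly trigger oversize dropping for preemptible traffic under the old single-threshold logic, but passes the per-TC comparison introduced below.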
Fixes: 403ffc2c34de ("net: mscc: ocelot: add support for preemptible traffic classes") Signed-off-by: Vladimir Oltean Message-ID: <20230705104422.49025-4-vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix_vsc9959.c | 21 +++++++++++++++++---- drivers/net/ethernet/mscc/ocelot_mm.c | 7 +++++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index d7caadd13f83..1c113957fcf4 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1209,11 +1209,13 @@ static u32 vsc9959_tas_tc_max_sdu(struct tc_taprio_qopt_offload *taprio, int tc) static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) { struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct ocelot_mm_state *mm = &ocelot->mm[port]; struct tc_taprio_qopt_offload *taprio; u64 min_gate_len[OCELOT_NUM_TC]; + u32 val, maxlen, add_frag_size; + u64 needed_min_frag_time_ps; int speed, picos_per_byte; u64 needed_bit_time_ps; - u32 val, maxlen; u8 tas_speed; int tc; @@ -1253,9 +1255,18 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) */ needed_bit_time_ps = (u64)(maxlen + 24) * picos_per_byte; + /* Preemptible TCs don't need to pass a full MTU, the port will + * automatically emit a HOLD request when a preemptible TC gate closes + */ + val = ocelot_read_rix(ocelot, QSYS_PREEMPTION_CFG, port); + add_frag_size = QSYS_PREEMPTION_CFG_MM_ADD_FRAG_SIZE_X(val); + needed_min_frag_time_ps = picos_per_byte * + (u64)(24 + 2 * ethtool_mm_frag_size_add_to_min(add_frag_size)); + dev_dbg(ocelot->dev, - "port %d: max frame size %d needs %llu ps at speed %d\n", - port, maxlen, needed_bit_time_ps, speed); + "port %d: max frame size %d needs %llu ps, %llu ps for mPackets at speed %d\n", + port, maxlen, needed_bit_time_ps, needed_min_frag_time_ps, + speed); vsc9959_tas_min_gate_lengths(taprio, min_gate_len); @@ -1267,7 +1278,9 @@ static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port) remaining_gate_len_ps = vsc9959_tas_remaining_gate_len_ps(min_gate_len[tc]); - if (remaining_gate_len_ps > needed_bit_time_ps) { + if ((mm->active_preemptible_tcs & BIT(tc)) ? + remaining_gate_len_ps > needed_min_frag_time_ps : + remaining_gate_len_ps > needed_bit_time_ps) { /* Setting QMAXSDU_CFG to 0 disables oversized frame * dropping. */ diff --git a/drivers/net/ethernet/mscc/ocelot_mm.c b/drivers/net/ethernet/mscc/ocelot_mm.c index f3c0e6c32934..c815ae64e39d 100644 --- a/drivers/net/ethernet/mscc/ocelot_mm.c +++ b/drivers/net/ethernet/mscc/ocelot_mm.c @@ -67,10 +67,13 @@ void ocelot_port_update_active_preemptible_tcs(struct ocelot *ocelot, int port) val = mm->preemptible_tcs; /* Cut through switching doesn't work for preemptible priorities, - * so first make sure it is disabled. + * so first make sure it is disabled. Also, changing the preemptible + * TCs affects the oversized frame dropping logic, so that needs to be + * re-triggered. And since tas_guard_bands_update() also implicitly + * calls cut_through_fwd(), we don't need to explicitly call it. 
*/ mm->active_preemptible_tcs = val; - ocelot->ops->cut_through_fwd(ocelot); + ocelot->ops->tas_guard_bands_update(ocelot, port); dev_dbg(ocelot->dev, "port %d %s/%s, MM TX %s, preemptible TCs 0x%x, active 0x%x\n", From 525c469e5de9bf7e53574396196e80fc716ac9eb Mon Sep 17 00:00:00 2001 From: Quan Zhou Date: Wed, 5 Jul 2023 23:26:38 +0800 Subject: [PATCH 30/79] wifi: mt76: mt7921e: fix init command fail with enabled device For some cases as below, we may encounter unpredictable chip state in driver probe() * The system reboot flow does not work properly, for example after a kernel oops while rebooting, and the driver does not go back to its default state. * Similar to the flow above: if the device was enabled in BIOS or UEFI, the system may switch to Linux without the driver being fully shut down. To avoid the problem, force the device back to its default state in probe() * mt7921e_mcu_fw_pmctrl() : return control privilege to the chip side. * mt7921_wfsys_reset() : clean up the chip config before resource init. Error log [59007.600714] mt7921e 0000:02:00.0: ASIC revision: 79220010 [59010.889773] mt7921e 0000:02:00.0: Message 00000010 (seq 1) timeout [59010.889786] mt7921e 0000:02:00.0: Failed to get patch semaphore [59014.217839] mt7921e 0000:02:00.0: Message 00000010 (seq 2) timeout [59014.217852] mt7921e 0000:02:00.0: Failed to get patch semaphore [59017.545880] mt7921e 0000:02:00.0: Message 00000010 (seq 3) timeout [59017.545893] mt7921e 0000:02:00.0: Failed to get patch semaphore [59020.874086] mt7921e 0000:02:00.0: Message 00000010 (seq 4) timeout [59020.874099] mt7921e 0000:02:00.0: Failed to get patch semaphore [59024.202019] mt7921e 0000:02:00.0: Message 00000010 (seq 5) timeout [59024.202033] mt7921e 0000:02:00.0: Failed to get patch semaphore [59027.530082] mt7921e 0000:02:00.0: Message 00000010 (seq 6) timeout [59027.530096] mt7921e 0000:02:00.0: Failed to get patch semaphore [59030.857888] mt7921e 0000:02:00.0: Message 00000010 (seq 7) timeout [59030.857904] mt7921e 0000:02:00.0: Failed to get patch semaphore [59034.185946] mt7921e 0000:02:00.0: Message 00000010 (seq 8) timeout [59034.185961] mt7921e 0000:02:00.0: Failed to get patch semaphore [59037.514249] mt7921e 0000:02:00.0: Message 00000010 (seq 9) timeout [59037.514262] mt7921e 0000:02:00.0: Failed to get patch semaphore [59040.842362] mt7921e 0000:02:00.0: Message 00000010 (seq 10) timeout [59040.842375] mt7921e 0000:02:00.0: Failed to get patch semaphore [59040.923845] mt7921e 0000:02:00.0: hardware init failed Cc: stable@vger.kernel.org Fixes: 5c14a5f944b9 ("mt76: mt7921: introduce mt7921e support") Tested-by: Kai-Heng Feng Tested-by: Juan Martinez Co-developed-by: Leon Yen Signed-off-by: Leon Yen Signed-off-by: Quan Zhou Signed-off-by: Deren Wu Message-ID: <39fcb7cee08d4ab940d38d82f21897483212483f.1688569385.git.deren.wu@mediatek.com> Signed-off-by: Jakub Kicinski --- drivers/net/wireless/mediatek/mt76/mt7921/dma.c | 4 ---- drivers/net/wireless/mediatek/mt76/mt7921/mcu.c | 8 -------- drivers/net/wireless/mediatek/mt76/mt7921/pci.c | 8 ++++++++ 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/dma.c b/drivers/net/wireless/mediatek/mt76/mt7921/dma.c index f0a80c2b476a..4153cd6c2a01 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/dma.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/dma.c @@ -231,10 +231,6 @@ int mt7921_dma_init(struct mt7921_dev *dev) if (ret) return ret; - ret = mt7921_wfsys_reset(dev); - if (ret) - return ret; - /* init tx queue */ ret =
mt76_connac_init_tx_queues(dev->phy.mt76, MT7921_TXQ_BAND0, MT7921_TX_RING_SIZE, diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c index c69ce6df4956..f55caa00ac69 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c @@ -476,12 +476,6 @@ static int mt7921_load_firmware(struct mt7921_dev *dev) { int ret; - ret = mt76_get_field(dev, MT_CONN_ON_MISC, MT_TOP_MISC2_FW_N9_RDY); - if (ret && mt76_is_mmio(&dev->mt76)) { - dev_dbg(dev->mt76.dev, "Firmware is already download\n"); - goto fw_loaded; - } - ret = mt76_connac2_load_patch(&dev->mt76, mt7921_patch_name(dev)); if (ret) return ret; @@ -504,8 +498,6 @@ static int mt7921_load_firmware(struct mt7921_dev *dev) return -EIO; } -fw_loaded: - #ifdef CONFIG_PM dev->mt76.hw->wiphy->wowlan = &mt76_connac_wowlan_support; #endif /* CONFIG_PM */ diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c index ddb1fa4ee01d..95610a117d2f 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c @@ -325,6 +325,10 @@ static int mt7921_pci_probe(struct pci_dev *pdev, bus_ops->rmw = mt7921_rmw; dev->mt76.bus = bus_ops; + ret = mt7921e_mcu_fw_pmctrl(dev); + if (ret) + goto err_free_dev; + ret = __mt7921e_mcu_drv_pmctrl(dev); if (ret) goto err_free_dev; @@ -333,6 +337,10 @@ static int mt7921_pci_probe(struct pci_dev *pdev, (mt7921_l1_rr(dev, MT_HW_REV) & 0xff); dev_info(mdev->dev, "ASIC revision: %04x\n", mdev->rev); + ret = mt7921_wfsys_reset(dev); + if (ret) + goto err_free_dev; + mt76_wr(dev, MT_WFDMA0_HOST_INT_ENA, 0); mt76_wr(dev, MT_PCIE_MAC_INT_ENABLE, 0xff); From 0323bce598eea038714f941ce2b22541c46d488f Mon Sep 17 00:00:00 2001 From: M A Ramdhan Date: Wed, 5 Jul 2023 12:15:30 -0400 Subject: [PATCH 31/79] net/sched: cls_fw: Fix improper refcount update leads to use-after-free In the event of a failure in tcf_change_indev(), fw_set_parms() will immediately return an error after incrementing or decrementing the reference counter in tcf_bind_filter(). If an attacker can drive the reference counter to zero, the reference is freed, leading to a use-after-free. In order to prevent this, move the point of possible failure above the point where the TC_FW_CLASSID is handled.
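[Editor's note] The ordering rule behind this fix generalizes: complete every fallible step before publishing a refcounted binding, so an error return never leaves a modified refcount behind. A compilable toy model of the corrected ordering (all identifiers are illustrative, not the cls_fw code):

#include <stdbool.h>

struct toy_filter { int refcnt; };

static int fallible_setup(bool ok) { return ok ? 0 : -1; }

static int toy_set_parms(struct toy_filter *f, bool indev_ok, bool bind)
{
	/* 1. The step that may fail runs first: on error there is
	 *    nothing to unwind.
	 */
	if (fallible_setup(indev_ok) < 0)
		return -1;

	/* 2. Only then adjust the reference, mirroring where
	 *    tcf_bind_filter() now sits in fw_set_parms().
	 */
	if (bind)
		f->refcnt++;

	return 0;
}

With the old ordering, step 2 ran before step 1, so a failure returned with the refcount already changed, which is exactly the state the use-after-free built on.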
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: M A Ramdhan Signed-off-by: M A Ramdhan Acked-by: Jamal Hadi Salim Reviewed-by: Pedro Tammela Message-ID: <20230705161530.52003-1-ramdhan@starlabs.sg> Signed-off-by: Jakub Kicinski --- net/sched/cls_fw.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index ae9439a6c56c..8641f8059317 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -212,11 +212,6 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, if (err < 0) return err; - if (tb[TCA_FW_CLASSID]) { - f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); - tcf_bind_filter(tp, &f->res, base); - } - if (tb[TCA_FW_INDEV]) { int ret; ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack); @@ -233,6 +228,11 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, } else if (head->mask != 0xFFFFFFFF) return err; + if (tb[TCA_FW_CLASSID]) { + f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); + tcf_bind_filter(tp, &f->res, base); + } + return 0; } From 0503efeadbf6bb8bf24397613a73b67e665eac5f Mon Sep 17 00:00:00 2001 From: Junfeng Guo Date: Thu, 6 Jul 2023 12:41:28 +0800 Subject: [PATCH 32/79] gve: Set default duplex configuration to full The duplex mode was left unset in the driver, resulting in the default parameter being 0, which corresponds to half duplex. This might mislead users into incorrect expectations about the driver's transmission capabilities. Set the default duplex configuration to full, as the driver runs in full duplex mode at this point. Fixes: 7e074d5a76ca ("gve: Enable Link Speed Reporting in the driver.") Signed-off-by: Junfeng Guo Reviewed-by: Leon Romanovsky Message-ID: <20230706044128.2726747-1-junfeng.guo@intel.com> Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_ethtool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index cfd4b8d284d1..50162ec9424d 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -590,6 +590,9 @@ static int gve_get_link_ksettings(struct net_device *netdev, err = gve_adminq_report_link_speed(priv); cmd->base.speed = priv->link_speed; + + cmd->base.duplex = DUPLEX_FULL; + return err; } From af42088bdaf292060b8d8a00d8644ca7b2b3f2d1 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Thu, 6 Jul 2023 09:57:05 +0530 Subject: [PATCH 33/79] octeontx2-af: Promisc enable/disable through mbox In legacy silicon, promiscuous mode is only modified through CGX mbox messages. In CN10KB silicon, it is modified from both the CGX mbox and NIX. This breaks legacy application behaviour. Fix this by removing the call from NIX. Fixes: d6c9784baf59 ("octeontx2-af: Invoke exact match functions if supported") Signed-off-by: Ratheesh Kannoth Reviewed-by: Leon Romanovsky Reviewed-by: Michal Kubiak Signed-off-by: David S.
Miller --- .../ethernet/marvell/octeontx2/af/rvu_nix.c | 11 ++------- .../marvell/octeontx2/af/rvu_npc_hash.c | 23 +++++++++++++++++-- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 0d745ae1cc9a..04b0e885f9d2 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -4069,21 +4069,14 @@ int rvu_mbox_handler_nix_set_rx_mode(struct rvu *rvu, struct nix_rx_mode *req, } /* install/uninstall promisc entry */ - if (promisc) { + if (promisc) rvu_npc_install_promisc_entry(rvu, pcifunc, nixlf, pfvf->rx_chan_base, pfvf->rx_chan_cnt); - - if (rvu_npc_exact_has_match_table(rvu)) - rvu_npc_exact_promisc_enable(rvu, pcifunc); - } else { + else if (!nix_rx_multicast) rvu_npc_enable_promisc_entry(rvu, pcifunc, nixlf, false); - if (rvu_npc_exact_has_match_table(rvu)) - rvu_npc_exact_promisc_disable(rvu, pcifunc); - } - return 0; } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c index 9f11c1e40737..6fe67f3a7f6f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c @@ -1164,8 +1164,10 @@ static u16 __rvu_npc_exact_cmd_rules_cnt_update(struct rvu *rvu, int drop_mcam_i { struct npc_exact_table *table; u16 *cnt, old_cnt; + bool promisc; table = rvu->hw->table; + promisc = table->promisc_mode[drop_mcam_idx]; cnt = &table->cnt_cmd_rules[drop_mcam_idx]; old_cnt = *cnt; @@ -1177,13 +1179,18 @@ static u16 __rvu_npc_exact_cmd_rules_cnt_update(struct rvu *rvu, int drop_mcam_i *enable_or_disable_cam = false; - /* If all rules are deleted, disable cam */ + if (promisc) + goto done; + + /* If all rules are deleted and not already in promisc mode; + * disable cam + */ if (!*cnt && val < 0) { *enable_or_disable_cam = true; goto done; } - /* If rule got added, enable cam */ + /* If rule got added and not already in promisc mode; enable cam */ if (!old_cnt && val > 0) { *enable_or_disable_cam = true; goto done; @@ -1462,6 +1469,12 @@ int rvu_npc_exact_promisc_disable(struct rvu *rvu, u16 pcifunc) *promisc = false; mutex_unlock(&table->lock); + /* Enable drop rule */ + rvu_npc_enable_mcam_by_entry_index(rvu, drop_mcam_idx, NIX_INTF_RX, + true); + + dev_dbg(rvu->dev, "%s: disabled promisc mode (cgx=%d lmac=%d)\n", + __func__, cgx_id, lmac_id); return 0; } @@ -1503,6 +1516,12 @@ int rvu_npc_exact_promisc_enable(struct rvu *rvu, u16 pcifunc) *promisc = true; mutex_unlock(&table->lock); + /* disable drop rule */ + rvu_npc_enable_mcam_by_entry_index(rvu, drop_mcam_idx, NIX_INTF_RX, + false); + + dev_dbg(rvu->dev, "%s: Enabled promisc mode (cgx=%d lmac=%d)\n", + __func__, cgx_id, lmac_id); return 0; } From 7709fbd4922c197efabda03660d93e48a3e80323 Mon Sep 17 00:00:00 2001 From: Sai Krishna Date: Thu, 6 Jul 2023 13:59:36 +0530 Subject: [PATCH 34/79] octeontx2-af: Move validation of ptp pointer before its usage Moved PTP pointer validation before its use to avoid smatch warning. Also used kzalloc/kfree instead of devm_kzalloc/devm_kfree. Fixes: 2ef4e45d99b1 ("octeontx2-af: Add PTP PPS Errata workaround on CN10K silicon") Signed-off-by: Naveen Mamindlapalli Signed-off-by: Sunil Goutham Signed-off-by: Sai Krishna Signed-off-by: David S. 
Miller --- .../net/ethernet/marvell/octeontx2/af/ptp.c | 19 +++++++++---------- .../net/ethernet/marvell/octeontx2/af/rvu.c | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/ptp.c b/drivers/net/ethernet/marvell/octeontx2/af/ptp.c index 3411e2e47d46..0ee420a489fc 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/ptp.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/ptp.c @@ -208,7 +208,7 @@ struct ptp *ptp_get(void) /* Check driver is bound to PTP block */ if (!ptp) ptp = ERR_PTR(-EPROBE_DEFER); - else + else if (!IS_ERR(ptp)) pci_dev_get(ptp->pdev); return ptp; @@ -388,11 +388,10 @@ static int ptp_extts_on(struct ptp *ptp, int on) static int ptp_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { - struct device *dev = &pdev->dev; struct ptp *ptp; int err; - ptp = devm_kzalloc(dev, sizeof(*ptp), GFP_KERNEL); + ptp = kzalloc(sizeof(*ptp), GFP_KERNEL); if (!ptp) { err = -ENOMEM; goto error; @@ -428,20 +427,19 @@ static int ptp_probe(struct pci_dev *pdev, return 0; error_free: - devm_kfree(dev, ptp); + kfree(ptp); error: /* For `ptp_get()` we need to differentiate between the case * when the core has not tried to probe this device and the case when - * the probe failed. In the later case we pretend that the - * initialization was successful and keep the error in + * the probe failed. In the later case we keep the error in * `dev->driver_data`. */ pci_set_drvdata(pdev, ERR_PTR(err)); if (!first_ptp_block) first_ptp_block = ERR_PTR(err); - return 0; + return err; } static void ptp_remove(struct pci_dev *pdev) @@ -449,16 +447,17 @@ static void ptp_remove(struct pci_dev *pdev) struct ptp *ptp = pci_get_drvdata(pdev); u64 clock_cfg; - if (cn10k_ptp_errata(ptp) && hrtimer_active(&ptp->hrtimer)) - hrtimer_cancel(&ptp->hrtimer); - if (IS_ERR_OR_NULL(ptp)) return; + if (cn10k_ptp_errata(ptp) && hrtimer_active(&ptp->hrtimer)) + hrtimer_cancel(&ptp->hrtimer); + /* Disable PTP clock */ clock_cfg = readq(ptp->reg_base + PTP_CLOCK_CFG); clock_cfg &= ~PTP_CLOCK_CFG_PTP_EN; writeq(clock_cfg, ptp->reg_base + PTP_CLOCK_CFG); + kfree(ptp); } static const struct pci_device_id ptp_id_table[] = { diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index 8dbc35c481f6..73df2d564545 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -3252,7 +3252,7 @@ static int rvu_probe(struct pci_dev *pdev, const struct pci_device_id *id) rvu->ptp = ptp_get(); if (IS_ERR(rvu->ptp)) { err = PTR_ERR(rvu->ptp); - if (err == -EPROBE_DEFER) + if (err) goto err_release_regions; rvu->ptp = NULL; } From abfb2a58a5377ebab717d4362d6180f901b6e5c1 Mon Sep 17 00:00:00 2001 From: Nitya Sunkad Date: Thu, 6 Jul 2023 11:20:06 -0700 Subject: [PATCH 35/79] ionic: remove WARN_ON to prevent panic_on_warn Remove unnecessary early code development check and the WARN_ON that it uses. The irq alloc and free paths have long been cleaned up and this check shouldn't have stuck around so long. Fixes: 77ceb68e29cc ("ionic: Add notifyq support") Signed-off-by: Nitya Sunkad Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Signed-off-by: David S. 
Miller --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 7c20a44e549b..612b0015dc43 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -475,11 +475,6 @@ static void ionic_qcqs_free(struct ionic_lif *lif) static void ionic_link_qcq_interrupts(struct ionic_qcq *src_qcq, struct ionic_qcq *n_qcq) { - if (WARN_ON(n_qcq->flags & IONIC_QCQ_F_INTR)) { - ionic_intr_free(n_qcq->cq.lif->ionic, n_qcq->intr.index); - n_qcq->flags &= ~IONIC_QCQ_F_INTR; - } - n_qcq->intr.vector = src_qcq->intr.vector; n_qcq->intr.index = src_qcq->intr.index; n_qcq->napi_qcq = src_qcq->napi_qcq; From 3a7af34fb6ecd9fbeb4454fc03c654b26fab5f5e Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 6 Jul 2023 13:59:24 -0700 Subject: [PATCH 36/79] ionic: remove dead device fail path Remove the probe error path code that leaves the driver bound to the device, but with essentially a dead device. This was useful maybe twice early in the driver's life and no longer makes sense to keep. Fixes: 30a1e6d0f8e2 ("ionic: keep ionic dev on lif init fail") Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c index b8678da1cce5..ab7d217b98b3 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c @@ -353,12 +353,6 @@ err_out_reset: ionic_reset(ionic); err_out_teardown: ionic_dev_teardown(ionic); - pci_clear_master(pdev); - /* Don't fail the probe for these errors, keep - * the hw interface around for inspection - */ - return 0; - err_out_unmap_bars: ionic_unmap_bars(ionic); err_out_pci_release_regions: From 8139dccd464aaee4a2c351506ff883733c6ca5a3 Mon Sep 17 00:00:00 2001 From: Ivan Babrou Date: Thu, 6 Jul 2023 21:39:20 -0700 Subject: [PATCH 37/79] udp6: add a missing call into udp_fail_queue_rcv_skb tracepoint The tracepoint has existed for 12 years, but it only covered udp over the legacy IPv4 protocol. Having it enabled for udp6 removes the unnecessary difference in error visibility. Signed-off-by: Ivan Babrou Fixes: 296f7ea75b45 ("udp: add tracepoints for queueing skb to rcvbuf") Acked-by: Paolo Abeni Signed-off-by: David S. 
Miller --- net/core/net-traces.c | 2 ++ net/ipv6/udp.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 805b7385dd8d..6aef976bc1da 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -63,4 +63,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum); +EXPORT_TRACEPOINT_SYMBOL_GPL(udp_fail_queue_rcv_skb); + EXPORT_TRACEPOINT_SYMBOL_GPL(sk_data_ready); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 317b01c9bc39..630d61a32c1d 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -680,6 +681,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb_reason(skb, drop_reason); + trace_udp_fail_queue_rcv_skb(rc, sk); return -1; } From e7731194fdf085f46d58b1adccfddbd0dfee4873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Fri, 7 Jul 2023 08:53:25 +0200 Subject: [PATCH 38/79] net: bgmac: postpone turning IRQs off to avoid SoC hangs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turning IRQs off is done by accessing Ethernet controller registers. That can't be done until the device's clock is enabled; otherwise it results in a SoC hang. This bug remained unnoticed for years as most bootloaders keep all Ethernet interfaces turned on. It seems to affect only a niche SoC family, the BCM47189, which has two Ethernet controllers while the CFE bootloader uses only the first one. Fixes: 34322615cbaa ("net: bgmac: Mask interrupts during probe") Signed-off-by: Rafał Miłecki Reviewed-by: Michal Kubiak Signed-off-by: David S.
Miller --- drivers/net/ethernet/broadcom/bgmac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index 1761df8fb7f9..10c7c232cc4e 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -1492,8 +1492,6 @@ int bgmac_enet_probe(struct bgmac *bgmac) bgmac->in_init = true; - bgmac_chip_intrs_off(bgmac); - net_dev->irq = bgmac->irq; SET_NETDEV_DEV(net_dev, bgmac->dev); dev_set_drvdata(bgmac->dev, bgmac); @@ -1511,6 +1509,8 @@ int bgmac_enet_probe(struct bgmac *bgmac) */ bgmac_clk_enable(bgmac, 0); + bgmac_chip_intrs_off(bgmac); + /* This seems to be fixing IRQ by assigning OOB #6 to the core */ if (!(bgmac->feature_flags & BGMAC_FEAT_IDM_MASK)) { if (bgmac->feature_flags & BGMAC_FEAT_IRQ_ID_OOB_6) From c329b261afe71197d9da83c1f18eb45a7e97e089 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 7 Jul 2023 10:11:10 +0200 Subject: [PATCH 39/79] net: prevent skb corruption on frag list segmentation Ian reported several skb corruptions triggered by rx-gro-list, collecting several different oopses like the following: [ 62.624003] BUG: kernel NULL pointer dereference, address: 00000000000000c0 [ 62.631083] #PF: supervisor read access in kernel mode [ 62.636312] #PF: error_code(0x0000) - not-present page [ 62.641541] PGD 0 P4D 0 [ 62.644174] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 62.648629] CPU: 1 PID: 913 Comm: napi/eno2-79 Not tainted 6.4.0 #364 [ 62.655162] Hardware name: Supermicro Super Server/A2SDi-12C-HLN4F, BIOS 1.7a 10/13/2022 [ 62.663344] RIP: 0010:__udp_gso_segment (./include/linux/skbuff.h:2858 ./include/linux/udp.h:23 net/ipv4/udp_offload.c:228 net/ipv4/udp_offload.c:261 net/ipv4/udp_offload.c:277) [ 62.687193] RSP: 0018:ffffbd3a83b4f868 EFLAGS: 00010246 [ 62.692515] RAX: 00000000000000ce RBX: 0000000000000000 RCX: 0000000000000000 [ 62.699743] RDX: ffffa124def8a000 RSI: 0000000000000079 RDI: ffffa125952a14d4 [ 62.706970] RBP: ffffa124def8a000 R08: 0000000000000022 R09: 00002000001558c9 [ 62.714199] R10: 0000000000000000 R11: 00000000be554639 R12: 00000000000000e2 [ 62.721426] R13: ffffa125952a1400 R14: ffffa125952a1400 R15: 00002000001558c9 [ 62.728654] FS: 0000000000000000(0000) GS:ffffa127efa40000(0000) knlGS:0000000000000000 [ 62.736852] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 62.742702] CR2: 00000000000000c0 CR3: 00000001034b0000 CR4: 00000000003526e0 [ 62.749948] Call Trace: [ 62.752498] [ 62.779267] inet_gso_segment (net/ipv4/af_inet.c:1398) [ 62.787605] skb_mac_gso_segment (net/core/gro.c:141) [ 62.791906] __skb_gso_segment (net/core/dev.c:3403 (discriminator 2)) [ 62.800492] validate_xmit_skb (./include/linux/netdevice.h:4862 net/core/dev.c:3659) [ 62.804695] validate_xmit_skb_list (net/core/dev.c:3710) [ 62.809158] sch_direct_xmit (net/sched/sch_generic.c:330) [ 62.813198] __dev_queue_xmit (net/core/dev.c:3805 net/core/dev.c:4210) net/netfilter/core.c:626) [ 62.821093] br_dev_queue_push_xmit (net/bridge/br_forward.c:55) [ 62.825652] maybe_deliver (net/bridge/br_forward.c:193) [ 62.829420] br_flood (net/bridge/br_forward.c:233) [ 62.832758] br_handle_frame_finish (net/bridge/br_input.c:215) [ 62.837403] br_handle_frame (net/bridge/br_input.c:298 net/bridge/br_input.c:416) [ 62.851417] __netif_receive_skb_core.constprop.0 (net/core/dev.c:5387) [ 62.866114] __netif_receive_skb_list_core (net/core/dev.c:5570) [ 62.871367] netif_receive_skb_list_internal (net/core/dev.c:5638 net/core/dev.c:5727) [ 62.876795] napi_complete_done (./include/linux/list.h:37
./include/net/gro.h:434 ./include/net/gro.h:429 net/core/dev.c:6067) [ 62.881004] ixgbe_poll (drivers/net/ethernet/intel/ixgbe/ixgbe_main.c:3191) [ 62.893534] __napi_poll (net/core/dev.c:6498) [ 62.897133] napi_threaded_poll (./include/linux/netpoll.h:89 net/core/dev.c:6640) [ 62.905276] kthread (kernel/kthread.c:379) [ 62.913435] ret_from_fork (arch/x86/entry/entry_64.S:314) [ 62.917119] In the critical scenario, rx-gro-list GRO-ed packets are fed, via a bridge, both to the local input path and to an egress device (tun). The segmentation of such packets unsafely writes to the cloned skbs with shared heads. This change addresses the issue by uncloning the to-be-segmented skbs as needed. Reported-by: Ian Kumlien Tested-by: Ian Kumlien Fixes: 3a1296a38d0c ("net: Support GRO/GSO fraglist chaining.") Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6c5915efbc17..a298992060e6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4261,6 +4261,11 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_push(skb, -skb_network_offset(skb) + offset); + /* Ensure the head is writeable before touching the shared info */ + err = skb_unclone(skb, GFP_ATOMIC); + if (err) + goto err_linearize; + skb_shinfo(skb)->frag_list = NULL; while (list_skb) { From 6b5c13b591d753c6022fbd12f8c0c0a9a07fc065 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Fri, 7 Jul 2023 12:56:20 +0200 Subject: [PATCH 40/79] s390/ism: Fix locking for forwarding of IRQs and events to clients The clients array references all registered clients and is protected by the clients_lock. Besides its use as a general list of clients, the clients array is accessed in ism_handle_irq() to forward ISM device events to clients. While the clients_lock is taken in the IRQ handler when calling handle_event(), it is incorrectly not held during the client->handle_irq() call or for the preceding clients[] access, leaving them unprotected against concurrent client (un-)registration. Furthermore, the accesses to ism->sba_client_arr[] in ism_register_dmb() and ism_unregister_dmb() are not protected by any lock. This is especially problematic as the client ID from the ism->sba_client_arr[] is not checked against NO_CLIENT, nor is the client pointer checked. Instead of expanding the use of the clients_lock further, add a separate array in struct ism_dev which references clients subscribed to the device's events and IRQs. This array is protected by ism->lock, which is already taken in ism_handle_irq() and can be taken outside the IRQ handler when adding/removing subscribers or when accessing ism->sba_client_arr[]. This also means that the clients_lock is no longer taken in IRQ context. Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle Reviewed-by: Alexandra Winter Signed-off-by: David S.
Miller --- drivers/s390/net/ism_drv.c | 44 +++++++++++++++++++++++++++++++------- include/linux/ism.h | 1 + 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 9b5fccdbc7d6..b664e4a08645 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -47,6 +47,15 @@ static struct ism_dev_list ism_dev_list = { .mutex = __MUTEX_INITIALIZER(ism_dev_list.mutex), }; +static void ism_setup_forwarding(struct ism_client *client, struct ism_dev *ism) +{ + unsigned long flags; + + spin_lock_irqsave(&ism->lock, flags); + ism->subs[client->id] = client; + spin_unlock_irqrestore(&ism->lock, flags); +} + int ism_register_client(struct ism_client *client) { struct ism_dev *ism; @@ -71,6 +80,7 @@ int ism_register_client(struct ism_client *client) list_for_each_entry(ism, &ism_dev_list.list, list) { ism->priv[i] = NULL; client->add(ism); + ism_setup_forwarding(client, ism); } } mutex_unlock(&ism_dev_list.mutex); @@ -92,6 +102,9 @@ int ism_unregister_client(struct ism_client *client) max_client--; spin_unlock_irqrestore(&clients_lock, flags); list_for_each_entry(ism, &ism_dev_list.list, list) { + spin_lock_irqsave(&ism->lock, flags); + /* Stop forwarding IRQs and events */ + ism->subs[client->id] = NULL; for (int i = 0; i < ISM_NR_DMBS; ++i) { if (ism->sba_client_arr[i] == client->id) { pr_err("%s: attempt to unregister client '%s'" @@ -101,6 +114,7 @@ int ism_unregister_client(struct ism_client *client) goto out; } } + spin_unlock_irqrestore(&ism->lock, flags); } out: mutex_unlock(&ism_dev_list.mutex); @@ -328,6 +342,7 @@ int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, struct ism_client *client) { union ism_reg_dmb cmd; + unsigned long flags; int ret; ret = ism_alloc_dmb(ism, dmb); @@ -351,7 +366,9 @@ int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, goto out; } dmb->dmb_tok = cmd.response.dmb_tok; + spin_lock_irqsave(&ism->lock, flags); ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = client->id; + spin_unlock_irqrestore(&ism->lock, flags); out: return ret; } @@ -360,6 +377,7 @@ EXPORT_SYMBOL_GPL(ism_register_dmb); int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { union ism_unreg_dmb cmd; + unsigned long flags; int ret; memset(&cmd, 0, sizeof(cmd)); @@ -368,7 +386,9 @@ int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) cmd.request.dmb_tok = dmb->dmb_tok; + spin_lock_irqsave(&ism->lock, flags); ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = NO_CLIENT; + spin_unlock_irqrestore(&ism->lock, flags); ret = ism_cmd(ism, &cmd); if (ret && ret != ISM_ERROR) @@ -491,6 +511,7 @@ static u16 ism_get_chid(struct ism_dev *ism) static void ism_handle_event(struct ism_dev *ism) { struct ism_event *entry; + struct ism_client *clt; int i; while ((ism->ieq_idx + 1) != READ_ONCE(ism->ieq->header.idx)) { @@ -499,21 +520,21 @@ static void ism_handle_event(struct ism_dev *ism) entry = &ism->ieq->entry[ism->ieq_idx]; debug_event(ism_debug_info, 2, entry, sizeof(*entry)); - spin_lock(&clients_lock); - for (i = 0; i < max_client; ++i) - if (clients[i]) - clients[i]->handle_event(ism, entry); - spin_unlock(&clients_lock); + for (i = 0; i < max_client; ++i) { + clt = ism->subs[i]; + if (clt) + clt->handle_event(ism, entry); + } } } static irqreturn_t ism_handle_irq(int irq, void *data) { struct ism_dev *ism = data; - struct ism_client *clt; unsigned long bit, end; unsigned long *bv; u16 dmbemask; + u8 client_id; bv = (void *) &ism->sba->dmb_bits[ISM_DMB_WORD_OFFSET]; end = 
sizeof(ism->sba->dmb_bits) * BITS_PER_BYTE - ISM_DMB_BIT_OFFSET; @@ -530,8 +551,10 @@ static irqreturn_t ism_handle_irq(int irq, void *data) dmbemask = ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET]; ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET] = 0; barrier(); - clt = clients[ism->sba_client_arr[bit]]; - clt->handle_irq(ism, bit + ISM_DMB_BIT_OFFSET, dmbemask); + client_id = ism->sba_client_arr[bit]; + if (unlikely(client_id == NO_CLIENT || !ism->subs[client_id])) + continue; + ism->subs[client_id]->handle_irq(ism, bit + ISM_DMB_BIT_OFFSET, dmbemask); } if (ism->sba->e) { @@ -554,6 +577,7 @@ static void ism_dev_add_work_func(struct work_struct *work) add_work); client->add(client->tgt_ism); + ism_setup_forwarding(client, client->tgt_ism); atomic_dec(&client->tgt_ism->add_dev_cnt); wake_up(&client->tgt_ism->waitq); } @@ -691,7 +715,11 @@ static void ism_dev_remove_work_func(struct work_struct *work) { struct ism_client *client = container_of(work, struct ism_client, remove_work); + unsigned long flags; + spin_lock_irqsave(&client->tgt_ism->lock, flags); + client->tgt_ism->subs[client->id] = NULL; + spin_unlock_irqrestore(&client->tgt_ism->lock, flags); client->remove(client->tgt_ism); atomic_dec(&client->tgt_ism->free_clients_cnt); wake_up(&client->tgt_ism->waitq); diff --git a/include/linux/ism.h b/include/linux/ism.h index ea2bcdae7401..5160d47e5ea9 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -44,6 +44,7 @@ struct ism_dev { u64 local_gid; int ieq_idx; + struct ism_client *subs[MAX_CLIENTS]; atomic_t free_clients_cnt; atomic_t add_dev_cnt; wait_queue_head_t waitq; From 76631ffa2fd2d45bae5ad717eef716b94144e0e7 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Fri, 7 Jul 2023 12:56:21 +0200 Subject: [PATCH 41/79] s390/ism: Fix and simplify add()/remove() callback handling Previously the clients_lock was protecting the clients array against concurrent addition/removal of clients but was also accessed from IRQ context. This meant that it had to be a spinlock and that the add() and remove() callbacks, in which clients need to do allocation and take mutexes, can't be called under the clients_lock. To work around this, these callbacks were moved to workqueues. This not only introduced significant complexity but was also subtly broken in at least one way. In ism_dev_init() and ism_dev_exit(), clients[i]->tgt_ism is used to communicate the added/removed ISM device to the work function. While write access to clients[i]->tgt_ism is protected by the clients_lock, and the code waits until there is no pending add/remove work before and after setting clients[i]->tgt_ism, this is not enough. The problem is that the wait happens based on per ISM device counters. Thus a concurrent ism_dev_init()/ism_dev_exit() for a different ISM device may overwrite a clients[i]->tgt_ism between unlocking the clients_lock and the subsequent wait for the work to finish. Thankfully, with the clients_lock no longer held in IRQ context, it can be turned into a mutex which can be held during the calls to add()/remove(), completely removing the need for the workqueues and the associated broken housekeeping, including the per ISM device counters and the clients[i]->tgt_ism. Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle Signed-off-by: David S.
Miller --- drivers/s390/net/ism_drv.c | 90 +++++++++++--------------------------- include/linux/ism.h | 6 --- 2 files changed, 26 insertions(+), 70 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index b664e4a08645..54091b7aea16 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -36,7 +36,7 @@ static const struct smcd_ops ism_ops; static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ /* a list for fast mapping */ static u8 max_client; -static DEFINE_SPINLOCK(clients_lock); +static DEFINE_MUTEX(clients_lock); struct ism_dev_list { struct list_head list; struct mutex mutex; /* protects ism device list */ @@ -59,11 +59,10 @@ static void ism_setup_forwarding(struct ism_client *client, struct ism_dev *ism) int ism_register_client(struct ism_client *client) { struct ism_dev *ism; - unsigned long flags; int i, rc = -ENOSPC; mutex_lock(&ism_dev_list.mutex); - spin_lock_irqsave(&clients_lock, flags); + mutex_lock(&clients_lock); for (i = 0; i < MAX_CLIENTS; ++i) { if (!clients[i]) { clients[i] = client; @@ -74,7 +73,8 @@ int ism_register_client(struct ism_client *client) break; } } - spin_unlock_irqrestore(&clients_lock, flags); + mutex_unlock(&clients_lock); + if (i < MAX_CLIENTS) { /* initialize with all devices that we got so far */ list_for_each_entry(ism, &ism_dev_list.list, list) { @@ -96,11 +96,11 @@ int ism_unregister_client(struct ism_client *client) int rc = 0; mutex_lock(&ism_dev_list.mutex); - spin_lock_irqsave(&clients_lock, flags); + mutex_lock(&clients_lock); clients[client->id] = NULL; if (client->id + 1 == max_client) max_client--; - spin_unlock_irqrestore(&clients_lock, flags); + mutex_unlock(&clients_lock); list_for_each_entry(ism, &ism_dev_list.list, list) { spin_lock_irqsave(&ism->lock, flags); /* Stop forwarding IRQs and events */ @@ -571,21 +571,9 @@ static u64 ism_get_local_gid(struct ism_dev *ism) return ism->local_gid; } -static void ism_dev_add_work_func(struct work_struct *work) -{ - struct ism_client *client = container_of(work, struct ism_client, - add_work); - - client->add(client->tgt_ism); - ism_setup_forwarding(client, client->tgt_ism); - atomic_dec(&client->tgt_ism->add_dev_cnt); - wake_up(&client->tgt_ism->waitq); -} - static int ism_dev_init(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; - unsigned long flags; int i, ret; ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI); @@ -618,25 +606,16 @@ static int ism_dev_init(struct ism_dev *ism) /* hardware is V2 capable */ ism_create_system_eid(); - init_waitqueue_head(&ism->waitq); - atomic_set(&ism->free_clients_cnt, 0); - atomic_set(&ism->add_dev_cnt, 0); - - wait_event(ism->waitq, !atomic_read(&ism->add_dev_cnt)); - spin_lock_irqsave(&clients_lock, flags); - for (i = 0; i < max_client; ++i) - if (clients[i]) { - INIT_WORK(&clients[i]->add_work, - ism_dev_add_work_func); - clients[i]->tgt_ism = ism; - atomic_inc(&ism->add_dev_cnt); - schedule_work(&clients[i]->add_work); - } - spin_unlock_irqrestore(&clients_lock, flags); - - wait_event(ism->waitq, !atomic_read(&ism->add_dev_cnt)); - mutex_lock(&ism_dev_list.mutex); + mutex_lock(&clients_lock); + for (i = 0; i < max_client; ++i) { + if (clients[i]) { + clients[i]->add(ism); + ism_setup_forwarding(clients[i], ism); + } + } + mutex_unlock(&clients_lock); + list_add(&ism->list, &ism_dev_list.list); mutex_unlock(&ism_dev_list.mutex); @@ -711,40 +690,24 @@ err_dev: return ret; } -static void ism_dev_remove_work_func(struct work_struct *work) -{ - struct ism_client *client = 
container_of(work, struct ism_client, - remove_work); - unsigned long flags; - - spin_lock_irqsave(&client->tgt_ism->lock, flags); - client->tgt_ism->subs[client->id] = NULL; - spin_unlock_irqrestore(&client->tgt_ism->lock, flags); - client->remove(client->tgt_ism); - atomic_dec(&client->tgt_ism->free_clients_cnt); - wake_up(&client->tgt_ism->waitq); -} - -/* Callers must hold ism_dev_list.mutex */ static void ism_dev_exit(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; unsigned long flags; int i; - wait_event(ism->waitq, !atomic_read(&ism->free_clients_cnt)); - spin_lock_irqsave(&clients_lock, flags); + spin_lock_irqsave(&ism->lock, flags); for (i = 0; i < max_client; ++i) - if (clients[i]) { - INIT_WORK(&clients[i]->remove_work, - ism_dev_remove_work_func); - clients[i]->tgt_ism = ism; - atomic_inc(&ism->free_clients_cnt); - schedule_work(&clients[i]->remove_work); - } - spin_unlock_irqrestore(&clients_lock, flags); + ism->subs[i] = NULL; + spin_unlock_irqrestore(&ism->lock, flags); - wait_event(ism->waitq, !atomic_read(&ism->free_clients_cnt)); + mutex_lock(&ism_dev_list.mutex); + mutex_lock(&clients_lock); + for (i = 0; i < max_client; ++i) { + if (clients[i]) + clients[i]->remove(ism); + } + mutex_unlock(&clients_lock); if (SYSTEM_EID.serial_number[0] != '0' || SYSTEM_EID.type[0] != '0') @@ -755,15 +718,14 @@ static void ism_dev_exit(struct ism_dev *ism) kfree(ism->sba_client_arr); pci_free_irq_vectors(pdev); list_del_init(&ism->list); + mutex_unlock(&ism_dev_list.mutex); } static void ism_remove(struct pci_dev *pdev) { struct ism_dev *ism = dev_get_drvdata(&pdev->dev); - mutex_lock(&ism_dev_list.mutex); ism_dev_exit(ism); - mutex_unlock(&ism_dev_list.mutex); pci_release_mem_regions(pdev); pci_disable_device(pdev); diff --git a/include/linux/ism.h b/include/linux/ism.h index 5160d47e5ea9..9a4c204df3da 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -45,9 +45,6 @@ struct ism_dev { int ieq_idx; struct ism_client *subs[MAX_CLIENTS]; - atomic_t free_clients_cnt; - atomic_t add_dev_cnt; - wait_queue_head_t waitq; }; struct ism_event { @@ -69,9 +66,6 @@ struct ism_client { */ void (*handle_irq)(struct ism_dev *dev, unsigned int bit, u16 dmbemask); /* Private area - don't touch! */ - struct work_struct remove_work; - struct work_struct add_work; - struct ism_dev *tgt_ism; u8 id; }; From 266deeea34ffd28c6b6a63edf2af9b5a07161c24 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Fri, 7 Jul 2023 12:56:22 +0200 Subject: [PATCH 42/79] s390/ism: Do not unregister clients with registered DMBs When ism_unregister_client() is called but the client still has DMBs registered it returns -EBUSY and prints an error. This only happens after the client has already been unregistered however. This is unexpected as the unregister claims to have failed. Furthermore as this implies a client bug a WARN() is more appropriate. Thus move the deregistration after the check and use WARN(). Fixes: 89e7d2ba61b7 ("net/ism: Add new API for client registration") Signed-off-by: Niklas Schnelle Signed-off-by: David S. 
Miller --- drivers/s390/net/ism_drv.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 54091b7aea16..6df7f377d2f9 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -96,29 +96,32 @@ int ism_unregister_client(struct ism_client *client) int rc = 0; mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - clients[client->id] = NULL; - if (client->id + 1 == max_client) - max_client--; - mutex_unlock(&clients_lock); list_for_each_entry(ism, &ism_dev_list.list, list) { spin_lock_irqsave(&ism->lock, flags); /* Stop forwarding IRQs and events */ ism->subs[client->id] = NULL; for (int i = 0; i < ISM_NR_DMBS; ++i) { if (ism->sba_client_arr[i] == client->id) { - pr_err("%s: attempt to unregister client '%s'" - "with registered dmb(s)\n", __func__, - client->name); + WARN(1, "%s: attempt to unregister '%s' with registered dmb(s)\n", + __func__, client->name); rc = -EBUSY; - goto out; + goto err_reg_dmb; } } spin_unlock_irqrestore(&ism->lock, flags); } -out: mutex_unlock(&ism_dev_list.mutex); + mutex_lock(&clients_lock); + clients[client->id] = NULL; + if (client->id + 1 == max_client) + max_client--; + mutex_unlock(&clients_lock); + return rc; + +err_reg_dmb: + spin_unlock_irqrestore(&ism->lock, flags); + mutex_unlock(&ism_dev_list.mutex); return rc; } EXPORT_SYMBOL_GPL(ism_unregister_client); From 2aaa8a15de73874847d62eb595c6683bface80fd Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 7 Jul 2023 18:43:27 -0700 Subject: [PATCH 43/79] icmp6: Fix null-ptr-deref of ip6_null_entry->rt6i_idev in icmp6_dev(). With some IPv6 Ext Hdr (RPL, SRv6, etc.), we can send a packet that has the link-local address as src and dst IP and will be forwarded to an external IP in the IPv6 Ext Hdr. For example, the script below generates a packet whose src IP is the link-local address and dst is updated to 11::. # for f in $(find /proc/sys/net/ -name *seg6_enabled*); do echo 1 > $f; done # python3 >>> from socket import * >>> from scapy.all import * >>> >>> SRC_ADDR = DST_ADDR = "fe80::5054:ff:fe12:3456" >>> >>> pkt = IPv6(src=SRC_ADDR, dst=DST_ADDR) >>> pkt /= IPv6ExtHdrSegmentRouting(type=4, addresses=["11::", "22::"], segleft=1) >>> >>> sk = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW) >>> sk.sendto(bytes(pkt), (DST_ADDR, 0)) For such a packet, we call ip6_route_input() to look up a route for the next destination in these three functions depending on the header type. * ipv6_rthdr_rcv() * ipv6_rpl_srh_rcv() * ipv6_srh_rcv() If no route is found, ip6_null_entry is set to skb, and the following dst_input(skb) calls ip6_pkt_drop(). Finally, in icmp6_dev(), we dereference skb_rt6_info(skb)->rt6i_idev->dev as the input device is the loopback interface. Then, we have to check if skb_rt6_info(skb)->rt6i_idev is NULL or not to avoid NULL pointer deref for ip6_null_entry. 
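[Editor's note] Reduced to its essence, the fix is a two-level NULL guard before the dereference. A compilable toy model (types and names are illustrative, not the kernel's):

struct toy_netdev { int ifindex; };
struct toy_idev { struct toy_netdev *dev; };
struct toy_rt6 { struct toy_idev *idev; }; /* an ip6_null_entry-like route has idev == NULL */

static struct toy_netdev *toy_icmp6_dev(const struct toy_rt6 *rt,
					struct toy_netdev *fallback)
{
	/* Check both the route and its idev before dereferencing:
	 * a "null entry" route exists but carries no idev.
	 */
	if (rt && rt->idev)
		return rt->idev->dev;
	return fallback;
}

The oops below is what the missing second check produces: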
BUG: kernel NULL pointer dereference, address: 0000000000000000 PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 157 Comm: python3 Not tainted 6.4.0-11996-gb121d614371c #35 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:icmp6_send (net/ipv6/icmp.c:436 net/ipv6/icmp.c:503) Code: fe ff ff 48 c7 40 30 c0 86 5d 83 e8 c6 44 1c 00 e9 c8 fc ff ff 49 8b 46 58 48 83 e0 fe 0f 84 4a fb ff ff 48 8b 80 d0 00 00 00 <48> 8b 00 44 8b 88 e0 00 00 00 e9 34 fb ff ff 4d 85 ed 0f 85 69 01 RSP: 0018:ffffc90000003c70 EFLAGS: 00000286 RAX: 0000000000000000 RBX: 0000000000000001 RCX: 00000000000000e0 RDX: 0000000000000021 RSI: 0000000000000000 RDI: ffff888006d72a18 RBP: ffffc90000003d80 R08: 0000000000000000 R09: 0000000000000001 R10: ffffc90000003d98 R11: 0000000000000040 R12: ffff888006d72a10 R13: 0000000000000000 R14: ffff8880057fb800 R15: ffffffff835d86c0 FS: 00007f9dc72ee740(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000057b2000 CR4: 00000000007506f0 PKRU: 55555554 Call Trace: ip6_pkt_drop (net/ipv6/route.c:4513) ipv6_rthdr_rcv (net/ipv6/exthdrs.c:640 net/ipv6/exthdrs.c:686) ip6_protocol_deliver_rcu (net/ipv6/ip6_input.c:437 (discriminator 5)) ip6_input_finish (./include/linux/rcupdate.h:781 net/ipv6/ip6_input.c:483) __netif_receive_skb_one_core (net/core/dev.c:5455) process_backlog (./include/linux/rcupdate.h:781 net/core/dev.c:5895) __napi_poll (net/core/dev.c:6460) net_rx_action (net/core/dev.c:6529 net/core/dev.c:6660) __do_softirq (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/irq.h:142 kernel/softirq.c:554) do_softirq (kernel/softirq.c:454 kernel/softirq.c:441) __local_bh_enable_ip (kernel/softirq.c:381) __dev_queue_xmit (net/core/dev.c:4231) ip6_finish_output2 (./include/net/neighbour.h:544 net/ipv6/ip6_output.c:135) rawv6_sendmsg (./include/net/dst.h:458 ./include/linux/netfilter.h:303 net/ipv6/raw.c:656 net/ipv6/raw.c:914) sock_sendmsg (net/socket.c:725 net/socket.c:748) __sys_sendto (net/socket.c:2134) __x64_sys_sendto (net/socket.c:2146 net/socket.c:2142 net/socket.c:2142) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) RIP: 0033:0x7f9dc751baea Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 7e c3 0f 1f 44 00 00 41 54 48 83 ec 30 44 89 RSP: 002b:00007ffe98712c38 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00007ffe98712cf8 RCX: 00007f9dc751baea RDX: 0000000000000060 RSI: 00007f9dc6460b90 RDI: 0000000000000003 RBP: 00007f9dc56e8be0 R08: 00007ffe98712d70 R09: 000000000000001c R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: ffffffffc4653600 R14: 0000000000000001 R15: 00007f9dc6af5d1b Modules linked in: CR2: 0000000000000000 ---[ end trace 0000000000000000 ]--- RIP: 0010:icmp6_send (net/ipv6/icmp.c:436 net/ipv6/icmp.c:503) Code: fe ff ff 48 c7 40 30 c0 86 5d 83 e8 c6 44 1c 00 e9 c8 fc ff ff 49 8b 46 58 48 83 e0 fe 0f 84 4a fb ff ff 48 8b 80 d0 00 00 00 <48> 8b 00 44 8b 88 e0 00 00 00 e9 34 fb ff ff 4d 85 ed 0f 85 69 01 RSP: 0018:ffffc90000003c70 EFLAGS: 00000286 RAX: 0000000000000000 RBX: 0000000000000001 RCX: 00000000000000e0 RDX: 0000000000000021 RSI: 
0000000000000000 RDI: ffff888006d72a18 RBP: ffffc90000003d80 R08: 0000000000000000 R09: 0000000000000001 R10: ffffc90000003d98 R11: 0000000000000040 R12: ffff888006d72a10 R13: 0000000000000000 R14: ffff8880057fb800 R15: ffffffff835d86c0 FS: 00007f9dc72ee740(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000057b2000 CR4: 00000000007506f0 PKRU: 55555554 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: disabled Fixes: 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") Reported-by: Wang Yufen Closes: https://lore.kernel.org/netdev/c41403a9-c2f6-3b7e-0c96-e1901e605cd0@huawei.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 9edf1f45b1ed..65fa5014bc85 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -424,7 +424,10 @@ static struct net_device *icmp6_dev(const struct sk_buff *skb) if (unlikely(dev->ifindex == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) { const struct rt6_info *rt6 = skb_rt6_info(skb); - if (rt6) + /* The destination could be an external IP in Ext Hdr (SRv6, RPL, etc.), + * and ip6_null_entry could be set to skb if no route is found. + */ + if (rt6 && rt6->rt6i_idev) dev = rt6->rt6i_idev->dev; } From 51d03e2f2203e76ed02d33fb5ffbb5fc85ffaf54 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 8 Jul 2023 08:29:58 +0000 Subject: [PATCH 44/79] udp6: fix udp6_ehashfn() typo Amit Klein reported that udp6_ehash_secret was initialized but never used. Fixes: 1bbdceef1e53 ("inet: convert inet_ehash_secret and ipv6_hash_secret to net_get_random_once") Reported-by: Amit Klein Signed-off-by: Eric Dumazet Cc: Willy Tarreau Cc: Willem de Bruijn Cc: David Ahern Cc: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 630d61a32c1d..b7c972aa09a7 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -91,7 +91,7 @@ static u32 udp6_ehashfn(const struct net *net, fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret); return __inet6_ehashfn(lhash, lport, fhash, fport, - udp_ipv6_hash_secret + net_hash_mix(net)); + udp6_ehash_secret + net_hash_mix(net)); } int udp_v6_get_port(struct sock *sk, unsigned short snum) From 06a0716949c22e2aefb648526580671197151acc Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Sat, 8 Jul 2023 14:59:10 +0800 Subject: [PATCH 45/79] ipv6/addrconf: fix a potential refcount underflow for idev Currently, addrconf_mod_rs_timer() takes a reference on idev only if rs_timer is not already pending, and then modifies the rs_timer timeout. There is a time gap at [1]: if the pending rs_timer expires during it, the code misses holding idev even though the timer gets (re)activated. Thus the rs_timer callback function addrconf_rs_timer() will run and later put idev without it having been held. This can cause a refcount underflow for idev. if (!timer_pending(&idev->rs_timer)) in6_dev_hold(idev); <--------------[1] mod_timer(&idev->rs_timer, jiffies + when); To fix the issue, hold idev if mod_timer() returns 0 (i.e. the timer was not pending). Fixes: b7b1bfce0bb6 ("ipv6: split duplicate address detection and router solicitation timer") Suggested-by: Eric Dumazet Signed-off-by: Ziyang Xuan Reviewed-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/ipv6/addrconf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5479da08ef40..e5213e598a04 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -318,9 +318,8 @@ static void addrconf_del_dad_work(struct inet6_ifaddr *ifp) static void addrconf_mod_rs_timer(struct inet6_dev *idev, unsigned long when) { - if (!timer_pending(&idev->rs_timer)) + if (!mod_timer(&idev->rs_timer, jiffies + when)) in6_dev_hold(idev); - mod_timer(&idev->rs_timer, jiffies + when); } static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp, From 73c4d1b307aeb713e80ab03f90c7df9d417dc0f0 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sat, 8 Jul 2023 15:06:25 +0100 Subject: [PATCH 46/79] net: lan743x: select FIXED_PHY The blamed commit introduces usage of fixed_phy_register() but not a corresponding dependency on FIXED_PHY. This can result in a build failure. s390-linux-ld: drivers/net/ethernet/microchip/lan743x_main.o: in function `lan743x_phy_open': drivers/net/ethernet/microchip/lan743x_main.c:1514: undefined reference to `fixed_phy_register' Fixes: 624864fbff92 ("net: lan743x: add fixed phy support for LAN7431 device") Cc: stable@vger.kernel.org Reported-by: Randy Dunlap Closes: https://lore.kernel.org/netdev/725bf1c5-b252-7d19-7582-a6809716c7d6@infradead.org/ Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap # build-tested Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/Kconfig b/drivers/net/ethernet/microchip/Kconfig index 24c994baad13..329e374b9539 100644 --- a/drivers/net/ethernet/microchip/Kconfig +++ b/drivers/net/ethernet/microchip/Kconfig @@ -46,7 +46,7 @@ config LAN743X tristate "LAN743x support" depends on PCI depends on PTP_1588_CLOCK_OPTIONAL - select PHYLIB + select FIXED_PHY select CRC16 select CRC32 help From 87355b7c3da9bfd81935caba0ab763355147f7b0 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Mon, 10 Jul 2023 09:39:07 +0800 Subject: [PATCH 47/79] net: dsa: qca8k: Add check for skb_copy Add check for the return value of skb_copy in order to avoid NULL pointer dereference. Fixes: 2cd548566384 ("net: dsa: qca8k: add support for phy read/write with mgmt Ethernet") Signed-off-by: Jiasheng Jiang Reviewed-by: Pavan Chebbi Signed-off-by: David S. Miller --- drivers/net/dsa/qca/qca8k-8xxx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index f7d7cfb2fd86..09b80644c11b 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -588,6 +588,9 @@ qca8k_phy_eth_busy_wait(struct qca8k_mgmt_eth_data *mgmt_eth_data, bool ack; int ret; + if (!skb) + return -ENOMEM; + reinit_completion(&mgmt_eth_data->rw_done); /* Increment seq_num and set it in the copy pkt */ From 989b52cdc84955c2a35bc18f53e3a83edfa6f404 Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Mon, 10 Jul 2023 03:07:11 +0000 Subject: [PATCH 48/79] net: sched: Replace strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). Direct replacement is safe here since return value of -errno is used to check for truncation instead of sizeof(dest). 
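[Editor's note] (The [1] and [2] references cited above are listed right after this sketch.) The truncation-check rewrite is mechanical; a compilable userspace stand-in shows the two idioms, with the kernel's -E2BIG return modeled as -1:

#include <stdio.h>
#include <string.h>

/* Toy stand-in for the kernel's strscpy(); illustrative only. */
static long toy_strscpy(char *dst, const char *src, size_t size)
{
	size_t len = strnlen(src, size); /* bounded read, unlike strlcpy() */

	if (len == size) {               /* source did not fit: truncate */
		if (size) {
			memcpy(dst, src, size - 1);
			dst[size - 1] = '\0';
		}
		return -1;               /* kernel: -E2BIG */
	}
	memcpy(dst, src, len + 1);
	return (long)len;
}

int main(void)
{
	char act_name[16];

	/* old idiom: strlcpy(dst, src, n) >= n signals truncation */
	/* new idiom: strscpy(dst, src, n) < 0  signals truncation */
	if (toy_strscpy(act_name, "police", sizeof(act_name)) < 0)
		puts("TC action name too long");
	else
		printf("copied: %s\n", act_name);
	return 0;
}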
[1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Reviewed-by: Pavan Chebbi Signed-off-by: David S. Miller --- net/sched/act_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f7887f42d542..9d3f26bf0440 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1320,7 +1320,7 @@ struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, bool police, return ERR_PTR(err); } } else { - if (strlcpy(act_name, "police", IFNAMSIZ) >= IFNAMSIZ) { + if (strscpy(act_name, "police", IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); return ERR_PTR(-EINVAL); } From 9d0aba98316d00f9c0a4506fc15f5ed9241bc1fd Mon Sep 17 00:00:00 2001 From: Junfeng Guo Date: Sat, 8 Jul 2023 11:14:51 +0800 Subject: [PATCH 49/79] gve: unify driver name usage Current codebase contained the usage of two different names for this driver (i.e., `gvnic` and `gve`), which is quite unfriendly for users to use, especially when trying to bind or unbind the driver manually. The corresponding kernel module is registered with the name of `gve`. It's more reasonable to align the name of the driver with the module. Fixes: 893ce44df565 ("gve: Add basic driver framework for Compute Engine Virtual NIC") Cc: csully@google.com Signed-off-by: Junfeng Guo Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve.h | 1 + drivers/net/ethernet/google/gve/gve_ethtool.c | 2 +- drivers/net/ethernet/google/gve/gve_main.c | 11 ++++++----- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 98eb78d98e9f..4b425bf71ede 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -964,5 +964,6 @@ void gve_handle_report_stats(struct gve_priv *priv); /* exported by ethtool.c */ extern const struct ethtool_ops gve_ethtool_ops; /* needed by ethtool */ +extern char gve_driver_name[]; extern const char gve_version_str[]; #endif /* _GVE_H_ */ diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index 50162ec9424d..233e5946905e 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -15,7 +15,7 @@ static void gve_get_drvinfo(struct net_device *netdev, { struct gve_priv *priv = netdev_priv(netdev); - strscpy(info->driver, "gve", sizeof(info->driver)); + strscpy(info->driver, gve_driver_name, sizeof(info->driver)); strscpy(info->version, gve_version_str, sizeof(info->version)); strscpy(info->bus_info, pci_name(priv->pdev), sizeof(info->bus_info)); } diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 8fb70db63b8b..e6f1711d9be0 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -33,6 +33,7 @@ #define MIN_TX_TIMEOUT_GAP (1000 * 10) #define DQO_TX_MAX 0x3FFFF +char gve_driver_name[] = "gve"; const char gve_version_str[] = GVE_VERSION; static const char gve_version_prefix[] = GVE_VERSION_PREFIX; @@ -2200,7 +2201,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) return err; - err = pci_request_regions(pdev, "gvnic-cfg"); + err = pci_request_regions(pdev, gve_driver_name); if (err) goto abort_with_enabled; @@ -2393,8 +2394,8 @@ static const struct pci_device_id gve_id_table[] = { { } }; 
-static struct pci_driver gvnic_driver = { - .name = "gvnic", +static struct pci_driver gve_driver = { + .name = gve_driver_name, .id_table = gve_id_table, .probe = gve_probe, .remove = gve_remove, @@ -2405,10 +2406,10 @@ static struct pci_driver gvnic_driver = { #endif }; -module_pci_driver(gvnic_driver); +module_pci_driver(gve_driver); MODULE_DEVICE_TABLE(pci, gve_id_table); MODULE_AUTHOR("Google, Inc."); -MODULE_DESCRIPTION("gVNIC Driver"); +MODULE_DESCRIPTION("Google Virtual NIC Driver"); MODULE_LICENSE("Dual MIT/GPL"); MODULE_VERSION(GVE_VERSION); From 8046063df887bee35c002224267ba46f41be7cf6 Mon Sep 17 00:00:00 2001 From: Florian Kauer Date: Wed, 14 Jun 2023 16:07:09 +0200 Subject: [PATCH 50/79] igc: Rename qbv_enable to taprio_offload_enable In the current implementation the flags adapter->qbv_enable and IGC_FLAG_TSN_QBV_ENABLED have a similar name, but do not have the same meaning. The first one is used only to indicate taprio offload (i.e. when igc_save_qbv_schedule was called), while the second one corresponds to the Qbv mode of the hardware. However, the second one is also used to support the TX launchtime feature, i.e. ETF qdisc offload. This leads to situations where adapter->qbv_enable is false, but the flag IGC_FLAG_TSN_QBV_ENABLED is set. This is prone to confusion. The rename should reduce this confusion. Since it is a pure rename, it has no impact on functionality. Fixes: e17090eb2494 ("igc: allow BaseTime 0 enrollment for Qbv") Signed-off-by: Florian Kauer Reviewed-by: Kurt Kanzenbach Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 2 +- drivers/net/ethernet/intel/igc/igc_main.c | 6 +++--- drivers/net/ethernet/intel/igc/igc_tsn.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 639a50c02537..9db384f66a8e 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -191,7 +191,7 @@ struct igc_adapter { int tc_setup_type; ktime_t base_time; ktime_t cycle_time; - bool qbv_enable; + bool taprio_offload_enable; u32 qbv_config_change_errors; bool qbv_transition; unsigned int qbv_count; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 281a0e35b9d1..fae534ef1c4f 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6126,16 +6126,16 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, switch (qopt->cmd) { case TAPRIO_CMD_REPLACE: - adapter->qbv_enable = true; + adapter->taprio_offload_enable = true; break; case TAPRIO_CMD_DESTROY: - adapter->qbv_enable = false; + adapter->taprio_offload_enable = false; break; default: return -EOPNOTSUPP; } - if (!adapter->qbv_enable) + if (!adapter->taprio_offload_enable) return igc_tsn_clear_schedule(adapter); if (qopt->base_time < 0) diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 3cdb0c988728..b76ebfc10b1d 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -37,7 +37,7 @@ static unsigned int igc_tsn_new_flags(struct igc_adapter *adapter) { unsigned int new_flags = adapter->flags & ~IGC_FLAG_TSN_ANY_ENABLED; - if (adapter->qbv_enable) + if (adapter->taprio_offload_enable) new_flags |= IGC_FLAG_TSN_QBV_ENABLED; if (is_any_launchtime(adapter)) From 82ff5f29b7377d614f0c01fd74b5d0cb225f0adc Mon Sep 17 00:00:00 2001 From: Florian Kauer Date: Wed, 14 
Jun 2023 16:07:10 +0200 Subject: [PATCH 51/79] igc: Do not enable taprio offload for invalid arguments Only set adapter->taprio_offload_enable after validating the arguments. Otherwise, it stays set even if the offload was not enabled. Since the subsequent code does not get executed in case of invalid arguments, it will not be read at first. However, by activating and then deactivating another offload (e.g. ETF/TX launchtime offload), taprio_offload_enable is read and erroneously keeps the offload feature of the NIC enabled. This can be reproduced as follows: # TAPRIO offload (flags == 0x2) and negative base-time leading to expected -ERANGE sudo tc qdisc replace dev enp1s0 parent root handle 100 stab overhead 24 taprio \ num_tc 1 \ map 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \ queues 1@0 \ base-time -1000 \ sched-entry S 01 300000 \ flags 0x2 # IGC_TQAVCTRL is 0x0 as expected (iomem=relaxed for reading register) sudo pcimem /sys/bus/pci/devices/0000:01:00.0/resource0 0x3570 w*1 # Activate ETF offload sudo tc qdisc replace dev enp1s0 parent root handle 6666 mqprio \ num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \ queues 1@0 1@1 2@2 \ hw 0 sudo tc qdisc add dev enp1s0 parent 6666:1 etf \ clockid CLOCK_TAI \ delta 500000 \ offload # IGC_TQAVCTRL is 0x9 as expected sudo pcimem /sys/bus/pci/devices/0000:01:00.0/resource0 0x3570 w*1 # Deactivate ETF offload again sudo tc qdisc delete dev enp1s0 parent 6666:1 # IGC_TQAVCTRL should now be 0x0 again, but is observed as 0x9 sudo pcimem /sys/bus/pci/devices/0000:01:00.0/resource0 0x3570 w*1 Fixes: e17090eb2494 ("igc: allow BaseTime 0 enrollment for Qbv") Signed-off-by: Florian Kauer Reviewed-by: Kurt Kanzenbach Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index fae534ef1c4f..fb8e55c7c402 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6097,6 +6097,7 @@ static int igc_tsn_clear_schedule(struct igc_adapter *adapter) adapter->base_time = 0; adapter->cycle_time = NSEC_PER_SEC; + adapter->taprio_offload_enable = false; adapter->qbv_config_change_errors = 0; adapter->qbv_transition = false; adapter->qbv_count = 0; @@ -6124,20 +6125,12 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, size_t n; int i; - switch (qopt->cmd) { - case TAPRIO_CMD_REPLACE: - adapter->taprio_offload_enable = true; - break; - case TAPRIO_CMD_DESTROY: - adapter->taprio_offload_enable = false; - break; - default: - return -EOPNOTSUPP; - } - - if (!adapter->taprio_offload_enable) + if (qopt->cmd == TAPRIO_CMD_DESTROY) return igc_tsn_clear_schedule(adapter); + if (qopt->cmd != TAPRIO_CMD_REPLACE) + return -EOPNOTSUPP; + if (qopt->base_time < 0) return -ERANGE; @@ -6149,6 +6142,7 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, adapter->cycle_time = qopt->cycle_time; adapter->base_time = qopt->base_time; + adapter->taprio_offload_enable = true; igc_ptp_read(adapter, &now); From e5d88c53d03f8df864776431175d08c053645f50 Mon Sep 17 00:00:00 2001 From: Florian Kauer Date: Wed, 14 Jun 2023 16:07:11 +0200 Subject: [PATCH 52/79] igc: Handle already enabled taprio offload for basetime 0 Since commit e17090eb2494 ("igc: allow BaseTime 0 enrollment for Qbv") it is possible to enable taprio offload with a basetime of 0. 
However, the check if taprio offload is already enabled (and thus -EALREADY should be returned for igc_save_qbv_schedule) still relied on adapter->base_time > 0. This can be reproduced as follows: # TAPRIO offload (flags == 0x2) and base-time = 0 sudo tc qdisc replace dev enp1s0 parent root handle 100 stab overhead 24 taprio \ num_tc 1 \ map 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \ queues 1@0 \ base-time 0 \ sched-entry S 01 300000 \ flags 0x2 # The second call should fail with "Error: Device failed to setup taprio offload." # But that only happens if base-time was != 0 sudo tc qdisc replace dev enp1s0 parent root handle 100 stab overhead 24 taprio \ num_tc 1 \ map 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \ queues 1@0 \ base-time 0 \ sched-entry S 01 300000 \ flags 0x2 Fixes: e17090eb2494 ("igc: allow BaseTime 0 enrollment for Qbv") Signed-off-by: Florian Kauer Reviewed-by: Kurt Kanzenbach Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index fb8e55c7c402..5d24930fed8f 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6134,7 +6134,7 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, if (qopt->base_time < 0) return -ERANGE; - if (igc_is_device_id_i225(hw) && adapter->base_time) + if (igc_is_device_id_i225(hw) && adapter->taprio_offload_enable) return -EALREADY; if (!validate_schedule(adapter, qopt)) From 8b86f10ab64eca0287ea8f7c94e9ad8b2e101c01 Mon Sep 17 00:00:00 2001 From: Florian Kauer Date: Wed, 14 Jun 2023 16:07:12 +0200 Subject: [PATCH 53/79] igc: No strict mode in pure launchtime/CBS offload The flags IGC_TXQCTL_STRICT_CYCLE and IGC_TXQCTL_STRICT_END prevent the packet transmission over slot and cycle boundaries. This is important for taprio offload where the slots and cycles correspond to the slots and cycles configured for the network. However, the Qbv offload feature of the i225 is also used for enabling TX launchtime / ETF offload. In that case, however, the cycle has no meaning for the network and is only used internally to adapt the base time register after a second has passed. Enabling strict mode in this case would unnecessarily prevent the transmission of certain packets (i.e. at the boundary of a second) and thus interferes with the ETF qdisc that promises transmission at a certain point in time. Similar to ETF, this also applies to CBS offload that also should not be influenced by strict mode unless taprio offload would be enabled at the same time. This fully reverts commit d8f45be01dd9 ("igc: Use strict cycles for Qbv scheduling") but its commit message only describes what was already implemented before that commit. The difference to a plain revert of that commit is that it now copes with the base_time = 0 case that was fixed with commit e17090eb2494 ("igc: allow BaseTime 0 enrollment for Qbv") In particular, enabling strict mode leads to TX hang situations under high traffic if taprio is applied WITHOUT taprio offload but WITH ETF offload, e.g. 
as in sudo tc qdisc replace dev enp1s0 parent root handle 100 taprio \ num_tc 1 \ map 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \ queues 1@0 \ base-time 0 \ sched-entry S 01 300000 \ flags 0x1 \ txtime-delay 500000 \ clockid CLOCK_TAI sudo tc qdisc replace dev enp1s0 parent 100:1 etf \ clockid CLOCK_TAI \ delta 500000 \ offload \ skip_sock_check and traffic generator sudo trafgen -i traffic.cfg -o enp1s0 --cpp -n0 -q -t1400ns with traffic.cfg #define ETH_P_IP 0x0800 { /* Ethernet Header */ 0x30, 0x1f, 0x9a, 0xd0, 0xf0, 0x0e, # MAC Dest - adapt as needed 0x24, 0x5e, 0xbe, 0x57, 0x2e, 0x36, # MAC Src - adapt as needed const16(ETH_P_IP), /* IPv4 Header */ 0b01000101, 0, # IPv4 version, IHL, TOS const16(1028), # IPv4 total length (UDP length + 20 bytes (IP header)) const16(2), # IPv4 ident 0b01000000, 0, # IPv4 flags, fragmentation off 64, # IPv4 TTL 17, # Protocol UDP csumip(14, 33), # IPv4 checksum /* UDP Header */ 10, 0, 48, 1, # IP Src - adapt as needed 10, 0, 48, 10, # IP Dest - adapt as needed const16(5555), # UDP Src Port const16(6666), # UDP Dest Port const16(1008), # UDP length (UDP header 8 bytes + payload length) csumudp(14, 34), # UDP checksum /* Payload */ fill('W', 1000), } and the observed message with that is for example igc 0000:01:00.0 enp1s0: Detected Tx Unit Hang Tx Queue <0> TDH TDT next_to_use next_to_clean buffer_info[next_to_clean] time_stamp next_to_watch <00000000245a4efb> jiffies desc.status <1048000> Fixes: d8f45be01dd9 ("igc: Use strict cycles for Qbv scheduling") Signed-off-by: Florian Kauer Reviewed-by: Kurt Kanzenbach Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_tsn.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index b76ebfc10b1d..a9c08321aca9 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -132,8 +132,28 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) wr32(IGC_STQT(i), ring->start_time); wr32(IGC_ENDQT(i), ring->end_time); - txqctl |= IGC_TXQCTL_STRICT_CYCLE | - IGC_TXQCTL_STRICT_END; + if (adapter->taprio_offload_enable) { + /* If taprio_offload_enable is set we are in "taprio" + * mode and we need to be strict about the + * cycles: only transmit a packet if it can be + * completed during that cycle. + * + * If taprio_offload_enable is NOT true when + * enabling TSN offload, the cycle should have + * no external effects, but is only used internally + * to adapt the base time register after a second + * has passed. + * + * Enabling strict mode in this case would + * unnecessarily prevent the transmission of + * certain packets (i.e. at the boundary of a + * second) and thus interfere with the launchtime + * feature that promises transmission at a + * certain point in time. + */ + txqctl |= IGC_TXQCTL_STRICT_CYCLE | + IGC_TXQCTL_STRICT_END; + } if (ring->launchtime_enable) txqctl |= IGC_TXQCTL_QUEUE_MODE_LAUNCHT; From c1bca9ac0bcb355be11354c2e68bc7bf31f5ac5a Mon Sep 17 00:00:00 2001 From: Florian Kauer Date: Wed, 14 Jun 2023 16:07:13 +0200 Subject: [PATCH 54/79] igc: Fix launchtime before start of cycle It is possible (verified on a running system) that frames are processed by igc_tx_launchtime with a txtime before the start of the cycle (baset_est). However, the result of txtime - baset_est is written into a u32, leading to a wrap around to a positive number. 
The following launchtime > 0 check will only branch to executing launchtime = 0 if launchtime is already 0. Fix it by using an s32 before checking launchtime > 0.

Fixes: db0b124f02ba ("igc: Enhance Qbv scheduling by using first flag bit")
Signed-off-by: Florian Kauer
Reviewed-by: Kurt Kanzenbach
Tested-by: Naama Meir
Signed-off-by: Tony Nguyen
---
 drivers/net/ethernet/intel/igc/igc_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 5d24930fed8f..4855caa3bae4 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -1016,7 +1016,7 @@ static __le32 igc_tx_launchtime(struct igc_ring *ring, ktime_t txtime,
 	ktime_t base_time = adapter->base_time;
 	ktime_t now = ktime_get_clocktai();
 	ktime_t baset_est, end_of_cycle;
-	u32 launchtime;
+	s32 launchtime;
 	s64 n;
 
 	n = div64_s64(ktime_sub_ns(now, base_time), cycle_time);

From 0bcc62858d6ba62cbade957d69745e6adeed5f3d Mon Sep 17 00:00:00 2001
From: Florian Kauer
Date: Wed, 14 Jun 2023 16:07:14 +0200
Subject: [PATCH 55/79] igc: Fix inserting of empty frame for launchtime

The insertion of an empty frame was introduced with commit db0b124f02ba ("igc: Enhance Qbv scheduling by using first flag bit") in order to ensure that the current cycle has at least one packet if there is some packet to be scheduled for the next cycle.

However, the current implementation does not properly check if a packet is already scheduled for the current cycle. Currently, an empty packet is always inserted if and only if

	txtime >= end_of_cycle && txtime > last_tx_cycle

but since last_tx_cycle is always either the end of the current cycle (end_of_cycle) or the end of a previous cycle, the second part (txtime > last_tx_cycle) is always true unless txtime == last_tx_cycle. What actually needs to be checked here is whether last_tx_cycle was already written within the current cycle, so an empty frame should only be inserted if and only if

	txtime >= end_of_cycle && end_of_cycle > last_tx_cycle

This patch does not merely avoid an unnecessary insertion: inserting an empty packet while packets are already scheduled in the current cycle can actually be harmful, because the empty packet may end up being processed as the first packet of the upcoming cycle, shifting the packet carrying the first_flag a full cycle into the future and finally leading to a TX hang.
The TX hang can be reproduced on an i225 with:

    sudo tc qdisc replace dev enp1s0 parent root handle 100 taprio \
        num_tc 1 \
        map 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \
        queues 1@0 \
        base-time 0 \
        sched-entry S 01 300000 \
        flags 0x1 \
        txtime-delay 500000 \
        clockid CLOCK_TAI
    sudo tc qdisc replace dev enp1s0 parent 100:1 etf \
        clockid CLOCK_TAI \
        delta 500000 \
        offload \
        skip_sock_check

and traffic generator

    sudo trafgen -i traffic.cfg -o enp1s0 --cpp -n0 -q -t1400ns

with traffic.cfg

    #define ETH_P_IP 0x0800
    {
        /* Ethernet Header */
        0x30, 0x1f, 0x9a, 0xd0, 0xf0, 0x0e,  # MAC Dest - adapt as needed
        0x24, 0x5e, 0xbe, 0x57, 0x2e, 0x36,  # MAC Src - adapt as needed
        const16(ETH_P_IP),

        /* IPv4 Header */
        0b01000101, 0,  # IPv4 version, IHL, TOS
        const16(1028),  # IPv4 total length (UDP length + 20 bytes (IP header))
        const16(2),     # IPv4 ident
        0b01000000, 0,  # IPv4 flags, fragmentation off
        64,             # IPv4 TTL
        17,             # Protocol UDP
        csumip(14, 33), # IPv4 checksum

        /* UDP Header */
        10, 0, 48, 1,   # IP Src - adapt as needed
        10, 0, 48, 10,  # IP Dest - adapt as needed
        const16(5555),  # UDP Src Port
        const16(6666),  # UDP Dest Port
        const16(1008),  # UDP length (UDP header 8 bytes + payload length)
        csumudp(14, 34),# UDP checksum

        /* Payload */
        fill('W', 1000),
    }

and the observed message with that is for example

    igc 0000:01:00.0 enp1s0: Detected Tx Unit Hang
      Tx Queue             <0>
      TDH                  <32>
      TDT                  <3c>
      next_to_use          <3c>
      next_to_clean        <32>
    buffer_info[next_to_clean]
      time_stamp
      next_to_watch        <00000000632a1828>
      jiffies
      desc.status          <1048000>

Fixes: db0b124f02ba ("igc: Enhance Qbv scheduling by using first flag bit")
Signed-off-by: Florian Kauer
Reviewed-by: Kurt Kanzenbach
Tested-by: Naama Meir
Signed-off-by: Tony Nguyen
---
 drivers/net/ethernet/intel/igc/igc_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 4855caa3bae4..9f93f0f4f752 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -1029,7 +1029,7 @@ static __le32 igc_tx_launchtime(struct igc_ring *ring, ktime_t txtime,
 			*first_flag = true;
 			ring->last_ff_cycle = baset_est;
 
-			if (ktime_compare(txtime, ring->last_tx_cycle) > 0)
+			if (ktime_compare(end_of_cycle, ring->last_tx_cycle) > 0)
 				*insert_empty = true;
 		}
 	}

From c56fb2aab23505bb7160d06097c8de100b82b851 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?=
Date: Mon, 10 Jul 2023 09:41:31 +0200
Subject: [PATCH 56/79] riscv, bpf: Fix inconsistent JIT image generation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to generate the prologue and epilogue, the BPF JIT needs to know which registers are clobbered. Therefore, during the pre-final passes, the prologue is generated after the body of the program, i.e. in body-prologue-epilogue order. Then, in the final pass, a proper prologue-body-epilogue JITted image is generated.

This scheme has worked most of the time. However, for some large programs with many jumps, e.g. the test_kmod.sh BPF selftest with hardening enabled (blinding constants), this has shown to be incorrect: in the final pass, when the proper prologue-body-epilogue is generated, the image has not yet converged. As a result, the final image has incorrect jump offsets. The following is an excerpt from an incorrect image:

  | ...
  | 3b8: 00c50663 beq a0,a2,3c4 <.text+0x3c4>
  | 3bc: 0020e317 auipc t1,0x20e
  | 3c0: 49630067 jalr zero,1174(t1) # 20e852 <.text+0x20e852>
  | ...
  | 20e84c: 8796 c.mv a5,t0
  | 20e84e: 6422 c.ldsp s0,8(sp)   # Epilogue start
  | 20e850: 6141 c.addi16sp sp,16
  | 20e852: 853e c.mv a0,a5        # Incorrect jump target

The image has shrunk, and the epilogue offset is incorrect in the final pass. Correct the problem by always generating proper prologue-body-epilogue output, which means that the first pass only generates the body in order to track which registers are touched.

Fixes: 2353ecc6f91f ("bpf, riscv: add BPF JIT for RV64G")
Signed-off-by: Björn Töpel
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20230710074131.19596-1-bjorn@kernel.org
---
 arch/riscv/net/bpf_jit.h      |  6 +++---
 arch/riscv/net/bpf_jit_core.c | 19 +++++++++++++------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
index bf9802a63061..2717f5490428 100644
--- a/arch/riscv/net/bpf_jit.h
+++ b/arch/riscv/net/bpf_jit.h
@@ -69,7 +69,7 @@ struct rv_jit_context {
 	struct bpf_prog *prog;
 	u16 *insns;		/* RV insns */
 	int ninsns;
-	int body_len;
+	int prologue_len;
 	int epilogue_offset;
 	int *offset;		/* BPF to RV */
 	int nexentries;
@@ -216,8 +216,8 @@ static inline int rv_offset(int insn, int off, struct rv_jit_context *ctx)
 	int from, to;
 
 	off++; /* BPF branch is from PC+1, RV is from PC */
-	from = (insn > 0) ? ctx->offset[insn - 1] : 0;
-	to = (insn + off > 0) ? ctx->offset[insn + off - 1] : 0;
+	from = (insn > 0) ? ctx->offset[insn - 1] : ctx->prologue_len;
+	to = (insn + off > 0) ? ctx->offset[insn + off - 1] : ctx->prologue_len;
 	return ninsns_rvoff(to - from);
 }

diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c
index 737baf8715da..7a26a3e1c73c 100644
--- a/arch/riscv/net/bpf_jit_core.c
+++ b/arch/riscv/net/bpf_jit_core.c
@@ -44,7 +44,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	unsigned int prog_size = 0, extable_size = 0;
 	bool tmp_blinded = false, extra_pass = false;
 	struct bpf_prog *tmp, *orig_prog = prog;
-	int pass = 0, prev_ninsns = 0, prologue_len, i;
+	int pass = 0, prev_ninsns = 0, i;
 	struct rv_jit_data *jit_data;
 	struct rv_jit_context *ctx;
 
@@ -83,6 +83,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		prog = orig_prog;
 		goto out_offset;
 	}
+
+	if (build_body(ctx, extra_pass, NULL)) {
+		prog = orig_prog;
+		goto out_offset;
+	}
+
 	for (i = 0; i < prog->len; i++) {
 		prev_ninsns += 32;
 		ctx->offset[i] = prev_ninsns;
@@ -91,12 +97,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	for (i = 0; i < NR_JIT_ITERATIONS; i++) {
 		pass++;
 		ctx->ninsns = 0;
+
+		bpf_jit_build_prologue(ctx);
+		ctx->prologue_len = ctx->ninsns;
+
 		if (build_body(ctx, extra_pass, ctx->offset)) {
 			prog = orig_prog;
 			goto out_offset;
 		}
-		ctx->body_len = ctx->ninsns;
-		bpf_jit_build_prologue(ctx);
+
 		ctx->epilogue_offset = ctx->ninsns;
 		bpf_jit_build_epilogue(ctx);
@@ -162,10 +171,8 @@ skip_init_ctx:
 	if (!prog->is_func || extra_pass) {
 		bpf_jit_binary_lock_ro(jit_data->header);
-		prologue_len = ctx->epilogue_offset - ctx->body_len;
 		for (i = 0; i < prog->len; i++)
-			ctx->offset[i] = ninsns_rvoff(prologue_len +
-						      ctx->offset[i]);
+			ctx->offset[i] = ninsns_rvoff(ctx->offset[i]);
 		bpf_prog_fill_jited_linfo(prog, ctx->offset);
 out_offset:
 	kfree(ctx->offset);

From be7ecbe7ec7df7320db4b810fef438bf67144011 Mon Sep 17 00:00:00 2001
From: Wei Fang
Date: Thu, 6 Jul 2023 16:10:09 +0800
Subject: [PATCH 57/79] net: fec: dynamically set the NETDEV_XDP_ACT_NDO_XMIT feature of XDP

When an XDP program is installed or uninstalled, fec_restart() will be invoked to reset the MAC
and buffer descriptor rings. It is reasonable not to transmit any packets while the reset is in progress. However, the NETDEV_XDP_ACT_NDO_XMIT bit of xdp_features is enabled by default, so fec_enet_xdp_xmit() may be invoked even though the reset has not finished. In that case, the redirected XDP frames might be dropped and the available transmit BDs may be incorrectly deemed insufficient. So this patch disables the NETDEV_XDP_ACT_NDO_XMIT feature by default and dynamically configures the feature when a bpf program is installed or uninstalled.

Fixes: e4ac7cc6e5a4 ("net: fec: turn on XDP features")
Signed-off-by: Wei Fang
Signed-off-by: Paolo Abeni
---
 drivers/net/ethernet/freescale/fec_main.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 8fbe47703d47..9ce0319b33c3 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3732,12 +3732,18 @@ static int fec_enet_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 		if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
 			return -EOPNOTSUPP;
 
+		if (!bpf->prog)
+			xdp_features_clear_redirect_target(dev);
+
 		if (is_run) {
 			napi_disable(&fep->napi);
 			netif_tx_disable(dev);
 		}
 
 		old_prog = xchg(&fep->xdp_prog, bpf->prog);
+		if (old_prog)
+			bpf_prog_put(old_prog);
+
 		fec_restart(dev);
 
 		if (is_run) {
@@ -3745,8 +3751,8 @@ static int fec_enet_bpf(struct net_device *dev, struct netdev_bpf *bpf)
 			netif_tx_start_all_queues(dev);
 		}
 
-		if (old_prog)
-			bpf_prog_put(old_prog);
+		if (bpf->prog)
+			xdp_features_set_redirect_target(dev, false);
 
 		return 0;
 
@@ -4016,8 +4022,7 @@ static int fec_enet_init(struct net_device *ndev)
 
 	if (!(fep->quirks & FEC_QUIRK_SWAP_FRAME))
 		ndev->xdp_features = NETDEV_XDP_ACT_BASIC |
-				     NETDEV_XDP_ACT_REDIRECT |
-				     NETDEV_XDP_ACT_NDO_XMIT;
+				     NETDEV_XDP_ACT_REDIRECT;
 
 	fec_restart(ndev);

From 20f797399035a8052dbd7297fdbe094079a9482e Mon Sep 17 00:00:00 2001
From: Wei Fang
Date: Thu, 6 Jul 2023 16:10:10 +0800
Subject: [PATCH 58/79] net: fec: recycle pages for transmitted XDP frames

Once XDP frames have been successfully transmitted through the ndo_xdp_xmit() interface, it is the driver's responsibility to free the frames so that the page_pool can recycle the pages and reuse them. However, this is not implemented in the fec driver, which leads to a user-visible problem: the console prints the following warning log.

[  157.568851] page_pool_release_retry() stalled pool shutdown 1389 inflight 60 sec
[  217.983446] page_pool_release_retry() stalled pool shutdown 1389 inflight 120 sec
[  278.399006] page_pool_release_retry() stalled pool shutdown 1389 inflight 181 sec
[  338.812885] page_pool_release_retry() stalled pool shutdown 1389 inflight 241 sec
[  399.226946] page_pool_release_retry() stalled pool shutdown 1389 inflight 302 sec

Therefore, to solve this issue, we free XDP frames via xdp_return_frame() while cleaning the tx BD ring.
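The ownership contract being honored here can be sketched as follows (an illustrative fragment, not fec code; example_tx_clean and its flat ring layout are hypothetical):

	#include <net/xdp.h>

	/* Frames passed to .ndo_xdp_xmit() belong to the driver once it
	 * accepts them; on TX completion each frame must be handed back
	 * to the memory model via xdp_return_frame(), which is what lets
	 * the page_pool recycle the underlying pages.
	 */
	static void example_tx_clean(struct xdp_frame **ring, int n)
	{
		int i;

		for (i = 0; i < n; i++) {
			if (!ring[i])
				continue;
			xdp_return_frame(ring[i]);	/* page recycled here */
			ring[i] = NULL;
		}
	}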
Fixes: 6d6b39f180b8 ("net: fec: add initial XDP support") Signed-off-by: Wei Fang Signed-off-by: Paolo Abeni --- drivers/net/ethernet/freescale/fec.h | 15 ++- drivers/net/ethernet/freescale/fec_main.c | 148 +++++++++++++++------- 2 files changed, 115 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 9939ccafb556..8c0226d061fe 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -544,10 +544,23 @@ enum { XDP_STATS_TOTAL, }; +enum fec_txbuf_type { + FEC_TXBUF_T_SKB, + FEC_TXBUF_T_XDP_NDO, +}; + +struct fec_tx_buffer { + union { + struct sk_buff *skb; + struct xdp_frame *xdp; + }; + enum fec_txbuf_type type; +}; + struct fec_enet_priv_tx_q { struct bufdesc_prop bd; unsigned char *tx_bounce[TX_RING_SIZE]; - struct sk_buff *tx_skbuff[TX_RING_SIZE]; + struct fec_tx_buffer tx_buf[TX_RING_SIZE]; unsigned short tx_stop_threshold; unsigned short tx_wake_threshold; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 9ce0319b33c3..940d3afe1d24 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -397,7 +397,7 @@ static void fec_dump(struct net_device *ndev) fec16_to_cpu(bdp->cbd_sc), fec32_to_cpu(bdp->cbd_bufaddr), fec16_to_cpu(bdp->cbd_datlen), - txq->tx_skbuff[index]); + txq->tx_buf[index].skb); bdp = fec_enet_get_nextdesc(bdp, &txq->bd); index++; } while (bdp != txq->bd.base); @@ -654,7 +654,7 @@ static int fec_enet_txq_submit_skb(struct fec_enet_priv_tx_q *txq, index = fec_enet_get_bd_index(last_bdp, &txq->bd); /* Save skb pointer */ - txq->tx_skbuff[index] = skb; + txq->tx_buf[index].skb = skb; /* Make sure the updates to rest of the descriptor are performed before * transferring ownership. @@ -672,9 +672,7 @@ static int fec_enet_txq_submit_skb(struct fec_enet_priv_tx_q *txq, skb_tx_timestamp(skb); - /* Make sure the update to bdp and tx_skbuff are performed before - * txq->bd.cur. - */ + /* Make sure the update to bdp is performed before txq->bd.cur. */ wmb(); txq->bd.cur = bdp; @@ -862,7 +860,7 @@ static int fec_enet_txq_submit_tso(struct fec_enet_priv_tx_q *txq, } /* Save skb pointer */ - txq->tx_skbuff[index] = skb; + txq->tx_buf[index].skb = skb; skb_tx_timestamp(skb); txq->bd.cur = bdp; @@ -952,16 +950,33 @@ static void fec_enet_bd_init(struct net_device *dev) for (i = 0; i < txq->bd.ring_size; i++) { /* Initialize the BD for every fragment in the page. 
*/ bdp->cbd_sc = cpu_to_fec16(0); - if (bdp->cbd_bufaddr && - !IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr))) - dma_unmap_single(&fep->pdev->dev, - fec32_to_cpu(bdp->cbd_bufaddr), - fec16_to_cpu(bdp->cbd_datlen), - DMA_TO_DEVICE); - if (txq->tx_skbuff[i]) { - dev_kfree_skb_any(txq->tx_skbuff[i]); - txq->tx_skbuff[i] = NULL; + if (txq->tx_buf[i].type == FEC_TXBUF_T_SKB) { + if (bdp->cbd_bufaddr && + !IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr))) + dma_unmap_single(&fep->pdev->dev, + fec32_to_cpu(bdp->cbd_bufaddr), + fec16_to_cpu(bdp->cbd_datlen), + DMA_TO_DEVICE); + if (txq->tx_buf[i].skb) { + dev_kfree_skb_any(txq->tx_buf[i].skb); + txq->tx_buf[i].skb = NULL; + } + } else { + if (bdp->cbd_bufaddr) + dma_unmap_single(&fep->pdev->dev, + fec32_to_cpu(bdp->cbd_bufaddr), + fec16_to_cpu(bdp->cbd_datlen), + DMA_TO_DEVICE); + + if (txq->tx_buf[i].xdp) { + xdp_return_frame(txq->tx_buf[i].xdp); + txq->tx_buf[i].xdp = NULL; + } + + /* restore default tx buffer type: FEC_TXBUF_T_SKB */ + txq->tx_buf[i].type = FEC_TXBUF_T_SKB; } + bdp->cbd_bufaddr = cpu_to_fec32(0); bdp = fec_enet_get_nextdesc(bdp, &txq->bd); } @@ -1360,6 +1375,7 @@ static void fec_enet_tx_queue(struct net_device *ndev, u16 queue_id) { struct fec_enet_private *fep; + struct xdp_frame *xdpf; struct bufdesc *bdp; unsigned short status; struct sk_buff *skb; @@ -1387,16 +1403,31 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id) index = fec_enet_get_bd_index(bdp, &txq->bd); - skb = txq->tx_skbuff[index]; - txq->tx_skbuff[index] = NULL; - if (!IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr))) - dma_unmap_single(&fep->pdev->dev, - fec32_to_cpu(bdp->cbd_bufaddr), - fec16_to_cpu(bdp->cbd_datlen), - DMA_TO_DEVICE); - bdp->cbd_bufaddr = cpu_to_fec32(0); - if (!skb) - goto skb_done; + if (txq->tx_buf[index].type == FEC_TXBUF_T_SKB) { + skb = txq->tx_buf[index].skb; + txq->tx_buf[index].skb = NULL; + if (bdp->cbd_bufaddr && + !IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr))) + dma_unmap_single(&fep->pdev->dev, + fec32_to_cpu(bdp->cbd_bufaddr), + fec16_to_cpu(bdp->cbd_datlen), + DMA_TO_DEVICE); + bdp->cbd_bufaddr = cpu_to_fec32(0); + if (!skb) + goto tx_buf_done; + } else { + xdpf = txq->tx_buf[index].xdp; + if (bdp->cbd_bufaddr) + dma_unmap_single(&fep->pdev->dev, + fec32_to_cpu(bdp->cbd_bufaddr), + fec16_to_cpu(bdp->cbd_datlen), + DMA_TO_DEVICE); + bdp->cbd_bufaddr = cpu_to_fec32(0); + if (!xdpf) { + txq->tx_buf[index].type = FEC_TXBUF_T_SKB; + goto tx_buf_done; + } + } /* Check for errors. */ if (status & (BD_ENET_TX_HB | BD_ENET_TX_LC | @@ -1415,21 +1446,11 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id) ndev->stats.tx_carrier_errors++; } else { ndev->stats.tx_packets++; - ndev->stats.tx_bytes += skb->len; - } - /* NOTE: SKBTX_IN_PROGRESS being set does not imply it's we who - * are to time stamp the packet, so we still need to check time - * stamping enabled flag. 
- */ - if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS && - fep->hwts_tx_en) && - fep->bufdesc_ex) { - struct skb_shared_hwtstamps shhwtstamps; - struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp; - - fec_enet_hwtstamp(fep, fec32_to_cpu(ebdp->ts), &shhwtstamps); - skb_tstamp_tx(skb, &shhwtstamps); + if (txq->tx_buf[index].type == FEC_TXBUF_T_SKB) + ndev->stats.tx_bytes += skb->len; + else + ndev->stats.tx_bytes += xdpf->len; } /* Deferred means some collisions occurred during transmit, @@ -1438,10 +1459,32 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id) if (status & BD_ENET_TX_DEF) ndev->stats.collisions++; - /* Free the sk buffer associated with this last transmit */ - dev_kfree_skb_any(skb); -skb_done: - /* Make sure the update to bdp and tx_skbuff are performed + if (txq->tx_buf[index].type == FEC_TXBUF_T_SKB) { + /* NOTE: SKBTX_IN_PROGRESS being set does not imply it's we who + * are to time stamp the packet, so we still need to check time + * stamping enabled flag. + */ + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS && + fep->hwts_tx_en) && fep->bufdesc_ex) { + struct skb_shared_hwtstamps shhwtstamps; + struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp; + + fec_enet_hwtstamp(fep, fec32_to_cpu(ebdp->ts), &shhwtstamps); + skb_tstamp_tx(skb, &shhwtstamps); + } + + /* Free the sk buffer associated with this last transmit */ + dev_kfree_skb_any(skb); + } else { + xdp_return_frame(xdpf); + + txq->tx_buf[index].xdp = NULL; + /* restore default tx buffer type: FEC_TXBUF_T_SKB */ + txq->tx_buf[index].type = FEC_TXBUF_T_SKB; + } + +tx_buf_done: + /* Make sure the update to bdp and tx_buf are performed * before dirty_tx */ wmb(); @@ -3249,9 +3292,19 @@ static void fec_enet_free_buffers(struct net_device *ndev) for (i = 0; i < txq->bd.ring_size; i++) { kfree(txq->tx_bounce[i]); txq->tx_bounce[i] = NULL; - skb = txq->tx_skbuff[i]; - txq->tx_skbuff[i] = NULL; - dev_kfree_skb(skb); + + if (txq->tx_buf[i].type == FEC_TXBUF_T_SKB) { + skb = txq->tx_buf[i].skb; + txq->tx_buf[i].skb = NULL; + dev_kfree_skb(skb); + } else { + if (txq->tx_buf[i].xdp) { + xdp_return_frame(txq->tx_buf[i].xdp); + txq->tx_buf[i].xdp = NULL; + } + + txq->tx_buf[i].type = FEC_TXBUF_T_SKB; + } } } } @@ -3817,7 +3870,8 @@ static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep, ebdp->cbd_esc = cpu_to_fec32(estatus); } - txq->tx_skbuff[index] = NULL; + txq->tx_buf[index].type = FEC_TXBUF_T_XDP_NDO; + txq->tx_buf[index].xdp = frame; /* Make sure the updates to rest of the descriptor are performed before * transferring ownership. From 56b3c6ba53d0e9649ea5e4089b39cadde13aaef8 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 6 Jul 2023 16:10:11 +0800 Subject: [PATCH 59/79] net: fec: increase the size of tx ring and update tx_wake_threshold When the XDP feature is enabled and with heavy XDP frames to be transmitted, there is a considerable probability that available tx BDs are insufficient. This will lead to some XDP frames to be discarded and the "NOT enough BD for SG!" error log will appear in the console (as shown below). [ 160.013112] fec 30be0000.ethernet eth0: NOT enough BD for SG! [ 160.023116] fec 30be0000.ethernet eth0: NOT enough BD for SG! [ 160.028926] fec 30be0000.ethernet eth0: NOT enough BD for SG! [ 160.038946] fec 30be0000.ethernet eth0: NOT enough BD for SG! [ 160.044758] fec 30be0000.ethernet eth0: NOT enough BD for SG! In the case of heavy XDP traffic, sometimes the speed of recycling tx BDs may be slower than the speed of sending XDP frames. 
There may be several specific reasons: the interrupt may not be serviced in time, the NAPI callback may be too inefficient because all the queues (tx queues and rx queues) share the same NAPI instance, and so on. After trying various methods, I think that increasing the size of the tx BD ring is simple and effective. Maybe the best solution would be to allocate a NAPI instance for each queue to improve the efficiency of the NAPI callback, but that change is fairly large and I did not try it. Perhaps that method can be implemented in a future patch.

This patch also updates the tx_wake_threshold of the tx ring, which in the previous logic was derived from the size of the tx ring. Otherwise, the tx_wake_threshold would be too high (403 BDs), which is more likely to impact the slow path in the case of heavy XDP traffic, because the XDP path and the slow path share the tx BD rings. According to Jakub's suggestion, the tx_wake_threshold should be at least tx_stop_threshold + 2 * MAX_SKB_FRAGS; if a queue of hundreds of entries is overflowing, we should be able to apply a hysteresis of a few tens of entries.

Fixes: 6d6b39f180b8 ("net: fec: add initial XDP support")
Signed-off-by: Wei Fang
Signed-off-by: Paolo Abeni
---
 drivers/net/ethernet/freescale/fec.h      | 2 +-
 drivers/net/ethernet/freescale/fec_main.c | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index 8c0226d061fe..63a053dea819 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -355,7 +355,7 @@ struct bufdesc_ex {
 #define RX_RING_SIZE	(FEC_ENET_RX_FRPPG * FEC_ENET_RX_PAGES)
 #define FEC_ENET_TX_FRSIZE	2048
 #define FEC_ENET_TX_FRPPG	(PAGE_SIZE / FEC_ENET_TX_FRSIZE)
-#define TX_RING_SIZE		512	/* Must be power of two */
+#define TX_RING_SIZE		1024	/* Must be power of two */
 #define TX_RING_MOD_MASK	511	/*   for this to work */
 
 #define BD_ENET_RX_INT		0x00800000

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 940d3afe1d24..c59576ab8c7a 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3349,8 +3349,7 @@ static int fec_enet_alloc_queue(struct net_device *ndev)
 		fep->total_tx_ring_size += fep->tx_queue[i]->bd.ring_size;
 
 		txq->tx_stop_threshold = FEC_MAX_SKB_DESCS;
-		txq->tx_wake_threshold =
-			(txq->bd.ring_size - txq->tx_stop_threshold) / 2;
+		txq->tx_wake_threshold = FEC_MAX_SKB_DESCS + 2 * MAX_SKB_FRAGS;
 
 		txq->tso_hdrs = dma_alloc_coherent(&fep->pdev->dev,
 					txq->bd.ring_size * TSO_HEADER_SIZE,

From 84a10947198792d038527af9c3994782ecb37c82 Mon Sep 17 00:00:00 2001
From: Wei Fang
Date: Thu, 6 Jul 2023 16:10:12 +0800
Subject: [PATCH 60/79] net: fec: use netdev_err_once() instead of netdev_err()

In the case of heavy XDP traffic to be transmitted, the console will continuously print the error log if there are not enough BDs to accommodate the frames, like below.

[  160.013112] fec 30be0000.ethernet eth0: NOT enough BD for SG!
[  160.023116] fec 30be0000.ethernet eth0: NOT enough BD for SG!
[  160.028926] fec 30be0000.ethernet eth0: NOT enough BD for SG!
[  160.038946] fec 30be0000.ethernet eth0: NOT enough BD for SG!
[  160.044758] fec 30be0000.ethernet eth0: NOT enough BD for SG!

Not only is this log redundant, it also degrades XDP performance. So we use netdev_err_once() instead of netdev_err() now.
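For reference, the two helpers differ only in throttling (an illustrative call site, not the actual driver context):

	netdev_err(ndev, "NOT enough BD for SG!\n");      /* logs on every call */
	netdev_err_once(ndev, "NOT enough BD for SG!\n"); /* logs only the first time */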
Fixes: 6d6b39f180b8 ("net: fec: add initial XDP support")
Signed-off-by: Wei Fang
Signed-off-by: Paolo Abeni
---
 drivers/net/ethernet/freescale/fec_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index c59576ab8c7a..ec9e4bdb0c06 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3836,7 +3836,7 @@ static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
 
 	entries_free = fec_enet_get_free_txdesc_num(txq);
 	if (entries_free < MAX_SKB_FRAGS + 1) {
-		netdev_err(fep->netdev, "NOT enough BD for SG!\n");
+		netdev_err_once(fep->netdev, "NOT enough BD for SG!\n");
 		return -EBUSY;
 	}

From 04499f28b40bfc24f20b0e2331008bb90a54a6cf Mon Sep 17 00:00:00 2001
From: Lu Hongfei
Date: Mon, 10 Jul 2023 11:18:59 +0800
Subject: [PATCH 61/79] net: dsa: Removed unneeded of_node_put in felix_parse_ports_node

Remove the unnecessary of_node_put from the continue path to prevent the child node from being released twice, avoiding a resource leak or other unexpected issues.

Signed-off-by: Lu Hongfei
Reviewed-by: Vladimir Oltean
Fixes: de879a016a94 ("net: dsa: felix: add functionality when not all ports are supported")
Link: https://lore.kernel.org/r/20230710031859.36784-1-luhongfei@vivo.com
Signed-off-by: Paolo Abeni
---
 drivers/net/dsa/ocelot/felix.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index dee43caee19e..8da46d284e35 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -1286,7 +1286,6 @@ static int felix_parse_ports_node(struct felix *felix,
 		if (err < 0) {
 			dev_info(dev, "Unsupported PHY mode %s on port %d\n",
 				 phy_modes(phy_mode), port);
-			of_node_put(child);
 
 			/* Leave port_phy_modes[port] = 0, which is also
 			 * PHY_INTERFACE_MODE_NA. This will perform a

From 8278ee2a2646b9acf747317895e47a640ba933c9 Mon Sep 17 00:00:00 2001
From: Suman Ghosh
Date: Mon, 10 Jul 2023 16:00:27 +0530
Subject: [PATCH 62/79] octeontx2-pf: Add additional check for MCAM rules

Due to a hardware limitation, an MCAM drop rule with ether_type == 802.1Q and vlan_id == 0 is not supported. Hence such rules are now rejected.
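For illustration, a drop rule of the rejected shape might be attempted from user space like this (hypothetical interface name, assuming the flower offload path shown below; it matches TPID 802.1Q without a usable VLAN ID):

    # 802.1Q ether_type with no specific vlan_id, combined with a drop
    # action, is the shape the driver now refuses to offload.
    tc filter add dev eth0 ingress protocol 802.1Q flower \
        vlan_ethtype ipv4 skip_sw action drop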
Fixes: dce677da57c0 ("octeontx2-pf: Add vlan-etype to ntuple filters") Signed-off-by: Suman Ghosh Link: https://lore.kernel.org/r/20230710103027.2244139-1-sumang@marvell.com Signed-off-by: Paolo Abeni --- .../ethernet/marvell/octeontx2/nic/otx2_flows.c | 8 ++++++++ .../net/ethernet/marvell/octeontx2/nic/otx2_tc.c | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c index 10e11262d48a..2d7713a1a153 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c @@ -872,6 +872,14 @@ static int otx2_prepare_flow_request(struct ethtool_rx_flow_spec *fsp, return -EINVAL; vlan_etype = be16_to_cpu(fsp->h_ext.vlan_etype); + + /* Drop rule with vlan_etype == 802.1Q + * and vlan_id == 0 is not supported + */ + if (vlan_etype == ETH_P_8021Q && !fsp->m_ext.vlan_tci && + fsp->ring_cookie == RX_CLS_FLOW_DISC) + return -EINVAL; + /* Only ETH_P_8021Q and ETH_P_802AD types supported */ if (vlan_etype != ETH_P_8021Q && vlan_etype != ETH_P_8021AD) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c index 8a13df592af6..5e56b6c3e60a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c @@ -597,6 +597,21 @@ static int otx2_tc_prepare_flow(struct otx2_nic *nic, struct otx2_tc_flow *node, return -EOPNOTSUPP; } + if (!match.mask->vlan_id) { + struct flow_action_entry *act; + int i; + + flow_action_for_each(i, act, &rule->action) { + if (act->id == FLOW_ACTION_DROP) { + netdev_err(nic->netdev, + "vlan tpid 0x%x with vlan_id %d is not supported for DROP rule.\n", + ntohs(match.key->vlan_tpid), + match.key->vlan_id); + return -EOPNOTSUPP; + } + } + } + if (match.mask->vlan_id || match.mask->vlan_dei || match.mask->vlan_priority) { From 9373771aaed17f5c2c38485f785568abe3a9f8c1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 9 Jul 2023 06:31:54 -0700 Subject: [PATCH 63/79] wifi: airo: avoid uninitialized warning in airo_get_rate() Quieten a gcc (11.3.0) build error or warning by checking the function call status and returning -EBUSY if the function call failed. This is similar to what several other wireless drivers do for the SIOCGIWRATE ioctl call when there is a locking problem. 
drivers/net/wireless/cisco/airo.c: error: 'status_rid.currentXmitRate' is used uninitialized [-Werror=uninitialized] Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Randy Dunlap Reported-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/39abf2c7-24a-f167-91da-ed4c5435d1c4@linux-m68k.org Link: https://lore.kernel.org/r/20230709133154.26206-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- drivers/net/wireless/cisco/airo.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c index 7c4cc5f5e1eb..dbd13f7aa3e6 100644 --- a/drivers/net/wireless/cisco/airo.c +++ b/drivers/net/wireless/cisco/airo.c @@ -6157,8 +6157,11 @@ static int airo_get_rate(struct net_device *dev, struct iw_param *vwrq = &wrqu->bitrate; struct airo_info *local = dev->ml_priv; StatusRid status_rid; /* Card status info */ + int ret; - readStatusRid(local, &status_rid, 1); + ret = readStatusRid(local, &status_rid, 1); + if (ret) + return -EBUSY; vwrq->value = le16_to_cpu(status_rid.currentXmitRate) * 500000; /* If more than one rate, set auto */ From 4369016497319a9635702da010d02af1ebb1849d Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Tue, 11 Jul 2023 19:58:48 +0800 Subject: [PATCH 64/79] bpf: cpumap: Fix memory leak in cpu_map_update_elem Syzkaller reported a memory leak as follows: BUG: memory leak unreferenced object 0xff110001198ef748 (size 192): comm "syz-executor.3", pid 17672, jiffies 4298118891 (age 9.906s) hex dump (first 32 bytes): 00 00 00 00 4a 19 00 00 80 ad e3 e4 fe ff c0 00 ....J........... 00 b2 d3 0c 01 00 11 ff 28 f5 8e 19 01 00 11 ff ........(....... backtrace: [] __cpu_map_entry_alloc+0xf7/0xb00 [] cpu_map_update_elem+0x2fe/0x3d0 [] bpf_map_update_value.isra.0+0x2bd/0x520 [] map_update_elem+0x4cb/0x720 [] __se_sys_bpf+0x8c3/0xb90 [] do_syscall_64+0x30/0x40 [] entry_SYSCALL_64_after_hwframe+0x61/0xc6 BUG: memory leak unreferenced object 0xff110001198ef528 (size 192): comm "syz-executor.3", pid 17672, jiffies 4298118891 (age 9.906s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] __cpu_map_entry_alloc+0x260/0xb00 [] cpu_map_update_elem+0x2fe/0x3d0 [] bpf_map_update_value.isra.0+0x2bd/0x520 [] map_update_elem+0x4cb/0x720 [] __se_sys_bpf+0x8c3/0xb90 [] do_syscall_64+0x30/0x40 [] entry_SYSCALL_64_after_hwframe+0x61/0xc6 BUG: memory leak unreferenced object 0xff1100010fd93d68 (size 8): comm "syz-executor.3", pid 17672, jiffies 4298118891 (age 9.906s) hex dump (first 8 bytes): 00 00 00 00 00 00 00 00 ........ backtrace: [] kvmalloc_node+0x11e/0x170 [] __cpu_map_entry_alloc+0x2f0/0xb00 [] cpu_map_update_elem+0x2fe/0x3d0 [] bpf_map_update_value.isra.0+0x2bd/0x520 [] map_update_elem+0x4cb/0x720 [] __se_sys_bpf+0x8c3/0xb90 [] do_syscall_64+0x30/0x40 [] entry_SYSCALL_64_after_hwframe+0x61/0xc6 In the cpu_map_update_elem flow, when kthread_stop is called before calling the threadfn of rcpu->kthread, since the KTHREAD_SHOULD_STOP bit of kthread has been set by kthread_stop, the threadfn of rcpu->kthread will never be executed, and rcpu->refcnt will never be 0, which will lead to the allocated rcpu, rcpu->queue and rcpu->queue->queue cannot be released. Calling kthread_stop before executing kthread's threadfn will return -EINTR. We can complete the release of memory resources in this state. 
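The kthread behavior underlying the fix can be sketched like this (a minimal hypothetical example; example_stop, example_cleanup and data are illustrative names, not cpumap code):

	#include <linux/kthread.h>

	static void example_stop(struct task_struct *t, void *data)
	{
		int err;

		/* kthread_stop() sets KTHREAD_SHOULD_STOP and wakes the task.
		 * If the stop request is observed before the thread function
		 * ever runs, kthread() returns -EINTR without calling it, so
		 * any cleanup the thread function would have done on exit
		 * must be performed by the stopping side instead.
		 */
		err = kthread_stop(t);
		if (err == -EINTR)
			example_cleanup(data);	/* hypothetical helper */
	}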
Fixes: 6710e1126934 ("bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP")
Signed-off-by: Pu Lehui
Acked-by: Jesper Dangaard Brouer
Acked-by: Hou Tao
Link: https://lore.kernel.org/r/20230711115848.2701559-1-pulehui@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/cpumap.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8a33e8747a0e..6ae02be7a48e 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -122,22 +122,6 @@ static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 	atomic_inc(&rcpu->refcnt);
 }
 
-/* called from workqueue, to workaround syscall using preempt_disable */
-static void cpu_map_kthread_stop(struct work_struct *work)
-{
-	struct bpf_cpu_map_entry *rcpu;
-
-	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
-
-	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
-	 * as it waits until all in-flight call_rcu() callbacks complete.
-	 */
-	rcu_barrier();
-
-	/* kthread_stop will wake_up_process and wait for it to complete */
-	kthread_stop(rcpu->kthread);
-}
-
 static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 {
 	/* The tear-down procedure should have made sure that queue is
@@ -165,6 +149,30 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 	}
 }
 
+/* called from workqueue, to workaround syscall using preempt_disable */
+static void cpu_map_kthread_stop(struct work_struct *work)
+{
+	struct bpf_cpu_map_entry *rcpu;
+	int err;
+
+	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+
+	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
+	 * as it waits until all in-flight call_rcu() callbacks complete.
+	 */
+	rcu_barrier();
+
+	/* kthread_stop will wake_up_process and wait for it to complete */
+	err = kthread_stop(rcpu->kthread);
+	if (err) {
+		/* kthread_stop may be called before cpu_map_kthread_run
+		 * is executed, so we need to release the memory related
+		 * to rcpu.
+		 */
+		put_cpu_map_entry(rcpu);
+	}
+}
+
 static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
 				     struct list_head *listp,
 				     struct xdp_cpumap_stats *stats)

From 2e06c57d66d3f6c26faa5f5b479fb3add34ce85a Mon Sep 17 00:00:00 2001
From: Larysa Zaremba
Date: Tue, 11 Jul 2023 12:59:26 +0200
Subject: [PATCH 65/79] xdp: use trusted arguments in XDP hints kfuncs

Currently, the verifier does not reject XDP programs that pass a NULL pointer to hints functions. At the same time, this case is not handled in any driver implementation (including veth). For example, changing

	bpf_xdp_metadata_rx_timestamp(ctx, &timestamp);

to

	bpf_xdp_metadata_rx_timestamp(ctx, NULL);

in the xdp_metadata test successfully crashes the system.

Add the KF_TRUSTED_ARGS flag to the hints kfunc definitions, so driver code does not have to worry about getting invalid pointers.
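From the BPF program side, the class of call the verifier will now refuse to load might look like this (a sketch using the kfunc signature from net/core/xdp.c; program and section names are illustrative, and vmlinux.h plus bpf/bpf_helpers.h are assumed):

	extern int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx,
						 __u64 *timestamp) __ksym;

	SEC("xdp")
	int rx_prog(struct xdp_md *ctx)
	{
		__u64 ts;

		bpf_xdp_metadata_rx_timestamp(ctx, &ts);	/* accepted */
		bpf_xdp_metadata_rx_timestamp(ctx, NULL);	/* rejected with KF_TRUSTED_ARGS */
		return XDP_PASS;
	}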
Fixes: 3d76a4d3d4e5 ("bpf: XDP metadata RX kfuncs") Reported-by: Stanislav Fomichev Closes: https://lore.kernel.org/bpf/ZKWo0BbpLfkZHbyE@google.com/ Signed-off-by: Larysa Zaremba Acked-by: Jesper Dangaard Brouer Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20230711105930.29170-1-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- net/core/xdp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 41e5ca8643ec..8362130bf085 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash, __diag_pop(); BTF_SET8_START(xdp_metadata_kfunc_ids) -#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, 0) +#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_SET8_END(xdp_metadata_kfunc_ids) From 12a89f0177092dbc2a1cb1d05a9790adbcea2309 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 10 Jul 2023 16:50:39 +0200 Subject: [PATCH 66/79] wifi: iwlwifi: remove 'use_tfh' config to fix crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is equivalent to 'gen2', and it was always confusing to have two identical config entries. The split config patch actually had been originally developed after removing 'use_tfh" and didn't add the use_tfh in the new configs as they'd later been copied to the new files. Thus the easiest way to fix the init crash here now is to just remove use_tfh (which is erroneously unset in most of the configs now) and use 'gen2' in the code instead. There's possibly still an unwind error in iwl_txq_gen2_init() as it crashes if TXQ 0 fails to initialize, but we can deal with it later since the original failure is due to the use_tfh confusion. 
Tested-by: Xi Ruoyao Reported-and-tested-by: Niklāvs Koļesņikovs Reported-and-tested-by: Jeff Chua Reported-and-tested-by: Zhang Rui Link: https://bugzilla.kernel.org/show_bug.cgi?id=217622 Link: https://lore.kernel.org/all/9274d9bd3d080a457649ff5addcc1726f08ef5b2.camel@xry111.site/ Link: https://lore.kernel.org/all/CAAJw_Zug6VCS5ZqTWaFSr9sd85k%3DtyPm9DEE%2BmV%3DAKoECZM%2BsQ@mail.gmail.com/ Fixes: 19898ce9cf8a ("wifi: iwlwifi: split 22000.c into multiple files") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20230710145038.84186-2-johannes@sipsolutions.net Signed-off-by: Jakub Kicinski --- drivers/net/wireless/intel/iwlwifi/cfg/22000.c | 5 ----- drivers/net/wireless/intel/iwlwifi/iwl-config.h | 2 -- drivers/net/wireless/intel/iwlwifi/iwl-fh.h | 4 ++-- drivers/net/wireless/intel/iwlwifi/iwl-trans.c | 6 +++--- drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 2 +- drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 4 ++-- drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 2 +- drivers/net/wireless/intel/iwlwifi/queue/tx.c | 10 +++++----- drivers/net/wireless/intel/iwlwifi/queue/tx.h | 8 ++++---- 9 files changed, 18 insertions(+), 25 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c index aa4320ca4c30..d594694206b3 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c @@ -84,7 +84,6 @@ const struct iwl_ht_params iwl_22000_ht_params = { .mac_addr_from_csr = 0x380, \ .ht_params = &iwl_22000_ht_params, \ .nvm_ver = IWL_22000_NVM_VERSION, \ - .trans.use_tfh = true, \ .trans.rf_id = true, \ .trans.gen2 = true, \ .nvm_type = IWL_NVM_EXT, \ @@ -122,7 +121,6 @@ const struct iwl_ht_params iwl_22000_ht_params = { const struct iwl_cfg_trans_params iwl_qu_trans_cfg = { .mq_rx_supported = true, - .use_tfh = true, .rf_id = true, .gen2 = true, .device_family = IWL_DEVICE_FAMILY_22000, @@ -134,7 +132,6 @@ const struct iwl_cfg_trans_params iwl_qu_trans_cfg = { const struct iwl_cfg_trans_params iwl_qu_medium_latency_trans_cfg = { .mq_rx_supported = true, - .use_tfh = true, .rf_id = true, .gen2 = true, .device_family = IWL_DEVICE_FAMILY_22000, @@ -146,7 +143,6 @@ const struct iwl_cfg_trans_params iwl_qu_medium_latency_trans_cfg = { const struct iwl_cfg_trans_params iwl_qu_long_latency_trans_cfg = { .mq_rx_supported = true, - .use_tfh = true, .rf_id = true, .gen2 = true, .device_family = IWL_DEVICE_FAMILY_22000, @@ -200,7 +196,6 @@ const struct iwl_cfg_trans_params iwl_ax200_trans_cfg = { .device_family = IWL_DEVICE_FAMILY_22000, .base_params = &iwl_22000_base_params, .mq_rx_supported = true, - .use_tfh = true, .rf_id = true, .gen2 = true, .bisr_workaround = 1, diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h index 742096c5a36a..241a9e3f2a1a 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h @@ -256,7 +256,6 @@ enum iwl_cfg_trans_ltr_delay { * @xtal_latency: power up latency to get the xtal stabilized * @extra_phy_cfg_flags: extra configuration flags to pass to the PHY * @rf_id: need to read rf_id to determine the firmware image - * @use_tfh: use TFH * @gen2: 22000 and on transport operation * @mq_rx_supported: multi-queue rx support * @integrated: discrete or integrated @@ -271,7 +270,6 @@ struct iwl_cfg_trans_params { u32 xtal_latency; u32 extra_phy_cfg_flags; u32 rf_id:1, - use_tfh:1, gen2:1, mq_rx_supported:1, integrated:1, diff --git 
a/drivers/net/wireless/intel/iwlwifi/iwl-fh.h b/drivers/net/wireless/intel/iwlwifi/iwl-fh.h index bedd78a47f67..4e4a60ddf9b2 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-fh.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-fh.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ /* - * Copyright (C) 2005-2014, 2018-2021 Intel Corporation + * Copyright (C) 2005-2014, 2018-2021, 2023 Intel Corporation * Copyright (C) 2015-2017 Intel Deutschland GmbH */ #ifndef __iwl_fh_h__ @@ -71,7 +71,7 @@ static inline unsigned int FH_MEM_CBBC_QUEUE(struct iwl_trans *trans, unsigned int chnl) { - if (trans->trans_cfg->use_tfh) { + if (trans->trans_cfg->gen2) { WARN_ON_ONCE(chnl >= 64); return TFH_TFDQ_CBB_TABLE + 8 * chnl; } diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c index b1af9359cea5..4bd759432d44 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c @@ -2,7 +2,7 @@ /* * Copyright (C) 2015 Intel Mobile Communications GmbH * Copyright (C) 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2019-2021 Intel Corporation + * Copyright (C) 2019-2021, 2023 Intel Corporation */ #include #include @@ -42,7 +42,7 @@ struct iwl_trans *iwl_trans_alloc(unsigned int priv_size, WARN_ON(!ops->wait_txq_empty && !ops->wait_tx_queues_empty); - if (trans->trans_cfg->use_tfh) { + if (trans->trans_cfg->gen2) { trans->txqs.tfd.addr_size = 64; trans->txqs.tfd.max_tbs = IWL_TFH_NUM_TBS; trans->txqs.tfd.size = sizeof(struct iwl_tfh_tfd); @@ -101,7 +101,7 @@ int iwl_trans_init(struct iwl_trans *trans) /* Some things must not change even if the config does */ WARN_ON(trans->txqs.tfd.addr_size != - (trans->trans_cfg->use_tfh ? 64 : 36)); + (trans->trans_cfg->gen2 ? 
64 : 36)); snprintf(trans->dev_cmd_pool_name, sizeof(trans->dev_cmd_pool_name), "iwl_cmd_pool:%s", dev_name(trans->dev)); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index b83df0631279..b18c91c5dd5d 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -1450,7 +1450,7 @@ static inline bool iwl_mvm_has_new_station_api(const struct iwl_fw *fw) static inline bool iwl_mvm_has_new_tx_api(struct iwl_mvm *mvm) { /* TODO - replace with TLV once defined */ - return mvm->trans->trans_cfg->use_tfh; + return mvm->trans->trans_cfg->gen2; } static inline bool iwl_mvm_has_unified_ucode(struct iwl_mvm *mvm) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index eacbbdbffb5e..3e988da44973 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -819,7 +819,7 @@ static int iwl_pcie_load_cpu_sections_8000(struct iwl_trans *trans, iwl_enable_interrupts(trans); - if (trans->trans_cfg->use_tfh) { + if (trans->trans_cfg->gen2) { if (cpu == 1) iwl_write_prph(trans, UREG_UCODE_LOAD_STATUS, 0xFFFF); @@ -3394,7 +3394,7 @@ iwl_trans_pcie_dump_data(struct iwl_trans *trans, u8 tfdidx; u32 caplen, cmdlen; - if (trans->trans_cfg->use_tfh) + if (trans->trans_cfg->gen2) tfdidx = idx; else tfdidx = ptr; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c index 1337fa95f657..790e5b124740 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c @@ -364,7 +364,7 @@ void iwl_trans_pcie_tx_reset(struct iwl_trans *trans) for (txq_id = 0; txq_id < trans->trans_cfg->base_params->num_of_queues; txq_id++) { struct iwl_txq *txq = trans->txqs.txq[txq_id]; - if (trans->trans_cfg->use_tfh) + if (trans->trans_cfg->gen2) iwl_write_direct64(trans, FH_MEM_CBBC_QUEUE(trans, txq_id), txq->dma_addr); diff --git a/drivers/net/wireless/intel/iwlwifi/queue/tx.c b/drivers/net/wireless/intel/iwlwifi/queue/tx.c index fbacbe9ada15..5bb3cc3367c9 100644 --- a/drivers/net/wireless/intel/iwlwifi/queue/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/queue/tx.c @@ -985,7 +985,7 @@ void iwl_txq_log_scd_error(struct iwl_trans *trans, struct iwl_txq *txq) bool active; u8 fifo; - if (trans->trans_cfg->use_tfh) { + if (trans->trans_cfg->gen2) { IWL_ERR(trans, "Queue %d is stuck %d %d\n", txq_id, txq->read_ptr, txq->write_ptr); /* TODO: access new SCD registers and dump them */ @@ -1040,7 +1040,7 @@ int iwl_txq_alloc(struct iwl_trans *trans, struct iwl_txq *txq, int slots_num, if (WARN_ON(txq->entries || txq->tfds)) return -EINVAL; - if (trans->trans_cfg->use_tfh) + if (trans->trans_cfg->gen2) tfd_sz = trans->txqs.tfd.size * slots_num; timer_setup(&txq->stuck_timer, iwl_txq_stuck_timer, 0); @@ -1347,7 +1347,7 @@ static inline dma_addr_t iwl_txq_gen1_tfd_tb_get_addr(struct iwl_trans *trans, dma_addr_t addr; dma_addr_t hi_len; - if (trans->trans_cfg->use_tfh) { + if (trans->trans_cfg->gen2) { struct iwl_tfh_tfd *tfh_tfd = _tfd; struct iwl_tfh_tb *tfh_tb = &tfh_tfd->tbs[idx]; @@ -1408,7 +1408,7 @@ void iwl_txq_gen1_tfd_unmap(struct iwl_trans *trans, meta->tbs = 0; - if (trans->trans_cfg->use_tfh) { + if (trans->trans_cfg->gen2) { struct iwl_tfh_tfd *tfd_fh = (void *)tfd; tfd_fh->num_tbs = 0; @@ -1625,7 +1625,7 @@ void iwl_txq_reclaim(struct iwl_trans *trans, int txq_id, int ssn, txq->entries[read_ptr].skb = NULL; - if 
(!trans->trans_cfg->use_tfh) + if (!trans->trans_cfg->gen2) iwl_txq_gen1_inval_byte_cnt_tbl(trans, txq); iwl_txq_free_tfd(trans, txq); diff --git a/drivers/net/wireless/intel/iwlwifi/queue/tx.h b/drivers/net/wireless/intel/iwlwifi/queue/tx.h index eca53bfd326d..1e4a24ab9bab 100644 --- a/drivers/net/wireless/intel/iwlwifi/queue/tx.h +++ b/drivers/net/wireless/intel/iwlwifi/queue/tx.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation */ #ifndef __iwl_trans_queue_tx_h__ #define __iwl_trans_queue_tx_h__ @@ -38,7 +38,7 @@ static inline void iwl_wake_queue(struct iwl_trans *trans, static inline void *iwl_txq_get_tfd(struct iwl_trans *trans, struct iwl_txq *txq, int idx) { - if (trans->trans_cfg->use_tfh) + if (trans->trans_cfg->gen2) idx = iwl_txq_get_cmd_index(txq, idx); return (u8 *)txq->tfds + trans->txqs.tfd.size * idx; @@ -135,7 +135,7 @@ static inline u8 iwl_txq_gen1_tfd_get_num_tbs(struct iwl_trans *trans, { struct iwl_tfd *tfd; - if (trans->trans_cfg->use_tfh) { + if (trans->trans_cfg->gen2) { struct iwl_tfh_tfd *tfh_tfd = _tfd; return le16_to_cpu(tfh_tfd->num_tbs) & 0x1f; @@ -151,7 +151,7 @@ static inline u16 iwl_txq_gen1_tfd_tb_get_len(struct iwl_trans *trans, struct iwl_tfd *tfd; struct iwl_tfd_tb *tb; - if (trans->trans_cfg->use_tfh) { + if (trans->trans_cfg->gen2) { struct iwl_tfh_tfd *tfh_tfd = _tfd; struct iwl_tfh_tb *tfh_tb = &tfh_tfd->tbs[idx]; From cf28792facaa9c1c4f5a246d6a364761f7835870 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 10 Jul 2023 10:46:36 -0700 Subject: [PATCH 67/79] docs: netdev: update the URL of the status page Move the status page from vger to the same server as mailbot. Link: https://lore.kernel.org/r/20230710174636.1174684-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/process/maintainer-netdev.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/maintainer-netdev.rst b/Documentation/process/maintainer-netdev.rst index 2397b31c0198..2ab843cde830 100644 --- a/Documentation/process/maintainer-netdev.rst +++ b/Documentation/process/maintainer-netdev.rst @@ -98,7 +98,7 @@ If you aren't subscribed to netdev and/or are simply unsure if repository link above for any new networking-related commits. You may also check the following website for the current status: - http://vger.kernel.org/~davem/net-next.html + https://patchwork.hopto.org/net-next.html The ``net`` tree continues to collect fixes for the vX.Y content, and is fed back to Linus at regular (~weekly) intervals. Meaning that the From e522c1bd0ab4f645885a3eef4e1dd920cc9ac3b6 Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Mon, 10 Jul 2023 14:50:57 -0500 Subject: [PATCH 68/79] MAINTAINERS: Add another mailing list for QUALCOMM ETHQOS ETHERNET DRIVER linux-arm-msm is the list most people subscribe to in order to receive updates about Qualcomm related drivers. Make sure changes for the Qualcomm ethernet driver make it there. 
Signed-off-by: Andrew Halaney Acked-by: Vinod Koul Link: https://lore.kernel.org/r/20230710195240.197047-1-ahalaney@redhat.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 99f18f6e8bc6..aef3a391ec6c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17600,6 +17600,7 @@ QUALCOMM ETHQOS ETHERNET DRIVER M: Vinod Koul R: Bhupesh Sharma L: netdev@vger.kernel.org +L: linux-arm-msm@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/qcom,ethqos.yaml F: drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c From d3f87278bcb80bd7f9519669d928b43320363d4f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 11 Jul 2023 10:08:09 +0300 Subject: [PATCH 69/79] net/sched: flower: Ensure both minimum and maximum ports are specified The kernel does not currently validate that both the minimum and maximum ports of a port range are specified. This can lead user space to think that a filter matching on a port range was successfully added, when in fact it was not. For example, with a patched (buggy) iproute2 that only sends the minimum port, the following commands do not return an error: # tc filter add dev swp1 ingress pref 1 proto ip flower ip_proto udp src_port 100-200 action pass # tc filter add dev swp1 ingress pref 1 proto ip flower ip_proto udp dst_port 100-200 action pass # tc filter show dev swp1 ingress filter protocol ip pref 1 flower chain 0 filter protocol ip pref 1 flower chain 0 handle 0x1 eth_type ipv4 ip_proto udp not_in_hw action order 1: gact action pass random type none pass val 0 index 1 ref 1 bind 1 filter protocol ip pref 1 flower chain 0 handle 0x2 eth_type ipv4 ip_proto udp not_in_hw action order 1: gact action pass random type none pass val 0 index 2 ref 1 bind 1 Fix by returning an error unless both ports are specified: # tc filter add dev swp1 ingress pref 1 proto ip flower ip_proto udp src_port 100-200 action pass Error: Both min and max source ports must be specified. We have an error talking to the kernel # tc filter add dev swp1 ingress pref 1 proto ip flower ip_proto udp dst_port 100-200 action pass Error: Both min and max destination ports must be specified. We have an error talking to the kernel Fixes: 5c72299fba9d ("net: sched: cls_flower: Classify packets using port ranges") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/cls_flower.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 56065cc5a661..f2b0bc4142fe 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -812,6 +812,16 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_range.tp_max.src, TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src)); + if (mask->tp_range.tp_min.dst != mask->tp_range.tp_max.dst) { + NL_SET_ERR_MSG(extack, + "Both min and max destination ports must be specified"); + return -EINVAL; + } + if (mask->tp_range.tp_min.src != mask->tp_range.tp_max.src) { + NL_SET_ERR_MSG(extack, + "Both min and max source ports must be specified"); + return -EINVAL; + } if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst && ntohs(key->tp_range.tp_max.dst) <= ntohs(key->tp_range.tp_min.dst)) { From f72207a5c0dbaaf6921cf9a6c0d2fd0bc249ea78 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 11 Jul 2023 11:52:26 +0300 Subject: [PATCH 70/79] netdevsim: fix uninitialized data in nsim_dev_trap_fa_cookie_write() The simple_write_to_buffer() function is designed to handle partial writes. It returns negatives on error, otherwise it returns the number of bytes that were able to be copied. This code doesn't check the return properly. We only know that the first byte is written; the rest of the buffer might be uninitialized. There is no need to use the simple_write_to_buffer() function. Partial writes are prohibited by the "if (*ppos != 0)" check at the start of the function. Just use memdup_user() and copy the whole buffer. Fixes: d3cbb907ae57 ("netdevsim: add ACL trap reporting cookie as a metadata") Signed-off-by: Dan Carpenter Reviewed-by: Pavan Chebbi Reviewed-by: Ido Schimmel Link: https://lore.kernel.org/r/7c1f950b-3a7d-4252-82a6-876e53078ef7@moroto.mountain Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/dev.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 6045bece2654..b4d3b9cde8bd 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -184,13 +184,10 @@ static ssize_t nsim_dev_trap_fa_cookie_write(struct file *file, cookie_len = (count - 1) / 2; if ((count - 1) % 2) return -EINVAL; - buf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!buf) - return -ENOMEM; - ret = simple_write_to_buffer(buf, count, ppos, data, count); - if (ret < 0) - goto free_buf; + buf = memdup_user(data, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); fa_cookie = kmalloc(sizeof(*fa_cookie) + cookie_len, GFP_KERNEL | __GFP_NOWARN); From 1e9cb763e9bacf0c932aa948f50dcfca6f519a26 Mon Sep 17 00:00:00 2001 From: Krister Johansen Date: Mon, 10 Jul 2023 18:36:21 -0700 Subject: [PATCH 71/79] net: ena: fix shift-out-of-bounds in exponential backoff The ENA adapters on our instances occasionally reset. One recently logged a UBSAN failure to the console in the process: UBSAN: shift-out-of-bounds in build/linux/drivers/net/ethernet/amazon/ena/ena_com.c:540:13 shift exponent 32 is too large for 32-bit type 'unsigned int' CPU: 28 PID: 70012 Comm: kworker/u72:2 Kdump: loaded not tainted 5.15.117 Hardware name: Amazon EC2 c5d.9xlarge/, BIOS 1.0 10/16/2017 Workqueue: ena ena_fw_reset_device [ena] Call Trace: dump_stack_lvl+0x4a/0x63 dump_stack+0x10/0x16 ubsan_epilogue+0x9/0x36 __ubsan_handle_shift_out_of_bounds.cold+0x61/0x10e ? 
__const_udelay+0x43/0x50 ena_delay_exponential_backoff_us.cold+0x16/0x1e [ena] wait_for_reset_state+0x54/0xa0 [ena] ena_com_dev_reset+0xc8/0x110 [ena] ena_down+0x3fe/0x480 [ena] ena_destroy_device+0xeb/0xf0 [ena] ena_fw_reset_device+0x30/0x50 [ena] process_one_work+0x22b/0x3d0 worker_thread+0x4d/0x3f0 ? process_one_work+0x3d0/0x3d0 kthread+0x12a/0x150 ? set_kthread_struct+0x50/0x50 ret_from_fork+0x22/0x30 Apparently, the reset delays are getting so large they can trigger a UBSAN panic. Looking at the code, the current timeout is capped at 5000us. Using a base value of 100us, the current code will overflow after (1<<29). Even at values before 32, this function wraps around, perhaps unintentionally. Cap the value of the exponent used for this backoff at (1<<16), which is larger than currently necessary, but large enough to support bigger values in the future. Cc: stable@vger.kernel.org Fixes: 4bb7f4cf60e3 ("net: ena: reduce driver load time") Signed-off-by: Krister Johansen Reviewed-by: Leon Romanovsky Reviewed-by: Shay Agroskin Link: https://lore.kernel.org/r/20230711013621.GE1926@templeofstupid.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_com.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 451c3a1b6255..633b321d7fdd 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -35,6 +35,8 @@ #define ENA_REGS_ADMIN_INTR_MASK 1 +#define ENA_MAX_BACKOFF_DELAY_EXP 16U + #define ENA_MIN_ADMIN_POLL_US 100 #define ENA_MAX_ADMIN_POLL_US 5000 @@ -536,6 +538,7 @@ static int ena_com_comp_status_to_errno(struct ena_com_admin_queue *admin_queue, static void ena_delay_exponential_backoff_us(u32 exp, u32 delay_us) { + exp = min_t(u32, exp, ENA_MAX_BACKOFF_DELAY_EXP); delay_us = max_t(u32, ENA_MIN_ADMIN_POLL_US, delay_us); delay_us = min_t(u32, delay_us * (1U << exp), ENA_MAX_ADMIN_POLL_US); usleep_range(delay_us, 2 * delay_us); From 150e33e62c1fa4af5aaab02776b6c3812711d478 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Mon, 10 Jul 2023 23:16:34 -0300 Subject: [PATCH 72/79] net/sched: make psched_mtu() RTNL-less safe Eric Dumazet says[1]: ------- Speaking of psched_mtu(), I see that net/sched/sch_pie.c is using it without holding RTNL, so dev->mtu can be changed underneath. KCSAN could issue a warning. ------- Annotate dev->mtu with READ_ONCE() so KCSAN doesn't issue a warning. 
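For illustration, a minimal userspace sketch of the pattern (not kernel source: READ_ONCE() is simplified here to a volatile access, and the struct is a stand-in for struct net_device):

  #include <stdio.h>

  /* Simplified stand-in for the kernel's READ_ONCE(): force a single,
   * non-fused load so the compiler cannot re-read or tear the access. */
  #define READ_ONCE(x) (*(const volatile typeof(x) *)&(x))

  struct fake_net_device {
          unsigned int mtu;               /* may be written concurrently */
          unsigned short hard_header_len;
  };

  /* Lockless reader: with the annotation, a concurrent MTU update is a
   * documented, intentional race rather than one KCSAN reports. */
  static unsigned int psched_mtu_sketch(const struct fake_net_device *dev)
  {
          return READ_ONCE(dev->mtu) + dev->hard_header_len;
  }

  int main(void)
  {
          struct fake_net_device dev = { .mtu = 1500, .hard_header_len = 14 };

          printf("%u\n", psched_mtu_sketch(&dev));        /* 1514 */
          return 0;
  }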
[1] https://lore.kernel.org/all/CANn89iJoJO5VtaJ-2=_d2aOQhb0Xw8iBT_Cxqp2HyuS-zj6azw@mail.gmail.com/ v1 -> v2: Fix commit message Fixes: d4b36210c2e6 ("net: pkt_sched: PIE AQM scheme") Suggested-by: Eric Dumazet Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230711021634.561598-1-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- include/net/pkt_sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index e98aac9d5ad5..15960564e0c3 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -134,7 +134,7 @@ extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; */ static inline unsigned int psched_mtu(const struct net_device *dev) { - return dev->mtu + dev->hard_header_len; + return READ_ONCE(dev->mtu) + dev->hard_header_len; } static inline struct net *qdisc_net(struct Qdisc *q) From aa846677a9fb19a0f2c58154c140398aa92a87ba Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Tue, 11 Jul 2023 14:34:14 +0800 Subject: [PATCH 73/79] net: txgbe: fix eeprom calculation error For some device types like TXGBE_ID_XAUI, *checksum computed in txgbe_calc_eeprom_checksum() is larger than TXGBE_EEPROM_SUM. Remove the limit on the size of *checksum. Fixes: 049fe5365324 ("net: txgbe: Add operations to interact with firmware") Fixes: 5e2ea7801fac ("net: txgbe: Fix unsigned comparison to zero in txgbe_calc_eeprom_checksum()") Signed-off-by: Jiawen Wu Link: https://lore.kernel.org/r/20230711063414.3311-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c index 12405d71c5ee..0772eb14eabf 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c @@ -186,9 +186,6 @@ static int txgbe_calc_eeprom_checksum(struct wx *wx, u16 *checksum) if (eeprom_ptrs) kvfree(eeprom_ptrs); - if (*checksum > TXGBE_EEPROM_SUM) - return -EINVAL; - *checksum = TXGBE_EEPROM_SUM - *checksum; return 0; From 4f4626cd049576af1276c7568d5b44eb3f7bb1b1 Mon Sep 17 00:00:00 2001 From: Zhang Shurong Date: Thu, 6 Jul 2023 10:45:00 +0800 Subject: [PATCH 74/79] wifi: rtw89: debug: fix error code in rtw89_debug_priv_send_h2c_set() If there is a failure during rtw89_fw_h2c_raw(), rtw89_debug_priv_send_h2c_set() should return a negative error code instead of the positive value count. Fix this bug by returning the correct error code. 
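For context, a tiny userspace model of the write-handler convention being restored (illustrative only, not rtw89 code; do_work() is an invented helper):

  #include <errno.h>
  #include <stdio.h>

  /* Hypothetical helper: 0 on success, negative errno on failure. */
  static int do_work(const char *buf, size_t count)
  {
          return count ? 0 : -EINVAL;
  }

  /* The convention: claim 'count' bytes consumed only on success,
   * otherwise propagate the helper's negative error code. */
  static long example_write(const char *buf, size_t count)
  {
          int ret = do_work(buf, count);

          return ret ? ret : (long)count;
  }

  int main(void)
  {
          printf("%ld\n", example_write("abc", 3));       /* 3 */
          printf("%ld\n", example_write(NULL, 0));        /* -22 (-EINVAL) */
          return 0;
  }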
Fixes: e3ec7017f6a2 ("rtw89: add Realtek 802.11ax driver") Signed-off-by: Zhang Shurong Acked-by: Ping-Ke Shih Link: https://lore.kernel.org/r/tencent_AD09A61BC4DA92AD1EB0790F5C850E544D07@qq.com Signed-off-by: Jakub Kicinski --- drivers/net/wireless/realtek/rtw89/debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/debug.c b/drivers/net/wireless/realtek/rtw89/debug.c index 1db2d59d33ff..a4bbac916e22 100644 --- a/drivers/net/wireless/realtek/rtw89/debug.c +++ b/drivers/net/wireless/realtek/rtw89/debug.c @@ -3026,17 +3026,18 @@ static ssize_t rtw89_debug_priv_send_h2c_set(struct file *filp, struct rtw89_debugfs_priv *debugfs_priv = filp->private_data; struct rtw89_dev *rtwdev = debugfs_priv->rtwdev; u8 *h2c; + int ret; u16 h2c_len = count / 2; h2c = rtw89_hex2bin_user(rtwdev, user_buf, count); if (IS_ERR(h2c)) return -EFAULT; - rtw89_fw_h2c_raw(rtwdev, h2c, h2c_len); + ret = rtw89_fw_h2c_raw(rtwdev, h2c, h2c_len); kfree(h2c); - return count; + return ret ? ret : count; } static int From fec3ebb5ed299ac3a998f011c380f2ded47f4866 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 11 Jul 2023 13:50:52 +0200 Subject: [PATCH 75/79] wifi: cfg80211: fix receiving mesh packets without RFC1042 header Fix the Ethernet header length field after stripping the mesh header. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/CT5GNZSK28AI.2K6M69OXM9RW5@syracuse/ Fixes: 986e43b19ae9 ("wifi: mac80211: fix receiving A-MSDU frames on mesh interfaces") Reported-and-tested-by: Nicolas Escande Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20230711115052.68430-1-nbd@nbd.name Signed-off-by: Jakub Kicinski --- net/wireless/util.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/wireless/util.c b/net/wireless/util.c index 89c9ad6c886e..1783ab9d57a3 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -580,6 +580,8 @@ int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb) hdrlen += ETH_ALEN + 2; else if (!pskb_may_pull(skb, hdrlen)) return -EINVAL; + else + payload.eth.h_proto = htons(skb->len - hdrlen); mesh_addr = skb->data + sizeof(payload.eth) + ETH_ALEN; switch (payload.flags & MESH_FLAGS_AE) { From 158810b261d02fc7dd92ca9c392d8f8a211a2401 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 11 Jul 2023 18:01:00 -0300 Subject: [PATCH 76/79] net/sched: sch_qfq: reintroduce lmax bound check for MTU Commit 25369891fcef deletes a check for the case where no 'lmax' is specified, which commit 3037933448f6 previously fixed: 'lmax' could be set to the device's MTU without any bounds checking against QFQ_MIN_LMAX and QFQ_MAX_LMAX. Therefore, reintroduce the check. 
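To illustrate the shape of the reintroduced bound (a sketch; the QFQ_MIN_LMAX/QFQ_MAX_LMAX values below are assumptions mirroring net/sched/sch_qfq.c and should be verified against the tree):

  #include <stdio.h>

  /* Assumed to match sch_qfq.c at the time of the fix. */
  #define QFQ_MIN_LMAX    512
  #define QFQ_MAX_LMAX    (1U << 16)

  /* When TCA_QFQ_LMAX is absent, lmax falls back to the device MTU,
   * which the user controls, so it must be validated. */
  static int check_lmax(unsigned int lmax)
  {
          if (lmax < QFQ_MIN_LMAX || lmax > QFQ_MAX_LMAX)
                  return -1;      /* -EINVAL in the kernel */
          return 0;
  }

  int main(void)
  {
          printf("%d\n", check_lmax(1514));               /* accepted: 0 */
          printf("%d\n", check_lmax(2147483647u));        /* rejected: -1 */
          return 0;
  }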
Fixes: 25369891fcef ("net/sched: sch_qfq: refactor parsing of netlink parameters") Acked-by: Jamal Hadi Salim Reviewed-by: Eric Dumazet Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- net/sched/sch_qfq.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index dfd9a99e6257..63a5b277c117 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -423,10 +423,17 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, else weight = 1; - if (tb[TCA_QFQ_LMAX]) + if (tb[TCA_QFQ_LMAX]) { lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); - else + } else { + /* MTU size is user controlled */ lmax = psched_mtu(qdisc_dev(sch)); + if (lmax < QFQ_MIN_LMAX || lmax > QFQ_MAX_LMAX) { + NL_SET_ERR_MSG_MOD(extack, + "MTU size out of bounds for qfq"); + return -EINVAL; + } + } inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; From c5a06fdc618d1d262fa0db3483f096936961588c Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 11 Jul 2023 18:01:01 -0300 Subject: [PATCH 77/79] selftests: tc-testing: add tests for qfq mtu sanity check QFQ only supports a certain bound of MTU size so make sure we check for this requirement in the tests. Acked-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Tested-by: Zhengchao Shao Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/qdiscs/qfq.json | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/qfq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/qfq.json index 147899a868d3..965da7622dac 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/qfq.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/qfq.json @@ -213,5 +213,53 @@ "$TC qdisc del dev $DUMMY handle 1: root", "$IP link del dev $DUMMY type dummy" ] + }, + { + "id": "85ee", + "name": "QFQ with big MTU", + "category": [ + "qdisc", + "qfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true", + "$IP link set dev $DUMMY mtu 2147483647 || /bin/true", + "$TC qdisc add dev $DUMMY handle 1: root qfq" + ], + "cmdUnderTest": "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 100", + "expExitCode": "2", + "verifyCmd": "$TC class show dev $DUMMY", + "matchPattern": "class qfq 1:", + "matchCount": "0", + "teardown": [ + "$IP link del dev $DUMMY type dummy" + ] + }, + { + "id": "ddfa", + "name": "QFQ with small MTU", + "category": [ + "qdisc", + "qfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true", + "$IP link set dev $DUMMY mtu 256 || /bin/true", + "$TC qdisc add dev $DUMMY handle 1: root qfq" + ], + "cmdUnderTest": "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 100", + "expExitCode": "2", + "verifyCmd": "$TC class show dev $DUMMY", + "matchPattern": "class qfq 1:", + "matchCount": "0", + "teardown": [ + "$IP link del dev $DUMMY type dummy" + ] } ] From 3e337087c3b5805fe0b8a46ba622a962880b5d64 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 11 Jul 2023 18:01:02 -0300 Subject: [PATCH 78/79] net/sched: sch_qfq: account for stab overhead in qfq_enqueue Lion says: ------- In the QFQ scheduler a similar issue to CVE-2023-31436 persists. Consider the following code in net/sched/sch_qfq.c: static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb), gso_segs; // ... 
if (unlikely(cl->agg->lmax < len)) { pr_debug("qfq: increasing maxpkt from %u to %u for class %u", cl->agg->lmax, len, cl->common.classid); err = qfq_change_agg(sch, cl, cl->agg->class_weight, len); if (err) { cl->qstats.drops++; return qdisc_drop(skb, sch, to_free); } // ... } Similarly to CVE-2023-31436, "lmax" is increased without any bounds checks according to the packet length "len". Usually this would not impose a problem because packet sizes are naturally limited. This is however not the actual packet length, rather the "qdisc_pkt_len(skb)" which might apply size transformations according to "struct qdisc_size_table" as created by "qdisc_get_stab()" in net/sched/sch_api.c if the TCA_STAB option was set when modifying the qdisc. A user may choose virtually any size using such a table. As a result the same issue as in CVE-2023-31436 can occur, allowing heap out-of-bounds read / writes in the kmalloc-8192 cache. ------- We can create the issue with the following commands: tc qdisc add dev $DEV root handle 1: stab mtu 2048 tsize 512 mpu 0 \ overhead 999999999 linklayer ethernet qfq tc class add dev $DEV parent 1: classid 1:1 htb rate 6mbit burst 15k tc filter add dev $DEV parent 1: matchall classid 1:1 ping -I $DEV 1.1.1.2 This is caused by incorrectly assuming that qdisc_pkt_len() returns a length within the range QFQ_MIN_LMAX < len < QFQ_MAX_LMAX. Fixes: 462dbc9101ac ("pkt_sched: QFQ Plus: fair-queueing service at DRR cost") Reported-by: Lion Reviewed-by: Eric Dumazet Signed-off-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- net/sched/sch_qfq.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 63a5b277c117..befaf74b33ca 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -381,8 +381,13 @@ static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, u32 lmax) { struct qfq_sched *q = qdisc_priv(sch); - struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight); + struct qfq_aggregate *new_agg; + /* 'lmax' can range from [QFQ_MIN_LMAX, pktlen + stab overhead] */ + if (lmax > QFQ_MAX_LMAX) + return -EINVAL; + + new_agg = qfq_find_agg(q, lmax, weight); if (new_agg == NULL) { /* create new aggregate */ new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC); if (new_agg == NULL) From 137f6219da59903f480a9032b01225e9062f7ae0 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 11 Jul 2023 18:01:03 -0300 Subject: [PATCH 79/79] selftests: tc-testing: add test for qfq with stab overhead A packet with stab overhead greater than QFQ_MAX_LMAX should be dropped by the QFQ qdisc as it can't handle such lengths. 
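To make the stab inflation concrete, a rough userspace model of the overhead effect (an approximation: the real qdisc_calculate_pkt_len() also applies cell sizing and mpu, which are elided here):

  #include <stdio.h>

  /* Simplified model of a size-table lookup: the qdisc sees the skb
   * length plus a fully user-controlled overhead, not the wire length. */
  struct stab_model {
          int overhead;   /* signed, set via the TCA_STAB netlink option */
  };

  static unsigned int model_qdisc_pkt_len(unsigned int skb_len,
                                          const struct stab_model *stab)
  {
          return skb_len + stab->overhead;
  }

  int main(void)
  {
          struct stab_model stab = { .overhead = 999999999 };

          /* A 98-byte ping becomes ~1e9 in the scheduler's eyes, far past
           * any sane lmax bound, hence the enqueue-side drop under test. */
          printf("%u\n", model_qdisc_pkt_len(98, &stab));
          return 0;
  }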
Signed-off-by: Jamal Hadi Salim Signed-off-by: Pedro Tammela Reviewed-by: Simon Horman Tested-by: Zhengchao Shao Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/qdiscs/qfq.json | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/qfq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/qfq.json index 965da7622dac..976dffda4654 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/qfq.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/qfq.json @@ -261,5 +261,43 @@ "teardown": [ "$IP link del dev $DUMMY type dummy" ] + }, + { + "id": "5993", + "name": "QFQ with stab overhead greater than max packet len", + "category": [ + "qdisc", + "qfq", + "scapy" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$IP link add dev $DUMMY type dummy || /bin/true", + "$IP link set dev $DUMMY up || /bin/true", + "$TC qdisc add dev $DUMMY handle 1: stab mtu 2048 tsize 512 mpu 0 overhead 999999999 linklayer ethernet root qfq", + "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 100", + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip flower dst_ip 1.3.3.7/32 action mirred egress mirror dev $DUMMY" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY parent 1: matchall classid 1:1", + "scapy": [ + { + "iface": "$DEV0", + "count": 22, + "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='1.3.3.7')/TCP(sport=5000,dport=10)" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc ls dev $DUMMY", + "matchPattern": "dropped 22", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root qfq" + ] } ]