fib: RCU conversion of fib_lookup()

fib_lookup() converted to be called in RCU protected context, no
reference taken and released on a contended cache line (fib_clntref)

fib_table_lookup() and fib_semantic_match() get an additional parameter.

struct fib_info gets an rcu_head field, and is freed after an rcu grace
period.

Stress test :
(Sending 160.000.000 UDP frames on same neighbour,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_HASH) (about same results for FIB_TRIE)

Before patch :

real	1m31.199s
user	0m13.761s
sys	23m24.780s

After patch:

real	1m5.375s
user	0m14.997s
sys	15m50.115s

Before patch Profile :

13044.00 15.4% __ip_route_output_key vmlinux
 8438.00 10.0% dst_destroy           vmlinux
 5983.00  7.1% fib_semantic_match    vmlinux
 5410.00  6.4% fib_rules_lookup      vmlinux
 4803.00  5.7% neigh_lookup          vmlinux
 4420.00  5.2% _raw_spin_lock        vmlinux
 3883.00  4.6% rt_set_nexthop        vmlinux
 3261.00  3.9% _raw_read_lock        vmlinux
 2794.00  3.3% fib_table_lookup      vmlinux
 2374.00  2.8% neigh_resolve_output  vmlinux
 2153.00  2.5% dst_alloc             vmlinux
 1502.00  1.8% _raw_read_lock_bh     vmlinux
 1484.00  1.8% kmem_cache_alloc      vmlinux
 1407.00  1.7% eth_header            vmlinux
 1406.00  1.7% ipv4_dst_destroy      vmlinux
 1298.00  1.5% __copy_from_user_ll   vmlinux
 1174.00  1.4% dev_queue_xmit        vmlinux
 1000.00  1.2% ip_output             vmlinux

After patch Profile :

13712.00 15.8% dst_destroy             vmlinux
 8548.00  9.9% __ip_route_output_key   vmlinux
 7017.00  8.1% neigh_lookup            vmlinux
 4554.00  5.3% fib_semantic_match      vmlinux
 4067.00  4.7% _raw_read_lock          vmlinux
 3491.00  4.0% dst_alloc               vmlinux
 3186.00  3.7% neigh_resolve_output    vmlinux
 3103.00  3.6% fib_table_lookup        vmlinux
 2098.00  2.4% _raw_read_lock_bh       vmlinux
 2081.00  2.4% kmem_cache_alloc        vmlinux
 2013.00  2.3% _raw_spin_lock          vmlinux
 1763.00  2.0% __copy_from_user_ll     vmlinux
 1763.00  2.0% ip_output               vmlinux
 1761.00  2.0% ipv4_dst_destroy        vmlinux
 1631.00  1.9% eth_header              vmlinux
 1440.00  1.7% _raw_read_unlock_bh     vmlinux

Reference results, if IP route cache is enabled :

real	0m29.718s
user	0m10.845s
sys	7m37.341s

25213.00 29.5% __ip_route_output_key   vmlinux
 9011.00 10.5% dst_release             vmlinux
 4817.00  5.6% ip_push_pending_frames  vmlinux
 4232.00  5.0% ip_finish_output        vmlinux
 3940.00  4.6% udp_sendmsg             vmlinux
 3730.00  4.4% __copy_from_user_ll     vmlinux
 3716.00  4.4% ip_route_output_flow    vmlinux
 2451.00  2.9% __xfrm_lookup           vmlinux
 2221.00  2.6% ip_append_data          vmlinux
 1718.00  2.0% _raw_spin_lock_bh       vmlinux
 1655.00  1.9% __alloc_skb             vmlinux
 1572.00  1.8% sock_wfree              vmlinux
 1345.00  1.6% kfree                   vmlinux

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eric Dumazet 2010-10-05 10:41:36 +00:00 committed by David S. Miller
parent c2952c314b
commit ebc0ffae5d
10 changed files with 72 additions and 77 deletions

View file

@ -1773,12 +1773,15 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
if (rt->fl.iif == 0)
src = rt->rt_src;
else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) {
src = FIB_RES_PREFSRC(res);
fib_res_put(&res);
} else
src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
else {
rcu_read_lock();
if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
src = FIB_RES_PREFSRC(res);
else
src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
RT_SCOPE_UNIVERSE);
rcu_read_unlock();
}
memcpy(addr, &src, 4);
}
@ -2081,6 +2084,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
* 2. IP spoofing attempts are filtered with 100% of guarantee.
* called with rcu_read_lock()
*/
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@ -2102,7 +2106,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
unsigned hash;
__be32 spec_dst;
int err = -EINVAL;
int free_res = 0;
struct net * net = dev_net(dev);
/* IP on this device is disabled. */
@ -2134,12 +2137,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
/*
* Now we are ready to route packet.
*/
if ((err = fib_lookup(net, &fl, &res)) != 0) {
err = fib_lookup(net, &fl, &res);
if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
goto e_hostunreach;
goto no_route;
}
free_res = 1;
RT_CACHE_STAT_INC(in_slow_tot);
@ -2148,8 +2151,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res.type == RTN_LOCAL) {
err = fib_validate_source(saddr, daddr, tos,
net->loopback_dev->ifindex,
dev, &spec_dst, &itag, skb->mark);
net->loopback_dev->ifindex,
dev, &spec_dst, &itag, skb->mark);
if (err < 0)
goto martian_source_keep_err;
if (err)
@ -2164,9 +2167,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto martian_destination;
err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
if (free_res)
fib_res_put(&res);
out: return err;
brd_input:
@ -2226,7 +2226,7 @@ local_input:
rth->rt_type = res.type;
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
goto done;
goto out;
no_route:
RT_CACHE_STAT_INC(in_no_route);
@ -2249,21 +2249,21 @@ martian_destination:
e_hostunreach:
err = -EHOSTUNREACH;
goto done;
goto out;
e_inval:
err = -EINVAL;
goto done;
goto out;
e_nobufs:
err = -ENOBUFS;
goto done;
goto out;
martian_source:
err = -EINVAL;
martian_source_keep_err:
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
goto done;
goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@ -2349,6 +2349,7 @@ skip_cache:
}
EXPORT_SYMBOL(ip_route_input_common);
/* called with rcu_read_lock() */
static int __mkroute_output(struct rtable **result,
struct fib_result *res,
const struct flowi *fl,
@ -2373,18 +2374,13 @@ static int __mkroute_output(struct rtable **result,
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev_out);
if (!in_dev) {
rcu_read_unlock();
if (!in_dev)
return -EINVAL;
}
if (res->type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
if (res->fi) {
fib_info_put(res->fi);
res->fi = NULL;
}
res->fi = NULL;
} else if (res->type == RTN_MULTICAST) {
flags |= RTCF_MULTICAST | RTCF_LOCAL;
if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
@ -2394,10 +2390,8 @@ static int __mkroute_output(struct rtable **result,
* default one, but do not gateway in this case.
* Yes, it is hack.
*/
if (res->fi && res->prefixlen < 4) {
fib_info_put(res->fi);
if (res->fi && res->prefixlen < 4)
res->fi = NULL;
}
}
@ -2467,6 +2461,7 @@ static int __mkroute_output(struct rtable **result,
return 0;
}
/* called with rcu_read_lock() */
static int ip_mkroute_output(struct rtable **rp,
struct fib_result *res,
const struct flowi *fl,
@ -2509,7 +2504,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
struct fib_result res;
unsigned int flags = 0;
struct net_device *dev_out = NULL;
int free_res = 0;
int err;
@ -2636,15 +2630,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
err = -ENETUNREACH;
goto out;
}
free_res = 1;
if (res.type == RTN_LOCAL) {
if (!fl.fl4_src)
fl.fl4_src = fl.fl4_dst;
dev_out = net->loopback_dev;
fl.oif = dev_out->ifindex;
if (res.fi)
fib_info_put(res.fi);
res.fi = NULL;
flags |= RTCF_LOCAL;
goto make_route;
@ -2668,8 +2659,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
make_route:
err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
if (free_res)
fib_res_put(&res);
out: return err;
}