diff -urp v3.4/linux/Documentation/networking/ip-sysctl.txt linux/Documentation/networking/ip-sysctl.txt --- v3.4/linux/Documentation/networking/ip-sysctl.txt 2012-05-21 23:03:38.000000000 +0300 +++ linux/Documentation/networking/ip-sysctl.txt 2012-05-21 23:32:17.750747679 +0300 @@ -761,6 +761,24 @@ accept_redirects - BOOLEAN forwarding - BOOLEAN Enable IP forwarding on this interface. +forward_shared - BOOLEAN + Integer value determines if a source validation should allow + forwarding of packets with local source address. 1 means yes, + 0 means no. By default the flag is disabled and such packets + are not forwarded. + + If you enable this flag on internal network, the router will forward + packets from internal hosts with shared IP addresses no matter how + the rp_filter is set. This flag is activated only if it is + enabled both in specific device section and in "all" section. + +loop - BOOLEAN + By default (loop=0) the traffic between local IP addresses + is routed via interface "lo". Setting this flag for two + interfaces allows traffic between their IP addresses to + be looped externally. This is useful for setups where the + interfaces are attached to same broadcast medium. + mc_forwarding - BOOLEAN Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE and a multicast routing daemon is required. @@ -970,6 +988,23 @@ disable_xfrm - BOOLEAN +hidden - BOOLEAN + Hide addresses attached to this device from other devices. + Such addresses will never be selected by source address autoselection + mechanism, host does not answer broadcast ARP requests for them, + does not announce them as source address of ARP requests, but they + are still reachable via IP. This flag is activated only if it is + enabled both in specific device section and in "all" section. + +rp_filter_mask - INTEGER + Integer value representing bitmask of the mediums for which the + reverse path protection is disabled. 
If the source validation + results in reverse path to interface with medium_id value in + the 1..31 range the access is allowed if the corresponding bit + is set in the bitmask. The bitmask value is considered only when + rp_filter is enabled. By default the bitmask is empty preserving + the original rp_filter semantic. + tag - INTEGER Allows you to write a number, which can be used as required. Default value is 0. diff -urp v3.4/linux/include/linux/inetdevice.h linux/include/linux/inetdevice.h --- v3.4/linux/include/linux/inetdevice.h 2012-05-21 23:04:36.000000000 +0300 +++ linux/include/linux/inetdevice.h 2012-05-21 23:33:47.726751840 +0300 @@ -32,6 +32,10 @@ enum IPV4_DEVCONF_FORCE_IGMP_VERSION, IPV4_DEVCONF_ARP_ANNOUNCE, IPV4_DEVCONF_ARP_IGNORE, + IPV4_DEVCONF_HIDDEN, + IPV4_DEVCONF_FORWARD_SHARED, + IPV4_DEVCONF_RP_FILTER_MASK, + IPV4_DEVCONF_LOOP, IPV4_DEVCONF_PROMOTE_SECONDARIES, IPV4_DEVCONF_ARP_ACCEPT, IPV4_DEVCONF_ARP_NOTIFY, @@ -122,12 +126,14 @@ static inline void ipv4_devconf_setall(s #define IN_DEV_LOG_MARTIANS(in_dev) IN_DEV_ORCONF((in_dev), LOG_MARTIANS) #define IN_DEV_PROXY_ARP(in_dev) IN_DEV_ORCONF((in_dev), PROXY_ARP) #define IN_DEV_PROXY_ARP_PVLAN(in_dev) IN_DEV_CONF_GET(in_dev, PROXY_ARP_PVLAN) +#define IN_DEV_HIDDEN(in_dev) IN_DEV_ANDCONF((in_dev), HIDDEN) #define IN_DEV_SHARED_MEDIA(in_dev) IN_DEV_ORCONF((in_dev), SHARED_MEDIA) #define IN_DEV_TX_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), SEND_REDIRECTS) #define IN_DEV_SEC_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), \ SECURE_REDIRECTS) #define IN_DEV_IDTAG(in_dev) IN_DEV_CONF_GET(in_dev, TAG) #define IN_DEV_MEDIUM_ID(in_dev) IN_DEV_CONF_GET(in_dev, MEDIUM_ID) +#define IN_DEV_RPFILTER_MASK(in_dev) IN_DEV_CONF_GET(in_dev, RP_FILTER_MASK) #define IN_DEV_PROMOTE_SECONDARIES(in_dev) \ IN_DEV_ORCONF((in_dev), \ PROMOTE_SECONDARIES) @@ -138,6 +144,8 @@ static inline void ipv4_devconf_setall(s || (!IN_DEV_FORWARD(in_dev) && \ IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS))) +#define IN_DEV_LOOP(in_dev) 
IN_DEV_CONF_GET(in_dev, LOOP) +#define IN_DEV_FORWARD_SHARED(in_dev) IN_DEV_ANDCONF((in_dev), FORWARD_SHARED) #define IN_DEV_ARPFILTER(in_dev) IN_DEV_ORCONF((in_dev), ARPFILTER) #define IN_DEV_ARP_ACCEPT(in_dev) IN_DEV_ORCONF((in_dev), ARP_ACCEPT) #define IN_DEV_ARP_ANNOUNCE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE) diff -urp v3.4/linux/include/linux/rtnetlink.h linux/include/linux/rtnetlink.h --- v3.4/linux/include/linux/rtnetlink.h 2012-03-20 00:05:18.000000000 +0200 +++ linux/include/linux/rtnetlink.h 2012-05-21 23:32:17.754747680 +0300 @@ -120,6 +120,13 @@ enum { RTM_SETDCB, #define RTM_SETDCB RTM_SETDCB + RTM_NEWARPRULE = 80, +#define RTM_NEWARPRULE RTM_NEWARPRULE + RTM_DELARPRULE, +#define RTM_DELARPRULE RTM_DELARPRULE + RTM_GETARPRULE, +#define RTM_GETARPRULE RTM_GETARPRULE + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; @@ -312,6 +319,8 @@ struct rtnexthop { #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ +#define RTNH_F_SUSPECT 8 /* We don't know the real state */ +#define RTNH_F_BADSTATE (RTNH_F_DEAD | RTNH_F_SUSPECT) /* Macros to handle hexthops */ @@ -516,6 +525,54 @@ enum { #define NDUSEROPT_MAX (__NDUSEROPT_MAX - 1) +/****************************************************************************** + * Definitions used in ARP tables administration + ****/ + +#define ARPA_TABLE_INPUT 0 +#define ARPA_TABLE_OUTPUT 1 +#define ARPA_TABLE_FORWARD 2 +#define ARPA_TABLE_ALL -1 + +#define ARPM_F_PREFSRC 0x0001 +#define ARPM_F_WILDIIF 0x0002 +#define ARPM_F_WILDOIF 0x0004 +#define ARPM_F_BROADCAST 0x0008 +#define ARPM_F_UNICAST 0x0010 + +struct arpmsg +{ + unsigned char arpm_family; + unsigned char arpm_table; + unsigned char arpm_action; + unsigned char arpm_from_len; + unsigned char arpm_to_len; + unsigned char arpm__pad1; + unsigned short arpm__pad2; + unsigned arpm_pref; + unsigned arpm_flags; +}; + +enum +{ + 
ARPA_UNSPEC, + ARPA_FROM, /* FROM IP prefix */ + ARPA_TO, /* TO IP prefix */ + ARPA_LLFROM, /* FROM LL prefix */ + ARPA_LLTO, /* TO LL prefix */ + ARPA_LLSRC, /* New SRC lladdr */ + ARPA_LLDST, /* New DST lladdr */ + ARPA_IIF, /* In interface prefix */ + ARPA_OIF, /* Out interface prefix */ + ARPA_SRC, /* New IP SRC */ + ARPA_DST, /* New IP DST, not used */ + ARPA_PACKETS, /* Packets */ +}; + +#define ARPA_MAX ARPA_PACKETS + +#define ARPA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct arpmsg)))) + #ifndef __KERNEL__ /* RTnetlink multicast groups - backwards compatibility for userspace */ #define RTMGRP_LINK 1 @@ -536,6 +593,8 @@ enum { #define RTMGRP_DECnet_IFADDR 0x1000 #define RTMGRP_DECnet_ROUTE 0x4000 +#define RTMGRP_ARP 0x00010000 + #define RTMGRP_IPV6_PREFIX 0x20000 #endif @@ -587,6 +646,8 @@ enum rtnetlink_groups { #define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE RTNLGRP_DCB, #define RTNLGRP_DCB RTNLGRP_DCB + RTNLGRP_ARP, +#define RTNLGRP_ARP RTNLGRP_ARP __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff -urp v3.4/linux/include/net/flow.h linux/include/net/flow.h --- v3.4/linux/include/net/flow.h 2012-03-20 00:05:18.000000000 +0200 +++ linux/include/net/flow.h 2012-05-21 23:32:17.754747680 +0300 @@ -72,6 +72,7 @@ struct flowi4 { #define fl4_ipsec_spi uli.spi #define fl4_mh_type uli.mht.type #define fl4_gre_key uli.gre_key + __be32 fl4_gw; } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline void flowi4_init_output(struct flowi4 *fl4, int oif, @@ -92,6 +93,7 @@ static inline void flowi4_init_output(st fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = sport; + fl4->fl4_gw = 0; } /* Reset some input parameters after previous lookup */ diff -urp v3.4/linux/include/net/ip_fib.h linux/include/net/ip_fib.h --- v3.4/linux/include/net/ip_fib.h 2011-07-22 09:43:31.000000000 +0300 +++ linux/include/net/ip_fib.h 2012-05-21 23:32:17.754747680 +0300 @@ -223,6 +223,8 @@ extern int fib_lookup(struct net *n, str extern 
struct fib_table *fib_new_table(struct net *net, u32 id); extern struct fib_table *fib_get_table(struct net *net, u32 id); +extern int fib_result_table(struct fib_result *res); + #endif /* CONFIG_IP_MULTIPLE_TABLES */ /* Exported by fib_frontend.c */ @@ -230,8 +232,9 @@ extern const struct nla_policy rtm_ipv4_ extern void ip_fib_init(void); extern int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, - __be32 *spec_dst, u32 *itag); -extern void fib_select_default(struct fib_result *res); + __be32 *spec_dst, u32 *itag, int our); +extern void fib_select_default(const struct flowi4 *flp, + struct fib_result *res); /* Exported by fib_semantics.c */ extern int ip_fib_check_default(__be32 gw, struct net_device *dev); @@ -239,7 +242,8 @@ extern int fib_sync_down_dev(struct net_ extern int fib_sync_down_addr(struct net *net, __be32 local); extern void fib_update_nh_saddrs(struct net_device *dev); extern int fib_sync_up(struct net_device *dev); -extern void fib_select_multipath(struct fib_result *res); +extern void fib_select_multipath(const struct flowi4 *flp, + struct fib_result *res); /* Exported by fib_trie.c */ extern void fib_trie_init(void); @@ -282,4 +286,6 @@ static inline void fib_proc_exit(struct } #endif +extern rwlock_t fib_nhflags_lock; + #endif /* _NET_FIB_H */ diff -urp v3.4/linux/include/net/netfilter/nf_nat.h linux/include/net/netfilter/nf_nat.h --- v3.4/linux/include/net/netfilter/nf_nat.h 2012-03-20 00:05:18.000000000 +0200 +++ linux/include/net/netfilter/nf_nat.h 2012-05-21 23:32:17.754747680 +0300 @@ -48,6 +48,13 @@ struct nf_conn_nat { #endif }; +/* Call input routing for SNAT-ed traffic */ +extern unsigned int ip_nat_route_input(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)); + /* Set up the info structure to map into this range. 
*/ extern unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_ipv4_range *range, diff -urp v3.4/linux/include/net/route.h linux/include/net/route.h --- v3.4/linux/include/net/route.h 2012-03-20 00:05:18.000000000 +0200 +++ linux/include/net/route.h 2012-05-21 23:32:17.754747680 +0300 @@ -48,6 +48,8 @@ struct rtable { /* Lookup key. */ __be32 rt_key_dst; __be32 rt_key_src; + __be32 rt_key_lsrc; + __be32 rt_key_gw; int rt_genid; unsigned rt_flags; @@ -191,6 +193,7 @@ extern void ip_rt_multicast_event(struc extern int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg); extern void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt); extern int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb); +extern int ip_route_input_lookup(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin, __be32 lsrc); struct in_ifaddr; extern void fib_add_ifaddr(struct in_ifaddr *); diff -urp v3.4/linux/net/bridge/br_netfilter.c linux/net/bridge/br_netfilter.c --- v3.4/linux/net/bridge/br_netfilter.c 2012-05-21 23:04:39.000000000 +0300 +++ linux/net/bridge/br_netfilter.c 2012-05-21 23:32:17.758747680 +0300 @@ -436,6 +436,9 @@ static int br_nf_pre_routing_finish(stru struct rtable *rt; int err; + /* Old skb->dst is not expected, it is lost in all cases */ + skb_dst_drop(skb); + if (nf_bridge->mask & BRNF_PKT_TYPE) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->mask ^= BRNF_PKT_TYPE; diff -urp v3.4/linux/net/core/rtnetlink.c linux/net/core/rtnetlink.c --- v3.4/linux/net/core/rtnetlink.c 2012-05-21 23:04:39.000000000 +0300 +++ linux/net/core/rtnetlink.c 2012-05-21 23:32:17.758747680 +0300 @@ -525,6 +525,7 @@ static const int rtm_min[RTM_NR_FAMILIES [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)), [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), + [RTM_FAM(RTM_GETARPRULE)] = NLMSG_LENGTH(sizeof(struct arpmsg)), }; static 
const int rta_max[RTM_NR_FAMILIES] = @@ -537,6 +538,7 @@ static const int rta_max[RTM_NR_FAMILIES [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX, [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX, [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, + [RTM_FAM(RTM_GETARPRULE)] = ARPA_MAX, }; void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) diff -urp v3.4/linux/net/ipv4/arp.c linux/net/ipv4/arp.c --- v3.4/linux/net/ipv4/arp.c 2012-05-21 23:04:39.000000000 +0300 +++ linux/net/ipv4/arp.c 2012-05-21 23:32:17.762747679 +0300 @@ -71,6 +71,9 @@ * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. + * Julian Anastasov: "hidden" flag: hide the + * interface and don't reply for it + * Julian Anastasov: ARP filtering via netlink */ #include @@ -94,6 +97,7 @@ #include #include #include +#include #include #include #include @@ -185,6 +189,47 @@ struct neigh_table arp_tbl = { }; EXPORT_SYMBOL(arp_tbl); +struct arpf_node { + struct arpf_node * at_next; + u32 at_pref; + u32 at_from; + u32 at_from_mask; + u32 at_to; + u32 at_to_mask; + u32 at_src; + atomic_t at_packets; + atomic_t at_refcnt; + unsigned at_flags; + unsigned char at_from_len; + unsigned char at_to_len; + unsigned char at_action; + char at_dead; + unsigned char at_llfrom_len; + unsigned char at_llto_len; + unsigned char at_llsrc_len; + unsigned char at_lldst_len; + unsigned char at_iif_len; + unsigned char at_oif_len; + unsigned short at__pad1; + unsigned char at_llfrom[MAX_ADDR_LEN]; + unsigned char at_llto[MAX_ADDR_LEN]; + unsigned char at_llsrc[MAX_ADDR_LEN]; + unsigned char at_lldst[MAX_ADDR_LEN]; + char at_iif[IFNAMSIZ]; + char at_oif[IFNAMSIZ]; +}; + +static struct arpf_node *arp_tabs[3]; + +static struct kmem_cache *arpf_cachep; + +static DEFINE_RWLOCK(arpf_lock); + +static void +arpf_send(int table, struct net *net, struct sk_buff *skb, u32 sip, u32 tip, + unsigned char *from_hw, unsigned char *to_hw, + struct net_device *idev, 
struct net_device *odev); + int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) { switch (dev->type) { @@ -327,7 +372,10 @@ static void arp_solicit(struct neighbour struct net_device *dev = neigh->dev; __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); - struct in_device *in_dev; + struct in_device *in_dev, *in_dev2; + struct net_device *dev2; + int mode; + unsigned char tha[MAX_ADDR_LEN]; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -335,9 +383,22 @@ static void arp_solicit(struct neighbour rcu_read_unlock(); return; } - switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { + mode = IN_DEV_ARP_ANNOUNCE(in_dev); + if (mode != 2 && skb && + (dev2 = __ip_dev_find(dev_net(dev), ip_hdr(skb)->saddr, + false)) != NULL && + (saddr = ip_hdr(skb)->saddr, + in_dev2 = __in_dev_get_rcu(dev2)) != NULL && + IN_DEV_HIDDEN(in_dev2)) { + saddr = 0; + goto get; + } + + switch (mode) { default: case 0: /* By default announce any local IP */ + if (saddr) + break; if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; @@ -345,8 +406,9 @@ static void arp_solicit(struct neighbour case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; - saddr = ip_hdr(skb)->saddr; - if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) { + if (saddr || + (saddr = ip_hdr(skb)->saddr, + inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL)) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; @@ -356,6 +418,8 @@ static void arp_solicit(struct neighbour case 2: /* Avoid secondary IPs, get a primary/preferred one */ break; } + +get: rcu_read_unlock(); if (!saddr) @@ -366,8 +430,10 @@ static void arp_solicit(struct neighbour if (!(neigh->nud_state & NUD_VALID)) printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); - dst_ha = neigh->ha; + dst_ha = tha; read_lock_bh(&neigh->lock); + memcpy(dst_ha, neigh->ha, dev->addr_len); + 
read_unlock_bh(&neigh->lock); } else { probes -= neigh->parms->app_probes; if (probes < 0) { @@ -378,10 +444,7 @@ static void arp_solicit(struct neighbour } } - arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, - dst_ha, dev->dev_addr, NULL); - if (dst_ha) - read_unlock_bh(&neigh->lock); + arpf_send(ARPA_TABLE_OUTPUT,dev_net(dev),skb,saddr,target,NULL,dst_ha,NULL,dev); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) @@ -436,6 +499,21 @@ static int arp_filter(__be32 sip, __be32 return flag; } +static int arp_hidden(u32 tip, struct net_device *dev) +{ + struct net_device *dev2; + struct in_device *in_dev2; + int ret = 0; + + if (!IPV4_DEVCONF_ALL(dev_net(dev), HIDDEN)) + return 0; + + if ((dev2 = __ip_dev_find(dev_net(dev), tip, false)) && dev2 != dev && + (in_dev2 = __in_dev_get_rcu(dev2)) && IN_DEV_HIDDEN(in_dev2)) + ret = 1; + return ret; +} + /* OBSOLETE FUNCTIONS */ /* @@ -728,7 +806,7 @@ static int arp_process(struct sk_buff *s struct arphdr *arp; unsigned char *arp_ptr; struct rtable *rt; - unsigned char *sha; + unsigned char *sha, *tha; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; @@ -794,6 +872,7 @@ static int arp_process(struct sk_buff *s arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; + tha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&tip, arp_ptr, 4); /* @@ -830,9 +909,10 @@ static int arp_process(struct sk_buff *s if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type(net, tip) == RTN_LOCAL && + !arp_hidden(tip, dev) && !arp_ignore(in_dev, sip, tip)) - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, - dev->dev_addr, sha); + arpf_send(ARPA_TABLE_INPUT,net, + skb,sip,tip,sha,tha,dev,NULL); goto out; } @@ -848,12 +928,13 @@ static int arp_process(struct sk_buff *s dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) dont_send = arp_filter(sip, tip, dev); + if (!dont_send && skb->pkt_type != PACKET_HOST) + dont_send = arp_hidden(tip,dev); if 
(!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arpf_send(ARPA_TABLE_INPUT,net, + skb,sip,tip,sha,tha,dev,NULL); neigh_release(n); } } @@ -871,9 +952,9 @@ static int arp_process(struct sk_buff *s if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || in_dev->arp_parms->proxy_delay == 0) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arpf_send(ARPA_TABLE_FORWARD,net, + skb,sip,tip,sha,tha,dev, + rt->dst.dev); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); @@ -1258,6 +1339,556 @@ void arp_ifdown(struct net_device *dev) } +static void arpf_destroy(struct arpf_node *afp) +{ + if (!afp->at_dead) { + printk(KERN_ERR "Destroying alive arp table node %p from %08lx\n", afp, + *(((unsigned long*)&afp)-1)); + return; + } + kmem_cache_free(arpf_cachep, afp); +} + +static inline void arpf_put(struct arpf_node *afp) +{ + if (atomic_dec_and_test(&afp->at_refcnt)) + arpf_destroy(afp); +} + +/* + * Return the first rule of arp_tabs[table] that matches the packet, or + * NULL when none does. A match bumps the rule's at_packets counter and + * takes a reference on it while arpf_lock is still held; the caller + * must release that reference with arpf_put(). + */ +static inline struct arpf_node * +arpf_lookup(int table, struct sk_buff *skb, u32 sip, u32 tip, + unsigned char *from_hw, unsigned char *to_hw, + struct net_device *idev, struct net_device *odev) +{ + int sz_iif = idev? strlen(idev->name) : 0; + int sz_oif = odev? 
strlen(odev->name) : 0; + int alen; + struct arpf_node *afp; + + if (ARPA_TABLE_OUTPUT != table) { + alen = idev->addr_len; + } else { + if (!from_hw) from_hw = odev->dev_addr; + if (!to_hw) to_hw = odev->broadcast; + alen = odev->addr_len; + } + + read_lock(&arpf_lock); + for (afp = arp_tabs[table]; afp; afp = afp->at_next) { + if ((tip ^ afp->at_to) & afp->at_to_mask) + continue; + if ((sip ^ afp->at_from) & afp->at_from_mask) + continue; + if (afp->at_llfrom_len && + (afp->at_llfrom_len > alen || + memcmp(from_hw, afp->at_llfrom, afp->at_llfrom_len))) + continue; + if (afp->at_llto_len && + (afp->at_llto_len > alen || + memcmp(to_hw, afp->at_llto, afp->at_llto_len))) + continue; + if (afp->at_iif_len && + (afp->at_iif_len > sz_iif || + memcmp(afp->at_iif, idev->name, afp->at_iif_len) || + (sz_iif != afp->at_iif_len && + !(afp->at_flags & ARPM_F_WILDIIF)))) + continue; + if (afp->at_oif_len && + (afp->at_oif_len > sz_oif || + memcmp(afp->at_oif, odev->name, afp->at_oif_len) || + (sz_oif != afp->at_oif_len && + !(afp->at_flags & ARPM_F_WILDOIF)))) + continue; + if (afp->at_flags & ARPM_F_BROADCAST && + skb->pkt_type == PACKET_HOST) + continue; + if (afp->at_flags & ARPM_F_UNICAST && + skb->pkt_type != PACKET_HOST) + continue; + if (afp->at_llsrc_len && afp->at_llsrc_len != alen) + continue; + if (afp->at_lldst_len && afp->at_lldst_len != alen) + continue; + atomic_inc(&afp->at_packets); + /* pin the rule before arpf_lock is dropped, arpf_send() does arpf_put() */ + atomic_inc(&afp->at_refcnt); + break; + } + read_unlock(&arpf_lock); + return afp; +} + +static void +arpf_send(int table, struct net *net, struct sk_buff *skb, u32 sip, u32 tip, + unsigned char *from_hw, unsigned char *to_hw, + struct net_device *idev, struct net_device *odev) +{ + struct arpf_node *afp = NULL; + + if (!arp_tabs[table] || + net != &init_net || + !(afp = arpf_lookup(table, skb, sip, tip, + from_hw, to_hw, idev, odev))) { + switch (table) { + case ARPA_TABLE_INPUT: + case ARPA_TABLE_FORWARD: + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, idev, tip, + from_hw, idev->dev_addr, from_hw); + break; + case 
ARPA_TABLE_OUTPUT: + arp_send(ARPOP_REQUEST, ETH_P_ARP, tip, odev, sip, + to_hw, odev->dev_addr, NULL); + break; + } + return; + } + + /* deny? */ + if (!afp->at_action) goto out; + + switch (table) { + case ARPA_TABLE_INPUT: + case ARPA_TABLE_FORWARD: + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, idev, tip, + afp->at_lldst_len?afp->at_lldst:from_hw, + afp->at_llsrc_len?afp->at_llsrc:idev->dev_addr, + afp->at_lldst_len?afp->at_lldst:from_hw); + break; + case ARPA_TABLE_OUTPUT: + if (afp->at_flags & ARPM_F_PREFSRC && afp->at_src == 0) { + struct rtable *rt; + struct flowi4 fl4 = { .daddr = tip, + .flowi4_oif = odev->ifindex }; + + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + break; + sip = rt->rt_src; + ip_rt_put(rt); + if (!sip) + break; + } + arp_send(ARPOP_REQUEST, ETH_P_ARP, tip, odev, afp->at_src?:sip, + afp->at_lldst_len?afp->at_lldst:to_hw, + afp->at_llsrc_len?afp->at_llsrc:odev->dev_addr, + NULL); + break; + } + +out: + arpf_put(afp); +} + +static int +arpf_fill_node(struct sk_buff *skb, u32 pid, u32 seq, unsigned flags, + int event, int table, struct arpf_node *afp) +{ + struct arpmsg *am; + struct nlmsghdr *nlh; + u32 packets = atomic_read(&afp->at_packets); + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*am), 0); + if (nlh == NULL) + return -ENOBUFS; + nlh->nlmsg_flags = flags; + am = memset(nlmsg_data(nlh), 0, sizeof(*am)); /* arpm__pad1/2 are never set below: don't leak stale skb bytes */ + am->arpm_family = AF_UNSPEC; + am->arpm_table = table; + am->arpm_action = afp->at_action; + am->arpm_from_len = afp->at_from_len; + am->arpm_to_len = afp->at_to_len; + am->arpm_pref = afp->at_pref; + am->arpm_flags = afp->at_flags; + if (afp->at_from_len) + NLA_PUT(skb, ARPA_FROM, 4, &afp->at_from); + if (afp->at_to_len) + NLA_PUT(skb, ARPA_TO, 4, &afp->at_to); + if (afp->at_src || afp->at_flags & ARPM_F_PREFSRC) + NLA_PUT(skb, ARPA_SRC, 4, &afp->at_src); + if (afp->at_iif[0]) + NLA_PUT(skb, ARPA_IIF, sizeof(afp->at_iif), afp->at_iif); + if (afp->at_oif[0]) + NLA_PUT(skb, ARPA_OIF, sizeof(afp->at_oif), afp->at_oif); + if (afp->at_llfrom_len) + 
NLA_PUT(skb, ARPA_LLFROM, afp->at_llfrom_len, afp->at_llfrom); + if (afp->at_llto_len) + NLA_PUT(skb, ARPA_LLTO, afp->at_llto_len, afp->at_llto); + if (afp->at_llsrc_len) + NLA_PUT(skb, ARPA_LLSRC, afp->at_llsrc_len, afp->at_llsrc); + if (afp->at_lldst_len) + NLA_PUT(skb, ARPA_LLDST, afp->at_lldst_len, afp->at_lldst); + NLA_PUT(skb, ARPA_PACKETS, 4, &packets); + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static void +arpmsg_notify(struct sk_buff *oskb, struct nlmsghdr *nlh, int table, + struct arpf_node *afp, int event) +{ + struct sk_buff *skb; + u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + int payload = sizeof(struct arpmsg) + 256; + int err = -ENOBUFS; + + skb = nlmsg_new(nlmsg_total_size(payload), GFP_KERNEL); + if (!skb) + goto errout; + + err = arpf_fill_node(skb, pid, nlh->nlmsg_seq, 0, event, table, afp); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, &init_net, pid, RTNLGRP_ARP, nlh, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(&init_net, RTNLGRP_ARP, err); +} + +static inline int +arpf_str_size(int a, struct rtattr **rta, int maxlen) +{ + int size = 0; + + if (rta[a-1] && (size = RTA_PAYLOAD(rta[a-1]))) { + if (size > maxlen) + size = maxlen; + } + return size; +} + +static inline int +arpf_get_str(int a, struct rtattr **rta, unsigned char *p, + int maxlen, unsigned char *l) +{ + int size = arpf_str_size(a, rta, maxlen); + + if (size) { + memcpy(p, RTA_DATA(rta[a-1]), size); + *l = size; + } + return size; +} + +#define ARPF_MATCH_U32(ind, field) ( \ + (!rta[ind-1] && r->at_ ## field == 0) || \ + (rta[ind-1] && \ + *(u32*) RTA_DATA(rta[ind-1]) == r->at_ ## field)) + +#define ARPF_MATCH_STR(ind, field) ( \ + (!rta[ind-1] && r->at_ ## field ## _len == 0) || \ + (rta[ind-1] && r->at_ ## field ## _len && \ + r->at_ ## field ## _len < RTA_PAYLOAD(rta[ind-1]) && \ + strcmp(RTA_DATA(rta[ind-1]), r->at_ ## field) == 0)) + +#define ARPF_MATCH_DATA(ind, field) ( 
\ + (!rta[ind-1] && r->at_ ## field ## _len == 0) || \ + (rta[ind-1] && r->at_ ## field ## _len && \ + r->at_ ## field ## _len == RTA_PAYLOAD(rta[ind-1]) && \ + memcmp(RTA_DATA(rta[ind-1]), &r->at_ ## field, \ + r->at_ ## field ## _len) == 0)) + +/* RTM_NEWARPRULE/RTM_DELARPRULE/RTM_GETARPRULE */ + +int arpf_rule_ctl(struct sk_buff *skb, struct nlmsghdr* n, void *arg) +{ + struct rtattr **rta = arg; + struct arpmsg *am = NLMSG_DATA(n); + struct arpf_node *r, **rp, **prevp = 0, **delp = 0, *newp = 0; + unsigned pref = 1; + int size, ret = -EINVAL; + + if (am->arpm_table >= sizeof(arp_tabs)/sizeof(arp_tabs[0])) + goto out; + if (!((~am->arpm_flags) & (ARPM_F_BROADCAST|ARPM_F_UNICAST))) + goto out; + if (am->arpm_action > 1) + goto out; + if (am->arpm_to_len > 32 || am->arpm_from_len > 32) + goto out; + if (am->arpm_flags & ARPM_F_WILDIIF && + (!rta[ARPA_IIF-1] || !RTA_PAYLOAD(rta[ARPA_IIF-1]) || + !*(char*)RTA_DATA(rta[ARPA_IIF-1]))) + am->arpm_flags &= ~ARPM_F_WILDIIF; + if (am->arpm_flags & ARPM_F_WILDOIF && + (!rta[ARPA_OIF-1] || !RTA_PAYLOAD(rta[ARPA_OIF-1]) || + !*(char*)RTA_DATA(rta[ARPA_OIF-1]))) + am->arpm_flags &= ~ARPM_F_WILDOIF; + switch (am->arpm_table) { + case ARPA_TABLE_INPUT: + if (rta[ARPA_SRC-1] || rta[ARPA_OIF-1]) + goto out; + break; + case ARPA_TABLE_OUTPUT: + if (rta[ARPA_IIF-1]) + goto out; + if (am->arpm_flags & (ARPM_F_BROADCAST|ARPM_F_UNICAST)) + goto out; + break; + case ARPA_TABLE_FORWARD: + if (rta[ARPA_SRC-1]) + goto out; + break; + } + if (rta[ARPA_SRC-1] && !*(u32*) RTA_DATA(rta[ARPA_SRC-1])) + am->arpm_flags |= ARPM_F_PREFSRC; + else + am->arpm_flags &= ~ARPM_F_PREFSRC; + + for (rp = &arp_tabs[am->arpm_table]; (r=*rp) != NULL; rp=&r->at_next) { + if (pref < r->at_pref) + prevp = rp; + if (am->arpm_pref == r->at_pref || + (!am->arpm_pref && + am->arpm_to_len == r->at_to_len && + am->arpm_from_len == r->at_from_len && + !((am->arpm_flags ^ r->at_flags) & + (ARPM_F_BROADCAST | ARPM_F_UNICAST | + ARPM_F_WILDIIF | ARPM_F_WILDOIF)) && + 
ARPF_MATCH_U32(ARPA_TO, to) && + ARPF_MATCH_U32(ARPA_FROM, from) && + ARPF_MATCH_DATA(ARPA_LLFROM, llfrom) && + ARPF_MATCH_DATA(ARPA_LLTO, llto) && + ARPF_MATCH_STR(ARPA_IIF, iif) && + ARPF_MATCH_STR(ARPA_OIF, oif) && + (n->nlmsg_type != RTM_DELARPRULE || + /* DEL matches more keys */ + (am->arpm_flags == r->at_flags && + am->arpm_action == r->at_action && + ARPF_MATCH_U32(ARPA_SRC, src) && + ARPF_MATCH_DATA(ARPA_LLSRC, llsrc) && + ARPF_MATCH_DATA(ARPA_LLDST, lldst) + ) + ) + ) + ) + break; + if (am->arpm_pref && r->at_pref > am->arpm_pref) { + r = NULL; + break; + } + pref = r->at_pref+1; + } + + /* + * r=NULL: *rp != NULL (stopped before next pref), pref: not valid + * *rp == NULL (not found), pref: ready to use + * r!=NULL: found, pref: not valid + * + * prevp=NULL: no free slot + * prevp!=NULL: free slot for rule + */ + + if (n->nlmsg_type == RTM_DELARPRULE) { + if (!r) + return -ESRCH; + delp = rp; + goto dequeue; + } + + if (r) { + /* Existing rule */ + ret = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto out; + + if (n->nlmsg_flags&NLM_F_REPLACE) { + pref = r->at_pref; + prevp = delp = rp; + goto replace; + } + } + + if (n->nlmsg_flags&NLM_F_APPEND) { + if (r) { + pref = r->at_pref+1; + for (rp=&r->at_next; (r=*rp) != NULL; rp=&r->at_next) { + if (pref != r->at_pref) + break; + pref ++; + } + ret = -EBUSY; + if (!pref) + goto out; + } else if (am->arpm_pref) + pref = am->arpm_pref; + prevp = rp; + } + + if (!(n->nlmsg_flags&NLM_F_CREATE)) { + ret = -ENOENT; + if (n->nlmsg_flags&NLM_F_EXCL || r) + ret = 0; + goto out; + } + + if (!(n->nlmsg_flags&NLM_F_APPEND)) { + if (!prevp) { + ret = -EBUSY; + if (r || *rp || + (!am->arpm_pref && arp_tabs[am->arpm_table])) + goto out; + prevp = rp; + pref = am->arpm_pref? 
: 99; + } else { + if (r || !am->arpm_pref) { + pref = (*prevp)->at_pref - 1; + if (am->arpm_pref && am->arpm_pref < pref) + pref = am->arpm_pref; + } else { + prevp = rp; + pref = am->arpm_pref; + } + } + } + +replace: + + ret = -ENOMEM; + r = kmem_cache_alloc(arpf_cachep, GFP_KERNEL); + if (!r) + return ret; + memset(r, 0, sizeof(*r)); + + arpf_get_str(ARPA_LLFROM, rta, r->at_llfrom, MAX_ADDR_LEN, + &r->at_llfrom_len); + arpf_get_str(ARPA_LLTO, rta, r->at_llto, MAX_ADDR_LEN, + &r->at_llto_len); + arpf_get_str(ARPA_LLSRC, rta, r->at_llsrc, MAX_ADDR_LEN, + &r->at_llsrc_len); + arpf_get_str(ARPA_LLDST, rta, r->at_lldst, MAX_ADDR_LEN, + &r->at_lldst_len); + + if (delp) + r->at_next = (*delp)->at_next; + else if (*prevp) + r->at_next = *prevp; + + r->at_pref = pref; + r->at_from_len = am->arpm_from_len; + r->at_from_mask = inet_make_mask(r->at_from_len); + if (rta[ARPA_FROM-1]) + r->at_from = *(u32*) RTA_DATA(rta[ARPA_FROM-1]); + r->at_from &= r->at_from_mask; + r->at_to_len = am->arpm_to_len; + r->at_to_mask = inet_make_mask(r->at_to_len); + if (rta[ARPA_TO-1]) + r->at_to = *(u32*) RTA_DATA(rta[ARPA_TO-1]); + r->at_to &= r->at_to_mask; + if (rta[ARPA_SRC-1]) + r->at_src = *(u32*) RTA_DATA(rta[ARPA_SRC-1]); + if (rta[ARPA_PACKETS-1]) { + u32 packets = *(u32*) RTA_DATA(rta[ARPA_PACKETS-1]); + atomic_set(&r->at_packets, packets); + } + atomic_set(&r->at_refcnt, 1); + r->at_flags = am->arpm_flags; + r->at_action = am->arpm_action; + + if (rta[ARPA_IIF-1] && (size = RTA_PAYLOAD(rta[ARPA_IIF-1]))) { + if (size >= sizeof(r->at_iif)) + size = sizeof(r->at_iif)-1; + memcpy(r->at_iif, RTA_DATA(rta[ARPA_IIF-1]), size); + r->at_iif_len = strlen(r->at_iif); + } + if (rta[ARPA_OIF-1] && (size = RTA_PAYLOAD(rta[ARPA_OIF-1]))) { + if (size >= sizeof(r->at_oif)) + size = sizeof(r->at_oif)-1; + memcpy(r->at_oif, RTA_DATA(rta[ARPA_OIF-1]), size); + r->at_oif_len = strlen(r->at_oif); + } + + newp = r; + +dequeue: + + if (delp) { + r = *delp; + write_lock_bh(&arpf_lock); + if (newp) { + 
if (!rta[ARPA_PACKETS-1]) + atomic_set(&newp->at_packets, + atomic_read(&r->at_packets)); + *delp = newp; + } else { + *delp = r->at_next; + } + r->at_dead = 1; + write_unlock_bh(&arpf_lock); + arpmsg_notify(skb, n, am->arpm_table, r, RTM_DELARPRULE); + arpf_put(r); + prevp = 0; + } + + if (newp) { + if (prevp) { + write_lock_bh(&arpf_lock); + *prevp = newp; + write_unlock_bh(&arpf_lock); + } + arpmsg_notify(skb, n, am->arpm_table, newp, RTM_NEWARPRULE); + } + + ret = 0; + +out: + return ret; +} + +int arpf_dump_table(int t, struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, ret = -1; + struct arpf_node *afp; + int s_idx = cb->args[1]; + + for (idx=0, afp = arp_tabs[t]; afp; afp = afp->at_next, idx++) { + if (idx < s_idx) + continue; + if (arpf_fill_node(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWARPRULE, t, afp) < 0) + goto out; + } + + ret = skb->len; + +out: + cb->args[1] = idx; + + return ret; +} + +int arpf_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->args[0]; + + read_lock_bh(&arpf_lock); + for (idx = 0; idx < sizeof(arp_tabs)/sizeof(arp_tabs[0]); idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + memset(&cb->args[1], 0, sizeof(cb->args)-1*sizeof(cb->args[0])); + if (arpf_dump_table(idx, skb, cb) < 0) + break; + } + read_unlock_bh(&arpf_lock); + cb->args[0] = idx; + + return skb->len; +} + /* * Called once on startup. 
*/ @@ -1271,6 +1894,16 @@ static int arp_proc_init(void); void __init arp_init(void) { + arpf_cachep = kmem_cache_create("ip_arpf_cache", + sizeof(struct arpf_node), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!arpf_cachep) + panic("IP: failed to allocate ip_arpf_cache\n"); + + rtnl_register(PF_UNSPEC, RTM_NEWARPRULE, arpf_rule_ctl, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELARPRULE, arpf_rule_ctl, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETARPRULE, NULL, arpf_dump_rules, NULL); + neigh_table_init(&arp_tbl); dev_add_pack(&arp_packet_type); diff -urp v3.4/linux/net/ipv4/devinet.c linux/net/ipv4/devinet.c --- v3.4/linux/net/ipv4/devinet.c 2012-05-21 23:04:39.000000000 +0300 +++ linux/net/ipv4/devinet.c 2012-05-21 23:32:17.766747678 +0300 @@ -997,7 +997,8 @@ no_in_dev: continue; for_primary_ifa(in_dev) { - if (ifa->ifa_scope != RT_SCOPE_LINK && + if (!IN_DEV_HIDDEN(in_dev) && + ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) { addr = ifa->ifa_local; goto out_unlock; @@ -1601,14 +1602,18 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), + DEVINET_SYSCTL_RW_ENTRY(FORWARD_SHARED, "forward_shared"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), + DEVINET_SYSCTL_RW_ENTRY(RP_FILTER_MASK, "rp_filter_mask"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"), DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"), + DEVINET_SYSCTL_RW_ENTRY(HIDDEN, "hidden"), DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"), + DEVINET_SYSCTL_RW_ENTRY(LOOP, "loop"), DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), diff -urp v3.4/linux/net/ipv4/fib_frontend.c 
linux/net/ipv4/fib_frontend.c --- v3.4/linux/net/ipv4/fib_frontend.c 2012-05-21 23:04:39.000000000 +0300 +++ linux/net/ipv4/fib_frontend.c 2012-05-21 23:32:17.770747678 +0300 @@ -47,6 +47,8 @@ #ifndef CONFIG_IP_MULTIPLE_TABLES +#define FIB_RES_TABLE(r) (RT_TABLE_MAIN) + static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; @@ -71,6 +73,8 @@ fail: } #else +#define FIB_RES_TABLE(r) (fib_result_table(r)) + struct fib_table *fib_new_table(struct net *net, u32 id) { struct fib_table *tb; @@ -190,14 +194,20 @@ EXPORT_SYMBOL(inet_dev_addr_type); */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, __be32 *spec_dst, - u32 *itag) + u32 *itag, int our) { struct in_device *in_dev; struct flowi4 fl4; struct fib_result res; + int table; + unsigned char prefixlen; + unsigned char scope; int no_addr, rpf, accept_local; bool dev_match; + unsigned rpf_mask = 0; int ret; + int fwdsh = 0; + int loop = 0; struct net *net; fl4.flowi4_oif = 0; @@ -206,6 +216,7 @@ int fib_validate_source(struct sk_buff * fl4.saddr = dst; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.fl4_gw = 0; no_addr = rpf = accept_local = 0; in_dev = __in_dev_get_rcu(dev); @@ -217,6 +228,9 @@ int fib_validate_source(struct sk_buff * accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? 
skb->mark : 0; + fwdsh = IN_DEV_FORWARD_SHARED(in_dev); + rpf_mask = IN_DEV_RPFILTER_MASK(in_dev); + loop = IN_DEV_LOOP(in_dev); } if (in_dev == NULL) @@ -225,6 +239,17 @@ int fib_validate_source(struct sk_buff * net = dev_net(dev); if (fib_lookup(net, &fl4, &res)) goto last_resort; + if (loop && res.type == RTN_LOCAL) { + *spec_dst = FIB_RES_PREFSRC(net, res); + return 0; + } + if (fwdsh) { + fwdsh = (res.type == RTN_LOCAL && !our); + if (fwdsh) { + rpf = 0; + accept_local = 1; + } + } if (res.type != RTN_UNICAST) { if (res.type != RTN_LOCAL || !accept_local) goto e_inval; @@ -250,19 +275,37 @@ int fib_validate_source(struct sk_buff * ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; return ret; } + if (rpf_mask && rpf) { + int omi = 0; + + in_dev = __in_dev_get_rcu(FIB_RES_DEV(res)); + if (in_dev) + omi = IN_DEV_MEDIUM_ID(in_dev); + if (omi >= 1 && omi <= 31 && ((1 << omi) & rpf_mask)) + rpf = 0; + } if (no_addr) goto last_resort; - if (rpf == 1) - goto e_rpf; + table = FIB_RES_TABLE(&res); + prefixlen = res.prefixlen; + scope = res.scope; fl4.flowi4_oif = dev->ifindex; + if (fwdsh) + fl4.flowi4_iif = net->loopback_dev->ifindex; ret = 0; if (fib_lookup(net, &fl4, &res) == 0) { - if (res.type == RTN_UNICAST) { + if (res.type == RTN_UNICAST && + ((table == FIB_RES_TABLE(&res) && + res.prefixlen >= prefixlen && res.scope >= scope) || + !rpf)) { *spec_dst = FIB_RES_PREFSRC(net, res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + return ret; } } + if (rpf == 1) + goto e_rpf; return ret; last_resort: @@ -966,9 +1009,7 @@ static int fib_inetaddr_event(struct not switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); -#ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); -#endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev), -1); break; @@ -1007,9 +1048,7 @@ static int fib_netdev_event(struct notif for_ifa(in_dev) { fib_add_ifaddr(ifa); } endfor_ifa(in_dev); -#ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); -#endif 
atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev), -1); break; diff -urp v3.4/linux/net/ipv4/fib_lookup.h linux/net/ipv4/fib_lookup.h --- v3.4/linux/net/ipv4/fib_lookup.h 2011-05-20 10:38:08.000000000 +0300 +++ linux/net/ipv4/fib_lookup.h 2012-05-21 23:32:17.770747678 +0300 @@ -8,6 +8,7 @@ struct fib_alias { struct list_head fa_list; struct fib_info *fa_info; + int fa_last_dflt; u8 fa_tos; u8 fa_type; u8 fa_state; @@ -38,7 +39,8 @@ extern struct fib_alias *fib_find_alias( u8 tos, u32 prio); extern int fib_detect_death(struct fib_info *fi, int order, struct fib_info **last_resort, - int *last_idx, int dflt); + int *last_idx, int *dflt, int *last_nhsel, + const struct flowi4 *flp); static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) diff -urp v3.4/linux/net/ipv4/fib_rules.c linux/net/ipv4/fib_rules.c --- v3.4/linux/net/ipv4/fib_rules.c 2012-03-20 00:05:19.000000000 +0200 +++ linux/net/ipv4/fib_rules.c 2012-05-21 23:32:17.770747678 +0300 @@ -54,6 +54,11 @@ u32 fib_rules_tclass(const struct fib_re } #endif +int fib_result_table(struct fib_result *res) +{ + return res->r->table; +} + int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) { struct fib_lookup_arg arg = { diff -urp v3.4/linux/net/ipv4/fib_semantics.c linux/net/ipv4/fib_semantics.c --- v3.4/linux/net/ipv4/fib_semantics.c 2012-05-21 23:04:39.000000000 +0300 +++ linux/net/ipv4/fib_semantics.c 2012-05-21 23:32:17.774747679 +0300 @@ -50,6 +50,7 @@ static struct hlist_head *fib_info_hash; static struct hlist_head *fib_info_laddrhash; static unsigned int fib_info_hash_size; static unsigned int fib_info_cnt; +DEFINE_RWLOCK(fib_nhflags_lock); #define DEVINDEX_HASHBITS 8 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) @@ -198,7 +199,7 @@ static inline int nh_comp(const struct f #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif - ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) + ((nh->nh_flags ^ onh->nh_flags) & 
~RTNH_F_BADSTATE)) return -1; onh++; } endfor_nexthops(fi); @@ -250,7 +251,7 @@ static struct fib_info *fib_find_info(co nfi->fib_priority == fi->fib_priority && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && - ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && + ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_BADSTATE) == 0 && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; } @@ -361,26 +362,70 @@ struct fib_alias *fib_find_alias(struct } int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int dflt) + struct fib_info **last_resort, int *last_idx, int *dflt, + int *last_nhsel, const struct flowi4 *flp) { struct neighbour *n; - int state = NUD_NONE; + int nhsel; + int state; + struct fib_nh * nh; + __be32 dst; + int flag, dead = 1; + + /* change_nexthops(fi) { */ + for (nhsel = 0, nh = fi->fib_nh; nhsel < fi->fib_nhs; nh++, nhsel++) { + if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) + continue; + if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw && nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK) + continue; + if (nh->nh_flags & RTNH_F_DEAD) + continue; - n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); - if (n) { - state = n->nud_state; - neigh_release(n); - } - if (state == NUD_REACHABLE) - return 0; - if ((state & NUD_VALID) && order != dflt) - return 0; - if ((state & NUD_VALID) || - (*last_idx < 0 && order > dflt)) { - *last_resort = fi; - *last_idx = order; + flag = 0; + if (nh->nh_dev->flags & IFF_NOARP) { + dead = 0; + goto setfl; + } + + dst = nh->nh_gw; + if (!nh->nh_gw || nh->nh_scope != RT_SCOPE_LINK) + dst = flp->daddr; + + state = NUD_NONE; + n = neigh_lookup(&arp_tbl, &dst, nh->nh_dev); + if (n) { + state = n->nud_state; + neigh_release(n); + } + if (state == NUD_REACHABLE || + ((state & NUD_VALID) && order != *dflt)) { + dead = 0; + goto setfl; + } + if (!(state & NUD_VALID)) + flag = 1; + if (!dead) + goto setfl; + if ((state & NUD_VALID) || + 
(*last_idx < 0 && order >= *dflt)) { + *last_resort = fi; + *last_idx = order; + *last_nhsel = nhsel; + } + + setfl: + + read_lock_bh(&fib_nhflags_lock); + if (flag) + nh->nh_flags |= RTNH_F_SUSPECT; + else + nh->nh_flags &= ~RTNH_F_SUSPECT; + read_unlock_bh(&fib_nhflags_lock); } - return 1; + /* } endfor_nexthops(fi) */ + + return dead; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -549,8 +594,11 @@ static int fib_check_nh(struct fib_confi dev = __dev_get_by_index(net, nh->nh_oif); if (!dev) return -ENODEV; - if (!(dev->flags & IFF_UP)) - return -ENETDOWN; + if (!(dev->flags & IFF_UP)) { + if (fi->fib_protocol != RTPROT_STATIC) + return -ENETDOWN; + nh->nh_flags |= RTNH_F_DEAD; + } nh->nh_dev = dev; dev_hold(dev); nh->nh_scope = RT_SCOPE_LINK; @@ -568,21 +616,41 @@ static int fib_check_nh(struct fib_confi if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; err = fib_lookup(net, &fl4, &res); - if (err) { - rcu_read_unlock(); - return err; + } + if (err) { + struct in_device *in_dev; + + if (err != -ENETUNREACH || + fi->fib_protocol != RTPROT_STATIC) + goto out; + + in_dev = inetdev_by_index(net, nh->nh_oif); + if (in_dev == NULL || + in_dev->dev->flags & IFF_UP) + goto out; + nh->nh_flags |= RTNH_F_DEAD; + nh->nh_scope = RT_SCOPE_LINK; + nh->nh_dev = in_dev->dev; + dev_hold(nh->nh_dev); + } else { + err = -EINVAL; + if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) + goto out; + nh->nh_scope = res.scope; + nh->nh_oif = FIB_RES_OIF(res); + nh->nh_dev = dev = FIB_RES_DEV(res); + if (!dev) + goto out; + dev_hold(dev); + if (!(nh->nh_dev->flags & IFF_UP)) { + if (fi->fib_protocol != RTPROT_STATIC) { + err = -ENETDOWN; + goto out; + } + nh->nh_flags |= RTNH_F_DEAD; } + err = 0; } - err = -EINVAL; - if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) - goto out; - nh->nh_scope = res.scope; - nh->nh_oif = FIB_RES_OIF(res); - nh->nh_dev = dev = FIB_RES_DEV(res); - if (!dev) - goto out; - dev_hold(dev); - err = (dev->flags & IFF_UP) ? 
0 : -ENETDOWN; } else { struct in_device *in_dev; @@ -595,8 +663,11 @@ static int fib_check_nh(struct fib_confi if (in_dev == NULL) goto out; err = -ENETDOWN; - if (!(in_dev->dev->flags & IFF_UP)) - goto out; + if (!(in_dev->dev->flags & IFF_UP)) { + if (fi->fib_protocol != RTPROT_STATIC) + goto out; + nh->nh_flags |= RTNH_F_DEAD; + } nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; @@ -1049,18 +1120,29 @@ int fib_sync_down_dev(struct net_device prev_fi = fi; dead = 0; change_nexthops(fi) { - if (nexthop_nh->nh_flags & RTNH_F_DEAD) - dead++; - else if (nexthop_nh->nh_dev == dev && - nexthop_nh->nh_scope != scope) { - nexthop_nh->nh_flags |= RTNH_F_DEAD; + if (nexthop_nh->nh_flags & RTNH_F_DEAD) { + if (fi->fib_protocol != RTPROT_STATIC || + nexthop_nh->nh_dev == NULL || + __in_dev_get_rtnl(nexthop_nh->nh_dev) == NULL || + nexthop_nh->nh_dev->flags&IFF_UP) + dead++; + } else if (nexthop_nh->nh_dev == dev && + nexthop_nh->nh_scope != scope) { + write_lock_bh(&fib_nhflags_lock); #ifdef CONFIG_IP_ROUTE_MULTIPATH - spin_lock_bh(&fib_multipath_lock); + spin_lock(&fib_multipath_lock); + nexthop_nh->nh_flags |= RTNH_F_DEAD; fi->fib_power -= nexthop_nh->nh_power; nexthop_nh->nh_power = 0; - spin_unlock_bh(&fib_multipath_lock); + spin_unlock(&fib_multipath_lock); +#else + nexthop_nh->nh_flags |= RTNH_F_DEAD; #endif - dead++; + write_unlock_bh(&fib_nhflags_lock); + if (fi->fib_protocol!=RTPROT_STATIC || + force || + __in_dev_get_rtnl(dev) == NULL) + dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (force > 1 && nexthop_nh->nh_dev == dev) { @@ -1079,12 +1161,12 @@ int fib_sync_down_dev(struct net_device } /* Must be invoked inside of an RCU protected region. 
*/ -void fib_select_default(struct fib_result *res) +void fib_select_default(const struct flowi4 *flp, struct fib_result *res) { struct fib_info *fi = NULL, *last_resort = NULL; struct list_head *fa_head = res->fa_head; - struct fib_table *tb = res->table; - int order = -1, last_idx = -1; + int order = -1, last_idx = -1, last_dflt = -2, last_nhsel = 0; + struct fib_alias *first_fa = NULL; struct fib_alias *fa; list_for_each_entry_rcu(fa, fa_head, fa_list) { @@ -1094,21 +1176,21 @@ void fib_select_default(struct fib_resul fa->fa_type != RTN_UNICAST) continue; + if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + continue; if (next_fi->fib_priority > res->fi->fib_priority) break; - if (!next_fi->fib_nh[0].nh_gw || - next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) - continue; fib_alias_accessed(fa); - if (fi == NULL) { - if (next_fi != res->fi) - break; - } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, tb->tb_default)) { + if (!first_fa) { + last_dflt = fa->fa_last_dflt; + first_fa = fa; + } + if (fi && !fib_detect_death(fi, order, &last_resort, + &last_idx, &last_dflt, &last_nhsel, flp)) { fib_result_assign(res, fi); - tb->tb_default = order; + first_fa->fa_last_dflt = order; goto out; } fi = next_fi; @@ -1116,29 +1198,38 @@ void fib_select_default(struct fib_resul } if (order <= 0 || fi == NULL) { - tb->tb_default = -1; + if (fi && fi->fib_nhs > 1 && + fib_detect_death(fi, order, &last_resort, &last_idx, + &last_dflt, &last_nhsel, flp) && + last_resort == fi) { + read_lock_bh(&fib_nhflags_lock); + fi->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; + read_unlock_bh(&fib_nhflags_lock); + } + if (first_fa) first_fa->fa_last_dflt = -1; goto out; } if (!fib_detect_death(fi, order, &last_resort, &last_idx, - tb->tb_default)) { + &last_dflt, &last_nhsel, flp)) { fib_result_assign(res, fi); - tb->tb_default = order; + first_fa->fa_last_dflt = order; goto out; } - if (last_idx >= 0) + if (last_idx >= 0) { fib_result_assign(res, last_resort); - tb->tb_default 
= last_idx; + read_lock_bh(&fib_nhflags_lock); + last_resort->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; + read_unlock_bh(&fib_nhflags_lock); + first_fa->fa_last_dflt = last_idx; + } out: return; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - /* - * Dead device goes up. We wake up dead nexthops. - * It takes sense only on multipath routes. ++ Dead device goes up or new address is added. We wake up dead nexthops. */ int fib_sync_up(struct net_device *dev) { @@ -1147,8 +1238,10 @@ int fib_sync_up(struct net_device *dev) struct hlist_head *head; struct hlist_node *node; struct fib_nh *nh; - int ret; + struct fib_result res; + int ret, rep; +repeat: if (!(dev->flags & IFF_UP)) return 0; @@ -1156,6 +1249,7 @@ int fib_sync_up(struct net_device *dev) hash = fib_devindex_hashfn(dev->ifindex); head = &fib_info_devhash[hash]; ret = 0; + rep = 0; hlist_for_each_entry(nh, node, head, nh_hash) { struct fib_info *fi = nh->nh_parent; @@ -1168,21 +1262,44 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { - alive++; + if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) continue; - } if (nexthop_nh->nh_dev == NULL || !(nexthop_nh->nh_dev->flags & IFF_UP)) continue; if (nexthop_nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) continue; + if (nexthop_nh->nh_gw && fi->fib_protocol == RTPROT_STATIC) { + struct flowi4 fl4 = { + .daddr = nexthop_nh->nh_gw, + .flowi4_scope = nexthop_nh->nh_scope, + .flowi4_oif = nexthop_nh->nh_oif, + }; + + rcu_read_lock(); + if (fib_lookup(dev_net(dev), &fl4, &res) != 0) { + rcu_read_unlock(); + continue; + } + if (res.type != RTN_UNICAST && + res.type != RTN_LOCAL) { + rcu_read_unlock(); + continue; + } + nexthop_nh->nh_scope = res.scope; + rcu_read_unlock(); + rep = 1; + } alive++; +#ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); nexthop_nh->nh_power = 0; +#endif nexthop_nh->nh_flags &= ~RTNH_F_DEAD; +#ifdef CONFIG_IP_ROUTE_MULTIPATH 
spin_unlock_bh(&fib_multipath_lock); +#endif } endfor_nexthops(fi) if (alive > 0) { @@ -1190,35 +1307,61 @@ int fib_sync_up(struct net_device *dev) ret++; } } + if (rep) + goto repeat; return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* * The algorithm is suboptimal, but it provides really * fair weighted route distribution. */ -void fib_select_multipath(struct fib_result *res) +void fib_select_multipath(const struct flowi4 *flp, struct fib_result *res) { struct fib_info *fi = res->fi; - int w; + int w, alive; spin_lock_bh(&fib_multipath_lock); + if (flp->flowi4_oif) { + int sel = -1; + w = -1; + change_nexthops(fi) { + if (flp->flowi4_oif != nexthop_nh->nh_oif) + continue; + if (flp->fl4_gw && flp->fl4_gw != nexthop_nh->nh_gw && + nexthop_nh->nh_gw && + nexthop_nh->nh_scope == RT_SCOPE_LINK) + continue; + if (!(nexthop_nh->nh_flags & RTNH_F_BADSTATE)) { + if (nexthop_nh->nh_power > w) { + w = nexthop_nh->nh_power; + sel = nhsel; + } + } + } endfor_nexthops(fi); + if (sel >= 0) { + spin_unlock_bh(&fib_multipath_lock); + res->nh_sel = sel; + return; + } + goto last_resort; + } + +repeat: if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags & RTNH_F_BADSTATE)) { power += nexthop_nh->nh_weight; nexthop_nh->nh_power = nexthop_nh->nh_weight; } } endfor_nexthops(fi); fi->fib_power = power; - if (power <= 0) { - spin_unlock_bh(&fib_multipath_lock); - /* Race condition: route has just become dead. 
*/ - res->nh_sel = 0; - return; - } + if (power <= 0) + goto last_resort; } @@ -1228,8 +1371,9 @@ void fib_select_multipath(struct fib_res w = jiffies % fi->fib_power; + alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && + if (!(nexthop_nh->nh_flags & RTNH_F_BADSTATE) && nexthop_nh->nh_power) { w -= nexthop_nh->nh_power; if (w <= 0) { @@ -1239,11 +1383,29 @@ void fib_select_multipath(struct fib_res spin_unlock_bh(&fib_multipath_lock); return; } + alive = 1; + } + } endfor_nexthops(fi); + if (alive) { + fi->fib_power = 0; + goto repeat; + } + +last_resort: + for_nexthops(fi) { + if (!(nh->nh_flags & RTNH_F_DEAD)) { + if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) + continue; + if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw && + nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) + continue; + spin_unlock_bh(&fib_multipath_lock); + res->nh_sel = nhsel; + return; } } endfor_nexthops(fi); /* Race condition: route has just become dead. */ - res->nh_sel = 0; spin_unlock_bh(&fib_multipath_lock); } #endif diff -urp v3.4/linux/net/ipv4/fib_trie.c linux/net/ipv4/fib_trie.c --- v3.4/linux/net/ipv4/fib_trie.c 2012-05-21 23:04:39.000000000 +0300 +++ linux/net/ipv4/fib_trie.c 2012-05-21 23:32:17.778747680 +0300 @@ -1279,6 +1279,7 @@ int fib_table_insert(struct fib_table *t fi_drop = fa->fa_info; new_fa->fa_tos = fa->fa_tos; new_fa->fa_info = fi; + new_fa->fa_last_dflt = -1; new_fa->fa_type = cfg->fc_type; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; @@ -1317,6 +1318,7 @@ int fib_table_insert(struct fib_table *t new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; new_fa->fa_state = 0; + new_fa->fa_last_dflt = -1; /* * Insert new entry to the list. 
*/ @@ -1391,6 +1393,9 @@ static int check_leaf(struct fib_table * continue; if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) continue; + if (flp->fl4_gw && flp->fl4_gw != nh->nh_gw && + nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) + continue; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.semantic_match_passed++; diff -urp v3.4/linux/net/ipv4/netfilter/ipt_MASQUERADE.c linux/net/ipv4/netfilter/ipt_MASQUERADE.c --- v3.4/linux/net/ipv4/netfilter/ipt_MASQUERADE.c 2012-03-20 00:05:19.000000000 +0200 +++ linux/net/ipv4/netfilter/ipt_MASQUERADE.c 2012-05-21 23:32:17.778747680 +0300 @@ -51,7 +51,7 @@ masquerade_tg(struct sk_buff *skb, const enum ip_conntrack_info ctinfo; struct nf_nat_ipv4_range newrange; const struct nf_nat_ipv4_multi_range_compat *mr; - const struct rtable *rt; + struct rtable *rt; __be32 newsrc; NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); @@ -69,13 +69,27 @@ masquerade_tg(struct sk_buff *skb, const return NF_ACCEPT; mr = par->targinfo; - rt = skb_rtable(skb); - newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); - if (!newsrc) { - pr_info("%s ate my IP address\n", par->out->name); - return NF_DROP; + + { + struct flowi4 fl4 = { .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .flowi4_mark = skb->mark, + .flowi4_oif = par->out->ifindex, + .daddr = ip_hdr(skb)->daddr, + .fl4_gw = skb_rtable(skb)->rt_gateway }; + rt = ip_route_output_key(dev_net(par->out), &fl4); + if (IS_ERR(rt)) { + /* Funky routing can do this. */ + if (net_ratelimit()) + pr_info("%s:" + " No route: Rusty's brain broke!\n", + par->out->name); + return NF_DROP; + } } + newsrc = rt->rt_src; + ip_rt_put(rt); + nat->masq_index = par->out->ifindex; /* Transfer from original range. 
*/ diff -urp v3.4/linux/net/ipv4/netfilter/nf_nat_core.c linux/net/ipv4/netfilter/nf_nat_core.c --- v3.4/linux/net/ipv4/netfilter/nf_nat_core.c 2012-05-21 23:04:39.000000000 +0300 +++ linux/net/ipv4/netfilter/nf_nat_core.c 2012-05-21 23:32:17.778747680 +0300 @@ -691,6 +691,52 @@ static struct nf_ct_helper_expectfn foll .expectfn = nf_nat_follow_master, }; +unsigned int +ip_nat_route_input(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct iphdr *iph; + struct nf_conn *conn; + enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + unsigned long statusbit; + __be32 saddr; + + if (!(conn = nf_ct_get(skb, &ctinfo))) + return NF_ACCEPT; + + if (!(conn->status & IPS_NAT_DONE_MASK)) + return NF_ACCEPT; + dir = CTINFO2DIR(ctinfo); + statusbit = IPS_SRC_NAT; + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + if (!(conn->status & statusbit)) + return NF_ACCEPT; + + if (skb_dst(skb)) + return NF_ACCEPT; + + if (skb->len < sizeof(struct iphdr)) + return NF_ACCEPT; + + /* use daddr in other direction as masquerade address (lsrc) */ + iph = ip_hdr(skb); + saddr = conn->tuplehash[!dir].tuple.dst.u3.ip; + if (saddr == iph->saddr) + return NF_ACCEPT; + + if (ip_route_input_lookup(skb, iph->daddr, iph->saddr, iph->tos, + skb->dev, saddr)) + return NF_DROP; + + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(ip_nat_route_input); + static int __init nf_nat_init(void) { size_t i; diff -urp v3.4/linux/net/ipv4/netfilter/nf_nat_standalone.c linux/net/ipv4/netfilter/nf_nat_standalone.c --- v3.4/linux/net/ipv4/netfilter/nf_nat_standalone.c 2012-03-20 00:05:19.000000000 +0200 +++ linux/net/ipv4/netfilter/nf_nat_standalone.c 2012-05-21 23:32:17.782747681 +0300 @@ -250,6 +250,14 @@ static struct nf_hook_ops nf_nat_ops[] _ .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, + /* Before routing, route before mangling */ + { + .hook = ip_nat_route_input, + .owner = 
THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_LAST-1, + }, /* After packet filtering, change source */ { .hook = nf_nat_out, diff -urp v3.4/linux/net/ipv4/route.c linux/net/ipv4/route.c --- v3.4/linux/net/ipv4/route.c 2012-05-21 23:04:39.000000000 +0300 +++ linux/net/ipv4/route.c 2012-05-21 23:32:17.786747682 +0300 @@ -738,6 +738,8 @@ static inline int compare_keys(struct rt return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | (rt1->rt_mark ^ rt2->rt_mark) | + ((__force u32)rt1->rt_key_lsrc ^ (__force u32)rt2->rt_key_lsrc) | + ((__force u32)rt1->rt_key_gw ^ (__force u32)rt2->rt_key_gw) | (rt1->rt_key_tos ^ rt2->rt_key_tos) | (rt1->rt_route_iif ^ rt2->rt_route_iif) | (rt1->rt_oif ^ rt2->rt_oif)) == 0; @@ -1419,6 +1421,8 @@ static void check_peer_redir(struct dst_ rt->rt_gateway = orig_gw; return; } + if (rt->rt_key_gw) + rt->rt_key_gw = rt->rt_gateway; old_n = xchg(&rt->dst._neighbour, n); if (old_n) neigh_release(old_n); @@ -1962,6 +1966,8 @@ static void rt_init_metrics(struct rtabl if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) { rt->rt_gateway = peer->redirect_learned.a4; + if (rt->rt_key_gw) + rt->rt_key_gw = rt->rt_gateway; rt->rt_flags |= RTCF_REDIRECTED; } } else { @@ -2037,7 +2043,7 @@ static int ip_route_input_mc(struct sk_b spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, - &itag); + &itag, our); if (err < 0) goto e_err; } @@ -2053,6 +2059,8 @@ static int ip_route_input_mc(struct sk_b rth->rt_key_dst = daddr; rth->rt_key_src = saddr; + rth->rt_key_lsrc = 0; + rth->rt_key_gw = daddr; rth->rt_genid = rt_genid(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; @@ -2122,7 +2130,7 @@ static int __mkroute_input(struct sk_buf const struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 
saddr, u32 tos, - struct rtable **result) + __be32 lsrc, struct rtable **result) { struct rtable *rth; int err; @@ -2141,7 +2149,7 @@ static int __mkroute_input(struct sk_buf err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag); + in_dev->dev, &spec_dst, &itag, 0); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2153,6 +2161,7 @@ static int __mkroute_input(struct sk_buf flags |= RTCF_DIRECTSRC; if (out_dev == in_dev && err && + !lsrc && (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) flags |= RTCF_DOREDIRECT; @@ -2182,6 +2191,8 @@ static int __mkroute_input(struct sk_buf rth->rt_key_dst = daddr; rth->rt_key_src = saddr; + rth->rt_key_lsrc = lsrc; + rth->rt_key_gw = 0; rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); rth->rt_flags = flags; rth->rt_type = res->type; @@ -2211,21 +2222,23 @@ static int __mkroute_input(struct sk_buf static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, + struct net *net, const struct flowi4 *fl4, struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos) + __be32 daddr, __be32 saddr, u32 tos, __be32 lsrc) { struct rtable* rth = NULL; int err; unsigned hash; + fib_select_default(fl4, res); #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) - fib_select_multipath(res); + fib_select_multipath(fl4, res); #endif /* create a routing cache entry */ - err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); + err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, lsrc, &rth); if (err) return err; @@ -2250,7 +2263,7 @@ static int ip_mkroute_input(struct sk_bu */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) + u8 tos, struct net_device *dev, __be32 lsrc) { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -2288,22 +2301,32 @@ static int ip_route_input_slow(struct sk if 
(ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) goto martian_destination; + if (lsrc) { + if (ipv4_is_multicast(lsrc) || ipv4_is_lbcast(lsrc) || + ipv4_is_zeronet(lsrc) || ipv4_is_loopback(lsrc)) + goto e_inval; + } + /* * Now we are ready to route packet. */ fl4.flowi4_oif = 0; - fl4.flowi4_iif = dev->ifindex; + fl4.flowi4_iif = lsrc ? + dev_net(dev)->loopback_dev->ifindex : dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.daddr = daddr; - fl4.saddr = saddr; + fl4.saddr = lsrc? : saddr; + fl4.fl4_gw = 0; err = fib_lookup(net, &fl4, &res); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; goto no_route; } + fl4.flowi4_iif = dev->ifindex; + fl4.saddr = saddr; RT_CACHE_STAT_INC(in_slow_tot); @@ -2313,7 +2336,7 @@ static int ip_route_input_slow(struct sk if (res.type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, net->loopback_dev->ifindex, - dev, &spec_dst, &itag); + dev, &spec_dst, &itag, 1); if (err < 0) goto martian_source_keep_err; if (err) @@ -2327,18 +2350,21 @@ static int ip_route_input_slow(struct sk if (res.type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); + err = ip_mkroute_input(skb, &res, net, &fl4, in_dev, daddr, saddr, + tos, lsrc); out: return err; brd_input: if (skb->protocol != htons(ETH_P_IP)) goto e_inval; + if (lsrc) + goto e_inval; if (ipv4_is_zeronet(saddr)) spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, - &itag); + &itag, 1); if (err < 0) goto martian_source_keep_err; if (err) @@ -2362,6 +2388,8 @@ local_input: rth->rt_key_dst = daddr; rth->rt_key_src = saddr; + rth->rt_key_lsrc = 0; + rth->rt_key_gw = 0; rth->rt_genid = rt_genid(net); rth->rt_flags = flags|RTCF_LOCAL; rth->rt_type = res.type; @@ -2430,8 +2458,9 @@ martian_source_keep_err: goto out; } -int ip_route_input_common(struct sk_buff *skb, __be32 
daddr, __be32 saddr, - u8 tos, struct net_device *dev, bool noref) +int ip_route_input_cached(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, bool noref, + __be32 lsrc) { struct rtable * rth; unsigned hash; @@ -2454,6 +2483,7 @@ int ip_route_input_common(struct sk_buff if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | (rth->rt_route_iif ^ iif) | + (rth->rt_key_lsrc ^ lsrc) | (rth->rt_key_tos ^ tos)) == 0 && rth->rt_mark == skb->mark && net_eq(dev_net(rth->dst.dev), net) && @@ -2507,12 +2537,25 @@ skip_cache: rcu_read_unlock(); return -EINVAL; } - res = ip_route_input_slow(skb, daddr, saddr, tos, dev); + res = ip_route_input_slow(skb, daddr, saddr, tos, dev, lsrc); rcu_read_unlock(); return res; } + +int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, bool noref) +{ + return ip_route_input_cached(skb, daddr, saddr, tos, dev, noref, 0); +} EXPORT_SYMBOL(ip_route_input_common); +int ip_route_input_lookup(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, __be32 lsrc) +{ + return ip_route_input_cached(skb, daddr, saddr, tos, dev, true, lsrc); +} +EXPORT_SYMBOL(ip_route_input_lookup); + /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, const struct flowi4 *fl4, @@ -2569,6 +2612,8 @@ static struct rtable *__mkroute_output(c rth->rt_key_dst = orig_daddr; rth->rt_key_src = orig_saddr; + rth->rt_key_lsrc = 0; + rth->rt_key_gw = fl4->fl4_gw; rth->rt_genid = rt_genid(dev_net(dev_out)); rth->rt_flags = flags; rth->rt_type = type; @@ -2729,6 +2774,7 @@ static struct rtable *ip_route_output_sl fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; fl4->flowi4_oif = net->loopback_dev->ifindex; + fl4->fl4_gw = 0; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; @@ -2766,6 +2812,27 @@ static struct rtable 
*ip_route_output_sl } if (res.type == RTN_LOCAL) { + struct in_device *in_dev; + __be32 src; + + dev_out = FIB_RES_DEV(res); + in_dev = __in_dev_get_rcu(dev_out); + src = fl4->saddr? : FIB_RES_PREFSRC(net, res); + if (in_dev && IN_DEV_LOOP(in_dev) && src) { + struct net_device *dev_src; + + dev_src = __ip_dev_find(net, src, false); + if (dev_src && dev_src != dev_out && + (in_dev = __in_dev_get_rcu(dev_src)) && + IN_DEV_LOOP(in_dev)) { + dev_out = dev_src; + fl4->saddr = src; + fl4->flowi4_oif = dev_out->ifindex; + res.type = RTN_UNICAST; + res.fi = NULL; + goto make_route; + } + } if (!fl4->saddr) { if (res.fi->fib_prefsrc) fl4->saddr = res.fi->fib_prefsrc; @@ -2774,20 +2841,18 @@ static struct rtable *ip_route_output_sl } dev_out = net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; + fl4->fl4_gw = 0; res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } + if (res.type == RTN_UNICAST) + fib_select_default(fl4, &res); #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) - fib_select_multipath(&res); - else + if (res.fi->fib_nhs > 1) + fib_select_multipath(fl4, &res); #endif - if (!res.prefixlen && - res.table->tb_num_default > 1 && - res.type == RTN_UNICAST && !fl4->flowi4_oif) - fib_select_default(&res); if (!fl4->saddr) fl4->saddr = FIB_RES_PREFSRC(net, res); @@ -2829,6 +2894,7 @@ struct rtable *__ip_route_output_key(str rth->rt_key_src == flp4->saddr && rt_is_output_route(rth) && rth->rt_oif == flp4->flowi4_oif && + rth->rt_key_gw == flp4->fl4_gw && rth->rt_mark == flp4->flowi4_mark && !((rth->rt_key_tos ^ flp4->flowi4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && @@ -2906,6 +2972,8 @@ struct dst_entry *ipv4_blackhole_route(s rt->rt_key_dst = ort->rt_key_dst; rt->rt_key_src = ort->rt_key_src; + rt->rt_key_lsrc = ort->rt_key_lsrc; + rt->rt_key_gw = ort->rt_key_gw ? ort->rt_gateway : 0; rt->rt_key_tos = ort->rt_key_tos; rt->rt_route_iif = ort->rt_route_iif; rt->rt_iif = ort->rt_iif;