From 511c3f92ad5b6d9f8f6464be1b4f85f0422be91a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 2 Jun 2009 05:14:27 +0000
Subject: net: skb->rtable accessor

Define skb_rtable(const struct sk_buff *skb) accessor to get rtable from skb

Delete skb->rtable field

Setting rtable is not allowed, just set dst instead as rtable is an alias.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

(limited to 'net/ipv4/route.c')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 28205e5bfa9b..f20060ac2f09 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1064,7 +1064,8 @@ work_done:
 out:	return 0;
 }
 
-static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
+static int rt_intern_hash(unsigned hash, struct rtable *rt,
+			  struct rtable **rp, struct sk_buff *skb)
 {
 	struct rtable	*rth, **rthp;
 	unsigned long	now;
@@ -1114,7 +1115,10 @@ restart:
 			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
-			*rp = rth;
+			if (rp)
+				*rp = rth;
+			else
+				skb->dst = &rth->u.dst;
 			return 0;
 		}
 
@@ -1210,7 +1214,10 @@ restart:
 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
 
 	spin_unlock_bh(rt_hash_lock_addr(hash));
-	*rp = rt;
+	if (rp)
+		*rp = rt;
+	else
+		skb->dst = &rt->u.dst;
 	return 0;
 }
 
@@ -1407,7 +1414,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 							&netevent);
 
 				rt_del(hash, rth);
-				if (!rt_intern_hash(hash, rt, &rt))
+				if (!rt_intern_hash(hash, rt, &rt, NULL))
 					ip_rt_put(rt);
 				goto do_next;
 			}
@@ -1473,7 +1480,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 
 void ip_rt_send_redirect(struct sk_buff *skb)
 {
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
 
 	if (!in_dev)
@@ -1521,7 +1528,7 @@ out:
 
 static int ip_error(struct sk_buff *skb)
 {
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	unsigned long now;
 	int code;
 
@@ -1698,7 +1705,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
 
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 
-	rt = skb->rtable;
+	rt = skb_rtable(skb);
 	if (rt)
 		dst_set_expires(&rt->u.dst, 0);
 }
@@ -1858,7 +1865,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
 	in_dev_put(in_dev);
 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-	return rt_intern_hash(hash, rth, &skb->rtable);
+	return rt_intern_hash(hash, rth, NULL, skb);
 
 e_nobufs:
 	in_dev_put(in_dev);
@@ -2019,7 +2026,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 	/* put it into the cache */
 	hash = rt_hash(daddr, saddr, fl->iif,
 		       rt_genid(dev_net(rth->u.dst.dev)));
-	return rt_intern_hash(hash, rth, &skb->rtable);
+	return rt_intern_hash(hash, rth, NULL, skb);
 }
 
 /*
@@ -2175,7 +2182,7 @@ local_input:
 	}
 	rth->rt_type	= res.type;
 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
-	err = rt_intern_hash(hash, rth, &skb->rtable);
+	err = rt_intern_hash(hash, rth, NULL, skb);
 	goto done;
 
 no_route:
@@ -2244,7 +2251,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			dst_use(&rth->u.dst, jiffies);
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
-			skb->rtable = rth;
+			skb->dst = &rth->u.dst;
 			return 0;
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
@@ -2420,7 +2427,7 @@ static int ip_mkroute_output(struct rtable **rp,
 	if (err == 0) {
 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
 			       rt_genid(dev_net(dev_out)));
-		err = rt_intern_hash(hash, rth, rp);
+		err = rt_intern_hash(hash, rth, rp, NULL);
 	}
 
 	return err;
@@ -2763,7 +2770,7 @@ static int rt_fill_info(struct net *net,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
 			int nowait, unsigned int flags)
 {
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
 	long expires;
@@ -2907,7 +2914,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
 		local_bh_enable();
 
-		rt = skb->rtable;
+		rt = skb_rtable(skb);
 		if (err == 0 && rt->u.dst.error)
 			err = -rt->u.dst.error;
 	} else {
@@ -2927,7 +2934,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	if (err)
 		goto errout_free;
 
-	skb->rtable = rt;
+	skb->dst = &rt->u.dst;
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-- 
cgit v1.2.3


From adf30907d63893e4208dfe3f5c88ae12bc2f25d5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 2 Jun 2009 05:19:30 +0000
Subject: net: skb->dst accessors

Define three accessors to get/set dst attached to a skb

struct dst_entry *skb_dst(const struct sk_buff *skb)

void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)

void skb_dst_drop(struct sk_buff *skb)
This one should replace occurrences of :
dst_release(skb->dst)
skb->dst = NULL;

Delete skb->dst field

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net/ipv4/route.c')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f20060ac2f09..a849bb15d864 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1118,7 +1118,7 @@ restart:
 			if (rp)
 				*rp = rth;
 			else
-				skb->dst = &rth->u.dst;
+				skb_dst_set(skb, &rth->u.dst);
 			return 0;
 		}
 
@@ -1217,7 +1217,7 @@ restart:
 	if (rp)
 		*rp = rt;
 	else
-		skb->dst = &rt->u.dst;
+		skb_dst_set(skb, &rt->u.dst);
 	return 0;
 }
 
@@ -2251,7 +2251,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			dst_use(&rth->u.dst, jiffies);
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
-			skb->dst = &rth->u.dst;
+			skb_dst_set(skb, &rth->u.dst);
 			return 0;
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
@@ -2934,7 +2934,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	if (err)
 		goto errout_free;
 
-	skb->dst = &rt->u.dst;
+	skb_dst_set(skb, &rt->u.dst);
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
@@ -2975,15 +2975,15 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 				continue;
 			if (rt_is_expired(rt))
 				continue;
-			skb->dst = dst_clone(&rt->u.dst);
+			skb_dst_set(skb, dst_clone(&rt->u.dst));
 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 					 1, NLM_F_MULTI) <= 0) {
-				dst_release(xchg(&skb->dst, NULL));
+				skb_dst_drop(skb);
 				rcu_read_unlock_bh();
 				goto done;
 			}
-			dst_release(xchg(&skb->dst, NULL));
+			skb_dst_drop(skb);
 		}
 		rcu_read_unlock_bh();
 	}
-- 
cgit v1.2.3


From 125bb8f5637bd653244728f734bcac218986d910 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 11 Jun 2009 20:10:07 +0000
Subject: net: use a deferred timer in rt_check_expire

For the sake of power saver lovers, use a deferrable timer to fire
rt_check_expire()

As some big routers cache equilibrium depends on garbage collection
done in time, we take into account elapsed time between two
rt_check_expire() invocations to adjust the amount of slots we have to
check.

Based on an initial idea and patch from Tero Kristo

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Tero Kristo <tero.kristo@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'net/ipv4/route.c')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a849bb15d864..cd76b3cb7092 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -131,8 +131,8 @@ static int ip_rt_min_advmss __read_mostly	= 256;
 static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
 static int rt_chain_length_max __read_mostly	= 20;
 
-static void rt_worker_func(struct work_struct *work);
-static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
+static struct delayed_work expires_work;
+static unsigned long expires_ljiffies;
 
 /*
  *	Interface to generic destination cache.
@@ -787,9 +787,12 @@ static void rt_check_expire(void)
 	struct rtable *rth, *aux, **rthp;
 	unsigned long samples = 0;
 	unsigned long sum = 0, sum2 = 0;
+	unsigned long delta;
 	u64 mult;
 
-	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
+	delta = jiffies - expires_ljiffies;
+	expires_ljiffies = jiffies;
+	mult = ((u64)delta) << rt_hash_log;
 	if (ip_rt_gc_timeout > 1)
 		do_div(mult, ip_rt_gc_timeout);
 	goal = (unsigned int)mult;
@@ -3397,6 +3400,8 @@ int __init ip_rt_init(void)
 	/* All the timers, started at system startup tend
 	   to synchronize. Perturb it a bit.
 	 */
+	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
+	expires_ljiffies = jiffies;
 	schedule_delayed_work(&expires_work,
 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
 
-- 
cgit v1.2.3


From 73e42897e8e5619eacb787d2ce69be12f47cfc21 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Sat, 20 Jun 2009 01:15:16 -0700
Subject: ipv4: fix NULL pointer + success return in route lookup path

Don't drop route if we're not caching

	I recently got a report of an oops on a route lookup.  Maxime was
testing what would happen if route caching was turned off (doing so by setting
making rt_caching always return 0), and found that it triggered an oops.  I
looked at it and found that the problem stemmed from the fact that the route
lookup routines were returning success from their lookup paths (which is good),
but never set the **rp pointer to anything (which is bad).  This happens because
in rt_intern_hash, if rt_caching returns false, we call rt_drop and return 0.
This almost emulates slient success.  What we should be doing is assigning *rp =
rt and _not_ dropping the route.  This way, during slow path lookups, when we
create a new route cache entry, we don't immediately discard it, rather we just
don't add it into the cache hash table, but we let this one lookup use it for
the purpose of this route request.  Maxime has tested and reports it prevents
the oops.  There is still a subsequent routing issue that I'm looking into
further, but I'm confident that, even if its related to this same path, this
patch makes sense to take.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'net/ipv4/route.c')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index cd76b3cb7092..65b3a8b11a6c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1085,8 +1085,16 @@ restart:
 	now = jiffies;
 
 	if (!rt_caching(dev_net(rt->u.dst.dev))) {
-		rt_drop(rt);
-		return 0;
+		/*
+		 * If we're not caching, just tell the caller we
+		 * were successful and don't touch the route.  The
+		 * caller hold the sole reference to the cache entry, and
+		 * it will be released when the caller is done with it.
+		 * If we drop it here, the callers have no way to resolve routes
+		 * when we're not caching.  Instead, just point *rp at rt, so
+		 * the caller gets a single use out of the route
+		 */
+		goto report_and_exit;
 	}
 
 	rthp = &rt_hash_table[hash].chain;
@@ -1217,6 +1225,8 @@ restart:
 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
 
 	spin_unlock_bh(rt_hash_lock_addr(hash));
+
+report_and_exit:
 	if (rp)
 		*rp = rt;
 	else
-- 
cgit v1.2.3


From b6280b47a7a42970d098a3059f4ebe7e55e90d8d Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 22 Jun 2009 10:18:53 +0000
Subject: ipv4 routing: Ensure that route cache entries are usable and
 reclaimable with caching is off

When route caching is disabled (rt_caching returns false), We still use route
cache entries that are created and passed into rt_intern_hash once.  These
routes need to be made usable for the one call path that holds a reference to
them, and they need to be reclaimed when they're finished with their use.  To be
made usable, they need to be associated with a neighbor table entry (which they
currently are not), otherwise iproute_finish2 just discards the packet, since we
don't know which L2 peer to send the packet to.  To do this binding, we need to
follow the path a bit higher up in rt_intern_hash, which calls
arp_bind_neighbour, but not assign the route entry to the hash table.
Currently, if caching is off, we simply assign the route to the rp pointer and
are reutrn success.  This patch associates us with a neighbor entry first.

Secondly, we need to make sure that any single use routes like this are known to
the garbage collector when caching is off.  If caching is off, and we try to
hash in a route, it will leak when its refcount reaches zero.  To avoid this,
this patch calls rt_free on the route cache entry passed into rt_intern_hash.
This places us on the gc list for the route cache garbage collector, so that
when its refcount reaches zero, it will be reclaimed (Thanks to Alexey for this
suggestion).

I've tested this on a local system here, and with these patches in place, I'm
able to maintain routed connectivity to remote systems, even if I set
/proc/sys/net/ipv4/rt_cache_rebuild_count to -1, which forces rt_caching to
return false.

Signed-off-by: Neil Horman <nhorman@redhat.com>
Reported-by: Jarek Poplawski <jarkao2@gmail.com>
Reported-by: Maxime Bizon <mbizon@freebox.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

(limited to 'net/ipv4/route.c')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 65b3a8b11a6c..278f46f5011b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1093,8 +1093,27 @@ restart:
 		 * If we drop it here, the callers have no way to resolve routes
 		 * when we're not caching.  Instead, just point *rp at rt, so
 		 * the caller gets a single use out of the route
+		 * Note that we do rt_free on this new route entry, so that
+		 * once its refcount hits zero, we are still able to reap it
+		 * (Thanks Alexey)
+		 * Note also the rt_free uses call_rcu.  We don't actually
+		 * need rcu protection here, this is just our path to get
+		 * on the route gc list.
 		 */
-		goto report_and_exit;
+
+		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
+			int err = arp_bind_neighbour(&rt->u.dst);
+			if (err) {
+				if (net_ratelimit())
+					printk(KERN_WARNING
+					    "Neighbour table failure & not caching routes.\n");
+				rt_drop(rt);
+				return err;
+			}
+		}
+
+		rt_free(rt);
+		goto skip_hashing;
 	}
 
 	rthp = &rt_hash_table[hash].chain;
@@ -1211,7 +1230,8 @@ restart:
 #if RT_CACHE_DEBUG >= 2
 	if (rt->u.dst.rt_next) {
 		struct rtable *trt;
-		printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
+		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
+		       hash, &rt->rt_dst);
 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
 			printk(" . %pI4", &trt->rt_dst);
 		printk("\n");
@@ -1226,7 +1246,7 @@ restart:
 
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 
-report_and_exit:
+skip_hashing:
 	if (rp)
 		*rp = rt;
 	else
-- 
cgit v1.2.3