From ebe0bbf06c9e03613bdcb6b5a704595a9344b7ff Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Mon, 10 Oct 2005 20:52:19 -0700
Subject: [NETFILTER] nfnetlink: use highest bit of nfa_type to indicate nested
 TLV

As Henrik Nordstrom pointed out, all our efforts with "split endian" (i.e.
host byte order tags, net byte order values) are useless, unless a parser
can determine whether an attribute is nested or not.

This patch steals the highest bit of nfattr.nfa_type to indicate whether
the data payload contains a nested nfattr (1) or not (0).

This will break userspace compatibility, but luckily no kernel with
nfnetlink was released so far.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 1d5b10ae2399..f08e870100f4 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -41,11 +41,15 @@ enum nfnetlink_groups {
 struct nfattr
 {
 	u_int16_t nfa_len;
-	u_int16_t nfa_type;
+	u_int16_t nfa_type;	/* we use 15 bits for the type, and the highest
+				 * bit to indicate whether the payload is nested */
 } __attribute__ ((packed));
 
-/* FIXME: Shamelessly copy and pasted from rtnetlink.h, it's time
- * 	  to put this in a generic file */
+/* FIXME: Apart from NFNL_NFA_NESTED shamelessly copy and pasted from
+ * rtnetlink.h, it's time to put this in a generic file */
+
+#define NFNL_NFA_NEST	0x8000
+#define NFA_TYPE(attr) 	((attr)->nfa_type & 0x7fff)
 
 #define NFA_ALIGNTO     4
 #define NFA_ALIGN(len)	(((len) + NFA_ALIGNTO - 1) & ~(NFA_ALIGNTO - 1))
@@ -59,7 +63,7 @@ struct nfattr
 #define NFA_PAYLOAD(nfa) ((int)((nfa)->nfa_len) - NFA_LENGTH(0))
 #define NFA_NEST(skb, type) \
 ({	struct nfattr *__start = (struct nfattr *) (skb)->tail; \
-	NFA_PUT(skb, type, 0, NULL); \
+	NFA_PUT(skb, (NFNL_NFA_NEST | type), 0, NULL); \
 	__start;  })
 #define NFA_NEST_END(skb, start) \
 ({      (start)->nfa_len = ((skb)->tail - (unsigned char *) (start)); \
-- 
cgit v1.2.3


From b3a91d037a2575040f9b6a483f60c407a3d80368 Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Mon, 10 Oct 2005 20:52:36 -0700
Subject: [NETFILTER] nat: remove bogus structure member

When 'rustynat' was merged in 2.6.12, the use of the "helper" pointer of
struct ipt_nat_info was obsoleted, but the pointer not removed from the
struct.

This patch removes the pointer, thereby yet again shrinking struct
ip_conntrack.

Discovered-by: Rusty Russell <rusty@netfilter.org>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_nat.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_nat.h b/include/linux/netfilter_ipv4/ip_nat.h
index e201ec6e9905..41a107de17cf 100644
--- a/include/linux/netfilter_ipv4/ip_nat.h
+++ b/include/linux/netfilter_ipv4/ip_nat.h
@@ -58,10 +58,6 @@ extern rwlock_t ip_nat_lock;
 struct ip_nat_info
 {
 	struct list_head bysource;
-
-	/* Helper (NULL if none). */
-	struct ip_nat_helper *helper;
-
 	struct ip_nat_seq seq[IP_CT_DIR_MAX];
 };
 
-- 
cgit v1.2.3


From 5bbc243aafff9ad653dc7a9fa7bcaf0b4631355a Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Mon, 10 Oct 2005 20:54:01 -0700
Subject: [NETFILTER]: Add missing include to ip_conntrack_tuple.h

Without this #include, __be16 is not defined and userspace programs
will break.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack_tuple.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h
index 20e43f018b7c..3232db11a4e5 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_tuple.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_tuple.h
@@ -1,6 +1,8 @@
 #ifndef _IP_CONNTRACK_TUPLE_H
 #define _IP_CONNTRACK_TUPLE_H
 
+#include <linux/types.h>
+
 /* A `tuple' is a structure containing the information to uniquely
   identify a connection.  ie. if two packets have the same tuple, they
   are in the same connection; if not, they are not.
-- 
cgit v1.2.3


From e1c73b78e3706bd3c336d4730a01dd4081dfb7ee Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 10 Oct 2005 20:55:49 -0700
Subject: [NETFILTER] ctnetlink: add one nesting level for TCP state

To keep consistency, the TCP private protocol information is nested
attributes under CTA_PROTOINFO_TCP. This way the sequence of attributes to
access the TCP state information looks like here below:

CTA_PROTOINFO
CTA_PROTOINFO_TCP
CTA_PROTOINFO_TCP_STATE

instead of:

CTA_PROTOINFO
CTA_PROTOINFO_TCP_STATE

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink_conntrack.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index 5c55751c78e4..fb5511030185 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -70,11 +70,18 @@ enum ctattr_l4proto {
 
 enum ctattr_protoinfo {
 	CTA_PROTOINFO_UNSPEC,
-	CTA_PROTOINFO_TCP_STATE,
+	CTA_PROTOINFO_TCP,
 	__CTA_PROTOINFO_MAX
 };
 #define CTA_PROTOINFO_MAX (__CTA_PROTOINFO_MAX - 1)
 
+enum ctattr_protoinfo_tcp {
+	CTA_PROTOINFO_TCP_UNSPEC,
+	CTA_PROTOINFO_TCP_STATE,
+	__CTA_PROTOINFO_TCP_MAX
+};
+#define CTA_PROTOINFO_TCP_MAX (__CTA_PROTOINFO_TCP_MAX - 1)
+
 enum ctattr_counters {
 	CTA_COUNTERS_UNSPEC,
 	CTA_COUNTERS_PACKETS,
-- 
cgit v1.2.3


From a051a8f7306476af0a74370ad56e793cb6c43bf7 Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Mon, 10 Oct 2005 21:21:10 -0700
Subject: [NETFILTER]: Use only 32bit counters for CONNTRACK_ACCT

Initially we used 64bit counters for conntrack-based accounting, since we
had no event mechanism to tell userspace that our counters are about to
overflow.  With nfnetlink_conntrack, we now have such a event mechanism and
thus can save 16bytes per connection.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink_conntrack.h | 6 ++++--
 include/linux/netfilter_ipv4/ip_conntrack.h   | 8 ++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index fb5511030185..116fcaced909 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -84,8 +84,10 @@ enum ctattr_protoinfo_tcp {
 
 enum ctattr_counters {
 	CTA_COUNTERS_UNSPEC,
-	CTA_COUNTERS_PACKETS,
-	CTA_COUNTERS_BYTES,
+	CTA_COUNTERS_PACKETS,		/* old 64bit counters */
+	CTA_COUNTERS_BYTES,		/* old 64bit counters */
+	CTA_COUNTERS32_PACKETS,
+	CTA_COUNTERS32_BYTES,
 	__CTA_COUNTERS_MAX
 };
 #define CTA_COUNTERS_MAX (__CTA_COUNTERS_MAX - 1)
diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 4ced38736813..d078bb91d9e5 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -117,6 +117,10 @@ enum ip_conntrack_events
 	/* NAT info */
 	IPCT_NATINFO_BIT = 10,
 	IPCT_NATINFO = (1 << IPCT_NATINFO_BIT),
+
+	/* Counter highest bit has been set */
+	IPCT_COUNTER_FILLING_BIT = 11,
+	IPCT_COUNTER_FILLING = (1 << IPCT_COUNTER_FILLING_BIT),
 };
 
 enum ip_conntrack_expect_events {
@@ -192,8 +196,8 @@ do {									\
 
 struct ip_conntrack_counter
 {
-	u_int64_t packets;
-	u_int64_t bytes;
+	u_int32_t packets;
+	u_int32_t bytes;
 };
 
 struct ip_conntrack_helper;
-- 
cgit v1.2.3


From 339231537506846cb232a2f0cc4a2c662b2d5b07 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 10 Oct 2005 21:23:28 -0700
Subject: [NETFILTER] ctnetlink: allow userspace to change TCP state

This patch adds the ability of changing the state a TCP connection. I know
that this must be used with care but it's required to provide a complete
conntrack creation via conntrack_netlink. So I'll document this aspect on
the upcoming docs.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack_protocol.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h
index b6b99be8632a..2c76b879e3dc 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h
@@ -52,6 +52,9 @@ struct ip_conntrack_protocol
 	int (*to_nfattr)(struct sk_buff *skb, struct nfattr *nfa,
 			 const struct ip_conntrack *ct);
 
+	/* convert nfnetlink attributes to protoinfo */
+	int (*from_nfattr)(struct nfattr *tb[], struct ip_conntrack *ct);
+
 	int (*tuple_to_nfattr)(struct sk_buff *skb,
 			       const struct ip_conntrack_tuple *t);
 	int (*nfattr_to_tuple)(struct nfattr *tb[],
-- 
cgit v1.2.3


From afb997c6163b33292d31a09d6aa5cbb03ffa5bf1 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Wed, 12 Oct 2005 15:12:21 -0700
Subject: [NETPOLL]: wrong return for null netpoll_poll_lock()

When netpoll is not being used, the macro that
defines the removed routing netpoll_poll_lock
defines the return as zero, but the real
routine returns a `void *`

Signed-off-by: Ben Dooks <ben-linux@fluff.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 5ade54a78dbb..ca5a8733000f 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -86,7 +86,7 @@ static inline void netpoll_poll_unlock(void *have)
 
 #else
 #define netpoll_rx(a) 0
-#define netpoll_poll_lock(a) 0
+#define netpoll_poll_lock(a) NULL
 #define netpoll_poll_unlock(a)
 #endif
 
-- 
cgit v1.2.3


From c8923c6b852d3a97c1faad0566e38fca330375a7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 13 Oct 2005 14:41:23 -0700
Subject: [NETFILTER]: Fix OOPSes on machines with discontiguous cpu numbering.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Original patch by Harald Welte, with feedback from Herbert Xu
and testing by Sébastien Bernard.

EBTABLES, ARP tables, and IP/IP6 tables all assume that cpus
are numbered linearly.  That is not necessarily true.

This patch fixes that up by calculating the largest possible
cpu number, and allocating enough per-cpu structure space given
that.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cpumask.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index b15826f6e3a2..fe9778301d07 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -392,4 +392,16 @@ extern cpumask_t cpu_present_map;
 #define for_each_online_cpu(cpu)  for_each_cpu_mask((cpu), cpu_online_map)
 #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map)
 
+/* Find the highest possible smp_processor_id() */
+static inline unsigned int highest_possible_processor_id(void)
+{
+	unsigned int cpu, highest = 0;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		highest = cpu;
+
+	return highest;
+}
+
+
 #endif /* __LINUX_CPUMASK_H */
-- 
cgit v1.2.3


From e26148d934762b61133a64b6862f870624ff617d Mon Sep 17 00:00:00 2001
From: Tim Schmielau <tim@physik3.uni-rostock.de>
Date: Fri, 14 Oct 2005 15:59:05 -0700
Subject: [PATCH] Fix copy-and-paste error in BSD accounting

Fix copy and paste error in jiffies_to_AHZ conversion which leads to wrong
BSD accounting information on alpha and ia64 when
CONFIG_BSD_PROCESS_ACCT_V3 is turned on.

Also update comment to match reorganised header files.

Signed-off-by: Tim Schmielau <tim@physik3.uni-rostock.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/acct.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acct.h b/include/linux/acct.h
index 1993a3691768..19f70462b3be 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -162,13 +162,13 @@ typedef struct acct acct_t;
 #ifdef __KERNEL__
 /*
  * Yet another set of HZ to *HZ helper functions.
- * See <linux/times.h> for the original.
+ * See <linux/jiffies.h> for the original.
  */
 
 static inline u32 jiffies_to_AHZ(unsigned long x)
 {
 #if (TICK_NSEC % (NSEC_PER_SEC / AHZ)) == 0
-	return x / (HZ / USER_HZ);
+	return x / (HZ / AHZ);
 #else
         u64 tmp = (u64)x * TICK_NSEC;
         do_div(tmp, (NSEC_PER_SEC / AHZ));
-- 
cgit v1.2.3


From 688ce17b8599abc548b406c00e4d18ae0dec954f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 16 Oct 2005 00:17:33 -0700
Subject: [PATCH]: highest_possible_processor_id() has to be a macro

	... otherwise, things like alpha and sparc64 break and break
badly.  They define cpu_possible_map to something else in smp.h
*AFTER* having included cpumask.h.

	If that puppy is a macro, expansion will happen at the actual
caller, when we'd already seen #define cpu_possible_map ... and we will
get the right thing used.

	As an inline helper it will be tokenized before we get to that
define and that's it; no matter what we define later, it won't affect
anything.  We get modules with dependency on cpu_possible_map instead
of the right symbol (phys_cpu_present_map in case of sparc64), or outright
link errors if they are built-in.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cpumask.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index fe9778301d07..9bdba8169b41 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -393,15 +393,13 @@ extern cpumask_t cpu_present_map;
 #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map)
 
 /* Find the highest possible smp_processor_id() */
-static inline unsigned int highest_possible_processor_id(void)
-{
-	unsigned int cpu, highest = 0;
-
-	for_each_cpu_mask(cpu, cpu_possible_map)
-		highest = cpu;
-
-	return highest;
-}
+#define highest_possible_processor_id() \
+({ \
+	unsigned int cpu, highest = 0; \
+	for_each_cpu_mask(cpu, cpu_possible_map) \
+		highest = cpu; \
+	highest; \
+})
 
 
 #endif /* __LINUX_CPUMASK_H */
-- 
cgit v1.2.3


From b24d18aa743dad0c42918157c5d717686269d3a8 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 16 Oct 2005 20:29:20 -0700
Subject: [PATCH] list: add missing rcu_dereference on first element

It seems that all the list_*_rcu primitives are missing a memory barrier
on the very first dereference.  For example,

#define list_for_each_rcu(pos, head) \
	for (pos = (head)->next; prefetch(pos->next), pos != (head); \
		pos = rcu_dereference(pos->next))

It will go something like:

	pos = (head)->next

	prefetch(pos->next)

	pos != (head)

	do stuff

We're missing a barrier here.

	pos = rcu_dereference(pos->next)

		fetch pos->next

		barrier given by rcu_dereference(pos->next)

		store pos

Without the missing barrier, the pos->next value may turn out to be stale.
In fact, if "do stuff" were also dereferencing pos and relying on
list_for_each_rcu to provide the barrier then it may also break.

So here is a patch to make sure that we have a barrier for the first
element in the list.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/list.h | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index e6ec59682274..084971f333fe 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -442,12 +442,14 @@ static inline void list_splice_init(struct list_head *list,
  * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_rcu(pos, head) \
-	for (pos = (head)->next; prefetch(pos->next), pos != (head); \
-        	pos = rcu_dereference(pos->next))
+	for (pos = (head)->next; \
+		prefetch(rcu_dereference(pos)->next), pos != (head); \
+        	pos = pos->next)
 
 #define __list_for_each_rcu(pos, head) \
-	for (pos = (head)->next; pos != (head); \
-        	pos = rcu_dereference(pos->next))
+	for (pos = (head)->next; \
+		rcu_dereference(pos) != (head); \
+        	pos = pos->next)
 
 /**
  * list_for_each_safe_rcu	-	iterate over an rcu-protected list safe
@@ -461,8 +463,9 @@ static inline void list_splice_init(struct list_head *list,
  * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_safe_rcu(pos, n, head) \
-	for (pos = (head)->next, n = pos->next; pos != (head); \
-		pos = rcu_dereference(n), n = pos->next)
+	for (pos = (head)->next; \
+		n = rcu_dereference(pos)->next, pos != (head); \
+		pos = n)
 
 /**
  * list_for_each_entry_rcu	-	iterate over rcu list of given type
@@ -474,11 +477,11 @@ static inline void list_splice_init(struct list_head *list,
  * the _rcu list-mutation primitives such as list_add_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
  */
-#define list_for_each_entry_rcu(pos, head, member)			\
-	for (pos = list_entry((head)->next, typeof(*pos), member);	\
-	     prefetch(pos->member.next), &pos->member != (head); 	\
-	     pos = rcu_dereference(list_entry(pos->member.next, 	\
-					typeof(*pos), member)))
+#define list_for_each_entry_rcu(pos, head, member) \
+	for (pos = list_entry((head)->next, typeof(*pos), member); \
+		prefetch(rcu_dereference(pos)->member.next), \
+			&pos->member != (head); \
+		pos = list_entry(pos->member.next, typeof(*pos), member))
 
 
 /**
@@ -492,8 +495,9 @@ static inline void list_splice_init(struct list_head *list,
  * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_continue_rcu(pos, head) \
-	for ((pos) = (pos)->next; prefetch((pos)->next), (pos) != (head); \
-        	(pos) = rcu_dereference((pos)->next))
+	for ((pos) = (pos)->next; \
+		prefetch(rcu_dereference((pos))->next), (pos) != (head); \
+        	(pos) = (pos)->next)
 
 /*
  * Double linked lists with a single pointer list head.
@@ -696,8 +700,9 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 	     pos = n)
 
 #define hlist_for_each_rcu(pos, head) \
-	for ((pos) = (head)->first; pos && ({ prefetch((pos)->next); 1; }); \
-		(pos) = rcu_dereference((pos)->next))
+	for ((pos) = (head)->first; \
+		rcu_dereference((pos)) && ({ prefetch((pos)->next); 1; }); \
+		(pos) = (pos)->next)
 
 /**
  * hlist_for_each_entry	- iterate over list of given type
@@ -762,9 +767,9 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
  */
 #define hlist_for_each_entry_rcu(tpos, pos, head, member)		 \
 	for (pos = (head)->first;					 \
-	     pos && ({ prefetch(pos->next); 1;}) &&			 \
+	     rcu_dereference(pos) && ({ prefetch(pos->next); 1;}) &&	 \
 		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
-	     pos = rcu_dereference(pos->next))
+	     pos = pos->next)
 
 #else
 #warning "don't include kernel headers in userspace"
-- 
cgit v1.2.3


From 5ee832dbc6770135ec8d63296af0a4374557bb79 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 17 Oct 2005 20:01:21 +0200
Subject: [PATCH] rcu: keep rcu callback event counter

This makes call_rcu() keep track of how many events there are on the RCU
list, and cause a reschedule event when the list gets too long.

This helps keep RCU event lists down.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/rcupdate.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 4e65eb44adfd..70191a5a148f 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -94,6 +94,7 @@ struct rcu_data {
 	long  	       	batch;           /* Batch # for current RCU batch */
 	struct rcu_head *nxtlist;
 	struct rcu_head **nxttail;
+	long            count; /* # of queued items */
 	struct rcu_head *curlist;
 	struct rcu_head **curtail;
 	struct rcu_head *donelist;
-- 
cgit v1.2.3


From 4faa5285283fad081443e3612ca426a311bb6c7e Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Mon, 17 Oct 2005 16:43:33 -0700
Subject: [PATCH] aio: revert lock_kiocb()

lock_kiocb() was introduced to serialize retrying and cancellation.  In the
process of doing so it tried to sleep waiting for KIF_LOCKED while holding
the ctx_lock spinlock.  Recent fixes have ensured that multiple concurrent
retries won't be attempted for a given iocb.  Cancel has other problems and
has no significant in-tree users that have been complaining about it.  So
for the immediate future we'll revert sleeping with the lock held and will
address proper cancellation and retry serialization in the future.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/aio.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 60def658b246..0decf66117c1 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -24,7 +24,12 @@ struct kioctx;
 #define KIOCB_SYNC_KEY		(~0U)
 
 /* ki_flags bits */
-#define KIF_LOCKED		0
+/*
+ * This may be used for cancel/retry serialization in the future, but
+ * for now it's unused and we probably don't want modules to even
+ * think they can use it.
+ */
+/* #define KIF_LOCKED		0 */
 #define KIF_KICKED		1
 #define KIF_CANCELLED		2
 
-- 
cgit v1.2.3


From 3359b54c8c07338f3a863d1109b42eebccdcf379 Mon Sep 17 00:00:00 2001
From: "Seth, Rohit" <rohit.seth@intel.com>
Date: Tue, 18 Oct 2005 14:15:12 -0700
Subject: [PATCH] Handle spurious page fault for hugetlb region

The hugetlb pages are currently pre-faulted.  At the time of mmap of
hugepages, we populate the new PTEs.  It is possible that HW has already
cached some of the unused PTEs internally.  These stale entries never
get a chance to be purged in existing control flow.

This patch extends the check in page fault code for hugepages.  Check if
a faulted address falls with in size for the hugetlb file backing it.
We return VM_FAULT_MINOR for these cases (assuming that the arch
specific page-faulting code purges the stale entry for the archs that
need it).

Signed-off-by: Rohit Seth <rohit.seth@intel.com>

[ This is apparently arguably an ia64 port bug. But the code won't
  hurt, and for now it fixes a real problem on some ia64 machines ]

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hugetlb.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e670b0d13fe0..42cb7d70f9ac 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -155,11 +155,24 @@ static inline void set_file_hugepages(struct file *file)
 {
 	file->f_op = &hugetlbfs_file_operations;
 }
+
+static inline int valid_hugetlb_file_off(struct vm_area_struct *vma, 
+					  unsigned long address) 
+{
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	loff_t file_off = address - vma->vm_start;
+	
+	file_off += (vma->vm_pgoff << PAGE_SHIFT);
+	
+	return (file_off < inode->i_size);
+}
+
 #else /* !CONFIG_HUGETLBFS */
 
 #define is_file_hugepages(file)		0
 #define set_file_hugepages(file)	BUG()
 #define hugetlb_zero_setup(size)	ERR_PTR(-ENOSYS)
+#define valid_hugetlb_file_off(vma, address) 	0
 
 #endif /* !CONFIG_HUGETLBFS */
 
-- 
cgit v1.2.3


From 281dd25cdc0d6903929b79183816d151ea626341 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Wed, 19 Oct 2005 15:52:18 -0700
Subject: [PATCH] swiotlb: make sure initial DMA allocations really are in DMA
 memory

This introduces a limit parameter to the core bootmem allocator; The new
parameter indicates that physical memory allocated by the bootmem
allocator should be within the requested limit.

We also introduce alloc_bootmem_low_pages_limit, alloc_bootmem_node_limit,
alloc_bootmem_low_pages_node_limit apis, but alloc_bootmem_low_pages_limit
is the only api used for swiotlb.

The existing alloc_bootmem_low_pages() api could instead have been
changed and made to pass right limit to the core allocator.  But that
would make the patch more intrusive for 2.6.14, as other arches use
alloc_bootmem_low_pages().  We may be done that post 2.6.14 as a
cleanup.

With this, swiotlb gets memory within 4G for both x86_64 and ia64
arches.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Ravikiran G Thirumalai <kiran@scalex86.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 82bd8842d11c..3b03b0b868dd 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -43,7 +43,7 @@ typedef struct bootmem_data {
 extern unsigned long __init bootmem_bootmap_pages (unsigned long);
 extern unsigned long __init init_bootmem (unsigned long addr, unsigned long memend);
 extern void __init free_bootmem (unsigned long addr, unsigned long size);
-extern void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal);
+extern void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, unsigned long limit);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
@@ -54,6 +54,16 @@ extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
 	__alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages(x) \
 	__alloc_bootmem((x), PAGE_SIZE, 0)
+
+#define alloc_bootmem_limit(x, limit)						\
+	__alloc_bootmem_limit((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS), (limit))
+#define alloc_bootmem_low_limit(x, limit)			\
+	__alloc_bootmem_limit((x), SMP_CACHE_BYTES, 0, (limit))
+#define alloc_bootmem_pages_limit(x, limit)					\
+	__alloc_bootmem_limit((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS), (limit))
+#define alloc_bootmem_low_pages_limit(x, limit)		\
+	__alloc_bootmem_limit((x), PAGE_SIZE, 0, (limit))
+
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 extern unsigned long __init free_all_bootmem (void);
 
@@ -61,7 +71,7 @@ extern unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long f
 extern void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size);
 extern void __init free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size);
 extern unsigned long __init free_all_bootmem_node (pg_data_t *pgdat);
-extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
+extern void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem_node(pgdat, x) \
 	__alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
@@ -69,6 +79,14 @@ extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size,
 	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages_node(pgdat, x) \
 	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0)
+
+#define alloc_bootmem_node_limit(pgdat, x, limit)				\
+	__alloc_bootmem_node_limit((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS), (limit))
+#define alloc_bootmem_pages_node_limit(pgdat, x, limit)				\
+	__alloc_bootmem_node_limit((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS), (limit))
+#define alloc_bootmem_low_pages_node_limit(pgdat, x, limit)		\
+	__alloc_bootmem_node_limit((pgdat), (x), PAGE_SIZE, 0, (limit))
+
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
 #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
@@ -105,5 +123,15 @@ extern void *__init alloc_large_system_hash(const char *tablename,
 #endif
 extern int __initdata hashdist;		/* Distribute hashes across NUMA nodes? */
 
+static inline void *__alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
+{
+	return __alloc_bootmem_limit(size, align, goal, 0);
+}
+
+static inline void *__alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align,
+				     unsigned long goal)
+{
+	return __alloc_bootmem_node_limit(pgdat, size, align, goal, 0);
+}
 
 #endif /* _LINUX_BOOTMEM_H */
-- 
cgit v1.2.3


From ac9b9c667c2e1194e22ebe0a441ae1c37aaa9b90 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Thu, 20 Oct 2005 16:24:28 +0100
Subject: [PATCH] Fix handling spurious page fault for hugetlb region

This reverts commit 3359b54c8c07338f3a863d1109b42eebccdcf379 and
replaces it with a cleaner version that is purely based on page table
operations, so that the synchronization between inode size and hugetlb
mappings becomes moot.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hugetlb.h | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 42cb7d70f9ac..d664330d900e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -25,6 +25,8 @@ int is_hugepage_mem_enough(size_t);
 unsigned long hugetlb_total_pages(void);
 struct page *alloc_huge_page(void);
 void free_huge_page(struct page *);
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, int write_access);
 
 extern unsigned long max_huge_pages;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -99,6 +101,7 @@ static inline unsigned long hugetlb_total_pages(void)
 						do { } while (0)
 #define alloc_huge_page()			({ NULL; })
 #define free_huge_page(p)			({ (void)(p); BUG(); })
+#define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
 
 #ifndef HPAGE_MASK
 #define HPAGE_MASK	0		/* Keep the compiler happy */
@@ -155,24 +158,11 @@ static inline void set_file_hugepages(struct file *file)
 {
 	file->f_op = &hugetlbfs_file_operations;
 }
-
-static inline int valid_hugetlb_file_off(struct vm_area_struct *vma, 
-					  unsigned long address) 
-{
-	struct inode *inode = vma->vm_file->f_dentry->d_inode;
-	loff_t file_off = address - vma->vm_start;
-	
-	file_off += (vma->vm_pgoff << PAGE_SHIFT);
-	
-	return (file_off < inode->i_size);
-}
-
 #else /* !CONFIG_HUGETLBFS */
 
 #define is_file_hugepages(file)		0
 #define set_file_hugepages(file)	BUG()
 #define hugetlb_zero_setup(size)	ERR_PTR(-ENOSYS)
-#define valid_hugetlb_file_off(vma, address) 	0
 
 #endif /* !CONFIG_HUGETLBFS */
 
-- 
cgit v1.2.3