From dd7f0b80926befc8c70a873b5b0c0c7b5fd1e7b9 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Wed, 22 Jun 2005 12:38:33 -0700 Subject: [NETFILTER]: Fix "iptables -D" rule deletion with ipt_CLUSTERIP target. The patch just changes the order of structure members. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ipt_CLUSTERIP.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h index baa83e757156..d9bceedfb3dc 100644 --- a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h +++ b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h @@ -18,7 +18,6 @@ struct clusterip_config; struct ipt_clusterip_tgt_info { u_int32_t flags; - struct clusterip_config *config; /* only relevant for new ones */ u_int8_t clustermac[6]; @@ -27,6 +26,8 @@ struct ipt_clusterip_tgt_info { u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; enum clusterip_hashmode hash_mode; u_int32_t hash_initval; + + struct clusterip_config *config; }; #endif /*_IPT_CLUSTERIP_H_target*/ -- cgit v1.2.3 From 6ca4f65e6b390d09e1de7280cf9fd4f5d8e4b48b Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:04:55 -0700 Subject: [NETPOLL]: Set poll_owner to -1 before unlocking in netpoll_poll_unlock() This trivial patch moves the assignment of poll_owner to -1 inside of the lock. This fixes a potential SMP race in the code. Signed-off-by: Jeff Moyer Signed-off-by: David S. Miller --- include/linux/netpoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index c0d8b90c5202..449a4fde6587 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -53,8 +53,8 @@ static inline void netpoll_poll_lock(struct net_device *dev) static inline void netpoll_poll_unlock(struct net_device *dev) { if (dev->np) { - spin_unlock(&dev->np->poll_lock); dev->np->poll_owner = -1; + spin_unlock(&dev->np->poll_lock); } } -- cgit v1.2.3 From 115c1d6e61b70851d9a363328c3b8d4c2559a1d3 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:05:31 -0700 Subject: [NETPOLL]: Introduce a netpoll_info struct This patch introduces a netpoll_info structure, which the struct net_device will now point to instead of pointing to a struct netpoll. The reason for this is two-fold: 1) fields such as the rx_flags, poll_owner, and poll_lock should be maintained per net_device, not per netpoll; and 2) this is a first step in providing support for multiple netpoll clients to register against the same net_device. The struct netpoll is now pointed to by the netpoll_info structure. As such, the previous behaviour of the code is preserved. Signed-off-by: Jeff Moyer Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 ++-- include/linux/netpoll.h | 25 +++++++++++++++++-------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ba5d1236aa17..d6afd440cf7b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -41,7 +41,7 @@ struct divert_blk; struct vlan_group; struct ethtool_ops; -struct netpoll; +struct netpoll_info; /* source back-compat hooks */ #define SET_ETHTOOL_OPS(netdev,ops) \ ( (netdev)->ethtool_ops = (ops) ) @@ -468,7 +468,7 @@ struct net_device unsigned char *haddr); int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); #ifdef CONFIG_NETPOLL - struct netpoll *np; + struct netpoll_info *npinfo; #endif #ifdef CONFIG_NET_POLL_CONTROLLER void (*poll_controller)(struct net_device *dev); diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 449a4fde6587..388cd91bc7a6 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -16,14 +16,18 @@ struct netpoll; struct netpoll { struct net_device *dev; char dev_name[16], *name; - int rx_flags; void (*rx_hook)(struct netpoll *, int, char *, int); void (*drop)(struct sk_buff *skb); u32 local_ip, remote_ip; u16 local_port, remote_port; unsigned char local_mac[6], remote_mac[6]; +}; + +struct netpoll_info { spinlock_t poll_lock; int poll_owner; + int rx_flags; + struct netpoll *np; }; void netpoll_poll(struct netpoll *np); @@ -39,22 +43,27 @@ void netpoll_queue(struct sk_buff *skb); #ifdef CONFIG_NETPOLL static inline int netpoll_rx(struct sk_buff *skb) { - return skb->dev->np && skb->dev->np->rx_flags && __netpoll_rx(skb); + struct netpoll_info *npinfo = skb->dev->npinfo; + + if (!npinfo || !npinfo->rx_flags) + return 0; + + return npinfo->np && __netpoll_rx(skb); } static inline void netpoll_poll_lock(struct net_device *dev) { - if (dev->np) { - spin_lock(&dev->np->poll_lock); - dev->np->poll_owner = smp_processor_id(); + if (dev->npinfo) { + spin_lock(&dev->npinfo->poll_lock); + dev->npinfo->poll_owner = smp_processor_id(); } } static inline void netpoll_poll_unlock(struct net_device *dev) { - if (dev->np) { - dev->np->poll_owner = -1; - spin_unlock(&dev->np->poll_lock); + if (dev->npinfo) { + dev->npinfo->poll_owner = -1; + spin_unlock(&dev->npinfo->poll_lock); } } -- cgit v1.2.3 From fbeec2e1552949002065435c9829dc244ad85407 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:05:59 -0700 Subject: [NETPOLL]: allow multiple netpoll_clients to register against one interface This patch provides support for registering multiple netpoll clients to the same network device. Only one of these clients may register an rx_hook, however. In practice, this restriction has not been problematic. It is worth mentioning, though, that the current design can be easily extended to allow for the registration of multiple rx_hooks. The basic idea of the patch is that the rx_np pointer in the netpoll_info structure points to the struct netpoll that has rx_hook filled in. Aside from this one case, there is no need for a pointer from the struct net_device to an individual struct netpoll. A lock is introduced to protect the setting and clearing of the rx_np pointer. The pointer will only be cleared upon netpoll client module removal, and the lock should be uncontested. Signed-off-by: Jeff Moyer Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 388cd91bc7a6..bcd0ac33f592 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -27,7 +27,8 @@ struct netpoll_info { spinlock_t poll_lock; int poll_owner; int rx_flags; - struct netpoll *np; + spinlock_t rx_lock; + struct netpoll *rx_np; /* netpoll that registered an rx_hook */ }; void netpoll_poll(struct netpoll *np); @@ -44,11 +45,19 @@ void netpoll_queue(struct sk_buff *skb); static inline int netpoll_rx(struct sk_buff *skb) { struct netpoll_info *npinfo = skb->dev->npinfo; + unsigned long flags; + int ret = 0; - if (!npinfo || !npinfo->rx_flags) + if (!npinfo || (!npinfo->rx_np && !npinfo->rx_flags)) return 0; - return npinfo->np && __netpoll_rx(skb); + spin_lock_irqsave(&npinfo->rx_lock, flags); + /* check rx_flags again with the lock held */ + if (npinfo->rx_flags && __netpoll_rx(skb)) + ret = 1; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + + return ret; } static inline void netpoll_poll_lock(struct net_device *dev) -- cgit v1.2.3 From cb65d506c34c86df5bcef939ce5a8666a451bd8b Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Wed, 22 Jun 2005 22:15:01 -0700 Subject: [X25]: Selective sub-address matching with call user data. From: Shaun Pereira This is the first (independent of the second) patch of two that I am working on with x25 on linux (tested with xot on a cisco router). Details are as follows. Current state of module: A server using the current implementation (2.6.11.7) of the x25 module will accept a call request/ incoming call packet at the listening x.25 address, from all callers to that address, as long as NO call user data is present in the packet header. If the server needs to choose to accept a particular call request/ incoming call packet arriving at its listening x25 address, then the kernel has to allow a match of call user data present in the call request packet with its own. This is required when multiple servers listen at the same x25 address and device interface. The kernel currently matches ALL call user data, if present. Current Changes: This patch is a follow up to the patch submitted previously by Andrew Hendry, and allows the user to selectively control the number of octets of call user data in the call request packet that the kernel will match. By default no call user data is matched, even if call user data is present. To allow call user data matching, a cudmatchlength > 0 has to be passed into the kernel, after which the passed number of octets will be matched. Otherwise the kernel behaves exactly as in the original implementation. This patch also ensures that, as is normally the case, no call user data will be present in the Call accepted / call connected packet sent back to the caller. Future Changes on next patch: There are cases, however, when call user data may be present in the call accepted packet. According to the X.25 recommendation (ITU-T 10/96) section 5.2.3.2 call user data may be present in the call accepted packet provided the fast select facility is used. My next patch will include this fast select facility and the ability to send up to 128 octets of call user data in the call accepted packet provided the fast select facility is used. I am currently testing this, again with xot on linux and cisco. Signed-off-by: Shaun Pereira (With a fix from Alexey Dobriyan ) Signed-off-by: Andrew Morton Signed-off-by: David S. 
Miller --- include/linux/x25.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/x25.h b/include/linux/x25.h index 7531cfed5885..6f43b3d20248 100644 --- a/include/linux/x25.h +++ b/include/linux/x25.h @@ -4,6 +4,8 @@ * History * mar/20/00 Daniela Squassoni Disabling/enabling of facilities * negotiation. + * apr/02/05 Shaun Pereira Selective sub address matching with + * call user data */ #ifndef X25_KERNEL_H @@ -16,6 +18,7 @@ #define SIOCX25GCALLUSERDATA (SIOCPROTOPRIVATE + 4) #define SIOCX25SCALLUSERDATA (SIOCPROTOPRIVATE + 5) #define SIOCX25GCAUSEDIAG (SIOCPROTOPRIVATE + 6) +#define SIOCX25SCUDMATCHLEN (SIOCPROTOPRIVATE + 7) /* * Values for {get,set}sockopt. @@ -109,4 +112,11 @@ struct x25_causediag { unsigned char diagnostic; }; +/* + * Further optional call user data match length selection + */ +struct x25_subaddr { + unsigned int cudmatchlength; +}; + #endif -- cgit v1.2.3 From ebc3f64b864fc16a594c2e63bf55a55c7d42084b Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Wed, 22 Jun 2005 22:16:17 -0700 Subject: [X25]: Fast select with no restriction on response This patch is a follow up to patch 1 regarding "Selective Sub Address matching with call user data". It allows use of the Fast-Select-Acceptance optional user facility for X.25. This patch just implements fast select with no restriction on response (NRR). What this means (according to ITU-T Recommendation 10/96 section 6.16) is that if, in an incoming call packet, the relevant facility bits are set for fast-select-NRR, then the called DTE can issue a direct response to the incoming packet using a call-accepted packet that contains call-user-data. This patch allows such a response. The called DTE can also respond with a clear-request packet that contains call-user-data. However, this feature is currently not implemented by the patch. How is Fast Select Acceptance used? By default, the system does not allow fast select acceptance (as before). To enable a response to fast select acceptance, after a listen socket is created and bound as follows socket(AF_X25, SOCK_SEQPACKET, 0); bind(call_soc, (struct sockaddr *)&locl_addr, sizeof(locl_addr)); but before a listen system call is made, the following ioctl should be used: ioctl(call_soc,SIOCX25CALLACCPTAPPRV); Now the listen system call can be made: listen(call_soc, 4); After this, an incoming-call packet will be accepted, but no call-accepted packet will be sent back until the following system call is made on the socket that accepts the call: ioctl(vc_soc,SIOCX25SENDCALLACCPT); The network (or cisco xot router used for testing here) will allow the application server's call-user-data in the call-accepted packet, provided the call-request was made with Fast-select NRR. Signed-off-by: Shaun Pereira Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/x25.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/x25.h b/include/linux/x25.h index 6f43b3d20248..16d44931afa0 100644 --- a/include/linux/x25.h +++ b/include/linux/x25.h @@ -19,6 +19,8 @@ #define SIOCX25SCALLUSERDATA (SIOCPROTOPRIVATE + 5) #define SIOCX25GCAUSEDIAG (SIOCPROTOPRIVATE + 6) #define SIOCX25SCUDMATCHLEN (SIOCPROTOPRIVATE + 7) +#define SIOCX25CALLACCPTAPPRV (SIOCPROTOPRIVATE + 8) +#define SIOCX25SENDCALLACCPT (SIOCPROTOPRIVATE + 9) /* * Values for {get,set}sockopt. 
-- cgit v1.2.3 From 408fde81c1bff15c875a3618481e93a01dcc79ea Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:37 -0700 Subject: [PATCH] remove non-DISCONTIG use of pgdat->node_mem_map This patch effectively eliminates direct use of pgdat->node_mem_map outside of the DISCONTIG code. On a flat memory system, these fields aren't currently used, nor are they on a sparsemem system. There was also a node_mem_map(nid) macro on many architectures. Its use, along with the use of ->node_mem_map itself, was not consistent. It has been removed in favor of two new, more explicit, arch-independent macros: pgdat_page_nr(pgdat, pagenr) nid_page_nr(nid, pagenr) I called them "pgdat" and "nid" because we overload the term "node" to mean "NUMA node", "DISCONTIG node" or "pg_data_t" in very confusing ways. I believe the newer names are much clearer. These macros can be overridden in the sparsemem case with a theoretically slower operation using node_start_pfn and pfn_to_page() instead. We could make this the only behavior if people want, but I don't want to change too much at once. One thing at a time. This patch removes more code than it adds. Compile tested on alpha, alpha discontig, arm, arm-discontig, i386, i386 generic, NUMAQ, Summit, ppc64, ppc64 discontig, and x86_64. Full list here: http://sr71.net/patches/2.6.12/2.6.12-rc1-mhp2/configs/ Boot tested on NUMAQ, x86 SMP and ppc64 power4/5 LPARs. Signed-off-by: Dave Hansen Signed-off-by: Martin J. Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4733d35d8223..b79633d3a97b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -284,6 +284,8 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) +#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) +#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) extern struct pglist_data *pgdat_list; -- cgit v1.2.3 From 6f167ec721108c9282d54424516a12c805e3c306 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:39 -0700 Subject: [PATCH] sparsemem base: simple NUMA remap space allocator Introduce a simple allocator for the NUMA remap space. This space is very scarce, used for structures which are best allocated node local. This mechanism is also used on non-NUMA ia64 systems with a vmem_map to keep the pgdat->node_mem_map initialized in a consistent place for all architectures. Issues: o alloc_remap takes a node_id where we might expect a pgdat; this was intended to allow us to allocate the pgdats themselves using this mechanism, which we do not yet do. Could have alloc_remap_node() and alloc_remap_nid() for this purpose. 
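As a usage sketch (not part of the patch): a caller that wants node-local storage would try the remap space first and fall back to bootmem, roughly as below. The helper name and the bootmem fallback are illustrative assumptions; only alloc_remap() itself and the alloc_bootmem_node() macro come from the headers in this series.

	/*
	 * Hypothetical caller: prefer the scarce node-local remap space,
	 * fall back to bootmem when the architecture does not define
	 * CONFIG_HAVE_ARCH_ALLOC_REMAP (alloc_remap() then returns NULL).
	 */
	static void * __init alloc_node_local_sketch(int nid, unsigned long size)
	{
		void *ptr = alloc_remap(nid, size);

		if (!ptr)
			ptr = alloc_bootmem_node(NODE_DATA(nid), size);
		return ptr;
	}
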
Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 0dd8ca1a3d5a..500f451ce0c0 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -67,6 +67,15 @@ extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP +extern void *alloc_remap(int nid, unsigned long size); +#else +static inline void *alloc_remap(int nid, unsigned long size) +{ + return NULL; +} +#endif + extern unsigned long __initdata nr_kernel_pages; extern unsigned long __initdata nr_all_pages; -- cgit v1.2.3 From 348f8b6c4837a07304d2f72b11ce8d96588065e0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:40 -0700 Subject: [PATCH] sparsemem base: reorganize page->flags bit operations Generify the value fields in the page_flags. The aim is to allow the location and size of these fields to be varied. Additionally we want to move away from fixed allocations per field whilst still enforcing the overall bit utilisation limits. We rely on the compiler to spot and optimise the accessor functions. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 53 +++++++++++++++++++++++++++++++++++++++++--------- include/linux/mmzone.h | 19 +++++++----------- 2 files changed, 51 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1813b162b0a8..57b2ead51dba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -395,19 +395,41 @@ static inline void put_page(struct page *page) /* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. - * We'll have up to (MAX_NUMNODES * MAX_NR_ZONES) zones total, - * so we use (MAX_NODES_SHIFT + MAX_ZONES_SHIFT) here to get enough bits. */ -#define NODEZONE_SHIFT (sizeof(page_flags_t)*8 - MAX_NODES_SHIFT - MAX_ZONES_SHIFT) + +/* Page flags: | NODE | ZONE | ... | FLAGS | */ +#define NODES_PGOFF ((sizeof(page_flags_t)*8) - NODES_SHIFT) +#define ZONES_PGOFF (NODES_PGOFF - ZONES_SHIFT) + +/* + * Define the bit shifts to access each section. For non-existant + * sections we define the shift as 0; that plus a 0 mask ensures + * the compiler will optimise away reference to them. + */ +#define NODES_PGSHIFT (NODES_PGOFF * (NODES_SHIFT != 0)) +#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_SHIFT != 0)) + +/* NODE:ZONE is used to lookup the zone from a page. 
*/ +#define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#define ZONETABLE_PGSHIFT ZONES_PGSHIFT + +#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#endif + #define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) +#define ZONES_MASK ((1UL << ZONES_SHIFT) - 1) +#define NODES_MASK ((1UL << NODES_SHIFT) - 1) +#define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) + static inline unsigned long page_zonenum(struct page *page) { - return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT)); + return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } static inline unsigned long page_to_nid(struct page *page) { - return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT)); + return (page->flags >> NODES_PGSHIFT) & NODES_MASK; } struct zone; @@ -415,13 +437,26 @@ extern struct zone *zone_table[]; static inline struct zone *page_zone(struct page *page) { - return zone_table[page->flags >> NODEZONE_SHIFT]; + return zone_table[(page->flags >> ZONETABLE_PGSHIFT) & + ZONETABLE_MASK]; +} + +static inline void set_page_zone(struct page *page, unsigned long zone) +{ + page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); + page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; +} +static inline void set_page_node(struct page *page, unsigned long node) +{ + page->flags &= ~(NODES_MASK << NODES_PGSHIFT); + page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } -static inline void set_page_zone(struct page *page, unsigned long nodezone_num) +static inline void set_page_links(struct page *page, unsigned long zone, + unsigned long node) { - page->flags &= ~(~0UL << NODEZONE_SHIFT); - page->flags |= nodezone_num << NODEZONE_SHIFT; + set_page_zone(page, zone); + set_page_node(page, node); } #ifndef CONFIG_DISCONTIGMEM diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b79633d3a97b..39e912708e2a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -414,30 +414,25 @@ extern struct pglist_data contig_page_data; #include +#endif /* !CONFIG_DISCONTIGMEM */ + #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* * with 32 bit page->flags field, we reserve 8 bits for node/zone info. * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. */ -#define MAX_NODES_SHIFT 6 +#define FLAGS_RESERVED 8 + #elif BITS_PER_LONG == 64 /* * with 64 bit flags field, there's plenty of room. */ -#define MAX_NODES_SHIFT 10 -#endif +#define FLAGS_RESERVED 32 -#endif /* !CONFIG_DISCONTIGMEM */ - -#if NODES_SHIFT > MAX_NODES_SHIFT -#error NODES_SHIFT > MAX_NODES_SHIFT -#endif +#else -/* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */ -#define MAX_ZONES_SHIFT 2 +#error BITS_PER_LONG not defined -#if ZONES_SHIFT > MAX_ZONES_SHIFT -#error ZONES_SHIFT > MAX_ZONES_SHIFT #endif #endif /* !__ASSEMBLY__ */ -- cgit v1.2.3 From 93b7504e3e6c1d98586854806e51bea329ea3aa9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:47 -0700 Subject: [PATCH] Introduce new Kconfig option for NUMA or DISCONTIG There is some confusion that arose when working on SPARSEMEM patch between what is needed for DISCONTIG vs. NUMA. Multiple pg_data_t's are needed for DISCONTIGMEM or NUMA, independently. All of the current NUMA implementations require an implementation of DISCONTIG. Because of this, quite a lot of code which is really needed for NUMA is actually under DISCONTIG #ifdefs. For SPARSEMEM, we changed some of these #ifdefs to CONFIG_NUMA, but that broke the DISCONTIG=y and NUMA=n case. 
Introducing this new NEED_MULTIPLE_NODES config option allows code that is needed for either NUMA or DISCONTIG to be separated out from code that is specific to DISCONTIG. One great advantage of this approach is that it doesn't require every architecture to be converted over. All of the current implementations should "just work"; only the ones implementing SPARSEMEM will have to be fixed up. The change to free_area_init() makes it work both inside and outside of the new config option. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39e912708e2a..95f4a780ea66 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -402,7 +402,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, /* Returns the number of the current Node. */ #define numa_node_id() (cpu_to_node(raw_smp_processor_id())) -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data contig_page_data; #define NODE_DATA(nid) (&contig_page_data) @@ -410,11 +410,11 @@ extern struct pglist_data contig_page_data; #define MAX_NODES_SHIFT 1 #define pfn_to_nid(pfn) (0) -#else /* CONFIG_DISCONTIGMEM */ +#else /* CONFIG_NEED_MULTIPLE_NODES */ #include -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* -- cgit v1.2.3 From b159d43fbf7eaaac6ecc647f51cf4257332db47b Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:52 -0700 Subject: [PATCH] generify early_pfn_to_nid Provide a default implementation for early_pfn_to_nid returning node 0. Allow architectures to override this with their own implementation out of asm/mmzone.h. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 95f4a780ea66..6ef07de98d69 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -435,6 +435,10 @@ extern struct pglist_data contig_page_data; #endif +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +#define early_pfn_to_nid(nid) (0UL) +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ -- cgit v1.2.3 From d41dee369bff3b9dcb6328d4d822926c28cc2594 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:54 -0700 Subject: [PATCH] sparsemem memory model Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of mem_map[] is needed by discontiguous memory machines (like in the old CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually become a complete replacement. A significant advantage over DISCONTIGMEM is that it's completely separated from CONFIG_NUMA. When producing this patch, it became apparent that NUMA and DISCONTIG are often confused. Another advantage is that sparse doesn't require each NUMA node's ranges to be contiguous. It can handle overlapping ranges between nodes with no problems, where DISCONTIGMEM currently throws away that memory. Sparsemem uses an array to provide different pfn_to_page() translations for each SECTION_SIZE area of physical memory. 
This is what allows the mem_map[] to be chopped up. In order to do quick pfn_to_page() operations, the section number of the page is encoded in page->flags. Part of the sparsemem infrastructure enables sharing of these bits more dynamically (at compile-time) between the page_zone() and sparsemem operations. However, on 32-bit architectures, the number of bits is quite limited, and may require growing the size of the page->flags type in certain conditions. Several things might force this to occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of memory), an increase in the physical address space, or an increase in the number of used page->flags. One thing to note is that, once sparsemem is present, the NUMA node information no longer needs to be stored in the page->flags. It might provide speed increases on certain platforms and will be stored there if there is room. But, if out of room, an alternate (theoretically slower) mechanism is used. This patch introduces CONFIG_FLATMEM. It is used in almost all cases where there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM often have to compile out the same areas of code. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Adrian Bunk Signed-off-by: Yasunori Goto Signed-off-by: Bob Picco Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 92 ++++++++++++++++++++++++++++++++++++++--------- include/linux/mmzone.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/numa.h | 2 +- 3 files changed, 172 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 57b2ead51dba..6eb7f48317f8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -397,40 +397,80 @@ static inline void put_page(struct page *page) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | NODE | ZONE | ... | FLAGS | */ -#define NODES_PGOFF ((sizeof(page_flags_t)*8) - NODES_SHIFT) -#define ZONES_PGOFF (NODES_PGOFF - ZONES_SHIFT) + +/* + * page->flags layout: + * + * There are three possibilities for how page->flags get + * laid out. The first is for the normal case, without + * sparsemem. The second is for sparsemem when there is + * plenty of space for node and section. The last is when + * we have run out of space and have to fall back to an + * alternate (slower) way of determining the node. + * + * No sparsemem: | NODE | ZONE | ... | FLAGS | + * with space for node: | SECTION | NODE | ZONE | ... | FLAGS | + * no space for node: | SECTION | ZONE | ... | FLAGS | + */ +#ifdef CONFIG_SPARSEMEM +#define SECTIONS_WIDTH SECTIONS_SHIFT +#else +#define SECTIONS_WIDTH 0 +#endif + +#define ZONES_WIDTH ZONES_SHIFT + +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED +#define NODES_WIDTH NODES_SHIFT +#else +#define NODES_WIDTH 0 +#endif + +/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */ +#define SECTIONS_PGOFF ((sizeof(page_flags_t)*8) - SECTIONS_WIDTH) +#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) +#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) + +/* + * We are going to use the flags for the page to node mapping if its in + * there. This includes the case where there is no node, so it is implicit. + */ +#define FLAGS_HAS_NODE (NODES_WIDTH > 0 || NODES_SHIFT == 0) + +#ifndef PFN_SECTION_SHIFT +#define PFN_SECTION_SHIFT 0 +#endif /* * Define the bit shifts to access each section. 
For non-existant * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ -#define NODES_PGSHIFT (NODES_PGOFF * (NODES_SHIFT != 0)) -#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_SHIFT != 0)) +#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) +#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) +#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -/* NODE:ZONE is used to lookup the zone from a page. */ +/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */ +#if FLAGS_HAS_NODE #define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#else +#define ZONETABLE_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) +#endif #define ZONETABLE_PGSHIFT ZONES_PGSHIFT -#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED -#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED +#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED #endif -#define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) - -#define ZONES_MASK ((1UL << ZONES_SHIFT) - 1) -#define NODES_MASK ((1UL << NODES_SHIFT) - 1) +#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) +#define NODES_MASK ((1UL << NODES_WIDTH) - 1) +#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) static inline unsigned long page_zonenum(struct page *page) { return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } -static inline unsigned long page_to_nid(struct page *page) -{ - return (page->flags >> NODES_PGSHIFT) & NODES_MASK; -} struct zone; extern struct zone *zone_table[]; @@ -441,6 +481,18 @@ static inline struct zone *page_zone(struct page *page) ZONETABLE_MASK]; } +static inline unsigned long page_to_nid(struct page *page) +{ + if (FLAGS_HAS_NODE) + return (page->flags >> NODES_PGSHIFT) & NODES_MASK; + else + return page_zone(page)->zone_pgdat->node_id; +} +static inline unsigned long page_to_section(struct page *page) +{ + return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; +} + static inline void set_page_zone(struct page *page, unsigned long zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); @@ -451,12 +503,18 @@ static inline void set_page_node(struct page *page, unsigned long node) page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } +static inline void set_page_section(struct page *page, unsigned long section) +{ + page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); + page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; +} static inline void set_page_links(struct page *page, unsigned long zone, - unsigned long node) + unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); + set_page_section(page, pfn_to_section_nr(pfn)); } #ifndef CONFIG_DISCONTIGMEM diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6ef07de98d69..19860d317ec2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -269,7 +269,9 @@ typedef struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; struct zonelist node_zonelists[GFP_ZONETYPES]; int nr_zones; +#ifdef CONFIG_FLAT_NODE_MEM_MAP struct page *node_mem_map; +#endif struct bootmem_data *bdata; unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ @@ -284,7 +286,11 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) +#ifdef 
CONFIG_FLAT_NODE_MEM_MAP #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) +#else +#define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) +#endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) extern struct pglist_data *pgdat_list; @@ -416,6 +422,10 @@ extern struct pglist_data contig_page_data; #endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#ifdef CONFIG_SPARSEMEM +#include +#endif + #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* * with 32 bit page->flags field, we reserve 8 bits for node/zone info. @@ -439,6 +449,92 @@ extern struct pglist_data contig_page_data; #define early_pfn_to_nid(nid) (0UL) #endif +#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) +#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) + +#ifdef CONFIG_SPARSEMEM + +/* + * SECTION_SHIFT #bits space required to store a section # + * + * PA_SECTION_SHIFT physical address to/from section number + * PFN_SECTION_SHIFT pfn to/from section number + */ +#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) + +#define PA_SECTION_SHIFT (SECTION_SIZE_BITS) +#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) + +#define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) + +#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) +#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) + +#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS +#error Allocator MAX_ORDER exceeds SECTION_SIZE +#endif + +struct page; +struct mem_section { + struct page *section_mem_map; +}; + +extern struct mem_section mem_section[NR_MEM_SECTIONS]; + +/* + * Given a kernel address, find the home node of the underlying memory. + */ +#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) + +static inline struct mem_section *__pfn_to_section(unsigned long pfn) +{ + return &mem_section[pfn_to_section_nr(pfn)]; +} + +#define pfn_to_page(pfn) \ +({ \ + unsigned long __pfn = (pfn); \ + __pfn_to_section(__pfn)->section_mem_map + __pfn; \ +}) +#define page_to_pfn(page) \ +({ \ + page - mem_section[page_to_section(page)].section_mem_map; \ +}) + +static inline int pfn_valid(unsigned long pfn) +{ + if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) + return 0; + return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0; +} + +/* + * These are _only_ used during initialisation, therefore they + * can use __initdata ... They could have names to indicate + * this restriction. 
+ */ #ifdef CONFIG_NUMA +#define pfn_to_nid early_pfn_to_nid +#endif + +#define pfn_to_pgdat(pfn) \ +({ \ + NODE_DATA(pfn_to_nid(pfn)); \ +}) + +#define early_pfn_valid(pfn) pfn_valid(pfn) +void sparse_init(void); +#else +#define sparse_init() do {} while (0) +#endif /* CONFIG_SPARSEMEM */ + +#ifndef early_pfn_valid +#define early_pfn_valid(pfn) (1) +#endif + +void memory_present(int nid, unsigned long start, unsigned long end); +unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/include/linux/numa.h b/include/linux/numa.h index bd0c8c4e9a95..f0c539bd3cfc 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -3,7 +3,7 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#ifndef CONFIG_FLATMEM #include #endif -- cgit v1.2.3 From 641c767389b19859a45e6de46d8e18cd935bdb60 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:59 -0700 Subject: [PATCH] sparsemem swiss cheese numa layouts The part of the sparsemem patch which modifies memmap_init_zone() has recently become a problem. It changes behavior so that there is a call to pfn_to_page() for each individual page inside of a node's range: node_start_pfn through node_end_pfn. It used to simply do this once, at the beginning of the node, but having sparsemem's non-contiguous mem_map[]s inside of a node made it necessary to change. Mike Kravetz recently wrote a patch which made the NUMA code accept some new kinds of layouts. The system's memory was laid out like this, with node 0's memory in two pieces: one before and one after node 1's memory: Node 0: +++++ +++++ Node 1: +++++ Previous behavior before Mike's patch was to assign nodes like this: Node 0: 00000 XXXXX Node 1: 11111 Where the 'X' areas were simply thrown away. The new behavior was to make the pg_data_t span node 0 across all of its areas, including areas that are really node 1's: Node 0: 000000000000000 Node 1: 11111 This wastes a little bit of mem_map space, but ends up being OK, and more fully utilizes the system's memory. memmap_init_zone() initializes all of the "struct page"s for node 0, even for the "hole", but those never get used, because there is no pfn_to_page() that resolves to those pages. However, only calling pfn_to_page() once, memmap_init_zone() always uses the pages that were allocated for node0->node_mem_map because: struct page *start = pfn_to_page(start_pfn); // effectively start = &node->node_mem_map[0] for (page = start; page < (start + size); page++) { init_page_here();... page++; } Slow, and wasteful, but generally harmless. But, modify that to call pfn_to_page() for each loop iteration (like sparsemem does): for (pfn = start_pfn; pfn < (start_pfn + size); pfn++) { page = pfn_to_page(pfn); } And you end up trying to initialize node 1's pages too early, along with bogus data from node 0. This patch checks for those weird layouts and declines to touch the pages, making the more frequent pfn_to_page() calls OK to do. 
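The check itself lives in mm/page_alloc.c, outside the include/linux scope of this log; reconstructed from the description above and the early_pfn_in_nid()/early_pfn_valid() macros in the hunks here, the guarded loop presumably looks like this sketch:

	/*
	 * Sketch of the guarded per-pfn init loop (assumed shape of the
	 * mm/page_alloc.c change, not quoted from it). On configurations
	 * where nodes cannot span one another, both guards compile to
	 * constant 1 and the loop degenerates to the plain version above.
	 */
	for (pfn = start_pfn; pfn < start_pfn + size; pfn++) {
		if (!early_pfn_valid(pfn))
			continue;	/* hole: no mem_map behind this pfn */
		if (!early_pfn_in_nid(pfn, nid))
			continue;	/* pfn belongs to another node's range */
		page = pfn_to_page(pfn);
		/* ... initialise the struct page as before ... */
	}
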
Signed-off-by: Dave Hansen Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 19860d317ec2..746b57e3d370 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -528,6 +528,12 @@ void sparse_init(void); #define sparse_init() do {} while (0) #endif /* CONFIG_SPARSEMEM */ +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +#define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) +#else +#define early_pfn_in_nid(pfn, nid) (1) +#endif + #ifndef early_pfn_valid #define early_pfn_valid(pfn) (1) #endif -- cgit v1.2.3 From 29751f6991e845f7d002a6ae520bf996b38c8dcd Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:08:00 -0700 Subject: [PATCH] sparsemem hotplug base Make sparse's initialization be accessible at runtime. This allows sparse mappings to be created after boot in a hotplug situation. This patch is separated from the previous one just to give an indication of how much of the sparse infrastructure is *just* for hotplug memory. The section_mem_map doesn't really store a pointer. It stores something that is convenient to do some math against to get a pointer. It isn't valid to just do *section_mem_map, so I don't think it should be stored as a pointer. There are a couple of things I'd like to store about a section. First of all, the fact that it is !NULL does not mean that it is present. There could be such a combination where section_mem_map *is* NULL, but the math gets you properly to a real mem_map. So, I don't think that check is safe. Since we're storing 32-bit-aligned structures, we have a few bits in the bottom of the pointer to play with. Use one bit to encode whether there's really a mem_map there, and the other one to tell whether there's a valid section there. We need to distinguish between the two because sometimes there's a gap between when a section is discovered to be present and when we can get the mem_map for it. Signed-off-by: Dave Hansen Signed-off-by: Andy Whitcroft Signed-off-by: Jack Steiner Signed-off-by: Bob Picco Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 56 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 746b57e3d370..6c90461ed99f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -476,11 +476,56 @@ extern struct pglist_data contig_page_data; struct page; struct mem_section { - struct page *section_mem_map; + /* + * This is, logically, a pointer to an array of struct + * pages. However, it is stored with some other magic. + * (see sparse.c::sparse_init_one_section()) + * + * Making it a UL at least makes someone do a cast + * before using it wrong. + */ + unsigned long section_mem_map; }; extern struct mem_section mem_section[NR_MEM_SECTIONS]; +static inline struct mem_section *__nr_to_section(unsigned long nr) +{ + return &mem_section[nr]; +} + +/* + * We use the lower bits of the mem_map pointer to store + * a little bit of information. There should be at least + * 3 bits here due to 32-bit alignment. 
+ */ +#define SECTION_MARKED_PRESENT (1UL<<0) +#define SECTION_HAS_MEM_MAP (1UL<<1) +#define SECTION_MAP_LAST_BIT (1UL<<2) +#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) + +static inline struct page *__section_mem_map_addr(struct mem_section *section) +{ + unsigned long map = section->section_mem_map; + map &= SECTION_MAP_MASK; + return (struct page *)map; +} + +static inline int valid_section(struct mem_section *section) +{ + return (section->section_mem_map & SECTION_MARKED_PRESENT); +} + +static inline int section_has_mem_map(struct mem_section *section) +{ + return (section->section_mem_map & SECTION_HAS_MEM_MAP); +} + +static inline int valid_section_nr(unsigned long nr) +{ + return valid_section(__nr_to_section(nr)); +} + /* * Given a kernel address, find the home node of the underlying memory. */ @@ -488,24 +533,25 @@ extern struct mem_section mem_section[NR_MEM_SECTIONS]; static inline struct mem_section *__pfn_to_section(unsigned long pfn) { - return &mem_section[pfn_to_section_nr(pfn)]; + return __nr_to_section(pfn_to_section_nr(pfn)); } #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = (pfn); \ - __pfn_to_section(__pfn)->section_mem_map + __pfn; \ + __section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn; \ }) #define page_to_pfn(page) \ ({ \ - page - mem_section[page_to_section(page)].section_mem_map; \ + page - __section_mem_map_addr(__nr_to_section( \ + page_to_section(page))); \ }) static inline int pfn_valid(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0; + return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); } /* -- cgit v1.2.3 From 1946089a109251655c5438d92c539bd2930e71ea Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:19 -0700 Subject: [PATCH] NUMA aware block device control structure allocation Patch to allocate the control structures for IDE devices on the node of the device itself (for NUMA systems). The patch depends on the Slab API change patch by Manfred and me (in mm) and the pcidev_to_node patch that I posted today. Does some realignment too. Signed-off-by: Justin M. 
Forbes Signed-off-by: Christoph Lameter Signed-off-by: Pravin Shelar Signed-off-by: Shobhit Dayal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 6 +++++- include/linux/genhd.h | 1 + include/linux/ide.h | 2 +- include/linux/mempool.h | 11 ++++++++--- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4a99b76c5a33..235c3414d268 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -396,6 +396,7 @@ struct request_queue */ unsigned int sg_timeout; unsigned int sg_reserved_size; + int node; struct list_head drain_list; @@ -615,6 +616,8 @@ static inline void blkdev_dequeue_request(struct request *req) /* * Access functions for manipulating queue properties */ +extern request_queue_t *blk_init_queue_node(request_fn_proc *rfn, + spinlock_t *lock, int node_id); extern request_queue_t *blk_init_queue(request_fn_proc *, spinlock_t *); extern void blk_cleanup_queue(request_queue_t *); extern void blk_queue_make_request(request_queue_t *, make_request_fn *); @@ -646,7 +649,8 @@ extern void blk_wait_queue_drained(request_queue_t *, int); extern void blk_finish_queue_drain(request_queue_t *); int blk_get_queue(request_queue_t *); -request_queue_t *blk_alloc_queue(int); +request_queue_t *blk_alloc_queue(int gfp_mask); +request_queue_t *blk_alloc_queue_node(int,int); #define blk_put_queue(q) blk_cleanup_queue((q)) /* diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 47dedaf971d6..af26dc718ef6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -403,6 +403,7 @@ extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); extern void add_partition(struct gendisk *, int, sector_t, sector_t); extern void delete_partition(struct gendisk *, int); +extern struct gendisk *alloc_disk_node(int minors, int node_id); extern struct gendisk *alloc_disk(int minors); extern struct kobject *get_disk(struct gendisk *disk); extern void put_disk(struct gendisk *disk); diff --git a/include/linux/ide.h b/include/linux/ide.h index 336d6e509f59..92129078d4f3 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -917,7 +917,7 @@ typedef struct hwif_s { unsigned dma; void (*led_act)(void *data, int rw); -} ide_hwif_t; +} ____cacheline_maxaligned_in_smp ide_hwif_t; /* * internal ide interrupt handler type diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 4a36edf1c974..796220ce47cc 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -20,9 +20,14 @@ typedef struct mempool_s { mempool_free_t *free; wait_queue_head_t wait; } mempool_t; -extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); -extern int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask); + +extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, int nid); + +extern int mempool_resize(mempool_t *pool, int new_min_nr, + unsigned int __nocast gfp_mask); extern void mempool_destroy(mempool_t *pool); extern void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask); extern void mempool_free(void *element, mempool_t *pool); -- cgit v1.2.3 From fa72b903f75e4f0f0b2c2feed093005167da4023 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Jun 2005 00:08:49 
-0700 Subject: [PATCH] blk: remove blk_queue_tag->real_max_depth optimization blk_queue_tag->real_max_depth was used to optimize out unnecessary allocations/frees on tag resize. However, the whole thing was very broken: tag_map was never allocated to real_max_depth, resulting in access beyond the end of the map, and bits in [max_depth..real_max_depth] were set when initializing a map and copied when resizing, resulting in pre-occupied tags. As the gain of the optimization is very small, well, almost nil, remove the whole thing. Signed-off-by: Tejun Heo Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 235c3414d268..8d7e2f4151d0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -294,7 +294,6 @@ struct blk_queue_tag { struct list_head busy_list; /* fifo list of busy tags */ int busy; /* current depth */ int max_depth; /* what we will send to device */ - int real_max_depth; /* what the array can hold */ atomic_t refcnt; /* map can be shared */ }; -- cgit v1.2.3 From f7d37d028dfba90b1b747f8ac685bf0959aeda8b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Jun 2005 00:08:50 -0700 Subject: [PATCH] blk: remove BLK_TAGS_{PER_LONG|MASK} Replace BLK_TAGS_PER_LONG with BITS_PER_LONG and remove unused BLK_TAGS_MASK. Signed-off-by: Tejun Heo Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8d7e2f4151d0..60272141ff19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -285,9 +285,6 @@ enum blk_queue_state { Queue_up, }; -#define BLK_TAGS_PER_LONG (sizeof(unsigned long) * 8) -#define BLK_TAGS_MASK (BLK_TAGS_PER_LONG - 1) - struct blk_queue_tag { struct request **tag_index; /* map of busy tags */ unsigned long *tag_map; /* bit map of free/busy tags */ -- cgit v1.2.3 From 55c888d6d09a0df236adfaf8ccf06ff5d0646775 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jun 2005 00:08:56 -0700 Subject: [PATCH] timers fixes/improvements This patch tries to solve the following problems: 1. del_timer_sync() is racy. The timer can be fired again after del_timer_sync() has checked all CPUs and before it rechecks timer_pending(). 2. It has scalability problems. All CPUs are scanned to determine if the timer is running on that cpu. With this patch del_timer_sync is O(1) and no slower than plain del_timer(pending_timer), unless it has to actually wait for completion of the currently running timer. The only restriction is that the recurring timer should not use add_timer_on(). 3. Timers are not serialized with respect to themselves. If CPU_0 does mod_timer(jiffies+1) while the timer is currently running on CPU 1, it is quite possible that a local interrupt on CPU_0 will start that timer before it has finished on CPU_1. 4. The timer locking is suboptimal. __mod_timer() takes 3 locks at once and still requires wmb() in del_timer/run_timers. The new implementation takes 2 locks sequentially and does not need memory barriers. Currently ->base != NULL means that the timer is pending. In that case ->base.lock is used to lock the timer. __mod_timer() also takes timer->lock because ->base can be == NULL. This patch uses timer->entry.next != NULL as the indication that the timer is pending. 
So it does __list_del(), entry->next = NULL instead of list_del() when the timer is deleted. The ->base field is used for hashed locking only; it is initialized in init_timer() which sets ->base = per_cpu(tvec_bases). When the tvec_bases.lock is locked, it means that all timers which are tied to this base via timer->base are locked, and the base itself is locked too. So __run_timers/migrate_timers can safely modify all timers which could be found on ->tvX lists (pending timers). When the timer's base is locked, and the timer removed from ->entry list (which means that __run_timers/migrate_timers can't see this timer), it is possible to set timer->base = NULL and drop the lock: the timer remains locked. This patch adds a lock_timer_base() helper, which waits for ->base != NULL, locks the ->base, and checks it is still the same. __mod_timer() schedules the timer on the local CPU and changes its base. However, it does not lock both old and new bases at once. It locks the timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and unlocks the old base. Then __mod_timer() locks new_base, sets ->base = new_base, and adds this timer. This simplifies the code, because an AB-BA deadlock is not possible. __mod_timer() also ensures that the timer's base is not changed while the timer's handler is running on the old base. __run_timers() and del_timer() do not change ->base anymore; they only clear the pending flag. So del_timer_sync() can test timer->base->running_timer == timer to detect whether it is running or not. We don't need timer_list->lock anymore; this patch kills it. We also don't need barriers. del_timer() and __run_timers() used smp_wmb() before clearing timer's pending flag. It was needed because __mod_timer() did not lock old_base if the timer was not pending, so __mod_timer()->list_add() could race with del_timer()->list_del(). With this patch these functions are serialized through base->lock. One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch adds a global struct timer_base_s { spinlock_t lock; struct timer_list *running_timer; } __init_timer_base; which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s struct are replaced by struct timer_base_s t_base. It is indeed ugly. But this can't have scalability problems. The global __init_timer_base.lock is used only when __mod_timer() is called for the first time AND the timer was compile-time initialized. After that the timer migrates to the local CPU. 
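A sketch of that helper, reconstructed from the description above rather than quoted from kernel/timer.c: spin until ->base is non-NULL and still the same after taking its lock, since __mod_timer() briefly sets ->base = NULL while migrating a timer.

	/*
	 * Sketch of lock_timer_base() per the changelog (assumed shape;
	 * the real helper lives in kernel/timer.c and may differ).
	 */
	static struct timer_base_s *lock_timer_base(struct timer_list *timer,
						    unsigned long *flags)
	{
		struct timer_base_s *base;

		for (;;) {
			base = timer->base;
			if (base != NULL) {
				spin_lock_irqsave(&base->lock, *flags);
				if (base == timer->base)
					return base;	/* locked and stable */
				/* base changed under us; unlock and retry */
				spin_unlock_irqrestore(&base->lock, *flags);
			}
			cpu_relax();
		}
	}
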
Signed-off-by: Oleg Nesterov Acked-by: Ingo Molnar Signed-off-by: Renaud Lienhart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timer.h | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 90db1cc62ddd..2e78fedfc069 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -6,45 +6,33 @@ #include #include -struct tvec_t_base_s; +struct timer_base_s; struct timer_list { struct list_head entry; unsigned long expires; - spinlock_t lock; unsigned long magic; void (*function)(unsigned long); unsigned long data; - struct tvec_t_base_s *base; + struct timer_base_s *base; }; #define TIMER_MAGIC 0x4b87ad6e +extern struct timer_base_s __init_timer_base; + #define TIMER_INITIALIZER(_function, _expires, _data) { \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ - .base = NULL, \ + .base = &__init_timer_base, \ .magic = TIMER_MAGIC, \ - .lock = SPIN_LOCK_UNLOCKED, \ } -/*** - * init_timer - initialize a timer. - * @timer: the timer to be initialized - * - * init_timer() must be done to a timer prior calling *any* of the - * other timer functions. - */ -static inline void init_timer(struct timer_list * timer) -{ - timer->base = NULL; - timer->magic = TIMER_MAGIC; - spin_lock_init(&timer->lock); -} +void fastcall init_timer(struct timer_list * timer); /*** * timer_pending - is a timer pending? @@ -58,7 +46,7 @@ static inline void init_timer(struct timer_list * timer) */ static inline int timer_pending(const struct timer_list * timer) { - return timer->base != NULL; + return timer->entry.next != NULL; } extern void add_timer_on(struct timer_list *timer, int cpu); @@ -89,12 +77,12 @@ static inline void add_timer(struct timer_list * timer) #ifdef CONFIG_SMP extern int del_timer_sync(struct timer_list *timer); - extern int del_singleshot_timer_sync(struct timer_list *timer); #else # define del_timer_sync(t) del_timer(t) -# define del_singleshot_timer_sync(t) del_timer(t) #endif +#define del_singleshot_timer_sync(t) del_timer_sync(t) + extern void init_timers(void); extern void run_local_timers(void); extern void it_real_fn(unsigned long); -- cgit v1.2.3 From fd450b7318b75343fd76b3d95416853e34e72c95 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jun 2005 00:08:59 -0700 Subject: [PATCH] timers: introduce try_to_del_timer_sync() This patch splits del_timer_sync() into 2 functions. The new one, try_to_del_timer_sync(), returns -1 when it hits an executing timer. It can be used in interrupt context, or when the caller holds locks which can prevent completion of the timer's handler. NOTE. Currently it can't be used in interrupt context in the UP case, because ->running_timer is used only with CONFIG_SMP. Should the need arise, it is possible to kill #ifdef CONFIG_SMP in set_running_timer(); it is cheap. 
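A hypothetical use of the new primitive (struct my_obj and its lock are illustrative, not from the patch): a caller holding a lock that the timer handler also takes cannot call del_timer_sync() without risking deadlock, but it can poll try_to_del_timer_sync() and back off while the handler runs.

	/*
	 * Hypothetical caller: obj->lock is also taken by the timer
	 * handler (softirq context), so we take it with _bh and retry
	 * with the lock dropped whenever the handler is running (-1).
	 */
	static void stop_my_timer(struct my_obj *obj)
	{
		for (;;) {
			spin_lock_bh(&obj->lock);
			if (try_to_del_timer_sync(&obj->timer) >= 0) {
				spin_unlock_bh(&obj->lock);
				break;
			}
			spin_unlock_bh(&obj->lock);
			cpu_relax();	/* let the running handler finish */
		}
	}
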
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timer.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 2e78fedfc069..221f81ac2002 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -76,9 +76,11 @@ static inline void add_timer(struct timer_list * timer) } #ifdef CONFIG_SMP + extern int try_to_del_timer_sync(struct timer_list *timer); extern int del_timer_sync(struct timer_list *timer); #else -# define del_timer_sync(t) del_timer(t) +# define try_to_del_timer_sync(t) del_timer(t) +# define del_timer_sync(t) del_timer(t) #endif #define del_singleshot_timer_sync(t) del_timer_sync(t) -- cgit v1.2.3 From 991114c6fa6a21d1fa4d544abe78592352860c82 Mon Sep 17 00:00:00 2001 From: Alexander Viro Date: Thu, 23 Jun 2005 00:09:01 -0700 Subject: [PATCH] fix for prune_icache()/forced final iput() races Based on analysis and a patch from Russ Weight. There is a race condition that can occur if an inode is allocated and then released (using iput) during the ->fill_super functions. The race condition is between kswapd and mount. For most filesystems this can only happen in an error path when kswapd is running concurrently. For isofs, however, the error can occur in a more common code path (which is how the bug was found). The logic here is "we want final iput() to free inode *now* instead of letting it sit in cache if fs is going down or had not quite come up". The problem is with kswapd seeing such inodes in the middle of being killed and happily taking over. The clean solution would be to tell kswapd to leave those inodes alone and let our final iput deal with them. I.e. add a new flag (I_FORCED_FREEING, which lands in the diff below as I_WILL_FREE), set it before write_inode_now() there and make prune_icache() leave those alone. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e5a8db00df29..3622e952e98c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1025,6 +1025,7 @@ struct super_operations { #define I_FREEING 16 #define I_CLEAR 32 #define I_NEW 64 +#define I_WILL_FREE 128 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) -- cgit v1.2.3 From 543537bd922692bc978e2e356fcd8bfc9c2ee7d5 Mon Sep 17 00:00:00 2001 From: Paulo Marques Date: Thu, 23 Jun 2005 00:09:02 -0700 Subject: [PATCH] create a kstrdup library function This patch creates a new kstrdup library function and changes the "local" implementations in several places to use this function. Most of the changes come from the sound and net subsystems. The sound part had already been acknowledged by Takashi Iwai and the net part by David S. Miller. I left UML alone for now because I would need more time to read the code carefully before making changes there. 
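A minimal sketch of what such a helper looks like, consistent with the prototype added to string.h below (the actual body lives outside include/linux, so details may differ): duplicate a NUL-terminated string into freshly kmalloc'd storage.

	/*
	 * Sketch of kstrdup() matching the declared prototype; returns
	 * NULL for a NULL input or on allocation failure. The copy is
	 * freed with kfree() when no longer needed.
	 */
	char *kstrdup(const char *s, int gfp)
	{
		size_t len;
		char *buf;

		if (!s)
			return NULL;

		len = strlen(s) + 1;
		buf = kmalloc(len, gfp);
		if (buf)
			memcpy(buf, s, len);
		return buf;
	}
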
Signed-off-by: Paulo Marques Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/netdevice.h | 4 ---- include/linux/string.h | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d6afd440cf7b..d89816ad642f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -925,10 +925,6 @@ extern int skb_checksum_help(struct sk_buff *skb, int inward); extern void net_enable_timestamp(void); extern void net_disable_timestamp(void); -#ifdef CONFIG_SYSCTL -extern char *net_sysctl_strdup(const char *s); -#endif - #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ diff --git a/include/linux/string.h b/include/linux/string.h index b9fc59469956..93994c613095 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -88,6 +88,8 @@ extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); #endif +extern char *kstrdup(const char *s, int gfp); + #ifdef __cplusplus } #endif -- cgit v1.2.3 From 35a82d1a53e1a9ad54efafcc940f9335beaed5c3 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 23 Jun 2005 00:09:06 -0700 Subject: [PATCH] optimise loop driver a bit Looks like locking can be optimised quite a lot. Increase lock widths slightly so lo_lock is taken fewer times per request. Also it was quite trivial to cover lo_pending with that lock, and remove the atomic requirement. This also makes memory ordering explicitly correct, which is nice (not that I particularly saw any mem ordering bugs). Test was reading 4 250MB files in parallel on ext2-on-tmpfs filesystem (1K block size, 4K page size). System is 2 socket Xeon with HT (4 thread). intel:/home/npiggin# umount /dev/loop0 ; mount /dev/loop0 /mnt/loop ; /usr/bin/time ./mtloop.sh Before: 0.24user 5.51system 0:02.84elapsed 202%CPU (0avgtext+0avgdata 0maxresident)k 0.19user 5.52system 0:02.88elapsed 198%CPU (0avgtext+0avgdata 0maxresident)k 0.19user 5.57system 0:02.89elapsed 198%CPU (0avgtext+0avgdata 0maxresident)k 0.22user 5.51system 0:02.90elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k 0.19user 5.44system 0:02.91elapsed 193%CPU (0avgtext+0avgdata 0maxresident)k After: 0.07user 2.34system 0:01.68elapsed 143%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.37system 0:01.68elapsed 144%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.39system 0:01.68elapsed 145%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.36system 0:01.68elapsed 144%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.42system 0:01.68elapsed 147%CPU (0avgtext+0avgdata 0maxresident)k Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/loop.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/loop.h b/include/linux/loop.h index 8220d9c9da00..53fa51595443 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -61,7 +61,7 @@ struct loop_device { struct semaphore lo_sem; struct semaphore lo_ctl_mutex; struct semaphore lo_bh_mutex; - atomic_t lo_pending; + int lo_pending; request_queue_t *lo_queue; }; -- cgit v1.2.3 From ac20427ef6aa63da663bdc88b71d16f7394f5e23 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 23 Jun 2005 00:09:11 -0700 Subject: [PATCH] add check to /proc/devices read routines Patch to add check to get_chrdev_list and get_blkdev_list to prevent reads of /proc/devices from spilling over the provided page if more than 4096 bytes of string data are generated 
from all the registered character and block devices in a system. Signed-off-by: Neil Horman Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index af26dc718ef6..01796c41c951 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -224,7 +224,7 @@ static inline void free_disk_stats(struct gendisk *disk) extern void disk_round_stats(struct gendisk *disk); /* drivers/block/genhd.c */ -extern int get_blkdev_list(char *); +extern int get_blkdev_list(char *, int); extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); extern void unlink_gendisk(struct gendisk *gp); -- cgit v1.2.3 From 84de856ed30c568c2bb7b9ac0679772bd2737d9b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:09:16 -0700 Subject: [PATCH] quota: consolidate code surrounding vfs_quota_on_mount Move some code duplicated in both callers into vfs_quota_on_mount. Signed-off-by: Christoph Hellwig Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/quotaops.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index e57baa85e744..d211507ab246 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -39,7 +39,8 @@ extern int dquot_commit_info(struct super_block *sb, int type); extern int dquot_mark_dquot_dirty(struct dquot *dquot); extern int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path); -extern int vfs_quota_on_mount(int type, int format_id, struct dentry *dentry); +extern int vfs_quota_on_mount(struct super_block *sb, char *qf_name, + int format_id, int type); extern int vfs_quota_off(struct super_block *sb, int type); #define vfs_quota_off_mount(sb, type) vfs_quota_off(sb, type) extern int vfs_quota_sync(struct super_block *sb, int type); -- cgit v1.2.3 From b94cce926b2b902b79380ccba370d6f9f2980de0 Mon Sep 17 00:00:00 2001 From: Hien Nguyen Date: Thu, 23 Jun 2005 00:09:19 -0700 Subject: [PATCH] kprobes: function-return probes This patch adds function-return probes to kprobes for the i386 architecture. This enables you to establish a handler to be run when a function returns. 1. API Two new functions are added to kprobes: int register_kretprobe(struct kretprobe *rp); void unregister_kretprobe(struct kretprobe *rp); 2. Registration and unregistration 2.1 Register To register a function-return probe, the user populates the following fields in a kretprobe object and calls register_kretprobe() with the kretprobe address as an argument: kp.addr - the function's address; handler - this function is run after the ret instruction executes, but before control returns to the return address in the caller. maxactive - The maximum number of instances of the probed function that can be active concurrently. For example, if the function is non-recursive and is called with a spinlock or mutex held, maxactive = 1 should be enough. If the function is non-recursive and can never relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should be enough. maxactive is used to determine how many kretprobe_instance objects to allocate for this particular probed function.
If maxactive <= 0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 * NR_CPUS) else maxactive=NR_CPUS) For example: struct kretprobe rp; rp.kp.addr = /* entrypoint address */ rp.handler = /*return probe handler */ rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */ register_kretprobe(&rp); The following field may also be of interest: nmissed - Initialized to zero when the function-return probe is registered, and incremented every time the probed function is entered but there is no kretprobe_instance object available for establishing the function-return probe (i.e., because maxactive was set too low). 2.2 Unregister To unregister a function-return probe, the user calls unregister_kretprobe() with the same kretprobe object as registered previously. If a probed function is running when the return probe is unregistered, the function will return as expected, but the handler won't be run. 3. Limitations 3.1 This patch supports only the i386 architecture, but patches for x86_64 and ppc64 are anticipated soon. 3.2 Return probes operate by replacing the return address in the stack (or in a known register, such as the lr register for ppc). This may cause __builtin_return_address(0), when invoked from the return-probed function, to return the address of the return-probes trampoline. 3.3 This implementation uses the "Multiprobes at an address" feature in 2.6.12-rc3-mm3. 3.4 Due to a limitation in multi-probes, you cannot currently establish a return probe and a jprobe on the same function. A patch to remove this limitation is being tested. This feature is required by SystemTap (http://sourceware.org/systemtap), and reflects ideas contributed by several SystemTap developers, including Will Cohen and Ananth Mavinakayanahalli. Signed-off-by: Hien Nguyen Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Frederik Deweerdt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 90 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 99ddba5a4e00..fba39f87efec 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -25,21 +25,31 @@ * Rusty Russell). * 2004-July Suparna Bhattacharya added jumper probes * interface to access function arguments. + * 2005-May Hien Nguyen and Jim Keniston + * and Prasanna S Panchamukhi + * added function-return probes.
*/ #include #include #include #include +#include + #include struct kprobe; struct pt_regs; +struct kretprobe; +struct kretprobe_instance; typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *); typedef int (*kprobe_break_handler_t) (struct kprobe *, struct pt_regs *); typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *, unsigned long flags); typedef int (*kprobe_fault_handler_t) (struct kprobe *, struct pt_regs *, int trapnr); +typedef int (*kretprobe_handler_t) (struct kretprobe_instance *, + struct pt_regs *); + struct kprobe { struct hlist_node hlist; @@ -85,6 +95,62 @@ struct jprobe { kprobe_opcode_t *entry; /* probe handling code to jump to */ }; +#ifdef ARCH_SUPPORTS_KRETPROBES +extern int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs); +extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags); +extern struct task_struct *arch_get_kprobe_task(void *ptr); +extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs); +extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock); +#else /* ARCH_SUPPORTS_KRETPROBES */ +static inline void kretprobe_trampoline(void) +{ +} +static inline int trampoline_probe_handler(struct kprobe *p, + struct pt_regs *regs) +{ + return 0; +} +static inline void trampoline_post_handler(struct kprobe *p, + struct pt_regs *regs, unsigned long flags) +{ +} +static inline void arch_prepare_kretprobe(struct kretprobe *rp, + struct pt_regs *regs) +{ +} +static inline void arch_kprobe_flush_task(struct task_struct *tk) +{ +} +#define arch_get_kprobe_task(ptr) ((struct task_struct *)NULL) +#endif /* ARCH_SUPPORTS_KRETPROBES */ +/* + * Function-return probe - + * Note: + * User needs to provide a handler function, and initialize maxactive. + * maxactive - The maximum number of instances of the probed function that + * can be active concurrently. + * nmissed - tracks the number of times the probed function's return was + * ignored, due to maxactive being too low. + * + */ +struct kretprobe { + struct kprobe kp; + kretprobe_handler_t handler; + int maxactive; + int nmissed; + struct hlist_head free_instances; + struct hlist_head used_instances; +}; + +struct kretprobe_instance { + struct hlist_node uflist; /* either on free list or used list */ + struct hlist_node hlist; + struct kretprobe *rp; + void *ret_addr; + void *stack_addr; +}; + #ifdef CONFIG_KPROBES /* Locks kprobe: irq must be disabled */ void lock_kprobes(void); @@ -104,6 +170,7 @@ extern void show_registers(struct pt_regs *regs); /* Get the kprobe at this addr (if any). 
Must have called lock_kprobes */ struct kprobe *get_kprobe(void *addr); +struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk); int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); @@ -113,7 +180,16 @@ int register_jprobe(struct jprobe *p); void unregister_jprobe(struct jprobe *p); void jprobe_return(void); -#else +int register_kretprobe(struct kretprobe *rp); +void unregister_kretprobe(struct kretprobe *rp); + +struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp); +struct kretprobe_instance *get_rp_inst(void *sara); +struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk); +void add_rp_inst(struct kretprobe_instance *ri); +void kprobe_flush_task(struct task_struct *tk); +void recycle_rp_inst(struct kretprobe_instance *ri); +#else /* CONFIG_KPROBES */ static inline int kprobe_running(void) { return 0; @@ -135,5 +211,15 @@ static inline void unregister_jprobe(struct jprobe *p) static inline void jprobe_return(void) { } -#endif +static inline int register_kretprobe(struct kretprobe *rp) +{ + return -ENOSYS; +} +static inline void unregister_kretprobe(struct kretprobe *rp) +{ +} +static inline void kprobe_flush_task(struct task_struct *tk) +{ +} +#endif /* CONFIG_KPROBES */ #endif /* _LINUX_KPROBES_H */ -- cgit v1.2.3 From 7e1048b11c5afe79aac46a42e3ccec86b8365c6d Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Thu, 23 Jun 2005 00:09:25 -0700 Subject: [PATCH] Move kprobe [dis]arming into arch specific code The architecture independent code of the current kprobes implementation is arming and disarming kprobes at registration time. The problem is that the code is assuming that arming and disarming is just done by a simple write of some magic value to an address. This is problematic for ia64 where our instructions look more like structures, and we cannot insert break points by just doing something like: *p->addr = BREAKPOINT_INSTRUCTION; The following patch to 2.6.12-rc4-mm2 adds two new architecture dependent functions: * void arch_arm_kprobe(struct kprobe *p) * void arch_disarm_kprobe(struct kprobe *p) and then adds the new functions for each of the architectures that already implement kprobes (sparc64/ppc64/i386/x86_64). I thought arch_[dis]arm_kprobe was the most descriptive of what was really happening, but each of the architectures already had a disarm_kprobe() function that was really a "disarm and do some other clean-up items as needed when you stumble across a recursive kprobe." So... I took the liberty of changing the code that was calling disarm_kprobe() to call arch_disarm_kprobe(), and then do the cleanup in the block of code dealing with the recursive kprobe case. So far this patch has been tested on i386, x86_64, and ppc64, but still needs to be tested on sparc64.
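For the architectures where arming really is a single write, the new hooks stay trivial; a sketch modeled on the i386 case (the arch bodies are not part of this header hunk, so treat the details as illustrative):

	void arch_arm_kprobe(struct kprobe *p)
	{
		*p->addr = BREAKPOINT_INSTRUCTION;	/* plant the trap */
		flush_icache_range((unsigned long)p->addr,
				   (unsigned long)p->addr + sizeof(kprobe_opcode_t));
	}

	void arch_disarm_kprobe(struct kprobe *p)
	{
		*p->addr = p->opcode;			/* restore the saved insn */
		flush_icache_range((unsigned long)p->addr,
				   (unsigned long)p->addr + sizeof(kprobe_opcode_t));
	}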
Signed-off-by: Rusty Lynch Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index fba39f87efec..0f90466fb8b0 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -165,6 +165,8 @@ static inline int kprobe_running(void) extern int arch_prepare_kprobe(struct kprobe *p); extern void arch_copy_kprobe(struct kprobe *p); +extern void arch_arm_kprobe(struct kprobe *p); +extern void arch_disarm_kprobe(struct kprobe *p); extern void arch_remove_kprobe(struct kprobe *p); extern void show_registers(struct pt_regs *regs); -- cgit v1.2.3 From 0aa55e4d7db822059fe8132fe9f2b7773c48216c Mon Sep 17 00:00:00 2001 From: Hien Nguyen Date: Thu, 23 Jun 2005 00:09:26 -0700 Subject: [PATCH] kprobes: moves lock-unlock to non-arch kprobe_flush_task This patch moves the lock/unlock of the arch specific kprobe_flush_task() to the non-arch specific kprobe_flush_task(). Signed-off-by: Hien Nguyen Acked-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0f90466fb8b0..461391decc46 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -33,7 +33,6 @@ #include #include #include -#include #include @@ -101,7 +100,7 @@ extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, unsigned long flags); extern struct task_struct *arch_get_kprobe_task(void *ptr); extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs); -extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock); +extern void arch_kprobe_flush_task(struct task_struct *tk); #else /* ARCH_SUPPORTS_KRETPROBES */ static inline void kretprobe_trampoline(void) { -- cgit v1.2.3 From ea32c65cc2d2294c04e9f81d0578a6f51febfdbf Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Thu, 23 Jun 2005 00:09:36 -0700 Subject: [PATCH] kprobes: Temporary disarming of reentrant probe In situations where a kprobes handler calls a routine which has a probe on it, kprobes_handler() disarms the new probe forever. This patch removes the above limitation by temporarily disarming the new probe. When another probe hits while handling the old probe, the kprobes_handler() saves previous kprobes state and handles the new probe without calling the new kprobes registered handlers. kprobe_post_handler() restores the previous kprobes state and the normal execution continues. However on the x86_64 architecture, re-entrancy is provided only through pre_handler(). If a routine having a probe is referenced through post_handler(), then the probes on that routine are disarmed forever, since the exception stack gets changed after the processor single-steps the instruction of the new probe. This patch includes generic changes to support temporary disarming on reentrancy of probes.
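In rough pseudo-C, the reentrant path in the breakpoint handler now behaves like this (the helper names here are illustrative, not the patch's own):

	/* inside kprobe_handler(), on hitting probe p while another is active */
	if (kprobe_running()) {
		save_previous_kprobe();		/* hypothetical: stash the current
						   probe and its kprobe_status */
		set_current_kprobe(p);		/* hypothetical */
		kprobe_status = KPROBE_REENTER;	/* new state bit from this patch */
		p->nmissed++;			/* user handlers are skipped */
		prepare_singlestep(p, regs);	/* step over p; the post-handler
						   then restores the saved probe */
		return 1;
	}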
Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 461391decc46..5e1a7b0d7b3f 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -36,6 +36,12 @@ #include +/* kprobe_status settings */ +#define KPROBE_HIT_ACTIVE 0x00000001 +#define KPROBE_HIT_SS 0x00000002 +#define KPROBE_REENTER 0x00000004 +#define KPROBE_HIT_SSDONE 0x00000008 + struct kprobe; struct pt_regs; struct kretprobe; @@ -55,6 +61,9 @@ struct kprobe { /* list of kprobes for multi-handler support */ struct list_head list; + /*count the number of times this probe was temporarily disarmed */ + unsigned long nmissed; + /* location of the probe point */ kprobe_opcode_t *addr; -- cgit v1.2.3 From d6e711448137ca3301512cec41a2c2ce852b3d0a Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 23 Jun 2005 00:09:43 -0700 Subject: [PATCH] setuid core dump Add a new `suid_dumpable' sysctl: This value can be used to query and set the core dump mode for setuid or otherwise protected/tainted binaries. The modes are 0 - (default) - traditional behaviour. Any process which has changed privilege levels or is execute only will not be dumped 1 - (debug) - all processes dump core when possible. The core dump is owned by the current user and no security is applied. This is intended for system debugging situations only. Ptrace is unchecked. 2 - (suidsafe) - any binary which normally would not be dumped is dumped readable by root only. This allows the end user to remove such a dump but not access it directly. For security reasons core dumps in this mode will not overwrite one another or other files. This mode is appropriate when administrators are attempting to debug problems in a normal environment. (akpm: > > +EXPORT_SYMBOL(suid_dumpable); > > EXPORT_SYMBOL_GPL? No problem to me. > > if (current->euid == current->uid && current->egid == current->gid) > > current->mm->dumpable = 1; > > Should this be SUID_DUMP_USER? Actually the feedback I had from last time was that the SUID_ defines should go because its clearer to follow the numbers. They can go everywhere (and there are lots of places where dumpable is tested/used as a bool in untouched code) > Maybe this should be renamed to `dump_policy' or something. Doing that > would help us catch any code which isn't using the #defines, too. Fair comment. The patch was designed to be easy to maintain for Red Hat rather than for merging. Changing that field would create a gigantic diff because it is used all over the place.
) Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/binfmts.h | 5 +++++ include/linux/sched.h | 2 +- include/linux/sysctl.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 7e736e201c46..c1e82c514443 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -69,6 +69,11 @@ extern void remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *,struct pt_regs *); extern int flush_old_exec(struct linux_binprm * bprm); +extern int suid_dumpable; +#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ +#define SUID_DUMP_USER 1 /* Dump as user of process */ +#define SUID_DUMP_ROOT 2 /* Dump as root */ + /* Stack area protections */ #define EXSTACK_DEFAULT 0 /* Whatever the arch defaults to */ #define EXSTACK_DISABLE_X 1 /* Disable executable stacks */ diff --git a/include/linux/sched.h b/include/linux/sched.h index b58afd97a180..901742f92389 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -246,7 +246,7 @@ struct mm_struct { unsigned long saved_auxv[42]; /* for /proc/PID/auxv */ - unsigned dumpable:1; + unsigned dumpable:2; cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index a17745c80a91..614e939c78a4 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -136,6 +136,7 @@ enum KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */ KERN_RANDOMIZE=68, /* int: randomize virtual address space */ + KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */ }; -- cgit v1.2.3 From ef3daeda7b58f046f94b26637d500354038d39f4 Mon Sep 17 00:00:00 2001 From: Yoav Zach Date: Thu, 23 Jun 2005 00:09:58 -0700 Subject: [PATCH] Don't force O_LARGEFILE for 32 bit processes on ia64 In the ia64 kernel, the O_LARGEFILE flag is forced when opening a file. This is problematic for execution of 32 bit processes, which are not largefile aware, either by SW emulation or by HW execution. For such processes, the problem is two-fold: 1) When trying to open a file that is larger than 4G, the operation should fail, but it does not. 2) Writing to an offset larger than 4G should fail, but it does not. The proposed patch takes advantage of the way 32 bit processes are identified in ia64 systems. Such processes have PER_LINUX32 for their personality. With the patch, the ia64 kernel will not enforce the O_LARGEFILE flag if the current process has PER_LINUX32 set. The behavior for all other architectures remains unchanged. Signed-off-by: Yoav Zach Acked-by: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fcntl.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index 704fb76b6334..8a7c82151de9 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -25,6 +25,10 @@ #ifdef __KERNEL__ +#ifndef force_o_largefile +#define force_o_largefile() (BITS_PER_LONG != 32) +#endif + #if BITS_PER_LONG == 32 #define IS_GETLK32(cmd) ((cmd) == F_GETLK) #define IS_SETLK32(cmd) ((cmd) == F_SETLK) -- cgit v1.2.3 From 46c271bedd2c8444b1d05bc44928beec0c07debc Mon Sep 17 00:00:00 2001 From: Peter Osterlund Date: Thu, 23 Jun 2005 00:10:02 -0700 Subject: [PATCH] Improve CD/DVD packet driver write performance This patch improves write performance for the CD/DVD packet writing driver.
The logic for switching between reading and writing has been changed so that streaming writes are no longer interrupted by read requests. Signed-off-by: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pktcdvd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 4e2d2a942ecb..4b32bce9a289 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -159,7 +159,7 @@ struct packet_iosched struct bio *read_queue_tail; struct bio *write_queue; struct bio *write_queue_tail; - int high_prio_read; /* An important read request has been queued */ + sector_t last_write; /* The sector where the last write ended */ int successive_reads; }; -- cgit v1.2.3 From bb93e3a52f8db7210258a1a2134cced0b78a46e1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 00:10:15 -0700 Subject: [PATCH] block: add unlocked_ioctl support for block devices This patch allows block device drivers to convert their ioctl functions to unlocked_ioctl() like character devices and other subsystems. All functions that were called with the BKL held before are still used that way, but I would not be surprised if it could be removed from the ioctl functions in drivers/block/ioctl.c themselves. As a side note, I found that compat_blkdev_ioctl() acquires the BKL as well, which looks like a bug. I have checked that every user of disk->fops->compat_ioctl() in the current git tree gets the BKL itself, so it could easily be removed from compat_blkdev_ioctl(). Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3622e952e98c..9b1278e21279 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -884,6 +884,7 @@ struct block_device_operations { int (*open) (struct inode *, struct file *); int (*release) (struct inode *, struct file *); int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); + long (*unlocked_ioctl) (struct file *, unsigned, unsigned long); long (*compat_ioctl) (struct file *, unsigned, unsigned long); int (*media_changed) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); -- cgit v1.2.3 From 45778ca819accab1a4a3378b3566cab0f189164f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:10:17 -0700 Subject: [PATCH] Remove f_error field from struct file The following patch removes the f_error field and all checks of f_error. Trond said: f_error was introduced for NFS, and made sense when we were guaranteed always to have a file pointer around when write errors occurred. Since then, we have (for various reasons) had to introduce the nfs_open_context in order to track the file read/write state, and it made sense to move our f_error tracking there too. 
Signed-off-by: Christoph Lameter Acked-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9b1278e21279..517bf4966bf5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -581,7 +581,6 @@ struct file { atomic_t f_count; unsigned int f_flags; mode_t f_mode; - int f_error; loff_t f_pos; struct fown_struct f_owner; unsigned int f_uid, f_gid; -- cgit v1.2.3 From f9fd27a253d5e0b23531d12ce7ad15b6535d4486 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:10:19 -0700 Subject: [PATCH] acl endianess annotations Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/posix_acl_xattr.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 5efd0a6dad94..fe271c1947b2 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -23,13 +23,13 @@ #define ACL_UNDEFINED_ID (-1) typedef struct { - __u16 e_tag; - __u16 e_perm; - __u32 e_id; + __le16 e_tag; + __le16 e_perm; + __le32 e_id; } posix_acl_xattr_entry; typedef struct { - __u32 a_version; + __le32 a_version; posix_acl_xattr_entry a_entries[0]; } posix_acl_xattr_header; -- cgit v1.2.3 From 9a59f452abe11f569e13ec16c51e6d61c54b9838 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:10:19 -0700 Subject: [PATCH] remove This file duplicates , using slightly different names. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/posix_acl_xattr.h | 3 +++ include/linux/reiserfs_acl.h | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index fe271c1947b2..6e53c34035cd 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -52,4 +52,7 @@ posix_acl_xattr_count(size_t size) return size / sizeof(posix_acl_xattr_entry); } +struct posix_acl *posix_acl_from_xattr(const void *value, size_t size); +int posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size); + #endif /* _POSIX_ACL_XATTR_H */ diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h index 2aef9c3f5ce8..0760507a545b 100644 --- a/include/linux/reiserfs_acl.h +++ b/include/linux/reiserfs_acl.h @@ -1,6 +1,5 @@ #include #include -#include #define REISERFS_ACL_VERSION 0x0001 -- cgit v1.2.3 From c43dc2fd885b5658cfd7cedb7bcca20910c517a4 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Thu, 23 Jun 2005 00:10:27 -0700 Subject: [PATCH] aio: make wait_queue ->task ->private In the upcoming aio_down patch, it is useful to store a private data pointer in the kiocb's wait_queue. Since we provide our own wake up function and do not require the task_struct pointer, it makes sense to convert the task pointer into a generic private pointer. 
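A sketch of the pattern this enables — an async waiter that carries private context instead of a task pointer (struct my_ctx and the function names are made up for illustration; the accessors are the ones in the hunk below):

	struct my_ctx;				/* illustrative private state */

	static int my_wake(wait_queue_t *wait, unsigned mode, int sync, void *key)
	{
		struct my_ctx *ctx = wait->private;	/* formerly wait->task */
		/* ... complete the async operation tracked by ctx ... */
		return 1;
	}

	static void my_add_waiter(wait_queue_head_t *q, wait_queue_t *wait,
				  struct my_ctx *ctx)
	{
		init_waitqueue_func_entry(wait, my_wake); /* sets private to NULL */
		wait->private = ctx;	/* any pointer, no task_struct needed */
		add_wait_queue(q, wait);
	}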
Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index c9486c3efb4a..d38c9fecdc36 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -33,7 +33,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 - struct task_struct * task; + void *private; wait_queue_func_t func; struct list_head task_list; }; @@ -60,7 +60,7 @@ typedef struct __wait_queue_head wait_queue_head_t; */ #define __WAITQUEUE_INITIALIZER(name, tsk) { \ - .task = tsk, \ + .private = tsk, \ .func = default_wake_function, \ .task_list = { NULL, NULL } } @@ -86,7 +86,7 @@ static inline void init_waitqueue_head(wait_queue_head_t *q) static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) { q->flags = 0; - q->task = p; + q->private = p; q->func = default_wake_function; } @@ -94,7 +94,7 @@ static inline void init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) { q->flags = 0; - q->task = NULL; + q->private = NULL; q->func = func; } @@ -110,7 +110,7 @@ static inline int waitqueue_active(wait_queue_head_t *q) * aio specifies a wait queue entry with an async notification * callback routine, not associated with any task. */ -#define is_sync_wait(wait) (!(wait) || ((wait)->task)) +#define is_sync_wait(wait) (!(wait) || ((wait)->private)) extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); @@ -384,7 +384,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); #define DEFINE_WAIT(name) \ wait_queue_t name = { \ - .task = current, \ + .private = current, \ .func = autoremove_wake_function, \ .task_list = LIST_HEAD_INIT((name).task_list), \ } @@ -393,7 +393,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); struct wait_bit_queue name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ .wait = { \ - .task = current, \ + .private = current, \ .func = wake_bit_function, \ .task_list = \ LIST_HEAD_INIT((name).wait.task_list), \ @@ -402,7 +402,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); #define init_wait(wait) \ do { \ - (wait)->task = current; \ + (wait)->private = current; \ (wait)->func = autoremove_wake_function; \ INIT_LIST_HEAD(&(wait)->task_list); \ } while (0) -- cgit v1.2.3 From bfb07599da289881d3bcbb601a110e997fc7444b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 23 Jun 2005 00:10:32 -0700 Subject: [PATCH] Introduce tty_unregister_ldisc() It's a bit strange to see tty_register_ldisc() calls in modules' exit functions.
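With the new symbol, a line-discipline module's exit path can say what it means; a sketch (N_MYLDISC and the names are made up for illustration, and the old idiom shown in the comment is an assumption about prior practice):

	static void __exit my_ldisc_exit(void)
	{
		/* was typically: tty_register_ldisc(N_MYLDISC, NULL); */
		if (tty_unregister_ldisc(N_MYLDISC) != 0)
			printk(KERN_ERR "my_ldisc: line discipline still in use\n");
	}
	module_exit(my_ldisc_exit);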
Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tty.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 1b76106272d3..59ff42c629ec 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -345,6 +345,7 @@ extern int tty_check_change(struct tty_struct * tty); extern void stop_tty(struct tty_struct * tty); extern void start_tty(struct tty_struct * tty); extern int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc); +extern int tty_unregister_ldisc(int disc); extern int tty_register_driver(struct tty_driver *driver); extern int tty_unregister_driver(struct tty_driver *driver); extern void tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev); -- cgit v1.2.3 From 4749f32da939d4e4160541b2cadc22492bb507ec Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 23 Jun 2005 11:36:56 +0200 Subject: [PATCH] better USB_MON dependencies This makes the USB_MON dependencies less confusing. Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- include/linux/usb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 3d508bf08402..eb282b581546 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -290,7 +290,7 @@ struct usb_bus { struct class_device *class_dev; /* class device for this bus */ struct kref kref; /* handles reference counting this bus */ void (*release)(struct usb_bus *bus); /* function to destroy this bus's memory */ -#if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE) +#if defined(CONFIG_USB_MON) struct mon_bus *mon_bus; /* non-null when associated */ int monitored; /* non-zero when monitored */ #endif -- cgit v1.2.3 From 317a76f9a44b437d6301718f4e5d08bd93f98da7 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:19:55 -0700 Subject: [TCP]: Add pluggable congestion control algorithm infrastructure. Allow TCP to have multiple pluggable congestion control algorithms. Algorithms are defined by a set of operations and can be built in or modules. The legacy "new RENO" algorithm is used as a starting point and fallback. Signed-off-by: Stephen Hemminger Signed-off-by: David S.
Miller --- include/linux/sysctl.h | 9 +-------- include/linux/tcp.h | 49 ++++++++++--------------------------------------- 2 files changed, 11 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 614e939c78a4..72965bfe6cfb 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -333,21 +333,14 @@ enum NET_TCP_FRTO=92, NET_TCP_LOW_LATENCY=93, NET_IPV4_IPFRAG_SECRET_INTERVAL=94, - NET_TCP_WESTWOOD=95, NET_IPV4_IGMP_MAX_MSF=96, NET_TCP_NO_METRICS_SAVE=97, - NET_TCP_VEGAS=98, - NET_TCP_VEGAS_ALPHA=99, - NET_TCP_VEGAS_BETA=100, - NET_TCP_VEGAS_GAMMA=101, - NET_TCP_BIC=102, - NET_TCP_BIC_FAST_CONVERGENCE=103, - NET_TCP_BIC_LOW_WINDOW=104, NET_TCP_DEFAULT_WIN_SCALE=105, NET_TCP_MODERATE_RCVBUF=106, NET_TCP_TSO_WIN_DIVISOR=107, NET_TCP_BIC_BETA=108, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, + NET_TCP_CONG_CONTROL=110, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 97a7c9e03df5..3ea75dd6640a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -203,13 +203,6 @@ struct tcp_sack_block { __u32 end_seq; }; -enum tcp_congestion_algo { - TCP_RENO=0, - TCP_VEGAS, - TCP_WESTWOOD, - TCP_BIC, -}; - struct tcp_options_received { /* PAWS/RTTM data */ long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ @@ -305,7 +298,7 @@ struct tcp_sock { __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ - __u8 adv_cong; /* Using Vegas, Westwood, or BIC */ + __u8 unused; __u8 defer_accept; /* User waits for some data after accept() */ /* RTT measurement */ @@ -401,37 +394,10 @@ struct tcp_sock { __u32 time; } rcvq_space; -/* TCP Westwood structure */ - struct { - __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ - __u32 bw_est; /* bandwidth estimate */ - __u32 rtt_win_sx; /* here starts a new evaluation... */ - __u32 bk; - __u32 snd_una; /* used for evaluating the number of acked bytes */ - __u32 cumul_ack; - __u32 accounted; - __u32 rtt; - __u32 rtt_min; /* minimum observed RTT */ - } westwood; - -/* Vegas variables */ - struct { - __u32 beg_snd_nxt; /* right edge during last RTT */ - __u32 beg_snd_una; /* left edge during last RTT */ - __u32 beg_snd_cwnd; /* saves the size of the cwnd */ - __u8 doing_vegas_now;/* if true, do vegas for this RTT */ - __u16 cntRTT; /* # of RTTs measured within last RTT */ - __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ - __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ - } vegas; - - /* BI TCP Parameters */ - struct { - __u32 cnt; /* increase cwnd by 1 after this number of ACKs */ - __u32 last_max_cwnd; /* last maximium snd_cwnd */ - __u32 last_cwnd; /* the last snd_cwnd */ - __u32 last_stamp; /* time when updated last_cwnd */ - } bictcp; + /* Pluggable TCP congestion control hook */ + struct tcp_congestion_ops *ca_ops; + u32 ca_priv[16]; +#define TCP_CA_PRIV_SIZE (16*sizeof(u32)) }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk) return (struct tcp_sock *)sk; } +static inline void *tcp_ca(const struct tcp_sock *tp) +{ + return (void *) tp->ca_priv; +} + #endif #endif /* _LINUX_TCP_H */ -- cgit v1.2.3 From 056ede6cface66b400cd3b8e60ed077cc5b85c18 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:21:28 -0700 Subject: [TCP]: Report congestion control algorithm in tcp_diag. 
Enhancement to the tcp_diag interface used by the iproute2 ss command to report the tcp congestion control being used by a socket. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/tcp_diag.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h index ceee962e1d15..7a5996743946 100644 --- a/include/linux/tcp_diag.h +++ b/include/linux/tcp_diag.h @@ -99,9 +99,10 @@ enum TCPDIAG_MEMINFO, TCPDIAG_INFO, TCPDIAG_VEGASINFO, + TCPDIAG_CONG, }; -#define TCPDIAG_MAX TCPDIAG_VEGASINFO +#define TCPDIAG_MAX TCPDIAG_CONG /* TCPDIAG_MEM */ @@ -123,5 +124,4 @@ struct tcpvegas_info { __u32 tcpv_minrtt; }; - #endif /* _TCP_DIAG_H_ */ -- cgit v1.2.3
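Tying the last two changes together, a congestion control module written against this interface looks roughly like the sketch below. The tcp_congestion_ops layout lives in include/net/tcp.h rather than in these hunks, so the field names, argument lists, and the tcp_reno_cong_avoid fallback are reconstructions to be checked against that header:

	/* sketch: a minimal pluggable congestion control module */
	static u32 myca_ssthresh(struct tcp_sock *tp)
	{
		return max(tp->snd_cwnd >> 1U, 2U);	/* classic halving */
	}

	static void myca_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
				    u32 in_flight, int flag)
	{
		/* defer to the legacy New Reno behaviour */
		tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
	}

	static struct tcp_congestion_ops myca_ops = {
		.name		= "myca",
		.owner		= THIS_MODULE,
		.ssthresh	= myca_ssthresh,
		.cong_avoid	= myca_cong_avoid,
	};

	static int __init myca_init(void)
	{
		return tcp_register_congestion_control(&myca_ops);
	}

	static void __exit myca_exit(void)
	{
		tcp_unregister_congestion_control(&myca_ops);
	}

	module_init(myca_init);
	module_exit(myca_exit);

Per-socket scratch space for such a module comes from the ca_priv array added to struct tcp_sock above, reached through the tcp_ca() helper, and the new TCPDIAG_CONG attribute is what lets the iproute2 ss command report which algorithm a socket is using.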