From dd7f0b80926befc8c70a873b5b0c0c7b5fd1e7b9 Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Wed, 22 Jun 2005 12:38:33 -0700
Subject: [NETFILTER]: Fix "iptables -D" rule deletion with ipt_CLUSTERIP
 target.

The patch just changes the order of structure members.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ipt_CLUSTERIP.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
index baa83e757156..d9bceedfb3dc 100644
--- a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
+++ b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
@@ -18,7 +18,6 @@ struct clusterip_config;
 struct ipt_clusterip_tgt_info {
 
 	u_int32_t flags;
-	struct clusterip_config *config;
 	
 	/* only relevant for new ones */
 	u_int8_t clustermac[6];
@@ -27,6 +26,8 @@ struct ipt_clusterip_tgt_info {
 	u_int16_t local_nodes[CLUSTERIP_MAX_NODES];
 	enum clusterip_hashmode hash_mode;
 	u_int32_t hash_initval;
+
+	struct clusterip_config *config;
 };
 
 #endif /*_IPT_CLUSTERIP_H_target*/
-- 
cgit v1.2.3


From 10f7e7c15e6ce41799c5dba6925ae4bf8048c870 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 23 Jun 2005 09:43:07 +1000
Subject: [PATCH] ppc64: consolidate calibrate_decr implementations

pSeries and maple have almost the same code for calibrate_decr,
and BPA would need yet another copy. Instead, I'm moving the
code to arch/ppc64/kernel/time.c.

Some of the related declarations were missing from header
files, so I'm moving those as well.

It makes sense to merge this with the pmac function of the
same name, so we end up having just one implemetation for
iSeries and one for Open Firmware based machines.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/asm-ppc64/time.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/asm-ppc64/time.h b/include/asm-ppc64/time.h
index 8d6e3760ee10..c6c762cad8b0 100644
--- a/include/asm-ppc64/time.h
+++ b/include/asm-ppc64/time.h
@@ -34,6 +34,15 @@ struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
 extern time_t last_rtc_update;
 
+void generic_calibrate_decr(void);
+void setup_default_decr(void);
+
+/* Some sane defaults: 125 MHz timebase, 1GHz processor */
+extern unsigned long ppc_proc_freq;
+#define DEFAULT_PROC_FREQ	(DEFAULT_TB_FREQ * 8)
+extern unsigned long ppc_tb_freq;
+#define DEFAULT_TB_FREQ		125000000UL
+
 /*
  * By putting all of this stuff into a single struct we 
  * reduce the number of cache lines touched by do_gettimeofday.
-- 
cgit v1.2.3


From 773bf9c469c01f01280c9bd45ec2462dd94d08a0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 23 Jun 2005 09:43:18 +1000
Subject: [PATCH] ppc64: rename pSeries rtc functions into rtas_*

The rtc rtas functions are not pSeries specific but can
also be used by BPA and other SLOF based platforms

Signed-off-by: Arnd Bergmann <arndb@de.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/asm-ppc64/rtas.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h
index a8ab0e9db84a..c46ceb2ffc12 100644
--- a/include/asm-ppc64/rtas.h
+++ b/include/asm-ppc64/rtas.h
@@ -188,6 +188,11 @@ extern int rtas_set_power_level(int powerdomain, int level, int *setlevel);
 extern int rtas_set_indicator(int indicator, int index, int new_value);
 extern void rtas_initialize(void);
 
+struct rtc_time;
+extern void rtas_get_boot_time(struct rtc_time *rtc_time);
+extern void rtas_get_rtc_time(struct rtc_time *rtc_time);
+extern int rtas_set_rtc_time(struct rtc_time *rtc_time);
+
 /* Given an RTAS status code of 9900..9905 compute the hinted delay */
 unsigned int rtas_extended_busy_delay_time(int status);
 static inline int rtas_is_extended_busy(int status)
-- 
cgit v1.2.3


From 6566c6f1f18d42affe73ccdd403e290b64d10473 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 23 Jun 2005 09:43:28 +1000
Subject: [PATCH] ppc64: pSeries_progress -> rtas_progress

The pSeries_progress function is called from some places in the rtas code,
which may also be used by non-pSeries platforms.
Though pSeries is currently the only platform type that implements
display-character, the code is actually generic enough to be part of
the rtas subsystem.

I hit a bug here because the generic rtas code tried calling ppc_md.progress,
which points to an __init function on most platforms.

We could also clear the ppc_md.progress pointer when freeing the init memory
to make it more explicit that ppc_md.progress must not be called after
bootup.

Signed-off-by: Arnd Bergmann <arndb@de.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/asm-ppc64/rtas.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h
index c46ceb2ffc12..e7d1b5222802 100644
--- a/include/asm-ppc64/rtas.h
+++ b/include/asm-ppc64/rtas.h
@@ -186,6 +186,7 @@ extern int rtas_get_sensor(int sensor, int index, int *state);
 extern int rtas_get_power_level(int powerdomain, int *level);
 extern int rtas_set_power_level(int powerdomain, int level, int *setlevel);
 extern int rtas_set_indicator(int indicator, int index, int new_value);
+extern void rtas_progress(char *s, unsigned short hex);
 extern void rtas_initialize(void);
 
 struct rtc_time;
-- 
cgit v1.2.3


From 5f5b4e669a59be1cf8fc9d6d04ff1ccad8ab6de0 Mon Sep 17 00:00:00 2001
From: Utz Bacher <utz.bacher@de.ibm.com>
Date: Thu, 23 Jun 2005 09:43:31 +1000
Subject: [PATCH] ppc64: add a minimal nvram driver

The firmware provides the location and size of the nvram
in the device tree, so it does not really contain any
hardware specific bits and could be used on other
machines as well.

Signed-off-by: Arnd Bergmann <arndb@de.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/asm-ppc64/nvram.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/asm-ppc64/nvram.h b/include/asm-ppc64/nvram.h
index 4e6dd370d936..dfaa21566c9a 100644
--- a/include/asm-ppc64/nvram.h
+++ b/include/asm-ppc64/nvram.h
@@ -70,6 +70,7 @@ extern struct nvram_partition *nvram_find_partition(int sig, const char *name);
 
 extern int pSeries_nvram_init(void);
 extern int pmac_nvram_init(void);
+extern int bpa_nvram_init(void);
 
 /* PowerMac specific nvram stuffs */
 
-- 
cgit v1.2.3


From fef1c772fa154c16e0a54577e9ecb5480f7b937e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 23 Jun 2005 09:43:37 +1000
Subject: [PATCH] ppc64: add BPA platform type

This adds the basic support for running on BPA machines.
So far, this is only the IBM workstation, and it will
not run on others without a little more generalization.

It should be possible to configure a kernel for any
combination of CONFIG_PPC_BPA with any of the other
multiplatform targets.

Signed-off-by: Arnd Bergmann <arndb@de.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/asm-ppc64/mmu.h       |  5 +++--
 include/asm-ppc64/processor.h | 15 +++++++++++++--
 include/asm-ppc64/smp.h       |  8 ++++++++
 3 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h
index c78282a67d8e..9d03a98a4fa3 100644
--- a/include/asm-ppc64/mmu.h
+++ b/include/asm-ppc64/mmu.h
@@ -47,9 +47,10 @@
 #define SLB_VSID_KS		ASM_CONST(0x0000000000000800)
 #define SLB_VSID_KP		ASM_CONST(0x0000000000000400)
 #define SLB_VSID_N		ASM_CONST(0x0000000000000200) /* no-execute */
-#define SLB_VSID_L		ASM_CONST(0x0000000000000100) /* largepage 16M */
+#define SLB_VSID_L		ASM_CONST(0x0000000000000100) /* largepage */
 #define SLB_VSID_C		ASM_CONST(0x0000000000000080) /* class */
-
+#define SLB_VSID_LS		ASM_CONST(0x0000000000000070) /* size of largepage */
+ 
 #define SLB_VSID_KERNEL		(SLB_VSID_KP|SLB_VSID_C)
 #define SLB_VSID_USER		(SLB_VSID_KP|SLB_VSID_KS)
 
diff --git a/include/asm-ppc64/processor.h b/include/asm-ppc64/processor.h
index 3084099086a8..af28aa55d8c1 100644
--- a/include/asm-ppc64/processor.h
+++ b/include/asm-ppc64/processor.h
@@ -138,8 +138,16 @@
 #define	SPRN_NIADORM	0x3F3	/* Hardware Implementation Register 2 */
 #define SPRN_HID4	0x3F4	/* 970 HID4 */
 #define SPRN_HID5	0x3F6	/* 970 HID5 */
-#define	SPRN_TSC 	0x3FD	/* Thread switch control */
-#define	SPRN_TST 	0x3FC	/* Thread switch timeout */
+#define	SPRN_HID6	0x3F9	/* BE HID 6 */
+#define	  HID6_LB	(0x0F<<12) /* Concurrent Large Page Modes */
+#define	  HID6_DLP	(1<<20)	/* Disable all large page modes (4K only) */
+#define	SPRN_TSCR	0x399   /* Thread switch control on BE */
+#define	SPRN_TTR	0x39A   /* Thread switch timeout on BE */
+#define	  TSCR_DEC_ENABLE	0x200000 /* Decrementer Interrupt */
+#define	  TSCR_EE_ENABLE	0x100000 /* External Interrupt */
+#define	  TSCR_EE_BOOST		0x080000 /* External Interrupt Boost */
+#define	SPRN_TSC 	0x3FD	/* Thread switch control on others */
+#define	SPRN_TST 	0x3FC	/* Thread switch timeout on others */
 #define	SPRN_L2CR	0x3F9	/* Level 2 Cache Control Regsiter */
 #define	SPRN_LR		0x008	/* Link Register */
 #define	SPRN_PIR	0x3FF	/* Processor Identification Register */
@@ -259,6 +267,7 @@
 #define PV_970FX	0x003C
 #define	PV_630        	0x0040
 #define	PV_630p	        0x0041
+#define	PV_BE		0x0070
 
 /* Platforms supported by PPC64 */
 #define PLATFORM_PSERIES      0x0100
@@ -267,6 +276,7 @@
 #define PLATFORM_LPAR         0x0001
 #define PLATFORM_POWERMAC     0x0400
 #define PLATFORM_MAPLE        0x0500
+#define PLATFORM_BPA          0x1000
 
 /* Compatibility with drivers coming from PPC32 world */
 #define _machine	(systemcfg->platform)
@@ -278,6 +288,7 @@
 #define IC_INVALID    0
 #define IC_OPEN_PIC   1
 #define IC_PPC_XIC    2
+#define IC_BPA_IIC    3
 
 #define XGLUE(a,b) a##b
 #define GLUE(a,b) XGLUE(a,b)
diff --git a/include/asm-ppc64/smp.h b/include/asm-ppc64/smp.h
index 8115ecb8feee..d86f742e9a21 100644
--- a/include/asm-ppc64/smp.h
+++ b/include/asm-ppc64/smp.h
@@ -85,6 +85,14 @@ extern void smp_generic_take_timebase(void);
 
 extern struct smp_ops_t *smp_ops;
 
+#ifdef CONFIG_PPC_PSERIES
+void vpa_init(int cpu);
+#else
+static inline void vpa_init(int cpu)
+{
+}
+#endif /* CONFIG_PPC_PSERIES */
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* !(_PPC64_SMP_H) */
-- 
cgit v1.2.3


From 6ca4f65e6b390d09e1de7280cf9fd4f5d8e4b48b Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Wed, 22 Jun 2005 22:04:55 -0700
Subject: [NETPOLL]: Set poll_owner to -1 before unlocking in
 netpoll_poll_unlock()

This trivial patch moves the assignment of poll_owner to -1 inside of
the lock.  This fixes a potential SMP race in the code.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index c0d8b90c5202..449a4fde6587 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -53,8 +53,8 @@ static inline void netpoll_poll_lock(struct net_device *dev)
 static inline void netpoll_poll_unlock(struct net_device *dev)
 {
 	if (dev->np) {
-		spin_unlock(&dev->np->poll_lock);
 		dev->np->poll_owner = -1;
+		spin_unlock(&dev->np->poll_lock);
 	}
 }
 
-- 
cgit v1.2.3


From 115c1d6e61b70851d9a363328c3b8d4c2559a1d3 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Wed, 22 Jun 2005 22:05:31 -0700
Subject: [NETPOLL]: Introduce a netpoll_info struct

This patch introduces a netpoll_info structure, which the struct net_device
will now point to instead of pointing to a struct netpoll.  The reason for
this is two-fold: 1) fields such as the rx_flags, poll_owner, and poll_lock
should be maintained per net_device, not per netpoll;  and 2) this is a first
step in providing support for multiple netpoll clients to register against the
same net_device.

The struct netpoll is now pointed to by the netpoll_info structure.  As
such, the previous behaviour of the code is preserved.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  4 ++--
 include/linux/netpoll.h   | 25 +++++++++++++++++--------
 2 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ba5d1236aa17..d6afd440cf7b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -41,7 +41,7 @@
 struct divert_blk;
 struct vlan_group;
 struct ethtool_ops;
-struct netpoll;
+struct netpoll_info;
 					/* source back-compat hooks */
 #define SET_ETHTOOL_OPS(netdev,ops) \
 	( (netdev)->ethtool_ops = (ops) )
@@ -468,7 +468,7 @@ struct net_device
 						     unsigned char *haddr);
 	int			(*neigh_setup)(struct net_device *dev, struct neigh_parms *);
 #ifdef CONFIG_NETPOLL
-	struct netpoll		*np;
+	struct netpoll_info	*npinfo;
 #endif
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	void                    (*poll_controller)(struct net_device *dev);
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 449a4fde6587..388cd91bc7a6 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -16,14 +16,18 @@ struct netpoll;
 struct netpoll {
 	struct net_device *dev;
 	char dev_name[16], *name;
-	int rx_flags;
 	void (*rx_hook)(struct netpoll *, int, char *, int);
 	void (*drop)(struct sk_buff *skb);
 	u32 local_ip, remote_ip;
 	u16 local_port, remote_port;
 	unsigned char local_mac[6], remote_mac[6];
+};
+
+struct netpoll_info {
 	spinlock_t poll_lock;
 	int poll_owner;
+	int rx_flags;
+	struct netpoll *np;
 };
 
 void netpoll_poll(struct netpoll *np);
@@ -39,22 +43,27 @@ void netpoll_queue(struct sk_buff *skb);
 #ifdef CONFIG_NETPOLL
 static inline int netpoll_rx(struct sk_buff *skb)
 {
-	return skb->dev->np && skb->dev->np->rx_flags && __netpoll_rx(skb);
+	struct netpoll_info *npinfo = skb->dev->npinfo;
+
+	if (!npinfo || !npinfo->rx_flags)
+		return 0;
+
+	return npinfo->np && __netpoll_rx(skb);
 }
 
 static inline void netpoll_poll_lock(struct net_device *dev)
 {
-	if (dev->np) {
-		spin_lock(&dev->np->poll_lock);
-		dev->np->poll_owner = smp_processor_id();
+	if (dev->npinfo) {
+		spin_lock(&dev->npinfo->poll_lock);
+		dev->npinfo->poll_owner = smp_processor_id();
 	}
 }
 
 static inline void netpoll_poll_unlock(struct net_device *dev)
 {
-	if (dev->np) {
-		dev->np->poll_owner = -1;
-		spin_unlock(&dev->np->poll_lock);
+	if (dev->npinfo) {
+		dev->npinfo->poll_owner = -1;
+		spin_unlock(&dev->npinfo->poll_lock);
 	}
 }
 
-- 
cgit v1.2.3


From fbeec2e1552949002065435c9829dc244ad85407 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Wed, 22 Jun 2005 22:05:59 -0700
Subject: [NETPOLL]: allow multiple netpoll_clients to register against one
 interface

This patch provides support for registering multiple netpoll clients to the
same network device.  Only one of these clients may register an rx_hook,
however.  In practice, this restriction has not been problematic.  It is
worth mentioning, though, that the current design can be easily extended to
allow for the registration of multiple rx_hooks.

The basic idea of the patch is that the rx_np pointer in the netpoll_info
structure points to the struct netpoll that has rx_hook filled in.  Aside
from this one case, there is no need for a pointer from the struct
net_device to an individual struct netpoll.

A lock is introduced to protect the setting and clearing of the np_rx
pointer.  The pointer will only be cleared upon netpoll client module
removal, and the lock should be uncontested.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 388cd91bc7a6..bcd0ac33f592 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -27,7 +27,8 @@ struct netpoll_info {
 	spinlock_t poll_lock;
 	int poll_owner;
 	int rx_flags;
-	struct netpoll *np;
+	spinlock_t rx_lock;
+	struct netpoll *rx_np; /* netpoll that registered an rx_hook */
 };
 
 void netpoll_poll(struct netpoll *np);
@@ -44,11 +45,19 @@ void netpoll_queue(struct sk_buff *skb);
 static inline int netpoll_rx(struct sk_buff *skb)
 {
 	struct netpoll_info *npinfo = skb->dev->npinfo;
+	unsigned long flags;
+	int ret = 0;
 
-	if (!npinfo || !npinfo->rx_flags)
+	if (!npinfo || (!npinfo->rx_np && !npinfo->rx_flags))
 		return 0;
 
-	return npinfo->np && __netpoll_rx(skb);
+	spin_lock_irqsave(&npinfo->rx_lock, flags);
+	/* check rx_flags again with the lock held */
+	if (npinfo->rx_flags && __netpoll_rx(skb))
+		ret = 1;
+	spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+
+	return ret;
 }
 
 static inline void netpoll_poll_lock(struct net_device *dev)
-- 
cgit v1.2.3


From cb65d506c34c86df5bcef939ce5a8666a451bd8b Mon Sep 17 00:00:00 2001
From: Shaun Pereira <spereira@tusc.com.au>
Date: Wed, 22 Jun 2005 22:15:01 -0700
Subject: [X25]: Selective sub-address matching with call user data.

From: Shaun Pereira <spereira@tusc.com.au>

This is the first (independent of the second) patch of two that I am
working on with x25 on linux (tested with xot on a cisco router).  Details
are as follows.

Current state of module:

A server using the current implementation (2.6.11.7) of the x25 module will
accept a call request/ incoming call packet at the listening x.25 address,
from all callers to that address, as long as NO call user data is present
in the packet header.

If the server needs to choose to accept a particular call request/ incoming
call packet arriving at its listening x25 address, then the kernel has to
allow a match of call user data present in the call request packet with its
own.  This is required when multiple servers listen at the same x25 address
and device interface.  The kernel currently matches ALL call user data, if
present.

Current Changes:

This patch is a follow up to the patch submitted previously by Andrew
Hendry, and allows the user to selectively control the number of octets of
call user data in the call request packet, that the kernel will match.  By
default no call user data is matched, even if call user data is present.
To allow call user data matching, a cudmatchlength > 0 has to be passed
into the kernel after which the passed number of octets will be matched.
Otherwise the kernel behavior is exactly as the original implementation.

This patch also ensures that as is normally the case, no call user data
will be present in the Call accepted / call connected packet sent back to
the caller

Future Changes on next patch:

There are cases however when call user data may be present in the call
accepted packet.  According to the X.25 recommendation (ITU-T 10/96)
section 5.2.3.2 call user data may be present in the call accepted packet
provided the fast select facility is used.  My next patch will include this
fast select utility and the ability to send up to 128 octets call user data
in the call accepted packet provided the fast select facility is used.  I
am currently testing this, again with xot on linux and cisco.

Signed-off-by: Shaun Pereira <spereira@tusc.com.au>

(With a fix from Alexey Dobriyan <adobriyan@gmail.com>)
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/x25.h | 10 ++++++++++
 include/net/x25.h   |  3 +--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/x25.h b/include/linux/x25.h
index 7531cfed5885..6f43b3d20248 100644
--- a/include/linux/x25.h
+++ b/include/linux/x25.h
@@ -4,6 +4,8 @@
  * 	History
  *	mar/20/00	Daniela Squassoni Disabling/enabling of facilities 
  *					  negotiation.
+ *	apr/02/05	Shaun Pereira Selective sub address matching with
+ *					call user data
  */
 
 #ifndef	X25_KERNEL_H
@@ -16,6 +18,7 @@
 #define	SIOCX25GCALLUSERDATA	(SIOCPROTOPRIVATE + 4)
 #define	SIOCX25SCALLUSERDATA	(SIOCPROTOPRIVATE + 5)
 #define	SIOCX25GCAUSEDIAG	(SIOCPROTOPRIVATE + 6)
+#define SIOCX25SCUDMATCHLEN	(SIOCPROTOPRIVATE + 7)
 
 /*
  *	Values for {get,set}sockopt.
@@ -109,4 +112,11 @@ struct x25_causediag {
 	unsigned char	diagnostic;
 };
 
+/*
+ *	Further optional call user data match length selection
+ */
+struct x25_subaddr {
+	unsigned int cudmatchlength;
+};
+
 #endif
diff --git a/include/net/x25.h b/include/net/x25.h
index 7a1ba5bbb868..9dd70dd4a9b7 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -134,7 +134,7 @@ struct x25_sock {
 	struct sock		sk;
 	struct x25_address	source_addr, dest_addr;
 	struct x25_neigh	*neighbour;
-	unsigned int		lci;
+	unsigned int		lci, cudmatchlength;
 	unsigned char		state, condition, qbitincl, intflag;
 	unsigned short		vs, vr, va, vl;
 	unsigned long		t2, t21, t22, t23;
@@ -242,7 +242,6 @@ extern int  x25_validate_nr(struct sock *, unsigned short);
 extern void x25_write_internal(struct sock *, int);
 extern int  x25_decode(struct sock *, struct sk_buff *, int *, int *, int *, int *, int *);
 extern void x25_disconnect(struct sock *, int, unsigned char, unsigned char);
-extern int x25_check_calluserdata(struct x25_calluserdata *,struct x25_calluserdata *);
 
 /* x25_timer.c */
 extern void x25_start_heartbeat(struct sock *);
-- 
cgit v1.2.3


From ebc3f64b864fc16a594c2e63bf55a55c7d42084b Mon Sep 17 00:00:00 2001
From: Shaun Pereira <spereira@tusc.com.au>
Date: Wed, 22 Jun 2005 22:16:17 -0700
Subject: [X25]: Fast select with no restriction on response

This patch is a follow up to patch 1 regarding "Selective Sub Address
matching with call user data".  It allows use of the Fast-Select-Acceptance
optional user facility for X.25.

This patch just implements fast select with no restriction on response
(NRR).  What this means (according to ITU-T Recomendation 10/96 section
6.16) is that if in an incoming call packet, the relevant facility bits are
set for fast-select-NRR, then the called DTE can issue a direct response to
the incoming packet using a call-accepted packet that contains
call-user-data.  This patch allows such a response.

The called DTE can also respond with a clear-request packet that contains
call-user-data.  However, this feature is currently not implemented by the
patch.

How is Fast Select Acceptance used?
By default, the system does not allow fast select acceptance (as before).
To enable a response to fast select acceptance,
After a listen socket in created and bound as follows
	socket(AF_X25, SOCK_SEQPACKET, 0);
	bind(call_soc, (struct sockaddr *)&locl_addr, sizeof(locl_addr));
but before a listen system call is made, the following ioctl should be used.
	ioctl(call_soc,SIOCX25CALLACCPTAPPRV);
Now the listen system call can be made
	listen(call_soc, 4);
After this, an incoming-call packet will be accepted, but no call-accepted
packet will be sent back until the following system call is made on the socket
that accepts the call
	ioctl(vc_soc,SIOCX25SENDCALLACCPT);
The network (or cisco xot router used for testing here) will allow the
application server's call-user-data in the call-accepted packet,
provided the call-request was made with Fast-select NRR.

Signed-off-by: Shaun Pereira <spereira@tusc.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/x25.h | 2 ++
 include/net/x25.h   | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/x25.h b/include/linux/x25.h
index 6f43b3d20248..16d44931afa0 100644
--- a/include/linux/x25.h
+++ b/include/linux/x25.h
@@ -19,6 +19,8 @@
 #define	SIOCX25SCALLUSERDATA	(SIOCPROTOPRIVATE + 5)
 #define	SIOCX25GCAUSEDIAG	(SIOCPROTOPRIVATE + 6)
 #define SIOCX25SCUDMATCHLEN	(SIOCPROTOPRIVATE + 7)
+#define SIOCX25CALLACCPTAPPRV   (SIOCPROTOPRIVATE + 8)
+#define SIOCX25SENDCALLACCPT    (SIOCPROTOPRIVATE + 9)
 
 /*
  *	Values for {get,set}sockopt.
diff --git a/include/net/x25.h b/include/net/x25.h
index 9dd70dd4a9b7..8b39b98876e8 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -79,6 +79,8 @@ enum {
 #define	X25_DEFAULT_PACKET_SIZE	X25_PS128		/* Default Packet Size */
 #define	X25_DEFAULT_THROUGHPUT	0x0A			/* Deafult Throughput */
 #define	X25_DEFAULT_REVERSE	0x00			/* Default Reverse Charging */
+#define X25_DENY_ACCPT_APPRV   0x01			/* Default value */
+#define X25_ALLOW_ACCPT_APPRV  0x00			/* Control enabled */
 
 #define X25_SMODULUS 		8
 #define	X25_EMODULUS		128
@@ -94,7 +96,7 @@ enum {
 #define	X25_FAC_CLASS_C		0x80
 #define	X25_FAC_CLASS_D		0xC0
 
-#define	X25_FAC_REVERSE		0x01
+#define	X25_FAC_REVERSE		0x01			/* also fast select */
 #define	X25_FAC_THROUGHPUT	0x02
 #define	X25_FAC_PACKET_SIZE	0x42
 #define	X25_FAC_WINDOW_SIZE	0x43
@@ -135,7 +137,7 @@ struct x25_sock {
 	struct x25_address	source_addr, dest_addr;
 	struct x25_neigh	*neighbour;
 	unsigned int		lci, cudmatchlength;
-	unsigned char		state, condition, qbitincl, intflag;
+	unsigned char		state, condition, qbitincl, intflag, accptapprv;
 	unsigned short		vs, vr, va, vl;
 	unsigned long		t2, t21, t22, t23;
 	unsigned short		fraglen;
-- 
cgit v1.2.3


From dad32bbf43b496bcd32a83f73a1e7fd0a02cfd3e Mon Sep 17 00:00:00 2001
From: John Rose <johnrose@austin.ibm.com>
Date: Thu, 23 Jun 2005 17:09:54 +1000
Subject: [PATCH] pSeries - read irqs dynamically

For I/O DLPAR to work properly, the kernel needs to allow for dynamic
assignment of the irq field of the pci_dev structure upon dynamic bus
addition.  This patch moves the assignment of that field from
pSeries_final_fixup() to pcibios_fixup_bus(), which enables dynamic
assignment for the children of a newly added bus.

Currently, pci_devs receive their irq numbers in one of two ways.  The
irq line is either read at boot for all pci_devs, or read by the rpaphp
module at slot enable time.  The latter is no longer sufficient for
DLPAR addition of slots that don't qualify as PCI-hotplug capable.
This solution handles the cases of boot and dynamic add.

Signed-off-by: John Rose <johnrose@austin.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/asm-ppc64/machdep.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/asm-ppc64/machdep.h b/include/asm-ppc64/machdep.h
index 5d3cd9d042e2..553b2ea23bed 100644
--- a/include/asm-ppc64/machdep.h
+++ b/include/asm-ppc64/machdep.h
@@ -76,6 +76,7 @@ struct machdep_calls {
 	void		(*tce_flush)(struct iommu_table *tbl);
 	void		(*iommu_dev_setup)(struct pci_dev *dev);
 	void		(*iommu_bus_setup)(struct pci_bus *bus);
+	void		(*irq_bus_setup)(struct pci_bus *bus);
 
 	int		(*probe)(int platform);
 	void		(*setup_arch)(void);
-- 
cgit v1.2.3


From 408fde81c1bff15c875a3618481e93a01dcc79ea Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Thu, 23 Jun 2005 00:07:37 -0700
Subject: [PATCH] remove non-DISCONTIG use of pgdat->node_mem_map

This patch effectively eliminates direct use of pgdat->node_mem_map outside
of the DISCONTIG code.  On a flat memory system, these fields aren't
currently used, neither are they on a sparsemem system.

There was also a node_mem_map(nid) macro on many architectures.  Its use
along with the use of ->node_mem_map itself was not consistent.  It has
been removed in favor of two new, more explicit, arch-independent macros:

	pgdat_page_nr(pgdat, pagenr)
	nid_page_nr(nid, pagenr)

I called them "pgdat" and "nid" because we overload the term "node" to mean
"NUMA node", "DISCONTIG node" or "pg_data_t" in very confusing ways.  I
believe the newer names are much clearer.

These macros can be overridden in the sparsemem case with a theoretically
slower operation using node_start_pfn and pfn_to_page(), instead.  We could
make this the only behavior if people want, but I don't want to change too
much at once.  One thing at a time.

This patch removes more code than it adds.

Compile tested on alpha, alpha discontig, arm, arm-discontig, i386, i386
generic, NUMAQ, Summit, ppc64, ppc64 discontig, and x86_64.  Full list
here: http://sr71.net/patches/2.6.12/2.6.12-rc1-mhp2/configs/

Boot tested on NUMAQ, x86 SMP and ppc64 power4/5 LPARs.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin J. Bligh <mbligh@aracnet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/mmzone.h  | 3 +--
 include/asm-i386/mmzone.h   | 3 +--
 include/asm-m32r/mmzone.h   | 3 +--
 include/asm-parisc/mmzone.h | 3 +--
 include/asm-ppc64/mmzone.h  | 3 +--
 include/asm-x86_64/mmzone.h | 5 +----
 include/linux/mmzone.h      | 2 ++
 7 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h
index 726c150dcbe4..a011ef4cf3d3 100644
--- a/include/asm-alpha/mmzone.h
+++ b/include/asm-alpha/mmzone.h
@@ -57,7 +57,6 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n)
  * Given a kernel address, find the home node of the underlying memory.
  */
 #define kvaddr_to_nid(kaddr)	pa_to_nid(__pa(kaddr))
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 
 #define local_mapnr(kvaddr) \
@@ -108,7 +107,7 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n)
 #define pfn_to_page(pfn)						\
 ({									\
  	unsigned long kaddr = (unsigned long)__va((pfn) << PAGE_SHIFT);	\
-	(node_mem_map(kvaddr_to_nid(kaddr)) + local_mapnr(kaddr));	\
+	(NODE_DATA(kvaddr_to_nid(kaddr))->node_mem_map + local_mapnr(kaddr));	\
 })
 
 #define page_to_pfn(page)						\
diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 13830ae67cac..9cec191f462c 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -79,7 +79,6 @@ static inline int pfn_to_nid(unsigned long pfn)
  */
 #define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
 
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)						\
 ({									\
@@ -100,7 +99,7 @@ static inline int pfn_to_nid(unsigned long pfn)
 ({									\
 	unsigned long __pfn = pfn;					\
 	int __node  = pfn_to_nid(__pfn);				\
-	&node_mem_map(__node)[node_localnr(__pfn,__node)];		\
+	&NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)];	\
 })
 
 #define page_to_pfn(pg)							\
diff --git a/include/asm-m32r/mmzone.h b/include/asm-m32r/mmzone.h
index ebf0228fec42..d58878ec899e 100644
--- a/include/asm-m32r/mmzone.h
+++ b/include/asm-m32r/mmzone.h
@@ -14,7 +14,6 @@ extern struct pglist_data *node_data[];
 #define NODE_DATA(nid)		(node_data[nid])
 
 #define node_localnr(pfn, nid)	((pfn) - NODE_DATA(nid)->node_start_pfn)
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)						\
 ({									\
@@ -32,7 +31,7 @@ extern struct pglist_data *node_data[];
 ({									\
 	unsigned long __pfn = pfn;					\
 	int __node  = pfn_to_nid(__pfn);				\
-	&node_mem_map(__node)[node_localnr(__pfn,__node)];		\
+	&NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)];	\
 })
 
 #define page_to_pfn(pg)							\
diff --git a/include/asm-parisc/mmzone.h b/include/asm-parisc/mmzone.h
index 928bf50c4693..595d3dce120a 100644
--- a/include/asm-parisc/mmzone.h
+++ b/include/asm-parisc/mmzone.h
@@ -19,7 +19,6 @@ extern struct node_map_data node_data[];
  */
 #define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
 
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)						\
 ({									\
@@ -38,7 +37,7 @@ extern struct node_map_data node_data[];
 ({									\
 	unsigned long __pfn = (pfn);					\
 	int __node  = pfn_to_nid(__pfn);				\
-	&node_mem_map(__node)[node_localnr(__pfn,__node)];		\
+	&NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)];	\
 })
 
 #define page_to_pfn(pg)							\
diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h
index 0619a41a3c9d..cbfc5ecfe875 100644
--- a/include/asm-ppc64/mmzone.h
+++ b/include/asm-ppc64/mmzone.h
@@ -65,7 +65,6 @@ static inline int pa_to_nid(unsigned long pa)
  */
 #define kvaddr_to_nid(kaddr)	pa_to_nid(__pa(kaddr))
 
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)	(NODE_DATA(nid)->node_end_pfn)
 
@@ -76,7 +75,7 @@ static inline int pa_to_nid(unsigned long pa)
 #define discontigmem_pfn_to_page(pfn) \
 ({ \
 	unsigned long __tmp = pfn; \
-	(node_mem_map(pfn_to_nid(__tmp)) + \
+	(NODE_DATA(pfn_to_nid(__tmp))->node_mem_map + \
 	 node_localnr(__tmp, pfn_to_nid(__tmp))); \
 })
 
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index d95b7c240831..ca4fc3fe0dee 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -35,9 +35,6 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 #define kvaddr_to_nid(kaddr)	phys_to_nid(__pa(kaddr))
 #define NODE_DATA(nid)		(node_data[nid])
 
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
-
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
 				 NODE_DATA(nid)->node_spanned_pages)
@@ -50,7 +47,7 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
    (2.4 used to). */
 #define pfn_to_page(pfn) ({ \
 	int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT); 	\
-	((pfn) - node_start_pfn(nid)) + node_mem_map(nid);		\
+	((pfn) - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;	\
 })
 
 #define page_to_pfn(page) \
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4733d35d8223..b79633d3a97b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -284,6 +284,8 @@ typedef struct pglist_data {
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
 #define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)
+#define pgdat_page_nr(pgdat, pagenr)	((pgdat)->node_mem_map + (pagenr))
+#define nid_page_nr(nid, pagenr) 	pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
 extern struct pglist_data *pgdat_list;
 
-- 
cgit v1.2.3


From 6f167ec721108c9282d54424516a12c805e3c306 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Thu, 23 Jun 2005 00:07:39 -0700
Subject: [PATCH] sparsemem base: simple NUMA remap space allocator

Introduce a simple allocator for the NUMA remap space.  This space is very
scarce, used for structures which are best allocated node local.

This mechanism is also used on non-NUMA ia64 systems with a vmem_map to keep
the pgdat->node_mem_map initialized in a consistent place for all
architectures.

Issues:
o alloc_remap takes a node_id where we might expect a pgdat which was intended
  to allow us to allocate the pgdat's using this mechanism; which we do not yet
  do.  Could have alloc_remap_node() and alloc_remap_nid() for this purpose.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 0dd8ca1a3d5a..500f451ce0c0 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -67,6 +67,15 @@ extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size,
 	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
+#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
+extern void *alloc_remap(int nid, unsigned long size);
+#else
+static inline void *alloc_remap(int nid, unsigned long size)
+{
+	return NULL;
+}
+#endif
+
 extern unsigned long __initdata nr_kernel_pages;
 extern unsigned long __initdata nr_all_pages;
 
-- 
cgit v1.2.3


From 348f8b6c4837a07304d2f72b11ce8d96588065e0 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Thu, 23 Jun 2005 00:07:40 -0700
Subject: [PATCH] sparsemem base: reorganize page->flags bit operations

Generify the value fields in the page_flags.  The aim is to allow the location
and size of these fields to be varied.  Additionally we want to move away from
fixed allocations per field whilst still enforcing the overall bit utilisation
limits.  We rely on the compiler to spot and optimise the accessor functions.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h     | 53 +++++++++++++++++++++++++++++++++++++++++---------
 include/linux/mmzone.h | 19 +++++++-----------
 2 files changed, 51 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1813b162b0a8..57b2ead51dba 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -395,19 +395,41 @@ static inline void put_page(struct page *page)
 /*
  * The zone field is never updated after free_area_init_core()
  * sets it, so none of the operations on it need to be atomic.
- * We'll have up to (MAX_NUMNODES * MAX_NR_ZONES) zones total,
- * so we use (MAX_NODES_SHIFT + MAX_ZONES_SHIFT) here to get enough bits.
  */
-#define NODEZONE_SHIFT (sizeof(page_flags_t)*8 - MAX_NODES_SHIFT - MAX_ZONES_SHIFT)
+
+/* Page flags: | NODE | ZONE | ... | FLAGS | */
+#define NODES_PGOFF		((sizeof(page_flags_t)*8) - NODES_SHIFT)
+#define ZONES_PGOFF		(NODES_PGOFF - ZONES_SHIFT)
+
+/*
+ * Define the bit shifts to access each section.  For non-existant
+ * sections we define the shift as 0; that plus a 0 mask ensures
+ * the compiler will optimise away reference to them.
+ */
+#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_SHIFT != 0))
+#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_SHIFT != 0))
+
+/* NODE:ZONE is used to lookup the zone from a page. */
+#define ZONETABLE_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
+#define ZONETABLE_PGSHIFT	ZONES_PGSHIFT
+
+#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED
+#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED
+#endif
+
 #define NODEZONE(node, zone)	((node << ZONES_SHIFT) | zone)
 
+#define ZONES_MASK		((1UL << ZONES_SHIFT) - 1)
+#define NODES_MASK		((1UL << NODES_SHIFT) - 1)
+#define ZONETABLE_MASK		((1UL << ZONETABLE_SHIFT) - 1)
+
 static inline unsigned long page_zonenum(struct page *page)
 {
-	return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT));
+	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
 static inline unsigned long page_to_nid(struct page *page)
 {
-	return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT));
+	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
 }
 
 struct zone;
@@ -415,13 +437,26 @@ extern struct zone *zone_table[];
 
 static inline struct zone *page_zone(struct page *page)
 {
-	return zone_table[page->flags >> NODEZONE_SHIFT];
+	return zone_table[(page->flags >> ZONETABLE_PGSHIFT) &
+			ZONETABLE_MASK];
+}
+
+static inline void set_page_zone(struct page *page, unsigned long zone)
+{
+	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
+	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
+}
+static inline void set_page_node(struct page *page, unsigned long node)
+{
+	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
+	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
 }
 
-static inline void set_page_zone(struct page *page, unsigned long nodezone_num)
+static inline void set_page_links(struct page *page, unsigned long zone,
+	unsigned long node)
 {
-	page->flags &= ~(~0UL << NODEZONE_SHIFT);
-	page->flags |= nodezone_num << NODEZONE_SHIFT;
+	set_page_zone(page, zone);
+	set_page_node(page, node);
 }
 
 #ifndef CONFIG_DISCONTIGMEM
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b79633d3a97b..39e912708e2a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -414,30 +414,25 @@ extern struct pglist_data contig_page_data;
 
 #include <asm/mmzone.h>
 
+#endif /* !CONFIG_DISCONTIGMEM */
+
 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
 /*
  * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
  * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
  */
-#define MAX_NODES_SHIFT		6
+#define FLAGS_RESERVED		8
+
 #elif BITS_PER_LONG == 64
 /*
  * with 64 bit flags field, there's plenty of room.
  */
-#define MAX_NODES_SHIFT		10
-#endif
+#define FLAGS_RESERVED		32
 
-#endif /* !CONFIG_DISCONTIGMEM */
-
-#if NODES_SHIFT > MAX_NODES_SHIFT
-#error NODES_SHIFT > MAX_NODES_SHIFT
-#endif
+#else
 
-/* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */
-#define MAX_ZONES_SHIFT		2
+#error BITS_PER_LONG not defined
 
-#if ZONES_SHIFT > MAX_ZONES_SHIFT
-#error ZONES_SHIFT > MAX_ZONES_SHIFT
 #endif
 
 #endif /* !__ASSEMBLY__ */
-- 
cgit v1.2.3


From 5b505b90b2d54e526cc8d123bdef3b98c9f0bbc6 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Thu, 23 Jun 2005 00:07:41 -0700
Subject: [PATCH] sparsemem base: teach discontig about sparse ranges

discontig.c has some assumptions that mem_map[]s inside of a node are
contiguous.  Teach it to make sure that each region that it's bringing online
is actually made up of valid ranges of ram.

Written-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/page.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index 41400d342d44..8f3dded01bff 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -120,6 +120,8 @@ static __inline__ int get_order(unsigned long size)
 
 extern int sysctl_legacy_va_layout;
 
+extern int page_is_ram(unsigned long pagenr);
+
 #endif /* __ASSEMBLY__ */
 
 #ifdef __ASSEMBLY__
-- 
cgit v1.2.3


From 93b7504e3e6c1d98586854806e51bea329ea3aa9 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Thu, 23 Jun 2005 00:07:47 -0700
Subject: [PATCH] Introduce new Kconfig option for NUMA or DISCONTIG

There is some confusion that arose when working on SPARSEMEM patch between
what is needed for DISCONTIG vs. NUMA.

Multiple pg_data_t's are needed for DISCONTIGMEM or NUMA, independently.
All of the current NUMA implementations require an implementation of
DISCONTIG.  Because of this, quite a lot of code which is really needed for
NUMA is actually under DISCONTIG #ifdefs.  For SPARSEMEM, we changed some
of these #ifdefs to CONFIG_NUMA, but that broke the DISCONTIG=y and NUMA=n
case.

Introducing this new NEED_MULTIPLE_NODES config option allows code that is
needed for both NUMA or DISCONTIG to be separated out from code that is
specific to DISCONTIG.

One great advantage of this approach is that it doesn't require every
architecture to be converted over.  All of the current implementations
should "just work", only the ones implementing SPARSEMEM will have to be
fixed up.

The change to free_area_init() makes it work inside, or out of the new
config option.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 39e912708e2a..95f4a780ea66 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -402,7 +402,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 /* Returns the number of the current Node. */
 #define numa_node_id()		(cpu_to_node(raw_smp_processor_id()))
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
 
 extern struct pglist_data contig_page_data;
 #define NODE_DATA(nid)		(&contig_page_data)
@@ -410,11 +410,11 @@ extern struct pglist_data contig_page_data;
 #define MAX_NODES_SHIFT		1
 #define pfn_to_nid(pfn)		(0)
 
-#else /* CONFIG_DISCONTIGMEM */
+#else /* CONFIG_NEED_MULTIPLE_NODES */
 
 #include <asm/mmzone.h>
 
-#endif /* !CONFIG_DISCONTIGMEM */
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
 /*
-- 
cgit v1.2.3


From b159d43fbf7eaaac6ecc647f51cf4257332db47b Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Thu, 23 Jun 2005 00:07:52 -0700
Subject: [PATCH] generify early_pfn_to_nid

Provide a default implementation for early_pfn_to_nid returning node 0.  Allow
architectures to override this with their own implementation out of
asm/mmzone.h.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/mmzone.h | 3 +++
 include/linux/mmzone.h    | 4 ++++
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 9cec191f462c..48e46d403aa6 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -143,4 +143,7 @@ static inline void get_memcfg_numa(void)
 }
 
 #endif /* CONFIG_DISCONTIGMEM */
+
+extern int early_pfn_to_nid(unsigned long pfn);
+
 #endif /* _ASM_MMZONE_H_ */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 95f4a780ea66..6ef07de98d69 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -435,6 +435,10 @@ extern struct pglist_data contig_page_data;
 
 #endif
 
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+#define early_pfn_to_nid(nid)  (0UL)
+#endif
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MMZONE_H */
-- 
cgit v1.2.3


From d41dee369bff3b9dcb6328d4d822926c28cc2594 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Thu, 23 Jun 2005 00:07:54 -0700
Subject: [PATCH] sparsemem memory model

Sparsemem abstracts the use of discontiguous mem_maps[].  This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems.  Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.

A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA.  When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.

Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous.  It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.

Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory.  This is what allows the mem_map[]
to be chopped up.

In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags.  Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations.  However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions.  Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.

One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags.  It might provide
speed increases on certain platforms and will be stored there if there is
room.  But, if out of room, an alternate (theoretically slower) mechanism is
used.

This patch introduces CONFIG_FLATMEM.  It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h     | 92 ++++++++++++++++++++++++++++++++++++++---------
 include/linux/mmzone.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/numa.h   |  2 +-
 3 files changed, 172 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 57b2ead51dba..6eb7f48317f8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -397,40 +397,80 @@ static inline void put_page(struct page *page)
  * sets it, so none of the operations on it need to be atomic.
  */
 
-/* Page flags: | NODE | ZONE | ... | FLAGS | */
-#define NODES_PGOFF		((sizeof(page_flags_t)*8) - NODES_SHIFT)
-#define ZONES_PGOFF		(NODES_PGOFF - ZONES_SHIFT)
+
+/*
+ * page->flags layout:
+ *
+ * There are three possibilities for how page->flags get
+ * laid out.  The first is for the normal case, without
+ * sparsemem.  The second is for sparsemem when there is
+ * plenty of space for node and section.  The last is when
+ * we have run out of space and have to fall back to an
+ * alternate (slower) way of determining the node.
+ *
+ *        No sparsemem: |       NODE     | ZONE | ... | FLAGS |
+ * with space for node: | SECTION | NODE | ZONE | ... | FLAGS |
+ *   no space for node: | SECTION |     ZONE    | ... | FLAGS |
+ */
+#ifdef CONFIG_SPARSEMEM
+#define SECTIONS_WIDTH		SECTIONS_SHIFT
+#else
+#define SECTIONS_WIDTH		0
+#endif
+
+#define ZONES_WIDTH		ZONES_SHIFT
+
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED
+#define NODES_WIDTH		NODES_SHIFT
+#else
+#define NODES_WIDTH		0
+#endif
+
+/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
+#define SECTIONS_PGOFF		((sizeof(page_flags_t)*8) - SECTIONS_WIDTH)
+#define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
+#define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
+
+/*
+ * We are going to use the flags for the page to node mapping if its in
+ * there.  This includes the case where there is no node, so it is implicit.
+ */
+#define FLAGS_HAS_NODE		(NODES_WIDTH > 0 || NODES_SHIFT == 0)
+
+#ifndef PFN_SECTION_SHIFT
+#define PFN_SECTION_SHIFT 0
+#endif
 
 /*
  * Define the bit shifts to access each section.  For non-existant
  * sections we define the shift as 0; that plus a 0 mask ensures
  * the compiler will optimise away reference to them.
  */
-#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_SHIFT != 0))
-#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_SHIFT != 0))
+#define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
+#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
+#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
 
-/* NODE:ZONE is used to lookup the zone from a page. */
+/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */
+#if FLAGS_HAS_NODE
 #define ZONETABLE_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
+#else
+#define ZONETABLE_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
+#endif
 #define ZONETABLE_PGSHIFT	ZONES_PGSHIFT
 
-#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED
-#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED
+#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
+#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
 #endif
 
-#define NODEZONE(node, zone)	((node << ZONES_SHIFT) | zone)
-
-#define ZONES_MASK		((1UL << ZONES_SHIFT) - 1)
-#define NODES_MASK		((1UL << NODES_SHIFT) - 1)
+#define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
+#define NODES_MASK		((1UL << NODES_WIDTH) - 1)
+#define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
 #define ZONETABLE_MASK		((1UL << ZONETABLE_SHIFT) - 1)
 
 static inline unsigned long page_zonenum(struct page *page)
 {
 	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
-static inline unsigned long page_to_nid(struct page *page)
-{
-	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
-}
 
 struct zone;
 extern struct zone *zone_table[];
@@ -441,6 +481,18 @@ static inline struct zone *page_zone(struct page *page)
 			ZONETABLE_MASK];
 }
 
+static inline unsigned long page_to_nid(struct page *page)
+{
+	if (FLAGS_HAS_NODE)
+		return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
+	else
+		return page_zone(page)->zone_pgdat->node_id;
+}
+static inline unsigned long page_to_section(struct page *page)
+{
+	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
+}
+
 static inline void set_page_zone(struct page *page, unsigned long zone)
 {
 	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
@@ -451,12 +503,18 @@ static inline void set_page_node(struct page *page, unsigned long node)
 	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
 	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
 }
+static inline void set_page_section(struct page *page, unsigned long section)
+{
+	page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
+	page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
+}
 
 static inline void set_page_links(struct page *page, unsigned long zone,
-	unsigned long node)
+	unsigned long node, unsigned long pfn)
 {
 	set_page_zone(page, zone);
 	set_page_node(page, node);
+	set_page_section(page, pfn_to_section_nr(pfn));
 }
 
 #ifndef CONFIG_DISCONTIGMEM
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6ef07de98d69..19860d317ec2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -269,7 +269,9 @@ typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
 	struct zonelist node_zonelists[GFP_ZONETYPES];
 	int nr_zones;
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
 	struct page *node_mem_map;
+#endif
 	struct bootmem_data *bdata;
 	unsigned long node_start_pfn;
 	unsigned long node_present_pages; /* total number of physical pages */
@@ -284,7 +286,11 @@ typedef struct pglist_data {
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
 #define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
 #define pgdat_page_nr(pgdat, pagenr)	((pgdat)->node_mem_map + (pagenr))
+#else
+#define pgdat_page_nr(pgdat, pagenr)	pfn_to_page((pgdat)->node_start_pfn + (pagenr))
+#endif
 #define nid_page_nr(nid, pagenr) 	pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
 extern struct pglist_data *pgdat_list;
@@ -416,6 +422,10 @@ extern struct pglist_data contig_page_data;
 
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
+#ifdef CONFIG_SPARSEMEM
+#include <asm/sparsemem.h>
+#endif
+
 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
 /*
  * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
@@ -439,6 +449,92 @@ extern struct pglist_data contig_page_data;
 #define early_pfn_to_nid(nid)  (0UL)
 #endif
 
+#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
+#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
+
+#ifdef CONFIG_SPARSEMEM
+
+/*
+ * SECTION_SHIFT    		#bits space required to store a section #
+ *
+ * PA_SECTION_SHIFT		physical address to/from section number
+ * PFN_SECTION_SHIFT		pfn to/from section number
+ */
+#define SECTIONS_SHIFT		(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
+
+#define PA_SECTION_SHIFT	(SECTION_SIZE_BITS)
+#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
+
+#define NR_MEM_SECTIONS		(1UL << SECTIONS_SHIFT)
+
+#define PAGES_PER_SECTION       (1UL << PFN_SECTION_SHIFT)
+#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION-1))
+
+#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
+#error Allocator MAX_ORDER exceeds SECTION_SIZE
+#endif
+
+struct page;
+struct mem_section {
+	struct page *section_mem_map;
+};
+
+extern struct mem_section mem_section[NR_MEM_SECTIONS];
+
+/*
+ * Given a kernel address, find the home node of the underlying memory.
+ */
+#define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
+
+static inline struct mem_section *__pfn_to_section(unsigned long pfn)
+{
+	return &mem_section[pfn_to_section_nr(pfn)];
+}
+
+#define pfn_to_page(pfn) 						\
+({ 									\
+	unsigned long __pfn = (pfn);					\
+	__pfn_to_section(__pfn)->section_mem_map + __pfn;		\
+})
+#define page_to_pfn(page)						\
+({									\
+	page - mem_section[page_to_section(page)].section_mem_map;	\
+})
+
+static inline int pfn_valid(unsigned long pfn)
+{
+	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
+		return 0;
+	return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0;
+}
+
+/*
+ * These are _only_ used during initialisation, therefore they
+ * can use __initdata ...  They could have names to indicate
+ * this restriction.
+ */
+#ifdef CONFIG_NUMA
+#define pfn_to_nid		early_pfn_to_nid
+#endif
+
+#define pfn_to_pgdat(pfn)						\
+({									\
+	NODE_DATA(pfn_to_nid(pfn));					\
+})
+
+#define early_pfn_valid(pfn)	pfn_valid(pfn)
+void sparse_init(void);
+#else
+#define sparse_init()	do {} while (0)
+#endif /* CONFIG_SPARSEMEM */
+
+#ifndef early_pfn_valid
+#define early_pfn_valid(pfn)	(1)
+#endif
+
+void memory_present(int nid, unsigned long start, unsigned long end);
+unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MMZONE_H */
diff --git a/include/linux/numa.h b/include/linux/numa.h
index bd0c8c4e9a95..f0c539bd3cfc 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -3,7 +3,7 @@
 
 #include <linux/config.h>
 
-#ifdef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_FLATMEM
 #include <asm/numnodes.h>
 #endif
 
-- 
cgit v1.2.3


From 05b79bdcb48c18cd9b580c39e3efb9a1ab078151 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Thu, 23 Jun 2005 00:07:57 -0700
Subject: [PATCH] sparsemem memory model for i386

Provide the architecture specific implementation for SPARSEMEM for i386 SMP
and NUMA systems.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/mmzone.h    | 89 ++++++++++++++++++++++++--------------------
 include/asm-i386/page.h      |  4 +-
 include/asm-i386/pgtable.h   |  4 +-
 include/asm-i386/sparsemem.h | 31 +++++++++++++++
 4 files changed, 83 insertions(+), 45 deletions(-)
 create mode 100644 include/asm-i386/sparsemem.h

(limited to 'include')

diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 48e46d403aa6..33ce5d37e894 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -8,7 +8,9 @@
 
 #include <asm/smp.h>
 
-#ifdef CONFIG_DISCONTIGMEM
+#if CONFIG_NUMA
+extern struct pglist_data *node_data[];
+#define NODE_DATA(nid)	(node_data[nid])
 
 #ifdef CONFIG_NUMA
 	#ifdef CONFIG_X86_NUMAQ
@@ -21,8 +23,28 @@
 	#define get_zholes_size(n) (0)
 #endif /* CONFIG_NUMA */
 
-extern struct pglist_data *node_data[];
-#define NODE_DATA(nid)		(node_data[nid])
+extern int get_memcfg_numa_flat(void );
+/*
+ * This allows any one NUMA architecture to be compiled
+ * for, and still fall back to the flat function if it
+ * fails.
+ */
+static inline void get_memcfg_numa(void)
+{
+#ifdef CONFIG_X86_NUMAQ
+	if (get_memcfg_numaq())
+		return;
+#elif CONFIG_ACPI_SRAT
+	if (get_memcfg_from_srat())
+		return;
+#endif
+
+	get_memcfg_numa_flat();
+}
+
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_DISCONTIGMEM
 
 /*
  * generic node memory support, the following assumptions apply:
@@ -48,26 +70,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 #endif
 }
 
-/*
- * Following are macros that are specific to this numa platform.
- */
-#define reserve_bootmem(addr, size) \
-	reserve_bootmem_node(NODE_DATA(0), (addr), (size))
-#define alloc_bootmem(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
-#define alloc_bootmem_pages(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
-#define alloc_bootmem_node(ignore, x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_pages_node(ignore, x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages_node(ignore, x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
-
 #define node_localnr(pfn, nid)		((pfn) - node_data[nid]->node_start_pfn)
 
 /*
@@ -121,28 +123,33 @@ static inline int pfn_valid(int pfn)
 		return (pfn < node_end_pfn(nid));
 	return 0;
 }
-#endif
+#endif /* CONFIG_X86_NUMAQ */
+
+#endif /* CONFIG_DISCONTIGMEM */
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
 
-extern int get_memcfg_numa_flat(void );
 /*
- * This allows any one NUMA architecture to be compiled
- * for, and still fall back to the flat function if it
- * fails.
+ * Following are macros that are specific to this numa platform.
  */
-static inline void get_memcfg_numa(void)
-{
-#ifdef CONFIG_X86_NUMAQ
-	if (get_memcfg_numaq())
-		return;
-#elif CONFIG_ACPI_SRAT
-	if (get_memcfg_from_srat())
-		return;
-#endif
-
-	get_memcfg_numa_flat();
-}
+#define reserve_bootmem(addr, size) \
+	reserve_bootmem_node(NODE_DATA(0), (addr), (size))
+#define alloc_bootmem(x) \
+	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_low(x) \
+	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
+#define alloc_bootmem_pages(x) \
+	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_low_pages(x) \
+	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
+#define alloc_bootmem_node(ignore, x) \
+	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_pages_node(ignore, x) \
+	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_low_pages_node(ignore, x) \
+	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
 
-#endif /* CONFIG_DISCONTIGMEM */
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
 
 extern int early_pfn_to_nid(unsigned long pfn);
 
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index 8f3dded01bff..dea8f8e6d86e 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -137,11 +137,11 @@ extern int page_is_ram(unsigned long pagenr);
 #define __pa(x)			((unsigned long)(x)-PAGE_OFFSET)
 #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
 #define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
 #define pfn_to_page(pfn)	(mem_map + (pfn))
 #define page_to_pfn(page)	((unsigned long)((page) - mem_map))
 #define pfn_valid(pfn)		((pfn) < max_mapnr)
-#endif /* !CONFIG_DISCONTIGMEM */
+#endif /* CONFIG_FLATMEM */
 #define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 
 #define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index e9efe148fdf7..77c6497f416e 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -398,9 +398,9 @@ extern void noexec_setup(const char *str);
 
 #endif /* !__ASSEMBLY__ */
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
 #define kern_addr_valid(addr)	(1)
-#endif /* !CONFIG_DISCONTIGMEM */
+#endif /* CONFIG_FLATMEM */
 
 #define io_remap_page_range(vma, vaddr, paddr, size, prot)		\
 		remap_pfn_range(vma, vaddr, (paddr) >> PAGE_SHIFT, size, prot)
diff --git a/include/asm-i386/sparsemem.h b/include/asm-i386/sparsemem.h
new file mode 100644
index 000000000000..cfeed990585f
--- /dev/null
+++ b/include/asm-i386/sparsemem.h
@@ -0,0 +1,31 @@
+#ifndef _I386_SPARSEMEM_H
+#define _I386_SPARSEMEM_H
+#ifdef CONFIG_SPARSEMEM
+
+/*
+ * generic non-linear memory support:
+ *
+ * 1) we will not split memory into more chunks than will fit into the
+ *    flags field of the struct page
+ */
+
+/*
+ * SECTION_SIZE_BITS		2^N: how big each section will be
+ * MAX_PHYSADDR_BITS		2^N: how much physical address space we have
+ * MAX_PHYSMEM_BITS		2^N: how much memory we can have in that space
+ */
+#ifdef CONFIG_X86_PAE
+#define SECTION_SIZE_BITS       30
+#define MAX_PHYSADDR_BITS       36
+#define MAX_PHYSMEM_BITS	36
+#else
+#define SECTION_SIZE_BITS       26
+#define MAX_PHYSADDR_BITS       32
+#define MAX_PHYSMEM_BITS	32
+#endif
+
+/* XXX: FIXME -- wli */
+#define kern_addr_valid(kaddr)  (0)
+
+#endif /* CONFIG_SPARSEMEM */
+#endif /* _I386_SPARSEMEM_H */
-- 
cgit v1.2.3


From 641c767389b19859a45e6de46d8e18cd935bdb60 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Thu, 23 Jun 2005 00:07:59 -0700
Subject: [PATCH] sparsemem swiss cheese numa layouts

The part of the sparsemem patch which modifies memmap_init_zone() has recently
become a problem.  It changes behavior so that there is a call to
pfn_to_page() for each individual page inside of a node's range:
node_start_pfn through node_end_pfn.  It used to simply do this once, at the
beginning of the node, but having sparsemem's non-contiguous mem_map[]s inside
of a node made it necessary to change.

Mike Kravetz recently wrote a patch which made the NUMA code accept some new
kinds of layouts.  The system's memory was laid out like this, with node 0's
memory in two pieces: one before and one after node 1's memory:

	Node 0: +++++     +++++
	Node 1:      +++++

Previous behavior before Mike's patch was to assign nodes like this:

	Node 0: 00000     XXXXX
	Node 1:      11111

Where the 'X' areas were simply thrown away.  The new behavior was to make the
pg_data_t span node 0 across all of its areas, including areas that are really
node 1's: Node 0: 000000000000000 Node 1: 11111

This wastes a little bit of mem_map space, but ends up being OK, and more
fully utilizes the system's memory.  memmap_init_zone() initializes all of the
"struct page"s for node 0, even for the "hole", but those never get used,
because there is no pfn_to_page() that resolves to those pages.  However, only
calling pfn_to_page() once, memmap_init_zone() always uses the pages that were
allocated for node0->node_mem_map because:

	struct page *start = pfn_to_page(start_pfn);
	// effectively start = &node->node_mem_map[0]
	for (page = start; page < (start + size); page++) {
		init_page_here();...
		page++;
	}

Slow, and wasteful, but generally harmless.

But, modify that to call pfn_to_page() for each loop iteration (like sparsemem
does):

	for (pfn = start_pfn; pfn < < (start_pfn + size); pfn++++) {
		page = pfn_to_page(pfn);
	}

And you end up trying to initialize node 1's pages too early, along with bogus
data from node 0.  This patch checks for those weird layouts and declines to
touch the pages, making the more frequent pfn_to_page() calls OK to do.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 19860d317ec2..746b57e3d370 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -528,6 +528,12 @@ void sparse_init(void);
 #define sparse_init()	do {} while (0)
 #endif /* CONFIG_SPARSEMEM */
 
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+#define early_pfn_in_nid(pfn, nid)	(early_pfn_to_nid(pfn) == (nid))
+#else
+#define early_pfn_in_nid(pfn, nid)	(1)
+#endif
+
 #ifndef early_pfn_valid
 #define early_pfn_valid(pfn)	(1)
 #endif
-- 
cgit v1.2.3


From 29751f6991e845f7d002a6ae520bf996b38c8dcd Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Thu, 23 Jun 2005 00:08:00 -0700
Subject: [PATCH] sparsemem hotplug base

Make sparse's initalization be accessible at runtime.  This allows sparse
mappings to be created after boot in a hotplug situation.

This patch is separated from the previous one just to give an indication how
much of the sparse infrastructure is *just* for hotplug memory.

The section_mem_map doesn't really store a pointer.  It stores something that
is convenient to do some math against to get a pointer.  It isn't valid to
just do *section_mem_map, so I don't think it should be stored as a pointer.

There are a couple of things I'd like to store about a section.  First of all,
the fact that it is !NULL does not mean that it is present.  There could be
such a combination where section_mem_map *is* NULL, but the math gets you
properly to a real mem_map.  So, I don't think that check is safe.

Since we're storing 32-bit-aligned structures, we have a few bits in the
bottom of the pointer to play with.  Use one bit to encode whether there's
really a mem_map there, and the other one to tell whether there's a valid
section there.  We need to distinguish between the two because sometimes
there's a gap between when a section is discovered to be present and when we
can get the mem_map for it.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 56 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 746b57e3d370..6c90461ed99f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -476,11 +476,56 @@ extern struct pglist_data contig_page_data;
 
 struct page;
 struct mem_section {
-	struct page *section_mem_map;
+	/*
+	 * This is, logically, a pointer to an array of struct
+	 * pages.  However, it is stored with some other magic.
+	 * (see sparse.c::sparse_init_one_section())
+	 *
+	 * Making it a UL at least makes someone do a cast
+	 * before using it wrong.
+	 */
+	unsigned long section_mem_map;
 };
 
 extern struct mem_section mem_section[NR_MEM_SECTIONS];
 
+static inline struct mem_section *__nr_to_section(unsigned long nr)
+{
+	return &mem_section[nr];
+}
+
+/*
+ * We use the lower bits of the mem_map pointer to store
+ * a little bit of information.  There should be at least
+ * 3 bits here due to 32-bit alignment.
+ */
+#define	SECTION_MARKED_PRESENT	(1UL<<0)
+#define SECTION_HAS_MEM_MAP	(1UL<<1)
+#define SECTION_MAP_LAST_BIT	(1UL<<2)
+#define SECTION_MAP_MASK	(~(SECTION_MAP_LAST_BIT-1))
+
+static inline struct page *__section_mem_map_addr(struct mem_section *section)
+{
+	unsigned long map = section->section_mem_map;
+	map &= SECTION_MAP_MASK;
+	return (struct page *)map;
+}
+
+static inline int valid_section(struct mem_section *section)
+{
+	return (section->section_mem_map & SECTION_MARKED_PRESENT);
+}
+
+static inline int section_has_mem_map(struct mem_section *section)
+{
+	return (section->section_mem_map & SECTION_HAS_MEM_MAP);
+}
+
+static inline int valid_section_nr(unsigned long nr)
+{
+	return valid_section(__nr_to_section(nr));
+}
+
 /*
  * Given a kernel address, find the home node of the underlying memory.
  */
@@ -488,24 +533,25 @@ extern struct mem_section mem_section[NR_MEM_SECTIONS];
 
 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 {
-	return &mem_section[pfn_to_section_nr(pfn)];
+	return __nr_to_section(pfn_to_section_nr(pfn));
 }
 
 #define pfn_to_page(pfn) 						\
 ({ 									\
 	unsigned long __pfn = (pfn);					\
-	__pfn_to_section(__pfn)->section_mem_map + __pfn;		\
+	__section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn;	\
 })
 #define page_to_pfn(page)						\
 ({									\
-	page - mem_section[page_to_section(page)].section_mem_map;	\
+	page - __section_mem_map_addr(__nr_to_section(			\
+		page_to_section(page)));				\
 })
 
 static inline int pfn_valid(unsigned long pfn)
 {
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
 		return 0;
-	return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0;
+	return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
 }
 
 /*
-- 
cgit v1.2.3


From 510f8fa7ba18320d408dd3093663e58f5664f2f0 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Thu, 23 Jun 2005 00:08:01 -0700
Subject: [PATCH] ppc64: add early_pfn_to_nid

Provide an implementation of early_pfn_to_nid for PPC64.  This is used by
memory models to determine the node from which to take allocations before the
memory allocators are fully initialised.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ppc64/mmzone.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h
index cbfc5ecfe875..ecc4b07507d6 100644
--- a/include/asm-ppc64/mmzone.h
+++ b/include/asm-ppc64/mmzone.h
@@ -90,4 +90,9 @@ static inline int pa_to_nid(unsigned long pa)
 #define discontigmem_pfn_valid(pfn)		((pfn) < num_physpages)
 
 #endif /* CONFIG_DISCONTIGMEM */
+
+#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+#define early_pfn_to_nid(pfn)  pa_to_nid(((unsigned long)pfn) << PAGE_SHIFT)
+#endif
+
 #endif /* _ASM_MMZONE_H_ */
-- 
cgit v1.2.3


From 145e664231648121026d470094c200851a446a73 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Thu, 23 Jun 2005 00:08:03 -0700
Subject: [PATCH] ppc64: sparsemem memory model

Provide the architecture specific implementation for SPARSEMEM for PPC64
systems.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Mike Kravetz <kravetz@us.ibm.com> (in part)
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ppc64/mmzone.h    | 36 +++++++++++++++++++++++-------------
 include/asm-ppc64/page.h      |  3 ++-
 include/asm-ppc64/sparsemem.h | 16 ++++++++++++++++
 3 files changed, 41 insertions(+), 14 deletions(-)
 create mode 100644 include/asm-ppc64/sparsemem.h

(limited to 'include')

diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h
index ecc4b07507d6..ed473f4b0152 100644
--- a/include/asm-ppc64/mmzone.h
+++ b/include/asm-ppc64/mmzone.h
@@ -10,9 +10,20 @@
 #include <linux/config.h>
 #include <asm/smp.h>
 
-#ifdef CONFIG_DISCONTIGMEM
+/* generic non-linear memory support:
+ *
+ * 1) we will not split memory into more chunks than will fit into the
+ *    flags field of the struct page
+ */
+
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
 
 extern struct pglist_data *node_data[];
+/*
+ * Return a pointer to the node data for node n.
+ */
+#define NODE_DATA(nid)		(node_data[nid])
 
 /*
  * Following are specific to this numa platform.
@@ -47,30 +58,27 @@ static inline int pa_to_nid(unsigned long pa)
 	return nid;
 }
 
-#define pfn_to_nid(pfn)		pa_to_nid((pfn) << PAGE_SHIFT)
-
-/*
- * Return a pointer to the node data for node n.
- */
-#define NODE_DATA(nid)		(node_data[nid])
-
 #define node_localnr(pfn, nid)	((pfn) - NODE_DATA(nid)->node_start_pfn)
 
 /*
  * Following are macros that each numa implmentation must define.
  */
 
-/*
- * Given a kernel address, find the home node of the underlying memory.
- */
-#define kvaddr_to_nid(kaddr)	pa_to_nid(__pa(kaddr))
-
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)	(NODE_DATA(nid)->node_end_pfn)
 
 #define local_mapnr(kvaddr) \
 	( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) 
 
+#ifdef CONFIG_DISCONTIGMEM
+
+/*
+ * Given a kernel address, find the home node of the underlying memory.
+ */
+#define kvaddr_to_nid(kaddr)	pa_to_nid(__pa(kaddr))
+
+#define pfn_to_nid(pfn)		pa_to_nid((unsigned long)(pfn) << PAGE_SHIFT)
+
 /* Written this way to avoid evaluating arguments twice */
 #define discontigmem_pfn_to_page(pfn) \
 ({ \
@@ -91,6 +99,8 @@ static inline int pa_to_nid(unsigned long pa)
 
 #endif /* CONFIG_DISCONTIGMEM */
 
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
+
 #ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
 #define early_pfn_to_nid(pfn)  pa_to_nid(((unsigned long)pfn) << PAGE_SHIFT)
 #endif
diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h
index 257d87eb7c34..a5893a305a09 100644
--- a/include/asm-ppc64/page.h
+++ b/include/asm-ppc64/page.h
@@ -217,7 +217,8 @@ extern u64 ppc64_pft_size;		/* Log 2 of page table size */
 #define page_to_pfn(page)	discontigmem_page_to_pfn(page)
 #define pfn_to_page(pfn)	discontigmem_pfn_to_page(pfn)
 #define pfn_valid(pfn)		discontigmem_pfn_valid(pfn)
-#else
+#endif
+#ifdef CONFIG_FLATMEM
 #define pfn_to_page(pfn)	(mem_map + (pfn))
 #define page_to_pfn(page)	((unsigned long)((page) - mem_map))
 #define pfn_valid(pfn)		((pfn) < max_mapnr)
diff --git a/include/asm-ppc64/sparsemem.h b/include/asm-ppc64/sparsemem.h
new file mode 100644
index 000000000000..c5bd47e57f17
--- /dev/null
+++ b/include/asm-ppc64/sparsemem.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_PPC64_SPARSEMEM_H
+#define _ASM_PPC64_SPARSEMEM_H 1
+
+#ifdef CONFIG_SPARSEMEM
+/*
+ * SECTION_SIZE_BITS		2^N: how big each section will be
+ * MAX_PHYSADDR_BITS		2^N: how much physical address space we have
+ * MAX_PHYSMEM_BITS		2^N: how much memory we can have in that space
+ */
+#define SECTION_SIZE_BITS       24
+#define MAX_PHYSADDR_BITS       38
+#define MAX_PHYSMEM_BITS        36
+
+#endif /* CONFIG_SPARSEMEM */
+
+#endif /* _ASM_PPC64_SPARSEMEM_H */
-- 
cgit v1.2.3


From 2b97690f4cd960779fb351b7cd9974390afabb36 Mon Sep 17 00:00:00 2001
From: Matt Tolentino <metolent@snoqualmie.dp.intel.com>
Date: Thu, 23 Jun 2005 00:08:06 -0700
Subject: [PATCH] reorganize x86-64 NUMA and DISCONTIGMEM config options

In order to use the alternative sparsemem implmentation for NUMA kernels,
we need to reorganize the config options.  This patch effectively abstracts
out the CONFIG_DISCONTIGMEM options to CONFIG_NUMA in most cases.  Thus,
the discontigmem implementation may be employed as always, but the
sparsemem implementation may be used alternatively.

Signed-off-by: Matt Tolentino <matthew.e.tolentino@intel.com>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/io.h       |  5 -----
 include/asm-x86_64/mmzone.h   | 15 +++++++++------
 include/asm-x86_64/page.h     |  4 +++-
 include/asm-x86_64/topology.h |  4 +---
 4 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h
index 94202703fae2..37fc3f149a5a 100644
--- a/include/asm-x86_64/io.h
+++ b/include/asm-x86_64/io.h
@@ -124,12 +124,7 @@ extern inline void * phys_to_virt(unsigned long address)
 /*
  * Change "struct page" to physical address.
  */
-#ifdef CONFIG_DISCONTIGMEM
-#include <asm/mmzone.h>
 #define page_to_phys(page)    ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
-#else
-#define page_to_phys(page)	((page - mem_map) << PAGE_SHIFT)
-#endif
 
 #include <asm-generic/iomap.h>
 
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index ca4fc3fe0dee..768413751b34 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -6,7 +6,7 @@
 
 #include <linux/config.h>
 
-#ifdef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_NUMA
 
 #define VIRTUAL_BUG_ON(x) 
 
@@ -30,17 +30,16 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 	return nid; 
 } 
 
-#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT)
-
-#define kvaddr_to_nid(kaddr)	phys_to_nid(__pa(kaddr))
 #define NODE_DATA(nid)		(node_data[nid])
 
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
 				 NODE_DATA(nid)->node_spanned_pages)
 
-#define local_mapnr(kvaddr) \
-	( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) )
+#ifdef CONFIG_DISCONTIGMEM
+
+#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT)
+#define kvaddr_to_nid(kaddr)	phys_to_nid(__pa(kaddr))
 
 /* AK: this currently doesn't deal with invalid addresses. We'll see 
    if the 2.5 kernel doesn't pass them
@@ -57,4 +56,8 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 			({ u8 nid__ = pfn_to_nid(pfn); \
 			   nid__ != 0xff && (pfn) >= node_start_pfn(nid__) && (pfn) <= node_end_pfn(nid__); }))
 #endif
+
+#define local_mapnr(kvaddr) \
+	( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) )
+#endif
 #endif
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index 9ce338c3a71e..60130f4ca986 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -119,7 +119,9 @@ extern __inline__ int get_order(unsigned long size)
 	  __pa(v); })
 
 #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
-#ifndef CONFIG_DISCONTIGMEM
+#define __boot_va(x)		__va(x)
+#define __boot_pa(x)		__pa(x)
+#ifdef CONFIG_FLATMEM
 #define pfn_to_page(pfn)	(mem_map + (pfn))
 #define page_to_pfn(page)	((unsigned long)((page) - mem_map))
 #define pfn_valid(pfn)		((pfn) < max_mapnr)
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 67f24e0ea819..da21573ec731 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -3,7 +3,7 @@
 
 #include <linux/config.h>
 
-#ifdef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_NUMA
 
 #include <asm/mpspec.h>
 #include <asm/bitops.h>
@@ -37,7 +37,6 @@ static inline cpumask_t __pcibus_to_cpumask(int bus)
 }
 #define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus->number)
 
-#ifdef CONFIG_NUMA
 /* sched_domains SD_NODE_INIT for x86_64 machines */
 #define SD_NODE_INIT (struct sched_domain) {		\
 	.span			= CPU_MASK_NONE,	\
@@ -59,7 +58,6 @@ static inline cpumask_t __pcibus_to_cpumask(int bus)
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
 }
-#endif
 
 #endif
 
-- 
cgit v1.2.3


From bbfceef47fb9467424113a004070bf37a806a97c Mon Sep 17 00:00:00 2001
From: Matt Tolentino <metolent@snoqualmie.dp.intel.com>
Date: Thu, 23 Jun 2005 00:08:07 -0700
Subject: [PATCH] add x86-64 specific support for sparsemem

This patch adds in the necessary support for sparsemem such that x86-64
kernels may use sparsemem as an alternative to discontigmem for NUMA
kernels.  Note that this does no preclude one from continuing to build NUMA
kernels using discontigmem, but merely allows the option to build NUMA
kernels with sparsemem.

Interestingly, the use of sparsemem in lieu of discontigmem in NUMA kernels
results in reduced text size for otherwise equivalent kernels as shown in
the example builds below:

   text	   data	    bss	    dec	    hex	filename
2371036	 765884	1237108	4374028	 42be0c	vmlinux.discontig
2366549	 776484	1302772	4445805	 43d66d	vmlinux.sparse

Signed-off-by: Matt Tolentino <matthew.e.tolentino@intel.com>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/bitops.h    |  2 --
 include/asm-x86_64/sparsemem.h | 26 ++++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)
 create mode 100644 include/asm-x86_64/sparsemem.h

(limited to 'include')

diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h
index 5dd7727c756b..a31bb99be53f 100644
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -411,8 +411,6 @@ static __inline__ int ffs(int x)
 /* find last set bit */
 #define fls(x) generic_fls(x)
 
-#define ARCH_HAS_ATOMIC_UNSIGNED 1
-
 #endif /* __KERNEL__ */
 
 #endif /* _X86_64_BITOPS_H */
diff --git a/include/asm-x86_64/sparsemem.h b/include/asm-x86_64/sparsemem.h
new file mode 100644
index 000000000000..dabb16714a71
--- /dev/null
+++ b/include/asm-x86_64/sparsemem.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_64_SPARSEMEM_H
+#define _ASM_X86_64_SPARSEMEM_H 1
+
+#ifdef CONFIG_SPARSEMEM
+
+/*
+ * generic non-linear memory support:
+ *
+ * 1) we will not split memory into more chunks than will fit into the flags
+ *    field of the struct page
+ *
+ * SECTION_SIZE_BITS		2^n: size of each section
+ * MAX_PHYSADDR_BITS		2^n: max size of physical address space
+ * MAX_PHYSMEM_BITS		2^n: how much memory we can have in that space
+ *
+ */
+
+#define SECTION_SIZE_BITS	27 /* matt - 128 is convenient right now */
+#define MAX_PHYSADDR_BITS	40
+#define MAX_PHYSMEM_BITS	40
+
+extern int early_pfn_to_nid(unsigned long pfn);
+
+#endif /* CONFIG_SPARSEMEM */
+
+#endif /* _ASM_X86_64_SPARSEMEM_H */
-- 
cgit v1.2.3


From 8a9e1b0f564615bd92ba50162623e25c2904e564 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Thu, 23 Jun 2005 00:08:13 -0700
Subject: [PATCH] Platform SMIs and their interferance with tsc based delay
 calibration

Issue:
Current tsc based delay_calibration can result in significant errors in
loops_per_jiffy count when the platform events like SMIs
(System Management Interrupts that are non-maskable) are present. This could
lead to potential kernel panic(). This issue is becoming more visible with 2.6
kernel (as default HZ is 1000) and on platforms with higher SMI handling
latencies. During the boot time, SMIs are mostly used by BIOS (for things
like legacy keyboard emulation).

Description:
The psuedocode for current delay calibration with tsc based delay looks like
(0) Estimate a value for loops_per_jiffy
(1) While (loops_per_jiffy estimate is accurate enough)
(2)   wait for jiffy transition (jiffy1)
(3)   Note down current tsc (tsc1)
(4)   loop until tsc becomes tsc1 + loops_per_jiffy
(5)   check whether jiffy changed since jiffy1 or not and refine
loops_per_jiffy estimate

Consider the following cases
Case 1:
If SMIs happen between (2) and (3) above, we can end up with a
loops_per_jiffy value that is too low. This results in shorted delays and
kernel can panic () during boot (Mostly at IOAPIC timer initialization
timer_irq_works() as we don't have enough timer interrupts in a specified
interval).

Case 2:
If SMIs happen between (3) and (4) above, then we can end up with a
loops_per_jiffy value that is too high. And with current i386 code, too
high lpj value (greater than 17M) can result in a overflow in
delay.c:__const_udelay() again resulting in shorter delay and panic().

Solution:
The patch below makes the calibration routine aware of asynchronous events
like SMIs. We increase the delay calibration time and also identify any
significant errors (greater than 12.5%) in the calibration and notify it to
user.

Patch below changes both i386 and x86-64 architectures to use this
new and improved calibrate_delay_direct() routine.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/timer.h   | 2 ++
 include/asm-i386/timex.h   | 3 +++
 include/asm-x86_64/timex.h | 3 +++
 3 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h
index c34709849839..dcf1e07db08a 100644
--- a/include/asm-i386/timer.h
+++ b/include/asm-i386/timer.h
@@ -22,6 +22,7 @@ struct timer_opts {
 	unsigned long (*get_offset)(void);
 	unsigned long long (*monotonic_clock)(void);
 	void (*delay)(unsigned long);
+	unsigned long (*read_timer)(void);
 };
 
 struct init_timer_opts {
@@ -52,6 +53,7 @@ extern struct init_timer_opts timer_cyclone_init;
 #endif
 
 extern unsigned long calibrate_tsc(void);
+extern unsigned long read_timer_tsc(void);
 extern void init_cpu_khz(void);
 extern int recalibrate_cpu_khz(void);
 #ifdef CONFIG_HPET_TIMER
diff --git a/include/asm-i386/timex.h b/include/asm-i386/timex.h
index b41e484c3445..e370f907bd39 100644
--- a/include/asm-i386/timex.h
+++ b/include/asm-i386/timex.h
@@ -49,4 +49,7 @@ static inline cycles_t get_cycles (void)
 
 extern unsigned long cpu_khz;
 
+extern int read_current_timer(unsigned long *timer_value);
+#define ARCH_HAS_READ_CURRENT_TIMER	1
+
 #endif
diff --git a/include/asm-x86_64/timex.h b/include/asm-x86_64/timex.h
index 34f31a18f90b..24ecf6a637cb 100644
--- a/include/asm-x86_64/timex.h
+++ b/include/asm-x86_64/timex.h
@@ -26,6 +26,9 @@ static inline cycles_t get_cycles (void)
 
 extern unsigned int cpu_khz;
 
+extern int read_current_timer(unsigned long *timer_value);
+#define ARCH_HAS_READ_CURRENT_TIMER	1
+
 extern struct vxtime_data vxtime;
 
 #endif
-- 
cgit v1.2.3


From 4a35293667f8691b45fdb72770f087c086fb9702 Mon Sep 17 00:00:00 2001
From: Hirokazu Takata <takata@linux-m32r.org>
Date: Thu, 23 Jun 2005 00:08:14 -0700
Subject: [PATCH] m32r: build fix for asm-m32r/topology.h

Use asm-generic/topology.h to fix yet another pcibus_to_node() build error.

Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-m32r/topology.h | 44 +-------------------------------------------
 1 file changed, 1 insertion(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/asm-m32r/topology.h b/include/asm-m32r/topology.h
index 299a89d91bde..d607eb32bd7e 100644
--- a/include/asm-m32r/topology.h
+++ b/include/asm-m32r/topology.h
@@ -1,48 +1,6 @@
-/*
- * linux/include/asm-generic/topology.h
- *
- * Written by: Matthew Dobson, IBM Corporation
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- */
 #ifndef _ASM_M32R_TOPOLOGY_H
 #define _ASM_M32R_TOPOLOGY_H
 
-/* Other architectures wishing to use this simple topology API should fill
-   in the below functions as appropriate in their own <asm/topology.h> file. */
-
-#define cpu_to_node(cpu)	(0)
-
-#ifndef parent_node
-#define parent_node(node)	(0)
-#endif
-#ifndef node_to_cpumask
-#define node_to_cpumask(node)	(cpu_online_map)
-#endif
-#ifndef node_to_first_cpu
-#define node_to_first_cpu(node)	(0)
-#endif
-#ifndef pcibus_to_cpumask
-#define pcibus_to_cpumask(bus)	(cpu_online_map)
-#endif
+#include <asm-generic/topology.h>
 
 #endif /* _ASM_M32R_TOPOLOGY_H */
-- 
cgit v1.2.3


From e164f5573bef0e6caf53519719cf0228c9c15ce3 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 23 Jun 2005 00:08:17 -0700
Subject: [PATCH] ppc64: pcibus_to_node fix

asm-generic/topology.h must also be included if CONFIG_NUMA is set in order to
provide the fall back pcibus_to_node function.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ppc64/topology.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-ppc64/topology.h b/include/asm-ppc64/topology.h
index d58d9dd79998..fcdcfd26a26b 100644
--- a/include/asm-ppc64/topology.h
+++ b/include/asm-ppc64/topology.h
@@ -59,10 +59,8 @@ static inline int node_to_first_cpu(int node)
 	.nr_balance_failed	= 0,			\
 }
 
-#else /* !CONFIG_NUMA */
+#endif /* CONFIG_NUMA */
 
 #include <asm-generic/topology.h>
 
-#endif /* CONFIG_NUMA */
-
 #endif /* _ASM_PPC64_TOPOLOGY_H */
-- 
cgit v1.2.3


From 8c5a09082f4e61a176382e96a831a0636b918602 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 23 Jun 2005 00:08:18 -0700
Subject: [PATCH] x86/x86_64: pcibus_to_node

Define pcibus_to_node to be able to figure out which NUMA node contains a
given PCI device.  This defines pcibus_to_node(bus) in
include/linux/topology.h and adjusts the macros for i386 and x86_64 that
already provided a way to determine the cpumask of a pci device.

x86_64 was changed to not build an array of cpumasks anymore.  Instead an
array of nodes is build which can be used to generate the cpumask via
node_to_cpumask.

Signed-off-by: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/topology.h |  9 ++++++++-
 include/asm-i386/topology.h    |  8 ++------
 include/asm-x86_64/topology.h  | 14 +++-----------
 3 files changed, 13 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h
index ec96e8b0f190..5d9d70cd17fc 100644
--- a/include/asm-generic/topology.h
+++ b/include/asm-generic/topology.h
@@ -41,8 +41,15 @@
 #ifndef node_to_first_cpu
 #define node_to_first_cpu(node)	(0)
 #endif
+#ifndef pcibus_to_node
+#define pcibus_to_node(node)	(-1)
+#endif
+
 #ifndef pcibus_to_cpumask
-#define pcibus_to_cpumask(bus)	(cpu_online_map)
+#define pcibus_to_cpumask(bus)	(pcibus_to_node(bus) == -1 ? \
+					CPU_MASK_ALL : \
+					node_to_cpumask(pcibus_to_node(bus)) \
+				)
 #endif
 
 #endif /* _ASM_GENERIC_TOPOLOGY_H */
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index 98f9e6850cba..6d0f67507b21 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -60,12 +60,8 @@ static inline int node_to_first_cpu(int node)
 	return first_cpu(mask);
 }
 
-/* Returns the number of the node containing PCI bus number 'busnr' */
-static inline cpumask_t __pcibus_to_cpumask(int busnr)
-{
-	return node_to_cpumask(mp_bus_id_to_node[busnr]);
-}
-#define pcibus_to_cpumask(bus)	__pcibus_to_cpumask(bus->number)
+#define pcibus_to_node(bus) mp_bus_id_to_node[(bus)->number]
+#define pcibus_to_cpumask(bus) node_to_cpumask(pcibus_to_node(bus))
 
 /* sched_domains SD_NODE_INIT for NUMAQ machines */
 #define SD_NODE_INIT (struct sched_domain) {		\
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index da21573ec731..8f77e9f6bc23 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -13,8 +13,8 @@
 extern cpumask_t cpu_online_map;
 
 extern unsigned char cpu_to_node[];
+extern unsigned char pci_bus_to_node[];
 extern cpumask_t     node_to_cpumask[];
-extern cpumask_t pci_bus_to_cpumask[];
 
 #ifdef CONFIG_ACPI_NUMA
 extern int __node_distance(int, int);
@@ -26,16 +26,8 @@ extern int __node_distance(int, int);
 #define parent_node(node)		(node)
 #define node_to_first_cpu(node) 	(__ffs(node_to_cpumask[node]))
 #define node_to_cpumask(node)		(node_to_cpumask[node])
-
-static inline cpumask_t __pcibus_to_cpumask(int bus)
-{
-	cpumask_t busmask = pci_bus_to_cpumask[bus];
-	cpumask_t online = cpu_online_map;
-	cpumask_t res;
-	cpus_and(res, busmask, online);
-	return res;
-}
-#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus->number)
+#define pcibus_to_node(bus)		pci_bus_to_node[(bus)->number]
+#define pcibus_to_cpumask(bus)		node_to_cpumask(pcibus_to_node(bus));
 
 /* sched_domains SD_NODE_INIT for x86_64 machines */
 #define SD_NODE_INIT (struct sched_domain) {		\
-- 
cgit v1.2.3


From 1946089a109251655c5438d92c539bd2930e71ea Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 23 Jun 2005 00:08:19 -0700
Subject: [PATCH] NUMA aware block device control structure allocation

Patch to allocate the control structures for for ide devices on the node of
the device itself (for NUMA systems).  The patch depends on the Slab API
change patch by Manfred and me (in mm) and the pcidev_to_node patch that I
posted today.

Does some realignment too.

Signed-off-by: Justin M. Forbes <jmforbes@linuxtx.org>
Signed-off-by: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Pravin Shelar <pravin@calsoftinc.com>
Signed-off-by: Shobhit Dayal <shobhit@calsoftinc.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/blkdev.h  |  6 +++++-
 include/linux/genhd.h   |  1 +
 include/linux/ide.h     |  2 +-
 include/linux/mempool.h | 11 ++++++++---
 4 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4a99b76c5a33..235c3414d268 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -396,6 +396,7 @@ struct request_queue
 	 */
 	unsigned int		sg_timeout;
 	unsigned int		sg_reserved_size;
+	int			node;
 
 	struct list_head	drain_list;
 
@@ -615,6 +616,8 @@ static inline void blkdev_dequeue_request(struct request *req)
 /*
  * Access functions for manipulating queue properties
  */
+extern request_queue_t *blk_init_queue_node(request_fn_proc *rfn,
+					spinlock_t *lock, int node_id);
 extern request_queue_t *blk_init_queue(request_fn_proc *, spinlock_t *);
 extern void blk_cleanup_queue(request_queue_t *);
 extern void blk_queue_make_request(request_queue_t *, make_request_fn *);
@@ -646,7 +649,8 @@ extern void blk_wait_queue_drained(request_queue_t *, int);
 extern void blk_finish_queue_drain(request_queue_t *);
 
 int blk_get_queue(request_queue_t *);
-request_queue_t *blk_alloc_queue(int);
+request_queue_t *blk_alloc_queue(int gfp_mask);
+request_queue_t *blk_alloc_queue_node(int,int);
 #define blk_put_queue(q) blk_cleanup_queue((q))
 
 /*
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 47dedaf971d6..af26dc718ef6 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -403,6 +403,7 @@ extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
 extern void add_partition(struct gendisk *, int, sector_t, sector_t);
 extern void delete_partition(struct gendisk *, int);
 
+extern struct gendisk *alloc_disk_node(int minors, int node_id);
 extern struct gendisk *alloc_disk(int minors);
 extern struct kobject *get_disk(struct gendisk *disk);
 extern void put_disk(struct gendisk *disk);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 336d6e509f59..92129078d4f3 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -917,7 +917,7 @@ typedef struct hwif_s {
 	unsigned dma;
 
 	void (*led_act)(void *data, int rw);
-} ide_hwif_t;
+} ____cacheline_maxaligned_in_smp ide_hwif_t;
 
 /*
  *  internal ide interrupt handler type
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 4a36edf1c974..796220ce47cc 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -20,9 +20,14 @@ typedef struct mempool_s {
 	mempool_free_t *free;
 	wait_queue_head_t wait;
 } mempool_t;
-extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
-				 mempool_free_t *free_fn, void *pool_data);
-extern int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask);
+
+extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+			mempool_free_t *free_fn, void *pool_data);
+extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+			mempool_free_t *free_fn, void *pool_data, int nid);
+
+extern int mempool_resize(mempool_t *pool, int new_min_nr,
+			unsigned int __nocast gfp_mask);
 extern void mempool_destroy(mempool_t *pool);
 extern void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask);
 extern void mempool_free(void *element, mempool_t *pool);
-- 
cgit v1.2.3


From ca05fea6db5259c6d62e517c41d448a4249175f4 Mon Sep 17 00:00:00 2001
From: Natalie Protasevich <Natalie.Protasevich@unisys.com>
Date: Thu, 23 Jun 2005 00:08:22 -0700
Subject: [PATCH] Do not enforce unique IO_APIC_ID check for xAPIC systems
 (i386)

This patch is per Andi's request to remove NO_IOAPIC_CHECK from genapic and
use heuristics to prevent unique I/O APIC ID check for systems that don't
need it.  The patch disables unique I/O APIC ID check for Xeon-based and
other platforms that don't use serial APIC bus for interrupt delivery.
Andi stated that AMD systems don't need unique IO_APIC_IDs either.

Signed-off-by: Natalie Protasevich <Natalie.Protasevich@unisys.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/genapic.h                | 1 -
 include/asm-i386/mach-bigsmp/mach_apic.h  | 2 --
 include/asm-i386/mach-default/mach_apic.h | 2 --
 include/asm-i386/mach-es7000/mach_apic.h  | 2 --
 include/asm-i386/mach-generic/mach_apic.h | 1 -
 include/asm-i386/mach-numaq/mach_apic.h   | 2 --
 include/asm-i386/mach-summit/mach_apic.h  | 2 --
 include/asm-i386/mach-visws/mach_apic.h   | 2 --
 8 files changed, 14 deletions(-)

(limited to 'include')

diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h
index fc813b2e8274..b3783a32abee 100644
--- a/include/asm-i386/genapic.h
+++ b/include/asm-i386/genapic.h
@@ -78,7 +78,6 @@ struct genapic {
 	.int_delivery_mode = INT_DELIVERY_MODE, \
 	.int_dest_mode = INT_DEST_MODE, \
 	.no_balance_irq = NO_BALANCE_IRQ, \
-	.no_ioapic_check = NO_IOAPIC_CHECK, \
 	.ESR_DISABLE = esr_disable, \
 	.apic_destination_logical = APIC_DEST_LOGICAL, \
 	APICFUNC(apic_id_registered), \
diff --git a/include/asm-i386/mach-bigsmp/mach_apic.h b/include/asm-i386/mach-bigsmp/mach_apic.h
index 2339868270ef..ba936d4daedb 100644
--- a/include/asm-i386/mach-bigsmp/mach_apic.h
+++ b/include/asm-i386/mach-bigsmp/mach_apic.h
@@ -14,8 +14,6 @@
 #define NO_BALANCE_IRQ (1)
 #define esr_disable (1)
 
-#define NO_IOAPIC_CHECK (0)
-
 static inline int apic_id_registered(void)
 {
 	return (1);
diff --git a/include/asm-i386/mach-default/mach_apic.h b/include/asm-i386/mach-default/mach_apic.h
index 627f1cd084ba..3ef6292db780 100644
--- a/include/asm-i386/mach-default/mach_apic.h
+++ b/include/asm-i386/mach-default/mach_apic.h
@@ -19,8 +19,6 @@ static inline cpumask_t target_cpus(void)
 #define NO_BALANCE_IRQ (0)
 #define esr_disable (0)
 
-#define NO_IOAPIC_CHECK (0)
-
 #define INT_DELIVERY_MODE dest_LowestPrio
 #define INT_DEST_MODE 1     /* logical delivery broadcast to all procs */
 
diff --git a/include/asm-i386/mach-es7000/mach_apic.h b/include/asm-i386/mach-es7000/mach_apic.h
index ceab2c464b13..b5f3f0d0b2bc 100644
--- a/include/asm-i386/mach-es7000/mach_apic.h
+++ b/include/asm-i386/mach-es7000/mach_apic.h
@@ -38,8 +38,6 @@ static inline cpumask_t target_cpus(void)
 #define WAKE_SECONDARY_VIA_INIT
 #endif
 
-#define NO_IOAPIC_CHECK (1)
-
 static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
 { 
 	return 0;
diff --git a/include/asm-i386/mach-generic/mach_apic.h b/include/asm-i386/mach-generic/mach_apic.h
index ab36d02ebede..b13767a4e934 100644
--- a/include/asm-i386/mach-generic/mach_apic.h
+++ b/include/asm-i386/mach-generic/mach_apic.h
@@ -5,7 +5,6 @@
 
 #define esr_disable (genapic->ESR_DISABLE)
 #define NO_BALANCE_IRQ (genapic->no_balance_irq)
-#define NO_IOAPIC_CHECK	(genapic->no_ioapic_check)
 #define INT_DELIVERY_MODE (genapic->int_delivery_mode)
 #define INT_DEST_MODE (genapic->int_dest_mode)
 #undef APIC_DEST_LOGICAL
diff --git a/include/asm-i386/mach-numaq/mach_apic.h b/include/asm-i386/mach-numaq/mach_apic.h
index e1a04494764a..9d158095da82 100644
--- a/include/asm-i386/mach-numaq/mach_apic.h
+++ b/include/asm-i386/mach-numaq/mach_apic.h
@@ -17,8 +17,6 @@ static inline cpumask_t target_cpus(void)
 #define NO_BALANCE_IRQ (1)
 #define esr_disable (1)
 
-#define NO_IOAPIC_CHECK (0)
-
 #define INT_DELIVERY_MODE dest_LowestPrio
 #define INT_DEST_MODE 0     /* physical delivery on LOCAL quad */
  
diff --git a/include/asm-i386/mach-summit/mach_apic.h b/include/asm-i386/mach-summit/mach_apic.h
index 74e9cbc8c01b..3d6d12937e1f 100644
--- a/include/asm-i386/mach-summit/mach_apic.h
+++ b/include/asm-i386/mach-summit/mach_apic.h
@@ -7,8 +7,6 @@
 #define esr_disable (1)
 #define NO_BALANCE_IRQ (0)
 
-#define NO_IOAPIC_CHECK (1)	/* Don't check I/O APIC ID for xAPIC */
-
 /* In clustered mode, the high nibble of APIC ID is a cluster number.
  * The low nibble is a 4-bit bitmap. */
 #define XAPIC_DEST_CPUS_SHIFT	4
diff --git a/include/asm-i386/mach-visws/mach_apic.h b/include/asm-i386/mach-visws/mach_apic.h
index 4e6cdfb8b091..de438c7147a8 100644
--- a/include/asm-i386/mach-visws/mach_apic.h
+++ b/include/asm-i386/mach-visws/mach_apic.h
@@ -9,8 +9,6 @@
 #define no_balance_irq (0)
 #define esr_disable (0)
 
-#define NO_IOAPIC_CHECK (0)
-
 #define INT_DELIVERY_MODE dest_LowestPrio
 #define INT_DEST_MODE 1     /* logical delivery broadcast to all procs */
 
-- 
cgit v1.2.3


From 59121003721a8fad11ee72e646fd9d3076b5679c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 23 Jun 2005 00:08:25 -0700
Subject: [PATCH] i386: Selectable Frequency of the Timer Interrupt

Make the timer frequency selectable. The timer interrupt may cause bus
and memory contention in large NUMA systems since the interrupt occurs
on each processor HZ times per second.

Signed-off-by: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/param.h   | 4 +++-
 include/asm-x86_64/param.h | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-i386/param.h b/include/asm-i386/param.h
index b6440526e42a..fa02e67ea86b 100644
--- a/include/asm-i386/param.h
+++ b/include/asm-i386/param.h
@@ -1,8 +1,10 @@
+#include <linux/config.h>
+
 #ifndef _ASMi386_PARAM_H
 #define _ASMi386_PARAM_H
 
 #ifdef __KERNEL__
-# define HZ		1000		/* Internal kernel timer frequency */
+# define HZ		CONFIG_HZ	/* Internal kernel timer frequency */
 # define USER_HZ	100		/* .. some user interfaces are in "ticks" */
 # define CLOCKS_PER_SEC		(USER_HZ)	/* like times() */
 #endif
diff --git a/include/asm-x86_64/param.h b/include/asm-x86_64/param.h
index b707f0568c9e..40b11937180d 100644
--- a/include/asm-x86_64/param.h
+++ b/include/asm-x86_64/param.h
@@ -1,9 +1,11 @@
+#include <linux/config.h>
+
 #ifndef _ASMx86_64_PARAM_H
 #define _ASMx86_64_PARAM_H
 
 #ifdef __KERNEL__
-# define HZ            1000            /* Internal kernel timer frequency */
-# define USER_HZ       100          /* .. some user interfaces are in "ticks */
+# define HZ            CONFIG_HZ	/* Internal kernel timer frequency */
+# define USER_HZ       100		/* .. some user interfaces are in "ticks */
 #define CLOCKS_PER_SEC        (USER_HZ)       /* like times() */
 #endif
 
-- 
cgit v1.2.3


From b5d23e5b8c7ecd97d32f6ad7680d9909977580a7 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Thu, 23 Jun 2005 00:08:27 -0700
Subject: [PATCH] ia64: Selectable Timer Interrupt Frequency

It allows a selectable timer interrupt frequency of 100, 250 and 1000 HZ.
Reducing the timer frequency may have important performance benefits on
large systems.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/param.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-ia64/param.h b/include/asm-ia64/param.h
index 6c6b679b7a9e..5e1e0d2d7baf 100644
--- a/include/asm-ia64/param.h
+++ b/include/asm-ia64/param.h
@@ -27,7 +27,7 @@
    */
 #  define HZ	  32
 # else
-#  define HZ	1024
+#  define HZ	CONFIG_HZ
 # endif
 # define USER_HZ	HZ
 # define CLOCKS_PER_SEC	HZ	/* frequency at which times() counts */
-- 
cgit v1.2.3


From a9ed8817966dd723754a990f1003264a21b5747e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 23 Jun 2005 00:08:32 -0700
Subject: [PATCH] x86: #include asm/uaccess.h in asm/checksum.h

csum_and_copy_to_user is static inline and uses VERIFY_WRITE.  Patch allows
to remove asm/uaccess.h from i386_ksyms.c without dependency surprises.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/checksum.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/asm-i386/checksum.h b/include/asm-i386/checksum.h
index 641342002bcd..f949e44c2a35 100644
--- a/include/asm-i386/checksum.h
+++ b/include/asm-i386/checksum.h
@@ -3,6 +3,8 @@
 
 #include <linux/in6.h>
 
+#include <asm/uaccess.h>
+
 /*
  * computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit)
-- 
cgit v1.2.3


From a3a255e744dfa672e741dc24306491139d0de2d8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 23 Jun 2005 00:08:34 -0700
Subject: [PATCH] x86: cpu_khz type fix

x86_64's cpu_khz is unsigned int and there is no reason why x86 needs to use
unsigned long.

So make cpu_khz unsigned int on x86 as well.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/timex.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-i386/timex.h b/include/asm-i386/timex.h
index e370f907bd39..292b5a68f627 100644
--- a/include/asm-i386/timex.h
+++ b/include/asm-i386/timex.h
@@ -47,7 +47,7 @@ static inline cycles_t get_cycles (void)
 	return ret;
 }
 
-extern unsigned long cpu_khz;
+extern unsigned int cpu_khz;
 
 extern int read_current_timer(unsigned long *timer_value);
 #define ARCH_HAS_READ_CURRENT_TIMER	1
-- 
cgit v1.2.3


From 32ecd42b6f94d3ee320a22827b46bd19ccf924e5 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Thu, 23 Jun 2005 00:08:38 -0700
Subject: [PATCH] eliminate duplicate rdpmc definition

Eliminate duplicate definition of rdpmc in x86-64's mtrr.h.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/msr.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h
index 513e52c71821..bc700232728d 100644
--- a/include/asm-x86_64/msr.h
+++ b/include/asm-x86_64/msr.h
@@ -57,11 +57,6 @@
      (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
 } while(0)
 
-#define rdpmc(counter,low,high) \
-     __asm__ __volatile__("rdpmc" \
-			  : "=a" (low), "=d" (high) \
-			  : "c" (counter))
-
 #define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
 
 #define rdpmc(counter,low,high) \
-- 
cgit v1.2.3


From f5012310e35bd62fd39fce338ee44422c975ff3c Mon Sep 17 00:00:00 2001
From: Vincent Hanquez <vincent.hanquez@cl.cam.ac.uk>
Date: Thu, 23 Jun 2005 00:08:42 -0700
Subject: [PATCH] xen: x86: add macro for debugreg

Add 2 macros to set and get debugreg on x86.  This is useful for Xen because
it will need only to redefine each macro to a hypervisor call.

Signed-off-by: Vincent Hanquez <vincent.hanquez@cl.cam.ac.uk>
Cc: Ian Pratt <m+Ian.Pratt@cl.cam.ac.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/processor.h | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 359bb0151742..c76c50e96225 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -501,12 +501,16 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa
 } while (0)
 
 /*
- * This special macro can be used to load a debugging register
+ * These special macros can be used to get or set a debugging register
  */
-#define loaddebug(thread,register) \
-               __asm__("movl %0,%%db" #register  \
-                       : /* no output */ \
-                       :"r" ((thread)->debugreg[register]))
+#define get_debugreg(var, register)				\
+		__asm__("movl %%db" #register ", %0"		\
+			:"=r" (var))
+#define set_debugreg(value, register)			\
+		__asm__("movl %0,%%db" #register		\
+			: /* no output */			\
+			:"r" (value))
+
 
 /* Forward declaration, a strange C thing */
 struct task_struct;
-- 
cgit v1.2.3


From fa1e1bdf78d405f9905b8290ee9211e7a7bbc99b Mon Sep 17 00:00:00 2001
From: Vincent Hanquez <vincent.hanquez@cl.cam.ac.uk>
Date: Thu, 23 Jun 2005 00:08:44 -0700
Subject: [PATCH] xen: x86: Rename usermode macro

Rename user_mode to user_mode_vm and add a user_mode macro similar to the
x86-64 one.

This is useful for Xen because the linux xen kernel does not runs on the same
priviledge that a vanilla linux kernel, and with this we just need to redefine
user_mode().

Signed-off-by: Vincent Hanquez <vincent.hanquez@cl.cam.ac.uk>
Cc: Ian Pratt <m+Ian.Pratt@cl.cam.ac.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/ptrace.h   | 3 ++-
 include/asm-x86_64/ptrace.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h
index 8618914b3521..eef9f93870d4 100644
--- a/include/asm-i386/ptrace.h
+++ b/include/asm-i386/ptrace.h
@@ -57,7 +57,8 @@ struct pt_regs {
 #ifdef __KERNEL__
 struct task_struct;
 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code);
-#define user_mode(regs) ((VM_MASK & (regs)->eflags) || (3 & (regs)->xcs))
+#define user_mode(regs)		(3 & (regs)->xcs)
+#define user_mode_vm(regs)	((VM_MASK & (regs)->eflags) || user_mode(regs))
 #define instruction_pointer(regs) ((regs)->eip)
 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
 extern unsigned long profile_pc(struct pt_regs *regs);
diff --git a/include/asm-x86_64/ptrace.h b/include/asm-x86_64/ptrace.h
index 5bbc8d3141c8..ca6f15ff61d4 100644
--- a/include/asm-x86_64/ptrace.h
+++ b/include/asm-x86_64/ptrace.h
@@ -82,6 +82,7 @@ struct pt_regs {
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__) 
 #define user_mode(regs) (!!((regs)->cs & 3))
+#define user_mode_vm(regs) user_mode(regs)
 #define instruction_pointer(regs) ((regs)->rip)
 extern unsigned long profile_pc(struct pt_regs *regs);
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
-- 
cgit v1.2.3


From e9129e56e9ec50c0689eb4cf7a3ca132f1e776db Mon Sep 17 00:00:00 2001
From: Vincent Hanquez <vincent.hanquez@cl.cam.ac.uk>
Date: Thu, 23 Jun 2005 00:08:46 -0700
Subject: [PATCH] xen: x86_64: Add macro for debugreg

Add 2 macros to set and get debugreg on x86_64.  This is useful for Xen
because it will need only to redefine each macro to a hypervisor call.

Signed-off-by: Vincent Hanquez <vincent.hanquez@cl.cam.ac.uk>
Cc: Ian Pratt <m+Ian.Pratt@cl.cam.ac.uk>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/processor.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 8b55f139968f..106f666517bb 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -280,6 +280,14 @@ struct thread_struct {
 	set_fs(USER_DS);							 \
 } while(0) 
 
+#define get_debugreg(var, register)				\
+		__asm__("movq %%db" #register ", %0"		\
+			:"=r" (var))
+#define set_debugreg(value, register)			\
+		__asm__("movq %0,%%db" #register		\
+			: /* no output */			\
+			:"r" (value))
+
 struct task_struct;
 struct mm_struct;
 
-- 
cgit v1.2.3


From fa72b903f75e4f0f0b2c2feed093005167da4023 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 23 Jun 2005 00:08:49 -0700
Subject: [PATCH] blk: remove blk_queue_tag->real_max_depth optimization

blk_queue_tag->real_max_depth was used to optimize out unnecessary
allocations/frees on tag resize.  However, the whole thing was very broken -
tag_map was never allocated to real_max_depth resulting in access beyond the
end of the map, bits in [max_depth..real_max_depth] were set when initializing
a map and copied when resizing resulting in pre-occupied tags.

As the gain of the optimization is very small, well, almost nill, remove the
whole thing.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 235c3414d268..8d7e2f4151d0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -294,7 +294,6 @@ struct blk_queue_tag {
 	struct list_head busy_list;	/* fifo list of busy tags */
 	int busy;			/* current depth */
 	int max_depth;			/* what we will send to device */
-	int real_max_depth;		/* what the array can hold */
 	atomic_t refcnt;		/* map can be shared */
 };
 
-- 
cgit v1.2.3


From f7d37d028dfba90b1b747f8ac685bf0959aeda8b Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 23 Jun 2005 00:08:50 -0700
Subject: [PATCH] blk: remove BLK_TAGS_{PER_LONG|MASK}

Replace BLK_TAGS_PER_LONG with BITS_PER_LONG and remove unused BLK_TAGS_MASK.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/blkdev.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8d7e2f4151d0..60272141ff19 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -285,9 +285,6 @@ enum blk_queue_state {
 	Queue_up,
 };
 
-#define BLK_TAGS_PER_LONG	(sizeof(unsigned long) * 8)
-#define BLK_TAGS_MASK		(BLK_TAGS_PER_LONG - 1)
-
 struct blk_queue_tag {
 	struct request **tag_index;	/* map of busy tags */
 	unsigned long *tag_map;		/* bit map of free/busy tags */
-- 
cgit v1.2.3


From 55c888d6d09a0df236adfaf8ccf06ff5d0646775 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 23 Jun 2005 00:08:56 -0700
Subject: [PATCH] timers fixes/improvements

This patch tries to solve following problems:

1. del_timer_sync() is racy. The timer can be fired again after
   del_timer_sync have checked all cpus and before it will recheck
   timer_pending().

2. It has scalability problems. All cpus are scanned to determine
   if the timer is running on that cpu.

   With this patch del_timer_sync is O(1) and no slower than plain
   del_timer(pending_timer), unless it has to actually wait for
   completion of the currently running timer.

   The only restriction is that the recurring timer should not use
   add_timer_on().

3. The timers are not serialized wrt to itself.

   If CPU_0 does mod_timer(jiffies+1) while the timer is currently
   running on CPU 1, it is quite possible that local interrupt on
   CPU_0 will start that timer before it finished on CPU_1.

4. The timers locking is suboptimal. __mod_timer() takes 3 locks
   at once and still requires wmb() in del_timer/run_timers.

   The new implementation takes 2 locks sequentially and does not
   need memory barriers.

Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.

This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.

The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.

So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).

When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.

This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.

__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.

__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.

So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.

We don't need timer_list->lock anymore, this patch kills it.

We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.

One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global

        struct timer_base_s {
                spinlock_t lock;
                struct timer_list *running_timer;
        } __init_timer_base;

which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.

It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timer.h | 30 +++++++++---------------------
 1 file changed, 9 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 90db1cc62ddd..2e78fedfc069 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -6,45 +6,33 @@
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
 
-struct tvec_t_base_s;
+struct timer_base_s;
 
 struct timer_list {
 	struct list_head entry;
 	unsigned long expires;
 
-	spinlock_t lock;
 	unsigned long magic;
 
 	void (*function)(unsigned long);
 	unsigned long data;
 
-	struct tvec_t_base_s *base;
+	struct timer_base_s *base;
 };
 
 #define TIMER_MAGIC	0x4b87ad6e
 
+extern struct timer_base_s __init_timer_base;
+
 #define TIMER_INITIALIZER(_function, _expires, _data) {		\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
-		.base = NULL,					\
+		.base = &__init_timer_base,			\
 		.magic = TIMER_MAGIC,				\
-		.lock = SPIN_LOCK_UNLOCKED,			\
 	}
 
-/***
- * init_timer - initialize a timer.
- * @timer: the timer to be initialized
- *
- * init_timer() must be done to a timer prior calling *any* of the
- * other timer functions.
- */
-static inline void init_timer(struct timer_list * timer)
-{
-	timer->base = NULL;
-	timer->magic = TIMER_MAGIC;
-	spin_lock_init(&timer->lock);
-}
+void fastcall init_timer(struct timer_list * timer);
 
 /***
  * timer_pending - is a timer pending?
@@ -58,7 +46,7 @@ static inline void init_timer(struct timer_list * timer)
  */
 static inline int timer_pending(const struct timer_list * timer)
 {
-	return timer->base != NULL;
+	return timer->entry.next != NULL;
 }
 
 extern void add_timer_on(struct timer_list *timer, int cpu);
@@ -89,12 +77,12 @@ static inline void add_timer(struct timer_list * timer)
 
 #ifdef CONFIG_SMP
   extern int del_timer_sync(struct timer_list *timer);
-  extern int del_singleshot_timer_sync(struct timer_list *timer);
 #else
 # define del_timer_sync(t) del_timer(t)
-# define del_singleshot_timer_sync(t) del_timer(t)
 #endif
 
+#define del_singleshot_timer_sync(t) del_timer_sync(t)
+
 extern void init_timers(void);
 extern void run_local_timers(void);
 extern void it_real_fn(unsigned long);
-- 
cgit v1.2.3


From fd450b7318b75343fd76b3d95416853e34e72c95 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 23 Jun 2005 00:08:59 -0700
Subject: [PATCH] timers: introduce try_to_del_timer_sync()

This patch splits del_timer_sync() into 2 functions.  The new one,
try_to_del_timer_sync(), returns -1 when it hits executing timer.

It can be used in interrupt context, or when the caller hold locks which
can prevent completion of the timer's handler.

NOTE.  Currently it can't be used in interrupt context in UP case, because
->running_timer is used only with CONFIG_SMP.

Should the need arise, it is possible to kill #ifdef CONFIG_SMP in
set_running_timer(), it is cheap.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timer.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 2e78fedfc069..221f81ac2002 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -76,9 +76,11 @@ static inline void add_timer(struct timer_list * timer)
 }
 
 #ifdef CONFIG_SMP
+  extern int try_to_del_timer_sync(struct timer_list *timer);
   extern int del_timer_sync(struct timer_list *timer);
 #else
-# define del_timer_sync(t) del_timer(t)
+# define try_to_del_timer_sync(t)	del_timer(t)
+# define del_timer_sync(t)		del_timer(t)
 #endif
 
 #define del_singleshot_timer_sync(t) del_timer_sync(t)
-- 
cgit v1.2.3


From 991114c6fa6a21d1fa4d544abe78592352860c82 Mon Sep 17 00:00:00 2001
From: Alexander Viro <aviro@redhat.com>
Date: Thu, 23 Jun 2005 00:09:01 -0700
Subject: [PATCH] fix for prune_icache()/forced final iput() races

Based on analysis and a patch from Russ Weight <rweight@us.ibm.com>

There is a race condition that can occur if an inode is allocated and then
released (using iput) during the ->fill_super functions.  The race
condition is between kswapd and mount.

For most filesystems this can only happen in an error path when kswapd is
running concurrently.  For isofs, however, the error can occur in a more
common code path (which is how the bug was found).

The logic here is "we want final iput() to free inode *now* instead of
letting it sit in cache if fs is going down or had not quite come up".  The
problem is with kswapd seeing such inodes in the middle of being killed and
happily taking over.

The clean solution would be to tell kswapd to leave those inodes alone and
let our final iput deal with them.  I.e.  add a new flag
(I_FORCED_FREEING), set it before write_inode_now() there and make
prune_icache() leave those alone.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e5a8db00df29..3622e952e98c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1025,6 +1025,7 @@ struct super_operations {
 #define I_FREEING		16
 #define I_CLEAR			32
 #define I_NEW			64
+#define I_WILL_FREE		128
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
-- 
cgit v1.2.3


From 543537bd922692bc978e2e356fcd8bfc9c2ee7d5 Mon Sep 17 00:00:00 2001
From: Paulo Marques <pmarques@grupopie.com>
Date: Thu, 23 Jun 2005 00:09:02 -0700
Subject: [PATCH] create a kstrdup library function

This patch creates a new kstrdup library function and changes the "local"
implementations in several places to use this function.

Most of the changes come from the sound and net subsystems.  The sound part
had already been acknowledged by Takashi Iwai and the net part by David S.
Miller.

I left UML alone for now because I would need more time to read the code
carefully before making changes there.

Signed-off-by: Paulo Marques <pmarques@grupopie.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/netdevice.h | 4 ----
 include/linux/string.h    | 2 ++
 include/sound/core.h      | 3 ++-
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d6afd440cf7b..d89816ad642f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -925,10 +925,6 @@ extern int skb_checksum_help(struct sk_buff *skb, int inward);
 extern void		net_enable_timestamp(void);
 extern void		net_disable_timestamp(void);
 
-#ifdef CONFIG_SYSCTL
-extern char *net_sysctl_strdup(const char *s);
-#endif
-
 #endif /* __KERNEL__ */
 
 #endif	/* _LINUX_DEV_H */
diff --git a/include/linux/string.h b/include/linux/string.h
index b9fc59469956..93994c613095 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -88,6 +88,8 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
 #endif
 
+extern char *kstrdup(const char *s, int gfp);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/sound/core.h b/include/sound/core.h
index 9117c23e3a01..f8c4ef0aa352 100644
--- a/include/sound/core.h
+++ b/include/sound/core.h
@@ -292,6 +292,7 @@ void *snd_hidden_kcalloc(size_t n, size_t size, int flags);
 void snd_hidden_kfree(const void *obj);
 void *snd_hidden_vmalloc(unsigned long size);
 void snd_hidden_vfree(void *obj);
+char *snd_hidden_kstrdup(const char *s, int flags);
 #define kmalloc(size, flags) snd_hidden_kmalloc(size, flags)
 #define kcalloc(n, size, flags) snd_hidden_kcalloc(n, size, flags)
 #define kfree(obj) snd_hidden_kfree(obj)
@@ -301,6 +302,7 @@ void snd_hidden_vfree(void *obj);
 #define vmalloc_nocheck(size) snd_wrapper_vmalloc(size)
 #define kfree_nocheck(obj) snd_wrapper_kfree(obj)
 #define vfree_nocheck(obj) snd_wrapper_vfree(obj)
+#define kstrdup(s, flags)  snd_hidden_kstrdup(s, flags)
 #else
 #define snd_memory_init() /*NOP*/
 #define snd_memory_done() /*NOP*/
@@ -311,7 +313,6 @@ void snd_hidden_vfree(void *obj);
 #define kfree_nocheck(obj) kfree(obj)
 #define vfree_nocheck(obj) vfree(obj)
 #endif
-char *snd_kmalloc_strdup(const char *string, int flags);
 int copy_to_user_fromio(void __user *dst, const volatile void __iomem *src, size_t count);
 int copy_from_user_toio(volatile void __iomem *dst, const void __user *src, size_t count);
 
-- 
cgit v1.2.3


From 35a82d1a53e1a9ad54efafcc940f9335beaed5c3 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Thu, 23 Jun 2005 00:09:06 -0700
Subject: [PATCH] optimise loop driver a bit

Looks like locking can be optimised quite a lot.  Increase lock widths
slightly so lo_lock is taken fewer times per request.  Also it was quite
trivial to cover lo_pending with that lock, and remove the atomic
requirement.  This also makes memory ordering explicitly correct, which is
nice (not that I particularly saw any mem ordering bugs).

Test was reading 4 250MB files in parallel on ext2-on-tmpfs filesystem (1K
block size, 4K page size).  System is 2 socket Xeon with HT (4 thread).

intel:/home/npiggin# umount /dev/loop0 ; mount /dev/loop0 /mnt/loop ; /usr/bin/time ./mtloop.sh

Before:
0.24user 5.51system 0:02.84elapsed 202%CPU (0avgtext+0avgdata 0maxresident)k
0.19user 5.52system 0:02.88elapsed 198%CPU (0avgtext+0avgdata 0maxresident)k
0.19user 5.57system 0:02.89elapsed 198%CPU (0avgtext+0avgdata 0maxresident)k
0.22user 5.51system 0:02.90elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
0.19user 5.44system 0:02.91elapsed 193%CPU (0avgtext+0avgdata 0maxresident)k

After:
0.07user 2.34system 0:01.68elapsed 143%CPU (0avgtext+0avgdata 0maxresident)k
0.06user 2.37system 0:01.68elapsed 144%CPU (0avgtext+0avgdata 0maxresident)k
0.06user 2.39system 0:01.68elapsed 145%CPU (0avgtext+0avgdata 0maxresident)k
0.06user 2.36system 0:01.68elapsed 144%CPU (0avgtext+0avgdata 0maxresident)k
0.06user 2.42system 0:01.68elapsed 147%CPU (0avgtext+0avgdata 0maxresident)k

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/loop.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/loop.h b/include/linux/loop.h
index 8220d9c9da00..53fa51595443 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -61,7 +61,7 @@ struct loop_device {
 	struct semaphore	lo_sem;
 	struct semaphore	lo_ctl_mutex;
 	struct semaphore	lo_bh_mutex;
-	atomic_t		lo_pending;
+	int			lo_pending;
 
 	request_queue_t		*lo_queue;
 };
-- 
cgit v1.2.3


From dcd497f99a1ef29a7c5e76142965be77e9dacabd Mon Sep 17 00:00:00 2001
From: Jesper Juhl <juhl-lkml@dif.dk>
Date: Thu, 23 Jun 2005 00:09:07 -0700
Subject: [PATCH] streamline preempt_count type across archs

The preempt_count member of struct thread_info is currently either defined
as int, unsigned int or __s32 depending on arch.  This patch makes the type
of preempt_count an int on all archs.

Having preempt_count be an unsigned type prevents the catching of
preempt_count < 0 bugs, and using int on some archs and __s32 on others is
not exactely "neat" - much nicer when it's just int all over.

A previous version of this patch was already ACK'ed by Robert Love, and the
only change in this version of the patch compared to the one he ACK'ed is
that this one also makes sure the preempt_count member is consistently
commented.

Signed-off-by: Jesper Juhl <juhl-lkml@dif.dk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-arm/thread_info.h       | 2 +-
 include/asm-arm26/thread_info.h     | 2 +-
 include/asm-cris/thread_info.h      | 2 +-
 include/asm-frv/thread_info.h       | 2 +-
 include/asm-h8300/thread_info.h     | 2 +-
 include/asm-i386/thread_info.h      | 2 +-
 include/asm-ia64/thread_info.h      | 2 +-
 include/asm-m32r/thread_info.h      | 2 +-
 include/asm-m68k/thread_info.h      | 2 +-
 include/asm-m68knommu/thread_info.h | 2 +-
 include/asm-mips/thread_info.h      | 2 +-
 include/asm-parisc/thread_info.h    | 2 +-
 include/asm-ppc/thread_info.h       | 3 ++-
 include/asm-ppc64/thread_info.h     | 2 +-
 include/asm-s390/thread_info.h      | 2 +-
 include/asm-sh/thread_info.h        | 2 +-
 include/asm-sh64/thread_info.h      | 2 +-
 include/asm-sparc/thread_info.h     | 4 ++--
 include/asm-sparc64/thread_info.h   | 2 +-
 include/asm-um/thread_info.h        | 2 +-
 include/asm-v850/thread_info.h      | 3 ++-
 include/asm-x86_64/thread_info.h    | 2 +-
 22 files changed, 25 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/asm-arm/thread_info.h b/include/asm-arm/thread_info.h
index 66c585c50cf9..8252a4cd860f 100644
--- a/include/asm-arm/thread_info.h
+++ b/include/asm-arm/thread_info.h
@@ -49,7 +49,7 @@ struct cpu_context_save {
  */
 struct thread_info {
 	unsigned long		flags;		/* low level flags */
-	__s32			preempt_count;	/* 0 => preemptable, <0 => bug */
+	int			preempt_count;	/* 0 => preemptable, <0 => bug */
 	mm_segment_t		addr_limit;	/* address limit */
 	struct task_struct	*task;		/* main task structure */
 	struct exec_domain	*exec_domain;	/* execution domain */
diff --git a/include/asm-arm26/thread_info.h b/include/asm-arm26/thread_info.h
index 50f41b50268a..aff3e5699c64 100644
--- a/include/asm-arm26/thread_info.h
+++ b/include/asm-arm26/thread_info.h
@@ -44,7 +44,7 @@ struct cpu_context_save {
  */
 struct thread_info {
 	unsigned long		flags;		/* low level flags */
-	__s32			preempt_count;	/* 0 => preemptable, <0 => bug */
+	int			preempt_count;	/* 0 => preemptable, <0 => bug */
 	mm_segment_t		addr_limit;	/* address limit */
 	struct task_struct	*task;		/* main task structure */
 	struct exec_domain      *exec_domain;   /* execution domain */
diff --git a/include/asm-cris/thread_info.h b/include/asm-cris/thread_info.h
index 53193feb0826..5ba4b7865cc5 100644
--- a/include/asm-cris/thread_info.h
+++ b/include/asm-cris/thread_info.h
@@ -31,7 +31,7 @@ struct thread_info {
 	struct exec_domain	*exec_domain;	/* execution domain */
 	unsigned long		flags;		/* low level flags */
 	__u32			cpu;		/* current CPU */
-	__s32			preempt_count; /* 0 => preemptable, <0 => BUG */
+	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 
 	mm_segment_t		addr_limit;	/* thread address space:
 					 	   0-0xBFFFFFFF for user-thead
diff --git a/include/asm-frv/thread_info.h b/include/asm-frv/thread_info.h
index b80a97f50af6..c8cba7836f0d 100644
--- a/include/asm-frv/thread_info.h
+++ b/include/asm-frv/thread_info.h
@@ -33,7 +33,7 @@ struct thread_info {
 	unsigned long		flags;		/* low level flags */
 	unsigned long		status;		/* thread-synchronous flags */
 	__u32			cpu;		/* current CPU */
-	__s32			preempt_count;	/* 0 => preemptable, <0 => BUG */
+	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 
 	mm_segment_t		addr_limit;	/* thread address space:
 					 	   0-0xBFFFFFFF for user-thead
diff --git a/include/asm-h8300/thread_info.h b/include/asm-h8300/thread_info.h
index b07c9344776f..bfcc755c3bb1 100644
--- a/include/asm-h8300/thread_info.h
+++ b/include/asm-h8300/thread_info.h
@@ -23,7 +23,7 @@ struct thread_info {
 	struct exec_domain *exec_domain;	/* execution domain */
 	unsigned long	   flags;		/* low level flags */
 	int		   cpu;			/* cpu we're on */
-	int		   preempt_count;	/* 0 => preemptable, <0 => BUG*/
+	int		   preempt_count;	/* 0 => preemptable, <0 => BUG */
 	struct restart_block restart_block;
 };
 
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index 2cd57271801d..95add81237ea 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -31,7 +31,7 @@ struct thread_info {
 	unsigned long		flags;		/* low level flags */
 	unsigned long		status;		/* thread-synchronous flags */
 	__u32			cpu;		/* current CPU */
-	__s32			preempt_count; /* 0 => preemptable, <0 => BUG */
+	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 
 
 	mm_segment_t		addr_limit;	/* thread address space:
diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h
index 8d5b7e77028c..7dc8951708a3 100644
--- a/include/asm-ia64/thread_info.h
+++ b/include/asm-ia64/thread_info.h
@@ -25,7 +25,7 @@ struct thread_info {
 	__u32 flags;			/* thread_info flags (see TIF_*) */
 	__u32 cpu;			/* current CPU */
 	mm_segment_t addr_limit;	/* user-level address space limit */
-	__s32 preempt_count;		/* 0=premptable, <0=BUG; will also serve as bh-counter */
+	int preempt_count;		/* 0=premptable, <0=BUG; will also serve as bh-counter */
 	struct restart_block restart_block;
 	struct {
 		int signo;
diff --git a/include/asm-m32r/thread_info.h b/include/asm-m32r/thread_info.h
index 9f3a0fcf6e2b..7a6be7727a92 100644
--- a/include/asm-m32r/thread_info.h
+++ b/include/asm-m32r/thread_info.h
@@ -28,7 +28,7 @@ struct thread_info {
 	unsigned long		flags;		/* low level flags */
 	unsigned long		status;		/* thread-synchronous flags */
 	__u32			cpu;		/* current CPU */
-	__s32			preempt_count; /* 0 => preemptable, <0 => BUG */
+	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 
 	mm_segment_t		addr_limit;	/* thread address space:
 					 	   0-0xBFFFFFFF for user-thread
diff --git a/include/asm-m68k/thread_info.h b/include/asm-m68k/thread_info.h
index 5f58939c59db..2aed24f6fd2e 100644
--- a/include/asm-m68k/thread_info.h
+++ b/include/asm-m68k/thread_info.h
@@ -8,7 +8,7 @@
 struct thread_info {
 	struct task_struct	*task;		/* main task structure */
 	struct exec_domain	*exec_domain;	/* execution domain */
-	__s32			preempt_count; /* 0 => preemptable, <0 => BUG */
+	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 	__u32 cpu; /* should always be 0 on m68k */
 	struct restart_block    restart_block;
 
diff --git a/include/asm-m68knommu/thread_info.h b/include/asm-m68knommu/thread_info.h
index c8153b7c1f5c..7b9a3fa3af5d 100644
--- a/include/asm-m68knommu/thread_info.h
+++ b/include/asm-m68knommu/thread_info.h
@@ -36,7 +36,7 @@ struct thread_info {
 	struct exec_domain *exec_domain;	/* execution domain */
 	unsigned long	   flags;		/* low level flags */
 	int		   cpu;			/* cpu we're on */
-	int		   preempt_count;	/* 0 => preemptable, <0 => BUG*/
+	int		   preempt_count;	/* 0 => preemptable, <0 => BUG */
 	struct restart_block restart_block;
 };
 
diff --git a/include/asm-mips/thread_info.h b/include/asm-mips/thread_info.h
index 768900305e2f..42fcd6f2c206 100644
--- a/include/asm-mips/thread_info.h
+++ b/include/asm-mips/thread_info.h
@@ -27,7 +27,7 @@ struct thread_info {
 	struct exec_domain	*exec_domain;	/* execution domain */
 	unsigned long		flags;		/* low level flags */
 	__u32			cpu;		/* current CPU */
-	__s32			preempt_count; /* 0 => preemptable, <0 => BUG */
+	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 
 	mm_segment_t		addr_limit;	/* thread address space:
 					 	   0-0xBFFFFFFF for user-thead
diff --git a/include/asm-parisc/thread_info.h b/include/asm-parisc/thread_info.h
index fe9b7f8ae4c6..57bbb76cb6c1 100644
--- a/include/asm-parisc/thread_info.h
+++ b/include/asm-parisc/thread_info.h
@@ -12,7 +12,7 @@ struct thread_info {
 	unsigned long flags;		/* thread_info flags (see TIF_*) */
 	mm_segment_t addr_limit;	/* user-level address space limit */
 	__u32 cpu;			/* current CPU */
-	__s32 preempt_count;		/* 0=premptable, <0=BUG; will also serve as bh-counter */
+	int preempt_count;		/* 0=premptable, <0=BUG; will also serve as bh-counter */
 	struct restart_block restart_block;
 };
 
diff --git a/include/asm-ppc/thread_info.h b/include/asm-ppc/thread_info.h
index e3b5284a6f91..27903db42efc 100644
--- a/include/asm-ppc/thread_info.h
+++ b/include/asm-ppc/thread_info.h
@@ -20,7 +20,8 @@ struct thread_info {
 	unsigned long		flags;		/* low level flags */
 	unsigned long		local_flags;	/* non-racy flags */
 	int			cpu;		/* cpu we're on */
-	int			preempt_count;
+	int			preempt_count;	/* 0 => preemptable,
+						   <0 => BUG */
 	struct restart_block	restart_block;
 };
 
diff --git a/include/asm-ppc64/thread_info.h b/include/asm-ppc64/thread_info.h
index 48b7900e90ec..0494df6fca74 100644
--- a/include/asm-ppc64/thread_info.h
+++ b/include/asm-ppc64/thread_info.h
@@ -24,7 +24,7 @@ struct thread_info {
 	struct task_struct *task;		/* main task structure */
 	struct exec_domain *exec_domain;	/* execution domain */
 	int		cpu;			/* cpu we're on */
-	int		preempt_count;
+	int		preempt_count;		/* 0 => preemptable, <0 => BUG */
 	struct restart_block restart_block;
 	/* set by force_successful_syscall_return */
 	unsigned char	syscall_noerror;
diff --git a/include/asm-s390/thread_info.h b/include/asm-s390/thread_info.h
index aade85c53a63..fe101d41e849 100644
--- a/include/asm-s390/thread_info.h
+++ b/include/asm-s390/thread_info.h
@@ -50,7 +50,7 @@ struct thread_info {
 	struct exec_domain	*exec_domain;	/* execution domain */
 	unsigned long		flags;		/* low level flags */
 	unsigned int		cpu;		/* current CPU */
-	unsigned int		preempt_count; /* 0 => preemptable */
+	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 	struct restart_block	restart_block;
 };
 
diff --git a/include/asm-sh/thread_info.h b/include/asm-sh/thread_info.h
index 4bbbd9f3c37e..46080cefaff8 100644
--- a/include/asm-sh/thread_info.h
+++ b/include/asm-sh/thread_info.h
@@ -20,7 +20,7 @@ struct thread_info {
 	struct exec_domain	*exec_domain;	/* execution domain */
 	__u32			flags;		/* low level flags */
 	__u32			cpu;
-	__s32			preempt_count; /* 0 => preemptable, <0 => BUG */
+	int			preempt_count; /* 0 => preemptable, <0 => BUG */
 	struct restart_block	restart_block;
 	__u8			supervisor_stack[0];
 };
diff --git a/include/asm-sh64/thread_info.h b/include/asm-sh64/thread_info.h
index 8a32d6bd0b79..10f024c6a2e3 100644
--- a/include/asm-sh64/thread_info.h
+++ b/include/asm-sh64/thread_info.h
@@ -22,7 +22,7 @@ struct thread_info {
 	struct exec_domain	*exec_domain;	/* execution domain */
 	unsigned long		flags;		/* low level flags */
 	/* Put the 4 32-bit fields together to make asm offsetting easier. */
-	__s32			preempt_count; /* 0 => preemptable, <0 => BUG */
+	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 	__u16			cpu;
 
 	mm_segment_t		addr_limit;
diff --git a/include/asm-sparc/thread_info.h b/include/asm-sparc/thread_info.h
index 104f03c55416..ff6ccb3d24c6 100644
--- a/include/asm-sparc/thread_info.h
+++ b/include/asm-sparc/thread_info.h
@@ -30,9 +30,9 @@ struct thread_info {
 	struct task_struct	*task;		/* main task structure */
 	struct exec_domain	*exec_domain;	/* execution domain */
 	unsigned long		flags;		/* low level flags */
-
 	int			cpu;		/* cpu we're on */
-	int			preempt_count;
+	int			preempt_count;	/* 0 => preemptable,
+						   <0 => BUG */
 	int			softirq_count;
 	int			hardirq_count;
 
diff --git a/include/asm-sparc64/thread_info.h b/include/asm-sparc64/thread_info.h
index 517caaba1c87..0cd652956929 100644
--- a/include/asm-sparc64/thread_info.h
+++ b/include/asm-sparc64/thread_info.h
@@ -46,7 +46,7 @@ struct thread_info {
 	unsigned long		fault_address;
 	struct pt_regs		*kregs;
 	struct exec_domain	*exec_domain;
-	int			preempt_count;
+	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 	int			__pad;
 
 	unsigned long		*utraps;
diff --git a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h
index 1feaaf148ef1..97267f059ef5 100644
--- a/include/asm-um/thread_info.h
+++ b/include/asm-um/thread_info.h
@@ -17,7 +17,7 @@ struct thread_info {
 	struct exec_domain	*exec_domain;	/* execution domain */
 	unsigned long		flags;		/* low level flags */
 	__u32			cpu;		/* current CPU */
-	__s32			preempt_count;  /* 0 => preemptable, 
+	int			preempt_count;  /* 0 => preemptable,
 						   <0 => BUG */
 	mm_segment_t		addr_limit;	/* thread address space:
 					 	   0-0xBFFFFFFF for user
diff --git a/include/asm-v850/thread_info.h b/include/asm-v850/thread_info.h
index e2ef44593752..e4cfad94a553 100644
--- a/include/asm-v850/thread_info.h
+++ b/include/asm-v850/thread_info.h
@@ -30,7 +30,8 @@ struct thread_info {
 	struct exec_domain	*exec_domain;	/* execution domain */
 	unsigned long		flags;		/* low level flags */
 	int			cpu;		/* cpu we're on */
-	int			preempt_count;
+	int			preempt_count;	/* 0 => preemptable,
+						   <0 => BUG */
 	struct restart_block	restart_block;
 };
 
diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h
index f4b3b249639c..08eb6e4f3737 100644
--- a/include/asm-x86_64/thread_info.h
+++ b/include/asm-x86_64/thread_info.h
@@ -29,7 +29,7 @@ struct thread_info {
 	__u32			flags;		/* low level flags */
 	__u32			status;		/* thread synchronous flags */
 	__u32			cpu;		/* current CPU */
-	int 			preempt_count;
+	int 			preempt_count;	/* 0 => preemptable, <0 => BUG */
 
 	mm_segment_t		addr_limit;	
 	struct restart_block    restart_block;
-- 
cgit v1.2.3


From ac20427ef6aa63da663bdc88b71d16f7394f5e23 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@redhat.com>
Date: Thu, 23 Jun 2005 00:09:11 -0700
Subject: [PATCH] add check to /proc/devices read routines

Patch to add check to get_chrdev_list and get_blkdev_list to prevent reads
of /proc/devices from spilling over the provided page if more than 4096
bytes of string data are generated from all the registered character and
block devices in a system

Signed-off-by: Neil Horman <nhorman@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <viro@parcelfarce.linux.theplanet.co.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/genhd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index af26dc718ef6..01796c41c951 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -224,7 +224,7 @@ static inline void free_disk_stats(struct gendisk *disk)
 extern void disk_round_stats(struct gendisk *disk);
 
 /* drivers/block/genhd.c */
-extern int get_blkdev_list(char *);
+extern int get_blkdev_list(char *, int);
 extern void add_disk(struct gendisk *disk);
 extern void del_gendisk(struct gendisk *gp);
 extern void unlink_gendisk(struct gendisk *gp);
-- 
cgit v1.2.3


From 84de856ed30c568c2bb7b9ac0679772bd2737d9b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 23 Jun 2005 00:09:16 -0700
Subject: [PATCH] quota: consolidate code surrounding vfs_quota_on_mount

Move some code duplicated in both callers into vfs_quota_on_mount

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jan Kara <jack@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/quotaops.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index e57baa85e744..d211507ab246 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -39,7 +39,8 @@ extern int dquot_commit_info(struct super_block *sb, int type);
 extern int dquot_mark_dquot_dirty(struct dquot *dquot);
 
 extern int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path);
-extern int vfs_quota_on_mount(int type, int format_id, struct dentry *dentry);
+extern int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
+		int format_id, int type);
 extern int vfs_quota_off(struct super_block *sb, int type);
 #define vfs_quota_off_mount(sb, type) vfs_quota_off(sb, type)
 extern int vfs_quota_sync(struct super_block *sb, int type);
-- 
cgit v1.2.3


From b94cce926b2b902b79380ccba370d6f9f2980de0 Mon Sep 17 00:00:00 2001
From: Hien Nguyen <hien@us.ibm.com>
Date: Thu, 23 Jun 2005 00:09:19 -0700
Subject: [PATCH] kprobes: function-return probes

This patch adds function-return probes to kprobes for the i386
architecture.  This enables you to establish a handler to be run when a
function returns.

1. API

Two new functions are added to kprobes:

	int register_kretprobe(struct kretprobe *rp);
	void unregister_kretprobe(struct kretprobe *rp);

2. Registration and unregistration

2.1 Register

  To register a function-return probe, the user populates the following
  fields in a kretprobe object and calls register_kretprobe() with the
  kretprobe address as an argument:

  kp.addr - the function's address

  handler - this function is run after the ret instruction executes, but
  before control returns to the return address in the caller.

  maxactive - The maximum number of instances of the probed function that
  can be active concurrently.  For example, if the function is non-
  recursive and is called with a spinlock or mutex held, maxactive = 1
  should be enough.  If the function is non-recursive and can never
  relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
  be enough.  maxactive is used to determine how many kretprobe_instance
  objects to allocate for this particular probed function.  If maxactive <=
  0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
  NR_CPUS) else maxactive=NR_CPUS)

  For example:

    struct kretprobe rp;
    rp.kp.addr = /* entrypoint address */
    rp.handler = /*return probe handler */
    rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */
    register_kretprobe(&rp);

  The following field may also be of interest:

  nmissed - Initialized to zero when the function-return probe is
  registered, and incremented every time the probed function is entered but
  there is no kretprobe_instance object available for establishing the
  function-return probe (i.e., because maxactive was set too low).

2.2 Unregister

  To unregiter a function-return probe, the user calls
  unregister_kretprobe() with the same kretprobe object as registered
  previously.  If a probed function is running when the return probe is
  unregistered, the function will return as expected, but the handler won't
  be run.

3. Limitations

3.1 This patch supports only the i386 architecture, but patches for
    x86_64 and ppc64 are anticipated soon.

3.2 Return probes operates by replacing the return address in the stack
    (or in a known register, such as the lr register for ppc).  This may
    cause __builtin_return_address(0), when invoked from the return-probed
    function, to return the address of the return-probes trampoline.

3.3 This implementation uses the "Multiprobes at an address" feature in
    2.6.12-rc3-mm3.

3.4 Due to a limitation in multi-probes, you cannot currently establish
    a return probe and a jprobe on the same function.  A patch to remove
    this limitation is being tested.

This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.

Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/kprobes.h |  3 ++
 include/linux/kprobes.h    | 90 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 91 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-i386/kprobes.h b/include/asm-i386/kprobes.h
index 4092f68d123a..8b6d3a90cd78 100644
--- a/include/asm-i386/kprobes.h
+++ b/include/asm-i386/kprobes.h
@@ -39,6 +39,9 @@ typedef u8 kprobe_opcode_t;
 	: (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
 
 #define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
+#define ARCH_SUPPORTS_KRETPROBES
+
+void kretprobe_trampoline(void);
 
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 99ddba5a4e00..fba39f87efec 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -25,21 +25,31 @@
  *		Rusty Russell).
  * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
  *		interface to access function arguments.
+ * 2005-May	Hien Nguyen <hien@us.ibm.com> and Jim Keniston
+ *		<jkenisto@us.ibm.com>  and Prasanna S Panchamukhi
+ *		<prasanna@in.ibm.com> added function-return probes.
  */
 #include <linux/config.h>
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
+#include <linux/spinlock.h>
+
 #include <asm/kprobes.h>
 
 struct kprobe;
 struct pt_regs;
+struct kretprobe;
+struct kretprobe_instance;
 typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *);
 typedef int (*kprobe_break_handler_t) (struct kprobe *, struct pt_regs *);
 typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *,
 				       unsigned long flags);
 typedef int (*kprobe_fault_handler_t) (struct kprobe *, struct pt_regs *,
 				       int trapnr);
+typedef int (*kretprobe_handler_t) (struct kretprobe_instance *,
+				    struct pt_regs *);
+
 struct kprobe {
 	struct hlist_node hlist;
 
@@ -85,6 +95,62 @@ struct jprobe {
 	kprobe_opcode_t *entry;	/* probe handling code to jump to */
 };
 
+#ifdef ARCH_SUPPORTS_KRETPROBES
+extern int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs);
+extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs,
+							unsigned long flags);
+extern struct task_struct *arch_get_kprobe_task(void *ptr);
+extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs);
+extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock);
+#else /* ARCH_SUPPORTS_KRETPROBES */
+static inline void kretprobe_trampoline(void)
+{
+}
+static inline int trampoline_probe_handler(struct kprobe *p,
+						struct pt_regs *regs)
+{
+	return 0;
+}
+static inline void trampoline_post_handler(struct kprobe *p,
+				struct pt_regs *regs, unsigned long flags)
+{
+}
+static inline void arch_prepare_kretprobe(struct kretprobe *rp,
+					struct pt_regs *regs)
+{
+}
+static inline void arch_kprobe_flush_task(struct task_struct *tk)
+{
+}
+#define arch_get_kprobe_task(ptr) ((struct task_struct *)NULL)
+#endif /* ARCH_SUPPORTS_KRETPROBES */
+/*
+ * Function-return probe -
+ * Note:
+ * User needs to provide a handler function, and initialize maxactive.
+ * maxactive - The maximum number of instances of the probed function that
+ * can be active concurrently.
+ * nmissed - tracks the number of times the probed function's return was
+ * ignored, due to maxactive being too low.
+ *
+ */
+struct kretprobe {
+	struct kprobe kp;
+	kretprobe_handler_t handler;
+	int maxactive;
+	int nmissed;
+	struct hlist_head free_instances;
+	struct hlist_head used_instances;
+};
+
+struct kretprobe_instance {
+	struct hlist_node uflist; /* either on free list or used list */
+	struct hlist_node hlist;
+	struct kretprobe *rp;
+	void *ret_addr;
+	void *stack_addr;
+};
+
 #ifdef CONFIG_KPROBES
 /* Locks kprobe: irq must be disabled */
 void lock_kprobes(void);
@@ -104,6 +170,7 @@ extern void show_registers(struct pt_regs *regs);
 
 /* Get the kprobe at this addr (if any).  Must have called lock_kprobes */
 struct kprobe *get_kprobe(void *addr);
+struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk);
 
 int register_kprobe(struct kprobe *p);
 void unregister_kprobe(struct kprobe *p);
@@ -113,7 +180,16 @@ int register_jprobe(struct jprobe *p);
 void unregister_jprobe(struct jprobe *p);
 void jprobe_return(void);
 
-#else
+int register_kretprobe(struct kretprobe *rp);
+void unregister_kretprobe(struct kretprobe *rp);
+
+struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp);
+struct kretprobe_instance *get_rp_inst(void *sara);
+struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk);
+void add_rp_inst(struct kretprobe_instance *ri);
+void kprobe_flush_task(struct task_struct *tk);
+void recycle_rp_inst(struct kretprobe_instance *ri);
+#else /* CONFIG_KPROBES */
 static inline int kprobe_running(void)
 {
 	return 0;
@@ -135,5 +211,15 @@ static inline void unregister_jprobe(struct jprobe *p)
 static inline void jprobe_return(void)
 {
 }
-#endif
+static inline int register_kretprobe(struct kretprobe *rp)
+{
+	return -ENOSYS;
+}
+static inline void unregister_kretprobe(struct kretprobe *rp)
+{
+}
+static inline void kprobe_flush_task(struct task_struct *tk)
+{
+}
+#endif				/* CONFIG_KPROBES */
 #endif				/* _LINUX_KPROBES_H */
-- 
cgit v1.2.3


From 73649dab0fd524cb8545a8cb83c6eaf77b107105 Mon Sep 17 00:00:00 2001
From: Rusty Lynch <rusty.lynch@intel.com>
Date: Thu, 23 Jun 2005 00:09:23 -0700
Subject: [PATCH] x86_64 specific function return probes

The following patch adds the x86_64 architecture specific implementation
for function return probes.

Function return probes is a mechanism built on top of kprobes that allows
a caller to register a handler to be called when a given function exits.
For example, to instrument the return path of sys_mkdir:

static int sys_mkdir_exit(struct kretprobe_instance *i, struct pt_regs *regs)
{
	printk("sys_mkdir exited\n");
	return 0;
}
static struct kretprobe return_probe = {
	.handler = sys_mkdir_exit,
};

<inside setup function>

return_probe.kp.addr = (kprobe_opcode_t *) kallsyms_lookup_name("sys_mkdir");
if (register_kretprobe(&return_probe)) {
	printk(KERN_DEBUG "Unable to register return probe!\n");
	/* do error path */
}

<inside cleanup function>
unregister_kretprobe(&return_probe);

The way this works is that:

* At system initialization time, kernel/kprobes.c installs a kprobe
  on a function called kretprobe_trampoline() that is implemented in
  the arch/x86_64/kernel/kprobes.c  (More on this later)

* When a return probe is registered using register_kretprobe(),
  kernel/kprobes.c will install a kprobe on the first instruction of the
  targeted function with the pre handler set to arch_prepare_kretprobe()
  which is implemented in arch/x86_64/kernel/kprobes.c.

* arch_prepare_kretprobe() will prepare a kretprobe instance that stores:
  - nodes for hanging this instance in an empty or free list
  - a pointer to the return probe
  - the original return address
  - a pointer to the stack address

  With all this stowed away, arch_prepare_kretprobe() then sets the return
  address for the targeted function to a special trampoline function called
  kretprobe_trampoline() implemented in arch/x86_64/kernel/kprobes.c

* The kprobe completes as normal, with control passing back to the target
  function that executes as normal, and eventually returns to our trampoline
  function.

* Since a kprobe was installed on kretprobe_trampoline() during system
  initialization, control passes back to kprobes via the architecture
  specific function trampoline_probe_handler() which will lookup the
  instance in an hlist maintained by kernel/kprobes.c, and then call
  the handler function.

* When trampoline_probe_handler() is done, the kprobes infrastructure
  single steps the original instruction (in this case just a top), and
  then calls trampoline_post_handler().  trampoline_post_handler() then
  looks up the instance again, puts the instance back on the free list,
  and then makes a long jump back to the original return instruction.

So to recap, to instrument the exit path of a function this implementation
will cause four interruptions:

  - A breakpoint at the very beginning of the function allowing us to
    switch out the return address
  - A single step interruption to execute the original instruction that
    we replaced with the break instruction (normal kprobe flow)
  - A breakpoint in the trampoline function where our instrumented function
    returned to
  - A single step interruption to execute the original instruction that
    we replaced with the break instruction (normal kprobe flow)

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/kprobes.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/asm-x86_64/kprobes.h b/include/asm-x86_64/kprobes.h
index bfea52d516f8..6d6d883fdf6d 100644
--- a/include/asm-x86_64/kprobes.h
+++ b/include/asm-x86_64/kprobes.h
@@ -38,6 +38,9 @@ typedef u8 kprobe_opcode_t;
 	: (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
 
 #define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
+#define ARCH_SUPPORTS_KRETPROBES
+
+void kretprobe_trampoline(void);
 
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
-- 
cgit v1.2.3


From 7e1048b11c5afe79aac46a42e3ccec86b8365c6d Mon Sep 17 00:00:00 2001
From: Rusty Lynch <rusty.lynch@intel.com>
Date: Thu, 23 Jun 2005 00:09:25 -0700
Subject: [PATCH] Move kprobe [dis]arming into arch specific code

The architecture independent code of the current kprobes implementation is
arming and disarming kprobes at registration time.  The problem is that the
code is assuming that arming and disarming is a just done by a simple write
of some magic value to an address.  This is problematic for ia64 where our
instructions look more like structures, and we can not insert break points
by just doing something like:

*p->addr = BREAKPOINT_INSTRUCTION;

The following patch to 2.6.12-rc4-mm2 adds two new architecture dependent
functions:

     * void arch_arm_kprobe(struct kprobe *p)
     * void arch_disarm_kprobe(struct kprobe *p)

and then adds the new functions for each of the architectures that already
implement kprobes (spar64/ppc64/i386/x86_64).

I thought arch_[dis]arm_kprobe was the most descriptive of what was really
happening, but each of the architectures already had a disarm_kprobe()
function that was really a "disarm and do some other clean-up items as
needed when you stumble across a recursive kprobe." So...  I took the
liberty of changing the code that was calling disarm_kprobe() to call
arch_disarm_kprobe(), and then do the cleanup in the block of code dealing
with the recursive kprobe case.

So far this patch as been tested on i386, x86_64, and ppc64, but still
needs to be tested in sparc64.

Signed-off-by: Rusty Lynch <rusty.lynch@intel.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kprobes.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index fba39f87efec..0f90466fb8b0 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -165,6 +165,8 @@ static inline int kprobe_running(void)
 
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_copy_kprobe(struct kprobe *p);
+extern void arch_arm_kprobe(struct kprobe *p);
+extern void arch_disarm_kprobe(struct kprobe *p);
 extern void arch_remove_kprobe(struct kprobe *p);
 extern void show_registers(struct pt_regs *regs);
 
-- 
cgit v1.2.3


From 0aa55e4d7db822059fe8132fe9f2b7773c48216c Mon Sep 17 00:00:00 2001
From: Hien Nguyen <hien@us.ibm.com>
Date: Thu, 23 Jun 2005 00:09:26 -0700
Subject: [PATCH] kprobes: moves lock-unlock to non-arch kprobe_flush_task

This patch moves the lock/unlock of the arch specific kprobe_flush_task()
to the non-arch specific kprobe_flusk_task().

Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Acked-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kprobes.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 0f90466fb8b0..461391decc46 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -33,7 +33,6 @@
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
-#include <linux/spinlock.h>
 
 #include <asm/kprobes.h>
 
@@ -101,7 +100,7 @@ extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs,
 							unsigned long flags);
 extern struct task_struct *arch_get_kprobe_task(void *ptr);
 extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs);
-extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock);
+extern void arch_kprobe_flush_task(struct task_struct *tk);
 #else /* ARCH_SUPPORTS_KRETPROBES */
 static inline void kretprobe_trampoline(void)
 {
-- 
cgit v1.2.3


From 7213b2521889eb087eed8abaa48d1a692575da3e Mon Sep 17 00:00:00 2001
From: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Date: Thu, 23 Jun 2005 00:09:27 -0700
Subject: [PATCH] Kprobes/IA64: kdebug die notification mechanism

As many of you know that kprobes exist in the main line kernel for various
architecture including i386, x86_64, ppc64 and sparc64.  Attached patches
following this mail are a port of Kprobes and Jprobes for IA64.

I have tesed this patches for kprobes and Jprobes and this seems to work fine.
 I have tested this patch by inserting kprobes on various slots and various
templates including various types of branch instructions.

I have also tested this patch using the tool
http://marc.theaimsgroup.com/?l=linux-kernel&m=111657358022586&w=2 and the
kprobes for IA64 works great.

Here is list of TODO things and pathes for the same will appear soon.

1) Support kprobes on "mov r1=ip" type of instruction
2) Support Kprobes and Jprobes to exist on the same address
3) Support Return probes
3) Architecture independent cleanup of kprobes

This patch adds the kdebug die notification mechanism needed by Kprobes.

For break instruction on Branch type slot, imm21 is ignored and value
zero is placed in IIM register, hence we need to handle kprobes
for switch case zero.

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Rusty Lynch <Rusty.lynch@intel.com>

From: Rusty Lynch <rusty.lynch@intel.com>

At the point in traps.c where we recieve a break with a zero value, we can
not say if the break was a result of a kprobe or some other debug facility.

This simple patch changes the informational string to a more correct "break
0" value, and applies to the 2.6.12-rc2-mm2 tree with all the kprobes
patches that were just recently included for the next mm cut.
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/break.h  |  2 ++
 include/asm-ia64/kdebug.h | 61 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)
 create mode 100644 include/asm-ia64/kdebug.h

(limited to 'include')

diff --git a/include/asm-ia64/break.h b/include/asm-ia64/break.h
index 97c7b2d79600..8167828edc4b 100644
--- a/include/asm-ia64/break.h
+++ b/include/asm-ia64/break.h
@@ -12,6 +12,8 @@
  * OS-specific debug break numbers:
  */
 #define __IA64_BREAK_KDB		0x80100
+#define __IA64_BREAK_KPROBE		0x80200
+#define __IA64_BREAK_JPROBE		0x80300
 
 /*
  * OS-specific break numbers:
diff --git a/include/asm-ia64/kdebug.h b/include/asm-ia64/kdebug.h
new file mode 100644
index 000000000000..4d376e1663f7
--- /dev/null
+++ b/include/asm-ia64/kdebug.h
@@ -0,0 +1,61 @@
+#ifndef _IA64_KDEBUG_H
+#define _IA64_KDEBUG_H 1
+/*
+ * include/asm-ia64/kdebug.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) Intel Corporation, 2005
+ *
+ * 2005-Apr     Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy
+ *              <anil.s.keshavamurthy@intel.com> adopted from
+ *              include/asm-x86_64/kdebug.h
+ */
+#include <linux/notifier.h>
+
+struct pt_regs;
+
+struct die_args {
+	struct pt_regs *regs;
+	const char *str;
+	long err;
+	int trapnr;
+	int signr;
+};
+
+int register_die_notifier(struct notifier_block *nb);
+extern struct notifier_block *ia64die_chain;
+
+enum die_val {
+	DIE_BREAK = 1,
+	DIE_SS,
+	DIE_PAGE_FAULT,
+};
+
+static inline int notify_die(enum die_val val, char *str, struct pt_regs *regs,
+			     long err, int trap, int sig)
+{
+	struct die_args args = {
+		.regs   = regs,
+		.str    = str,
+		.err    = err,
+		.trapnr = trap,
+		.signr  = sig
+	};
+
+	return notifier_call_chain(&ia64die_chain, val, &args);
+}
+
+#endif
-- 
cgit v1.2.3


From fd7b231ff98578308d8f5fb76a25a369ce1074ae Mon Sep 17 00:00:00 2001
From: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Date: Thu, 23 Jun 2005 00:09:28 -0700
Subject: [PATCH] Kprobes/IA64: arch specific handling

This is an IA64 arch specific handling of Kprobes

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Rusty Lynch <Rusty.lynch@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/kprobes.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 include/asm-ia64/kprobes.h

(limited to 'include')

diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
new file mode 100644
index 000000000000..2a6f2a148890
--- /dev/null
+++ b/include/asm-ia64/kprobes.h
@@ -0,0 +1,72 @@
+#ifndef _ASM_KPROBES_H
+#define _ASM_KPROBES_H
+/*
+ *  Kernel Probes (KProbes)
+ *  include/asm-ia64/kprobes.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) Intel Corporation, 2005
+ *
+ * 2005-Apr     Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy
+ *              <anil.s.keshavamurthy@intel.com> adapted from i386
+ */
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <asm/break.h>
+
+#define BREAK_INST	(long)(__IA64_BREAK_KPROBE << 6)
+
+typedef struct _bundle {
+	struct {
+		unsigned long long template : 5;
+		unsigned long long slot0 : 41;
+		unsigned long long slot1_p0 : 64-46;
+	} quad0;
+	struct {
+		unsigned long long slot1_p1 : 41 - (64-46);
+		unsigned long long slot2 : 41;
+	} quad1;
+} __attribute__((__aligned__(16)))  bundle_t;
+
+#define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
+
+typedef struct kprobe_opcode {
+	bundle_t bundle;
+} kprobe_opcode_t;
+
+struct fnptr {
+	unsigned long ip;
+	unsigned long gp;
+};
+
+/* Architecture specific copy of original instruction*/
+struct arch_specific_insn {
+	/* copy of the original instruction */
+	kprobe_opcode_t insn;
+};
+
+#ifdef CONFIG_KPROBES
+extern int kprobe_exceptions_notify(struct notifier_block *self,
+				    unsigned long val, void *data);
+#else				/* !CONFIG_KPROBES */
+static inline int kprobe_exceptions_notify(struct notifier_block *self,
+					   unsigned long val, void *data)
+{
+	return 0;
+}
+#endif
+#endif				/* _ASM_KPROBES_H */
-- 
cgit v1.2.3


From b2761dc262b428475890fffd979687051beb12ba Mon Sep 17 00:00:00 2001
From: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Date: Thu, 23 Jun 2005 00:09:28 -0700
Subject: [PATCH] Kprobes/IA64: architecture specific JProbes support

This patch adds IA64 architecture specific JProbes support on top of Kprobes

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Rusty Lynch <Rusty.lynch@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/kprobes.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index 2a6f2a148890..fec3506e53f8 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -59,6 +59,11 @@ struct arch_specific_insn {
 	kprobe_opcode_t insn;
 };
 
+/* ia64 does not need this */
+static inline void jprobe_return(void)
+{
+}
+
 #ifdef CONFIG_KPROBES
 extern int kprobe_exceptions_notify(struct notifier_block *self,
 				    unsigned long val, void *data);
-- 
cgit v1.2.3


From cd2675bf65455a45b54228b7acc0c6a26a164cb6 Mon Sep 17 00:00:00 2001
From: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Date: Thu, 23 Jun 2005 00:09:29 -0700
Subject: [PATCH] Kprobes/IA64: support kprobe on branch/call instructions

This patch is required to support kprobe on branch/call instructions.

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/kprobes.h | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index fec3506e53f8..d30af77a0b11 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -44,6 +44,17 @@ typedef struct _bundle {
 
 #define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
 
+#define SLOT0_OPCODE_SHIFT	(37)
+#define SLOT1_p1_OPCODE_SHIFT	(37 - (64-46))
+#define SLOT2_OPCODE_SHIFT 	(37)
+
+#define INDIRECT_CALL_OPCODE		(1)
+#define IP_RELATIVE_CALL_OPCODE		(5)
+#define IP_RELATIVE_BRANCH_OPCODE	(4)
+#define IP_RELATIVE_PREDICT_OPCODE	(7)
+#define LONG_BRANCH_OPCODE		(0xC)
+#define LONG_CALL_OPCODE		(0xD)
+
 typedef struct kprobe_opcode {
 	bundle_t bundle;
 } kprobe_opcode_t;
@@ -55,8 +66,12 @@ struct fnptr {
 
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
-	/* copy of the original instruction */
+	/* copy of the instruction to be emulated */
 	kprobe_opcode_t insn;
+ #define INST_FLAG_FIX_RELATIVE_IP_ADDR		1
+ #define INST_FLAG_FIX_BRANCH_REG		2
+ 	unsigned long inst_flag;
+ 	unsigned short target_br_reg;
 };
 
 /* ia64 does not need this */
-- 
cgit v1.2.3


From 8bc76772ad653bcaad1b0af72aafb6072ef0fa87 Mon Sep 17 00:00:00 2001
From: Rusty Lynch <rusty.lynch@intel.com>
Date: Thu, 23 Jun 2005 00:09:30 -0700
Subject: [PATCH] Kprobes ia64 cleanup

A cleanup of the ia64 kprobes implementation such that all of the bundle
manipulation logic is concentrated in arch_prepare_kprobe().

With the current design for kprobes, the arch specific code only has a
chance to return failure inside the arch_prepare_kprobe() function.

This patch moves all of the work that was happening in arch_copy_kprobe()
and most of the work that was happening in arch_arm_kprobe() into
arch_prepare_kprobe().  By doing this we can add further robustness checks
in arch_arm_kprobe() and refuse to insert kprobes that will cause problems.

Signed-off-by: Rusty Lynch <Rusty.lynch@intel.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/kprobes.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index d30af77a0b11..cec4d9958307 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -30,6 +30,8 @@
 
 #define BREAK_INST	(long)(__IA64_BREAK_KPROBE << 6)
 
+struct kprobe;
+
 typedef struct _bundle {
 	struct {
 		unsigned long long template : 5;
@@ -79,6 +81,11 @@ static inline void jprobe_return(void)
 {
 }
 
+/* ia64 does not need this */
+static inline void arch_copy_kprobe(struct kprobe *p)
+{
+}
+
 #ifdef CONFIG_KPROBES
 extern int kprobe_exceptions_notify(struct notifier_block *self,
 				    unsigned long val, void *data);
-- 
cgit v1.2.3


From 1674eafcbd3e3c68556cf19fbf4a2c30f7add729 Mon Sep 17 00:00:00 2001
From: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Date: Thu, 23 Jun 2005 00:09:33 -0700
Subject: [PATCH] Kprobes IA64: cmp ctype unc support

The current Kprobes when patching the original instruction with the break
instruction tries to retain the original qualifying predicate(qp), however
for cmp.crel.ctype where ctype == unc, which is a special instruction
always needs to be executed irrespective of qp.  Hence, if the instruction
we are patching is of this type, then we should not copy the original qp to
the break instruction, this is because we always want the break fault to
happen so that we can emulate the instruction.

This patch is based on the feedback given by David Mosberger

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/kprobes.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index cec4d9958307..7b700035e36d 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -30,6 +30,23 @@
 
 #define BREAK_INST	(long)(__IA64_BREAK_KPROBE << 6)
 
+typedef union cmp_inst {
+	struct {
+	unsigned long long qp : 6;
+	unsigned long long p1 : 6;
+	unsigned long long c  : 1;
+	unsigned long long r2 : 7;
+	unsigned long long r3 : 7;
+	unsigned long long p2 : 6;
+	unsigned long long ta : 1;
+	unsigned long long x2 : 2;
+	unsigned long long tb : 1;
+	unsigned long long opcode : 4;
+	unsigned long long reserved : 23;
+	}f;
+	unsigned long long l;
+} cmp_inst_t;
+
 struct kprobe;
 
 typedef struct _bundle {
-- 
cgit v1.2.3


From ea32c65cc2d2294c04e9f81d0578a6f51febfdbf Mon Sep 17 00:00:00 2001
From: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Date: Thu, 23 Jun 2005 00:09:36 -0700
Subject: [PATCH] kprobes: Temporary disarming of reentrant probe

In situations where a kprobes handler calls a routine which has a probe on it,
then kprobes_handler() disarms the new probe forever.  This patch removes the
above limitation by temporarily disarming the new probe.  When the another
probe hits while handling the old probe, the kprobes_handler() saves previous
kprobes state and handles the new probe without calling the new kprobes
registered handlers.  kprobe_post_handler() restores back the previous kprobes
state and the normal execution continues.

However on x86_64 architecture, re-rentrancy is provided only through
pre_handler().  If a routine having probe is referenced through
post_handler(), then the probes on that routine are disarmed forever, since
the exception stack is gets changed after the processor single steps the
instruction of the new probe.

This patch includes generic changes to support temporary disarming on
reentrancy of probes.

Signed-of-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kprobes.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 461391decc46..5e1a7b0d7b3f 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -36,6 +36,12 @@
 
 #include <asm/kprobes.h>
 
+/* kprobe_status settings */
+#define KPROBE_HIT_ACTIVE	0x00000001
+#define KPROBE_HIT_SS		0x00000002
+#define KPROBE_REENTER		0x00000004
+#define KPROBE_HIT_SSDONE	0x00000008
+
 struct kprobe;
 struct pt_regs;
 struct kretprobe;
@@ -55,6 +61,9 @@ struct kprobe {
 	/* list of kprobes for multi-handler support */
 	struct list_head list;
 
+	/*count the number of times this probe was temporarily disarmed */
+	unsigned long nmissed;
+
 	/* location of the probe point */
 	kprobe_opcode_t *addr;
 
-- 
cgit v1.2.3


From d6e711448137ca3301512cec41a2c2ce852b3d0a Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 23 Jun 2005 00:09:43 -0700
Subject: [PATCH] setuid core dump

Add a new `suid_dumpable' sysctl:

This value can be used to query and set the core dump mode for setuid
or otherwise protected/tainted binaries. The modes are

0 - (default) - traditional behaviour.  Any process which has changed
    privilege levels or is execute only will not be dumped

1 - (debug) - all processes dump core when possible.  The core dump is
    owned by the current user and no security is applied.  This is intended
    for system debugging situations only.  Ptrace is unchecked.

2 - (suidsafe) - any binary which normally would not be dumped is dumped
    readable by root only.  This allows the end user to remove such a dump but
    not access it directly.  For security reasons core dumps in this mode will
    not overwrite one another or other files.  This mode is appropriate when
    adminstrators are attempting to debug problems in a normal environment.

(akpm:

> > +EXPORT_SYMBOL(suid_dumpable);
>
> EXPORT_SYMBOL_GPL?

No problem to me.

> >  	if (current->euid == current->uid && current->egid == current->gid)
> >  		current->mm->dumpable = 1;
>
> Should this be SUID_DUMP_USER?

Actually the feedback I had from last time was that the SUID_ defines
should go because its clearer to follow the numbers. They can go
everywhere (and there are lots of places where dumpable is tested/used
as a bool in untouched code)

> Maybe this should be renamed to `dump_policy' or something.  Doing that
> would help us catch any code which isn't using the #defines, too.

Fair comment. The patch was designed to be easy to maintain for Red Hat
rather than for merging. Changing that field would create a gigantic
diff because it is used all over the place.

)

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/binfmts.h | 5 +++++
 include/linux/sched.h   | 2 +-
 include/linux/sysctl.h  | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 7e736e201c46..c1e82c514443 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -69,6 +69,11 @@ extern void remove_arg_zero(struct linux_binprm *);
 extern int search_binary_handler(struct linux_binprm *,struct pt_regs *);
 extern int flush_old_exec(struct linux_binprm * bprm);
 
+extern int suid_dumpable;
+#define SUID_DUMP_DISABLE	0	/* No setuid dumping */
+#define SUID_DUMP_USER		1	/* Dump as user of process */
+#define SUID_DUMP_ROOT		2	/* Dump as root */
+
 /* Stack area protections */
 #define EXSTACK_DEFAULT   0	/* Whatever the arch defaults to */
 #define EXSTACK_DISABLE_X 1	/* Disable executable stacks */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b58afd97a180..901742f92389 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -246,7 +246,7 @@ struct mm_struct {
 
 	unsigned long saved_auxv[42]; /* for /proc/PID/auxv */
 
-	unsigned dumpable:1;
+	unsigned dumpable:2;
 	cpumask_t cpu_vm_mask;
 
 	/* Architecture-specific MM context */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index a17745c80a91..614e939c78a4 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -136,6 +136,7 @@ enum
 	KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */
 	KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */
 	KERN_RANDOMIZE=68, /* int: randomize virtual address space */
+	KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */
 };
 
 
-- 
cgit v1.2.3


From ef3daeda7b58f046f94b26637d500354038d39f4 Mon Sep 17 00:00:00 2001
From: Yoav Zach <yoav_zach@yahoo.com>
Date: Thu, 23 Jun 2005 00:09:58 -0700
Subject: [PATCH] Don't force O_LARGEFILE for 32 bit processes on ia64

In ia64 kernel, the O_LARGEFILE flag is forced when opening a file.  This
is problematic for execution of 32 bit processes, which are not largefile
aware, either by SW emulation or by HW execution.

For such processes, the problem is two-fold:

1) When trying to open a file that is larger than 4G
   the operation should fail, but it's not
2) Writing to offset larger than 4G should fail, but
   it's not

The proposed patch takes advantage of the way 32 bit processes are
identified in ia64 systems.  Such processes have PER_LINUX32 for their
personality.  With the patch, the ia64 kernel will not enforce the
O_LARGEFILE flag if the current process has PER_LINUX32 set.  The behavior
for all other architectures remains unchanged.

Signed-off-by: Yoav Zach <yoav.zach@intel.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/fcntl.h | 2 ++
 include/linux/fcntl.h    | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/asm-ia64/fcntl.h b/include/asm-ia64/fcntl.h
index d193981bb1d8..c9f8d835d0cc 100644
--- a/include/asm-ia64/fcntl.h
+++ b/include/asm-ia64/fcntl.h
@@ -81,4 +81,6 @@ struct flock {
 
 #define F_LINUX_SPECIFIC_BASE	1024
 
+#define force_o_largefile() ( ! (current->personality & PER_LINUX32) )
+
 #endif /* _ASM_IA64_FCNTL_H */
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 704fb76b6334..8a7c82151de9 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -25,6 +25,10 @@
 
 #ifdef __KERNEL__
 
+#ifndef force_o_largefile
+#define force_o_largefile() (BITS_PER_LONG != 32)
+#endif
+
 #if BITS_PER_LONG == 32
 #define IS_GETLK32(cmd)		((cmd) == F_GETLK)
 #define IS_SETLK32(cmd)		((cmd) == F_SETLK)
-- 
cgit v1.2.3


From 11c80c8367db0a9d342529ed74464670cd86a1f6 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Thu, 23 Jun 2005 00:09:59 -0700
Subject: [PATCH] adjust per_cpu definition in non-SMP case

Fix (in the architectures I'm actually building for) the UP definition of
per_cpu so that the cpu specified may be any expression, not just an
identifier or a suffix expression.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/percpu.h | 2 +-
 include/asm-ia64/percpu.h    | 2 +-
 include/asm-x86_64/percpu.h  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 3b709b84934f..9044aeb37828 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -29,7 +29,7 @@ do {								\
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
 
-#define per_cpu(var, cpu)			(*((void)cpu, &per_cpu__##var))
+#define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
 
 #endif	/* SMP */
diff --git a/include/asm-ia64/percpu.h b/include/asm-ia64/percpu.h
index 1e87f19dad56..2b14dee29ce7 100644
--- a/include/asm-ia64/percpu.h
+++ b/include/asm-ia64/percpu.h
@@ -50,7 +50,7 @@ extern void *per_cpu_init(void);
 
 #else /* ! SMP */
 
-#define per_cpu(var, cpu)			(*((void)cpu, &per_cpu__##var))
+#define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
 #define per_cpu_init()				(__phys_per_cpu_start)
 
diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h
index 415d73f3c8ef..9c71855736fb 100644
--- a/include/asm-x86_64/percpu.h
+++ b/include/asm-x86_64/percpu.h
@@ -39,7 +39,7 @@ extern void setup_per_cpu_areas(void);
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
 
-#define per_cpu(var, cpu)			(*((void)cpu, &per_cpu__##var))
+#define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
 
 #endif	/* SMP */
-- 
cgit v1.2.3


From 46c271bedd2c8444b1d05bc44928beec0c07debc Mon Sep 17 00:00:00 2001
From: Peter Osterlund <petero2@telia.com>
Date: Thu, 23 Jun 2005 00:10:02 -0700
Subject: [PATCH] Improve CD/DVD packet driver write performance

This patch improves write performance for the CD/DVD packet writing driver.
 The logic for switching between reading and writing has been changed so
that streaming writes are no longer interrupted by read requests.

Signed-off-by: Peter Osterlund <petero2@telia.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pktcdvd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h
index 4e2d2a942ecb..4b32bce9a289 100644
--- a/include/linux/pktcdvd.h
+++ b/include/linux/pktcdvd.h
@@ -159,7 +159,7 @@ struct packet_iosched
 	struct bio		*read_queue_tail;
 	struct bio		*write_queue;
 	struct bio		*write_queue_tail;
-	int			high_prio_read;	/* An important read request has been queued */
+	sector_t		last_write;	/* The sector where the last write ended */
 	int			successive_reads;
 };
 
-- 
cgit v1.2.3


From fa912bcb06d5dc9525d8912a145db2bf4b7668c5 Mon Sep 17 00:00:00 2001
From: Daniel Ritz <daniel.ritz@gmx.ch>
Date: Thu, 23 Jun 2005 00:10:12 -0700
Subject: [PATCH] yenta TI: turn off interrupts during card power-on #2

- make boot-up card recognition more reliable (ie.  redo interrogation
  always if there is no valid 'card inserted' state) (and yes, i saw it
  happening on an o2micro controller that both CB_CBARD and CB_16BITCARD
  bits were set at the same time)

- also redo interrogation before probing the ISA interrupts.  it's safer
  to do the probing with the socket in a clean state.

- make card insert detect more reliable.  yenta_get_status() now returns
  SS_PENDING as long as the card is not completley inserted and one of the
  voltage bits is set.  also !CB_CBARD doesn't mean CB_16BITCARD.  there is
  CB_NOTACARD as well, so make an explicit check for CB_16BITCARD.

- for TI bridges: disable IRQs during power-on.  in all-serial and tied
  interrupt mode the interrupts are always disabled for single-slot
  controllers.  for two-slot contollers the disabling is only done when the
  other slot is empty.  to force disabling there is a new module parameter
  now: pwr_irqs_off=Y (which is a regression for working setups.  that's
  why it's an option, only use when required)

- modparm to disable ISA interrupt probing (isa_probe, defaults to on)

- remove unneeded code/cleanups (ie.  merge yenta_events() into
  yenta_interrupts())

Signed-off-by: Daniel Ritz <daniel.ritz@gmx.ch>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/pcmcia/ss.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/pcmcia/ss.h b/include/pcmcia/ss.h
index 6d3413a56708..67b867f31fe4 100644
--- a/include/pcmcia/ss.h
+++ b/include/pcmcia/ss.h
@@ -77,6 +77,11 @@ extern socket_state_t dead_socket;
 /* Use this just for bridge windows */
 #define MAP_IOSPACE	0x20
 
+/* power hook operations */
+#define HOOK_POWER_PRE	0x01
+#define HOOK_POWER_POST	0x02
+
+
 typedef struct pccard_io_map {
     u_char	map;
     u_char	flags;
@@ -222,6 +227,9 @@ struct pcmcia_socket {
 	/* Zoom video behaviour is so chip specific its not worth adding
 	   this to _ops */
 	void 				(*zoom_video)(struct pcmcia_socket *, int);
+
+	/* so is power hook */
+	int (*power_hook)(struct pcmcia_socket *sock, int operation);
                            
 	/* state thread */
 	struct semaphore		skt_sem;	/* protects socket h/w state */
-- 
cgit v1.2.3


From 0d77e5a2c23da734f5a7925f64afa1c2ed92e0f9 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 23 Jun 2005 00:10:14 -0700
Subject: [PATCH] compat: introduce compat_time_t

This patch is based on work by Carlos O'Donell and Matthew Wilcox.  It
introduces/updates the compat_time_t type and uses it for compat siginfo
structures.  I have built this on ppc64 and x86_64.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/compat.h    | 1 +
 include/asm-mips/compat.h    | 1 +
 include/asm-parisc/compat.h  | 2 +-
 include/asm-ppc64/compat.h   | 1 +
 include/asm-ppc64/ppc32.h    | 2 +-
 include/asm-sparc64/compat.h | 1 +
 include/asm-x86_64/ia32.h    | 2 +-
 7 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-ia64/compat.h b/include/asm-ia64/compat.h
index cc0ff0a4bdd0..0c05e5bad8a0 100644
--- a/include/asm-ia64/compat.h
+++ b/include/asm-ia64/compat.h
@@ -27,6 +27,7 @@ typedef u16		compat_ipc_pid_t;
 typedef s32		compat_daddr_t;
 typedef u32		compat_caddr_t;
 typedef __kernel_fsid_t	compat_fsid_t;
+typedef s32		compat_timer_t;
 
 typedef s32		compat_int_t;
 typedef s32		compat_long_t;
diff --git a/include/asm-mips/compat.h b/include/asm-mips/compat.h
index dce92079e7fc..d78002afb1e1 100644
--- a/include/asm-mips/compat.h
+++ b/include/asm-mips/compat.h
@@ -29,6 +29,7 @@ typedef s32		compat_caddr_t;
 typedef struct {
 	s32	val[2];
 } compat_fsid_t;
+typedef s32		compat_timer_t;
 
 typedef s32		compat_int_t;
 typedef s32		compat_long_t;
diff --git a/include/asm-parisc/compat.h b/include/asm-parisc/compat.h
index ca0eac647a05..7630d1ad2391 100644
--- a/include/asm-parisc/compat.h
+++ b/include/asm-parisc/compat.h
@@ -24,7 +24,7 @@ typedef u16	compat_nlink_t;
 typedef u16	compat_ipc_pid_t;
 typedef s32	compat_daddr_t;
 typedef u32	compat_caddr_t;
-typedef u32	compat_timer_t;
+typedef s32	compat_timer_t;
 
 typedef s32	compat_int_t;
 typedef s32	compat_long_t;
diff --git a/include/asm-ppc64/compat.h b/include/asm-ppc64/compat.h
index 09c28d28ce6c..12414f5fc666 100644
--- a/include/asm-ppc64/compat.h
+++ b/include/asm-ppc64/compat.h
@@ -26,6 +26,7 @@ typedef s32		compat_daddr_t;
 typedef u32		compat_caddr_t;
 typedef __kernel_fsid_t	compat_fsid_t;
 typedef s32		compat_key_t;
+typedef s32		compat_timer_t;
 
 typedef s32		compat_int_t;
 typedef s32		compat_long_t;
diff --git a/include/asm-ppc64/ppc32.h b/include/asm-ppc64/ppc32.h
index 1d0404897550..6b44a8caf395 100644
--- a/include/asm-ppc64/ppc32.h
+++ b/include/asm-ppc64/ppc32.h
@@ -32,7 +32,7 @@ typedef struct compat_siginfo {
 
 		/* POSIX.1b timers */
 		struct {
-			timer_t _tid;			/* timer id */
+			compat_timer_t _tid;			/* timer id */
 			int _overrun;			/* overrun count */
 			compat_sigval_t _sigval;		/* same as below */
 			int _sys_private;		/* not to be passed to user */
diff --git a/include/asm-sparc64/compat.h b/include/asm-sparc64/compat.h
index 22f58055b8ab..b59122dd176d 100644
--- a/include/asm-sparc64/compat.h
+++ b/include/asm-sparc64/compat.h
@@ -25,6 +25,7 @@ typedef s32		compat_daddr_t;
 typedef u32		compat_caddr_t;
 typedef __kernel_fsid_t	compat_fsid_t;
 typedef s32		compat_key_t;
+typedef s32		compat_timer_t;
 
 typedef s32		compat_int_t;
 typedef s32		compat_long_t;
diff --git a/include/asm-x86_64/ia32.h b/include/asm-x86_64/ia32.h
index c0a7717923ed..6efa00fe4e7b 100644
--- a/include/asm-x86_64/ia32.h
+++ b/include/asm-x86_64/ia32.h
@@ -94,7 +94,7 @@ typedef struct compat_siginfo{
 
 		/* POSIX.1b timers */
 		struct {
-			int _tid;		/* timer id */
+			compat_timer_t _tid;	/* timer id */
 			int _overrun;		/* overrun count */
 			compat_sigval_t _sigval;	/* same as below */
 			int _sys_private;	/* not to be passed to user */
-- 
cgit v1.2.3


From bb93e3a52f8db7210258a1a2134cced0b78a46e1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 23 Jun 2005 00:10:15 -0700
Subject: [PATCH] block: add unlocked_ioctl support for block devices

This patch allows block device drivers to convert their ioctl functions to
unlocked_ioctl() like character devices and other subsystems.  All
functions that were called with the BKL held before are still used that
way, but I would not be surprised if it could be removed from the ioctl
functions in drivers/block/ioctl.c themselves.

As a side note, I found that compat_blkdev_ioctl() acquires the BKL as
well, which looks like a bug.  I have checked that every user of
disk->fops->compat_ioctl() in the current git tree gets the BKL itself, so
it could easily be removed from compat_blkdev_ioctl().

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3622e952e98c..9b1278e21279 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -884,6 +884,7 @@ struct block_device_operations {
 	int (*open) (struct inode *, struct file *);
 	int (*release) (struct inode *, struct file *);
 	int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
+	long (*unlocked_ioctl) (struct file *, unsigned, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned, unsigned long);
 	int (*media_changed) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
-- 
cgit v1.2.3


From 45778ca819accab1a4a3378b3566cab0f189164f Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@graphe.net>
Date: Thu, 23 Jun 2005 00:10:17 -0700
Subject: [PATCH] Remove f_error field from struct file

The following patch removes the f_error field and all checks of f_error.

Trond said:

  f_error was introduced for NFS, and made sense when we were guaranteed
  always to have a file pointer around when write errors occurred.  Since
  then, we have (for various reasons) had to introduce the nfs_open_context in
  order to track the file read/write state, and it made sense to move our
  f_error tracking there too.

Signed-off-by: Christoph Lameter <christoph@lameter.com>
Acked-by: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/fs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9b1278e21279..517bf4966bf5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -581,7 +581,6 @@ struct file {
 	atomic_t		f_count;
 	unsigned int 		f_flags;
 	mode_t			f_mode;
-	int			f_error;
 	loff_t			f_pos;
 	struct fown_struct	f_owner;
 	unsigned int		f_uid, f_gid;
-- 
cgit v1.2.3


From f9fd27a253d5e0b23531d12ce7ad15b6535d4486 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 23 Jun 2005 00:10:19 -0700
Subject: [PATCH] acl endianess annotations

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/posix_acl_xattr.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index 5efd0a6dad94..fe271c1947b2 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -23,13 +23,13 @@
 #define ACL_UNDEFINED_ID	(-1)
 
 typedef struct {
-	__u16			e_tag;
-	__u16			e_perm;
-	__u32			e_id;
+	__le16			e_tag;
+	__le16			e_perm;
+	__le32			e_id;
 } posix_acl_xattr_entry;
 
 typedef struct {
-	__u32			a_version;
+	__le32			a_version;
 	posix_acl_xattr_entry	a_entries[0];
 } posix_acl_xattr_header;
 
-- 
cgit v1.2.3


From 9a59f452abe11f569e13ec16c51e6d61c54b9838 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 23 Jun 2005 00:10:19 -0700
Subject: [PATCH] remove <linux/xattr_acl.h>

This file duplicates <linux/posix_acl_xattr.h>, using slightly different
names.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/posix_acl_xattr.h | 3 +++
 include/linux/reiserfs_acl.h    | 1 -
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index fe271c1947b2..6e53c34035cd 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -52,4 +52,7 @@ posix_acl_xattr_count(size_t size)
 	return size / sizeof(posix_acl_xattr_entry);
 }
 
+struct posix_acl *posix_acl_from_xattr(const void *value, size_t size);
+int posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size);
+
 #endif	/* _POSIX_ACL_XATTR_H */
diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h
index 2aef9c3f5ce8..0760507a545b 100644
--- a/include/linux/reiserfs_acl.h
+++ b/include/linux/reiserfs_acl.h
@@ -1,6 +1,5 @@
 #include <linux/init.h>
 #include <linux/posix_acl.h>
-#include <linux/xattr_acl.h>
 
 #define REISERFS_ACL_VERSION	0x0001
 
-- 
cgit v1.2.3


From c43dc2fd885b5658cfd7cedb7bcca20910c517a4 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Thu, 23 Jun 2005 00:10:27 -0700
Subject: [PATCH] aio: make wait_queue ->task ->private

In the upcoming aio_down patch, it is useful to store a private data
pointer in the kiocb's wait_queue.  Since we provide our own wake up
function and do not require the task_struct pointer, it makes sense to
convert the task pointer into a generic private pointer.

Signed-off-by: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/wait.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index c9486c3efb4a..d38c9fecdc36 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -33,7 +33,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key
 struct __wait_queue {
 	unsigned int flags;
 #define WQ_FLAG_EXCLUSIVE	0x01
-	struct task_struct * task;
+	void *private;
 	wait_queue_func_t func;
 	struct list_head task_list;
 };
@@ -60,7 +60,7 @@ typedef struct __wait_queue_head wait_queue_head_t;
  */
 
 #define __WAITQUEUE_INITIALIZER(name, tsk) {				\
-	.task		= tsk,						\
+	.private	= tsk,						\
 	.func		= default_wake_function,			\
 	.task_list	= { NULL, NULL } }
 
@@ -86,7 +86,7 @@ static inline void init_waitqueue_head(wait_queue_head_t *q)
 static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
 {
 	q->flags = 0;
-	q->task = p;
+	q->private = p;
 	q->func = default_wake_function;
 }
 
@@ -94,7 +94,7 @@ static inline void init_waitqueue_func_entry(wait_queue_t *q,
 					wait_queue_func_t func)
 {
 	q->flags = 0;
-	q->task = NULL;
+	q->private = NULL;
 	q->func = func;
 }
 
@@ -110,7 +110,7 @@ static inline int waitqueue_active(wait_queue_head_t *q)
  * aio specifies a wait queue entry with an async notification
  * callback routine, not associated with any task.
  */
-#define is_sync_wait(wait)	(!(wait) || ((wait)->task))
+#define is_sync_wait(wait)	(!(wait) || ((wait)->private))
 
 extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
@@ -384,7 +384,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 
 #define DEFINE_WAIT(name)						\
 	wait_queue_t name = {						\
-		.task		= current,				\
+		.private	= current,				\
 		.func		= autoremove_wake_function,		\
 		.task_list	= LIST_HEAD_INIT((name).task_list),	\
 	}
@@ -393,7 +393,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 	struct wait_bit_queue name = {					\
 		.key = __WAIT_BIT_KEY_INITIALIZER(word, bit),		\
 		.wait	= {						\
-			.task		= current,			\
+			.private	= current,			\
 			.func		= wake_bit_function,		\
 			.task_list	=				\
 				LIST_HEAD_INIT((name).wait.task_list),	\
@@ -402,7 +402,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 
 #define init_wait(wait)							\
 	do {								\
-		(wait)->task = current;					\
+		(wait)->private = current;				\
 		(wait)->func = autoremove_wake_function;		\
 		INIT_LIST_HEAD(&(wait)->task_list);			\
 	} while (0)
-- 
cgit v1.2.3


From bfb07599da289881d3bcbb601a110e997fc7444b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 23 Jun 2005 00:10:32 -0700
Subject: [PATCH] Introduce tty_unregister_ldisc()

It's a bit strange to see tty_register_ldisc call in modules' exit
functions.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/tty.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 1b76106272d3..59ff42c629ec 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -345,6 +345,7 @@ extern int tty_check_change(struct tty_struct * tty);
 extern void stop_tty(struct tty_struct * tty);
 extern void start_tty(struct tty_struct * tty);
 extern int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc);
+extern int tty_unregister_ldisc(int disc);
 extern int tty_register_driver(struct tty_driver *driver);
 extern int tty_unregister_driver(struct tty_driver *driver);
 extern void tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev);
-- 
cgit v1.2.3


From 4749f32da939d4e4160541b2cadc22492bb507ec Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 23 Jun 2005 11:36:56 +0200
Subject: [PATCH] better USB_MON dependencies

This makes the USB_MON less confusing.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/usb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 3d508bf08402..eb282b581546 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -290,7 +290,7 @@ struct usb_bus {
 	struct class_device *class_dev;	/* class device for this bus */
 	struct kref kref;		/* handles reference counting this bus */
 	void (*release)(struct usb_bus *bus);	/* function to destroy this bus's memory */
-#if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE)
+#if defined(CONFIG_USB_MON)
 	struct mon_bus *mon_bus;	/* non-null when associated */
 	int monitored;			/* non-zero when monitored */
 #endif
-- 
cgit v1.2.3


From 317a76f9a44b437d6301718f4e5d08bd93f98da7 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 12:19:55 -0700
Subject: [TCP]: Add pluggable congestion control algorithm infrastructure.

Allow TCP to have multiple pluggable congestion control algorithms.
Algorithms are defined by a set of operations and can be built in
or modules.  The legacy "new RENO" algorithm is used as a starting
point and fallback.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sysctl.h |   9 +-
 include/linux/tcp.h    |  49 +++-------
 include/net/tcp.h      | 237 ++++++++++++++++---------------------------------
 3 files changed, 86 insertions(+), 209 deletions(-)

(limited to 'include')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 614e939c78a4..72965bfe6cfb 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -333,21 +333,14 @@ enum
 	NET_TCP_FRTO=92,
 	NET_TCP_LOW_LATENCY=93,
 	NET_IPV4_IPFRAG_SECRET_INTERVAL=94,
-	NET_TCP_WESTWOOD=95,
 	NET_IPV4_IGMP_MAX_MSF=96,
 	NET_TCP_NO_METRICS_SAVE=97,
-	NET_TCP_VEGAS=98,
-	NET_TCP_VEGAS_ALPHA=99,
-	NET_TCP_VEGAS_BETA=100,
-	NET_TCP_VEGAS_GAMMA=101,
- 	NET_TCP_BIC=102,
- 	NET_TCP_BIC_FAST_CONVERGENCE=103,
-	NET_TCP_BIC_LOW_WINDOW=104,
 	NET_TCP_DEFAULT_WIN_SCALE=105,
 	NET_TCP_MODERATE_RCVBUF=106,
 	NET_TCP_TSO_WIN_DIVISOR=107,
 	NET_TCP_BIC_BETA=108,
 	NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
+	NET_TCP_CONG_CONTROL=110,
 };
 
 enum {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 97a7c9e03df5..3ea75dd6640a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -203,13 +203,6 @@ struct tcp_sack_block {
 	__u32	end_seq;
 };
 
-enum tcp_congestion_algo {
-	TCP_RENO=0,
-	TCP_VEGAS,
-	TCP_WESTWOOD,
-	TCP_BIC,
-};
-
 struct tcp_options_received {
 /*	PAWS/RTTM data	*/
 	long	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
@@ -305,7 +298,7 @@ struct tcp_sock {
 	__u8	reordering;	/* Packet reordering metric.		*/
 	__u8	frto_counter;	/* Number of new acks after RTO */
 
-	__u8	adv_cong;	/* Using Vegas, Westwood, or BIC */
+	__u8	unused;
 	__u8	defer_accept;	/* User waits for some data after accept() */
 
 /* RTT measurement */
@@ -401,37 +394,10 @@ struct tcp_sock {
 		__u32	time;
 	} rcvq_space;
 
-/* TCP Westwood structure */
-        struct {
-                __u32    bw_ns_est;        /* first bandwidth estimation..not too smoothed 8) */
-                __u32    bw_est;           /* bandwidth estimate */
-                __u32    rtt_win_sx;       /* here starts a new evaluation... */
-                __u32    bk;
-                __u32    snd_una;          /* used for evaluating the number of acked bytes */
-                __u32    cumul_ack;
-                __u32    accounted;
-                __u32    rtt;
-                __u32    rtt_min;          /* minimum observed RTT */
-        } westwood;
-
-/* Vegas variables */
-	struct {
-		__u32	beg_snd_nxt;	/* right edge during last RTT */
-		__u32	beg_snd_una;	/* left edge  during last RTT */
-		__u32	beg_snd_cwnd;	/* saves the size of the cwnd */
-		__u8	doing_vegas_now;/* if true, do vegas for this RTT */
-		__u16	cntRTT;		/* # of RTTs measured within last RTT */
-		__u32	minRTT;		/* min of RTTs measured within last RTT (in usec) */
-		__u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */
-	} vegas;
-
-	/* BI TCP Parameters */
-	struct {
-		__u32	cnt;		/* increase cwnd by 1 after this number of ACKs */
-		__u32 	last_max_cwnd;	/* last maximium snd_cwnd */
-		__u32	last_cwnd;	/* the last snd_cwnd */
-		__u32   last_stamp;     /* time when updated last_cwnd */
-	} bictcp;
+	/* Pluggable TCP congestion control hook */
+	struct tcp_congestion_ops *ca_ops;
+	u32	ca_priv[16];
+#define TCP_CA_PRIV_SIZE	(16*sizeof(u32))
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
@@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 	return (struct tcp_sock *)sk;
 }
 
+static inline void *tcp_ca(const struct tcp_sock *tp)
+{
+	return (void *) tp->ca_priv;
+}
+
 #endif
 
 #endif	/* _LINUX_TCP_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f730935b824a..e427cf35915c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -505,25 +505,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
 #else
 # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
 #endif
-
-#define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
-					 * max_cwnd = snd_cwnd * beta
-					 */
-#define BICTCP_MAX_INCREMENT 32		/*
-					 * Limit on the amount of
-					 * increment allowed during
-					 * binary search.
-					 */
-#define BICTCP_FUNC_OF_MIN_INCR 11	/*
-					 * log(B/Smin)/log(B/(B-1))+1,
-					 * Smin:min increment
-					 * B:log factor
-					 */
-#define BICTCP_B		4	 /*
-					  * In binary search,
-					  * go to point (max+min)/N
-					  */
-
 /*
  *	TCP option
  */
@@ -596,16 +577,7 @@ extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
-extern int sysctl_tcp_westwood;
-extern int sysctl_tcp_vegas_cong_avoid;
-extern int sysctl_tcp_vegas_alpha;
-extern int sysctl_tcp_vegas_beta;
-extern int sysctl_tcp_vegas_gamma;
 extern int sysctl_tcp_nometrics_save;
-extern int sysctl_tcp_bic;
-extern int sysctl_tcp_bic_fast_convergence;
-extern int sysctl_tcp_bic_low_window;
-extern int sysctl_tcp_bic_beta;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
 
@@ -1136,6 +1108,80 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp,
 	tp->packets_out -= tcp_skb_pcount(skb);
 }
 
+/* Events passed to congestion control interface */
+enum tcp_ca_event {
+	CA_EVENT_TX_START,	/* first transmit when no packets in flight */
+	CA_EVENT_CWND_RESTART,	/* congestion window restart */
+	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
+	CA_EVENT_FRTO,		/* fast recovery timeout */
+	CA_EVENT_LOSS,		/* loss timeout */
+	CA_EVENT_FAST_ACK,	/* in sequence ack */
+	CA_EVENT_SLOW_ACK,	/* other ack */
+};
+
+/*
+ * Interface for adding new TCP congestion control handlers
+ */
+#define TCP_CA_NAME_MAX	16
+struct tcp_congestion_ops {
+	struct list_head	list;
+
+	/* initialize private data (optional) */
+	void (*init)(struct tcp_sock *tp);
+	/* cleanup private data  (optional) */
+	void (*release)(struct tcp_sock *tp);
+
+	/* return slow start threshold (required) */
+	u32 (*ssthresh)(struct tcp_sock *tp);
+	/* lower bound for congestion window (optional) */
+	u32 (*min_cwnd)(struct tcp_sock *tp);
+	/* do new cwnd calculation (required) */
+	void (*cong_avoid)(struct tcp_sock *tp, u32 ack,
+			   u32 rtt, u32 in_flight, int good_ack);
+	/* round trip time sample per acked packet (optional) */
+	void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt);
+	/* call before changing ca_state (optional) */
+	void (*set_state)(struct tcp_sock *tp, u8 new_state);
+	/* call when cwnd event occurs (optional) */
+	void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev);
+	/* new value of cwnd after loss (optional) */
+	u32  (*undo_cwnd)(struct tcp_sock *tp);
+	/* hook for packet ack accounting (optional) */
+	void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked);
+	/* get info for tcp_diag (optional) */
+	void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb);
+
+	char 		name[TCP_CA_NAME_MAX];
+	struct module 	*owner;
+};
+
+extern int tcp_register_congestion_control(struct tcp_congestion_ops *type);
+extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+
+extern void tcp_init_congestion_control(struct tcp_sock *tp);
+extern void tcp_cleanup_congestion_control(struct tcp_sock *tp);
+extern int tcp_set_default_congestion_control(const char *name);
+extern void tcp_get_default_congestion_control(char *name);
+
+extern struct tcp_congestion_ops tcp_reno;
+extern u32 tcp_reno_ssthresh(struct tcp_sock *tp);
+extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack,
+				u32 rtt, u32 in_flight, int flag);
+extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp);
+
+static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
+{
+	if (tp->ca_ops->set_state)
+		tp->ca_ops->set_state(tp, ca_state);
+	tp->ca_state = ca_state;
+}
+
+static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event)
+{
+	if (tp->ca_ops->cwnd_event)
+		tp->ca_ops->cwnd_event(tp, event);
+}
+
 /* This determines how many packets are "in the network" to the best
  * of our knowledge.  In many cases it is conservative, but where
  * detailed information is available from the receiver (via SACK
@@ -1155,91 +1201,6 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
 	return (tp->packets_out - tp->left_out + tp->retrans_out);
 }
 
-/*
- * Which congestion algorithim is in use on the connection.
- */
-#define tcp_is_vegas(__tp)	((__tp)->adv_cong == TCP_VEGAS)
-#define tcp_is_westwood(__tp)	((__tp)->adv_cong == TCP_WESTWOOD)
-#define tcp_is_bic(__tp)	((__tp)->adv_cong == TCP_BIC)
-
-/* Recalculate snd_ssthresh, we want to set it to:
- *
- * Reno:
- * 	one half the current congestion window, but no
- *	less than two segments
- *
- * BIC:
- *	behave like Reno until low_window is reached,
- *	then increase congestion window slowly
- */
-static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp)
-{
-	if (tcp_is_bic(tp)) {
-		if (sysctl_tcp_bic_fast_convergence &&
-		    tp->snd_cwnd < tp->bictcp.last_max_cwnd)
-			tp->bictcp.last_max_cwnd = (tp->snd_cwnd * 
-						    (BICTCP_BETA_SCALE
-						     + sysctl_tcp_bic_beta))
-				/ (2 * BICTCP_BETA_SCALE);
-		else
-			tp->bictcp.last_max_cwnd = tp->snd_cwnd;
-
-		if (tp->snd_cwnd > sysctl_tcp_bic_low_window)
-			return max((tp->snd_cwnd * sysctl_tcp_bic_beta)
-				   / BICTCP_BETA_SCALE, 2U);
-	}
-
-	return max(tp->snd_cwnd >> 1U, 2U);
-}
-
-/* Stop taking Vegas samples for now. */
-#define tcp_vegas_disable(__tp)	((__tp)->vegas.doing_vegas_now = 0)
-    
-static inline void tcp_vegas_enable(struct tcp_sock *tp)
-{
-	/* There are several situations when we must "re-start" Vegas:
-	 *
-	 *  o when a connection is established
-	 *  o after an RTO
-	 *  o after fast recovery
-	 *  o when we send a packet and there is no outstanding
-	 *    unacknowledged data (restarting an idle connection)
-	 *
-	 * In these circumstances we cannot do a Vegas calculation at the
-	 * end of the first RTT, because any calculation we do is using
-	 * stale info -- both the saved cwnd and congestion feedback are
-	 * stale.
-	 *
-	 * Instead we must wait until the completion of an RTT during
-	 * which we actually receive ACKs.
-	 */
-    
-	/* Begin taking Vegas samples next time we send something. */
-	tp->vegas.doing_vegas_now = 1;
-     
-	/* Set the beginning of the next send window. */
-	tp->vegas.beg_snd_nxt = tp->snd_nxt;
-
-	tp->vegas.cntRTT = 0;
-	tp->vegas.minRTT = 0x7fffffff;
-}
-
-/* Should we be taking Vegas samples right now? */
-#define tcp_vegas_enabled(__tp)	((__tp)->vegas.doing_vegas_now)
-
-extern void tcp_ca_init(struct tcp_sock *tp);
-
-static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
-{
-	if (tcp_is_vegas(tp)) {
-		if (ca_state == TCP_CA_Open) 
-			tcp_vegas_enable(tp);
-		else
-			tcp_vegas_disable(tp);
-	}
-	tp->ca_state = ca_state;
-}
-
 /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
  * The exception is rate halving phase, when cwnd is decreasing towards
  * ssthresh.
@@ -1288,7 +1249,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
 static inline void __tcp_enter_cwr(struct tcp_sock *tp)
 {
 	tp->undo_marker = 0;
-	tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+	tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
 	tp->snd_cwnd = min(tp->snd_cwnd,
 			   tcp_packets_in_flight(tp) + 1U);
 	tp->snd_cwnd_cnt = 0;
@@ -1876,52 +1837,4 @@ struct tcp_iter_state {
 extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo);
 extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo);
 
-/* TCP Westwood functions and constants */
-
-#define TCP_WESTWOOD_INIT_RTT  (20*HZ)           /* maybe too conservative?! */
-#define TCP_WESTWOOD_RTT_MIN   (HZ/20)           /* 50ms */
-
-static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq)
-{
-        if (tcp_is_westwood(tp))
-                tp->westwood.rtt = rtt_seq;
-}
-
-static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
-{
-        return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
-		   (__u32) (tp->mss_cache_std),
-		   2U);
-}
-
-static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
-{
-	return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0;
-}
-
-static inline int tcp_westwood_ssthresh(struct tcp_sock *tp)
-{
-	__u32 ssthresh = 0;
-
-	if (tcp_is_westwood(tp)) {
-		ssthresh = __tcp_westwood_bw_rttmin(tp);
-		if (ssthresh)
-			tp->snd_ssthresh = ssthresh;  
-	}
-
-	return (ssthresh != 0);
-}
-
-static inline int tcp_westwood_cwnd(struct tcp_sock *tp)
-{
-	__u32 cwnd = 0;
-
-	if (tcp_is_westwood(tp)) {
-		cwnd = __tcp_westwood_bw_rttmin(tp);
-		if (cwnd)
-			tp->snd_cwnd = cwnd;
-	}
-
-	return (cwnd != 0);
-}
 #endif	/* _TCP_H */
-- 
cgit v1.2.3


From 056ede6cface66b400cd3b8e60ed077cc5b85c18 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 12:21:28 -0700
Subject: [TCP]: Report congestion control algorithm in tcp_diag.

Enhancement to the tcp_diag interface used by the iproute2 ss command
to report the tcp congestion control being used by a socket.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp_diag.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h
index ceee962e1d15..7a5996743946 100644
--- a/include/linux/tcp_diag.h
+++ b/include/linux/tcp_diag.h
@@ -99,9 +99,10 @@ enum
 	TCPDIAG_MEMINFO,
 	TCPDIAG_INFO,
 	TCPDIAG_VEGASINFO,
+	TCPDIAG_CONG,
 };
 
-#define TCPDIAG_MAX TCPDIAG_VEGASINFO
+#define TCPDIAG_MAX TCPDIAG_CONG
 
 
 /* TCPDIAG_MEM */
@@ -123,5 +124,4 @@ struct tcpvegas_info {
 	__u32	tcpv_minrtt;
 };
 
-
 #endif /* _TCP_DIAG_H_ */
-- 
cgit v1.2.3


From e608a8072b10258aa18c2e33324def225199ba1d Mon Sep 17 00:00:00 2001
From: David Mosberger-Tang <davidm@hpl.hp.com>
Date: Wed, 22 Jun 2005 22:24:00 -0700
Subject: [IA64] Fix pfn_to_nid() so the kernel compiles again for
 !CONFIG_DISCONTIGMEM.

Signed-off-by: David Mosberger-Tang <davidm@hpl.hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/asm-ia64/mmzone.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-ia64/mmzone.h b/include/asm-ia64/mmzone.h
index 83ca4043fc11..d32f51e3d6c2 100644
--- a/include/asm-ia64/mmzone.h
+++ b/include/asm-ia64/mmzone.h
@@ -15,6 +15,8 @@
 #include <asm/page.h>
 #include <asm/meminit.h>
 
+#ifdef CONFIG_DISCONTIGMEM
+
 static inline int pfn_to_nid(unsigned long pfn)
 {
 #ifdef CONFIG_NUMA
@@ -29,8 +31,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 #endif
 }
 
-#ifdef CONFIG_DISCONTIGMEM
-
 #ifdef CONFIG_IA64_DIG /* DIG systems are small */
 # define MAX_PHYSNODE_ID	8
 # define NR_NODE_MEMBLKS	(MAX_NUMNODES * 8)
-- 
cgit v1.2.3