From dd7f0b80926befc8c70a873b5b0c0c7b5fd1e7b9 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Wed, 22 Jun 2005 12:38:33 -0700 Subject: [NETFILTER]: Fix "iptables -D" rule deletion with ipt_CLUSTERIP target. The patch just changes the order of structure members. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ipt_CLUSTERIP.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h index baa83e757156..d9bceedfb3dc 100644 --- a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h +++ b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h @@ -18,7 +18,6 @@ struct clusterip_config; struct ipt_clusterip_tgt_info { u_int32_t flags; - struct clusterip_config *config; /* only relevant for new ones */ u_int8_t clustermac[6]; @@ -27,6 +26,8 @@ struct ipt_clusterip_tgt_info { u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; enum clusterip_hashmode hash_mode; u_int32_t hash_initval; + + struct clusterip_config *config; }; #endif /*_IPT_CLUSTERIP_H_target*/ -- cgit v1.2.3 From 10f7e7c15e6ce41799c5dba6925ae4bf8048c870 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 09:43:07 +1000 Subject: [PATCH] ppc64: consolidate calibrate_decr implementations pSeries and maple have almost the same code for calibrate_decr, and BPA would need yet another copy. Instead, I'm moving the code to arch/ppc64/kernel/time.c. Some of the related declarations were missing from header files, so I'm moving those as well. It makes sense to merge this with the pmac function of the same name, so we end up having just one implemetation for iSeries and one for Open Firmware based machines. Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- include/asm-ppc64/time.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/asm-ppc64/time.h b/include/asm-ppc64/time.h index 8d6e3760ee10..c6c762cad8b0 100644 --- a/include/asm-ppc64/time.h +++ b/include/asm-ppc64/time.h @@ -34,6 +34,15 @@ struct rtc_time; extern void to_tm(int tim, struct rtc_time * tm); extern time_t last_rtc_update; +void generic_calibrate_decr(void); +void setup_default_decr(void); + +/* Some sane defaults: 125 MHz timebase, 1GHz processor */ +extern unsigned long ppc_proc_freq; +#define DEFAULT_PROC_FREQ (DEFAULT_TB_FREQ * 8) +extern unsigned long ppc_tb_freq; +#define DEFAULT_TB_FREQ 125000000UL + /* * By putting all of this stuff into a single struct we * reduce the number of cache lines touched by do_gettimeofday. -- cgit v1.2.3 From 773bf9c469c01f01280c9bd45ec2462dd94d08a0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 09:43:18 +1000 Subject: [PATCH] ppc64: rename pSeries rtc functions into rtas_* The rtc rtas functions are not pSeries specific but can also be used by BPA and other SLOF based platforms Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- include/asm-ppc64/rtas.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h index a8ab0e9db84a..c46ceb2ffc12 100644 --- a/include/asm-ppc64/rtas.h +++ b/include/asm-ppc64/rtas.h @@ -188,6 +188,11 @@ extern int rtas_set_power_level(int powerdomain, int level, int *setlevel); extern int rtas_set_indicator(int indicator, int index, int new_value); extern void rtas_initialize(void); +struct rtc_time; +extern void rtas_get_boot_time(struct rtc_time *rtc_time); +extern void rtas_get_rtc_time(struct rtc_time *rtc_time); +extern int rtas_set_rtc_time(struct rtc_time *rtc_time); + /* Given an RTAS status code of 9900..9905 compute the hinted delay */ unsigned int rtas_extended_busy_delay_time(int status); static inline int rtas_is_extended_busy(int status) -- cgit v1.2.3 From 6566c6f1f18d42affe73ccdd403e290b64d10473 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 09:43:28 +1000 Subject: [PATCH] ppc64: pSeries_progress -> rtas_progress The pSeries_progress function is called from some places in the rtas code, which may also be used by non-pSeries platforms. Though pSeries is currently the only platform type that implements display-character, the code is actually generic enough to be part of the rtas subsystem. I hit a bug here because the generic rtas code tried calling ppc_md.progress, which points to an __init function on most platforms. We could also clear the ppc_md.progress pointer when freeing the init memory to make it more explicit that ppc_md.progress must not be called after bootup. Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- include/asm-ppc64/rtas.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h index c46ceb2ffc12..e7d1b5222802 100644 --- a/include/asm-ppc64/rtas.h +++ b/include/asm-ppc64/rtas.h @@ -186,6 +186,7 @@ extern int rtas_get_sensor(int sensor, int index, int *state); extern int rtas_get_power_level(int powerdomain, int *level); extern int rtas_set_power_level(int powerdomain, int level, int *setlevel); extern int rtas_set_indicator(int indicator, int index, int new_value); +extern void rtas_progress(char *s, unsigned short hex); extern void rtas_initialize(void); struct rtc_time; -- cgit v1.2.3 From 5f5b4e669a59be1cf8fc9d6d04ff1ccad8ab6de0 Mon Sep 17 00:00:00 2001 From: Utz Bacher Date: Thu, 23 Jun 2005 09:43:31 +1000 Subject: [PATCH] ppc64: add a minimal nvram driver The firmware provides the location and size of the nvram in the device tree, so it does not really contain any hardware specific bits and could be used on other machines as well. Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- include/asm-ppc64/nvram.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/asm-ppc64/nvram.h b/include/asm-ppc64/nvram.h index 4e6dd370d936..dfaa21566c9a 100644 --- a/include/asm-ppc64/nvram.h +++ b/include/asm-ppc64/nvram.h @@ -70,6 +70,7 @@ extern struct nvram_partition *nvram_find_partition(int sig, const char *name); extern int pSeries_nvram_init(void); extern int pmac_nvram_init(void); +extern int bpa_nvram_init(void); /* PowerMac specific nvram stuffs */ -- cgit v1.2.3 From fef1c772fa154c16e0a54577e9ecb5480f7b937e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 09:43:37 +1000 Subject: [PATCH] ppc64: add BPA platform type This adds the basic support for running on BPA machines. So far, this is only the IBM workstation, and it will not run on others without a little more generalization. It should be possible to configure a kernel for any combination of CONFIG_PPC_BPA with any of the other multiplatform targets. Signed-off-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- include/asm-ppc64/mmu.h | 5 +++-- include/asm-ppc64/processor.h | 15 +++++++++++++-- include/asm-ppc64/smp.h | 8 ++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h index c78282a67d8e..9d03a98a4fa3 100644 --- a/include/asm-ppc64/mmu.h +++ b/include/asm-ppc64/mmu.h @@ -47,9 +47,10 @@ #define SLB_VSID_KS ASM_CONST(0x0000000000000800) #define SLB_VSID_KP ASM_CONST(0x0000000000000400) #define SLB_VSID_N ASM_CONST(0x0000000000000200) /* no-execute */ -#define SLB_VSID_L ASM_CONST(0x0000000000000100) /* largepage 16M */ +#define SLB_VSID_L ASM_CONST(0x0000000000000100) /* largepage */ #define SLB_VSID_C ASM_CONST(0x0000000000000080) /* class */ - +#define SLB_VSID_LS ASM_CONST(0x0000000000000070) /* size of largepage */ + #define SLB_VSID_KERNEL (SLB_VSID_KP|SLB_VSID_C) #define SLB_VSID_USER (SLB_VSID_KP|SLB_VSID_KS) diff --git a/include/asm-ppc64/processor.h b/include/asm-ppc64/processor.h index 3084099086a8..af28aa55d8c1 100644 --- a/include/asm-ppc64/processor.h +++ b/include/asm-ppc64/processor.h @@ -138,8 +138,16 @@ #define SPRN_NIADORM 0x3F3 /* Hardware Implementation Register 2 */ #define SPRN_HID4 0x3F4 /* 970 HID4 */ #define SPRN_HID5 0x3F6 /* 970 HID5 */ -#define SPRN_TSC 0x3FD /* Thread switch control */ -#define SPRN_TST 0x3FC /* Thread switch timeout */ +#define SPRN_HID6 0x3F9 /* BE HID 6 */ +#define HID6_LB (0x0F<<12) /* Concurrent Large Page Modes */ +#define HID6_DLP (1<<20) /* Disable all large page modes (4K only) */ +#define SPRN_TSCR 0x399 /* Thread switch control on BE */ +#define SPRN_TTR 0x39A /* Thread switch timeout on BE */ +#define TSCR_DEC_ENABLE 0x200000 /* Decrementer Interrupt */ +#define TSCR_EE_ENABLE 0x100000 /* External Interrupt */ +#define TSCR_EE_BOOST 0x080000 /* External Interrupt Boost */ +#define SPRN_TSC 0x3FD /* Thread switch control on others */ +#define SPRN_TST 0x3FC /* Thread switch timeout on others */ #define SPRN_L2CR 0x3F9 /* Level 2 Cache Control Regsiter */ #define SPRN_LR 0x008 /* Link Register */ #define SPRN_PIR 0x3FF /* Processor Identification Register */ @@ -259,6 +267,7 @@ #define PV_970FX 0x003C #define PV_630 0x0040 #define PV_630p 0x0041 +#define PV_BE 0x0070 /* Platforms supported by PPC64 */ #define PLATFORM_PSERIES 0x0100 @@ -267,6 +276,7 @@ #define PLATFORM_LPAR 0x0001 #define PLATFORM_POWERMAC 0x0400 #define PLATFORM_MAPLE 0x0500 +#define PLATFORM_BPA 0x1000 /* Compatibility with drivers coming from PPC32 world */ #define _machine (systemcfg->platform) @@ -278,6 +288,7 @@ #define IC_INVALID 0 #define IC_OPEN_PIC 1 #define IC_PPC_XIC 2 +#define IC_BPA_IIC 3 #define XGLUE(a,b) a##b #define GLUE(a,b) XGLUE(a,b) diff --git a/include/asm-ppc64/smp.h b/include/asm-ppc64/smp.h index 8115ecb8feee..d86f742e9a21 100644 --- a/include/asm-ppc64/smp.h +++ b/include/asm-ppc64/smp.h @@ -85,6 +85,14 @@ extern void smp_generic_take_timebase(void); extern struct smp_ops_t *smp_ops; +#ifdef CONFIG_PPC_PSERIES +void vpa_init(int cpu); +#else +static inline void vpa_init(int cpu) +{ +} +#endif /* CONFIG_PPC_PSERIES */ + #endif /* __ASSEMBLY__ */ #endif /* !(_PPC64_SMP_H) */ -- cgit v1.2.3 From 6ca4f65e6b390d09e1de7280cf9fd4f5d8e4b48b Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:04:55 -0700 Subject: [NETPOLL]: Set poll_owner to -1 before unlocking in netpoll_poll_unlock() This trivial patch moves the assignment of poll_owner to -1 inside of the lock. This fixes a potential SMP race in the code. Signed-off-by: Jeff Moyer Signed-off-by: David S. Miller --- include/linux/netpoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index c0d8b90c5202..449a4fde6587 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -53,8 +53,8 @@ static inline void netpoll_poll_lock(struct net_device *dev) static inline void netpoll_poll_unlock(struct net_device *dev) { if (dev->np) { - spin_unlock(&dev->np->poll_lock); dev->np->poll_owner = -1; + spin_unlock(&dev->np->poll_lock); } } -- cgit v1.2.3 From 115c1d6e61b70851d9a363328c3b8d4c2559a1d3 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:05:31 -0700 Subject: [NETPOLL]: Introduce a netpoll_info struct This patch introduces a netpoll_info structure, which the struct net_device will now point to instead of pointing to a struct netpoll. The reason for this is two-fold: 1) fields such as the rx_flags, poll_owner, and poll_lock should be maintained per net_device, not per netpoll; and 2) this is a first step in providing support for multiple netpoll clients to register against the same net_device. The struct netpoll is now pointed to by the netpoll_info structure. As such, the previous behaviour of the code is preserved. Signed-off-by: Jeff Moyer Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- include/linux/netpoll.h | 25 +++++++++++++++++-------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ba5d1236aa17..d6afd440cf7b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -41,7 +41,7 @@ struct divert_blk; struct vlan_group; struct ethtool_ops; -struct netpoll; +struct netpoll_info; /* source back-compat hooks */ #define SET_ETHTOOL_OPS(netdev,ops) \ ( (netdev)->ethtool_ops = (ops) ) @@ -468,7 +468,7 @@ struct net_device unsigned char *haddr); int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); #ifdef CONFIG_NETPOLL - struct netpoll *np; + struct netpoll_info *npinfo; #endif #ifdef CONFIG_NET_POLL_CONTROLLER void (*poll_controller)(struct net_device *dev); diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 449a4fde6587..388cd91bc7a6 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -16,14 +16,18 @@ struct netpoll; struct netpoll { struct net_device *dev; char dev_name[16], *name; - int rx_flags; void (*rx_hook)(struct netpoll *, int, char *, int); void (*drop)(struct sk_buff *skb); u32 local_ip, remote_ip; u16 local_port, remote_port; unsigned char local_mac[6], remote_mac[6]; +}; + +struct netpoll_info { spinlock_t poll_lock; int poll_owner; + int rx_flags; + struct netpoll *np; }; void netpoll_poll(struct netpoll *np); @@ -39,22 +43,27 @@ void netpoll_queue(struct sk_buff *skb); #ifdef CONFIG_NETPOLL static inline int netpoll_rx(struct sk_buff *skb) { - return skb->dev->np && skb->dev->np->rx_flags && __netpoll_rx(skb); + struct netpoll_info *npinfo = skb->dev->npinfo; + + if (!npinfo || !npinfo->rx_flags) + return 0; + + return npinfo->np && __netpoll_rx(skb); } static inline void netpoll_poll_lock(struct net_device *dev) { - if (dev->np) { - spin_lock(&dev->np->poll_lock); - dev->np->poll_owner = smp_processor_id(); + if (dev->npinfo) { + spin_lock(&dev->npinfo->poll_lock); + dev->npinfo->poll_owner = smp_processor_id(); } } static inline void netpoll_poll_unlock(struct net_device *dev) { - if (dev->np) { - dev->np->poll_owner = -1; - spin_unlock(&dev->np->poll_lock); + if (dev->npinfo) { + dev->npinfo->poll_owner = -1; + spin_unlock(&dev->npinfo->poll_lock); } } -- cgit v1.2.3 From fbeec2e1552949002065435c9829dc244ad85407 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:05:59 -0700 Subject: [NETPOLL]: allow multiple netpoll_clients to register against one interface This patch provides support for registering multiple netpoll clients to the same network device. Only one of these clients may register an rx_hook, however. In practice, this restriction has not been problematic. It is worth mentioning, though, that the current design can be easily extended to allow for the registration of multiple rx_hooks. The basic idea of the patch is that the rx_np pointer in the netpoll_info structure points to the struct netpoll that has rx_hook filled in. Aside from this one case, there is no need for a pointer from the struct net_device to an individual struct netpoll. A lock is introduced to protect the setting and clearing of the np_rx pointer. The pointer will only be cleared upon netpoll client module removal, and the lock should be uncontested. Signed-off-by: Jeff Moyer Signed-off-by: David S. Miller --- include/linux/netpoll.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 388cd91bc7a6..bcd0ac33f592 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -27,7 +27,8 @@ struct netpoll_info { spinlock_t poll_lock; int poll_owner; int rx_flags; - struct netpoll *np; + spinlock_t rx_lock; + struct netpoll *rx_np; /* netpoll that registered an rx_hook */ }; void netpoll_poll(struct netpoll *np); @@ -44,11 +45,19 @@ void netpoll_queue(struct sk_buff *skb); static inline int netpoll_rx(struct sk_buff *skb) { struct netpoll_info *npinfo = skb->dev->npinfo; + unsigned long flags; + int ret = 0; - if (!npinfo || !npinfo->rx_flags) + if (!npinfo || (!npinfo->rx_np && !npinfo->rx_flags)) return 0; - return npinfo->np && __netpoll_rx(skb); + spin_lock_irqsave(&npinfo->rx_lock, flags); + /* check rx_flags again with the lock held */ + if (npinfo->rx_flags && __netpoll_rx(skb)) + ret = 1; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + + return ret; } static inline void netpoll_poll_lock(struct net_device *dev) -- cgit v1.2.3 From cb65d506c34c86df5bcef939ce5a8666a451bd8b Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Wed, 22 Jun 2005 22:15:01 -0700 Subject: [X25]: Selective sub-address matching with call user data. From: Shaun Pereira This is the first (independent of the second) patch of two that I am working on with x25 on linux (tested with xot on a cisco router). Details are as follows. Current state of module: A server using the current implementation (2.6.11.7) of the x25 module will accept a call request/ incoming call packet at the listening x.25 address, from all callers to that address, as long as NO call user data is present in the packet header. If the server needs to choose to accept a particular call request/ incoming call packet arriving at its listening x25 address, then the kernel has to allow a match of call user data present in the call request packet with its own. This is required when multiple servers listen at the same x25 address and device interface. The kernel currently matches ALL call user data, if present. Current Changes: This patch is a follow up to the patch submitted previously by Andrew Hendry, and allows the user to selectively control the number of octets of call user data in the call request packet, that the kernel will match. By default no call user data is matched, even if call user data is present. To allow call user data matching, a cudmatchlength > 0 has to be passed into the kernel after which the passed number of octets will be matched. Otherwise the kernel behavior is exactly as the original implementation. This patch also ensures that as is normally the case, no call user data will be present in the Call accepted / call connected packet sent back to the caller Future Changes on next patch: There are cases however when call user data may be present in the call accepted packet. According to the X.25 recommendation (ITU-T 10/96) section 5.2.3.2 call user data may be present in the call accepted packet provided the fast select facility is used. My next patch will include this fast select utility and the ability to send up to 128 octets call user data in the call accepted packet provided the fast select facility is used. I am currently testing this, again with xot on linux and cisco. Signed-off-by: Shaun Pereira (With a fix from Alexey Dobriyan ) Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/x25.h | 10 ++++++++++ include/net/x25.h | 3 +-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/x25.h b/include/linux/x25.h index 7531cfed5885..6f43b3d20248 100644 --- a/include/linux/x25.h +++ b/include/linux/x25.h @@ -4,6 +4,8 @@ * History * mar/20/00 Daniela Squassoni Disabling/enabling of facilities * negotiation. + * apr/02/05 Shaun Pereira Selective sub address matching with + * call user data */ #ifndef X25_KERNEL_H @@ -16,6 +18,7 @@ #define SIOCX25GCALLUSERDATA (SIOCPROTOPRIVATE + 4) #define SIOCX25SCALLUSERDATA (SIOCPROTOPRIVATE + 5) #define SIOCX25GCAUSEDIAG (SIOCPROTOPRIVATE + 6) +#define SIOCX25SCUDMATCHLEN (SIOCPROTOPRIVATE + 7) /* * Values for {get,set}sockopt. @@ -109,4 +112,11 @@ struct x25_causediag { unsigned char diagnostic; }; +/* + * Further optional call user data match length selection + */ +struct x25_subaddr { + unsigned int cudmatchlength; +}; + #endif diff --git a/include/net/x25.h b/include/net/x25.h index 7a1ba5bbb868..9dd70dd4a9b7 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -134,7 +134,7 @@ struct x25_sock { struct sock sk; struct x25_address source_addr, dest_addr; struct x25_neigh *neighbour; - unsigned int lci; + unsigned int lci, cudmatchlength; unsigned char state, condition, qbitincl, intflag; unsigned short vs, vr, va, vl; unsigned long t2, t21, t22, t23; @@ -242,7 +242,6 @@ extern int x25_validate_nr(struct sock *, unsigned short); extern void x25_write_internal(struct sock *, int); extern int x25_decode(struct sock *, struct sk_buff *, int *, int *, int *, int *, int *); extern void x25_disconnect(struct sock *, int, unsigned char, unsigned char); -extern int x25_check_calluserdata(struct x25_calluserdata *,struct x25_calluserdata *); /* x25_timer.c */ extern void x25_start_heartbeat(struct sock *); -- cgit v1.2.3 From ebc3f64b864fc16a594c2e63bf55a55c7d42084b Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Wed, 22 Jun 2005 22:16:17 -0700 Subject: [X25]: Fast select with no restriction on response This patch is a follow up to patch 1 regarding "Selective Sub Address matching with call user data". It allows use of the Fast-Select-Acceptance optional user facility for X.25. This patch just implements fast select with no restriction on response (NRR). What this means (according to ITU-T Recomendation 10/96 section 6.16) is that if in an incoming call packet, the relevant facility bits are set for fast-select-NRR, then the called DTE can issue a direct response to the incoming packet using a call-accepted packet that contains call-user-data. This patch allows such a response. The called DTE can also respond with a clear-request packet that contains call-user-data. However, this feature is currently not implemented by the patch. How is Fast Select Acceptance used? By default, the system does not allow fast select acceptance (as before). To enable a response to fast select acceptance, After a listen socket in created and bound as follows socket(AF_X25, SOCK_SEQPACKET, 0); bind(call_soc, (struct sockaddr *)&locl_addr, sizeof(locl_addr)); but before a listen system call is made, the following ioctl should be used. ioctl(call_soc,SIOCX25CALLACCPTAPPRV); Now the listen system call can be made listen(call_soc, 4); After this, an incoming-call packet will be accepted, but no call-accepted packet will be sent back until the following system call is made on the socket that accepts the call ioctl(vc_soc,SIOCX25SENDCALLACCPT); The network (or cisco xot router used for testing here) will allow the application server's call-user-data in the call-accepted packet, provided the call-request was made with Fast-select NRR. Signed-off-by: Shaun Pereira Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/x25.h | 2 ++ include/net/x25.h | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/x25.h b/include/linux/x25.h index 6f43b3d20248..16d44931afa0 100644 --- a/include/linux/x25.h +++ b/include/linux/x25.h @@ -19,6 +19,8 @@ #define SIOCX25SCALLUSERDATA (SIOCPROTOPRIVATE + 5) #define SIOCX25GCAUSEDIAG (SIOCPROTOPRIVATE + 6) #define SIOCX25SCUDMATCHLEN (SIOCPROTOPRIVATE + 7) +#define SIOCX25CALLACCPTAPPRV (SIOCPROTOPRIVATE + 8) +#define SIOCX25SENDCALLACCPT (SIOCPROTOPRIVATE + 9) /* * Values for {get,set}sockopt. diff --git a/include/net/x25.h b/include/net/x25.h index 9dd70dd4a9b7..8b39b98876e8 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -79,6 +79,8 @@ enum { #define X25_DEFAULT_PACKET_SIZE X25_PS128 /* Default Packet Size */ #define X25_DEFAULT_THROUGHPUT 0x0A /* Deafult Throughput */ #define X25_DEFAULT_REVERSE 0x00 /* Default Reverse Charging */ +#define X25_DENY_ACCPT_APPRV 0x01 /* Default value */ +#define X25_ALLOW_ACCPT_APPRV 0x00 /* Control enabled */ #define X25_SMODULUS 8 #define X25_EMODULUS 128 @@ -94,7 +96,7 @@ enum { #define X25_FAC_CLASS_C 0x80 #define X25_FAC_CLASS_D 0xC0 -#define X25_FAC_REVERSE 0x01 +#define X25_FAC_REVERSE 0x01 /* also fast select */ #define X25_FAC_THROUGHPUT 0x02 #define X25_FAC_PACKET_SIZE 0x42 #define X25_FAC_WINDOW_SIZE 0x43 @@ -135,7 +137,7 @@ struct x25_sock { struct x25_address source_addr, dest_addr; struct x25_neigh *neighbour; unsigned int lci, cudmatchlength; - unsigned char state, condition, qbitincl, intflag; + unsigned char state, condition, qbitincl, intflag, accptapprv; unsigned short vs, vr, va, vl; unsigned long t2, t21, t22, t23; unsigned short fraglen; -- cgit v1.2.3 From dad32bbf43b496bcd32a83f73a1e7fd0a02cfd3e Mon Sep 17 00:00:00 2001 From: John Rose Date: Thu, 23 Jun 2005 17:09:54 +1000 Subject: [PATCH] pSeries - read irqs dynamically For I/O DLPAR to work properly, the kernel needs to allow for dynamic assignment of the irq field of the pci_dev structure upon dynamic bus addition. This patch moves the assignment of that field from pSeries_final_fixup() to pcibios_fixup_bus(), which enables dynamic assignment for the children of a newly added bus. Currently, pci_devs receive their irq numbers in one of two ways. The irq line is either read at boot for all pci_devs, or read by the rpaphp module at slot enable time. The latter is no longer sufficient for DLPAR addition of slots that don't qualify as PCI-hotplug capable. This solution handles the cases of boot and dynamic add. Signed-off-by: John Rose Signed-off-by: Paul Mackerras --- include/asm-ppc64/machdep.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/asm-ppc64/machdep.h b/include/asm-ppc64/machdep.h index 5d3cd9d042e2..553b2ea23bed 100644 --- a/include/asm-ppc64/machdep.h +++ b/include/asm-ppc64/machdep.h @@ -76,6 +76,7 @@ struct machdep_calls { void (*tce_flush)(struct iommu_table *tbl); void (*iommu_dev_setup)(struct pci_dev *dev); void (*iommu_bus_setup)(struct pci_bus *bus); + void (*irq_bus_setup)(struct pci_bus *bus); int (*probe)(int platform); void (*setup_arch)(void); -- cgit v1.2.3 From 408fde81c1bff15c875a3618481e93a01dcc79ea Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:37 -0700 Subject: [PATCH] remove non-DISCONTIG use of pgdat->node_mem_map This patch effectively eliminates direct use of pgdat->node_mem_map outside of the DISCONTIG code. On a flat memory system, these fields aren't currently used, neither are they on a sparsemem system. There was also a node_mem_map(nid) macro on many architectures. Its use along with the use of ->node_mem_map itself was not consistent. It has been removed in favor of two new, more explicit, arch-independent macros: pgdat_page_nr(pgdat, pagenr) nid_page_nr(nid, pagenr) I called them "pgdat" and "nid" because we overload the term "node" to mean "NUMA node", "DISCONTIG node" or "pg_data_t" in very confusing ways. I believe the newer names are much clearer. These macros can be overridden in the sparsemem case with a theoretically slower operation using node_start_pfn and pfn_to_page(), instead. We could make this the only behavior if people want, but I don't want to change too much at once. One thing at a time. This patch removes more code than it adds. Compile tested on alpha, alpha discontig, arm, arm-discontig, i386, i386 generic, NUMAQ, Summit, ppc64, ppc64 discontig, and x86_64. Full list here: http://sr71.net/patches/2.6.12/2.6.12-rc1-mhp2/configs/ Boot tested on NUMAQ, x86 SMP and ppc64 power4/5 LPARs. Signed-off-by: Dave Hansen Signed-off-by: Martin J. Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/mmzone.h | 3 +-- include/asm-i386/mmzone.h | 3 +-- include/asm-m32r/mmzone.h | 3 +-- include/asm-parisc/mmzone.h | 3 +-- include/asm-ppc64/mmzone.h | 3 +-- include/asm-x86_64/mmzone.h | 5 +---- include/linux/mmzone.h | 2 ++ 7 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h index 726c150dcbe4..a011ef4cf3d3 100644 --- a/include/asm-alpha/mmzone.h +++ b/include/asm-alpha/mmzone.h @@ -57,7 +57,6 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n) * Given a kernel address, find the home node of the underlying memory. */ #define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define local_mapnr(kvaddr) \ @@ -108,7 +107,7 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n) #define pfn_to_page(pfn) \ ({ \ unsigned long kaddr = (unsigned long)__va((pfn) << PAGE_SHIFT); \ - (node_mem_map(kvaddr_to_nid(kaddr)) + local_mapnr(kaddr)); \ + (NODE_DATA(kvaddr_to_nid(kaddr))->node_mem_map + local_mapnr(kaddr)); \ }) #define page_to_pfn(page) \ diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index 13830ae67cac..9cec191f462c 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -79,7 +79,6 @@ static inline int pfn_to_nid(unsigned long pfn) */ #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -100,7 +99,7 @@ static inline int pfn_to_nid(unsigned long pfn) ({ \ unsigned long __pfn = pfn; \ int __node = pfn_to_nid(__pfn); \ - &node_mem_map(__node)[node_localnr(__pfn,__node)]; \ + &NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)]; \ }) #define page_to_pfn(pg) \ diff --git a/include/asm-m32r/mmzone.h b/include/asm-m32r/mmzone.h index ebf0228fec42..d58878ec899e 100644 --- a/include/asm-m32r/mmzone.h +++ b/include/asm-m32r/mmzone.h @@ -14,7 +14,6 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -32,7 +31,7 @@ extern struct pglist_data *node_data[]; ({ \ unsigned long __pfn = pfn; \ int __node = pfn_to_nid(__pfn); \ - &node_mem_map(__node)[node_localnr(__pfn,__node)]; \ + &NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)]; \ }) #define page_to_pfn(pg) \ diff --git a/include/asm-parisc/mmzone.h b/include/asm-parisc/mmzone.h index 928bf50c4693..595d3dce120a 100644 --- a/include/asm-parisc/mmzone.h +++ b/include/asm-parisc/mmzone.h @@ -19,7 +19,6 @@ extern struct node_map_data node_data[]; */ #define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -38,7 +37,7 @@ extern struct node_map_data node_data[]; ({ \ unsigned long __pfn = (pfn); \ int __node = pfn_to_nid(__pfn); \ - &node_mem_map(__node)[node_localnr(__pfn,__node)]; \ + &NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)]; \ }) #define page_to_pfn(pg) \ diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h index 0619a41a3c9d..cbfc5ecfe875 100644 --- a/include/asm-ppc64/mmzone.h +++ b/include/asm-ppc64/mmzone.h @@ -65,7 +65,6 @@ static inline int pa_to_nid(unsigned long pa) */ #define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) @@ -76,7 +75,7 @@ static inline int pa_to_nid(unsigned long pa) #define discontigmem_pfn_to_page(pfn) \ ({ \ unsigned long __tmp = pfn; \ - (node_mem_map(pfn_to_nid(__tmp)) + \ + (NODE_DATA(pfn_to_nid(__tmp))->node_mem_map + \ node_localnr(__tmp, pfn_to_nid(__tmp))); \ }) diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index d95b7c240831..ca4fc3fe0dee 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -35,9 +35,6 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) #define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) #define NODE_DATA(nid) (node_data[nid]) -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) - -#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) @@ -50,7 +47,7 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) (2.4 used to). */ #define pfn_to_page(pfn) ({ \ int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT); \ - ((pfn) - node_start_pfn(nid)) + node_mem_map(nid); \ + ((pfn) - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map; \ }) #define page_to_pfn(page) \ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4733d35d8223..b79633d3a97b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -284,6 +284,8 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) +#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) +#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) extern struct pglist_data *pgdat_list; -- cgit v1.2.3 From 6f167ec721108c9282d54424516a12c805e3c306 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:39 -0700 Subject: [PATCH] sparsemem base: simple NUMA remap space allocator Introduce a simple allocator for the NUMA remap space. This space is very scarce, used for structures which are best allocated node local. This mechanism is also used on non-NUMA ia64 systems with a vmem_map to keep the pgdat->node_mem_map initialized in a consistent place for all architectures. Issues: o alloc_remap takes a node_id where we might expect a pgdat which was intended to allow us to allocate the pgdat's using this mechanism; which we do not yet do. Could have alloc_remap_node() and alloc_remap_nid() for this purpose. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 0dd8ca1a3d5a..500f451ce0c0 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -67,6 +67,15 @@ extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP +extern void *alloc_remap(int nid, unsigned long size); +#else +static inline void *alloc_remap(int nid, unsigned long size) +{ + return NULL; +} +#endif + extern unsigned long __initdata nr_kernel_pages; extern unsigned long __initdata nr_all_pages; -- cgit v1.2.3 From 348f8b6c4837a07304d2f72b11ce8d96588065e0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:40 -0700 Subject: [PATCH] sparsemem base: reorganize page->flags bit operations Generify the value fields in the page_flags. The aim is to allow the location and size of these fields to be varied. Additionally we want to move away from fixed allocations per field whilst still enforcing the overall bit utilisation limits. We rely on the compiler to spot and optimise the accessor functions. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 53 +++++++++++++++++++++++++++++++++++++++++--------- include/linux/mmzone.h | 19 +++++++----------- 2 files changed, 51 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1813b162b0a8..57b2ead51dba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -395,19 +395,41 @@ static inline void put_page(struct page *page) /* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. - * We'll have up to (MAX_NUMNODES * MAX_NR_ZONES) zones total, - * so we use (MAX_NODES_SHIFT + MAX_ZONES_SHIFT) here to get enough bits. */ -#define NODEZONE_SHIFT (sizeof(page_flags_t)*8 - MAX_NODES_SHIFT - MAX_ZONES_SHIFT) + +/* Page flags: | NODE | ZONE | ... | FLAGS | */ +#define NODES_PGOFF ((sizeof(page_flags_t)*8) - NODES_SHIFT) +#define ZONES_PGOFF (NODES_PGOFF - ZONES_SHIFT) + +/* + * Define the bit shifts to access each section. For non-existant + * sections we define the shift as 0; that plus a 0 mask ensures + * the compiler will optimise away reference to them. + */ +#define NODES_PGSHIFT (NODES_PGOFF * (NODES_SHIFT != 0)) +#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_SHIFT != 0)) + +/* NODE:ZONE is used to lookup the zone from a page. */ +#define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#define ZONETABLE_PGSHIFT ZONES_PGSHIFT + +#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#endif + #define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) +#define ZONES_MASK ((1UL << ZONES_SHIFT) - 1) +#define NODES_MASK ((1UL << NODES_SHIFT) - 1) +#define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) + static inline unsigned long page_zonenum(struct page *page) { - return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT)); + return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } static inline unsigned long page_to_nid(struct page *page) { - return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT)); + return (page->flags >> NODES_PGSHIFT) & NODES_MASK; } struct zone; @@ -415,13 +437,26 @@ extern struct zone *zone_table[]; static inline struct zone *page_zone(struct page *page) { - return zone_table[page->flags >> NODEZONE_SHIFT]; + return zone_table[(page->flags >> ZONETABLE_PGSHIFT) & + ZONETABLE_MASK]; +} + +static inline void set_page_zone(struct page *page, unsigned long zone) +{ + page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); + page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; +} +static inline void set_page_node(struct page *page, unsigned long node) +{ + page->flags &= ~(NODES_MASK << NODES_PGSHIFT); + page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } -static inline void set_page_zone(struct page *page, unsigned long nodezone_num) +static inline void set_page_links(struct page *page, unsigned long zone, + unsigned long node) { - page->flags &= ~(~0UL << NODEZONE_SHIFT); - page->flags |= nodezone_num << NODEZONE_SHIFT; + set_page_zone(page, zone); + set_page_node(page, node); } #ifndef CONFIG_DISCONTIGMEM diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b79633d3a97b..39e912708e2a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -414,30 +414,25 @@ extern struct pglist_data contig_page_data; #include +#endif /* !CONFIG_DISCONTIGMEM */ + #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* * with 32 bit page->flags field, we reserve 8 bits for node/zone info. * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. */ -#define MAX_NODES_SHIFT 6 +#define FLAGS_RESERVED 8 + #elif BITS_PER_LONG == 64 /* * with 64 bit flags field, there's plenty of room. */ -#define MAX_NODES_SHIFT 10 -#endif +#define FLAGS_RESERVED 32 -#endif /* !CONFIG_DISCONTIGMEM */ - -#if NODES_SHIFT > MAX_NODES_SHIFT -#error NODES_SHIFT > MAX_NODES_SHIFT -#endif +#else -/* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */ -#define MAX_ZONES_SHIFT 2 +#error BITS_PER_LONG not defined -#if ZONES_SHIFT > MAX_ZONES_SHIFT -#error ZONES_SHIFT > MAX_ZONES_SHIFT #endif #endif /* !__ASSEMBLY__ */ -- cgit v1.2.3 From 5b505b90b2d54e526cc8d123bdef3b98c9f0bbc6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:41 -0700 Subject: [PATCH] sparsemem base: teach discontig about sparse ranges discontig.c has some assumptions that mem_map[]s inside of a node are contiguous. Teach it to make sure that each region that it's bringing online is actually made up of valid ranges of ram. Written-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/page.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 41400d342d44..8f3dded01bff 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -120,6 +120,8 @@ static __inline__ int get_order(unsigned long size) extern int sysctl_legacy_va_layout; +extern int page_is_ram(unsigned long pagenr); + #endif /* __ASSEMBLY__ */ #ifdef __ASSEMBLY__ -- cgit v1.2.3 From 93b7504e3e6c1d98586854806e51bea329ea3aa9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:47 -0700 Subject: [PATCH] Introduce new Kconfig option for NUMA or DISCONTIG There is some confusion that arose when working on SPARSEMEM patch between what is needed for DISCONTIG vs. NUMA. Multiple pg_data_t's are needed for DISCONTIGMEM or NUMA, independently. All of the current NUMA implementations require an implementation of DISCONTIG. Because of this, quite a lot of code which is really needed for NUMA is actually under DISCONTIG #ifdefs. For SPARSEMEM, we changed some of these #ifdefs to CONFIG_NUMA, but that broke the DISCONTIG=y and NUMA=n case. Introducing this new NEED_MULTIPLE_NODES config option allows code that is needed for both NUMA or DISCONTIG to be separated out from code that is specific to DISCONTIG. One great advantage of this approach is that it doesn't require every architecture to be converted over. All of the current implementations should "just work", only the ones implementing SPARSEMEM will have to be fixed up. The change to free_area_init() makes it work inside, or out of the new config option. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39e912708e2a..95f4a780ea66 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -402,7 +402,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, /* Returns the number of the current Node. */ #define numa_node_id() (cpu_to_node(raw_smp_processor_id())) -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data contig_page_data; #define NODE_DATA(nid) (&contig_page_data) @@ -410,11 +410,11 @@ extern struct pglist_data contig_page_data; #define MAX_NODES_SHIFT 1 #define pfn_to_nid(pfn) (0) -#else /* CONFIG_DISCONTIGMEM */ +#else /* CONFIG_NEED_MULTIPLE_NODES */ #include -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* -- cgit v1.2.3 From b159d43fbf7eaaac6ecc647f51cf4257332db47b Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:52 -0700 Subject: [PATCH] generify early_pfn_to_nid Provide a default implementation for early_pfn_to_nid returning node 0. Allow architectures to override this with their own implementation out of asm/mmzone.h. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/mmzone.h | 3 +++ include/linux/mmzone.h | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index 9cec191f462c..48e46d403aa6 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -143,4 +143,7 @@ static inline void get_memcfg_numa(void) } #endif /* CONFIG_DISCONTIGMEM */ + +extern int early_pfn_to_nid(unsigned long pfn); + #endif /* _ASM_MMZONE_H_ */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 95f4a780ea66..6ef07de98d69 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -435,6 +435,10 @@ extern struct pglist_data contig_page_data; #endif +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +#define early_pfn_to_nid(nid) (0UL) +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ -- cgit v1.2.3 From d41dee369bff3b9dcb6328d4d822926c28cc2594 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:54 -0700 Subject: [PATCH] sparsemem memory model Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of mem_map[] is needed by discontiguous memory machines (like in the old CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually become a complete replacement. A significant advantage over DISCONTIGMEM is that it's completely separated from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA and DISCONTIG are often confused. Another advantage is that sparse doesn't require each NUMA node's ranges to be contiguous. It can handle overlapping ranges between nodes with no problems, where DISCONTIGMEM currently throws away that memory. Sparsemem uses an array to provide different pfn_to_page() translations for each SECTION_SIZE area of physical memory. This is what allows the mem_map[] to be chopped up. In order to do quick pfn_to_page() operations, the section number of the page is encoded in page->flags. Part of the sparsemem infrastructure enables sharing of these bits more dynamically (at compile-time) between the page_zone() and sparsemem operations. However, on 32-bit architectures, the number of bits is quite limited, and may require growing the size of the page->flags type in certain conditions. Several things might force this to occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of memory), an increase in the physical address space, or an increase in the number of used page->flags. One thing to note is that, once sparsemem is present, the NUMA node information no longer needs to be stored in the page->flags. It might provide speed increases on certain platforms and will be stored there if there is room. But, if out of room, an alternate (theoretically slower) mechanism is used. This patch introduces CONFIG_FLATMEM. It is used in almost all cases where there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM often have to compile out the same areas of code. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Adrian Bunk Signed-off-by: Yasunori Goto Signed-off-by: Bob Picco Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 92 ++++++++++++++++++++++++++++++++++++++--------- include/linux/mmzone.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/numa.h | 2 +- 3 files changed, 172 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 57b2ead51dba..6eb7f48317f8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -397,40 +397,80 @@ static inline void put_page(struct page *page) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | NODE | ZONE | ... | FLAGS | */ -#define NODES_PGOFF ((sizeof(page_flags_t)*8) - NODES_SHIFT) -#define ZONES_PGOFF (NODES_PGOFF - ZONES_SHIFT) + +/* + * page->flags layout: + * + * There are three possibilities for how page->flags get + * laid out. The first is for the normal case, without + * sparsemem. The second is for sparsemem when there is + * plenty of space for node and section. The last is when + * we have run out of space and have to fall back to an + * alternate (slower) way of determining the node. + * + * No sparsemem: | NODE | ZONE | ... | FLAGS | + * with space for node: | SECTION | NODE | ZONE | ... | FLAGS | + * no space for node: | SECTION | ZONE | ... | FLAGS | + */ +#ifdef CONFIG_SPARSEMEM +#define SECTIONS_WIDTH SECTIONS_SHIFT +#else +#define SECTIONS_WIDTH 0 +#endif + +#define ZONES_WIDTH ZONES_SHIFT + +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED +#define NODES_WIDTH NODES_SHIFT +#else +#define NODES_WIDTH 0 +#endif + +/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */ +#define SECTIONS_PGOFF ((sizeof(page_flags_t)*8) - SECTIONS_WIDTH) +#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) +#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) + +/* + * We are going to use the flags for the page to node mapping if its in + * there. This includes the case where there is no node, so it is implicit. + */ +#define FLAGS_HAS_NODE (NODES_WIDTH > 0 || NODES_SHIFT == 0) + +#ifndef PFN_SECTION_SHIFT +#define PFN_SECTION_SHIFT 0 +#endif /* * Define the bit shifts to access each section. For non-existant * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ -#define NODES_PGSHIFT (NODES_PGOFF * (NODES_SHIFT != 0)) -#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_SHIFT != 0)) +#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) +#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) +#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -/* NODE:ZONE is used to lookup the zone from a page. */ +/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */ +#if FLAGS_HAS_NODE #define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#else +#define ZONETABLE_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) +#endif #define ZONETABLE_PGSHIFT ZONES_PGSHIFT -#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED -#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED +#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED #endif -#define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) - -#define ZONES_MASK ((1UL << ZONES_SHIFT) - 1) -#define NODES_MASK ((1UL << NODES_SHIFT) - 1) +#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) +#define NODES_MASK ((1UL << NODES_WIDTH) - 1) +#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) static inline unsigned long page_zonenum(struct page *page) { return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } -static inline unsigned long page_to_nid(struct page *page) -{ - return (page->flags >> NODES_PGSHIFT) & NODES_MASK; -} struct zone; extern struct zone *zone_table[]; @@ -441,6 +481,18 @@ static inline struct zone *page_zone(struct page *page) ZONETABLE_MASK]; } +static inline unsigned long page_to_nid(struct page *page) +{ + if (FLAGS_HAS_NODE) + return (page->flags >> NODES_PGSHIFT) & NODES_MASK; + else + return page_zone(page)->zone_pgdat->node_id; +} +static inline unsigned long page_to_section(struct page *page) +{ + return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; +} + static inline void set_page_zone(struct page *page, unsigned long zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); @@ -451,12 +503,18 @@ static inline void set_page_node(struct page *page, unsigned long node) page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } +static inline void set_page_section(struct page *page, unsigned long section) +{ + page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); + page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; +} static inline void set_page_links(struct page *page, unsigned long zone, - unsigned long node) + unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); + set_page_section(page, pfn_to_section_nr(pfn)); } #ifndef CONFIG_DISCONTIGMEM diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6ef07de98d69..19860d317ec2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -269,7 +269,9 @@ typedef struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; struct zonelist node_zonelists[GFP_ZONETYPES]; int nr_zones; +#ifdef CONFIG_FLAT_NODE_MEM_MAP struct page *node_mem_map; +#endif struct bootmem_data *bdata; unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ @@ -284,7 +286,11 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) +#ifdef CONFIG_FLAT_NODE_MEM_MAP #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) +#else +#define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) +#endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) extern struct pglist_data *pgdat_list; @@ -416,6 +422,10 @@ extern struct pglist_data contig_page_data; #endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#ifdef CONFIG_SPARSEMEM +#include +#endif + #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* * with 32 bit page->flags field, we reserve 8 bits for node/zone info. @@ -439,6 +449,92 @@ extern struct pglist_data contig_page_data; #define early_pfn_to_nid(nid) (0UL) #endif +#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) +#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) + +#ifdef CONFIG_SPARSEMEM + +/* + * SECTION_SHIFT #bits space required to store a section # + * + * PA_SECTION_SHIFT physical address to/from section number + * PFN_SECTION_SHIFT pfn to/from section number + */ +#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) + +#define PA_SECTION_SHIFT (SECTION_SIZE_BITS) +#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) + +#define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) + +#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) +#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) + +#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS +#error Allocator MAX_ORDER exceeds SECTION_SIZE +#endif + +struct page; +struct mem_section { + struct page *section_mem_map; +}; + +extern struct mem_section mem_section[NR_MEM_SECTIONS]; + +/* + * Given a kernel address, find the home node of the underlying memory. + */ +#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) + +static inline struct mem_section *__pfn_to_section(unsigned long pfn) +{ + return &mem_section[pfn_to_section_nr(pfn)]; +} + +#define pfn_to_page(pfn) \ +({ \ + unsigned long __pfn = (pfn); \ + __pfn_to_section(__pfn)->section_mem_map + __pfn; \ +}) +#define page_to_pfn(page) \ +({ \ + page - mem_section[page_to_section(page)].section_mem_map; \ +}) + +static inline int pfn_valid(unsigned long pfn) +{ + if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) + return 0; + return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0; +} + +/* + * These are _only_ used during initialisation, therefore they + * can use __initdata ... They could have names to indicate + * this restriction. + */ +#ifdef CONFIG_NUMA +#define pfn_to_nid early_pfn_to_nid +#endif + +#define pfn_to_pgdat(pfn) \ +({ \ + NODE_DATA(pfn_to_nid(pfn)); \ +}) + +#define early_pfn_valid(pfn) pfn_valid(pfn) +void sparse_init(void); +#else +#define sparse_init() do {} while (0) +#endif /* CONFIG_SPARSEMEM */ + +#ifndef early_pfn_valid +#define early_pfn_valid(pfn) (1) +#endif + +void memory_present(int nid, unsigned long start, unsigned long end); +unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/include/linux/numa.h b/include/linux/numa.h index bd0c8c4e9a95..f0c539bd3cfc 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -3,7 +3,7 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#ifndef CONFIG_FLATMEM #include #endif -- cgit v1.2.3 From 05b79bdcb48c18cd9b580c39e3efb9a1ab078151 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:57 -0700 Subject: [PATCH] sparsemem memory model for i386 Provide the architecture specific implementation for SPARSEMEM for i386 SMP and NUMA systems. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/mmzone.h | 89 ++++++++++++++++++++++++-------------------- include/asm-i386/page.h | 4 +- include/asm-i386/pgtable.h | 4 +- include/asm-i386/sparsemem.h | 31 +++++++++++++++ 4 files changed, 83 insertions(+), 45 deletions(-) create mode 100644 include/asm-i386/sparsemem.h (limited to 'include') diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index 48e46d403aa6..33ce5d37e894 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -8,7 +8,9 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#if CONFIG_NUMA +extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[nid]) #ifdef CONFIG_NUMA #ifdef CONFIG_X86_NUMAQ @@ -21,8 +23,28 @@ #define get_zholes_size(n) (0) #endif /* CONFIG_NUMA */ -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[nid]) +extern int get_memcfg_numa_flat(void ); +/* + * This allows any one NUMA architecture to be compiled + * for, and still fall back to the flat function if it + * fails. + */ +static inline void get_memcfg_numa(void) +{ +#ifdef CONFIG_X86_NUMAQ + if (get_memcfg_numaq()) + return; +#elif CONFIG_ACPI_SRAT + if (get_memcfg_from_srat()) + return; +#endif + + get_memcfg_numa_flat(); +} + +#endif /* CONFIG_NUMA */ + +#ifdef CONFIG_DISCONTIGMEM /* * generic node memory support, the following assumptions apply: @@ -48,26 +70,6 @@ static inline int pfn_to_nid(unsigned long pfn) #endif } -/* - * Following are macros that are specific to this numa platform. - */ -#define reserve_bootmem(addr, size) \ - reserve_bootmem_node(NODE_DATA(0), (addr), (size)) -#define alloc_bootmem(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) -#define alloc_bootmem_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#define alloc_bootmem_node(ignore, x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_pages_node(ignore, x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages_node(ignore, x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) - #define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn) /* @@ -121,28 +123,33 @@ static inline int pfn_valid(int pfn) return (pfn < node_end_pfn(nid)); return 0; } -#endif +#endif /* CONFIG_X86_NUMAQ */ + +#endif /* CONFIG_DISCONTIGMEM */ + +#ifdef CONFIG_NEED_MULTIPLE_NODES -extern int get_memcfg_numa_flat(void ); /* - * This allows any one NUMA architecture to be compiled - * for, and still fall back to the flat function if it - * fails. + * Following are macros that are specific to this numa platform. */ -static inline void get_memcfg_numa(void) -{ -#ifdef CONFIG_X86_NUMAQ - if (get_memcfg_numaq()) - return; -#elif CONFIG_ACPI_SRAT - if (get_memcfg_from_srat()) - return; -#endif - - get_memcfg_numa_flat(); -} +#define reserve_bootmem(addr, size) \ + reserve_bootmem_node(NODE_DATA(0), (addr), (size)) +#define alloc_bootmem(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) +#define alloc_bootmem_pages(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low_pages(x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) +#define alloc_bootmem_node(ignore, x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_pages_node(ignore, x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_low_pages_node(ignore, x) \ + __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#endif /* CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_NEED_MULTIPLE_NODES */ extern int early_pfn_to_nid(unsigned long pfn); diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 8f3dded01bff..dea8f8e6d86e 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -137,11 +137,11 @@ extern int page_is_ram(unsigned long pagenr); #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_FLATMEM */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index e9efe148fdf7..77c6497f416e 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -398,9 +398,9 @@ extern void noexec_setup(const char *str); #endif /* !__ASSEMBLY__ */ -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM #define kern_addr_valid(addr) (1) -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_FLATMEM */ #define io_remap_page_range(vma, vaddr, paddr, size, prot) \ remap_pfn_range(vma, vaddr, (paddr) >> PAGE_SHIFT, size, prot) diff --git a/include/asm-i386/sparsemem.h b/include/asm-i386/sparsemem.h new file mode 100644 index 000000000000..cfeed990585f --- /dev/null +++ b/include/asm-i386/sparsemem.h @@ -0,0 +1,31 @@ +#ifndef _I386_SPARSEMEM_H +#define _I386_SPARSEMEM_H +#ifdef CONFIG_SPARSEMEM + +/* + * generic non-linear memory support: + * + * 1) we will not split memory into more chunks than will fit into the + * flags field of the struct page + */ + +/* + * SECTION_SIZE_BITS 2^N: how big each section will be + * MAX_PHYSADDR_BITS 2^N: how much physical address space we have + * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space + */ +#ifdef CONFIG_X86_PAE +#define SECTION_SIZE_BITS 30 +#define MAX_PHYSADDR_BITS 36 +#define MAX_PHYSMEM_BITS 36 +#else +#define SECTION_SIZE_BITS 26 +#define MAX_PHYSADDR_BITS 32 +#define MAX_PHYSMEM_BITS 32 +#endif + +/* XXX: FIXME -- wli */ +#define kern_addr_valid(kaddr) (0) + +#endif /* CONFIG_SPARSEMEM */ +#endif /* _I386_SPARSEMEM_H */ -- cgit v1.2.3 From 641c767389b19859a45e6de46d8e18cd935bdb60 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:59 -0700 Subject: [PATCH] sparsemem swiss cheese numa layouts The part of the sparsemem patch which modifies memmap_init_zone() has recently become a problem. It changes behavior so that there is a call to pfn_to_page() for each individual page inside of a node's range: node_start_pfn through node_end_pfn. It used to simply do this once, at the beginning of the node, but having sparsemem's non-contiguous mem_map[]s inside of a node made it necessary to change. Mike Kravetz recently wrote a patch which made the NUMA code accept some new kinds of layouts. The system's memory was laid out like this, with node 0's memory in two pieces: one before and one after node 1's memory: Node 0: +++++ +++++ Node 1: +++++ Previous behavior before Mike's patch was to assign nodes like this: Node 0: 00000 XXXXX Node 1: 11111 Where the 'X' areas were simply thrown away. The new behavior was to make the pg_data_t span node 0 across all of its areas, including areas that are really node 1's: Node 0: 000000000000000 Node 1: 11111 This wastes a little bit of mem_map space, but ends up being OK, and more fully utilizes the system's memory. memmap_init_zone() initializes all of the "struct page"s for node 0, even for the "hole", but those never get used, because there is no pfn_to_page() that resolves to those pages. However, only calling pfn_to_page() once, memmap_init_zone() always uses the pages that were allocated for node0->node_mem_map because: struct page *start = pfn_to_page(start_pfn); // effectively start = &node->node_mem_map[0] for (page = start; page < (start + size); page++) { init_page_here();... page++; } Slow, and wasteful, but generally harmless. But, modify that to call pfn_to_page() for each loop iteration (like sparsemem does): for (pfn = start_pfn; pfn < < (start_pfn + size); pfn++++) { page = pfn_to_page(pfn); } And you end up trying to initialize node 1's pages too early, along with bogus data from node 0. This patch checks for those weird layouts and declines to touch the pages, making the more frequent pfn_to_page() calls OK to do. Signed-off-by: Dave Hansen Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 19860d317ec2..746b57e3d370 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -528,6 +528,12 @@ void sparse_init(void); #define sparse_init() do {} while (0) #endif /* CONFIG_SPARSEMEM */ +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +#define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) +#else +#define early_pfn_in_nid(pfn, nid) (1) +#endif + #ifndef early_pfn_valid #define early_pfn_valid(pfn) (1) #endif -- cgit v1.2.3 From 29751f6991e845f7d002a6ae520bf996b38c8dcd Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:08:00 -0700 Subject: [PATCH] sparsemem hotplug base Make sparse's initalization be accessible at runtime. This allows sparse mappings to be created after boot in a hotplug situation. This patch is separated from the previous one just to give an indication how much of the sparse infrastructure is *just* for hotplug memory. The section_mem_map doesn't really store a pointer. It stores something that is convenient to do some math against to get a pointer. It isn't valid to just do *section_mem_map, so I don't think it should be stored as a pointer. There are a couple of things I'd like to store about a section. First of all, the fact that it is !NULL does not mean that it is present. There could be such a combination where section_mem_map *is* NULL, but the math gets you properly to a real mem_map. So, I don't think that check is safe. Since we're storing 32-bit-aligned structures, we have a few bits in the bottom of the pointer to play with. Use one bit to encode whether there's really a mem_map there, and the other one to tell whether there's a valid section there. We need to distinguish between the two because sometimes there's a gap between when a section is discovered to be present and when we can get the mem_map for it. Signed-off-by: Dave Hansen Signed-off-by: Andy Whitcroft Signed-off-by: Jack Steiner Signed-off-by: Bob Picco Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 56 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 746b57e3d370..6c90461ed99f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -476,11 +476,56 @@ extern struct pglist_data contig_page_data; struct page; struct mem_section { - struct page *section_mem_map; + /* + * This is, logically, a pointer to an array of struct + * pages. However, it is stored with some other magic. + * (see sparse.c::sparse_init_one_section()) + * + * Making it a UL at least makes someone do a cast + * before using it wrong. + */ + unsigned long section_mem_map; }; extern struct mem_section mem_section[NR_MEM_SECTIONS]; +static inline struct mem_section *__nr_to_section(unsigned long nr) +{ + return &mem_section[nr]; +} + +/* + * We use the lower bits of the mem_map pointer to store + * a little bit of information. There should be at least + * 3 bits here due to 32-bit alignment. + */ +#define SECTION_MARKED_PRESENT (1UL<<0) +#define SECTION_HAS_MEM_MAP (1UL<<1) +#define SECTION_MAP_LAST_BIT (1UL<<2) +#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) + +static inline struct page *__section_mem_map_addr(struct mem_section *section) +{ + unsigned long map = section->section_mem_map; + map &= SECTION_MAP_MASK; + return (struct page *)map; +} + +static inline int valid_section(struct mem_section *section) +{ + return (section->section_mem_map & SECTION_MARKED_PRESENT); +} + +static inline int section_has_mem_map(struct mem_section *section) +{ + return (section->section_mem_map & SECTION_HAS_MEM_MAP); +} + +static inline int valid_section_nr(unsigned long nr) +{ + return valid_section(__nr_to_section(nr)); +} + /* * Given a kernel address, find the home node of the underlying memory. */ @@ -488,24 +533,25 @@ extern struct mem_section mem_section[NR_MEM_SECTIONS]; static inline struct mem_section *__pfn_to_section(unsigned long pfn) { - return &mem_section[pfn_to_section_nr(pfn)]; + return __nr_to_section(pfn_to_section_nr(pfn)); } #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = (pfn); \ - __pfn_to_section(__pfn)->section_mem_map + __pfn; \ + __section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn; \ }) #define page_to_pfn(page) \ ({ \ - page - mem_section[page_to_section(page)].section_mem_map; \ + page - __section_mem_map_addr(__nr_to_section( \ + page_to_section(page))); \ }) static inline int pfn_valid(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0; + return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); } /* -- cgit v1.2.3 From 510f8fa7ba18320d408dd3093663e58f5664f2f0 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:08:01 -0700 Subject: [PATCH] ppc64: add early_pfn_to_nid Provide an implementation of early_pfn_to_nid for PPC64. This is used by memory models to determine the node from which to take allocations before the memory allocators are fully initialised. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ppc64/mmzone.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h index cbfc5ecfe875..ecc4b07507d6 100644 --- a/include/asm-ppc64/mmzone.h +++ b/include/asm-ppc64/mmzone.h @@ -90,4 +90,9 @@ static inline int pa_to_nid(unsigned long pa) #define discontigmem_pfn_valid(pfn) ((pfn) < num_physpages) #endif /* CONFIG_DISCONTIGMEM */ + +#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +#define early_pfn_to_nid(pfn) pa_to_nid(((unsigned long)pfn) << PAGE_SHIFT) +#endif + #endif /* _ASM_MMZONE_H_ */ -- cgit v1.2.3 From 145e664231648121026d470094c200851a446a73 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:08:03 -0700 Subject: [PATCH] ppc64: sparsemem memory model Provide the architecture specific implementation for SPARSEMEM for PPC64 systems. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Mike Kravetz (in part) Signed-off-by: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ppc64/mmzone.h | 36 +++++++++++++++++++++++------------- include/asm-ppc64/page.h | 3 ++- include/asm-ppc64/sparsemem.h | 16 ++++++++++++++++ 3 files changed, 41 insertions(+), 14 deletions(-) create mode 100644 include/asm-ppc64/sparsemem.h (limited to 'include') diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h index ecc4b07507d6..ed473f4b0152 100644 --- a/include/asm-ppc64/mmzone.h +++ b/include/asm-ppc64/mmzone.h @@ -10,9 +10,20 @@ #include #include -#ifdef CONFIG_DISCONTIGMEM +/* generic non-linear memory support: + * + * 1) we will not split memory into more chunks than will fit into the + * flags field of the struct page + */ + + +#ifdef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data *node_data[]; +/* + * Return a pointer to the node data for node n. + */ +#define NODE_DATA(nid) (node_data[nid]) /* * Following are specific to this numa platform. @@ -47,30 +58,27 @@ static inline int pa_to_nid(unsigned long pa) return nid; } -#define pfn_to_nid(pfn) pa_to_nid((pfn) << PAGE_SHIFT) - -/* - * Return a pointer to the node data for node n. - */ -#define NODE_DATA(nid) (node_data[nid]) - #define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) /* * Following are macros that each numa implmentation must define. */ -/* - * Given a kernel address, find the home node of the underlying memory. - */ -#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) - #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) #define local_mapnr(kvaddr) \ ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) +#ifdef CONFIG_DISCONTIGMEM + +/* + * Given a kernel address, find the home node of the underlying memory. + */ +#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) + +#define pfn_to_nid(pfn) pa_to_nid((unsigned long)(pfn) << PAGE_SHIFT) + /* Written this way to avoid evaluating arguments twice */ #define discontigmem_pfn_to_page(pfn) \ ({ \ @@ -91,6 +99,8 @@ static inline int pa_to_nid(unsigned long pa) #endif /* CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_NEED_MULTIPLE_NODES */ + #ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID #define early_pfn_to_nid(pfn) pa_to_nid(((unsigned long)pfn) << PAGE_SHIFT) #endif diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h index 257d87eb7c34..a5893a305a09 100644 --- a/include/asm-ppc64/page.h +++ b/include/asm-ppc64/page.h @@ -217,7 +217,8 @@ extern u64 ppc64_pft_size; /* Log 2 of page table size */ #define page_to_pfn(page) discontigmem_page_to_pfn(page) #define pfn_to_page(pfn) discontigmem_pfn_to_page(pfn) #define pfn_valid(pfn) discontigmem_pfn_valid(pfn) -#else +#endif +#ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) diff --git a/include/asm-ppc64/sparsemem.h b/include/asm-ppc64/sparsemem.h new file mode 100644 index 000000000000..c5bd47e57f17 --- /dev/null +++ b/include/asm-ppc64/sparsemem.h @@ -0,0 +1,16 @@ +#ifndef _ASM_PPC64_SPARSEMEM_H +#define _ASM_PPC64_SPARSEMEM_H 1 + +#ifdef CONFIG_SPARSEMEM +/* + * SECTION_SIZE_BITS 2^N: how big each section will be + * MAX_PHYSADDR_BITS 2^N: how much physical address space we have + * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space + */ +#define SECTION_SIZE_BITS 24 +#define MAX_PHYSADDR_BITS 38 +#define MAX_PHYSMEM_BITS 36 + +#endif /* CONFIG_SPARSEMEM */ + +#endif /* _ASM_PPC64_SPARSEMEM_H */ -- cgit v1.2.3 From 2b97690f4cd960779fb351b7cd9974390afabb36 Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Thu, 23 Jun 2005 00:08:06 -0700 Subject: [PATCH] reorganize x86-64 NUMA and DISCONTIGMEM config options In order to use the alternative sparsemem implmentation for NUMA kernels, we need to reorganize the config options. This patch effectively abstracts out the CONFIG_DISCONTIGMEM options to CONFIG_NUMA in most cases. Thus, the discontigmem implementation may be employed as always, but the sparsemem implementation may be used alternatively. Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-x86_64/io.h | 5 ----- include/asm-x86_64/mmzone.h | 15 +++++++++------ include/asm-x86_64/page.h | 4 +++- include/asm-x86_64/topology.h | 4 +--- 4 files changed, 13 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h index 94202703fae2..37fc3f149a5a 100644 --- a/include/asm-x86_64/io.h +++ b/include/asm-x86_64/io.h @@ -124,12 +124,7 @@ extern inline void * phys_to_virt(unsigned long address) /* * Change "struct page" to physical address. */ -#ifdef CONFIG_DISCONTIGMEM -#include #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) -#else -#define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT) -#endif #include diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index ca4fc3fe0dee..768413751b34 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -6,7 +6,7 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA #define VIRTUAL_BUG_ON(x) @@ -30,17 +30,16 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) return nid; } -#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT) - -#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) #define NODE_DATA(nid) (node_data[nid]) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) -#define local_mapnr(kvaddr) \ - ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) +#ifdef CONFIG_DISCONTIGMEM + +#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT) +#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) /* AK: this currently doesn't deal with invalid addresses. We'll see if the 2.5 kernel doesn't pass them @@ -57,4 +56,8 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) ({ u8 nid__ = pfn_to_nid(pfn); \ nid__ != 0xff && (pfn) >= node_start_pfn(nid__) && (pfn) <= node_end_pfn(nid__); })) #endif + +#define local_mapnr(kvaddr) \ + ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) +#endif #endif diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index 9ce338c3a71e..60130f4ca986 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -119,7 +119,9 @@ extern __inline__ int get_order(unsigned long size) __pa(v); }) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#ifndef CONFIG_DISCONTIGMEM +#define __boot_va(x) __va(x) +#define __boot_pa(x) __pa(x) +#ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) #define pfn_valid(pfn) ((pfn) < max_mapnr) diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index 67f24e0ea819..da21573ec731 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -3,7 +3,7 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA #include #include @@ -37,7 +37,6 @@ static inline cpumask_t __pcibus_to_cpumask(int bus) } #define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus->number) -#ifdef CONFIG_NUMA /* sched_domains SD_NODE_INIT for x86_64 machines */ #define SD_NODE_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ @@ -59,7 +58,6 @@ static inline cpumask_t __pcibus_to_cpumask(int bus) .balance_interval = 1, \ .nr_balance_failed = 0, \ } -#endif #endif -- cgit v1.2.3 From bbfceef47fb9467424113a004070bf37a806a97c Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Thu, 23 Jun 2005 00:08:07 -0700 Subject: [PATCH] add x86-64 specific support for sparsemem This patch adds in the necessary support for sparsemem such that x86-64 kernels may use sparsemem as an alternative to discontigmem for NUMA kernels. Note that this does no preclude one from continuing to build NUMA kernels using discontigmem, but merely allows the option to build NUMA kernels with sparsemem. Interestingly, the use of sparsemem in lieu of discontigmem in NUMA kernels results in reduced text size for otherwise equivalent kernels as shown in the example builds below: text data bss dec hex filename 2371036 765884 1237108 4374028 42be0c vmlinux.discontig 2366549 776484 1302772 4445805 43d66d vmlinux.sparse Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-x86_64/bitops.h | 2 -- include/asm-x86_64/sparsemem.h | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 include/asm-x86_64/sparsemem.h (limited to 'include') diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h index 5dd7727c756b..a31bb99be53f 100644 --- a/include/asm-x86_64/bitops.h +++ b/include/asm-x86_64/bitops.h @@ -411,8 +411,6 @@ static __inline__ int ffs(int x) /* find last set bit */ #define fls(x) generic_fls(x) -#define ARCH_HAS_ATOMIC_UNSIGNED 1 - #endif /* __KERNEL__ */ #endif /* _X86_64_BITOPS_H */ diff --git a/include/asm-x86_64/sparsemem.h b/include/asm-x86_64/sparsemem.h new file mode 100644 index 000000000000..dabb16714a71 --- /dev/null +++ b/include/asm-x86_64/sparsemem.h @@ -0,0 +1,26 @@ +#ifndef _ASM_X86_64_SPARSEMEM_H +#define _ASM_X86_64_SPARSEMEM_H 1 + +#ifdef CONFIG_SPARSEMEM + +/* + * generic non-linear memory support: + * + * 1) we will not split memory into more chunks than will fit into the flags + * field of the struct page + * + * SECTION_SIZE_BITS 2^n: size of each section + * MAX_PHYSADDR_BITS 2^n: max size of physical address space + * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space + * + */ + +#define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ +#define MAX_PHYSADDR_BITS 40 +#define MAX_PHYSMEM_BITS 40 + +extern int early_pfn_to_nid(unsigned long pfn); + +#endif /* CONFIG_SPARSEMEM */ + +#endif /* _ASM_X86_64_SPARSEMEM_H */ -- cgit v1.2.3 From 8a9e1b0f564615bd92ba50162623e25c2904e564 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 23 Jun 2005 00:08:13 -0700 Subject: [PATCH] Platform SMIs and their interferance with tsc based delay calibration Issue: Current tsc based delay_calibration can result in significant errors in loops_per_jiffy count when the platform events like SMIs (System Management Interrupts that are non-maskable) are present. This could lead to potential kernel panic(). This issue is becoming more visible with 2.6 kernel (as default HZ is 1000) and on platforms with higher SMI handling latencies. During the boot time, SMIs are mostly used by BIOS (for things like legacy keyboard emulation). Description: The psuedocode for current delay calibration with tsc based delay looks like (0) Estimate a value for loops_per_jiffy (1) While (loops_per_jiffy estimate is accurate enough) (2) wait for jiffy transition (jiffy1) (3) Note down current tsc (tsc1) (4) loop until tsc becomes tsc1 + loops_per_jiffy (5) check whether jiffy changed since jiffy1 or not and refine loops_per_jiffy estimate Consider the following cases Case 1: If SMIs happen between (2) and (3) above, we can end up with a loops_per_jiffy value that is too low. This results in shorted delays and kernel can panic () during boot (Mostly at IOAPIC timer initialization timer_irq_works() as we don't have enough timer interrupts in a specified interval). Case 2: If SMIs happen between (3) and (4) above, then we can end up with a loops_per_jiffy value that is too high. And with current i386 code, too high lpj value (greater than 17M) can result in a overflow in delay.c:__const_udelay() again resulting in shorter delay and panic(). Solution: The patch below makes the calibration routine aware of asynchronous events like SMIs. We increase the delay calibration time and also identify any significant errors (greater than 12.5%) in the calibration and notify it to user. Patch below changes both i386 and x86-64 architectures to use this new and improved calibrate_delay_direct() routine. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/timer.h | 2 ++ include/asm-i386/timex.h | 3 +++ include/asm-x86_64/timex.h | 3 +++ 3 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h index c34709849839..dcf1e07db08a 100644 --- a/include/asm-i386/timer.h +++ b/include/asm-i386/timer.h @@ -22,6 +22,7 @@ struct timer_opts { unsigned long (*get_offset)(void); unsigned long long (*monotonic_clock)(void); void (*delay)(unsigned long); + unsigned long (*read_timer)(void); }; struct init_timer_opts { @@ -52,6 +53,7 @@ extern struct init_timer_opts timer_cyclone_init; #endif extern unsigned long calibrate_tsc(void); +extern unsigned long read_timer_tsc(void); extern void init_cpu_khz(void); extern int recalibrate_cpu_khz(void); #ifdef CONFIG_HPET_TIMER diff --git a/include/asm-i386/timex.h b/include/asm-i386/timex.h index b41e484c3445..e370f907bd39 100644 --- a/include/asm-i386/timex.h +++ b/include/asm-i386/timex.h @@ -49,4 +49,7 @@ static inline cycles_t get_cycles (void) extern unsigned long cpu_khz; +extern int read_current_timer(unsigned long *timer_value); +#define ARCH_HAS_READ_CURRENT_TIMER 1 + #endif diff --git a/include/asm-x86_64/timex.h b/include/asm-x86_64/timex.h index 34f31a18f90b..24ecf6a637cb 100644 --- a/include/asm-x86_64/timex.h +++ b/include/asm-x86_64/timex.h @@ -26,6 +26,9 @@ static inline cycles_t get_cycles (void) extern unsigned int cpu_khz; +extern int read_current_timer(unsigned long *timer_value); +#define ARCH_HAS_READ_CURRENT_TIMER 1 + extern struct vxtime_data vxtime; #endif -- cgit v1.2.3 From 4a35293667f8691b45fdb72770f087c086fb9702 Mon Sep 17 00:00:00 2001 From: Hirokazu Takata Date: Thu, 23 Jun 2005 00:08:14 -0700 Subject: [PATCH] m32r: build fix for asm-m32r/topology.h Use asm-generic/topology.h to fix yet another pcibus_to_node() build error. Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-m32r/topology.h | 44 +------------------------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) (limited to 'include') diff --git a/include/asm-m32r/topology.h b/include/asm-m32r/topology.h index 299a89d91bde..d607eb32bd7e 100644 --- a/include/asm-m32r/topology.h +++ b/include/asm-m32r/topology.h @@ -1,48 +1,6 @@ -/* - * linux/include/asm-generic/topology.h - * - * Written by: Matthew Dobson, IBM Corporation - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to - */ #ifndef _ASM_M32R_TOPOLOGY_H #define _ASM_M32R_TOPOLOGY_H -/* Other architectures wishing to use this simple topology API should fill - in the below functions as appropriate in their own file. */ - -#define cpu_to_node(cpu) (0) - -#ifndef parent_node -#define parent_node(node) (0) -#endif -#ifndef node_to_cpumask -#define node_to_cpumask(node) (cpu_online_map) -#endif -#ifndef node_to_first_cpu -#define node_to_first_cpu(node) (0) -#endif -#ifndef pcibus_to_cpumask -#define pcibus_to_cpumask(bus) (cpu_online_map) -#endif +#include #endif /* _ASM_M32R_TOPOLOGY_H */ -- cgit v1.2.3 From e164f5573bef0e6caf53519719cf0228c9c15ce3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:17 -0700 Subject: [PATCH] ppc64: pcibus_to_node fix asm-generic/topology.h must also be included if CONFIG_NUMA is set in order to provide the fall back pcibus_to_node function. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ppc64/topology.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-ppc64/topology.h b/include/asm-ppc64/topology.h index d58d9dd79998..fcdcfd26a26b 100644 --- a/include/asm-ppc64/topology.h +++ b/include/asm-ppc64/topology.h @@ -59,10 +59,8 @@ static inline int node_to_first_cpu(int node) .nr_balance_failed = 0, \ } -#else /* !CONFIG_NUMA */ +#endif /* CONFIG_NUMA */ #include -#endif /* CONFIG_NUMA */ - #endif /* _ASM_PPC64_TOPOLOGY_H */ -- cgit v1.2.3 From 8c5a09082f4e61a176382e96a831a0636b918602 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:18 -0700 Subject: [PATCH] x86/x86_64: pcibus_to_node Define pcibus_to_node to be able to figure out which NUMA node contains a given PCI device. This defines pcibus_to_node(bus) in include/linux/topology.h and adjusts the macros for i386 and x86_64 that already provided a way to determine the cpumask of a pci device. x86_64 was changed to not build an array of cpumasks anymore. Instead an array of nodes is build which can be used to generate the cpumask via node_to_cpumask. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/topology.h | 9 ++++++++- include/asm-i386/topology.h | 8 ++------ include/asm-x86_64/topology.h | 14 +++----------- 3 files changed, 13 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index ec96e8b0f190..5d9d70cd17fc 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -41,8 +41,15 @@ #ifndef node_to_first_cpu #define node_to_first_cpu(node) (0) #endif +#ifndef pcibus_to_node +#define pcibus_to_node(node) (-1) +#endif + #ifndef pcibus_to_cpumask -#define pcibus_to_cpumask(bus) (cpu_online_map) +#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \ + CPU_MASK_ALL : \ + node_to_cpumask(pcibus_to_node(bus)) \ + ) #endif #endif /* _ASM_GENERIC_TOPOLOGY_H */ diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h index 98f9e6850cba..6d0f67507b21 100644 --- a/include/asm-i386/topology.h +++ b/include/asm-i386/topology.h @@ -60,12 +60,8 @@ static inline int node_to_first_cpu(int node) return first_cpu(mask); } -/* Returns the number of the node containing PCI bus number 'busnr' */ -static inline cpumask_t __pcibus_to_cpumask(int busnr) -{ - return node_to_cpumask(mp_bus_id_to_node[busnr]); -} -#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus->number) +#define pcibus_to_node(bus) mp_bus_id_to_node[(bus)->number] +#define pcibus_to_cpumask(bus) node_to_cpumask(pcibus_to_node(bus)) /* sched_domains SD_NODE_INIT for NUMAQ machines */ #define SD_NODE_INIT (struct sched_domain) { \ diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index da21573ec731..8f77e9f6bc23 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -13,8 +13,8 @@ extern cpumask_t cpu_online_map; extern unsigned char cpu_to_node[]; +extern unsigned char pci_bus_to_node[]; extern cpumask_t node_to_cpumask[]; -extern cpumask_t pci_bus_to_cpumask[]; #ifdef CONFIG_ACPI_NUMA extern int __node_distance(int, int); @@ -26,16 +26,8 @@ extern int __node_distance(int, int); #define parent_node(node) (node) #define node_to_first_cpu(node) (__ffs(node_to_cpumask[node])) #define node_to_cpumask(node) (node_to_cpumask[node]) - -static inline cpumask_t __pcibus_to_cpumask(int bus) -{ - cpumask_t busmask = pci_bus_to_cpumask[bus]; - cpumask_t online = cpu_online_map; - cpumask_t res; - cpus_and(res, busmask, online); - return res; -} -#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus->number) +#define pcibus_to_node(bus) pci_bus_to_node[(bus)->number] +#define pcibus_to_cpumask(bus) node_to_cpumask(pcibus_to_node(bus)); /* sched_domains SD_NODE_INIT for x86_64 machines */ #define SD_NODE_INIT (struct sched_domain) { \ -- cgit v1.2.3 From 1946089a109251655c5438d92c539bd2930e71ea Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:19 -0700 Subject: [PATCH] NUMA aware block device control structure allocation Patch to allocate the control structures for for ide devices on the node of the device itself (for NUMA systems). The patch depends on the Slab API change patch by Manfred and me (in mm) and the pcidev_to_node patch that I posted today. Does some realignment too. Signed-off-by: Justin M. Forbes Signed-off-by: Christoph Lameter Signed-off-by: Pravin Shelar Signed-off-by: Shobhit Dayal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 6 +++++- include/linux/genhd.h | 1 + include/linux/ide.h | 2 +- include/linux/mempool.h | 11 ++++++++--- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4a99b76c5a33..235c3414d268 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -396,6 +396,7 @@ struct request_queue */ unsigned int sg_timeout; unsigned int sg_reserved_size; + int node; struct list_head drain_list; @@ -615,6 +616,8 @@ static inline void blkdev_dequeue_request(struct request *req) /* * Access functions for manipulating queue properties */ +extern request_queue_t *blk_init_queue_node(request_fn_proc *rfn, + spinlock_t *lock, int node_id); extern request_queue_t *blk_init_queue(request_fn_proc *, spinlock_t *); extern void blk_cleanup_queue(request_queue_t *); extern void blk_queue_make_request(request_queue_t *, make_request_fn *); @@ -646,7 +649,8 @@ extern void blk_wait_queue_drained(request_queue_t *, int); extern void blk_finish_queue_drain(request_queue_t *); int blk_get_queue(request_queue_t *); -request_queue_t *blk_alloc_queue(int); +request_queue_t *blk_alloc_queue(int gfp_mask); +request_queue_t *blk_alloc_queue_node(int,int); #define blk_put_queue(q) blk_cleanup_queue((q)) /* diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 47dedaf971d6..af26dc718ef6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -403,6 +403,7 @@ extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); extern void add_partition(struct gendisk *, int, sector_t, sector_t); extern void delete_partition(struct gendisk *, int); +extern struct gendisk *alloc_disk_node(int minors, int node_id); extern struct gendisk *alloc_disk(int minors); extern struct kobject *get_disk(struct gendisk *disk); extern void put_disk(struct gendisk *disk); diff --git a/include/linux/ide.h b/include/linux/ide.h index 336d6e509f59..92129078d4f3 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -917,7 +917,7 @@ typedef struct hwif_s { unsigned dma; void (*led_act)(void *data, int rw); -} ide_hwif_t; +} ____cacheline_maxaligned_in_smp ide_hwif_t; /* * internal ide interrupt handler type diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 4a36edf1c974..796220ce47cc 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -20,9 +20,14 @@ typedef struct mempool_s { mempool_free_t *free; wait_queue_head_t wait; } mempool_t; -extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); -extern int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask); + +extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, int nid); + +extern int mempool_resize(mempool_t *pool, int new_min_nr, + unsigned int __nocast gfp_mask); extern void mempool_destroy(mempool_t *pool); extern void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask); extern void mempool_free(void *element, mempool_t *pool); -- cgit v1.2.3 From ca05fea6db5259c6d62e517c41d448a4249175f4 Mon Sep 17 00:00:00 2001 From: Natalie Protasevich Date: Thu, 23 Jun 2005 00:08:22 -0700 Subject: [PATCH] Do not enforce unique IO_APIC_ID check for xAPIC systems (i386) This patch is per Andi's request to remove NO_IOAPIC_CHECK from genapic and use heuristics to prevent unique I/O APIC ID check for systems that don't need it. The patch disables unique I/O APIC ID check for Xeon-based and other platforms that don't use serial APIC bus for interrupt delivery. Andi stated that AMD systems don't need unique IO_APIC_IDs either. Signed-off-by: Natalie Protasevich Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/genapic.h | 1 - include/asm-i386/mach-bigsmp/mach_apic.h | 2 -- include/asm-i386/mach-default/mach_apic.h | 2 -- include/asm-i386/mach-es7000/mach_apic.h | 2 -- include/asm-i386/mach-generic/mach_apic.h | 1 - include/asm-i386/mach-numaq/mach_apic.h | 2 -- include/asm-i386/mach-summit/mach_apic.h | 2 -- include/asm-i386/mach-visws/mach_apic.h | 2 -- 8 files changed, 14 deletions(-) (limited to 'include') diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h index fc813b2e8274..b3783a32abee 100644 --- a/include/asm-i386/genapic.h +++ b/include/asm-i386/genapic.h @@ -78,7 +78,6 @@ struct genapic { .int_delivery_mode = INT_DELIVERY_MODE, \ .int_dest_mode = INT_DEST_MODE, \ .no_balance_irq = NO_BALANCE_IRQ, \ - .no_ioapic_check = NO_IOAPIC_CHECK, \ .ESR_DISABLE = esr_disable, \ .apic_destination_logical = APIC_DEST_LOGICAL, \ APICFUNC(apic_id_registered), \ diff --git a/include/asm-i386/mach-bigsmp/mach_apic.h b/include/asm-i386/mach-bigsmp/mach_apic.h index 2339868270ef..ba936d4daedb 100644 --- a/include/asm-i386/mach-bigsmp/mach_apic.h +++ b/include/asm-i386/mach-bigsmp/mach_apic.h @@ -14,8 +14,6 @@ #define NO_BALANCE_IRQ (1) #define esr_disable (1) -#define NO_IOAPIC_CHECK (0) - static inline int apic_id_registered(void) { return (1); diff --git a/include/asm-i386/mach-default/mach_apic.h b/include/asm-i386/mach-default/mach_apic.h index 627f1cd084ba..3ef6292db780 100644 --- a/include/asm-i386/mach-default/mach_apic.h +++ b/include/asm-i386/mach-default/mach_apic.h @@ -19,8 +19,6 @@ static inline cpumask_t target_cpus(void) #define NO_BALANCE_IRQ (0) #define esr_disable (0) -#define NO_IOAPIC_CHECK (0) - #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ diff --git a/include/asm-i386/mach-es7000/mach_apic.h b/include/asm-i386/mach-es7000/mach_apic.h index ceab2c464b13..b5f3f0d0b2bc 100644 --- a/include/asm-i386/mach-es7000/mach_apic.h +++ b/include/asm-i386/mach-es7000/mach_apic.h @@ -38,8 +38,6 @@ static inline cpumask_t target_cpus(void) #define WAKE_SECONDARY_VIA_INIT #endif -#define NO_IOAPIC_CHECK (1) - static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { return 0; diff --git a/include/asm-i386/mach-generic/mach_apic.h b/include/asm-i386/mach-generic/mach_apic.h index ab36d02ebede..b13767a4e934 100644 --- a/include/asm-i386/mach-generic/mach_apic.h +++ b/include/asm-i386/mach-generic/mach_apic.h @@ -5,7 +5,6 @@ #define esr_disable (genapic->ESR_DISABLE) #define NO_BALANCE_IRQ (genapic->no_balance_irq) -#define NO_IOAPIC_CHECK (genapic->no_ioapic_check) #define INT_DELIVERY_MODE (genapic->int_delivery_mode) #define INT_DEST_MODE (genapic->int_dest_mode) #undef APIC_DEST_LOGICAL diff --git a/include/asm-i386/mach-numaq/mach_apic.h b/include/asm-i386/mach-numaq/mach_apic.h index e1a04494764a..9d158095da82 100644 --- a/include/asm-i386/mach-numaq/mach_apic.h +++ b/include/asm-i386/mach-numaq/mach_apic.h @@ -17,8 +17,6 @@ static inline cpumask_t target_cpus(void) #define NO_BALANCE_IRQ (1) #define esr_disable (1) -#define NO_IOAPIC_CHECK (0) - #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 0 /* physical delivery on LOCAL quad */ diff --git a/include/asm-i386/mach-summit/mach_apic.h b/include/asm-i386/mach-summit/mach_apic.h index 74e9cbc8c01b..3d6d12937e1f 100644 --- a/include/asm-i386/mach-summit/mach_apic.h +++ b/include/asm-i386/mach-summit/mach_apic.h @@ -7,8 +7,6 @@ #define esr_disable (1) #define NO_BALANCE_IRQ (0) -#define NO_IOAPIC_CHECK (1) /* Don't check I/O APIC ID for xAPIC */ - /* In clustered mode, the high nibble of APIC ID is a cluster number. * The low nibble is a 4-bit bitmap. */ #define XAPIC_DEST_CPUS_SHIFT 4 diff --git a/include/asm-i386/mach-visws/mach_apic.h b/include/asm-i386/mach-visws/mach_apic.h index 4e6cdfb8b091..de438c7147a8 100644 --- a/include/asm-i386/mach-visws/mach_apic.h +++ b/include/asm-i386/mach-visws/mach_apic.h @@ -9,8 +9,6 @@ #define no_balance_irq (0) #define esr_disable (0) -#define NO_IOAPIC_CHECK (0) - #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ -- cgit v1.2.3 From 59121003721a8fad11ee72e646fd9d3076b5679c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:25 -0700 Subject: [PATCH] i386: Selectable Frequency of the Timer Interrupt Make the timer frequency selectable. The timer interrupt may cause bus and memory contention in large NUMA systems since the interrupt occurs on each processor HZ times per second. Signed-off-by: Christoph Lameter Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/param.h | 4 +++- include/asm-x86_64/param.h | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-i386/param.h b/include/asm-i386/param.h index b6440526e42a..fa02e67ea86b 100644 --- a/include/asm-i386/param.h +++ b/include/asm-i386/param.h @@ -1,8 +1,10 @@ +#include + #ifndef _ASMi386_PARAM_H #define _ASMi386_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ +# define HZ CONFIG_HZ /* Internal kernel timer frequency */ # define USER_HZ 100 /* .. some user interfaces are in "ticks" */ # define CLOCKS_PER_SEC (USER_HZ) /* like times() */ #endif diff --git a/include/asm-x86_64/param.h b/include/asm-x86_64/param.h index b707f0568c9e..40b11937180d 100644 --- a/include/asm-x86_64/param.h +++ b/include/asm-x86_64/param.h @@ -1,9 +1,11 @@ +#include + #ifndef _ASMx86_64_PARAM_H #define _ASMx86_64_PARAM_H #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. some user interfaces are in "ticks */ +# define HZ CONFIG_HZ /* Internal kernel timer frequency */ +# define USER_HZ 100 /* .. some user interfaces are in "ticks */ #define CLOCKS_PER_SEC (USER_HZ) /* like times() */ #endif -- cgit v1.2.3 From b5d23e5b8c7ecd97d32f6ad7680d9909977580a7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:27 -0700 Subject: [PATCH] ia64: Selectable Timer Interrupt Frequency It allows a selectable timer interrupt frequency of 100, 250 and 1000 HZ. Reducing the timer frequency may have important performance benefits on large systems. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-ia64/param.h b/include/asm-ia64/param.h index 6c6b679b7a9e..5e1e0d2d7baf 100644 --- a/include/asm-ia64/param.h +++ b/include/asm-ia64/param.h @@ -27,7 +27,7 @@ */ # define HZ 32 # else -# define HZ 1024 +# define HZ CONFIG_HZ # endif # define USER_HZ HZ # define CLOCKS_PER_SEC HZ /* frequency at which times() counts */ -- cgit v1.2.3 From a9ed8817966dd723754a990f1003264a21b5747e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 23 Jun 2005 00:08:32 -0700 Subject: [PATCH] x86: #include asm/uaccess.h in asm/checksum.h csum_and_copy_to_user is static inline and uses VERIFY_WRITE. Patch allows to remove asm/uaccess.h from i386_ksyms.c without dependency surprises. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/checksum.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/asm-i386/checksum.h b/include/asm-i386/checksum.h index 641342002bcd..f949e44c2a35 100644 --- a/include/asm-i386/checksum.h +++ b/include/asm-i386/checksum.h @@ -3,6 +3,8 @@ #include +#include + /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) -- cgit v1.2.3 From a3a255e744dfa672e741dc24306491139d0de2d8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 Jun 2005 00:08:34 -0700 Subject: [PATCH] x86: cpu_khz type fix x86_64's cpu_khz is unsigned int and there is no reason why x86 needs to use unsigned long. So make cpu_khz unsigned int on x86 as well. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/timex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-i386/timex.h b/include/asm-i386/timex.h index e370f907bd39..292b5a68f627 100644 --- a/include/asm-i386/timex.h +++ b/include/asm-i386/timex.h @@ -47,7 +47,7 @@ static inline cycles_t get_cycles (void) return ret; } -extern unsigned long cpu_khz; +extern unsigned int cpu_khz; extern int read_current_timer(unsigned long *timer_value); #define ARCH_HAS_READ_CURRENT_TIMER 1 -- cgit v1.2.3 From 32ecd42b6f94d3ee320a22827b46bd19ccf924e5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 23 Jun 2005 00:08:38 -0700 Subject: [PATCH] eliminate duplicate rdpmc definition Eliminate duplicate definition of rdpmc in x86-64's mtrr.h. Signed-off-by: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-x86_64/msr.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h index 513e52c71821..bc700232728d 100644 --- a/include/asm-x86_64/msr.h +++ b/include/asm-x86_64/msr.h @@ -57,11 +57,6 @@ (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ } while(0) -#define rdpmc(counter,low,high) \ - __asm__ __volatile__("rdpmc" \ - : "=a" (low), "=d" (high) \ - : "c" (counter)) - #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) #define rdpmc(counter,low,high) \ -- cgit v1.2.3 From f5012310e35bd62fd39fce338ee44422c975ff3c Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Thu, 23 Jun 2005 00:08:42 -0700 Subject: [PATCH] xen: x86: add macro for debugreg Add 2 macros to set and get debugreg on x86. This is useful for Xen because it will need only to redefine each macro to a hypervisor call. Signed-off-by: Vincent Hanquez Cc: Ian Pratt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/processor.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 359bb0151742..c76c50e96225 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -501,12 +501,16 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa } while (0) /* - * This special macro can be used to load a debugging register + * These special macros can be used to get or set a debugging register */ -#define loaddebug(thread,register) \ - __asm__("movl %0,%%db" #register \ - : /* no output */ \ - :"r" ((thread)->debugreg[register])) +#define get_debugreg(var, register) \ + __asm__("movl %%db" #register ", %0" \ + :"=r" (var)) +#define set_debugreg(value, register) \ + __asm__("movl %0,%%db" #register \ + : /* no output */ \ + :"r" (value)) + /* Forward declaration, a strange C thing */ struct task_struct; -- cgit v1.2.3 From fa1e1bdf78d405f9905b8290ee9211e7a7bbc99b Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Thu, 23 Jun 2005 00:08:44 -0700 Subject: [PATCH] xen: x86: Rename usermode macro Rename user_mode to user_mode_vm and add a user_mode macro similar to the x86-64 one. This is useful for Xen because the linux xen kernel does not runs on the same priviledge that a vanilla linux kernel, and with this we just need to redefine user_mode(). Signed-off-by: Vincent Hanquez Cc: Ian Pratt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/ptrace.h | 3 ++- include/asm-x86_64/ptrace.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h index 8618914b3521..eef9f93870d4 100644 --- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -57,7 +57,8 @@ struct pt_regs { #ifdef __KERNEL__ struct task_struct; extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code); -#define user_mode(regs) ((VM_MASK & (regs)->eflags) || (3 & (regs)->xcs)) +#define user_mode(regs) (3 & (regs)->xcs) +#define user_mode_vm(regs) ((VM_MASK & (regs)->eflags) || user_mode(regs)) #define instruction_pointer(regs) ((regs)->eip) #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) extern unsigned long profile_pc(struct pt_regs *regs); diff --git a/include/asm-x86_64/ptrace.h b/include/asm-x86_64/ptrace.h index 5bbc8d3141c8..ca6f15ff61d4 100644 --- a/include/asm-x86_64/ptrace.h +++ b/include/asm-x86_64/ptrace.h @@ -82,6 +82,7 @@ struct pt_regs { #if defined(__KERNEL__) && !defined(__ASSEMBLY__) #define user_mode(regs) (!!((regs)->cs & 3)) +#define user_mode_vm(regs) user_mode(regs) #define instruction_pointer(regs) ((regs)->rip) extern unsigned long profile_pc(struct pt_regs *regs); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); -- cgit v1.2.3 From e9129e56e9ec50c0689eb4cf7a3ca132f1e776db Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Thu, 23 Jun 2005 00:08:46 -0700 Subject: [PATCH] xen: x86_64: Add macro for debugreg Add 2 macros to set and get debugreg on x86_64. This is useful for Xen because it will need only to redefine each macro to a hypervisor call. Signed-off-by: Vincent Hanquez Cc: Ian Pratt Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-x86_64/processor.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h index 8b55f139968f..106f666517bb 100644 --- a/include/asm-x86_64/processor.h +++ b/include/asm-x86_64/processor.h @@ -280,6 +280,14 @@ struct thread_struct { set_fs(USER_DS); \ } while(0) +#define get_debugreg(var, register) \ + __asm__("movq %%db" #register ", %0" \ + :"=r" (var)) +#define set_debugreg(value, register) \ + __asm__("movq %0,%%db" #register \ + : /* no output */ \ + :"r" (value)) + struct task_struct; struct mm_struct; -- cgit v1.2.3 From fa72b903f75e4f0f0b2c2feed093005167da4023 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Jun 2005 00:08:49 -0700 Subject: [PATCH] blk: remove blk_queue_tag->real_max_depth optimization blk_queue_tag->real_max_depth was used to optimize out unnecessary allocations/frees on tag resize. However, the whole thing was very broken - tag_map was never allocated to real_max_depth resulting in access beyond the end of the map, bits in [max_depth..real_max_depth] were set when initializing a map and copied when resizing resulting in pre-occupied tags. As the gain of the optimization is very small, well, almost nill, remove the whole thing. Signed-off-by: Tejun Heo Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 235c3414d268..8d7e2f4151d0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -294,7 +294,6 @@ struct blk_queue_tag { struct list_head busy_list; /* fifo list of busy tags */ int busy; /* current depth */ int max_depth; /* what we will send to device */ - int real_max_depth; /* what the array can hold */ atomic_t refcnt; /* map can be shared */ }; -- cgit v1.2.3 From f7d37d028dfba90b1b747f8ac685bf0959aeda8b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Jun 2005 00:08:50 -0700 Subject: [PATCH] blk: remove BLK_TAGS_{PER_LONG|MASK} Replace BLK_TAGS_PER_LONG with BITS_PER_LONG and remove unused BLK_TAGS_MASK. Signed-off-by: Tejun Heo Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8d7e2f4151d0..60272141ff19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -285,9 +285,6 @@ enum blk_queue_state { Queue_up, }; -#define BLK_TAGS_PER_LONG (sizeof(unsigned long) * 8) -#define BLK_TAGS_MASK (BLK_TAGS_PER_LONG - 1) - struct blk_queue_tag { struct request **tag_index; /* map of busy tags */ unsigned long *tag_map; /* bit map of free/busy tags */ -- cgit v1.2.3 From 55c888d6d09a0df236adfaf8ccf06ff5d0646775 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jun 2005 00:08:56 -0700 Subject: [PATCH] timers fixes/improvements This patch tries to solve following problems: 1. del_timer_sync() is racy. The timer can be fired again after del_timer_sync have checked all cpus and before it will recheck timer_pending(). 2. It has scalability problems. All cpus are scanned to determine if the timer is running on that cpu. With this patch del_timer_sync is O(1) and no slower than plain del_timer(pending_timer), unless it has to actually wait for completion of the currently running timer. The only restriction is that the recurring timer should not use add_timer_on(). 3. The timers are not serialized wrt to itself. If CPU_0 does mod_timer(jiffies+1) while the timer is currently running on CPU 1, it is quite possible that local interrupt on CPU_0 will start that timer before it finished on CPU_1. 4. The timers locking is suboptimal. __mod_timer() takes 3 locks at once and still requires wmb() in del_timer/run_timers. The new implementation takes 2 locks sequentially and does not need memory barriers. Currently ->base != NULL means that the timer is pending. In that case ->base.lock is used to lock the timer. __mod_timer also takes timer->lock because ->base can be == NULL. This patch uses timer->entry.next != NULL as indication that the timer is pending. So it does __list_del(), entry->next = NULL instead of list_del() when the timer is deleted. The ->base field is used for hashed locking only, it is initialized in init_timer() which sets ->base = per_cpu(tvec_bases). When the tvec_bases.lock is locked, it means that all timers which are tied to this base via timer->base are locked, and the base itself is locked too. So __run_timers/migrate_timers can safely modify all timers which could be found on ->tvX lists (pending timers). When the timer's base is locked, and the timer removed from ->entry list (which means that _run_timers/migrate_timers can't see this timer), it is possible to set timer->base = NULL and drop the lock: the timer remains locked. This patch adds lock_timer_base() helper, which waits for ->base != NULL, locks the ->base, and checks it is still the same. __mod_timer() schedules the timer on the local CPU and changes it's base. However, it does not lock both old and new bases at once. It locks the timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base, and adds this timer. This simplifies the code, because AB-BA deadlock is not possible. __mod_timer() also ensures that the timer's base is not changed while the timer's handler is running on the old base. __run_timers(), del_timer() do not change ->base anymore, they only clear pending flag. So del_timer_sync() can test timer->base->running_timer == timer to detect whether it is running or not. We don't need timer_list->lock anymore, this patch kills it. We also don't need barriers. del_timer() and __run_timers() used smp_wmb() before clearing timer's pending flag. It was needed because __mod_timer() did not lock old_base if the timer is not pending, so __mod_timer()->list_add() could race with del_timer()->list_del(). With this patch these functions are serialized through base->lock. One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch adds global struct timer_base_s { spinlock_t lock; struct timer_list *running_timer; } __init_timer_base; which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s struct are replaced by struct timer_base_s t_base. It is indeed ugly. But this can't have scalability problems. The global __init_timer_base.lock is used only when __mod_timer() is called for the first time AND the timer was compile time initialized. After that the timer migrates to the local CPU. Signed-off-by: Oleg Nesterov Acked-by: Ingo Molnar Signed-off-by: Renaud Lienhart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timer.h | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/timer.h b/include/linux/timer.h index 90db1cc62ddd..2e78fedfc069 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -6,45 +6,33 @@ #include #include -struct tvec_t_base_s; +struct timer_base_s; struct timer_list { struct list_head entry; unsigned long expires; - spinlock_t lock; unsigned long magic; void (*function)(unsigned long); unsigned long data; - struct tvec_t_base_s *base; + struct timer_base_s *base; }; #define TIMER_MAGIC 0x4b87ad6e +extern struct timer_base_s __init_timer_base; + #define TIMER_INITIALIZER(_function, _expires, _data) { \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ - .base = NULL, \ + .base = &__init_timer_base, \ .magic = TIMER_MAGIC, \ - .lock = SPIN_LOCK_UNLOCKED, \ } -/*** - * init_timer - initialize a timer. - * @timer: the timer to be initialized - * - * init_timer() must be done to a timer prior calling *any* of the - * other timer functions. - */ -static inline void init_timer(struct timer_list * timer) -{ - timer->base = NULL; - timer->magic = TIMER_MAGIC; - spin_lock_init(&timer->lock); -} +void fastcall init_timer(struct timer_list * timer); /*** * timer_pending - is a timer pending? @@ -58,7 +46,7 @@ static inline void init_timer(struct timer_list * timer) */ static inline int timer_pending(const struct timer_list * timer) { - return timer->base != NULL; + return timer->entry.next != NULL; } extern void add_timer_on(struct timer_list *timer, int cpu); @@ -89,12 +77,12 @@ static inline void add_timer(struct timer_list * timer) #ifdef CONFIG_SMP extern int del_timer_sync(struct timer_list *timer); - extern int del_singleshot_timer_sync(struct timer_list *timer); #else # define del_timer_sync(t) del_timer(t) -# define del_singleshot_timer_sync(t) del_timer(t) #endif +#define del_singleshot_timer_sync(t) del_timer_sync(t) + extern void init_timers(void); extern void run_local_timers(void); extern void it_real_fn(unsigned long); -- cgit v1.2.3 From fd450b7318b75343fd76b3d95416853e34e72c95 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jun 2005 00:08:59 -0700 Subject: [PATCH] timers: introduce try_to_del_timer_sync() This patch splits del_timer_sync() into 2 functions. The new one, try_to_del_timer_sync(), returns -1 when it hits executing timer. It can be used in interrupt context, or when the caller hold locks which can prevent completion of the timer's handler. NOTE. Currently it can't be used in interrupt context in UP case, because ->running_timer is used only with CONFIG_SMP. Should the need arise, it is possible to kill #ifdef CONFIG_SMP in set_running_timer(), it is cheap. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timer.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/timer.h b/include/linux/timer.h index 2e78fedfc069..221f81ac2002 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -76,9 +76,11 @@ static inline void add_timer(struct timer_list * timer) } #ifdef CONFIG_SMP + extern int try_to_del_timer_sync(struct timer_list *timer); extern int del_timer_sync(struct timer_list *timer); #else -# define del_timer_sync(t) del_timer(t) +# define try_to_del_timer_sync(t) del_timer(t) +# define del_timer_sync(t) del_timer(t) #endif #define del_singleshot_timer_sync(t) del_timer_sync(t) -- cgit v1.2.3 From 991114c6fa6a21d1fa4d544abe78592352860c82 Mon Sep 17 00:00:00 2001 From: Alexander Viro Date: Thu, 23 Jun 2005 00:09:01 -0700 Subject: [PATCH] fix for prune_icache()/forced final iput() races Based on analysis and a patch from Russ Weight There is a race condition that can occur if an inode is allocated and then released (using iput) during the ->fill_super functions. The race condition is between kswapd and mount. For most filesystems this can only happen in an error path when kswapd is running concurrently. For isofs, however, the error can occur in a more common code path (which is how the bug was found). The logic here is "we want final iput() to free inode *now* instead of letting it sit in cache if fs is going down or had not quite come up". The problem is with kswapd seeing such inodes in the middle of being killed and happily taking over. The clean solution would be to tell kswapd to leave those inodes alone and let our final iput deal with them. I.e. add a new flag (I_FORCED_FREEING), set it before write_inode_now() there and make prune_icache() leave those alone. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index e5a8db00df29..3622e952e98c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1025,6 +1025,7 @@ struct super_operations { #define I_FREEING 16 #define I_CLEAR 32 #define I_NEW 64 +#define I_WILL_FREE 128 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) -- cgit v1.2.3 From 543537bd922692bc978e2e356fcd8bfc9c2ee7d5 Mon Sep 17 00:00:00 2001 From: Paulo Marques Date: Thu, 23 Jun 2005 00:09:02 -0700 Subject: [PATCH] create a kstrdup library function This patch creates a new kstrdup library function and changes the "local" implementations in several places to use this function. Most of the changes come from the sound and net subsystems. The sound part had already been acknowledged by Takashi Iwai and the net part by David S. Miller. I left UML alone for now because I would need more time to read the code carefully before making changes there. Signed-off-by: Paulo Marques Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/netdevice.h | 4 ---- include/linux/string.h | 2 ++ include/sound/core.h | 3 ++- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d6afd440cf7b..d89816ad642f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -925,10 +925,6 @@ extern int skb_checksum_help(struct sk_buff *skb, int inward); extern void net_enable_timestamp(void); extern void net_disable_timestamp(void); -#ifdef CONFIG_SYSCTL -extern char *net_sysctl_strdup(const char *s); -#endif - #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ diff --git a/include/linux/string.h b/include/linux/string.h index b9fc59469956..93994c613095 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -88,6 +88,8 @@ extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); #endif +extern char *kstrdup(const char *s, int gfp); + #ifdef __cplusplus } #endif diff --git a/include/sound/core.h b/include/sound/core.h index 9117c23e3a01..f8c4ef0aa352 100644 --- a/include/sound/core.h +++ b/include/sound/core.h @@ -292,6 +292,7 @@ void *snd_hidden_kcalloc(size_t n, size_t size, int flags); void snd_hidden_kfree(const void *obj); void *snd_hidden_vmalloc(unsigned long size); void snd_hidden_vfree(void *obj); +char *snd_hidden_kstrdup(const char *s, int flags); #define kmalloc(size, flags) snd_hidden_kmalloc(size, flags) #define kcalloc(n, size, flags) snd_hidden_kcalloc(n, size, flags) #define kfree(obj) snd_hidden_kfree(obj) @@ -301,6 +302,7 @@ void snd_hidden_vfree(void *obj); #define vmalloc_nocheck(size) snd_wrapper_vmalloc(size) #define kfree_nocheck(obj) snd_wrapper_kfree(obj) #define vfree_nocheck(obj) snd_wrapper_vfree(obj) +#define kstrdup(s, flags) snd_hidden_kstrdup(s, flags) #else #define snd_memory_init() /*NOP*/ #define snd_memory_done() /*NOP*/ @@ -311,7 +313,6 @@ void snd_hidden_vfree(void *obj); #define kfree_nocheck(obj) kfree(obj) #define vfree_nocheck(obj) vfree(obj) #endif -char *snd_kmalloc_strdup(const char *string, int flags); int copy_to_user_fromio(void __user *dst, const volatile void __iomem *src, size_t count); int copy_from_user_toio(volatile void __iomem *dst, const void __user *src, size_t count); -- cgit v1.2.3 From 35a82d1a53e1a9ad54efafcc940f9335beaed5c3 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 23 Jun 2005 00:09:06 -0700 Subject: [PATCH] optimise loop driver a bit Looks like locking can be optimised quite a lot. Increase lock widths slightly so lo_lock is taken fewer times per request. Also it was quite trivial to cover lo_pending with that lock, and remove the atomic requirement. This also makes memory ordering explicitly correct, which is nice (not that I particularly saw any mem ordering bugs). Test was reading 4 250MB files in parallel on ext2-on-tmpfs filesystem (1K block size, 4K page size). System is 2 socket Xeon with HT (4 thread). intel:/home/npiggin# umount /dev/loop0 ; mount /dev/loop0 /mnt/loop ; /usr/bin/time ./mtloop.sh Before: 0.24user 5.51system 0:02.84elapsed 202%CPU (0avgtext+0avgdata 0maxresident)k 0.19user 5.52system 0:02.88elapsed 198%CPU (0avgtext+0avgdata 0maxresident)k 0.19user 5.57system 0:02.89elapsed 198%CPU (0avgtext+0avgdata 0maxresident)k 0.22user 5.51system 0:02.90elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k 0.19user 5.44system 0:02.91elapsed 193%CPU (0avgtext+0avgdata 0maxresident)k After: 0.07user 2.34system 0:01.68elapsed 143%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.37system 0:01.68elapsed 144%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.39system 0:01.68elapsed 145%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.36system 0:01.68elapsed 144%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.42system 0:01.68elapsed 147%CPU (0avgtext+0avgdata 0maxresident)k Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/loop.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/loop.h b/include/linux/loop.h index 8220d9c9da00..53fa51595443 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -61,7 +61,7 @@ struct loop_device { struct semaphore lo_sem; struct semaphore lo_ctl_mutex; struct semaphore lo_bh_mutex; - atomic_t lo_pending; + int lo_pending; request_queue_t *lo_queue; }; -- cgit v1.2.3 From dcd497f99a1ef29a7c5e76142965be77e9dacabd Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 23 Jun 2005 00:09:07 -0700 Subject: [PATCH] streamline preempt_count type across archs The preempt_count member of struct thread_info is currently either defined as int, unsigned int or __s32 depending on arch. This patch makes the type of preempt_count an int on all archs. Having preempt_count be an unsigned type prevents the catching of preempt_count < 0 bugs, and using int on some archs and __s32 on others is not exactely "neat" - much nicer when it's just int all over. A previous version of this patch was already ACK'ed by Robert Love, and the only change in this version of the patch compared to the one he ACK'ed is that this one also makes sure the preempt_count member is consistently commented. Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-arm/thread_info.h | 2 +- include/asm-arm26/thread_info.h | 2 +- include/asm-cris/thread_info.h | 2 +- include/asm-frv/thread_info.h | 2 +- include/asm-h8300/thread_info.h | 2 +- include/asm-i386/thread_info.h | 2 +- include/asm-ia64/thread_info.h | 2 +- include/asm-m32r/thread_info.h | 2 +- include/asm-m68k/thread_info.h | 2 +- include/asm-m68knommu/thread_info.h | 2 +- include/asm-mips/thread_info.h | 2 +- include/asm-parisc/thread_info.h | 2 +- include/asm-ppc/thread_info.h | 3 ++- include/asm-ppc64/thread_info.h | 2 +- include/asm-s390/thread_info.h | 2 +- include/asm-sh/thread_info.h | 2 +- include/asm-sh64/thread_info.h | 2 +- include/asm-sparc/thread_info.h | 4 ++-- include/asm-sparc64/thread_info.h | 2 +- include/asm-um/thread_info.h | 2 +- include/asm-v850/thread_info.h | 3 ++- include/asm-x86_64/thread_info.h | 2 +- 22 files changed, 25 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/asm-arm/thread_info.h b/include/asm-arm/thread_info.h index 66c585c50cf9..8252a4cd860f 100644 --- a/include/asm-arm/thread_info.h +++ b/include/asm-arm/thread_info.h @@ -49,7 +49,7 @@ struct cpu_context_save { */ struct thread_info { unsigned long flags; /* low level flags */ - __s32 preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ diff --git a/include/asm-arm26/thread_info.h b/include/asm-arm26/thread_info.h index 50f41b50268a..aff3e5699c64 100644 --- a/include/asm-arm26/thread_info.h +++ b/include/asm-arm26/thread_info.h @@ -44,7 +44,7 @@ struct cpu_context_save { */ struct thread_info { unsigned long flags; /* low level flags */ - __s32 preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ diff --git a/include/asm-cris/thread_info.h b/include/asm-cris/thread_info.h index 53193feb0826..5ba4b7865cc5 100644 --- a/include/asm-cris/thread_info.h +++ b/include/asm-cris/thread_info.h @@ -31,7 +31,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead diff --git a/include/asm-frv/thread_info.h b/include/asm-frv/thread_info.h index b80a97f50af6..c8cba7836f0d 100644 --- a/include/asm-frv/thread_info.h +++ b/include/asm-frv/thread_info.h @@ -33,7 +33,7 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long status; /* thread-synchronous flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead diff --git a/include/asm-h8300/thread_info.h b/include/asm-h8300/thread_info.h index b07c9344776f..bfcc755c3bb1 100644 --- a/include/asm-h8300/thread_info.h +++ b/include/asm-h8300/thread_info.h @@ -23,7 +23,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ int cpu; /* cpu we're on */ - int preempt_count; /* 0 => preemptable, <0 => BUG*/ + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h index 2cd57271801d..95add81237ea 100644 --- a/include/asm-i386/thread_info.h +++ b/include/asm-i386/thread_info.h @@ -31,7 +31,7 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long status; /* thread-synchronous flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h index 8d5b7e77028c..7dc8951708a3 100644 --- a/include/asm-ia64/thread_info.h +++ b/include/asm-ia64/thread_info.h @@ -25,7 +25,7 @@ struct thread_info { __u32 flags; /* thread_info flags (see TIF_*) */ __u32 cpu; /* current CPU */ mm_segment_t addr_limit; /* user-level address space limit */ - __s32 preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ + int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ struct restart_block restart_block; struct { int signo; diff --git a/include/asm-m32r/thread_info.h b/include/asm-m32r/thread_info.h index 9f3a0fcf6e2b..7a6be7727a92 100644 --- a/include/asm-m32r/thread_info.h +++ b/include/asm-m32r/thread_info.h @@ -28,7 +28,7 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long status; /* thread-synchronous flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thread diff --git a/include/asm-m68k/thread_info.h b/include/asm-m68k/thread_info.h index 5f58939c59db..2aed24f6fd2e 100644 --- a/include/asm-m68k/thread_info.h +++ b/include/asm-m68k/thread_info.h @@ -8,7 +8,7 @@ struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ __u32 cpu; /* should always be 0 on m68k */ struct restart_block restart_block; diff --git a/include/asm-m68knommu/thread_info.h b/include/asm-m68knommu/thread_info.h index c8153b7c1f5c..7b9a3fa3af5d 100644 --- a/include/asm-m68knommu/thread_info.h +++ b/include/asm-m68knommu/thread_info.h @@ -36,7 +36,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ int cpu; /* cpu we're on */ - int preempt_count; /* 0 => preemptable, <0 => BUG*/ + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-mips/thread_info.h b/include/asm-mips/thread_info.h index 768900305e2f..42fcd6f2c206 100644 --- a/include/asm-mips/thread_info.h +++ b/include/asm-mips/thread_info.h @@ -27,7 +27,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead diff --git a/include/asm-parisc/thread_info.h b/include/asm-parisc/thread_info.h index fe9b7f8ae4c6..57bbb76cb6c1 100644 --- a/include/asm-parisc/thread_info.h +++ b/include/asm-parisc/thread_info.h @@ -12,7 +12,7 @@ struct thread_info { unsigned long flags; /* thread_info flags (see TIF_*) */ mm_segment_t addr_limit; /* user-level address space limit */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ + int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ struct restart_block restart_block; }; diff --git a/include/asm-ppc/thread_info.h b/include/asm-ppc/thread_info.h index e3b5284a6f91..27903db42efc 100644 --- a/include/asm-ppc/thread_info.h +++ b/include/asm-ppc/thread_info.h @@ -20,7 +20,8 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long local_flags; /* non-racy flags */ int cpu; /* cpu we're on */ - int preempt_count; + int preempt_count; /* 0 => preemptable, + <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-ppc64/thread_info.h b/include/asm-ppc64/thread_info.h index 48b7900e90ec..0494df6fca74 100644 --- a/include/asm-ppc64/thread_info.h +++ b/include/asm-ppc64/thread_info.h @@ -24,7 +24,7 @@ struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ int cpu; /* cpu we're on */ - int preempt_count; + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; /* set by force_successful_syscall_return */ unsigned char syscall_noerror; diff --git a/include/asm-s390/thread_info.h b/include/asm-s390/thread_info.h index aade85c53a63..fe101d41e849 100644 --- a/include/asm-s390/thread_info.h +++ b/include/asm-s390/thread_info.h @@ -50,7 +50,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ unsigned int cpu; /* current CPU */ - unsigned int preempt_count; /* 0 => preemptable */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-sh/thread_info.h b/include/asm-sh/thread_info.h index 4bbbd9f3c37e..46080cefaff8 100644 --- a/include/asm-sh/thread_info.h +++ b/include/asm-sh/thread_info.h @@ -20,7 +20,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ __u32 flags; /* low level flags */ __u32 cpu; - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; __u8 supervisor_stack[0]; }; diff --git a/include/asm-sh64/thread_info.h b/include/asm-sh64/thread_info.h index 8a32d6bd0b79..10f024c6a2e3 100644 --- a/include/asm-sh64/thread_info.h +++ b/include/asm-sh64/thread_info.h @@ -22,7 +22,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ /* Put the 4 32-bit fields together to make asm offsetting easier. */ - __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ __u16 cpu; mm_segment_t addr_limit; diff --git a/include/asm-sparc/thread_info.h b/include/asm-sparc/thread_info.h index 104f03c55416..ff6ccb3d24c6 100644 --- a/include/asm-sparc/thread_info.h +++ b/include/asm-sparc/thread_info.h @@ -30,9 +30,9 @@ struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ - int cpu; /* cpu we're on */ - int preempt_count; + int preempt_count; /* 0 => preemptable, + <0 => BUG */ int softirq_count; int hardirq_count; diff --git a/include/asm-sparc64/thread_info.h b/include/asm-sparc64/thread_info.h index 517caaba1c87..0cd652956929 100644 --- a/include/asm-sparc64/thread_info.h +++ b/include/asm-sparc64/thread_info.h @@ -46,7 +46,7 @@ struct thread_info { unsigned long fault_address; struct pt_regs *kregs; struct exec_domain *exec_domain; - int preempt_count; + int preempt_count; /* 0 => preemptable, <0 => BUG */ int __pad; unsigned long *utraps; diff --git a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h index 1feaaf148ef1..97267f059ef5 100644 --- a/include/asm-um/thread_info.h +++ b/include/asm-um/thread_info.h @@ -17,7 +17,7 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ - __s32 preempt_count; /* 0 => preemptable, + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user diff --git a/include/asm-v850/thread_info.h b/include/asm-v850/thread_info.h index e2ef44593752..e4cfad94a553 100644 --- a/include/asm-v850/thread_info.h +++ b/include/asm-v850/thread_info.h @@ -30,7 +30,8 @@ struct thread_info { struct exec_domain *exec_domain; /* execution domain */ unsigned long flags; /* low level flags */ int cpu; /* cpu we're on */ - int preempt_count; + int preempt_count; /* 0 => preemptable, + <0 => BUG */ struct restart_block restart_block; }; diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h index f4b3b249639c..08eb6e4f3737 100644 --- a/include/asm-x86_64/thread_info.h +++ b/include/asm-x86_64/thread_info.h @@ -29,7 +29,7 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - int preempt_count; + int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; struct restart_block restart_block; -- cgit v1.2.3 From ac20427ef6aa63da663bdc88b71d16f7394f5e23 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 23 Jun 2005 00:09:11 -0700 Subject: [PATCH] add check to /proc/devices read routines Patch to add check to get_chrdev_list and get_blkdev_list to prevent reads of /proc/devices from spilling over the provided page if more than 4096 bytes of string data are generated from all the registered character and block devices in a system Signed-off-by: Neil Horman Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index af26dc718ef6..01796c41c951 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -224,7 +224,7 @@ static inline void free_disk_stats(struct gendisk *disk) extern void disk_round_stats(struct gendisk *disk); /* drivers/block/genhd.c */ -extern int get_blkdev_list(char *); +extern int get_blkdev_list(char *, int); extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); extern void unlink_gendisk(struct gendisk *gp); -- cgit v1.2.3 From 84de856ed30c568c2bb7b9ac0679772bd2737d9b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:09:16 -0700 Subject: [PATCH] quota: consolidate code surrounding vfs_quota_on_mount Move some code duplicated in both callers into vfs_quota_on_mount Signed-off-by: Christoph Hellwig Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/quotaops.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index e57baa85e744..d211507ab246 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -39,7 +39,8 @@ extern int dquot_commit_info(struct super_block *sb, int type); extern int dquot_mark_dquot_dirty(struct dquot *dquot); extern int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path); -extern int vfs_quota_on_mount(int type, int format_id, struct dentry *dentry); +extern int vfs_quota_on_mount(struct super_block *sb, char *qf_name, + int format_id, int type); extern int vfs_quota_off(struct super_block *sb, int type); #define vfs_quota_off_mount(sb, type) vfs_quota_off(sb, type) extern int vfs_quota_sync(struct super_block *sb, int type); -- cgit v1.2.3 From b94cce926b2b902b79380ccba370d6f9f2980de0 Mon Sep 17 00:00:00 2001 From: Hien Nguyen Date: Thu, 23 Jun 2005 00:09:19 -0700 Subject: [PATCH] kprobes: function-return probes This patch adds function-return probes to kprobes for the i386 architecture. This enables you to establish a handler to be run when a function returns. 1. API Two new functions are added to kprobes: int register_kretprobe(struct kretprobe *rp); void unregister_kretprobe(struct kretprobe *rp); 2. Registration and unregistration 2.1 Register To register a function-return probe, the user populates the following fields in a kretprobe object and calls register_kretprobe() with the kretprobe address as an argument: kp.addr - the function's address handler - this function is run after the ret instruction executes, but before control returns to the return address in the caller. maxactive - The maximum number of instances of the probed function that can be active concurrently. For example, if the function is non- recursive and is called with a spinlock or mutex held, maxactive = 1 should be enough. If the function is non-recursive and can never relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should be enough. maxactive is used to determine how many kretprobe_instance objects to allocate for this particular probed function. If maxactive <= 0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 * NR_CPUS) else maxactive=NR_CPUS) For example: struct kretprobe rp; rp.kp.addr = /* entrypoint address */ rp.handler = /*return probe handler */ rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */ register_kretprobe(&rp); The following field may also be of interest: nmissed - Initialized to zero when the function-return probe is registered, and incremented every time the probed function is entered but there is no kretprobe_instance object available for establishing the function-return probe (i.e., because maxactive was set too low). 2.2 Unregister To unregiter a function-return probe, the user calls unregister_kretprobe() with the same kretprobe object as registered previously. If a probed function is running when the return probe is unregistered, the function will return as expected, but the handler won't be run. 3. Limitations 3.1 This patch supports only the i386 architecture, but patches for x86_64 and ppc64 are anticipated soon. 3.2 Return probes operates by replacing the return address in the stack (or in a known register, such as the lr register for ppc). This may cause __builtin_return_address(0), when invoked from the return-probed function, to return the address of the return-probes trampoline. 3.3 This implementation uses the "Multiprobes at an address" feature in 2.6.12-rc3-mm3. 3.4 Due to a limitation in multi-probes, you cannot currently establish a return probe and a jprobe on the same function. A patch to remove this limitation is being tested. This feature is required by SystemTap (http://sourceware.org/systemtap), and reflects ideas contributed by several SystemTap developers, including Will Cohen and Ananth Mavinakayanahalli. Signed-off-by: Hien Nguyen Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Frederik Deweerdt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/kprobes.h | 3 ++ include/linux/kprobes.h | 90 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 91 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-i386/kprobes.h b/include/asm-i386/kprobes.h index 4092f68d123a..8b6d3a90cd78 100644 --- a/include/asm-i386/kprobes.h +++ b/include/asm-i386/kprobes.h @@ -39,6 +39,9 @@ typedef u8 kprobe_opcode_t; : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry +#define ARCH_SUPPORTS_KRETPROBES + +void kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 99ddba5a4e00..fba39f87efec 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -25,21 +25,31 @@ * Rusty Russell). * 2004-July Suparna Bhattacharya added jumper probes * interface to access function arguments. + * 2005-May Hien Nguyen and Jim Keniston + * and Prasanna S Panchamukhi + * added function-return probes. */ #include #include #include #include +#include + #include struct kprobe; struct pt_regs; +struct kretprobe; +struct kretprobe_instance; typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *); typedef int (*kprobe_break_handler_t) (struct kprobe *, struct pt_regs *); typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *, unsigned long flags); typedef int (*kprobe_fault_handler_t) (struct kprobe *, struct pt_regs *, int trapnr); +typedef int (*kretprobe_handler_t) (struct kretprobe_instance *, + struct pt_regs *); + struct kprobe { struct hlist_node hlist; @@ -85,6 +95,62 @@ struct jprobe { kprobe_opcode_t *entry; /* probe handling code to jump to */ }; +#ifdef ARCH_SUPPORTS_KRETPROBES +extern int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs); +extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags); +extern struct task_struct *arch_get_kprobe_task(void *ptr); +extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs); +extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock); +#else /* ARCH_SUPPORTS_KRETPROBES */ +static inline void kretprobe_trampoline(void) +{ +} +static inline int trampoline_probe_handler(struct kprobe *p, + struct pt_regs *regs) +{ + return 0; +} +static inline void trampoline_post_handler(struct kprobe *p, + struct pt_regs *regs, unsigned long flags) +{ +} +static inline void arch_prepare_kretprobe(struct kretprobe *rp, + struct pt_regs *regs) +{ +} +static inline void arch_kprobe_flush_task(struct task_struct *tk) +{ +} +#define arch_get_kprobe_task(ptr) ((struct task_struct *)NULL) +#endif /* ARCH_SUPPORTS_KRETPROBES */ +/* + * Function-return probe - + * Note: + * User needs to provide a handler function, and initialize maxactive. + * maxactive - The maximum number of instances of the probed function that + * can be active concurrently. + * nmissed - tracks the number of times the probed function's return was + * ignored, due to maxactive being too low. + * + */ +struct kretprobe { + struct kprobe kp; + kretprobe_handler_t handler; + int maxactive; + int nmissed; + struct hlist_head free_instances; + struct hlist_head used_instances; +}; + +struct kretprobe_instance { + struct hlist_node uflist; /* either on free list or used list */ + struct hlist_node hlist; + struct kretprobe *rp; + void *ret_addr; + void *stack_addr; +}; + #ifdef CONFIG_KPROBES /* Locks kprobe: irq must be disabled */ void lock_kprobes(void); @@ -104,6 +170,7 @@ extern void show_registers(struct pt_regs *regs); /* Get the kprobe at this addr (if any). Must have called lock_kprobes */ struct kprobe *get_kprobe(void *addr); +struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk); int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); @@ -113,7 +180,16 @@ int register_jprobe(struct jprobe *p); void unregister_jprobe(struct jprobe *p); void jprobe_return(void); -#else +int register_kretprobe(struct kretprobe *rp); +void unregister_kretprobe(struct kretprobe *rp); + +struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp); +struct kretprobe_instance *get_rp_inst(void *sara); +struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk); +void add_rp_inst(struct kretprobe_instance *ri); +void kprobe_flush_task(struct task_struct *tk); +void recycle_rp_inst(struct kretprobe_instance *ri); +#else /* CONFIG_KPROBES */ static inline int kprobe_running(void) { return 0; @@ -135,5 +211,15 @@ static inline void unregister_jprobe(struct jprobe *p) static inline void jprobe_return(void) { } -#endif +static inline int register_kretprobe(struct kretprobe *rp) +{ + return -ENOSYS; +} +static inline void unregister_kretprobe(struct kretprobe *rp) +{ +} +static inline void kprobe_flush_task(struct task_struct *tk) +{ +} +#endif /* CONFIG_KPROBES */ #endif /* _LINUX_KPROBES_H */ -- cgit v1.2.3 From 73649dab0fd524cb8545a8cb83c6eaf77b107105 Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Thu, 23 Jun 2005 00:09:23 -0700 Subject: [PATCH] x86_64 specific function return probes The following patch adds the x86_64 architecture specific implementation for function return probes. Function return probes is a mechanism built on top of kprobes that allows a caller to register a handler to be called when a given function exits. For example, to instrument the return path of sys_mkdir: static int sys_mkdir_exit(struct kretprobe_instance *i, struct pt_regs *regs) { printk("sys_mkdir exited\n"); return 0; } static struct kretprobe return_probe = { .handler = sys_mkdir_exit, }; return_probe.kp.addr = (kprobe_opcode_t *) kallsyms_lookup_name("sys_mkdir"); if (register_kretprobe(&return_probe)) { printk(KERN_DEBUG "Unable to register return probe!\n"); /* do error path */ } unregister_kretprobe(&return_probe); The way this works is that: * At system initialization time, kernel/kprobes.c installs a kprobe on a function called kretprobe_trampoline() that is implemented in the arch/x86_64/kernel/kprobes.c (More on this later) * When a return probe is registered using register_kretprobe(), kernel/kprobes.c will install a kprobe on the first instruction of the targeted function with the pre handler set to arch_prepare_kretprobe() which is implemented in arch/x86_64/kernel/kprobes.c. * arch_prepare_kretprobe() will prepare a kretprobe instance that stores: - nodes for hanging this instance in an empty or free list - a pointer to the return probe - the original return address - a pointer to the stack address With all this stowed away, arch_prepare_kretprobe() then sets the return address for the targeted function to a special trampoline function called kretprobe_trampoline() implemented in arch/x86_64/kernel/kprobes.c * The kprobe completes as normal, with control passing back to the target function that executes as normal, and eventually returns to our trampoline function. * Since a kprobe was installed on kretprobe_trampoline() during system initialization, control passes back to kprobes via the architecture specific function trampoline_probe_handler() which will lookup the instance in an hlist maintained by kernel/kprobes.c, and then call the handler function. * When trampoline_probe_handler() is done, the kprobes infrastructure single steps the original instruction (in this case just a top), and then calls trampoline_post_handler(). trampoline_post_handler() then looks up the instance again, puts the instance back on the free list, and then makes a long jump back to the original return instruction. So to recap, to instrument the exit path of a function this implementation will cause four interruptions: - A breakpoint at the very beginning of the function allowing us to switch out the return address - A single step interruption to execute the original instruction that we replaced with the break instruction (normal kprobe flow) - A breakpoint in the trampoline function where our instrumented function returned to - A single step interruption to execute the original instruction that we replaced with the break instruction (normal kprobe flow) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-x86_64/kprobes.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/asm-x86_64/kprobes.h b/include/asm-x86_64/kprobes.h index bfea52d516f8..6d6d883fdf6d 100644 --- a/include/asm-x86_64/kprobes.h +++ b/include/asm-x86_64/kprobes.h @@ -38,6 +38,9 @@ typedef u8 kprobe_opcode_t; : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry +#define ARCH_SUPPORTS_KRETPROBES + +void kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { -- cgit v1.2.3 From 7e1048b11c5afe79aac46a42e3ccec86b8365c6d Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Thu, 23 Jun 2005 00:09:25 -0700 Subject: [PATCH] Move kprobe [dis]arming into arch specific code The architecture independent code of the current kprobes implementation is arming and disarming kprobes at registration time. The problem is that the code is assuming that arming and disarming is a just done by a simple write of some magic value to an address. This is problematic for ia64 where our instructions look more like structures, and we can not insert break points by just doing something like: *p->addr = BREAKPOINT_INSTRUCTION; The following patch to 2.6.12-rc4-mm2 adds two new architecture dependent functions: * void arch_arm_kprobe(struct kprobe *p) * void arch_disarm_kprobe(struct kprobe *p) and then adds the new functions for each of the architectures that already implement kprobes (spar64/ppc64/i386/x86_64). I thought arch_[dis]arm_kprobe was the most descriptive of what was really happening, but each of the architectures already had a disarm_kprobe() function that was really a "disarm and do some other clean-up items as needed when you stumble across a recursive kprobe." So... I took the liberty of changing the code that was calling disarm_kprobe() to call arch_disarm_kprobe(), and then do the cleanup in the block of code dealing with the recursive kprobe case. So far this patch as been tested on i386, x86_64, and ppc64, but still needs to be tested in sparc64. Signed-off-by: Rusty Lynch Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index fba39f87efec..0f90466fb8b0 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -165,6 +165,8 @@ static inline int kprobe_running(void) extern int arch_prepare_kprobe(struct kprobe *p); extern void arch_copy_kprobe(struct kprobe *p); +extern void arch_arm_kprobe(struct kprobe *p); +extern void arch_disarm_kprobe(struct kprobe *p); extern void arch_remove_kprobe(struct kprobe *p); extern void show_registers(struct pt_regs *regs); -- cgit v1.2.3 From 0aa55e4d7db822059fe8132fe9f2b7773c48216c Mon Sep 17 00:00:00 2001 From: Hien Nguyen Date: Thu, 23 Jun 2005 00:09:26 -0700 Subject: [PATCH] kprobes: moves lock-unlock to non-arch kprobe_flush_task This patch moves the lock/unlock of the arch specific kprobe_flush_task() to the non-arch specific kprobe_flusk_task(). Signed-off-by: Hien Nguyen Acked-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0f90466fb8b0..461391decc46 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -33,7 +33,6 @@ #include #include #include -#include #include @@ -101,7 +100,7 @@ extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, unsigned long flags); extern struct task_struct *arch_get_kprobe_task(void *ptr); extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs); -extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock); +extern void arch_kprobe_flush_task(struct task_struct *tk); #else /* ARCH_SUPPORTS_KRETPROBES */ static inline void kretprobe_trampoline(void) { -- cgit v1.2.3 From 7213b2521889eb087eed8abaa48d1a692575da3e Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:27 -0700 Subject: [PATCH] Kprobes/IA64: kdebug die notification mechanism As many of you know that kprobes exist in the main line kernel for various architecture including i386, x86_64, ppc64 and sparc64. Attached patches following this mail are a port of Kprobes and Jprobes for IA64. I have tesed this patches for kprobes and Jprobes and this seems to work fine. I have tested this patch by inserting kprobes on various slots and various templates including various types of branch instructions. I have also tested this patch using the tool http://marc.theaimsgroup.com/?l=linux-kernel&m=111657358022586&w=2 and the kprobes for IA64 works great. Here is list of TODO things and pathes for the same will appear soon. 1) Support kprobes on "mov r1=ip" type of instruction 2) Support Kprobes and Jprobes to exist on the same address 3) Support Return probes 3) Architecture independent cleanup of kprobes This patch adds the kdebug die notification mechanism needed by Kprobes. For break instruction on Branch type slot, imm21 is ignored and value zero is placed in IIM register, hence we need to handle kprobes for switch case zero. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Rusty Lynch From: Rusty Lynch At the point in traps.c where we recieve a break with a zero value, we can not say if the break was a result of a kprobe or some other debug facility. This simple patch changes the informational string to a more correct "break 0" value, and applies to the 2.6.12-rc2-mm2 tree with all the kprobes patches that were just recently included for the next mm cut. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/break.h | 2 ++ include/asm-ia64/kdebug.h | 61 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 include/asm-ia64/kdebug.h (limited to 'include') diff --git a/include/asm-ia64/break.h b/include/asm-ia64/break.h index 97c7b2d79600..8167828edc4b 100644 --- a/include/asm-ia64/break.h +++ b/include/asm-ia64/break.h @@ -12,6 +12,8 @@ * OS-specific debug break numbers: */ #define __IA64_BREAK_KDB 0x80100 +#define __IA64_BREAK_KPROBE 0x80200 +#define __IA64_BREAK_JPROBE 0x80300 /* * OS-specific break numbers: diff --git a/include/asm-ia64/kdebug.h b/include/asm-ia64/kdebug.h new file mode 100644 index 000000000000..4d376e1663f7 --- /dev/null +++ b/include/asm-ia64/kdebug.h @@ -0,0 +1,61 @@ +#ifndef _IA64_KDEBUG_H +#define _IA64_KDEBUG_H 1 +/* + * include/asm-ia64/kdebug.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) Intel Corporation, 2005 + * + * 2005-Apr Rusty Lynch and Anil S Keshavamurthy + * adopted from + * include/asm-x86_64/kdebug.h + */ +#include + +struct pt_regs; + +struct die_args { + struct pt_regs *regs; + const char *str; + long err; + int trapnr; + int signr; +}; + +int register_die_notifier(struct notifier_block *nb); +extern struct notifier_block *ia64die_chain; + +enum die_val { + DIE_BREAK = 1, + DIE_SS, + DIE_PAGE_FAULT, +}; + +static inline int notify_die(enum die_val val, char *str, struct pt_regs *regs, + long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig + }; + + return notifier_call_chain(&ia64die_chain, val, &args); +} + +#endif -- cgit v1.2.3 From fd7b231ff98578308d8f5fb76a25a369ce1074ae Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:28 -0700 Subject: [PATCH] Kprobes/IA64: arch specific handling This is an IA64 arch specific handling of Kprobes Signed-off-by: Anil S Keshavamurthy Signed-off-by: Rusty Lynch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/kprobes.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 include/asm-ia64/kprobes.h (limited to 'include') diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h new file mode 100644 index 000000000000..2a6f2a148890 --- /dev/null +++ b/include/asm-ia64/kprobes.h @@ -0,0 +1,72 @@ +#ifndef _ASM_KPROBES_H +#define _ASM_KPROBES_H +/* + * Kernel Probes (KProbes) + * include/asm-ia64/kprobes.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright (C) Intel Corporation, 2005 + * + * 2005-Apr Rusty Lynch and Anil S Keshavamurthy + * adapted from i386 + */ +#include +#include +#include + +#define BREAK_INST (long)(__IA64_BREAK_KPROBE << 6) + +typedef struct _bundle { + struct { + unsigned long long template : 5; + unsigned long long slot0 : 41; + unsigned long long slot1_p0 : 64-46; + } quad0; + struct { + unsigned long long slot1_p1 : 41 - (64-46); + unsigned long long slot2 : 41; + } quad1; +} __attribute__((__aligned__(16))) bundle_t; + +#define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry + +typedef struct kprobe_opcode { + bundle_t bundle; +} kprobe_opcode_t; + +struct fnptr { + unsigned long ip; + unsigned long gp; +}; + +/* Architecture specific copy of original instruction*/ +struct arch_specific_insn { + /* copy of the original instruction */ + kprobe_opcode_t insn; +}; + +#ifdef CONFIG_KPROBES +extern int kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data); +#else /* !CONFIG_KPROBES */ +static inline int kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif +#endif /* _ASM_KPROBES_H */ -- cgit v1.2.3 From b2761dc262b428475890fffd979687051beb12ba Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:28 -0700 Subject: [PATCH] Kprobes/IA64: architecture specific JProbes support This patch adds IA64 architecture specific JProbes support on top of Kprobes Signed-off-by: Anil S Keshavamurthy Signed-off-by: Rusty Lynch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/kprobes.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index 2a6f2a148890..fec3506e53f8 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -59,6 +59,11 @@ struct arch_specific_insn { kprobe_opcode_t insn; }; +/* ia64 does not need this */ +static inline void jprobe_return(void) +{ +} + #ifdef CONFIG_KPROBES extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); -- cgit v1.2.3 From cd2675bf65455a45b54228b7acc0c6a26a164cb6 Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:29 -0700 Subject: [PATCH] Kprobes/IA64: support kprobe on branch/call instructions This patch is required to support kprobe on branch/call instructions. Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/kprobes.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index fec3506e53f8..d30af77a0b11 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -44,6 +44,17 @@ typedef struct _bundle { #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)pentry +#define SLOT0_OPCODE_SHIFT (37) +#define SLOT1_p1_OPCODE_SHIFT (37 - (64-46)) +#define SLOT2_OPCODE_SHIFT (37) + +#define INDIRECT_CALL_OPCODE (1) +#define IP_RELATIVE_CALL_OPCODE (5) +#define IP_RELATIVE_BRANCH_OPCODE (4) +#define IP_RELATIVE_PREDICT_OPCODE (7) +#define LONG_BRANCH_OPCODE (0xC) +#define LONG_CALL_OPCODE (0xD) + typedef struct kprobe_opcode { bundle_t bundle; } kprobe_opcode_t; @@ -55,8 +66,12 @@ struct fnptr { /* Architecture specific copy of original instruction*/ struct arch_specific_insn { - /* copy of the original instruction */ + /* copy of the instruction to be emulated */ kprobe_opcode_t insn; + #define INST_FLAG_FIX_RELATIVE_IP_ADDR 1 + #define INST_FLAG_FIX_BRANCH_REG 2 + unsigned long inst_flag; + unsigned short target_br_reg; }; /* ia64 does not need this */ -- cgit v1.2.3 From 8bc76772ad653bcaad1b0af72aafb6072ef0fa87 Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Thu, 23 Jun 2005 00:09:30 -0700 Subject: [PATCH] Kprobes ia64 cleanup A cleanup of the ia64 kprobes implementation such that all of the bundle manipulation logic is concentrated in arch_prepare_kprobe(). With the current design for kprobes, the arch specific code only has a chance to return failure inside the arch_prepare_kprobe() function. This patch moves all of the work that was happening in arch_copy_kprobe() and most of the work that was happening in arch_arm_kprobe() into arch_prepare_kprobe(). By doing this we can add further robustness checks in arch_arm_kprobe() and refuse to insert kprobes that will cause problems. Signed-off-by: Rusty Lynch Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/kprobes.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index d30af77a0b11..cec4d9958307 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -30,6 +30,8 @@ #define BREAK_INST (long)(__IA64_BREAK_KPROBE << 6) +struct kprobe; + typedef struct _bundle { struct { unsigned long long template : 5; @@ -79,6 +81,11 @@ static inline void jprobe_return(void) { } +/* ia64 does not need this */ +static inline void arch_copy_kprobe(struct kprobe *p) +{ +} + #ifdef CONFIG_KPROBES extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); -- cgit v1.2.3 From 1674eafcbd3e3c68556cf19fbf4a2c30f7add729 Mon Sep 17 00:00:00 2001 From: Anil S Keshavamurthy Date: Thu, 23 Jun 2005 00:09:33 -0700 Subject: [PATCH] Kprobes IA64: cmp ctype unc support The current Kprobes when patching the original instruction with the break instruction tries to retain the original qualifying predicate(qp), however for cmp.crel.ctype where ctype == unc, which is a special instruction always needs to be executed irrespective of qp. Hence, if the instruction we are patching is of this type, then we should not copy the original qp to the break instruction, this is because we always want the break fault to happen so that we can emulate the instruction. This patch is based on the feedback given by David Mosberger Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/kprobes.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index cec4d9958307..7b700035e36d 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -30,6 +30,23 @@ #define BREAK_INST (long)(__IA64_BREAK_KPROBE << 6) +typedef union cmp_inst { + struct { + unsigned long long qp : 6; + unsigned long long p1 : 6; + unsigned long long c : 1; + unsigned long long r2 : 7; + unsigned long long r3 : 7; + unsigned long long p2 : 6; + unsigned long long ta : 1; + unsigned long long x2 : 2; + unsigned long long tb : 1; + unsigned long long opcode : 4; + unsigned long long reserved : 23; + }f; + unsigned long long l; +} cmp_inst_t; + struct kprobe; typedef struct _bundle { -- cgit v1.2.3 From ea32c65cc2d2294c04e9f81d0578a6f51febfdbf Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Thu, 23 Jun 2005 00:09:36 -0700 Subject: [PATCH] kprobes: Temporary disarming of reentrant probe In situations where a kprobes handler calls a routine which has a probe on it, then kprobes_handler() disarms the new probe forever. This patch removes the above limitation by temporarily disarming the new probe. When the another probe hits while handling the old probe, the kprobes_handler() saves previous kprobes state and handles the new probe without calling the new kprobes registered handlers. kprobe_post_handler() restores back the previous kprobes state and the normal execution continues. However on x86_64 architecture, re-rentrancy is provided only through pre_handler(). If a routine having probe is referenced through post_handler(), then the probes on that routine are disarmed forever, since the exception stack is gets changed after the processor single steps the instruction of the new probe. This patch includes generic changes to support temporary disarming on reentrancy of probes. Signed-of-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 461391decc46..5e1a7b0d7b3f 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -36,6 +36,12 @@ #include +/* kprobe_status settings */ +#define KPROBE_HIT_ACTIVE 0x00000001 +#define KPROBE_HIT_SS 0x00000002 +#define KPROBE_REENTER 0x00000004 +#define KPROBE_HIT_SSDONE 0x00000008 + struct kprobe; struct pt_regs; struct kretprobe; @@ -55,6 +61,9 @@ struct kprobe { /* list of kprobes for multi-handler support */ struct list_head list; + /*count the number of times this probe was temporarily disarmed */ + unsigned long nmissed; + /* location of the probe point */ kprobe_opcode_t *addr; -- cgit v1.2.3 From d6e711448137ca3301512cec41a2c2ce852b3d0a Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 23 Jun 2005 00:09:43 -0700 Subject: [PATCH] setuid core dump Add a new `suid_dumpable' sysctl: This value can be used to query and set the core dump mode for setuid or otherwise protected/tainted binaries. The modes are 0 - (default) - traditional behaviour. Any process which has changed privilege levels or is execute only will not be dumped 1 - (debug) - all processes dump core when possible. The core dump is owned by the current user and no security is applied. This is intended for system debugging situations only. Ptrace is unchecked. 2 - (suidsafe) - any binary which normally would not be dumped is dumped readable by root only. This allows the end user to remove such a dump but not access it directly. For security reasons core dumps in this mode will not overwrite one another or other files. This mode is appropriate when adminstrators are attempting to debug problems in a normal environment. (akpm: > > +EXPORT_SYMBOL(suid_dumpable); > > EXPORT_SYMBOL_GPL? No problem to me. > > if (current->euid == current->uid && current->egid == current->gid) > > current->mm->dumpable = 1; > > Should this be SUID_DUMP_USER? Actually the feedback I had from last time was that the SUID_ defines should go because its clearer to follow the numbers. They can go everywhere (and there are lots of places where dumpable is tested/used as a bool in untouched code) > Maybe this should be renamed to `dump_policy' or something. Doing that > would help us catch any code which isn't using the #defines, too. Fair comment. The patch was designed to be easy to maintain for Red Hat rather than for merging. Changing that field would create a gigantic diff because it is used all over the place. ) Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/binfmts.h | 5 +++++ include/linux/sched.h | 2 +- include/linux/sysctl.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 7e736e201c46..c1e82c514443 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -69,6 +69,11 @@ extern void remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *,struct pt_regs *); extern int flush_old_exec(struct linux_binprm * bprm); +extern int suid_dumpable; +#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ +#define SUID_DUMP_USER 1 /* Dump as user of process */ +#define SUID_DUMP_ROOT 2 /* Dump as root */ + /* Stack area protections */ #define EXSTACK_DEFAULT 0 /* Whatever the arch defaults to */ #define EXSTACK_DISABLE_X 1 /* Disable executable stacks */ diff --git a/include/linux/sched.h b/include/linux/sched.h index b58afd97a180..901742f92389 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -246,7 +246,7 @@ struct mm_struct { unsigned long saved_auxv[42]; /* for /proc/PID/auxv */ - unsigned dumpable:1; + unsigned dumpable:2; cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index a17745c80a91..614e939c78a4 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -136,6 +136,7 @@ enum KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */ KERN_RANDOMIZE=68, /* int: randomize virtual address space */ + KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */ }; -- cgit v1.2.3 From ef3daeda7b58f046f94b26637d500354038d39f4 Mon Sep 17 00:00:00 2001 From: Yoav Zach Date: Thu, 23 Jun 2005 00:09:58 -0700 Subject: [PATCH] Don't force O_LARGEFILE for 32 bit processes on ia64 In ia64 kernel, the O_LARGEFILE flag is forced when opening a file. This is problematic for execution of 32 bit processes, which are not largefile aware, either by SW emulation or by HW execution. For such processes, the problem is two-fold: 1) When trying to open a file that is larger than 4G the operation should fail, but it's not 2) Writing to offset larger than 4G should fail, but it's not The proposed patch takes advantage of the way 32 bit processes are identified in ia64 systems. Such processes have PER_LINUX32 for their personality. With the patch, the ia64 kernel will not enforce the O_LARGEFILE flag if the current process has PER_LINUX32 set. The behavior for all other architectures remains unchanged. Signed-off-by: Yoav Zach Acked-by: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/fcntl.h | 2 ++ include/linux/fcntl.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/asm-ia64/fcntl.h b/include/asm-ia64/fcntl.h index d193981bb1d8..c9f8d835d0cc 100644 --- a/include/asm-ia64/fcntl.h +++ b/include/asm-ia64/fcntl.h @@ -81,4 +81,6 @@ struct flock { #define F_LINUX_SPECIFIC_BASE 1024 +#define force_o_largefile() ( ! (current->personality & PER_LINUX32) ) + #endif /* _ASM_IA64_FCNTL_H */ diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index 704fb76b6334..8a7c82151de9 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -25,6 +25,10 @@ #ifdef __KERNEL__ +#ifndef force_o_largefile +#define force_o_largefile() (BITS_PER_LONG != 32) +#endif + #if BITS_PER_LONG == 32 #define IS_GETLK32(cmd) ((cmd) == F_GETLK) #define IS_SETLK32(cmd) ((cmd) == F_SETLK) -- cgit v1.2.3 From 11c80c8367db0a9d342529ed74464670cd86a1f6 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 23 Jun 2005 00:09:59 -0700 Subject: [PATCH] adjust per_cpu definition in non-SMP case Fix (in the architectures I'm actually building for) the UP definition of per_cpu so that the cpu specified may be any expression, not just an identifier or a suffix expression. Signed-off-by: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/percpu.h | 2 +- include/asm-ia64/percpu.h | 2 +- include/asm-x86_64/percpu.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 3b709b84934f..9044aeb37828 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -29,7 +29,7 @@ do { \ #define DEFINE_PER_CPU(type, name) \ __typeof__(type) per_cpu__##name -#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) +#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var #endif /* SMP */ diff --git a/include/asm-ia64/percpu.h b/include/asm-ia64/percpu.h index 1e87f19dad56..2b14dee29ce7 100644 --- a/include/asm-ia64/percpu.h +++ b/include/asm-ia64/percpu.h @@ -50,7 +50,7 @@ extern void *per_cpu_init(void); #else /* ! SMP */ -#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) +#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var #define per_cpu_init() (__phys_per_cpu_start) diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h index 415d73f3c8ef..9c71855736fb 100644 --- a/include/asm-x86_64/percpu.h +++ b/include/asm-x86_64/percpu.h @@ -39,7 +39,7 @@ extern void setup_per_cpu_areas(void); #define DEFINE_PER_CPU(type, name) \ __typeof__(type) per_cpu__##name -#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) +#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var #endif /* SMP */ -- cgit v1.2.3 From 46c271bedd2c8444b1d05bc44928beec0c07debc Mon Sep 17 00:00:00 2001 From: Peter Osterlund Date: Thu, 23 Jun 2005 00:10:02 -0700 Subject: [PATCH] Improve CD/DVD packet driver write performance This patch improves write performance for the CD/DVD packet writing driver. The logic for switching between reading and writing has been changed so that streaming writes are no longer interrupted by read requests. Signed-off-by: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pktcdvd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 4e2d2a942ecb..4b32bce9a289 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -159,7 +159,7 @@ struct packet_iosched struct bio *read_queue_tail; struct bio *write_queue; struct bio *write_queue_tail; - int high_prio_read; /* An important read request has been queued */ + sector_t last_write; /* The sector where the last write ended */ int successive_reads; }; -- cgit v1.2.3 From fa912bcb06d5dc9525d8912a145db2bf4b7668c5 Mon Sep 17 00:00:00 2001 From: Daniel Ritz Date: Thu, 23 Jun 2005 00:10:12 -0700 Subject: [PATCH] yenta TI: turn off interrupts during card power-on #2 - make boot-up card recognition more reliable (ie. redo interrogation always if there is no valid 'card inserted' state) (and yes, i saw it happening on an o2micro controller that both CB_CBARD and CB_16BITCARD bits were set at the same time) - also redo interrogation before probing the ISA interrupts. it's safer to do the probing with the socket in a clean state. - make card insert detect more reliable. yenta_get_status() now returns SS_PENDING as long as the card is not completley inserted and one of the voltage bits is set. also !CB_CBARD doesn't mean CB_16BITCARD. there is CB_NOTACARD as well, so make an explicit check for CB_16BITCARD. - for TI bridges: disable IRQs during power-on. in all-serial and tied interrupt mode the interrupts are always disabled for single-slot controllers. for two-slot contollers the disabling is only done when the other slot is empty. to force disabling there is a new module parameter now: pwr_irqs_off=Y (which is a regression for working setups. that's why it's an option, only use when required) - modparm to disable ISA interrupt probing (isa_probe, defaults to on) - remove unneeded code/cleanups (ie. merge yenta_events() into yenta_interrupts()) Signed-off-by: Daniel Ritz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/pcmcia/ss.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/pcmcia/ss.h b/include/pcmcia/ss.h index 6d3413a56708..67b867f31fe4 100644 --- a/include/pcmcia/ss.h +++ b/include/pcmcia/ss.h @@ -77,6 +77,11 @@ extern socket_state_t dead_socket; /* Use this just for bridge windows */ #define MAP_IOSPACE 0x20 +/* power hook operations */ +#define HOOK_POWER_PRE 0x01 +#define HOOK_POWER_POST 0x02 + + typedef struct pccard_io_map { u_char map; u_char flags; @@ -222,6 +227,9 @@ struct pcmcia_socket { /* Zoom video behaviour is so chip specific its not worth adding this to _ops */ void (*zoom_video)(struct pcmcia_socket *, int); + + /* so is power hook */ + int (*power_hook)(struct pcmcia_socket *sock, int operation); /* state thread */ struct semaphore skt_sem; /* protects socket h/w state */ -- cgit v1.2.3 From 0d77e5a2c23da734f5a7925f64afa1c2ed92e0f9 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 23 Jun 2005 00:10:14 -0700 Subject: [PATCH] compat: introduce compat_time_t This patch is based on work by Carlos O'Donell and Matthew Wilcox. It introduces/updates the compat_time_t type and uses it for compat siginfo structures. I have built this on ppc64 and x86_64. Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ia64/compat.h | 1 + include/asm-mips/compat.h | 1 + include/asm-parisc/compat.h | 2 +- include/asm-ppc64/compat.h | 1 + include/asm-ppc64/ppc32.h | 2 +- include/asm-sparc64/compat.h | 1 + include/asm-x86_64/ia32.h | 2 +- 7 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-ia64/compat.h b/include/asm-ia64/compat.h index cc0ff0a4bdd0..0c05e5bad8a0 100644 --- a/include/asm-ia64/compat.h +++ b/include/asm-ia64/compat.h @@ -27,6 +27,7 @@ typedef u16 compat_ipc_pid_t; typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-mips/compat.h b/include/asm-mips/compat.h index dce92079e7fc..d78002afb1e1 100644 --- a/include/asm-mips/compat.h +++ b/include/asm-mips/compat.h @@ -29,6 +29,7 @@ typedef s32 compat_caddr_t; typedef struct { s32 val[2]; } compat_fsid_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-parisc/compat.h b/include/asm-parisc/compat.h index ca0eac647a05..7630d1ad2391 100644 --- a/include/asm-parisc/compat.h +++ b/include/asm-parisc/compat.h @@ -24,7 +24,7 @@ typedef u16 compat_nlink_t; typedef u16 compat_ipc_pid_t; typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; -typedef u32 compat_timer_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-ppc64/compat.h b/include/asm-ppc64/compat.h index 09c28d28ce6c..12414f5fc666 100644 --- a/include/asm-ppc64/compat.h +++ b/include/asm-ppc64/compat.h @@ -26,6 +26,7 @@ typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; typedef s32 compat_key_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-ppc64/ppc32.h b/include/asm-ppc64/ppc32.h index 1d0404897550..6b44a8caf395 100644 --- a/include/asm-ppc64/ppc32.h +++ b/include/asm-ppc64/ppc32.h @@ -32,7 +32,7 @@ typedef struct compat_siginfo { /* POSIX.1b timers */ struct { - timer_t _tid; /* timer id */ + compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ compat_sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ diff --git a/include/asm-sparc64/compat.h b/include/asm-sparc64/compat.h index 22f58055b8ab..b59122dd176d 100644 --- a/include/asm-sparc64/compat.h +++ b/include/asm-sparc64/compat.h @@ -25,6 +25,7 @@ typedef s32 compat_daddr_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; typedef s32 compat_key_t; +typedef s32 compat_timer_t; typedef s32 compat_int_t; typedef s32 compat_long_t; diff --git a/include/asm-x86_64/ia32.h b/include/asm-x86_64/ia32.h index c0a7717923ed..6efa00fe4e7b 100644 --- a/include/asm-x86_64/ia32.h +++ b/include/asm-x86_64/ia32.h @@ -94,7 +94,7 @@ typedef struct compat_siginfo{ /* POSIX.1b timers */ struct { - int _tid; /* timer id */ + compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ compat_sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ -- cgit v1.2.3 From bb93e3a52f8db7210258a1a2134cced0b78a46e1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 00:10:15 -0700 Subject: [PATCH] block: add unlocked_ioctl support for block devices This patch allows block device drivers to convert their ioctl functions to unlocked_ioctl() like character devices and other subsystems. All functions that were called with the BKL held before are still used that way, but I would not be surprised if it could be removed from the ioctl functions in drivers/block/ioctl.c themselves. As a side note, I found that compat_blkdev_ioctl() acquires the BKL as well, which looks like a bug. I have checked that every user of disk->fops->compat_ioctl() in the current git tree gets the BKL itself, so it could easily be removed from compat_blkdev_ioctl(). Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3622e952e98c..9b1278e21279 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -884,6 +884,7 @@ struct block_device_operations { int (*open) (struct inode *, struct file *); int (*release) (struct inode *, struct file *); int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); + long (*unlocked_ioctl) (struct file *, unsigned, unsigned long); long (*compat_ioctl) (struct file *, unsigned, unsigned long); int (*media_changed) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); -- cgit v1.2.3 From 45778ca819accab1a4a3378b3566cab0f189164f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:10:17 -0700 Subject: [PATCH] Remove f_error field from struct file The following patch removes the f_error field and all checks of f_error. Trond said: f_error was introduced for NFS, and made sense when we were guaranteed always to have a file pointer around when write errors occurred. Since then, we have (for various reasons) had to introduce the nfs_open_context in order to track the file read/write state, and it made sense to move our f_error tracking there too. Signed-off-by: Christoph Lameter Acked-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9b1278e21279..517bf4966bf5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -581,7 +581,6 @@ struct file { atomic_t f_count; unsigned int f_flags; mode_t f_mode; - int f_error; loff_t f_pos; struct fown_struct f_owner; unsigned int f_uid, f_gid; -- cgit v1.2.3 From f9fd27a253d5e0b23531d12ce7ad15b6535d4486 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:10:19 -0700 Subject: [PATCH] acl endianess annotations Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/posix_acl_xattr.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 5efd0a6dad94..fe271c1947b2 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -23,13 +23,13 @@ #define ACL_UNDEFINED_ID (-1) typedef struct { - __u16 e_tag; - __u16 e_perm; - __u32 e_id; + __le16 e_tag; + __le16 e_perm; + __le32 e_id; } posix_acl_xattr_entry; typedef struct { - __u32 a_version; + __le32 a_version; posix_acl_xattr_entry a_entries[0]; } posix_acl_xattr_header; -- cgit v1.2.3 From 9a59f452abe11f569e13ec16c51e6d61c54b9838 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:10:19 -0700 Subject: [PATCH] remove This file duplicates , using slightly different names. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/posix_acl_xattr.h | 3 +++ include/linux/reiserfs_acl.h | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index fe271c1947b2..6e53c34035cd 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -52,4 +52,7 @@ posix_acl_xattr_count(size_t size) return size / sizeof(posix_acl_xattr_entry); } +struct posix_acl *posix_acl_from_xattr(const void *value, size_t size); +int posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size); + #endif /* _POSIX_ACL_XATTR_H */ diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h index 2aef9c3f5ce8..0760507a545b 100644 --- a/include/linux/reiserfs_acl.h +++ b/include/linux/reiserfs_acl.h @@ -1,6 +1,5 @@ #include #include -#include #define REISERFS_ACL_VERSION 0x0001 -- cgit v1.2.3 From c43dc2fd885b5658cfd7cedb7bcca20910c517a4 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Thu, 23 Jun 2005 00:10:27 -0700 Subject: [PATCH] aio: make wait_queue ->task ->private In the upcoming aio_down patch, it is useful to store a private data pointer in the kiocb's wait_queue. Since we provide our own wake up function and do not require the task_struct pointer, it makes sense to convert the task pointer into a generic private pointer. Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index c9486c3efb4a..d38c9fecdc36 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -33,7 +33,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 - struct task_struct * task; + void *private; wait_queue_func_t func; struct list_head task_list; }; @@ -60,7 +60,7 @@ typedef struct __wait_queue_head wait_queue_head_t; */ #define __WAITQUEUE_INITIALIZER(name, tsk) { \ - .task = tsk, \ + .private = tsk, \ .func = default_wake_function, \ .task_list = { NULL, NULL } } @@ -86,7 +86,7 @@ static inline void init_waitqueue_head(wait_queue_head_t *q) static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) { q->flags = 0; - q->task = p; + q->private = p; q->func = default_wake_function; } @@ -94,7 +94,7 @@ static inline void init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) { q->flags = 0; - q->task = NULL; + q->private = NULL; q->func = func; } @@ -110,7 +110,7 @@ static inline int waitqueue_active(wait_queue_head_t *q) * aio specifies a wait queue entry with an async notification * callback routine, not associated with any task. */ -#define is_sync_wait(wait) (!(wait) || ((wait)->task)) +#define is_sync_wait(wait) (!(wait) || ((wait)->private)) extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); @@ -384,7 +384,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); #define DEFINE_WAIT(name) \ wait_queue_t name = { \ - .task = current, \ + .private = current, \ .func = autoremove_wake_function, \ .task_list = LIST_HEAD_INIT((name).task_list), \ } @@ -393,7 +393,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); struct wait_bit_queue name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ .wait = { \ - .task = current, \ + .private = current, \ .func = wake_bit_function, \ .task_list = \ LIST_HEAD_INIT((name).wait.task_list), \ @@ -402,7 +402,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); #define init_wait(wait) \ do { \ - (wait)->task = current; \ + (wait)->private = current; \ (wait)->func = autoremove_wake_function; \ INIT_LIST_HEAD(&(wait)->task_list); \ } while (0) -- cgit v1.2.3 From bfb07599da289881d3bcbb601a110e997fc7444b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 23 Jun 2005 00:10:32 -0700 Subject: [PATCH] Introduce tty_unregister_ldisc() It's a bit strange to see tty_register_ldisc call in modules' exit functions. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tty.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/tty.h b/include/linux/tty.h index 1b76106272d3..59ff42c629ec 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -345,6 +345,7 @@ extern int tty_check_change(struct tty_struct * tty); extern void stop_tty(struct tty_struct * tty); extern void start_tty(struct tty_struct * tty); extern int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc); +extern int tty_unregister_ldisc(int disc); extern int tty_register_driver(struct tty_driver *driver); extern int tty_unregister_driver(struct tty_driver *driver); extern void tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev); -- cgit v1.2.3 From 4749f32da939d4e4160541b2cadc22492bb507ec Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 23 Jun 2005 11:36:56 +0200 Subject: [PATCH] better USB_MON dependencies This makes the USB_MON less confusing. Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- include/linux/usb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/usb.h b/include/linux/usb.h index 3d508bf08402..eb282b581546 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -290,7 +290,7 @@ struct usb_bus { struct class_device *class_dev; /* class device for this bus */ struct kref kref; /* handles reference counting this bus */ void (*release)(struct usb_bus *bus); /* function to destroy this bus's memory */ -#if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE) +#if defined(CONFIG_USB_MON) struct mon_bus *mon_bus; /* non-null when associated */ int monitored; /* non-zero when monitored */ #endif -- cgit v1.2.3 From 317a76f9a44b437d6301718f4e5d08bd93f98da7 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:19:55 -0700 Subject: [TCP]: Add pluggable congestion control algorithm infrastructure. Allow TCP to have multiple pluggable congestion control algorithms. Algorithms are defined by a set of operations and can be built in or modules. The legacy "new RENO" algorithm is used as a starting point and fallback. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/sysctl.h | 9 +- include/linux/tcp.h | 49 +++------- include/net/tcp.h | 237 ++++++++++++++++--------------------------------- 3 files changed, 86 insertions(+), 209 deletions(-) (limited to 'include') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 614e939c78a4..72965bfe6cfb 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -333,21 +333,14 @@ enum NET_TCP_FRTO=92, NET_TCP_LOW_LATENCY=93, NET_IPV4_IPFRAG_SECRET_INTERVAL=94, - NET_TCP_WESTWOOD=95, NET_IPV4_IGMP_MAX_MSF=96, NET_TCP_NO_METRICS_SAVE=97, - NET_TCP_VEGAS=98, - NET_TCP_VEGAS_ALPHA=99, - NET_TCP_VEGAS_BETA=100, - NET_TCP_VEGAS_GAMMA=101, - NET_TCP_BIC=102, - NET_TCP_BIC_FAST_CONVERGENCE=103, - NET_TCP_BIC_LOW_WINDOW=104, NET_TCP_DEFAULT_WIN_SCALE=105, NET_TCP_MODERATE_RCVBUF=106, NET_TCP_TSO_WIN_DIVISOR=107, NET_TCP_BIC_BETA=108, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, + NET_TCP_CONG_CONTROL=110, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 97a7c9e03df5..3ea75dd6640a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -203,13 +203,6 @@ struct tcp_sack_block { __u32 end_seq; }; -enum tcp_congestion_algo { - TCP_RENO=0, - TCP_VEGAS, - TCP_WESTWOOD, - TCP_BIC, -}; - struct tcp_options_received { /* PAWS/RTTM data */ long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ @@ -305,7 +298,7 @@ struct tcp_sock { __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ - __u8 adv_cong; /* Using Vegas, Westwood, or BIC */ + __u8 unused; __u8 defer_accept; /* User waits for some data after accept() */ /* RTT measurement */ @@ -401,37 +394,10 @@ struct tcp_sock { __u32 time; } rcvq_space; -/* TCP Westwood structure */ - struct { - __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ - __u32 bw_est; /* bandwidth estimate */ - __u32 rtt_win_sx; /* here starts a new evaluation... */ - __u32 bk; - __u32 snd_una; /* used for evaluating the number of acked bytes */ - __u32 cumul_ack; - __u32 accounted; - __u32 rtt; - __u32 rtt_min; /* minimum observed RTT */ - } westwood; - -/* Vegas variables */ - struct { - __u32 beg_snd_nxt; /* right edge during last RTT */ - __u32 beg_snd_una; /* left edge during last RTT */ - __u32 beg_snd_cwnd; /* saves the size of the cwnd */ - __u8 doing_vegas_now;/* if true, do vegas for this RTT */ - __u16 cntRTT; /* # of RTTs measured within last RTT */ - __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ - __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ - } vegas; - - /* BI TCP Parameters */ - struct { - __u32 cnt; /* increase cwnd by 1 after this number of ACKs */ - __u32 last_max_cwnd; /* last maximium snd_cwnd */ - __u32 last_cwnd; /* the last snd_cwnd */ - __u32 last_stamp; /* time when updated last_cwnd */ - } bictcp; + /* Pluggable TCP congestion control hook */ + struct tcp_congestion_ops *ca_ops; + u32 ca_priv[16]; +#define TCP_CA_PRIV_SIZE (16*sizeof(u32)) }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk) return (struct tcp_sock *)sk; } +static inline void *tcp_ca(const struct tcp_sock *tp) +{ + return (void *) tp->ca_priv; +} + #endif #endif /* _LINUX_TCP_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index f730935b824a..e427cf35915c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -505,25 +505,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) #else # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) #endif - -#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation - * max_cwnd = snd_cwnd * beta - */ -#define BICTCP_MAX_INCREMENT 32 /* - * Limit on the amount of - * increment allowed during - * binary search. - */ -#define BICTCP_FUNC_OF_MIN_INCR 11 /* - * log(B/Smin)/log(B/(B-1))+1, - * Smin:min increment - * B:log factor - */ -#define BICTCP_B 4 /* - * In binary search, - * go to point (max+min)/N - */ - /* * TCP option */ @@ -596,16 +577,7 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; -extern int sysctl_tcp_westwood; -extern int sysctl_tcp_vegas_cong_avoid; -extern int sysctl_tcp_vegas_alpha; -extern int sysctl_tcp_vegas_beta; -extern int sysctl_tcp_vegas_gamma; extern int sysctl_tcp_nometrics_save; -extern int sysctl_tcp_bic; -extern int sysctl_tcp_bic_fast_convergence; -extern int sysctl_tcp_bic_low_window; -extern int sysctl_tcp_bic_beta; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@ -1136,6 +1108,80 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp, tp->packets_out -= tcp_skb_pcount(skb); } +/* Events passed to congestion control interface */ +enum tcp_ca_event { + CA_EVENT_TX_START, /* first transmit when no packets in flight */ + CA_EVENT_CWND_RESTART, /* congestion window restart */ + CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ + CA_EVENT_FRTO, /* fast recovery timeout */ + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_FAST_ACK, /* in sequence ack */ + CA_EVENT_SLOW_ACK, /* other ack */ +}; + +/* + * Interface for adding new TCP congestion control handlers + */ +#define TCP_CA_NAME_MAX 16 +struct tcp_congestion_ops { + struct list_head list; + + /* initialize private data (optional) */ + void (*init)(struct tcp_sock *tp); + /* cleanup private data (optional) */ + void (*release)(struct tcp_sock *tp); + + /* return slow start threshold (required) */ + u32 (*ssthresh)(struct tcp_sock *tp); + /* lower bound for congestion window (optional) */ + u32 (*min_cwnd)(struct tcp_sock *tp); + /* do new cwnd calculation (required) */ + void (*cong_avoid)(struct tcp_sock *tp, u32 ack, + u32 rtt, u32 in_flight, int good_ack); + /* round trip time sample per acked packet (optional) */ + void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt); + /* call before changing ca_state (optional) */ + void (*set_state)(struct tcp_sock *tp, u8 new_state); + /* call when cwnd event occurs (optional) */ + void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev); + /* new value of cwnd after loss (optional) */ + u32 (*undo_cwnd)(struct tcp_sock *tp); + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked); + /* get info for tcp_diag (optional) */ + void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb); + + char name[TCP_CA_NAME_MAX]; + struct module *owner; +}; + +extern int tcp_register_congestion_control(struct tcp_congestion_ops *type); +extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); + +extern void tcp_init_congestion_control(struct tcp_sock *tp); +extern void tcp_cleanup_congestion_control(struct tcp_sock *tp); +extern int tcp_set_default_congestion_control(const char *name); +extern void tcp_get_default_congestion_control(char *name); + +extern struct tcp_congestion_ops tcp_reno; +extern u32 tcp_reno_ssthresh(struct tcp_sock *tp); +extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 rtt, u32 in_flight, int flag); +extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp); + +static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) +{ + if (tp->ca_ops->set_state) + tp->ca_ops->set_state(tp, ca_state); + tp->ca_state = ca_state; +} + +static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + if (tp->ca_ops->cwnd_event) + tp->ca_ops->cwnd_event(tp, event); +} + /* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK @@ -1155,91 +1201,6 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) return (tp->packets_out - tp->left_out + tp->retrans_out); } -/* - * Which congestion algorithim is in use on the connection. - */ -#define tcp_is_vegas(__tp) ((__tp)->adv_cong == TCP_VEGAS) -#define tcp_is_westwood(__tp) ((__tp)->adv_cong == TCP_WESTWOOD) -#define tcp_is_bic(__tp) ((__tp)->adv_cong == TCP_BIC) - -/* Recalculate snd_ssthresh, we want to set it to: - * - * Reno: - * one half the current congestion window, but no - * less than two segments - * - * BIC: - * behave like Reno until low_window is reached, - * then increase congestion window slowly - */ -static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp) -{ - if (tcp_is_bic(tp)) { - if (sysctl_tcp_bic_fast_convergence && - tp->snd_cwnd < tp->bictcp.last_max_cwnd) - tp->bictcp.last_max_cwnd = (tp->snd_cwnd * - (BICTCP_BETA_SCALE - + sysctl_tcp_bic_beta)) - / (2 * BICTCP_BETA_SCALE); - else - tp->bictcp.last_max_cwnd = tp->snd_cwnd; - - if (tp->snd_cwnd > sysctl_tcp_bic_low_window) - return max((tp->snd_cwnd * sysctl_tcp_bic_beta) - / BICTCP_BETA_SCALE, 2U); - } - - return max(tp->snd_cwnd >> 1U, 2U); -} - -/* Stop taking Vegas samples for now. */ -#define tcp_vegas_disable(__tp) ((__tp)->vegas.doing_vegas_now = 0) - -static inline void tcp_vegas_enable(struct tcp_sock *tp) -{ - /* There are several situations when we must "re-start" Vegas: - * - * o when a connection is established - * o after an RTO - * o after fast recovery - * o when we send a packet and there is no outstanding - * unacknowledged data (restarting an idle connection) - * - * In these circumstances we cannot do a Vegas calculation at the - * end of the first RTT, because any calculation we do is using - * stale info -- both the saved cwnd and congestion feedback are - * stale. - * - * Instead we must wait until the completion of an RTT during - * which we actually receive ACKs. - */ - - /* Begin taking Vegas samples next time we send something. */ - tp->vegas.doing_vegas_now = 1; - - /* Set the beginning of the next send window. */ - tp->vegas.beg_snd_nxt = tp->snd_nxt; - - tp->vegas.cntRTT = 0; - tp->vegas.minRTT = 0x7fffffff; -} - -/* Should we be taking Vegas samples right now? */ -#define tcp_vegas_enabled(__tp) ((__tp)->vegas.doing_vegas_now) - -extern void tcp_ca_init(struct tcp_sock *tp); - -static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) -{ - if (tcp_is_vegas(tp)) { - if (ca_state == TCP_CA_Open) - tcp_vegas_enable(tp); - else - tcp_vegas_disable(tp); - } - tp->ca_state = ca_state; -} - /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is rate halving phase, when cwnd is decreasing towards * ssthresh. @@ -1288,7 +1249,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) static inline void __tcp_enter_cwr(struct tcp_sock *tp) { tp->undo_marker = 0; - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -1876,52 +1837,4 @@ struct tcp_iter_state { extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo); extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo); -/* TCP Westwood functions and constants */ - -#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ -#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ - -static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq) -{ - if (tcp_is_westwood(tp)) - tp->westwood.rtt = rtt_seq; -} - -static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp) -{ - return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) / - (__u32) (tp->mss_cache_std), - 2U); -} - -static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp) -{ - return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0; -} - -static inline int tcp_westwood_ssthresh(struct tcp_sock *tp) -{ - __u32 ssthresh = 0; - - if (tcp_is_westwood(tp)) { - ssthresh = __tcp_westwood_bw_rttmin(tp); - if (ssthresh) - tp->snd_ssthresh = ssthresh; - } - - return (ssthresh != 0); -} - -static inline int tcp_westwood_cwnd(struct tcp_sock *tp) -{ - __u32 cwnd = 0; - - if (tcp_is_westwood(tp)) { - cwnd = __tcp_westwood_bw_rttmin(tp); - if (cwnd) - tp->snd_cwnd = cwnd; - } - - return (cwnd != 0); -} #endif /* _TCP_H */ -- cgit v1.2.3 From 056ede6cface66b400cd3b8e60ed077cc5b85c18 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:21:28 -0700 Subject: [TCP]: Report congestion control algorithm in tcp_diag. Enhancement to the tcp_diag interface used by the iproute2 ss command to report the tcp congestion control being used by a socket. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/tcp_diag.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h index ceee962e1d15..7a5996743946 100644 --- a/include/linux/tcp_diag.h +++ b/include/linux/tcp_diag.h @@ -99,9 +99,10 @@ enum TCPDIAG_MEMINFO, TCPDIAG_INFO, TCPDIAG_VEGASINFO, + TCPDIAG_CONG, }; -#define TCPDIAG_MAX TCPDIAG_VEGASINFO +#define TCPDIAG_MAX TCPDIAG_CONG /* TCPDIAG_MEM */ @@ -123,5 +124,4 @@ struct tcpvegas_info { __u32 tcpv_minrtt; }; - #endif /* _TCP_DIAG_H_ */ -- cgit v1.2.3 From e608a8072b10258aa18c2e33324def225199ba1d Mon Sep 17 00:00:00 2001 From: David Mosberger-Tang Date: Wed, 22 Jun 2005 22:24:00 -0700 Subject: [IA64] Fix pfn_to_nid() so the kernel compiles again for !CONFIG_DISCONTIGMEM. Signed-off-by: David Mosberger-Tang Signed-off-by: Tony Luck --- include/asm-ia64/mmzone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-ia64/mmzone.h b/include/asm-ia64/mmzone.h index 83ca4043fc11..d32f51e3d6c2 100644 --- a/include/asm-ia64/mmzone.h +++ b/include/asm-ia64/mmzone.h @@ -15,6 +15,8 @@ #include #include +#ifdef CONFIG_DISCONTIGMEM + static inline int pfn_to_nid(unsigned long pfn) { #ifdef CONFIG_NUMA @@ -29,8 +31,6 @@ static inline int pfn_to_nid(unsigned long pfn) #endif } -#ifdef CONFIG_DISCONTIGMEM - #ifdef CONFIG_IA64_DIG /* DIG systems are small */ # define MAX_PHYSNODE_ID 8 # define NR_NODE_MEMBLKS (MAX_NUMNODES * 8) -- cgit v1.2.3