From dd7f0b80926befc8c70a873b5b0c0c7b5fd1e7b9 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Wed, 22 Jun 2005 12:38:33 -0700 Subject: [NETFILTER]: Fix "iptables -D" rule deletion with ipt_CLUSTERIP target. The patch just changes the order of structure members. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ipt_CLUSTERIP.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h index baa83e757156..d9bceedfb3dc 100644 --- a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h +++ b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h @@ -18,7 +18,6 @@ struct clusterip_config; struct ipt_clusterip_tgt_info { u_int32_t flags; - struct clusterip_config *config; /* only relevant for new ones */ u_int8_t clustermac[6]; @@ -27,6 +26,8 @@ struct ipt_clusterip_tgt_info { u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; enum clusterip_hashmode hash_mode; u_int32_t hash_initval; + + struct clusterip_config *config; }; #endif /*_IPT_CLUSTERIP_H_target*/ -- cgit v1.2.3 From 6ca4f65e6b390d09e1de7280cf9fd4f5d8e4b48b Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:04:55 -0700 Subject: [NETPOLL]: Set poll_owner to -1 before unlocking in netpoll_poll_unlock() This trivial patch moves the assignment of poll_owner to -1 inside of the lock. This fixes a potential SMP race in the code. Signed-off-by: Jeff Moyer Signed-off-by: David S. Miller --- include/linux/netpoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index c0d8b90c5202..449a4fde6587 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -53,8 +53,8 @@ static inline void netpoll_poll_lock(struct net_device *dev) static inline void netpoll_poll_unlock(struct net_device *dev) { if (dev->np) { - spin_unlock(&dev->np->poll_lock); dev->np->poll_owner = -1; + spin_unlock(&dev->np->poll_lock); } } -- cgit v1.2.3 From 115c1d6e61b70851d9a363328c3b8d4c2559a1d3 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:05:31 -0700 Subject: [NETPOLL]: Introduce a netpoll_info struct This patch introduces a netpoll_info structure, which the struct net_device will now point to instead of pointing to a struct netpoll. The reason for this is two-fold: 1) fields such as the rx_flags, poll_owner, and poll_lock should be maintained per net_device, not per netpoll; and 2) this is a first step in providing support for multiple netpoll clients to register against the same net_device. The struct netpoll is now pointed to by the netpoll_info structure. As such, the previous behaviour of the code is preserved. Signed-off-by: Jeff Moyer Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 ++-- include/linux/netpoll.h | 25 +++++++++++++++++-------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ba5d1236aa17..d6afd440cf7b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -41,7 +41,7 @@ struct divert_blk; struct vlan_group; struct ethtool_ops; -struct netpoll; +struct netpoll_info; /* source back-compat hooks */ #define SET_ETHTOOL_OPS(netdev,ops) \ ( (netdev)->ethtool_ops = (ops) ) @@ -468,7 +468,7 @@ struct net_device unsigned char *haddr); int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); #ifdef CONFIG_NETPOLL - struct netpoll *np; + struct netpoll_info *npinfo; #endif #ifdef CONFIG_NET_POLL_CONTROLLER void (*poll_controller)(struct net_device *dev); diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 449a4fde6587..388cd91bc7a6 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -16,14 +16,18 @@ struct netpoll; struct netpoll { struct net_device *dev; char dev_name[16], *name; - int rx_flags; void (*rx_hook)(struct netpoll *, int, char *, int); void (*drop)(struct sk_buff *skb); u32 local_ip, remote_ip; u16 local_port, remote_port; unsigned char local_mac[6], remote_mac[6]; +}; + +struct netpoll_info { spinlock_t poll_lock; int poll_owner; + int rx_flags; + struct netpoll *np; }; void netpoll_poll(struct netpoll *np); @@ -39,22 +43,27 @@ void netpoll_queue(struct sk_buff *skb); #ifdef CONFIG_NETPOLL static inline int netpoll_rx(struct sk_buff *skb) { - return skb->dev->np && skb->dev->np->rx_flags && __netpoll_rx(skb); + struct netpoll_info *npinfo = skb->dev->npinfo; + + if (!npinfo || !npinfo->rx_flags) + return 0; + + return npinfo->np && __netpoll_rx(skb); } static inline void netpoll_poll_lock(struct net_device *dev) { - if (dev->np) { - spin_lock(&dev->np->poll_lock); - dev->np->poll_owner = smp_processor_id(); + if (dev->npinfo) { + spin_lock(&dev->npinfo->poll_lock); + dev->npinfo->poll_owner = smp_processor_id(); } } static inline void netpoll_poll_unlock(struct net_device *dev) { - if (dev->np) { - dev->np->poll_owner = -1; - spin_unlock(&dev->np->poll_lock); + if (dev->npinfo) { + dev->npinfo->poll_owner = -1; + spin_unlock(&dev->npinfo->poll_lock); } } -- cgit v1.2.3 From fbeec2e1552949002065435c9829dc244ad85407 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Jun 2005 22:05:59 -0700 Subject: [NETPOLL]: allow multiple netpoll_clients to register against one interface This patch provides support for registering multiple netpoll clients to the same network device. Only one of these clients may register an rx_hook, however. In practice, this restriction has not been problematic. It is worth mentioning, though, that the current design can be easily extended to allow for the registration of multiple rx_hooks. The basic idea of the patch is that the rx_np pointer in the netpoll_info structure points to the struct netpoll that has rx_hook filled in. Aside from this one case, there is no need for a pointer from the struct net_device to an individual struct netpoll. A lock is introduced to protect the setting and clearing of the rx_np pointer. The pointer will only be cleared upon netpoll client module removal, and the lock should be uncontested. Signed-off-by: Jeff Moyer Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 388cd91bc7a6..bcd0ac33f592 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -27,7 +27,8 @@ struct netpoll_info { spinlock_t poll_lock; int poll_owner; int rx_flags; - struct netpoll *np; + spinlock_t rx_lock; + struct netpoll *rx_np; /* netpoll that registered an rx_hook */ }; void netpoll_poll(struct netpoll *np); @@ -44,11 +45,19 @@ void netpoll_queue(struct sk_buff *skb); static inline int netpoll_rx(struct sk_buff *skb) { struct netpoll_info *npinfo = skb->dev->npinfo; + unsigned long flags; + int ret = 0; - if (!npinfo || !npinfo->rx_flags) + if (!npinfo || (!npinfo->rx_np && !npinfo->rx_flags)) return 0; - return npinfo->np && __netpoll_rx(skb); + spin_lock_irqsave(&npinfo->rx_lock, flags); + /* check rx_flags again with the lock held */ + if (npinfo->rx_flags && __netpoll_rx(skb)) + ret = 1; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + + return ret; } static inline void netpoll_poll_lock(struct net_device *dev) -- cgit v1.2.3 From cb65d506c34c86df5bcef939ce5a8666a451bd8b Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Wed, 22 Jun 2005 22:15:01 -0700 Subject: [X25]: Selective sub-address matching with call user data. From: Shaun Pereira This is the first (independent of the second) patch of two that I am working on with x25 on linux (tested with xot on a cisco router). Details are as follows. Current state of module: A server using the current implementation (2.6.11.7) of the x25 module will accept a call request/ incoming call packet at the listening x.25 address, from all callers to that address, as long as NO call user data is present in the packet header. If the server needs to choose to accept a particular call request/ incoming call packet arriving at its listening x25 address, then the kernel has to allow a match of call user data present in the call request packet with its own. This is required when multiple servers listen at the same x25 address and device interface. The kernel currently matches ALL call user data, if present. Current Changes: This patch is a follow up to the patch submitted previously by Andrew Hendry, and allows the user to selectively control the number of octets of call user data in the call request packet that the kernel will match. By default no call user data is matched, even if call user data is present. To allow call user data matching, a cudmatchlength > 0 has to be passed into the kernel, after which the passed number of octets will be matched. Otherwise the kernel behaves exactly as in the original implementation. This patch also ensures that, as is normally the case, no call user data will be present in the Call accepted / call connected packet sent back to the caller. Future Changes on next patch: There are cases, however, when call user data may be present in the call accepted packet. According to the X.25 recommendation (ITU-T 10/96) section 5.2.3.2 call user data may be present in the call accepted packet provided the fast select facility is used. My next patch will include this fast select facility and the ability to send up to 128 octets of call user data in the call accepted packet provided the fast select facility is used. I am currently testing this, again with xot on linux and cisco. Signed-off-by: Shaun Pereira (With a fix from Alexey Dobriyan ) Signed-off-by: Andrew Morton Signed-off-by: David S. 
Miller --- include/linux/x25.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/x25.h b/include/linux/x25.h index 7531cfed5885..6f43b3d20248 100644 --- a/include/linux/x25.h +++ b/include/linux/x25.h @@ -4,6 +4,8 @@ * History * mar/20/00 Daniela Squassoni Disabling/enabling of facilities * negotiation. + * apr/02/05 Shaun Pereira Selective sub address matching with + * call user data */ #ifndef X25_KERNEL_H @@ -16,6 +18,7 @@ #define SIOCX25GCALLUSERDATA (SIOCPROTOPRIVATE + 4) #define SIOCX25SCALLUSERDATA (SIOCPROTOPRIVATE + 5) #define SIOCX25GCAUSEDIAG (SIOCPROTOPRIVATE + 6) +#define SIOCX25SCUDMATCHLEN (SIOCPROTOPRIVATE + 7) /* * Values for {get,set}sockopt. @@ -109,4 +112,11 @@ struct x25_causediag { unsigned char diagnostic; }; +/* + * Further optional call user data match length selection + */ +struct x25_subaddr { + unsigned int cudmatchlength; +}; + #endif -- cgit v1.2.3 From ebc3f64b864fc16a594c2e63bf55a55c7d42084b Mon Sep 17 00:00:00 2001 From: Shaun Pereira Date: Wed, 22 Jun 2005 22:16:17 -0700 Subject: [X25]: Fast select with no restriction on response This patch is a follow up to patch 1 regarding "Selective Sub Address matching with call user data". It allows use of the Fast-Select-Acceptance optional user facility for X.25. This patch just implements fast select with no restriction on response (NRR). What this means (according to ITU-T Recommendation 10/96 section 6.16) is that if, in an incoming call packet, the relevant facility bits are set for fast-select-NRR, then the called DTE can issue a direct response to the incoming packet using a call-accepted packet that contains call-user-data. This patch allows such a response. The called DTE can also respond with a clear-request packet that contains call-user-data. However, this feature is currently not implemented by the patch. How is Fast Select Acceptance used? By default, the system does not allow fast select acceptance (as before). To enable a response to fast select acceptance, after a listen socket is created and bound as follows socket(AF_X25, SOCK_SEQPACKET, 0); bind(call_soc, (struct sockaddr *)&locl_addr, sizeof(locl_addr)); but before a listen system call is made, the following ioctl should be used: ioctl(call_soc,SIOCX25CALLACCPTAPPRV); Now the listen system call can be made: listen(call_soc, 4); After this, an incoming-call packet will be accepted, but no call-accepted packet will be sent back until the following system call is made on the socket that accepts the call: ioctl(vc_soc,SIOCX25SENDCALLACCPT); The network (or cisco xot router used for testing here) will allow the application server's call-user-data in the call-accepted packet, provided the call-request was made with Fast-select NRR. Signed-off-by: Shaun Pereira Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/x25.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/x25.h b/include/linux/x25.h index 6f43b3d20248..16d44931afa0 100644 --- a/include/linux/x25.h +++ b/include/linux/x25.h @@ -19,6 +19,8 @@ #define SIOCX25SCALLUSERDATA (SIOCPROTOPRIVATE + 5) #define SIOCX25GCAUSEDIAG (SIOCPROTOPRIVATE + 6) #define SIOCX25SCUDMATCHLEN (SIOCPROTOPRIVATE + 7) +#define SIOCX25CALLACCPTAPPRV (SIOCPROTOPRIVATE + 8) +#define SIOCX25SENDCALLACCPT (SIOCPROTOPRIVATE + 9) /* * Values for {get,set}sockopt. 
-- cgit v1.2.3 From 408fde81c1bff15c875a3618481e93a01dcc79ea Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:37 -0700 Subject: [PATCH] remove non-DISCONTIG use of pgdat->node_mem_map This patch effectively eliminates direct use of pgdat->node_mem_map outside of the DISCONTIG code. On a flat memory system, these fields aren't currently used, nor are they on a sparsemem system. There was also a node_mem_map(nid) macro on many architectures. Its use, along with the use of ->node_mem_map itself, was not consistent. It has been removed in favor of two new, more explicit, arch-independent macros: pgdat_page_nr(pgdat, pagenr) nid_page_nr(nid, pagenr) I called them "pgdat" and "nid" because we overload the term "node" to mean "NUMA node", "DISCONTIG node" or "pg_data_t" in very confusing ways. I believe the newer names are much clearer. These macros can be overridden in the sparsemem case with a theoretically slower operation using node_start_pfn and pfn_to_page() instead. We could make this the only behavior if people want, but I don't want to change too much at once. One thing at a time. This patch removes more code than it adds. Compile tested on alpha, alpha discontig, arm, arm-discontig, i386, i386 generic, NUMAQ, Summit, ppc64, ppc64 discontig, and x86_64. Full list here: http://sr71.net/patches/2.6.12/2.6.12-rc1-mhp2/configs/ Boot tested on NUMAQ, x86 SMP and ppc64 power4/5 LPARs. Signed-off-by: Dave Hansen Signed-off-by: Martin J. Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4733d35d8223..b79633d3a97b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -284,6 +284,8 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) +#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) +#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) extern struct pglist_data *pgdat_list; -- cgit v1.2.3 From 6f167ec721108c9282d54424516a12c805e3c306 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:39 -0700 Subject: [PATCH] sparsemem base: simple NUMA remap space allocator Introduce a simple allocator for the NUMA remap space. This space is very scarce, used for structures which are best allocated node local. This mechanism is also used on non-NUMA ia64 systems with a vmem_map to keep the pgdat->node_mem_map initialized in a consistent place for all architectures. Issues: o alloc_remap takes a node_id where we might expect a pgdat; this was intended to allow us to allocate the pgdats themselves using this mechanism, which we do not yet do. Could have alloc_remap_node() and alloc_remap_nid() for this purpose. 
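As a usage sketch (not part of the patch): a caller that wants node-local storage would try the remap space first and fall back to bootmem, roughly as below. The helper name and the bootmem fallback are illustrative assumptions; only alloc_remap() itself and the alloc_bootmem_node() macro come from the headers in this series.

	/*
	 * Hypothetical caller: prefer the scarce node-local remap space,
	 * fall back to bootmem when the architecture does not define
	 * CONFIG_HAVE_ARCH_ALLOC_REMAP (alloc_remap() then returns NULL).
	 */
	static void * __init alloc_node_local_sketch(int nid, unsigned long size)
	{
		void *ptr = alloc_remap(nid, size);

		if (!ptr)
			ptr = alloc_bootmem_node(NODE_DATA(nid), size);
		return ptr;
	}
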
Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 0dd8ca1a3d5a..500f451ce0c0 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -67,6 +67,15 @@ extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP +extern void *alloc_remap(int nid, unsigned long size); +#else +static inline void *alloc_remap(int nid, unsigned long size) +{ + return NULL; +} +#endif + extern unsigned long __initdata nr_kernel_pages; extern unsigned long __initdata nr_all_pages; -- cgit v1.2.3 From 348f8b6c4837a07304d2f72b11ce8d96588065e0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:40 -0700 Subject: [PATCH] sparsemem base: reorganize page->flags bit operations Generify the value fields in the page_flags. The aim is to allow the location and size of these fields to be varied. Additionally we want to move away from fixed allocations per field whilst still enforcing the overall bit utilisation limits. We rely on the compiler to spot and optimise the accessor functions. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 53 +++++++++++++++++++++++++++++++++++++++++--------- include/linux/mmzone.h | 19 +++++++----------- 2 files changed, 51 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1813b162b0a8..57b2ead51dba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -395,19 +395,41 @@ static inline void put_page(struct page *page) /* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. - * We'll have up to (MAX_NUMNODES * MAX_NR_ZONES) zones total, - * so we use (MAX_NODES_SHIFT + MAX_ZONES_SHIFT) here to get enough bits. */ -#define NODEZONE_SHIFT (sizeof(page_flags_t)*8 - MAX_NODES_SHIFT - MAX_ZONES_SHIFT) + +/* Page flags: | NODE | ZONE | ... | FLAGS | */ +#define NODES_PGOFF ((sizeof(page_flags_t)*8) - NODES_SHIFT) +#define ZONES_PGOFF (NODES_PGOFF - ZONES_SHIFT) + +/* + * Define the bit shifts to access each section. For non-existant + * sections we define the shift as 0; that plus a 0 mask ensures + * the compiler will optimise away reference to them. + */ +#define NODES_PGSHIFT (NODES_PGOFF * (NODES_SHIFT != 0)) +#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_SHIFT != 0)) + +/* NODE:ZONE is used to lookup the zone from a page. 
*/ +#define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#define ZONETABLE_PGSHIFT ZONES_PGSHIFT + +#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#endif + #define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) +#define ZONES_MASK ((1UL << ZONES_SHIFT) - 1) +#define NODES_MASK ((1UL << NODES_SHIFT) - 1) +#define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) + static inline unsigned long page_zonenum(struct page *page) { - return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT)); + return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } static inline unsigned long page_to_nid(struct page *page) { - return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT)); + return (page->flags >> NODES_PGSHIFT) & NODES_MASK; } struct zone; @@ -415,13 +437,26 @@ extern struct zone *zone_table[]; static inline struct zone *page_zone(struct page *page) { - return zone_table[page->flags >> NODEZONE_SHIFT]; + return zone_table[(page->flags >> ZONETABLE_PGSHIFT) & + ZONETABLE_MASK]; +} + +static inline void set_page_zone(struct page *page, unsigned long zone) +{ + page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); + page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; +} +static inline void set_page_node(struct page *page, unsigned long node) +{ + page->flags &= ~(NODES_MASK << NODES_PGSHIFT); + page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } -static inline void set_page_zone(struct page *page, unsigned long nodezone_num) +static inline void set_page_links(struct page *page, unsigned long zone, + unsigned long node) { - page->flags &= ~(~0UL << NODEZONE_SHIFT); - page->flags |= nodezone_num << NODEZONE_SHIFT; + set_page_zone(page, zone); + set_page_node(page, node); } #ifndef CONFIG_DISCONTIGMEM diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b79633d3a97b..39e912708e2a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -414,30 +414,25 @@ extern struct pglist_data contig_page_data; #include +#endif /* !CONFIG_DISCONTIGMEM */ + #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* * with 32 bit page->flags field, we reserve 8 bits for node/zone info. * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. */ -#define MAX_NODES_SHIFT 6 +#define FLAGS_RESERVED 8 + #elif BITS_PER_LONG == 64 /* * with 64 bit flags field, there's plenty of room. */ -#define MAX_NODES_SHIFT 10 -#endif +#define FLAGS_RESERVED 32 -#endif /* !CONFIG_DISCONTIGMEM */ - -#if NODES_SHIFT > MAX_NODES_SHIFT -#error NODES_SHIFT > MAX_NODES_SHIFT -#endif +#else -/* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */ -#define MAX_ZONES_SHIFT 2 +#error BITS_PER_LONG not defined -#if ZONES_SHIFT > MAX_ZONES_SHIFT -#error ZONES_SHIFT > MAX_ZONES_SHIFT #endif #endif /* !__ASSEMBLY__ */ -- cgit v1.2.3 From 93b7504e3e6c1d98586854806e51bea329ea3aa9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 23 Jun 2005 00:07:47 -0700 Subject: [PATCH] Introduce new Kconfig option for NUMA or DISCONTIG There is some confusion that arose when working on SPARSEMEM patch between what is needed for DISCONTIG vs. NUMA. Multiple pg_data_t's are needed for DISCONTIGMEM or NUMA, independently. All of the current NUMA implementations require an implementation of DISCONTIG. Because of this, quite a lot of code which is really needed for NUMA is actually under DISCONTIG #ifdefs. For SPARSEMEM, we changed some of these #ifdefs to CONFIG_NUMA, but that broke the DISCONTIG=y and NUMA=n case. 
Introducing this new NEED_MULTIPLE_NODES config option allows code that is needed for either NUMA or DISCONTIG to be separated out from code that is specific to DISCONTIG. One great advantage of this approach is that it doesn't require every architecture to be converted over. All of the current implementations should "just work"; only the ones implementing SPARSEMEM will have to be fixed up. The change to free_area_init() makes it work both inside and outside of the new config option. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39e912708e2a..95f4a780ea66 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -402,7 +402,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, /* Returns the number of the current Node. */ #define numa_node_id() (cpu_to_node(raw_smp_processor_id())) -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data contig_page_data; #define NODE_DATA(nid) (&contig_page_data) @@ -410,11 +410,11 @@ extern struct pglist_data contig_page_data; #define MAX_NODES_SHIFT 1 #define pfn_to_nid(pfn) (0) -#else /* CONFIG_DISCONTIGMEM */ +#else /* CONFIG_NEED_MULTIPLE_NODES */ #include -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* -- cgit v1.2.3 From b159d43fbf7eaaac6ecc647f51cf4257332db47b Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:52 -0700 Subject: [PATCH] generify early_pfn_to_nid Provide a default implementation for early_pfn_to_nid returning node 0. Allow architectures to override this with their own implementation out of asm/mmzone.h. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 95f4a780ea66..6ef07de98d69 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -435,6 +435,10 @@ extern struct pglist_data contig_page_data; #endif +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +#define early_pfn_to_nid(nid) (0UL) +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ -- cgit v1.2.3 From d41dee369bff3b9dcb6328d4d822926c28cc2594 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:54 -0700 Subject: [PATCH] sparsemem memory model Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of mem_map[] is needed by discontiguous memory machines (like in the old CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually become a complete replacement. A significant advantage over DISCONTIGMEM is that it's completely separated from CONFIG_NUMA. When producing this patch, it became apparent that NUMA and DISCONTIG are often confused. Another advantage is that sparse doesn't require each NUMA node's ranges to be contiguous. It can handle overlapping ranges between nodes with no problems, where DISCONTIGMEM currently throws away that memory. Sparsemem uses an array to provide different pfn_to_page() translations for each SECTION_SIZE area of physical memory. 
This is what allows the mem_map[] to be chopped up. In order to do quick pfn_to_page() operations, the section number of the page is encoded in page->flags. Part of the sparsemem infrastructure enables sharing of these bits more dynamically (at compile-time) between the page_zone() and sparsemem operations. However, on 32-bit architectures, the number of bits is quite limited, and may require growing the size of the page->flags type in certain conditions. Several things might force this to occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of memory), an increase in the physical address space, or an increase in the number of used page->flags. One thing to note is that, once sparsemem is present, the NUMA node information no longer needs to be stored in the page->flags. It might provide speed increases on certain platforms and will be stored there if there is room. But, if out of room, an alternate (theoretically slower) mechanism is used. This patch introduces CONFIG_FLATMEM. It is used in almost all cases where there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM often have to compile out the same areas of code. Signed-off-by: Andy Whitcroft Signed-off-by: Dave Hansen Signed-off-by: Martin Bligh Signed-off-by: Adrian Bunk Signed-off-by: Yasunori Goto Signed-off-by: Bob Picco Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 92 ++++++++++++++++++++++++++++++++++++++--------- include/linux/mmzone.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/numa.h | 2 +- 3 files changed, 172 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 57b2ead51dba..6eb7f48317f8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -397,40 +397,80 @@ static inline void put_page(struct page *page) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | NODE | ZONE | ... | FLAGS | */ -#define NODES_PGOFF ((sizeof(page_flags_t)*8) - NODES_SHIFT) -#define ZONES_PGOFF (NODES_PGOFF - ZONES_SHIFT) + +/* + * page->flags layout: + * + * There are three possibilities for how page->flags get + * laid out. The first is for the normal case, without + * sparsemem. The second is for sparsemem when there is + * plenty of space for node and section. The last is when + * we have run out of space and have to fall back to an + * alternate (slower) way of determining the node. + * + * No sparsemem: | NODE | ZONE | ... | FLAGS | + * with space for node: | SECTION | NODE | ZONE | ... | FLAGS | + * no space for node: | SECTION | ZONE | ... | FLAGS | + */ +#ifdef CONFIG_SPARSEMEM +#define SECTIONS_WIDTH SECTIONS_SHIFT +#else +#define SECTIONS_WIDTH 0 +#endif + +#define ZONES_WIDTH ZONES_SHIFT + +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED +#define NODES_WIDTH NODES_SHIFT +#else +#define NODES_WIDTH 0 +#endif + +/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */ +#define SECTIONS_PGOFF ((sizeof(page_flags_t)*8) - SECTIONS_WIDTH) +#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) +#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) + +/* + * We are going to use the flags for the page to node mapping if its in + * there. This includes the case where there is no node, so it is implicit. + */ +#define FLAGS_HAS_NODE (NODES_WIDTH > 0 || NODES_SHIFT == 0) + +#ifndef PFN_SECTION_SHIFT +#define PFN_SECTION_SHIFT 0 +#endif /* * Define the bit shifts to access each section. 
For non-existant * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ -#define NODES_PGSHIFT (NODES_PGOFF * (NODES_SHIFT != 0)) -#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_SHIFT != 0)) +#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) +#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) +#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -/* NODE:ZONE is used to lookup the zone from a page. */ +/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */ +#if FLAGS_HAS_NODE #define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#else +#define ZONETABLE_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) +#endif #define ZONETABLE_PGSHIFT ZONES_PGSHIFT -#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED -#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED +#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED +#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED #endif -#define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) - -#define ZONES_MASK ((1UL << ZONES_SHIFT) - 1) -#define NODES_MASK ((1UL << NODES_SHIFT) - 1) +#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) +#define NODES_MASK ((1UL << NODES_WIDTH) - 1) +#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) static inline unsigned long page_zonenum(struct page *page) { return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } -static inline unsigned long page_to_nid(struct page *page) -{ - return (page->flags >> NODES_PGSHIFT) & NODES_MASK; -} struct zone; extern struct zone *zone_table[]; @@ -441,6 +481,18 @@ static inline struct zone *page_zone(struct page *page) ZONETABLE_MASK]; } +static inline unsigned long page_to_nid(struct page *page) +{ + if (FLAGS_HAS_NODE) + return (page->flags >> NODES_PGSHIFT) & NODES_MASK; + else + return page_zone(page)->zone_pgdat->node_id; +} +static inline unsigned long page_to_section(struct page *page) +{ + return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; +} + static inline void set_page_zone(struct page *page, unsigned long zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); @@ -451,12 +503,18 @@ static inline void set_page_node(struct page *page, unsigned long node) page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } +static inline void set_page_section(struct page *page, unsigned long section) +{ + page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); + page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; +} static inline void set_page_links(struct page *page, unsigned long zone, - unsigned long node) + unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); + set_page_section(page, pfn_to_section_nr(pfn)); } #ifndef CONFIG_DISCONTIGMEM diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6ef07de98d69..19860d317ec2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -269,7 +269,9 @@ typedef struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; struct zonelist node_zonelists[GFP_ZONETYPES]; int nr_zones; +#ifdef CONFIG_FLAT_NODE_MEM_MAP struct page *node_mem_map; +#endif struct bootmem_data *bdata; unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ @@ -284,7 +286,11 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) +#ifdef 
CONFIG_FLAT_NODE_MEM_MAP #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) +#else +#define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) +#endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) extern struct pglist_data *pgdat_list; @@ -416,6 +422,10 @@ extern struct pglist_data contig_page_data; #endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#ifdef CONFIG_SPARSEMEM +#include +#endif + #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) /* * with 32 bit page->flags field, we reserve 8 bits for node/zone info. @@ -439,6 +449,92 @@ extern struct pglist_data contig_page_data; #define early_pfn_to_nid(nid) (0UL) #endif +#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) +#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) + +#ifdef CONFIG_SPARSEMEM + +/* + * SECTION_SHIFT #bits space required to store a section # + * + * PA_SECTION_SHIFT physical address to/from section number + * PFN_SECTION_SHIFT pfn to/from section number + */ +#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) + +#define PA_SECTION_SHIFT (SECTION_SIZE_BITS) +#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) + +#define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) + +#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) +#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) + +#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS +#error Allocator MAX_ORDER exceeds SECTION_SIZE +#endif + +struct page; +struct mem_section { + struct page *section_mem_map; +}; + +extern struct mem_section mem_section[NR_MEM_SECTIONS]; + +/* + * Given a kernel address, find the home node of the underlying memory. + */ +#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) + +static inline struct mem_section *__pfn_to_section(unsigned long pfn) +{ + return &mem_section[pfn_to_section_nr(pfn)]; +} + +#define pfn_to_page(pfn) \ +({ \ + unsigned long __pfn = (pfn); \ + __pfn_to_section(__pfn)->section_mem_map + __pfn; \ +}) +#define page_to_pfn(page) \ +({ \ + page - mem_section[page_to_section(page)].section_mem_map; \ +}) + +static inline int pfn_valid(unsigned long pfn) +{ + if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) + return 0; + return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0; +} + +/* + * These are _only_ used during initialisation, therefore they + * can use __initdata ... They could have names to indicate + * this restriction. 
+ */ #ifdef CONFIG_NUMA +#define pfn_to_nid early_pfn_to_nid +#endif + +#define pfn_to_pgdat(pfn) \ +({ \ + NODE_DATA(pfn_to_nid(pfn)); \ +}) + +#define early_pfn_valid(pfn) pfn_valid(pfn) +void sparse_init(void); +#else +#define sparse_init() do {} while (0) +#endif /* CONFIG_SPARSEMEM */ + +#ifndef early_pfn_valid +#define early_pfn_valid(pfn) (1) +#endif + +void memory_present(int nid, unsigned long start, unsigned long end); +unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/include/linux/numa.h b/include/linux/numa.h index bd0c8c4e9a95..f0c539bd3cfc 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -3,7 +3,7 @@ #include -#ifdef CONFIG_DISCONTIGMEM +#ifndef CONFIG_FLATMEM #include #endif -- cgit v1.2.3 From 641c767389b19859a45e6de46d8e18cd935bdb60 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:07:59 -0700 Subject: [PATCH] sparsemem swiss cheese numa layouts The part of the sparsemem patch which modifies memmap_init_zone() has recently become a problem. It changes behavior so that there is a call to pfn_to_page() for each individual page inside of a node's range: node_start_pfn through node_end_pfn. It used to simply do this once, at the beginning of the node, but having sparsemem's non-contiguous mem_map[]s inside of a node made it necessary to change. Mike Kravetz recently wrote a patch which made the NUMA code accept some new kinds of layouts. The system's memory was laid out like this, with node 0's memory in two pieces: one before and one after node 1's memory: Node 0: +++++ +++++ Node 1: +++++ Previous behavior before Mike's patch was to assign nodes like this: Node 0: 00000 XXXXX Node 1: 11111 Where the 'X' areas were simply thrown away. The new behavior was to make the pg_data_t span node 0 across all of its areas, including areas that are really node 1's: Node 0: 000000000000000 Node 1: 11111 This wastes a little bit of mem_map space, but ends up being OK, and more fully utilizes the system's memory. memmap_init_zone() initializes all of the "struct page"s for node 0, even for the "hole", but those never get used, because there is no pfn_to_page() that resolves to those pages. However, only calling pfn_to_page() once, memmap_init_zone() always uses the pages that were allocated for node0->node_mem_map because: struct page *start = pfn_to_page(start_pfn); // effectively start = &node->node_mem_map[0] for (page = start; page < (start + size); page++) { init_page_here();... page++; } Slow, and wasteful, but generally harmless. But, modify that to call pfn_to_page() for each loop iteration (like sparsemem does): for (pfn = start_pfn; pfn < (start_pfn + size); pfn++) { page = pfn_to_page(pfn); } And you end up trying to initialize node 1's pages too early, along with bogus data from node 0. This patch checks for those weird layouts and declines to touch the pages, making the more frequent pfn_to_page() calls OK to do. 
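The check itself lives in mm/page_alloc.c, outside the include/linux scope of this log; reconstructed from the description above and the early_pfn_in_nid()/early_pfn_valid() macros in the hunks here, the guarded loop presumably looks like this sketch:

	/*
	 * Sketch of the guarded per-pfn init loop (assumed shape of the
	 * mm/page_alloc.c change, not quoted from it). On configurations
	 * where nodes cannot span one another, both guards compile to
	 * constant 1 and the loop degenerates to the plain version above.
	 */
	for (pfn = start_pfn; pfn < start_pfn + size; pfn++) {
		if (!early_pfn_valid(pfn))
			continue;	/* hole: no mem_map behind this pfn */
		if (!early_pfn_in_nid(pfn, nid))
			continue;	/* pfn belongs to another node's range */
		page = pfn_to_page(pfn);
		/* ... initialise the struct page as before ... */
	}
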
Signed-off-by: Dave Hansen Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 19860d317ec2..746b57e3d370 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -528,6 +528,12 @@ void sparse_init(void); #define sparse_init() do {} while (0) #endif /* CONFIG_SPARSEMEM */ +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +#define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid)) +#else +#define early_pfn_in_nid(pfn, nid) (1) +#endif + #ifndef early_pfn_valid #define early_pfn_valid(pfn) (1) #endif -- cgit v1.2.3 From 29751f6991e845f7d002a6ae520bf996b38c8dcd Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Thu, 23 Jun 2005 00:08:00 -0700 Subject: [PATCH] sparsemem hotplug base Make sparse's initialization be accessible at runtime. This allows sparse mappings to be created after boot in a hotplug situation. This patch is separated from the previous one just to give an indication of how much of the sparse infrastructure is *just* for hotplug memory. The section_mem_map doesn't really store a pointer. It stores something that is convenient to do some math against to get a pointer. It isn't valid to just do *section_mem_map, so I don't think it should be stored as a pointer. There are a couple of things I'd like to store about a section. First of all, the fact that it is !NULL does not mean that it is present. There could be such a combination where section_mem_map *is* NULL, but the math gets you properly to a real mem_map. So, I don't think that check is safe. Since we're storing 32-bit-aligned structures, we have a few bits in the bottom of the pointer to play with. Use one bit to encode whether there's really a mem_map there, and the other one to tell whether there's a valid section there. We need to distinguish between the two because sometimes there's a gap between when a section is discovered to be present and when we can get the mem_map for it. Signed-off-by: Dave Hansen Signed-off-by: Andy Whitcroft Signed-off-by: Jack Steiner Signed-off-by: Bob Picco Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 56 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 746b57e3d370..6c90461ed99f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -476,11 +476,56 @@ extern struct pglist_data contig_page_data; struct page; struct mem_section { - struct page *section_mem_map; + /* + * This is, logically, a pointer to an array of struct + * pages. However, it is stored with some other magic. + * (see sparse.c::sparse_init_one_section()) + * + * Making it a UL at least makes someone do a cast + * before using it wrong. + */ + unsigned long section_mem_map; }; extern struct mem_section mem_section[NR_MEM_SECTIONS]; +static inline struct mem_section *__nr_to_section(unsigned long nr) +{ + return &mem_section[nr]; +} + +/* + * We use the lower bits of the mem_map pointer to store + * a little bit of information. There should be at least + * 3 bits here due to 32-bit alignment. 
+ */ +#define SECTION_MARKED_PRESENT (1UL<<0) +#define SECTION_HAS_MEM_MAP (1UL<<1) +#define SECTION_MAP_LAST_BIT (1UL<<2) +#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) + +static inline struct page *__section_mem_map_addr(struct mem_section *section) +{ + unsigned long map = section->section_mem_map; + map &= SECTION_MAP_MASK; + return (struct page *)map; +} + +static inline int valid_section(struct mem_section *section) +{ + return (section->section_mem_map & SECTION_MARKED_PRESENT); +} + +static inline int section_has_mem_map(struct mem_section *section) +{ + return (section->section_mem_map & SECTION_HAS_MEM_MAP); +} + +static inline int valid_section_nr(unsigned long nr) +{ + return valid_section(__nr_to_section(nr)); +} + /* * Given a kernel address, find the home node of the underlying memory. */ @@ -488,24 +533,25 @@ extern struct mem_section mem_section[NR_MEM_SECTIONS]; static inline struct mem_section *__pfn_to_section(unsigned long pfn) { - return &mem_section[pfn_to_section_nr(pfn)]; + return __nr_to_section(pfn_to_section_nr(pfn)); } #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = (pfn); \ - __pfn_to_section(__pfn)->section_mem_map + __pfn; \ + __section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn; \ }) #define page_to_pfn(page) \ ({ \ - page - mem_section[page_to_section(page)].section_mem_map; \ + page - __section_mem_map_addr(__nr_to_section( \ + page_to_section(page))); \ }) static inline int pfn_valid(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0; + return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); } /* -- cgit v1.2.3 From 1946089a109251655c5438d92c539bd2930e71ea Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:19 -0700 Subject: [PATCH] NUMA aware block device control structure allocation Patch to allocate the control structures for IDE devices on the node of the device itself (for NUMA systems). The patch depends on the Slab API change patch by Manfred and me (in mm) and the pcidev_to_node patch that I posted today. Does some realignment too. Signed-off-by: Justin M. 
Forbes Signed-off-by: Christoph Lameter Signed-off-by: Pravin Shelar Signed-off-by: Shobhit Dayal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 6 +++++- include/linux/genhd.h | 1 + include/linux/ide.h | 2 +- include/linux/mempool.h | 11 ++++++++--- 4 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4a99b76c5a33..235c3414d268 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -396,6 +396,7 @@ struct request_queue */ unsigned int sg_timeout; unsigned int sg_reserved_size; + int node; struct list_head drain_list; @@ -615,6 +616,8 @@ static inline void blkdev_dequeue_request(struct request *req) /* * Access functions for manipulating queue properties */ +extern request_queue_t *blk_init_queue_node(request_fn_proc *rfn, + spinlock_t *lock, int node_id); extern request_queue_t *blk_init_queue(request_fn_proc *, spinlock_t *); extern void blk_cleanup_queue(request_queue_t *); extern void blk_queue_make_request(request_queue_t *, make_request_fn *); @@ -646,7 +649,8 @@ extern void blk_wait_queue_drained(request_queue_t *, int); extern void blk_finish_queue_drain(request_queue_t *); int blk_get_queue(request_queue_t *); -request_queue_t *blk_alloc_queue(int); +request_queue_t *blk_alloc_queue(int gfp_mask); +request_queue_t *blk_alloc_queue_node(int,int); #define blk_put_queue(q) blk_cleanup_queue((q)) /* diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 47dedaf971d6..af26dc718ef6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -403,6 +403,7 @@ extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); extern void add_partition(struct gendisk *, int, sector_t, sector_t); extern void delete_partition(struct gendisk *, int); +extern struct gendisk *alloc_disk_node(int minors, int node_id); extern struct gendisk *alloc_disk(int minors); extern struct kobject *get_disk(struct gendisk *disk); extern void put_disk(struct gendisk *disk); diff --git a/include/linux/ide.h b/include/linux/ide.h index 336d6e509f59..92129078d4f3 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -917,7 +917,7 @@ typedef struct hwif_s { unsigned dma; void (*led_act)(void *data, int rw); -} ide_hwif_t; +} ____cacheline_maxaligned_in_smp ide_hwif_t; /* * internal ide interrupt handler type diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 4a36edf1c974..796220ce47cc 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -20,9 +20,14 @@ typedef struct mempool_s { mempool_free_t *free; wait_queue_head_t wait; } mempool_t; -extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); -extern int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask); + +extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, int nid); + +extern int mempool_resize(mempool_t *pool, int new_min_nr, + unsigned int __nocast gfp_mask); extern void mempool_destroy(mempool_t *pool); extern void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask); extern void mempool_free(void *element, mempool_t *pool); -- cgit v1.2.3 From fa72b903f75e4f0f0b2c2feed093005167da4023 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Jun 2005 00:08:49 
-0700 Subject: [PATCH] blk: remove blk_queue_tag->real_max_depth optimization blk_queue_tag->real_max_depth was used to optimize out unnecessary allocations/frees on tag resize. However, the whole thing was very broken: tag_map was never allocated to real_max_depth, resulting in access beyond the end of the map, and bits in [max_depth..real_max_depth] were set when initializing a map and copied when resizing, resulting in pre-occupied tags. As the gain of the optimization is very small, well, almost nil, remove the whole thing. Signed-off-by: Tejun Heo Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 235c3414d268..8d7e2f4151d0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -294,7 +294,6 @@ struct blk_queue_tag { struct list_head busy_list; /* fifo list of busy tags */ int busy; /* current depth */ int max_depth; /* what we will send to device */ - int real_max_depth; /* what the array can hold */ atomic_t refcnt; /* map can be shared */ }; -- cgit v1.2.3 From f7d37d028dfba90b1b747f8ac685bf0959aeda8b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Jun 2005 00:08:50 -0700 Subject: [PATCH] blk: remove BLK_TAGS_{PER_LONG|MASK} Replace BLK_TAGS_PER_LONG with BITS_PER_LONG and remove unused BLK_TAGS_MASK. Signed-off-by: Tejun Heo Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8d7e2f4151d0..60272141ff19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -285,9 +285,6 @@ enum blk_queue_state { Queue_up, }; -#define BLK_TAGS_PER_LONG (sizeof(unsigned long) * 8) -#define BLK_TAGS_MASK (BLK_TAGS_PER_LONG - 1) - struct blk_queue_tag { struct request **tag_index; /* map of busy tags */ unsigned long *tag_map; /* bit map of free/busy tags */ -- cgit v1.2.3 From 55c888d6d09a0df236adfaf8ccf06ff5d0646775 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jun 2005 00:08:56 -0700 Subject: [PATCH] timers fixes/improvements This patch tries to solve the following problems: 1. del_timer_sync() is racy. The timer can be fired again after del_timer_sync() has checked all CPUs and before it rechecks timer_pending(). 2. It has scalability problems. All CPUs are scanned to determine if the timer is running on that cpu. With this patch del_timer_sync is O(1) and no slower than plain del_timer(pending_timer), unless it has to actually wait for completion of the currently running timer. The only restriction is that the recurring timer should not use add_timer_on(). 3. Timers are not serialized with respect to themselves. If CPU_0 does mod_timer(jiffies+1) while the timer is currently running on CPU 1, it is quite possible that a local interrupt on CPU_0 will start that timer before it has finished on CPU_1. 4. The timer locking is suboptimal. __mod_timer() takes 3 locks at once and still requires wmb() in del_timer/run_timers. The new implementation takes 2 locks sequentially and does not need memory barriers. Currently ->base != NULL means that the timer is pending. In that case ->base.lock is used to lock the timer. __mod_timer() also takes timer->lock because ->base can be == NULL. This patch uses timer->entry.next != NULL as the indication that the timer is pending. 
So it does __list_del(), entry->next = NULL instead of list_del() when the timer is deleted. The ->base field is used for hashed locking only; it is initialized in init_timer() which sets ->base = per_cpu(tvec_bases). When the tvec_bases.lock is locked, it means that all timers which are tied to this base via timer->base are locked, and the base itself is locked too. So __run_timers/migrate_timers can safely modify all timers which could be found on ->tvX lists (pending timers). When the timer's base is locked, and the timer removed from ->entry list (which means that __run_timers/migrate_timers can't see this timer), it is possible to set timer->base = NULL and drop the lock: the timer remains locked. This patch adds a lock_timer_base() helper, which waits for ->base != NULL, locks the ->base, and checks it is still the same. __mod_timer() schedules the timer on the local CPU and changes its base. However, it does not lock both old and new bases at once. It locks the timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and unlocks the old base. Then __mod_timer() locks new_base, sets ->base = new_base, and adds this timer. This simplifies the code, because an AB-BA deadlock is not possible. __mod_timer() also ensures that the timer's base is not changed while the timer's handler is running on the old base. __run_timers() and del_timer() do not change ->base anymore; they only clear the pending flag. So del_timer_sync() can test timer->base->running_timer == timer to detect whether it is running or not. We don't need timer_list->lock anymore; this patch kills it. We also don't need barriers. del_timer() and __run_timers() used smp_wmb() before clearing timer's pending flag. It was needed because __mod_timer() did not lock old_base if the timer was not pending, so __mod_timer()->list_add() could race with del_timer()->list_del(). With this patch these functions are serialized through base->lock. One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch adds a global struct timer_base_s { spinlock_t lock; struct timer_list *running_timer; } __init_timer_base; which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s struct are replaced by struct timer_base_s t_base. It is indeed ugly. But this can't have scalability problems. The global __init_timer_base.lock is used only when __mod_timer() is called for the first time AND the timer was compile-time initialized. After that the timer migrates to the local CPU. 
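A sketch of that helper, reconstructed from the description above rather than quoted from kernel/timer.c: spin until ->base is non-NULL and still the same after taking its lock, since __mod_timer() briefly sets ->base = NULL while migrating a timer.

	/*
	 * Sketch of lock_timer_base() per the changelog (assumed shape;
	 * the real helper lives in kernel/timer.c and may differ).
	 */
	static struct timer_base_s *lock_timer_base(struct timer_list *timer,
						    unsigned long *flags)
	{
		struct timer_base_s *base;

		for (;;) {
			base = timer->base;
			if (base != NULL) {
				spin_lock_irqsave(&base->lock, *flags);
				if (base == timer->base)
					return base;	/* locked and stable */
				/* base changed under us; unlock and retry */
				spin_unlock_irqrestore(&base->lock, *flags);
			}
			cpu_relax();
		}
	}
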
Signed-off-by: Oleg Nesterov Acked-by: Ingo Molnar Signed-off-by: Renaud Lienhart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timer.h | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 90db1cc62ddd..2e78fedfc069 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -6,45 +6,33 @@ #include #include -struct tvec_t_base_s; +struct timer_base_s; struct timer_list { struct list_head entry; unsigned long expires; - spinlock_t lock; unsigned long magic; void (*function)(unsigned long); unsigned long data; - struct tvec_t_base_s *base; + struct timer_base_s *base; }; #define TIMER_MAGIC 0x4b87ad6e +extern struct timer_base_s __init_timer_base; + #define TIMER_INITIALIZER(_function, _expires, _data) { \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ - .base = NULL, \ + .base = &__init_timer_base, \ .magic = TIMER_MAGIC, \ - .lock = SPIN_LOCK_UNLOCKED, \ } -/*** - * init_timer - initialize a timer. - * @timer: the timer to be initialized - * - * init_timer() must be done to a timer prior calling *any* of the - * other timer functions. - */ -static inline void init_timer(struct timer_list * timer) -{ - timer->base = NULL; - timer->magic = TIMER_MAGIC; - spin_lock_init(&timer->lock); -} +void fastcall init_timer(struct timer_list * timer); /*** * timer_pending - is a timer pending? @@ -58,7 +46,7 @@ static inline void init_timer(struct timer_list * timer) */ static inline int timer_pending(const struct timer_list * timer) { - return timer->base != NULL; + return timer->entry.next != NULL; } extern void add_timer_on(struct timer_list *timer, int cpu); @@ -89,12 +77,12 @@ static inline void add_timer(struct timer_list * timer) #ifdef CONFIG_SMP extern int del_timer_sync(struct timer_list *timer); - extern int del_singleshot_timer_sync(struct timer_list *timer); #else # define del_timer_sync(t) del_timer(t) -# define del_singleshot_timer_sync(t) del_timer(t) #endif +#define del_singleshot_timer_sync(t) del_timer_sync(t) + extern void init_timers(void); extern void run_local_timers(void); extern void it_real_fn(unsigned long); -- cgit v1.2.3 From fd450b7318b75343fd76b3d95416853e34e72c95 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 23 Jun 2005 00:08:59 -0700 Subject: [PATCH] timers: introduce try_to_del_timer_sync() This patch splits del_timer_sync() into 2 functions. The new one, try_to_del_timer_sync(), returns -1 when it hits an executing timer. It can be used in interrupt context, or when the caller holds locks which can prevent completion of the timer's handler. NOTE. Currently it can't be used in interrupt context in the UP case, because ->running_timer is used only with CONFIG_SMP. Should the need arise, it is possible to kill #ifdef CONFIG_SMP in set_running_timer(); it is cheap. 
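A hypothetical use of the new primitive (struct my_obj and its lock are illustrative, not from the patch): a caller holding a lock that the timer handler also takes cannot call del_timer_sync() without risking deadlock, but it can poll try_to_del_timer_sync() and back off while the handler runs.

	/*
	 * Hypothetical caller: obj->lock is also taken by the timer
	 * handler (softirq context), so we take it with _bh and retry
	 * with the lock dropped whenever the handler is running (-1).
	 */
	static void stop_my_timer(struct my_obj *obj)
	{
		for (;;) {
			spin_lock_bh(&obj->lock);
			if (try_to_del_timer_sync(&obj->timer) >= 0) {
				spin_unlock_bh(&obj->lock);
				break;
			}
			spin_unlock_bh(&obj->lock);
			cpu_relax();	/* let the running handler finish */
		}
	}
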
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timer.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 2e78fedfc069..221f81ac2002 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -76,9 +76,11 @@ static inline void add_timer(struct timer_list * timer) } #ifdef CONFIG_SMP + extern int try_to_del_timer_sync(struct timer_list *timer); extern int del_timer_sync(struct timer_list *timer); #else -# define del_timer_sync(t) del_timer(t) +# define try_to_del_timer_sync(t) del_timer(t) +# define del_timer_sync(t) del_timer(t) #endif #define del_singleshot_timer_sync(t) del_timer_sync(t) -- cgit v1.2.3 From 991114c6fa6a21d1fa4d544abe78592352860c82 Mon Sep 17 00:00:00 2001 From: Alexander Viro Date: Thu, 23 Jun 2005 00:09:01 -0700 Subject: [PATCH] fix for prune_icache()/forced final iput() races Based on analysis and a patch from Russ Weight. There is a race condition that can occur if an inode is allocated and then released (using iput) during the ->fill_super functions. The race condition is between kswapd and mount. For most filesystems this can only happen in an error path when kswapd is running concurrently. For isofs, however, the error can occur in a more common code path (which is how the bug was found). The logic here is "we want final iput() to free inode *now* instead of letting it sit in cache if fs is going down or had not quite come up". The problem is with kswapd seeing such inodes in the middle of being killed and happily taking over. The clean solution would be to tell kswapd to leave those inodes alone and let our final iput deal with them. I.e. add a new flag (I_FORCED_FREEING, which lands in the diff below as I_WILL_FREE), set it before write_inode_now() there and make prune_icache() leave those alone. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e5a8db00df29..3622e952e98c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1025,6 +1025,7 @@ struct super_operations { #define I_FREEING 16 #define I_CLEAR 32 #define I_NEW 64 +#define I_WILL_FREE 128 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) -- cgit v1.2.3 From 543537bd922692bc978e2e356fcd8bfc9c2ee7d5 Mon Sep 17 00:00:00 2001 From: Paulo Marques Date: Thu, 23 Jun 2005 00:09:02 -0700 Subject: [PATCH] create a kstrdup library function This patch creates a new kstrdup library function and changes the "local" implementations in several places to use this function. Most of the changes come from the sound and net subsystems. The sound part had already been acknowledged by Takashi Iwai and the net part by David S. Miller. I left UML alone for now because I would need more time to read the code carefully before making changes there. 
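A minimal sketch of what such a helper looks like, consistent with the prototype added to string.h below (the actual body lives outside include/linux, so details may differ): duplicate a NUL-terminated string into freshly kmalloc'd storage.

	/*
	 * Sketch of kstrdup() matching the declared prototype; returns
	 * NULL for a NULL input or on allocation failure. The copy is
	 * freed with kfree() when no longer needed.
	 */
	char *kstrdup(const char *s, int gfp)
	{
		size_t len;
		char *buf;

		if (!s)
			return NULL;

		len = strlen(s) + 1;
		buf = kmalloc(len, gfp);
		if (buf)
			memcpy(buf, s, len);
		return buf;
	}
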
Signed-off-by: Paulo Marques Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/netdevice.h | 4 ---- include/linux/string.h | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d6afd440cf7b..d89816ad642f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -925,10 +925,6 @@ extern int skb_checksum_help(struct sk_buff *skb, int inward); extern void net_enable_timestamp(void); extern void net_disable_timestamp(void); -#ifdef CONFIG_SYSCTL -extern char *net_sysctl_strdup(const char *s); -#endif - #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ diff --git a/include/linux/string.h b/include/linux/string.h index b9fc59469956..93994c613095 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -88,6 +88,8 @@ extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); #endif +extern char *kstrdup(const char *s, int gfp); + #ifdef __cplusplus } #endif -- cgit v1.2.3 From 35a82d1a53e1a9ad54efafcc940f9335beaed5c3 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 23 Jun 2005 00:09:06 -0700 Subject: [PATCH] optimise loop driver a bit Looks like locking can be optimised quite a lot. Increase lock widths slightly so lo_lock is taken fewer times per request. Also it was quite trivial to cover lo_pending with that lock, and remove the atomic requirement. This also makes memory ordering explicitly correct, which is nice (not that I particularly saw any mem ordering bugs). Test was reading 4 250MB files in parallel on ext2-on-tmpfs filesystem (1K block size, 4K page size). System is 2 socket Xeon with HT (4 thread). intel:/home/npiggin# umount /dev/loop0 ; mount /dev/loop0 /mnt/loop ; /usr/bin/time ./mtloop.sh Before: 0.24user 5.51system 0:02.84elapsed 202%CPU (0avgtext+0avgdata 0maxresident)k 0.19user 5.52system 0:02.88elapsed 198%CPU (0avgtext+0avgdata 0maxresident)k 0.19user 5.57system 0:02.89elapsed 198%CPU (0avgtext+0avgdata 0maxresident)k 0.22user 5.51system 0:02.90elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k 0.19user 5.44system 0:02.91elapsed 193%CPU (0avgtext+0avgdata 0maxresident)k After: 0.07user 2.34system 0:01.68elapsed 143%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.37system 0:01.68elapsed 144%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.39system 0:01.68elapsed 145%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.36system 0:01.68elapsed 144%CPU (0avgtext+0avgdata 0maxresident)k 0.06user 2.42system 0:01.68elapsed 147%CPU (0avgtext+0avgdata 0maxresident)k Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/loop.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/loop.h b/include/linux/loop.h index 8220d9c9da00..53fa51595443 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -61,7 +61,7 @@ struct loop_device { struct semaphore lo_sem; struct semaphore lo_ctl_mutex; struct semaphore lo_bh_mutex; - atomic_t lo_pending; + int lo_pending; request_queue_t *lo_queue; }; -- cgit v1.2.3 From ac20427ef6aa63da663bdc88b71d16f7394f5e23 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 23 Jun 2005 00:09:11 -0700 Subject: [PATCH] add check to /proc/devices read routines Patch to add check to get_chrdev_list and get_blkdev_list to prevent reads of /proc/devices from spilling over the provided page if more than 4096 bytes of string data are generated 
from all the registered character and block devices in a system. Signed-off-by: Neil Horman Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index af26dc718ef6..01796c41c951 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -224,7 +224,7 @@ static inline void free_disk_stats(struct gendisk *disk) extern void disk_round_stats(struct gendisk *disk); /* drivers/block/genhd.c */ -extern int get_blkdev_list(char *); +extern int get_blkdev_list(char *, int); extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); extern void unlink_gendisk(struct gendisk *gp); -- cgit v1.2.3 From 84de856ed30c568c2bb7b9ac0679772bd2737d9b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:09:16 -0700 Subject: [PATCH] quota: consolidate code surrounding vfs_quota_on_mount Move some code duplicated in both callers into vfs_quota_on_mount. Signed-off-by: Christoph Hellwig Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/quotaops.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index e57baa85e744..d211507ab246 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -39,7 +39,8 @@ extern int dquot_commit_info(struct super_block *sb, int type); extern int dquot_mark_dquot_dirty(struct dquot *dquot); extern int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path); -extern int vfs_quota_on_mount(int type, int format_id, struct dentry *dentry); +extern int vfs_quota_on_mount(struct super_block *sb, char *qf_name, + int format_id, int type); extern int vfs_quota_off(struct super_block *sb, int type); #define vfs_quota_off_mount(sb, type) vfs_quota_off(sb, type) extern int vfs_quota_sync(struct super_block *sb, int type); -- cgit v1.2.3 From b94cce926b2b902b79380ccba370d6f9f2980de0 Mon Sep 17 00:00:00 2001 From: Hien Nguyen Date: Thu, 23 Jun 2005 00:09:19 -0700 Subject: [PATCH] kprobes: function-return probes This patch adds function-return probes to kprobes for the i386 architecture. This enables you to establish a handler to be run when a function returns. 1. API Two new functions are added to kprobes: int register_kretprobe(struct kretprobe *rp); void unregister_kretprobe(struct kretprobe *rp); 2. Registration and unregistration 2.1 Register To register a function-return probe, the user populates the following fields in a kretprobe object and calls register_kretprobe() with the kretprobe address as an argument: kp.addr - the function's address; handler - this function is run after the ret instruction executes, but before control returns to the return address in the caller. maxactive - The maximum number of instances of the probed function that can be active concurrently. For example, if the function is non-recursive and is called with a spinlock or mutex held, maxactive = 1 should be enough. If the function is non-recursive and can never relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should be enough. maxactive is used to determine how many kretprobe_instance objects to allocate for this particular probed function.
If maxactive <= 0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 * NR_CPUS) else maxactive=NR_CPUS) For example: struct kretprobe rp; rp.kp.addr = /* entrypoint address */ rp.handler = /*return probe handler */ rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */ register_kretprobe(&rp); The following field may also be of interest: nmissed - Initialized to zero when the function-return probe is registered, and incremented every time the probed function is entered but there is no kretprobe_instance object available for establishing the function-return probe (i.e., because maxactive was set too low). 2.2 Unregister To unregister a function-return probe, the user calls unregister_kretprobe() with the same kretprobe object as registered previously. If a probed function is running when the return probe is unregistered, the function will return as expected, but the handler won't be run. 3. Limitations 3.1 This patch supports only the i386 architecture, but patches for x86_64 and ppc64 are anticipated soon. 3.2 Return probes operate by replacing the return address in the stack (or in a known register, such as the lr register for ppc). This may cause __builtin_return_address(0), when invoked from the return-probed function, to return the address of the return-probes trampoline. 3.3 This implementation uses the "Multiprobes at an address" feature in 2.6.12-rc3-mm3. 3.4 Due to a limitation in multi-probes, you cannot currently establish a return probe and a jprobe on the same function. A patch to remove this limitation is being tested. This feature is required by SystemTap (http://sourceware.org/systemtap), and reflects ideas contributed by several SystemTap developers, including Will Cohen and Ananth Mavinakayanahalli. Signed-off-by: Hien Nguyen Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Frederik Deweerdt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 90 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 99ddba5a4e00..fba39f87efec 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -25,21 +25,31 @@ * Rusty Russell). * 2004-July Suparna Bhattacharya added jumper probes * interface to access function arguments. + * 2005-May Hien Nguyen and Jim Keniston + * and Prasanna S Panchamukhi + * added function-return probes.
*/ #include #include #include #include +#include + #include struct kprobe; struct pt_regs; +struct kretprobe; +struct kretprobe_instance; typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *); typedef int (*kprobe_break_handler_t) (struct kprobe *, struct pt_regs *); typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *, unsigned long flags); typedef int (*kprobe_fault_handler_t) (struct kprobe *, struct pt_regs *, int trapnr); +typedef int (*kretprobe_handler_t) (struct kretprobe_instance *, + struct pt_regs *); + struct kprobe { struct hlist_node hlist; @@ -85,6 +95,62 @@ struct jprobe { kprobe_opcode_t *entry; /* probe handling code to jump to */ }; +#ifdef ARCH_SUPPORTS_KRETPROBES +extern int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs); +extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags); +extern struct task_struct *arch_get_kprobe_task(void *ptr); +extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs); +extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock); +#else /* ARCH_SUPPORTS_KRETPROBES */ +static inline void kretprobe_trampoline(void) +{ +} +static inline int trampoline_probe_handler(struct kprobe *p, + struct pt_regs *regs) +{ + return 0; +} +static inline void trampoline_post_handler(struct kprobe *p, + struct pt_regs *regs, unsigned long flags) +{ +} +static inline void arch_prepare_kretprobe(struct kretprobe *rp, + struct pt_regs *regs) +{ +} +static inline void arch_kprobe_flush_task(struct task_struct *tk) +{ +} +#define arch_get_kprobe_task(ptr) ((struct task_struct *)NULL) +#endif /* ARCH_SUPPORTS_KRETPROBES */ +/* + * Function-return probe - + * Note: + * User needs to provide a handler function, and initialize maxactive. + * maxactive - The maximum number of instances of the probed function that + * can be active concurrently. + * nmissed - tracks the number of times the probed function's return was + * ignored, due to maxactive being too low. + * + */ +struct kretprobe { + struct kprobe kp; + kretprobe_handler_t handler; + int maxactive; + int nmissed; + struct hlist_head free_instances; + struct hlist_head used_instances; +}; + +struct kretprobe_instance { + struct hlist_node uflist; /* either on free list or used list */ + struct hlist_node hlist; + struct kretprobe *rp; + void *ret_addr; + void *stack_addr; +}; + #ifdef CONFIG_KPROBES /* Locks kprobe: irq must be disabled */ void lock_kprobes(void); @@ -104,6 +170,7 @@ extern void show_registers(struct pt_regs *regs); /* Get the kprobe at this addr (if any). 
Must have called lock_kprobes */ struct kprobe *get_kprobe(void *addr); +struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk); int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); @@ -113,7 +180,16 @@ int register_jprobe(struct jprobe *p); void unregister_jprobe(struct jprobe *p); void jprobe_return(void); -#else +int register_kretprobe(struct kretprobe *rp); +void unregister_kretprobe(struct kretprobe *rp); + +struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp); +struct kretprobe_instance *get_rp_inst(void *sara); +struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk); +void add_rp_inst(struct kretprobe_instance *ri); +void kprobe_flush_task(struct task_struct *tk); +void recycle_rp_inst(struct kretprobe_instance *ri); +#else /* CONFIG_KPROBES */ static inline int kprobe_running(void) { return 0; @@ -135,5 +211,15 @@ static inline void unregister_jprobe(struct jprobe *p) static inline void jprobe_return(void) { } -#endif +static inline int register_kretprobe(struct kretprobe *rp) +{ + return -ENOSYS; +} +static inline void unregister_kretprobe(struct kretprobe *rp) +{ +} +static inline void kprobe_flush_task(struct task_struct *tk) +{ +} +#endif /* CONFIG_KPROBES */ #endif /* _LINUX_KPROBES_H */ -- cgit v1.2.3 From 7e1048b11c5afe79aac46a42e3ccec86b8365c6d Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Thu, 23 Jun 2005 00:09:25 -0700 Subject: [PATCH] Move kprobe [dis]arming into arch specific code The architecture independent code of the current kprobes implementation is arming and disarming kprobes at registration time. The problem is that the code is assuming that arming and disarming is just done by a simple write of some magic value to an address. This is problematic for ia64 where our instructions look more like structures, and we cannot insert break points by just doing something like: *p->addr = BREAKPOINT_INSTRUCTION; The following patch to 2.6.12-rc4-mm2 adds two new architecture dependent functions: * void arch_arm_kprobe(struct kprobe *p) * void arch_disarm_kprobe(struct kprobe *p) and then adds the new functions for each of the architectures that already implement kprobes (sparc64/ppc64/i386/x86_64). I thought arch_[dis]arm_kprobe was the most descriptive of what was really happening, but each of the architectures already had a disarm_kprobe() function that was really a "disarm and do some other clean-up items as needed when you stumble across a recursive kprobe." So... I took the liberty of changing the code that was calling disarm_kprobe() to call arch_disarm_kprobe(), and then do the cleanup in the block of code dealing with the recursive kprobe case. So far this patch has been tested on i386, x86_64, and ppc64, but still needs to be tested on sparc64.
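For the architectures where arming really is a single write, the new hooks stay trivial; a sketch modeled on the i386 case (the arch bodies are not part of this header hunk, so treat the details as illustrative):

	void arch_arm_kprobe(struct kprobe *p)
	{
		*p->addr = BREAKPOINT_INSTRUCTION;	/* plant the trap */
		flush_icache_range((unsigned long)p->addr,
				   (unsigned long)p->addr + sizeof(kprobe_opcode_t));
	}

	void arch_disarm_kprobe(struct kprobe *p)
	{
		*p->addr = p->opcode;			/* restore the saved insn */
		flush_icache_range((unsigned long)p->addr,
				   (unsigned long)p->addr + sizeof(kprobe_opcode_t));
	}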
Signed-off-by: Rusty Lynch Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index fba39f87efec..0f90466fb8b0 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -165,6 +165,8 @@ static inline int kprobe_running(void) extern int arch_prepare_kprobe(struct kprobe *p); extern void arch_copy_kprobe(struct kprobe *p); +extern void arch_arm_kprobe(struct kprobe *p); +extern void arch_disarm_kprobe(struct kprobe *p); extern void arch_remove_kprobe(struct kprobe *p); extern void show_registers(struct pt_regs *regs); -- cgit v1.2.3 From 0aa55e4d7db822059fe8132fe9f2b7773c48216c Mon Sep 17 00:00:00 2001 From: Hien Nguyen Date: Thu, 23 Jun 2005 00:09:26 -0700 Subject: [PATCH] kprobes: moves lock-unlock to non-arch kprobe_flush_task This patch moves the lock/unlock of the arch specific kprobe_flush_task() to the non-arch specific kprobe_flush_task(). Signed-off-by: Hien Nguyen Acked-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0f90466fb8b0..461391decc46 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -33,7 +33,6 @@ #include #include #include -#include #include @@ -101,7 +100,7 @@ extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, unsigned long flags); extern struct task_struct *arch_get_kprobe_task(void *ptr); extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs); -extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock); +extern void arch_kprobe_flush_task(struct task_struct *tk); #else /* ARCH_SUPPORTS_KRETPROBES */ static inline void kretprobe_trampoline(void) { -- cgit v1.2.3 From ea32c65cc2d2294c04e9f81d0578a6f51febfdbf Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Thu, 23 Jun 2005 00:09:36 -0700 Subject: [PATCH] kprobes: Temporary disarming of reentrant probe In situations where a kprobes handler calls a routine which has a probe on it, kprobes_handler() disarms the new probe forever. This patch removes the above limitation by temporarily disarming the new probe. When another probe hits while handling the old probe, the kprobes_handler() saves previous kprobes state and handles the new probe without calling the new kprobes registered handlers. kprobe_post_handler() restores the previous kprobes state and the normal execution continues. However on the x86_64 architecture, re-entrancy is provided only through pre_handler(). If a routine having a probe is referenced through post_handler(), then the probes on that routine are disarmed forever, since the exception stack gets changed after the processor single-steps the instruction of the new probe. This patch includes generic changes to support temporary disarming on reentrancy of probes.
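In rough pseudo-C, the reentrant path in the breakpoint handler now behaves like this (the helper names here are illustrative, not the patch's own):

	/* inside kprobe_handler(), on hitting probe p while another is active */
	if (kprobe_running()) {
		save_previous_kprobe();		/* hypothetical: stash the current
						   probe and its kprobe_status */
		set_current_kprobe(p);		/* hypothetical */
		kprobe_status = KPROBE_REENTER;	/* new state bit from this patch */
		p->nmissed++;			/* user handlers are skipped */
		prepare_singlestep(p, regs);	/* step over p; the post-handler
						   then restores the saved probe */
		return 1;
	}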
Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 461391decc46..5e1a7b0d7b3f 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -36,6 +36,12 @@ #include +/* kprobe_status settings */ +#define KPROBE_HIT_ACTIVE 0x00000001 +#define KPROBE_HIT_SS 0x00000002 +#define KPROBE_REENTER 0x00000004 +#define KPROBE_HIT_SSDONE 0x00000008 + struct kprobe; struct pt_regs; struct kretprobe; @@ -55,6 +61,9 @@ struct kprobe { /* list of kprobes for multi-handler support */ struct list_head list; + /*count the number of times this probe was temporarily disarmed */ + unsigned long nmissed; + /* location of the probe point */ kprobe_opcode_t *addr; -- cgit v1.2.3 From d6e711448137ca3301512cec41a2c2ce852b3d0a Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 23 Jun 2005 00:09:43 -0700 Subject: [PATCH] setuid core dump Add a new `suid_dumpable' sysctl: This value can be used to query and set the core dump mode for setuid or otherwise protected/tainted binaries. The modes are 0 - (default) - traditional behaviour. Any process which has changed privilege levels or is execute only will not be dumped 1 - (debug) - all processes dump core when possible. The core dump is owned by the current user and no security is applied. This is intended for system debugging situations only. Ptrace is unchecked. 2 - (suidsafe) - any binary which normally would not be dumped is dumped readable by root only. This allows the end user to remove such a dump but not access it directly. For security reasons core dumps in this mode will not overwrite one another or other files. This mode is appropriate when administrators are attempting to debug problems in a normal environment. (akpm: > > +EXPORT_SYMBOL(suid_dumpable); > > EXPORT_SYMBOL_GPL? No problem to me. > > if (current->euid == current->uid && current->egid == current->gid) > > current->mm->dumpable = 1; > > Should this be SUID_DUMP_USER? Actually the feedback I had from last time was that the SUID_ defines should go because its clearer to follow the numbers. They can go everywhere (and there are lots of places where dumpable is tested/used as a bool in untouched code) > Maybe this should be renamed to `dump_policy' or something. Doing that > would help us catch any code which isn't using the #defines, too. Fair comment. The patch was designed to be easy to maintain for Red Hat rather than for merging. Changing that field would create a gigantic diff because it is used all over the place.
) Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/binfmts.h | 5 +++++ include/linux/sched.h | 2 +- include/linux/sysctl.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 7e736e201c46..c1e82c514443 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -69,6 +69,11 @@ extern void remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *,struct pt_regs *); extern int flush_old_exec(struct linux_binprm * bprm); +extern int suid_dumpable; +#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ +#define SUID_DUMP_USER 1 /* Dump as user of process */ +#define SUID_DUMP_ROOT 2 /* Dump as root */ + /* Stack area protections */ #define EXSTACK_DEFAULT 0 /* Whatever the arch defaults to */ #define EXSTACK_DISABLE_X 1 /* Disable executable stacks */ diff --git a/include/linux/sched.h b/include/linux/sched.h index b58afd97a180..901742f92389 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -246,7 +246,7 @@ struct mm_struct { unsigned long saved_auxv[42]; /* for /proc/PID/auxv */ - unsigned dumpable:1; + unsigned dumpable:2; cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index a17745c80a91..614e939c78a4 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -136,6 +136,7 @@ enum KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */ KERN_RANDOMIZE=68, /* int: randomize virtual address space */ + KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */ }; -- cgit v1.2.3 From ef3daeda7b58f046f94b26637d500354038d39f4 Mon Sep 17 00:00:00 2001 From: Yoav Zach Date: Thu, 23 Jun 2005 00:09:58 -0700 Subject: [PATCH] Don't force O_LARGEFILE for 32 bit processes on ia64 In the ia64 kernel, the O_LARGEFILE flag is forced when opening a file. This is problematic for execution of 32 bit processes, which are not largefile aware, either by SW emulation or by HW execution. For such processes, the problem is two-fold: 1) When trying to open a file that is larger than 4G, the operation should fail, but it does not. 2) Writing to an offset larger than 4G should fail, but it does not. The proposed patch takes advantage of the way 32 bit processes are identified in ia64 systems. Such processes have PER_LINUX32 for their personality. With the patch, the ia64 kernel will not enforce the O_LARGEFILE flag if the current process has PER_LINUX32 set. The behavior for all other architectures remains unchanged. Signed-off-by: Yoav Zach Acked-by: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fcntl.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index 704fb76b6334..8a7c82151de9 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -25,6 +25,10 @@ #ifdef __KERNEL__ +#ifndef force_o_largefile +#define force_o_largefile() (BITS_PER_LONG != 32) +#endif + #if BITS_PER_LONG == 32 #define IS_GETLK32(cmd) ((cmd) == F_GETLK) #define IS_SETLK32(cmd) ((cmd) == F_SETLK) -- cgit v1.2.3 From 46c271bedd2c8444b1d05bc44928beec0c07debc Mon Sep 17 00:00:00 2001 From: Peter Osterlund Date: Thu, 23 Jun 2005 00:10:02 -0700 Subject: [PATCH] Improve CD/DVD packet driver write performance This patch improves write performance for the CD/DVD packet writing driver.
The logic for switching between reading and writing has been changed so that streaming writes are no longer interrupted by read requests. Signed-off-by: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pktcdvd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index 4e2d2a942ecb..4b32bce9a289 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -159,7 +159,7 @@ struct packet_iosched struct bio *read_queue_tail; struct bio *write_queue; struct bio *write_queue_tail; - int high_prio_read; /* An important read request has been queued */ + sector_t last_write; /* The sector where the last write ended */ int successive_reads; }; -- cgit v1.2.3 From bb93e3a52f8db7210258a1a2134cced0b78a46e1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jun 2005 00:10:15 -0700 Subject: [PATCH] block: add unlocked_ioctl support for block devices This patch allows block device drivers to convert their ioctl functions to unlocked_ioctl() like character devices and other subsystems. All functions that were called with the BKL held before are still used that way, but I would not be surprised if it could be removed from the ioctl functions in drivers/block/ioctl.c themselves. As a side note, I found that compat_blkdev_ioctl() acquires the BKL as well, which looks like a bug. I have checked that every user of disk->fops->compat_ioctl() in the current git tree gets the BKL itself, so it could easily be removed from compat_blkdev_ioctl(). Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3622e952e98c..9b1278e21279 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -884,6 +884,7 @@ struct block_device_operations { int (*open) (struct inode *, struct file *); int (*release) (struct inode *, struct file *); int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); + long (*unlocked_ioctl) (struct file *, unsigned, unsigned long); long (*compat_ioctl) (struct file *, unsigned, unsigned long); int (*media_changed) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); -- cgit v1.2.3 From 45778ca819accab1a4a3378b3566cab0f189164f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:10:17 -0700 Subject: [PATCH] Remove f_error field from struct file The following patch removes the f_error field and all checks of f_error. Trond said: f_error was introduced for NFS, and made sense when we were guaranteed always to have a file pointer around when write errors occurred. Since then, we have (for various reasons) had to introduce the nfs_open_context in order to track the file read/write state, and it made sense to move our f_error tracking there too. 
Signed-off-by: Christoph Lameter Acked-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9b1278e21279..517bf4966bf5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -581,7 +581,6 @@ struct file { atomic_t f_count; unsigned int f_flags; mode_t f_mode; - int f_error; loff_t f_pos; struct fown_struct f_owner; unsigned int f_uid, f_gid; -- cgit v1.2.3 From f9fd27a253d5e0b23531d12ce7ad15b6535d4486 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:10:19 -0700 Subject: [PATCH] acl endianess annotations Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/posix_acl_xattr.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 5efd0a6dad94..fe271c1947b2 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -23,13 +23,13 @@ #define ACL_UNDEFINED_ID (-1) typedef struct { - __u16 e_tag; - __u16 e_perm; - __u32 e_id; + __le16 e_tag; + __le16 e_perm; + __le32 e_id; } posix_acl_xattr_entry; typedef struct { - __u32 a_version; + __le32 a_version; posix_acl_xattr_entry a_entries[0]; } posix_acl_xattr_header; -- cgit v1.2.3 From 9a59f452abe11f569e13ec16c51e6d61c54b9838 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 23 Jun 2005 00:10:19 -0700 Subject: [PATCH] remove This file duplicates , using slightly different names. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/posix_acl_xattr.h | 3 +++ include/linux/reiserfs_acl.h | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index fe271c1947b2..6e53c34035cd 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -52,4 +52,7 @@ posix_acl_xattr_count(size_t size) return size / sizeof(posix_acl_xattr_entry); } +struct posix_acl *posix_acl_from_xattr(const void *value, size_t size); +int posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size); + #endif /* _POSIX_ACL_XATTR_H */ diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h index 2aef9c3f5ce8..0760507a545b 100644 --- a/include/linux/reiserfs_acl.h +++ b/include/linux/reiserfs_acl.h @@ -1,6 +1,5 @@ #include #include -#include #define REISERFS_ACL_VERSION 0x0001 -- cgit v1.2.3 From c43dc2fd885b5658cfd7cedb7bcca20910c517a4 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Thu, 23 Jun 2005 00:10:27 -0700 Subject: [PATCH] aio: make wait_queue ->task ->private In the upcoming aio_down patch, it is useful to store a private data pointer in the kiocb's wait_queue. Since we provide our own wake up function and do not require the task_struct pointer, it makes sense to convert the task pointer into a generic private pointer. 
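A sketch of the pattern this enables — an async waiter that carries private context instead of a task pointer (struct my_ctx and the function names are made up for illustration; the accessors are the ones in the hunk below):

	struct my_ctx;				/* illustrative private state */

	static int my_wake(wait_queue_t *wait, unsigned mode, int sync, void *key)
	{
		struct my_ctx *ctx = wait->private;	/* formerly wait->task */
		/* ... complete the async operation tracked by ctx ... */
		return 1;
	}

	static void my_add_waiter(wait_queue_head_t *q, wait_queue_t *wait,
				  struct my_ctx *ctx)
	{
		init_waitqueue_func_entry(wait, my_wake); /* sets private to NULL */
		wait->private = ctx;	/* any pointer, no task_struct needed */
		add_wait_queue(q, wait);
	}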
Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index c9486c3efb4a..d38c9fecdc36 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -33,7 +33,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key struct __wait_queue { unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 - struct task_struct * task; + void *private; wait_queue_func_t func; struct list_head task_list; }; @@ -60,7 +60,7 @@ typedef struct __wait_queue_head wait_queue_head_t; */ #define __WAITQUEUE_INITIALIZER(name, tsk) { \ - .task = tsk, \ + .private = tsk, \ .func = default_wake_function, \ .task_list = { NULL, NULL } } @@ -86,7 +86,7 @@ static inline void init_waitqueue_head(wait_queue_head_t *q) static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) { q->flags = 0; - q->task = p; + q->private = p; q->func = default_wake_function; } @@ -94,7 +94,7 @@ static inline void init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) { q->flags = 0; - q->task = NULL; + q->private = NULL; q->func = func; } @@ -110,7 +110,7 @@ static inline int waitqueue_active(wait_queue_head_t *q) * aio specifies a wait queue entry with an async notification * callback routine, not associated with any task. */ -#define is_sync_wait(wait) (!(wait) || ((wait)->task)) +#define is_sync_wait(wait) (!(wait) || ((wait)->private)) extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); @@ -384,7 +384,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); #define DEFINE_WAIT(name) \ wait_queue_t name = { \ - .task = current, \ + .private = current, \ .func = autoremove_wake_function, \ .task_list = LIST_HEAD_INIT((name).task_list), \ } @@ -393,7 +393,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); struct wait_bit_queue name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ .wait = { \ - .task = current, \ + .private = current, \ .func = wake_bit_function, \ .task_list = \ LIST_HEAD_INIT((name).wait.task_list), \ @@ -402,7 +402,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); #define init_wait(wait) \ do { \ - (wait)->task = current; \ + (wait)->private = current; \ (wait)->func = autoremove_wake_function; \ INIT_LIST_HEAD(&(wait)->task_list); \ } while (0) -- cgit v1.2.3 From bfb07599da289881d3bcbb601a110e997fc7444b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 23 Jun 2005 00:10:32 -0700 Subject: [PATCH] Introduce tty_unregister_ldisc() It's a bit strange to see tty_register_ldisc() calls in modules' exit functions.
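With the new symbol, a line-discipline module's exit path can say what it means; a sketch (N_MYLDISC and the names are made up for illustration, and the old idiom shown in the comment is an assumption about prior practice):

	static void __exit my_ldisc_exit(void)
	{
		/* was typically: tty_register_ldisc(N_MYLDISC, NULL); */
		if (tty_unregister_ldisc(N_MYLDISC) != 0)
			printk(KERN_ERR "my_ldisc: line discipline still in use\n");
	}
	module_exit(my_ldisc_exit);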
Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tty.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 1b76106272d3..59ff42c629ec 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -345,6 +345,7 @@ extern int tty_check_change(struct tty_struct * tty); extern void stop_tty(struct tty_struct * tty); extern void start_tty(struct tty_struct * tty); extern int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc); +extern int tty_unregister_ldisc(int disc); extern int tty_register_driver(struct tty_driver *driver); extern int tty_unregister_driver(struct tty_driver *driver); extern void tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev); -- cgit v1.2.3 From 4749f32da939d4e4160541b2cadc22492bb507ec Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 23 Jun 2005 11:36:56 +0200 Subject: [PATCH] better USB_MON dependencies This makes the USB_MON dependencies less confusing. Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- include/linux/usb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 3d508bf08402..eb282b581546 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -290,7 +290,7 @@ struct usb_bus { struct class_device *class_dev; /* class device for this bus */ struct kref kref; /* handles reference counting this bus */ void (*release)(struct usb_bus *bus); /* function to destroy this bus's memory */ -#if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE) +#if defined(CONFIG_USB_MON) struct mon_bus *mon_bus; /* non-null when associated */ int monitored; /* non-zero when monitored */ #endif -- cgit v1.2.3 From 317a76f9a44b437d6301718f4e5d08bd93f98da7 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:19:55 -0700 Subject: [TCP]: Add pluggable congestion control algorithm infrastructure. Allow TCP to have multiple pluggable congestion control algorithms. Algorithms are defined by a set of operations and can be built in or modules. The legacy "new RENO" algorithm is used as a starting point and fallback. Signed-off-by: Stephen Hemminger Signed-off-by: David S.
Miller --- include/linux/sysctl.h | 9 +-------- include/linux/tcp.h | 49 ++++++++++--------------------------------------- 2 files changed, 11 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 614e939c78a4..72965bfe6cfb 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -333,21 +333,14 @@ enum NET_TCP_FRTO=92, NET_TCP_LOW_LATENCY=93, NET_IPV4_IPFRAG_SECRET_INTERVAL=94, - NET_TCP_WESTWOOD=95, NET_IPV4_IGMP_MAX_MSF=96, NET_TCP_NO_METRICS_SAVE=97, - NET_TCP_VEGAS=98, - NET_TCP_VEGAS_ALPHA=99, - NET_TCP_VEGAS_BETA=100, - NET_TCP_VEGAS_GAMMA=101, - NET_TCP_BIC=102, - NET_TCP_BIC_FAST_CONVERGENCE=103, - NET_TCP_BIC_LOW_WINDOW=104, NET_TCP_DEFAULT_WIN_SCALE=105, NET_TCP_MODERATE_RCVBUF=106, NET_TCP_TSO_WIN_DIVISOR=107, NET_TCP_BIC_BETA=108, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, + NET_TCP_CONG_CONTROL=110, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 97a7c9e03df5..3ea75dd6640a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -203,13 +203,6 @@ struct tcp_sack_block { __u32 end_seq; }; -enum tcp_congestion_algo { - TCP_RENO=0, - TCP_VEGAS, - TCP_WESTWOOD, - TCP_BIC, -}; - struct tcp_options_received { /* PAWS/RTTM data */ long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ @@ -305,7 +298,7 @@ struct tcp_sock { __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ - __u8 adv_cong; /* Using Vegas, Westwood, or BIC */ + __u8 unused; __u8 defer_accept; /* User waits for some data after accept() */ /* RTT measurement */ @@ -401,37 +394,10 @@ struct tcp_sock { __u32 time; } rcvq_space; -/* TCP Westwood structure */ - struct { - __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ - __u32 bw_est; /* bandwidth estimate */ - __u32 rtt_win_sx; /* here starts a new evaluation... */ - __u32 bk; - __u32 snd_una; /* used for evaluating the number of acked bytes */ - __u32 cumul_ack; - __u32 accounted; - __u32 rtt; - __u32 rtt_min; /* minimum observed RTT */ - } westwood; - -/* Vegas variables */ - struct { - __u32 beg_snd_nxt; /* right edge during last RTT */ - __u32 beg_snd_una; /* left edge during last RTT */ - __u32 beg_snd_cwnd; /* saves the size of the cwnd */ - __u8 doing_vegas_now;/* if true, do vegas for this RTT */ - __u16 cntRTT; /* # of RTTs measured within last RTT */ - __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ - __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ - } vegas; - - /* BI TCP Parameters */ - struct { - __u32 cnt; /* increase cwnd by 1 after this number of ACKs */ - __u32 last_max_cwnd; /* last maximium snd_cwnd */ - __u32 last_cwnd; /* the last snd_cwnd */ - __u32 last_stamp; /* time when updated last_cwnd */ - } bictcp; + /* Pluggable TCP congestion control hook */ + struct tcp_congestion_ops *ca_ops; + u32 ca_priv[16]; +#define TCP_CA_PRIV_SIZE (16*sizeof(u32)) }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk) return (struct tcp_sock *)sk; } +static inline void *tcp_ca(const struct tcp_sock *tp) +{ + return (void *) tp->ca_priv; +} + #endif #endif /* _LINUX_TCP_H */ -- cgit v1.2.3 From 056ede6cface66b400cd3b8e60ed077cc5b85c18 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:21:28 -0700 Subject: [TCP]: Report congestion control algorithm in tcp_diag. 
Enhancement to the tcp_diag interface used by the iproute2 ss command to report the tcp congestion control being used by a socket. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/tcp_diag.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h index ceee962e1d15..7a5996743946 100644 --- a/include/linux/tcp_diag.h +++ b/include/linux/tcp_diag.h @@ -99,9 +99,10 @@ enum TCPDIAG_MEMINFO, TCPDIAG_INFO, TCPDIAG_VEGASINFO, + TCPDIAG_CONG, }; -#define TCPDIAG_MAX TCPDIAG_VEGASINFO +#define TCPDIAG_MAX TCPDIAG_CONG /* TCPDIAG_MEM */ @@ -123,5 +124,4 @@ struct tcpvegas_info { __u32 tcpv_minrtt; }; - #endif /* _TCP_DIAG_H_ */ -- cgit v1.2.3
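Tying the last two changes together, a congestion control module written against this interface looks roughly like the sketch below. The tcp_congestion_ops layout lives in include/net/tcp.h rather than in these hunks, so the field names, argument lists, and the tcp_reno_cong_avoid fallback are reconstructions to be checked against that header:

	/* sketch: a minimal pluggable congestion control module */
	static u32 myca_ssthresh(struct tcp_sock *tp)
	{
		return max(tp->snd_cwnd >> 1U, 2U);	/* classic halving */
	}

	static void myca_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
				    u32 in_flight, int flag)
	{
		/* defer to the legacy New Reno behaviour */
		tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
	}

	static struct tcp_congestion_ops myca_ops = {
		.name		= "myca",
		.owner		= THIS_MODULE,
		.ssthresh	= myca_ssthresh,
		.cong_avoid	= myca_cong_avoid,
	};

	static int __init myca_init(void)
	{
		return tcp_register_congestion_control(&myca_ops);
	}

	static void __exit myca_exit(void)
	{
		tcp_unregister_congestion_control(&myca_ops);
	}

	module_init(myca_init);
	module_exit(myca_exit);

Per-socket scratch space for such a module comes from the ca_priv array added to struct tcp_sock above, reached through the tcp_ca() helper, and the new TCPDIAG_CONG attribute is what lets the iproute2 ss command report which algorithm a socket is using.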