From 18b504e25fd617bee8830d2cdcaff7fb7b5931bb Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Tue, 21 Jun 2005 12:38:48 -0700 Subject: [NETLINK]: netlink_callback structure needs 5 args not 4 net/ipv4/tcp_diag.c uses up to ->args[4] Signed-off-by: David S. Miller --- include/linux/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 561d4dc75836..3029cad63a01 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -147,7 +147,7 @@ struct netlink_callback int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); int family; - long args[4]; + long args[5]; }; struct netlink_notify -- cgit v1.2.3 From e45b1be8bcb3643808975a426fa3e201a2588e87 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 21 Jun 2005 14:01:30 -0700 Subject: [NETFILTER]: Kill lockhelp.h Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ip_conntrack_core.h | 3 +- include/linux/netfilter_ipv4/ip_nat.h | 3 +- include/linux/netfilter_ipv4/listhelp.h | 1 - include/linux/netfilter_ipv4/lockhelp.h | 129 ----------------------- 4 files changed, 2 insertions(+), 134 deletions(-) delete mode 100644 include/linux/netfilter_ipv4/lockhelp.h (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4/ip_conntrack_core.h b/include/linux/netfilter_ipv4/ip_conntrack_core.h index d84be02cb4fc..694aec9b4784 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_core.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_core.h @@ -1,7 +1,6 @@ #ifndef _IP_CONNTRACK_CORE_H #define _IP_CONNTRACK_CORE_H #include -#include /* This header is used to share core functionality between the standalone connection tracking module, and the compatibility layer's use @@ -47,6 +46,6 @@ static inline int ip_conntrack_confirm(struct sk_buff **pskb) extern struct list_head *ip_conntrack_hash; extern struct list_head ip_conntrack_expect_list; -DECLARE_RWLOCK_EXTERN(ip_conntrack_lock); +extern rwlock_t ip_conntrack_lock; #endif /* _IP_CONNTRACK_CORE_H */ diff --git a/include/linux/netfilter_ipv4/ip_nat.h b/include/linux/netfilter_ipv4/ip_nat.h index 2b72b86176f0..e201ec6e9905 100644 --- a/include/linux/netfilter_ipv4/ip_nat.h +++ b/include/linux/netfilter_ipv4/ip_nat.h @@ -50,10 +50,9 @@ struct ip_nat_multi_range_compat #ifdef __KERNEL__ #include -#include /* Protects NAT hash tables, and NAT-private part of conntracks. */ -DECLARE_RWLOCK_EXTERN(ip_nat_lock); +extern rwlock_t ip_nat_lock; /* The structure embedded in the conntrack structure. */ struct ip_nat_info diff --git a/include/linux/netfilter_ipv4/listhelp.h b/include/linux/netfilter_ipv4/listhelp.h index f2ae7c5e57bb..360429f48737 100644 --- a/include/linux/netfilter_ipv4/listhelp.h +++ b/include/linux/netfilter_ipv4/listhelp.h @@ -2,7 +2,6 @@ #define _LISTHELP_H #include #include -#include /* Header to do more comprehensive job than linux/list.h; assume list is first entry in structure. */ diff --git a/include/linux/netfilter_ipv4/lockhelp.h b/include/linux/netfilter_ipv4/lockhelp.h deleted file mode 100644 index a3288633ab46..000000000000 --- a/include/linux/netfilter_ipv4/lockhelp.h +++ /dev/null @@ -1,129 +0,0 @@ -#ifndef _LOCKHELP_H -#define _LOCKHELP_H -#include - -#include -#include -#include -#include - -/* Header to do help in lock debugging. */ - -#ifdef CONFIG_NETFILTER_DEBUG -struct spinlock_debug -{ - spinlock_t l; - atomic_t locked_by; -}; - -struct rwlock_debug -{ - rwlock_t l; - long read_locked_map; - long write_locked_map; -}; - -#define DECLARE_LOCK(l) \ -struct spinlock_debug l = { SPIN_LOCK_UNLOCKED, ATOMIC_INIT(-1) } -#define DECLARE_LOCK_EXTERN(l) \ -extern struct spinlock_debug l -#define DECLARE_RWLOCK(l) \ -struct rwlock_debug l = { RW_LOCK_UNLOCKED, 0, 0 } -#define DECLARE_RWLOCK_EXTERN(l) \ -extern struct rwlock_debug l - -#define MUST_BE_LOCKED(l) \ -do { if (atomic_read(&(l)->locked_by) != smp_processor_id()) \ - printk("ASSERT %s:%u %s unlocked\n", __FILE__, __LINE__, #l); \ -} while(0) - -#define MUST_BE_UNLOCKED(l) \ -do { if (atomic_read(&(l)->locked_by) == smp_processor_id()) \ - printk("ASSERT %s:%u %s locked\n", __FILE__, __LINE__, #l); \ -} while(0) - -/* Write locked OK as well. */ -#define MUST_BE_READ_LOCKED(l) \ -do { if (!((l)->read_locked_map & (1UL << smp_processor_id())) \ - && !((l)->write_locked_map & (1UL << smp_processor_id()))) \ - printk("ASSERT %s:%u %s not readlocked\n", __FILE__, __LINE__, #l); \ -} while(0) - -#define MUST_BE_WRITE_LOCKED(l) \ -do { if (!((l)->write_locked_map & (1UL << smp_processor_id()))) \ - printk("ASSERT %s:%u %s not writelocked\n", __FILE__, __LINE__, #l); \ -} while(0) - -#define MUST_BE_READ_WRITE_UNLOCKED(l) \ -do { if ((l)->read_locked_map & (1UL << smp_processor_id())) \ - printk("ASSERT %s:%u %s readlocked\n", __FILE__, __LINE__, #l); \ - else if ((l)->write_locked_map & (1UL << smp_processor_id())) \ - printk("ASSERT %s:%u %s writelocked\n", __FILE__, __LINE__, #l); \ -} while(0) - -#define LOCK_BH(lk) \ -do { \ - MUST_BE_UNLOCKED(lk); \ - spin_lock_bh(&(lk)->l); \ - atomic_set(&(lk)->locked_by, smp_processor_id()); \ -} while(0) - -#define UNLOCK_BH(lk) \ -do { \ - MUST_BE_LOCKED(lk); \ - atomic_set(&(lk)->locked_by, -1); \ - spin_unlock_bh(&(lk)->l); \ -} while(0) - -#define READ_LOCK(lk) \ -do { \ - MUST_BE_READ_WRITE_UNLOCKED(lk); \ - read_lock_bh(&(lk)->l); \ - set_bit(smp_processor_id(), &(lk)->read_locked_map); \ -} while(0) - -#define WRITE_LOCK(lk) \ -do { \ - MUST_BE_READ_WRITE_UNLOCKED(lk); \ - write_lock_bh(&(lk)->l); \ - set_bit(smp_processor_id(), &(lk)->write_locked_map); \ -} while(0) - -#define READ_UNLOCK(lk) \ -do { \ - if (!((lk)->read_locked_map & (1UL << smp_processor_id()))) \ - printk("ASSERT: %s:%u %s not readlocked\n", \ - __FILE__, __LINE__, #lk); \ - clear_bit(smp_processor_id(), &(lk)->read_locked_map); \ - read_unlock_bh(&(lk)->l); \ -} while(0) - -#define WRITE_UNLOCK(lk) \ -do { \ - MUST_BE_WRITE_LOCKED(lk); \ - clear_bit(smp_processor_id(), &(lk)->write_locked_map); \ - write_unlock_bh(&(lk)->l); \ -} while(0) - -#else -#define DECLARE_LOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED -#define DECLARE_LOCK_EXTERN(l) extern spinlock_t l -#define DECLARE_RWLOCK(l) rwlock_t l = RW_LOCK_UNLOCKED -#define DECLARE_RWLOCK_EXTERN(l) extern rwlock_t l - -#define MUST_BE_LOCKED(l) -#define MUST_BE_UNLOCKED(l) -#define MUST_BE_READ_LOCKED(l) -#define MUST_BE_WRITE_LOCKED(l) -#define MUST_BE_READ_WRITE_UNLOCKED(l) - -#define LOCK_BH(l) spin_lock_bh(l) -#define UNLOCK_BH(l) spin_unlock_bh(l) - -#define READ_LOCK(l) read_lock_bh(l) -#define WRITE_LOCK(l) write_lock_bh(l) -#define READ_UNLOCK(l) read_unlock_bh(l) -#define WRITE_UNLOCK(l) write_unlock_bh(l) -#endif /*CONFIG_NETFILTER_DEBUG*/ - -#endif /* _LOCKHELP_H */ -- cgit v1.2.3 From 18b8afc771102b1b6af97962808291a7d27f52af Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 21 Jun 2005 14:01:57 -0700 Subject: [NETFILTER]: Kill nf_debug Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4.h | 6 ------ include/linux/skbuff.h | 13 ------------- 2 files changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index 9e5750079e09..3ebc36afae1a 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -75,12 +75,6 @@ enum nf_ip_hook_priorities { #define SO_ORIGINAL_DST 80 #ifdef __KERNEL__ -#ifdef CONFIG_NETFILTER_DEBUG -void nf_debug_ip_local_deliver(struct sk_buff *skb); -void nf_debug_ip_loopback_xmit(struct sk_buff *newskb); -void nf_debug_ip_finish_output2(struct sk_buff *skb); -#endif /*CONFIG_NETFILTER_DEBUG*/ - extern int ip_route_me_harder(struct sk_buff **pskb); /* Call this before modifying an existing IP packet: ensures it is diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cc04f5cd2286..d7c839a21842 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -193,7 +193,6 @@ struct skb_shared_info { * @nfcache: Cache info * @nfct: Associated connection, if any * @nfctinfo: Relationship of this skb to the connection - * @nf_debug: Netfilter debugging * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @private: Data which is private to the HIPPI implementation * @tc_index: Traffic control index @@ -264,9 +263,6 @@ struct sk_buff { __u32 nfcache; __u32 nfctinfo; struct nf_conntrack *nfct; -#ifdef CONFIG_NETFILTER_DEBUG - unsigned int nf_debug; -#endif #ifdef CONFIG_BRIDGE_NETFILTER struct nf_bridge_info *nf_bridge; #endif @@ -1219,15 +1215,6 @@ static inline void nf_reset(struct sk_buff *skb) { nf_conntrack_put(skb->nfct); skb->nfct = NULL; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif -} -static inline void nf_reset_debug(struct sk_buff *skb) -{ -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif } #ifdef CONFIG_BRIDGE_NETFILTER -- cgit v1.2.3 From 39c715b71740c4a78ba4769fb54826929bac03cb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 21 Jun 2005 17:14:34 -0700 Subject: [PATCH] smp_processor_id() cleanup This patch implements a number of smp_processor_id() cleanup ideas that Arjan van de Ven and I came up with. The previous __smp_processor_id/_smp_processor_id/smp_processor_id API spaghetti was hard to follow both on the implementational and on the usage side. Some of the complexity arose from picking wrong names, some of the complexity comes from the fact that not all architectures defined __smp_processor_id. In the new code, there are two externally visible symbols: - smp_processor_id(): debug variant. - raw_smp_processor_id(): nondebug variant. Replaces all existing uses of _smp_processor_id() and __smp_processor_id(). Defined by every SMP architecture in include/asm-*/smp.h. There is one new internal symbol, dependent on DEBUG_PREEMPT: - debug_smp_processor_id(): internal debug variant, mapped to smp_processor_id(). Also, i moved debug_smp_processor_id() from lib/kernel_lock.c into a new lib/smp_processor_id.c file. All related comments got updated and/or clarified. I have build/boot tested the following 8 .config combinations on x86: {SMP,UP} x {PREEMPT,!PREEMPT} x {DEBUG_PREEMPT,!DEBUG_PREEMPT} I have also build/boot tested x64 on UP/PREEMPT/DEBUG_PREEMPT. (Other architectures are untested, but should work just fine.) Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- include/linux/smp.h | 40 ++++++++++++++++------------------------ 2 files changed, 17 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e530c6c092f1..beacd931b606 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -381,7 +381,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, #include /* Returns the number of the current Node. */ -#define numa_node_id() (cpu_to_node(_smp_processor_id())) +#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) #ifndef CONFIG_DISCONTIGMEM diff --git a/include/linux/smp.h b/include/linux/smp.h index dcf1db3b35d3..9dfa3ee769ae 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -92,10 +92,7 @@ void smp_prepare_boot_cpu(void); /* * These macros fold the SMP functionality into a single CPU system */ - -#if !defined(__smp_processor_id) || !defined(CONFIG_PREEMPT) -# define smp_processor_id() 0 -#endif +#define raw_smp_processor_id() 0 #define hard_smp_processor_id() 0 #define smp_call_function(func,info,retry,wait) ({ 0; }) #define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) @@ -106,30 +103,25 @@ static inline void smp_send_reschedule(int cpu) { } #endif /* !SMP */ /* - * DEBUG_PREEMPT support: check whether smp_processor_id() is being - * used in a preemption-safe way. + * smp_processor_id(): get the current CPU ID. * - * An architecture has to enable this debugging code explicitly. - * It can do so by renaming the smp_processor_id() macro to - * __smp_processor_id(). This should only be done after some minimal - * testing, because usually there are a number of false positives - * that an architecture will trigger. + * if DEBUG_PREEMPT is enabled the we check whether it is + * used in a preemption-safe way. (smp_processor_id() is safe + * if it's used in a preemption-off critical section, or in + * a thread that is bound to the current CPU.) * - * To fix a false positive (i.e. smp_processor_id() use that the - * debugging code reports but which use for some reason is legal), - * change the smp_processor_id() reference to _smp_processor_id(), - * which is the nondebug variant. NOTE: don't use this to hack around - * real bugs. + * NOTE: raw_smp_processor_id() is for internal use only + * (smp_processor_id() is the preferred variant), but in rare + * instances it might also be used to turn off false positives + * (i.e. smp_processor_id() use that the debugging code reports but + * which use for some reason is legal). Don't use this to hack around + * the warning message, as your code might not work under PREEMPT. */ -#ifdef __smp_processor_id -# if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) - extern unsigned int smp_processor_id(void); -# else -# define smp_processor_id() __smp_processor_id() -# endif -# define _smp_processor_id() __smp_processor_id() +#ifdef CONFIG_DEBUG_PREEMPT + extern unsigned int debug_smp_processor_id(void); +# define smp_processor_id() debug_smp_processor_id() #else -# define _smp_processor_id() smp_processor_id() +# define smp_processor_id() raw_smp_processor_id() #endif #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) -- cgit v1.2.3 From 753ee728964e5afb80c17659cc6c3a6fd0a42fe0 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Tue, 21 Jun 2005 17:14:41 -0700 Subject: [PATCH] VM: early zone reclaim This is the core of the (much simplified) early reclaim. The goal of this patch is to reclaim some easily-freed pages from a zone before falling back onto another zone. One of the major uses of this is NUMA machines. With the default allocator behavior the allocator would look for memory in another zone, which might be off-node, before trying to reclaim from the current zone. This adds a zone tuneable to enable early zone reclaim. It is selected on a per-zone basis and is turned on/off via syscall. Adding some extra throttling on the reclaim was also required (patch 4/4). Without the machine would grind to a crawl when doing a "make -j" kernel build. Even with this patch the System Time is higher on average, but it seems tolerable. Here are some numbers for kernbench runs on a 2-node, 4cpu, 8Gig RAM Altix in the "make -j" run: wall user sys %cpu ctx sw. sleeps ---- ---- --- ---- ------ ------ No patch 1009 1384 847 258 298170 504402 w/patch, no reclaim 880 1376 667 288 254064 396745 w/patch & reclaim 1079 1385 926 252 291625 548873 These numbers are the average of 2 runs of 3 "make -j" runs done right after system boot. Run-to-run variability for "make -j" is huge, so these numbers aren't terribly useful except to seee that with reclaim the benchmark still finishes in a reasonable amount of time. I also looked at the NUMA hit/miss stats for the "make -j" runs and the reclaim doesn't make any difference when the machine is thrashing away. Doing a "make -j8" on a single node that is filled with page cache pages takes 700 seconds with reclaim turned on and 735 seconds without reclaim (due to remote memory accesses). The simple zone_reclaim syscall program is at http://www.bork.org/~mort/sgi/zone_reclaim.c Signed-off-by: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++++ include/linux/swap.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index beacd931b606..dfc2452ccb10 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -144,6 +144,12 @@ struct zone { unsigned long pages_scanned; /* since last reclaim */ int all_unreclaimable; /* All pages pinned */ + /* + * Does the allocator try to reclaim pages from the zone as soon + * as it fails a watermark_ok() in __alloc_pages? + */ + int reclaim_pages; + /* * prev_priority holds the scanning priority for this zone. It is * defined as the scanning priority at which we achieved our reclaim diff --git a/include/linux/swap.h b/include/linux/swap.h index 3bbc41be9bd0..0d21e682d99d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -173,6 +173,7 @@ extern void swap_setup(void); /* linux/mm/vmscan.c */ extern int try_to_free_pages(struct zone **, unsigned int, unsigned int); +extern int zone_reclaim(struct zone *, unsigned int, unsigned int); extern int shrink_all_memory(int); extern int vm_swappiness; -- cgit v1.2.3 From 0c35bbadc59f5ed105c34471143eceb4c0dd9c95 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Tue, 21 Jun 2005 17:14:42 -0700 Subject: [PATCH] VM: add __GFP_NORECLAIM When using the early zone reclaim, it was noticed that allocating new pages that should be spread across the whole system caused eviction of local pages. This adds a new GFP flag to prevent early reclaim from happening during certain allocation attempts. The example that is implemented here is for page cache pages. We want page cache pages to be spread across the whole system, and we don't want page cache pages to evict other pages to get local memory. Signed-off-by: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 3 ++- include/linux/pagemap.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index af7407e8cfc5..208535fd4832 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,6 +39,7 @@ struct vm_area_struct; #define __GFP_COMP 0x4000u /* Add compound page metadata */ #define __GFP_ZERO 0x8000u /* Return zeroed page on success */ #define __GFP_NOMEMALLOC 0x10000u /* Don't use emergency reserves */ +#define __GFP_NORECLAIM 0x20000u /* No realy zone reclaim during allocation */ #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) @@ -47,7 +48,7 @@ struct vm_area_struct; #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \ - __GFP_NOMEMALLOC) + __GFP_NOMEMALLOC|__GFP_NORECLAIM) #define GFP_ATOMIC (__GFP_HIGH) #define GFP_NOIO (__GFP_WAIT) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0422031161ba..d9a25647a295 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -52,12 +52,12 @@ void release_pages(struct page **pages, int nr, int cold); static inline struct page *page_cache_alloc(struct address_space *x) { - return alloc_pages(mapping_gfp_mask(x), 0); + return alloc_pages(mapping_gfp_mask(x)|__GFP_NORECLAIM, 0); } static inline struct page *page_cache_alloc_cold(struct address_space *x) { - return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); + return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD|__GFP_NORECLAIM, 0); } typedef int filler_t(void *, struct page *); -- cgit v1.2.3 From 1e7e5a9048b30c57ba1ddaa6cdf59b21b65cde99 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Tue, 21 Jun 2005 17:14:43 -0700 Subject: [PATCH] VM: rate limit early reclaim When early zone reclaim is turned on the LRU is scanned more frequently when a zone is low on memory. This limits when the zone reclaim can be called by skipping the scan if another thread (either via kswapd or sync reclaim) is already reclaiming from the zone. Signed-off-by: Martin Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dfc2452ccb10..18fed8b67943 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -149,6 +149,8 @@ struct zone { * as it fails a watermark_ok() in __alloc_pages? */ int reclaim_pages; + /* A count of how many reclaimers are scanning this zone */ + atomic_t reclaim_in_progress; /* * prev_priority holds the scanning priority for this zone. It is -- cgit v1.2.3 From 63551ae0feaaa23807ebea60de1901564bbef32e Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 21 Jun 2005 17:14:44 -0700 Subject: [PATCH] Hugepage consolidation A lot of the code in arch/*/mm/hugetlbpage.c is quite similar. This patch attempts to consolidate a lot of the code across the arch's, putting the combined version in mm/hugetlb.c. There are a couple of uglyish hacks in order to covert all the hugepage archs, but the result is a very large reduction in the total amount of code. It also means things like hugepage lazy allocation could be implemented in one place, instead of six. Tested, at least a little, on ppc64, i386 and x86_64. Notes: - this patch changes the meaning of set_huge_pte() to be more analagous to set_pte() - does SH4 need s special huge_ptep_get_and_clear()?? Acked-by: William Lee Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6af1ae4a8211..f529d1442815 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -4,6 +4,7 @@ #ifdef CONFIG_HUGETLB_PAGE #include +#include struct ctl_table; @@ -22,12 +23,6 @@ int hugetlb_report_meminfo(char *); int hugetlb_report_node_meminfo(int, char *); int is_hugepage_mem_enough(size_t); unsigned long hugetlb_total_pages(void); -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write); -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write); -int is_aligned_hugepage_range(unsigned long addr, unsigned long len); -int pmd_huge(pmd_t pmd); struct page *alloc_huge_page(void); void free_huge_page(struct page *); @@ -35,6 +30,17 @@ extern unsigned long max_huge_pages; extern const unsigned long hugetlb_zero, hugetlb_infinity; extern int sysctl_hugetlb_shm_group; +/* arch callbacks */ + +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr); +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); +struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, + int write); +struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int write); +int is_aligned_hugepage_range(unsigned long addr, unsigned long len); +int pmd_huge(pmd_t pmd); + #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ @@ -48,6 +54,28 @@ extern int sysctl_hugetlb_shm_group; int prepare_hugepage_range(unsigned long addr, unsigned long len); #endif +#ifndef ARCH_HAS_SETCLEAR_HUGE_PTE +#define set_huge_pte_at(mm, addr, ptep, pte) set_pte_at(mm, addr, ptep, pte) +#define huge_ptep_get_and_clear(mm, addr, ptep) ptep_get_and_clear(mm, addr, ptep) +#else +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); +#endif + +#ifndef ARCH_HAS_HUGETLB_PREFAULT_HOOK +#define hugetlb_prefault_arch_hook(mm) do { } while (0) +#else +void hugetlb_prefault_arch_hook(struct mm_struct *mm); +#endif + +#ifndef ARCH_HAS_HUGETLB_CLEAN_STALE_PGTABLE +#define hugetlb_clean_stale_pgtable(pte) BUG() +#else +void hugetlb_clean_stale_pgtable(pte_t *pte); +#endif + #else /* !CONFIG_HUGETLB_PAGE */ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) -- cgit v1.2.3 From e7c8d5c9955a4d2e88e36b640563f5d6d5aba48a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 21 Jun 2005 17:14:47 -0700 Subject: [PATCH] node local per-cpu-pages This patch modifies the way pagesets in struct zone are managed. Each zone has a per-cpu array of pagesets. So any particular CPU has some memory in each zone structure which belongs to itself. Even if that CPU is not local to that zone. So the patch relocates the pagesets for each cpu to the node that is nearest to the cpu instead of allocating the pagesets in the (possibly remote) target zone. This means that the operations to manage pages on remote zone can be done with information available locally. We play a macro trick so that non-NUMA pmachines avoid the additional pointer chase on the page allocator fastpath. AIM7 benchmark on a 32 CPU SGI Altix w/o patches: Tasks jobs/min jti jobs/min/task real cpu 1 484.68 100 484.6769 12.01 1.97 Fri Mar 25 11:01:42 2005 100 27140.46 89 271.4046 21.44 148.71 Fri Mar 25 11:02:04 2005 200 30792.02 82 153.9601 37.80 296.72 Fri Mar 25 11:02:42 2005 300 32209.27 81 107.3642 54.21 451.34 Fri Mar 25 11:03:37 2005 400 34962.83 78 87.4071 66.59 588.97 Fri Mar 25 11:04:44 2005 500 31676.92 75 63.3538 91.87 742.71 Fri Mar 25 11:06:16 2005 600 36032.69 73 60.0545 96.91 885.44 Fri Mar 25 11:07:54 2005 700 35540.43 77 50.7720 114.63 1024.28 Fri Mar 25 11:09:49 2005 800 33906.70 74 42.3834 137.32 1181.65 Fri Mar 25 11:12:06 2005 900 34120.67 73 37.9119 153.51 1325.26 Fri Mar 25 11:14:41 2005 1000 34802.37 74 34.8024 167.23 1465.26 Fri Mar 25 11:17:28 2005 with slab API changes and pageset patch: Tasks jobs/min jti jobs/min/task real cpu 1 485.00 100 485.0000 12.00 1.96 Fri Mar 25 11:46:18 2005 100 28000.96 89 280.0096 20.79 150.45 Fri Mar 25 11:46:39 2005 200 32285.80 79 161.4290 36.05 293.37 Fri Mar 25 11:47:16 2005 300 40424.15 84 134.7472 43.19 438.42 Fri Mar 25 11:47:59 2005 400 39155.01 79 97.8875 59.46 590.05 Fri Mar 25 11:48:59 2005 500 37881.25 82 75.7625 76.82 730.19 Fri Mar 25 11:50:16 2005 600 39083.14 78 65.1386 89.35 872.79 Fri Mar 25 11:51:46 2005 700 38627.83 77 55.1826 105.47 1022.46 Fri Mar 25 11:53:32 2005 800 39631.94 78 49.5399 117.48 1169.94 Fri Mar 25 11:55:30 2005 900 36903.70 79 41.0041 141.94 1310.78 Fri Mar 25 11:57:53 2005 1000 36201.23 77 36.2012 160.77 1458.31 Fri Mar 25 12:00:34 2005 Signed-off-by: Christoph Lameter Signed-off-by: Shobhit Dayal Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++++ include/linux/mmzone.h | 11 ++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 17518fe0b311..1813b162b0a8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -691,6 +691,12 @@ extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); +#ifdef CONFIG_NUMA +extern void setup_per_cpu_pageset(void); +#else +static inline void setup_per_cpu_pageset(void) {} +#endif + /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 18fed8b67943..4733d35d8223 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -63,6 +63,12 @@ struct per_cpu_pageset { #endif } ____cacheline_aligned_in_smp; +#ifdef CONFIG_NUMA +#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)]) +#else +#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)]) +#endif + #define ZONE_DMA 0 #define ZONE_NORMAL 1 #define ZONE_HIGHMEM 2 @@ -122,8 +128,11 @@ struct zone { */ unsigned long lowmem_reserve[MAX_NR_ZONES]; +#ifdef CONFIG_NUMA + struct per_cpu_pageset *pageset[NR_CPUS]; +#else struct per_cpu_pageset pageset[NR_CPUS]; - +#endif /* * free areas of different sizes */ -- cgit v1.2.3 From 1363c3cd8603a913a27e2995dccbd70d5312d8e6 Mon Sep 17 00:00:00 2001 From: Wolfgang Wander Date: Tue, 21 Jun 2005 17:14:49 -0700 Subject: [PATCH] Avoiding mmap fragmentation Ingo recently introduced a great speedup for allocating new mmaps using the free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and causes huge performance increases in thread creation. The downside of this patch is that it does lead to fragmentation in the mmap-ed areas (visible via /proc/self/maps), such that some applications that work fine under 2.4 kernels quickly run out of memory on any 2.6 kernel. The problem is twofold: 1) the free_area_cache is used to continue a search for memory where the last search ended. Before the change new areas were always searched from the base address on. So now new small areas are cluttering holes of all sizes throughout the whole mmap-able region whereas before small holes tended to close holes near the base leaving holes far from the base large and available for larger requests. 2) the free_area_cache also is set to the location of the last munmap-ed area so in scenarios where we allocate e.g. five regions of 1K each, then free regions 4 2 3 in this order the next request for 1K will be placed in the position of the old region 3, whereas before we appended it to the still active region 1, placing it at the location of the old region 2. Before we had 1 free region of 2K, now we only get two free regions of 1K -> fragmentation. The patch addresses thes issues by introducing yet another cache descriptor cached_hole_size that contains the largest known hole size below the current free_area_cache. If a new request comes in the size is compared against the cached_hole_size and if the request can be filled with a hole below free_area_cache the search is started from the base instead. The results look promising: Whereas 2.6.12-rc4 fragments quickly and my (earlier posted) leakme.c test program terminates after 50000+ iterations with 96 distinct and fragmented maps in /proc/self/maps it performs nicely (as expected) with thread creation, Ingo's test_str02 with 20000 threads requires 0.7s system time. Taking out Ingo's patch (un-patch available per request) by basically deleting all mentions of free_area_cache from the kernel and starting the search for new memory always at the respective bases we observe: leakme terminates successfully with 11 distinctive hardly fragmented areas in /proc/self/maps but thread creating is gringdingly slow: 30+s(!) system time for Ingo's test_str02 with 20000 threads. Now - drumroll ;-) the appended patch works fine with leakme: it ends with only 7 distinct areas in /proc/self/maps and also thread creation seems sufficiently fast with 0.71s for 20000 threads. Signed-off-by: Wolfgang Wander Credit-to: "Richard Purdie" Signed-off-by: Ken Chen Acked-by: Ingo Molnar (partly) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4dbb109022f3..b58afd97a180 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -201,8 +201,8 @@ extern unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -extern void arch_unmap_area(struct vm_area_struct *area); -extern void arch_unmap_area_topdown(struct vm_area_struct *area); +extern void arch_unmap_area(struct mm_struct *, unsigned long); +extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #define set_mm_counter(mm, member, value) (mm)->_##member = (value) #define get_mm_counter(mm, member) ((mm)->_##member) @@ -218,9 +218,10 @@ struct mm_struct { unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); - void (*unmap_area) (struct vm_area_struct *area); - unsigned long mmap_base; /* base of mmap area */ - unsigned long free_area_cache; /* first hole */ + void (*unmap_area) (struct mm_struct *mm, unsigned long addr); + unsigned long mmap_base; /* base of mmap area */ + unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ + unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ -- cgit v1.2.3 From cbe37d093707762fc0abb280781e6a82a9d8d568 Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Tue, 21 Jun 2005 17:14:52 -0700 Subject: [PATCH] mm: remove PG_highmem Remove PG_highmem, to save a page flag. Use is_highmem() instead. It'll generate a little more code, but we don't use PageHigheMem() in many places. Signed-off-by: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 39ab8c6b5652..df313891db10 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -61,21 +61,20 @@ #define PG_active 6 #define PG_slab 7 /* slab debug (Suparna wants this) */ -#define PG_highmem 8 -#define PG_checked 9 /* kill me in 2.5.. */ -#define PG_arch_1 10 -#define PG_reserved 11 - -#define PG_private 12 /* Has something at ->private */ -#define PG_writeback 13 /* Page is under writeback */ -#define PG_nosave 14 /* Used for system suspend/resume */ -#define PG_compound 15 /* Part of a compound page */ - -#define PG_swapcache 16 /* Swap page: swp_entry_t in private */ -#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ -#define PG_reclaim 18 /* To be reclaimed asap */ -#define PG_nosave_free 19 /* Free, should not be written */ -#define PG_uncached 20 /* Page has been mapped as uncached */ +#define PG_checked 8 /* kill me in 2.5.. */ +#define PG_arch_1 9 +#define PG_reserved 10 +#define PG_private 11 /* Has something at ->private */ + +#define PG_writeback 12 /* Page is under writeback */ +#define PG_nosave 13 /* Used for system suspend/resume */ +#define PG_compound 14 /* Part of a compound page */ +#define PG_swapcache 15 /* Swap page: swp_entry_t in private */ + +#define PG_mappedtodisk 16 /* Has blocks allocated on-disk */ +#define PG_reclaim 17 /* To be reclaimed asap */ +#define PG_nosave_free 18 /* Free, should not be written */ +#define PG_uncached 19 /* Page has been mapped as uncached */ /* * Global page accounting. One instance per CPU. Only unsigned longs are @@ -215,7 +214,7 @@ extern void __mod_page_state(unsigned offset, unsigned long delta); #define TestSetPageSlab(page) test_and_set_bit(PG_slab, &(page)->flags) #ifdef CONFIG_HIGHMEM -#define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) +#define PageHighMem(page) is_highmem(page_zone(page)) #else #define PageHighMem(page) 0 /* needed to optimize away at compile time */ #endif -- cgit v1.2.3 From 1ad539b2bd89bf2e129123eb24d5bcc4484a35de Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Tue, 21 Jun 2005 17:14:53 -0700 Subject: [PATCH] vm: try_to_free_pages unused argument try_to_free_pages accepts a third argument, order, but hasn't used it since before 2.6.0. The following patch removes the argument and updates all the calls to try_to_free_pages. Signed-off-by: Darren Hart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0d21e682d99d..2343f999e6e1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -172,7 +172,7 @@ extern int rotate_reclaimable_page(struct page *page); extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern int try_to_free_pages(struct zone **, unsigned int, unsigned int); +extern int try_to_free_pages(struct zone **, unsigned int); extern int zone_reclaim(struct zone *, unsigned int, unsigned int); extern int shrink_all_memory(int); extern int vm_swappiness; -- cgit v1.2.3 From 83e5d8f7253cb7b14472385a6d57df1e9f848e8e Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 21 Jun 2005 17:14:54 -0700 Subject: [PATCH] __mod_page_state(): pass unsigned long instead of unsigned By making the offset argument of __mod_page_state an unsigned long instead of unsigned, we can avoid forcing the compiler to sign extend a usually constant argument. This saves 1 instruction on x86-64. Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index df313891db10..f2ee9b2332e3 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -136,7 +136,7 @@ struct page_state { extern void get_page_state(struct page_state *ret); extern void get_full_page_state(struct page_state *ret); extern unsigned long __read_page_state(unsigned offset); -extern void __mod_page_state(unsigned offset, unsigned long delta); +extern void __mod_page_state(unsigned long offset, unsigned long delta); #define read_page_state(member) \ __read_page_state(offsetof(struct page_state, member)) -- cgit v1.2.3 From c2f29ea111e3344ed48257c2a142c3db514e1529 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 21 Jun 2005 17:14:55 -0700 Subject: [PATCH] __read_page_state(): pass unsigned long instead of unsigned By making the offset argument of __read_page_state an unsigned long instead of unsigned, we can avoid forcing the compiler to sign extend a usually constant argument. This saves 1 instruction on x86-64. Signed-off-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f2ee9b2332e3..f5a6695d4d21 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -135,7 +135,7 @@ struct page_state { extern void get_page_state(struct page_state *ret); extern void get_full_page_state(struct page_state *ret); -extern unsigned long __read_page_state(unsigned offset); +extern unsigned long __read_page_state(unsigned long offset); extern void __mod_page_state(unsigned long offset, unsigned long delta); #define read_page_state(member) \ -- cgit v1.2.3 From 4ae7c03943fca73f23bc0cdb938070f41b98101f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 21 Jun 2005 17:14:57 -0700 Subject: [PATCH] Periodically drain non local pagesets The pageset array can potentially acquire a huge amount of memory on large NUMA systems. F.e. on a system with 512 processors and 256 nodes there will be 256*512 pagesets. If each pageset only holds 5 pages then we are talking about 655360 pages.With a 16K page size on IA64 this results in potentially 10 Gigabytes of memory being trapped in pagesets. The typical cases are much less for smaller systems but there is still the potential of memory being trapped in off node pagesets. Off node memory may be rarely used if local memory is available and so we may potentially have memory in seldom used pagesets without this patch. The slab allocator flushes its per cpu caches every 2 seconds. The following patch flushes the off node pageset caches in the same way by tying into the slab flush. The patch also changes /proc/zoneinfo to include the number of pages currently in each pageset. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 208535fd4832..8d6bf608b199 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -133,5 +133,10 @@ extern void FASTCALL(free_cold_page(struct page *page)); #define free_page(addr) free_pages((addr),0) void page_alloc_init(void); +#ifdef CONFIG_NUMA +void drain_remote_pages(void); +#else +static inline void drain_remote_pages(void) { }; +#endif #endif /* __LINUX_GFP_H */ -- cgit v1.2.3 From f14f75b81187cdbe10cc53a521bf9fdf97b59f8c Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Tue, 21 Jun 2005 17:15:02 -0700 Subject: [PATCH] ia64 uncached alloc This patch contains the ia64 uncached page allocator and the generic allocator (genalloc). The uncached allocator was formerly part of the SN2 mspec driver but there are several other users of it so it has been split off from the driver. The generic allocator can be used by device driver to manage special memory etc. The generic allocator is based on the allocator from the sym53c8xx_2 driver. Various users on ia64 needs uncached memory. The SGI SN architecture requires it for inter-partition communication between partitions within a large NUMA cluster. The specific user for this is the XPC code. Another application is large MPI style applications which use it for synchronization, on SN this can be done using special 'fetchop' operations but it also benefits non SN hardware which may use regular uncached memory for this purpose. Performance of doing this through uncached vs cached memory is pretty substantial. This is handled by the mspec driver which I will push out in a seperate patch. Rather than creating a specific allocator for just uncached memory I came up with genalloc which is a generic purpose allocator that can be used by device drivers and other subsystems as they please. For instance to handle onboard device memory. It was derived from the sym53c7xx_2 driver's allocator which is also an example of a potential user (I am refraining from modifying sym2 right now as it seems to have been under fairly heavy development recently). On ia64 memory has various properties within a granule, ie. it isn't safe to access memory as uncached within the same granule as currently has memory accessed in cached mode. The regular system therefore doesn't utilize memory in the lower granules which is mixed in with device PAL code etc. The uncached driver walks the EFI memmap and pulls out the spill uncached pages and sticks them into the uncached pool. Only after these chunks have been utilized, will it start converting regular cached memory into uncached memory. Hence the reason for the EFI related code additions. Signed-off-by: Jes Sorensen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 include/linux/genalloc.h (limited to 'include/linux') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h new file mode 100644 index 000000000000..7fd0576a4454 --- /dev/null +++ b/include/linux/genalloc.h @@ -0,0 +1,40 @@ +/* + * Basic general purpose allocator for managing special purpose memory + * not managed by the regular kmalloc/kfree interface. + * Uses for this includes on-device special memory, uncached memory + * etc. + * + * This code is based on the buddy allocator found in the sym53c8xx_2 + * driver, adapted for general purpose use. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include + +#define ALLOC_MIN_SHIFT 5 /* 32 bytes minimum */ +/* + * Link between free memory chunks of a given size. + */ +struct gen_pool_link { + struct gen_pool_link *next; +}; + +/* + * Memory pool descriptor. + */ +struct gen_pool { + spinlock_t lock; + unsigned long (*get_new_chunk)(struct gen_pool *); + struct gen_pool *next; + struct gen_pool_link *h; + unsigned long private; + int max_chunk_shift; +}; + +unsigned long gen_pool_alloc(struct gen_pool *poolp, int size); +void gen_pool_free(struct gen_pool *mp, unsigned long ptr, int size); +struct gen_pool *gen_pool_create(int nr_chunks, int max_chunk_shift, + unsigned long (*fp)(struct gen_pool *), + unsigned long data); -- cgit v1.2.3 From 5b37b700f7c491a9320f4e29472bbaf23dded8fd Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 21 Jun 2005 17:15:18 -0700 Subject: [PATCH] ppc32: Added support for new MPC8548 family of PowerQUICC III processors Added descriptions of the new MPC8548 family processors, e500 core and peripherals. Signed-off-by: Kumar Gala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fsl_devices.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h index faaff4c64559..70f54af87b9f 100644 --- a/include/linux/fsl_devices.h +++ b/include/linux/fsl_devices.h @@ -51,6 +51,7 @@ struct gianfar_platform_data { /* board specific information */ u32 board_flags; + u32 phy_flags; u32 phyid; u32 interruptPHY; u8 mac_addr[6]; @@ -61,9 +62,14 @@ struct gianfar_platform_data { #define FSL_GIANFAR_DEV_HAS_COALESCE 0x00000002 #define FSL_GIANFAR_DEV_HAS_RMON 0x00000004 #define FSL_GIANFAR_DEV_HAS_MULTI_INTR 0x00000008 +#define FSL_GIANFAR_DEV_HAS_CSUM 0x00000010 +#define FSL_GIANFAR_DEV_HAS_VLAN 0x00000020 +#define FSL_GIANFAR_DEV_HAS_EXTENDED_HASH 0x00000040 +#define FSL_GIANFAR_DEV_HAS_PADDING 0x00000080 /* Flags in gianfar_platform_data */ -#define FSL_GIANFAR_BRD_HAS_PHY_INTR 0x00000001 /* if not set use a timer */ +#define FSL_GIANFAR_BRD_HAS_PHY_INTR 0x00000001 /* set or use a timer */ +#define FSL_GIANFAR_BRD_IS_REDUCED 0x00000002 /* Set if RGMII, RMII */ struct fsl_i2c_platform_data { /* device specific information */ -- cgit v1.2.3 From 22329b511a97557b293583194037d1f4c71e1504 Mon Sep 17 00:00:00 2001 From: Brent Casavant Date: Tue, 21 Jun 2005 17:15:59 -0700 Subject: [PATCH] ioc4: Core driver rewrite This series of patches reworks the configuration and internal structure of the SGI IOC4 I/O controller device drivers. These changes are motivated by several factors: - The IOC4 chip PCI resources are of mixed use between functions (i.e. multiple functions are handled in the same address range, sometimes within the same register), muddling resource ownership and initialization issues. Centralizing this ownership in a core driver is desirable. - The IOC4 chip implements multiple functions (serial, IDE, others not yet implemented in the mainline kernel) but is not a multifunction PCI device. In order to properly handle device addition and removal as well as module insertion and deletion, an intermediary IOC4-specific driver layer is needed to handle these operations cleanly. - All IOC4 drivers are currently enabled by a single CONFIG value. As not all systems need all IOC4 functions, it is desireable to enable these drivers independently. - The current IOC4 core driver will trigger loading of all function-level drivers, as it makes direct calls to them. This situation should be reversed (i.e. function-level drivers cause loading of core driver) in order to maintain a clear and least-surprise driver loading model. - IOC4 hardware design necessitates some driver-level dependency on the PCI bus clock speed. Current code assumes a 66MHz bus, but the speed should be autodetected and appropriate compensation taken. This patch series effects the above changes by a newly and better designed IOC4 core driver with which the function-level drivers can register and deregister themselves upon module insertion/removal. By tracking these modules, device addition/removal is also handled properly. PCI resource management and ownership issues are centralized in this core driver, and IOC4-wide configuration actions such as bus speed detection are also handled in this core driver. This patch: The SGI IOC4 I/O controller chip implements multiple functions, though it is not a multi-function PCI device. Additionally, various PCI resources of the IOC4 are shared by multiple hardware functions, and thus resource ownership by driver is not clearly delineated. Due to the current driver design, all core and subordinate drivers must be loaded, or none, which is undesirable if not all IOC4 hardware features are being used. This patch reorganizes the IOC4 drivers so that the core driver provides a subdriver registration service. Through appropriate callbacks the subdrivers can now handle device addition and removal, as well as module insertion and deletion (though the IOC4 IDE driver requires further work before module deletion will work). The core driver now takes care of allocating PCI resources and data which must be shared between subdrivers, to clearly delineate module ownership of these items. Signed-off-by: Brent Casavant Acked-by: Pat Gefre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioc4.h | 156 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/ioc4_common.h | 21 ------ 2 files changed, 156 insertions(+), 21 deletions(-) create mode 100644 include/linux/ioc4.h delete mode 100644 include/linux/ioc4_common.h (limited to 'include/linux') diff --git a/include/linux/ioc4.h b/include/linux/ioc4.h new file mode 100644 index 000000000000..729bfa4c4ac5 --- /dev/null +++ b/include/linux/ioc4.h @@ -0,0 +1,156 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2005 Silicon Graphics, Inc. All Rights Reserved. + */ + +#ifndef _LINUX_IOC4_H +#define _LINUX_IOC4_H + +#include + +/*********************************** + * Structures needed by subdrivers * + ***********************************/ + +/* This structure fully describes the IOC4 miscellaneous registers which + * appear at bar[0]+0x00000 through bar[0]+0x0005c. The corresponding + * PCI resource is managed by the main IOC4 driver because it contains + * registers of interest to many different IOC4 subdrivers. + */ +struct ioc4_misc_regs { + /* Miscellaneous IOC4 registers */ + union ioc4_pci_err_addr_l { + uint32_t raw; + struct { + uint32_t valid:1; /* Address captured */ + uint32_t master_id:4; /* Unit causing error + * 0/1: Serial port 0 TX/RX + * 2/3: Serial port 1 TX/RX + * 4/5: Serial port 2 TX/RX + * 6/7: Serial port 3 TX/RX + * 8: ATA/ATAPI + * 9-15: Undefined + */ + uint32_t mul_err:1; /* Multiple errors occurred */ + uint32_t addr:26; /* Bits 31-6 of error addr */ + } fields; + } pci_err_addr_l; + uint32_t pci_err_addr_h; /* Bits 63-32 of error addr */ + union ioc4_sio_int { + uint32_t raw; + struct { + uint8_t tx_mt:1; /* TX ring buffer empty */ + uint8_t rx_full:1; /* RX ring buffer full */ + uint8_t rx_high:1; /* RX high-water exceeded */ + uint8_t rx_timer:1; /* RX timer has triggered */ + uint8_t delta_dcd:1; /* DELTA_DCD seen */ + uint8_t delta_cts:1; /* DELTA_CTS seen */ + uint8_t intr_pass:1; /* Interrupt pass-through */ + uint8_t tx_explicit:1; /* TX, MCW, or delay complete */ + } fields[4]; + } sio_ir; /* Serial interrupt state */ + union ioc4_other_int { + uint32_t raw; + struct { + uint32_t ata_int:1; /* ATA port passthru */ + uint32_t ata_memerr:1; /* ATA halted by mem error */ + uint32_t memerr:4; /* Serial halted by mem err */ + uint32_t kbd_int:1; /* kbd/mouse intr asserted */ + uint32_t reserved:16; /* zero */ + uint32_t rt_int:1; /* INT_OUT section latch */ + uint32_t gen_int:8; /* Intr. from generic pins */ + } fields; + } other_ir; /* Other interrupt state */ + union ioc4_sio_int sio_ies; /* Serial interrupt enable set */ + union ioc4_other_int other_ies; /* Other interrupt enable set */ + union ioc4_sio_int sio_iec; /* Serial interrupt enable clear */ + union ioc4_other_int other_iec; /* Other interrupt enable clear */ + union ioc4_sio_cr { + uint32_t raw; + struct { + uint32_t cmd_pulse:4; /* Bytebus strobe width */ + uint32_t arb_diag:3; /* PCI bus requester */ + uint32_t sio_diag_idle:1; /* Active ser req? */ + uint32_t ata_diag_idle:1; /* Active ATA req? */ + uint32_t ata_diag_active:1; /* ATA req is winner */ + uint32_t reserved:22; /* zero */ + } fields; + } sio_cr; + uint32_t unused1; + union ioc4_int_out { + uint32_t raw; + struct { + uint32_t count:16; /* Period control */ + uint32_t mode:3; /* Output signal shape */ + uint32_t reserved:11; /* zero */ + uint32_t diag:1; /* Timebase control */ + uint32_t int_out:1; /* Current value */ + } fields; + } int_out; /* External interrupt output control */ + uint32_t unused2; + union ioc4_gpcr { + uint32_t raw; + struct { + uint32_t dir:8; /* Pin direction */ + uint32_t edge:8; /* Edge/level mode */ + uint32_t reserved1:4; /* zero */ + uint32_t int_out_en:1; /* INT_OUT enable */ + uint32_t reserved2:11; /* zero */ + } fields; + } gpcr_s; /* Generic PIO control set */ + union ioc4_gpcr gpcr_c; /* Generic PIO control clear */ + union ioc4_gpdr { + uint32_t raw; + struct { + uint32_t gen_pin:8; /* State of pins */ + uint32_t reserved:24; + } fields; + } gpdr; /* Generic PIO data */ + uint32_t unused3; + union ioc4_gppr { + uint32_t raw; + struct { + uint32_t gen_pin:1; /* Single pin state */ + uint32_t reserved:31; + } fields; + } gppr[8]; /* Generic PIO pins */ +}; + +/* One of these per IOC4 + * + * The idd_serial_data field is present here, even though it's used + * solely by the serial subdriver, because the main IOC4 module + * properly owns pci_{get,set}_drvdata functionality. This field + * allows that subdriver to stash its own drvdata somewhere. + */ +struct ioc4_driver_data { + struct list_head idd_list; + unsigned long idd_bar0; + struct pci_dev *idd_pdev; + const struct pci_device_id *idd_pci_id; + struct __iomem ioc4_misc_regs *idd_misc_regs; + void *idd_serial_data; +}; + +/* One per submodule */ +struct ioc4_submodule { + struct list_head is_list; + char *is_name; + struct module *is_owner; + int (*is_probe) (struct ioc4_driver_data *); + int (*is_remove) (struct ioc4_driver_data *); +}; + +#define IOC4_NUM_CARDS 8 /* max cards per partition */ + +/********************************** + * Functions needed by submodules * + **********************************/ + +extern int ioc4_register_submodule(struct ioc4_submodule *); +extern void ioc4_unregister_submodule(struct ioc4_submodule *); + +#endif /* _LINUX_IOC4_H */ diff --git a/include/linux/ioc4_common.h b/include/linux/ioc4_common.h deleted file mode 100644 index b03bcc46df55..000000000000 --- a/include/linux/ioc4_common.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (c) 2005 Silicon Graphics, Inc. All Rights Reserved. - */ - -#ifndef _LINUX_IOC4_COMMON_H -#define _LINUX_IOC4_COMMON_H - -/* prototypes */ - -int ioc4_serial_init(void); - -int ioc4_serial_attach_one(struct pci_dev *pdev, const struct - pci_device_id *pci_id); -int ioc4_ide_attach_one(struct pci_dev *pdev, const struct - pci_device_id *pci_id); - -#endif /* _LINUX_IOC4_COMMON_H */ -- cgit v1.2.3 From d4c477ca5448f19afaaf6c0cfd655009ea9e614d Mon Sep 17 00:00:00 2001 From: Brent Casavant Date: Tue, 21 Jun 2005 17:16:01 -0700 Subject: [PATCH] ioc4: PCI bus speed detection Several hardware features of SGI's IOC4 I/O controller chip require timing-related driver calculations dependent upon the PCI bus speed. This patch enables the core IOC4 driver code to detect the actual bus speed and store a value that can later be used by the IOC4 subdrivers as needed. Signed-off-by: Brent Casavant Acked-by: Pat Gefre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioc4.h | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioc4.h b/include/linux/ioc4.h index 729bfa4c4ac5..3dd18b785ebd 100644 --- a/include/linux/ioc4.h +++ b/include/linux/ioc4.h @@ -11,6 +11,14 @@ #include +/*************** + * Definitions * + ***************/ + +/* Miscellaneous values inherent to hardware */ + +#define IOC4_EXTINT_COUNT_DIVISOR 520 /* PCI clocks per COUNT tick */ + /*********************************** * Structures needed by subdrivers * ***********************************/ @@ -119,19 +127,34 @@ struct ioc4_misc_regs { } gppr[8]; /* Generic PIO pins */ }; -/* One of these per IOC4 - * - * The idd_serial_data field is present here, even though it's used - * solely by the serial subdriver, because the main IOC4 module - * properly owns pci_{get,set}_drvdata functionality. This field - * allows that subdriver to stash its own drvdata somewhere. - */ +/* Masks for GPCR DIR pins */ +#define IOC4_GPCR_DIR_0 0x01 /* External interrupt output */ +#define IOC4_GPCR_DIR_1 0x02 /* External interrupt input */ +#define IOC4_GPCR_DIR_2 0x04 +#define IOC4_GPCR_DIR_3 0x08 /* Keyboard/mouse presence */ +#define IOC4_GPCR_DIR_4 0x10 /* Ser. port 0 xcvr select (0=232, 1=422) */ +#define IOC4_GPCR_DIR_5 0x20 /* Ser. port 1 xcvr select (0=232, 1=422) */ +#define IOC4_GPCR_DIR_6 0x40 /* Ser. port 2 xcvr select (0=232, 1=422) */ +#define IOC4_GPCR_DIR_7 0x80 /* Ser. port 3 xcvr select (0=232, 1=422) */ + +/* Masks for GPCR EDGE pins */ +#define IOC4_GPCR_EDGE_0 0x01 +#define IOC4_GPCR_EDGE_1 0x02 /* External interrupt input */ +#define IOC4_GPCR_EDGE_2 0x04 +#define IOC4_GPCR_EDGE_3 0x08 +#define IOC4_GPCR_EDGE_4 0x10 +#define IOC4_GPCR_EDGE_5 0x20 +#define IOC4_GPCR_EDGE_6 0x40 +#define IOC4_GPCR_EDGE_7 0x80 + +/* One of these per IOC4 */ struct ioc4_driver_data { struct list_head idd_list; unsigned long idd_bar0; struct pci_dev *idd_pdev; const struct pci_device_id *idd_pci_id; struct __iomem ioc4_misc_regs *idd_misc_regs; + unsigned long count_period; void *idd_serial_data; }; -- cgit v1.2.3 From dbce706e2550253c5ab6043f4f5dfde0cd02470f Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Tue, 21 Jun 2005 17:16:19 -0700 Subject: [PATCH] uml: add and use generic hw_controller_type->release With Chris Wedgwood Currently UML must explicitly call the UML-specific free_irq_by_irq_and_dev() for each free_irq call it's done. This is needed because ->shutdown and/or ->disable are only called when the last "action" for that irq is removed. Instead, for UML shared IRQs (UML IRQs are very often, if not always, shared), for each dev_id some setup is done, which must be cleared on the release of that fd. For instance, for each open console a new instance (i.e. new dev_id) of the same IRQ is requested(). Exactly, a fd is stored in an array (pollfds), which is after read by a host thread and passed to poll(). Each event registered by poll() triggers an interrupt. So, for each free_irq() we must remove the corresponding host fd from the table, which we do via this -release() method. In this patch we add an appropriate hook for this, and remove all uses of it by pointing the hook to the said procedure; this is safe to do since the said procedure. Also some cosmetic improvements are included. This is heavily based on some work by Chris Wedgwood, which however didn't get the patch merged for something I'd call a "misunderstanding" (the need for this patch wasn't cleanly explained, thus adding the generic hook was felt as undesirable). Signed-off-by: Paolo 'Blaisorblade' Giarrusso CC: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/irq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index c3ff4d101667..b68ad80e2464 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -47,6 +47,7 @@ struct hw_interrupt_type { void (*ack)(unsigned int irq); void (*end)(unsigned int irq); void (*set_affinity)(unsigned int irq, cpumask_t dest); + void (*release)(unsigned int irq, void *dev_id); }; typedef struct hw_interrupt_type hw_irq_controller; -- cgit v1.2.3 From b77d6adc922b8bbf8b16b67f567958c42962cf88 Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso Date: Tue, 21 Jun 2005 17:16:24 -0700 Subject: [PATCH] uml: make hw_controller_type->release exist only for archs needing it With Chris Wedgwood As suggested by Chris, we can make the "just added" method ->release conditional to UML only (better: to archs requesting it, i.e. only UML currently), so that other archs don't get this unneeded crud, and if UML won't need it any more we can kill this. Signed-off-by: Paolo 'Blaisorblade' Giarrusso CC: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/irq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index b68ad80e2464..7fc1022be9ee 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -47,7 +47,10 @@ struct hw_interrupt_type { void (*ack)(unsigned int irq); void (*end)(unsigned int irq); void (*set_affinity)(unsigned int irq, cpumask_t dest); + /* Currently used only by UML, might disappear one day.*/ +#ifdef CONFIG_IRQ_RELEASE_METHOD void (*release)(unsigned int irq, void *dev_id); +#endif }; typedef struct hw_interrupt_type hw_irq_controller; -- cgit v1.2.3 From 8a96619145840c6eb50d85759f72d9337db411f7 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 21 Jun 2005 17:16:41 -0700 Subject: [PATCH] autofs4: subversion bump to identify these changes Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/auto_fs4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h index a1657fb99516..9343c89d843c 100644 --- a/include/linux/auto_fs4.h +++ b/include/linux/auto_fs4.h @@ -23,7 +23,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 4 -#define AUTOFS_PROTO_SUBVERSION 6 +#define AUTOFS_PROTO_SUBVERSION 7 /* Mask for expire behaviour */ #define AUTOFS_EXP_IMMEDIATE 1 -- cgit v1.2.3 From f5a9951c94e7a285a3d00648e3d790a7f016bd11 Mon Sep 17 00:00:00 2001 From: James Simmons Date: Tue, 21 Jun 2005 17:16:58 -0700 Subject: [PATCH] fbdev: iomove removal Since no one is using the inbuf, outbuf of struct fb_pixmap I removed their use in the framebuffer console. The idea is instead move the pixmap functionality below the accelerated functions intead of on top as the way it is now. If there is no objection please apply. This is against Linus latestr GIT tree. Thank you. Signed-off-by: James Simmons Cc: "Antonino A. Daplas" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fb.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index b468bf496547..e14942805dbd 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -524,11 +524,11 @@ struct fb_pixmap { u32 offset; /* current offset to buffer */ u32 buf_align; /* byte alignment of each bitmap */ u32 scan_align; /* alignment per scanline */ - u32 access_align; /* alignment per read/write */ + u32 access_align; /* alignment per read/write (bits) */ u32 flags; /* see FB_PIXMAP_* */ /* access methods */ - void (*outbuf)(struct fb_info *info, u8 *addr, u8 *src, unsigned int size); - u8 (*inbuf) (struct fb_info *info, u8 *addr); + void (*writeio)(struct fb_info *info, void __iomem *dst, void *src, unsigned int size); + void (*readio) (struct fb_info *info, void *dst, void __iomem *src, unsigned int size); }; @@ -816,12 +816,6 @@ extern int unregister_framebuffer(struct fb_info *fb_info); extern int fb_prepare_logo(struct fb_info *fb_info); extern int fb_show_logo(struct fb_info *fb_info); extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size); -extern void fb_iomove_buf_unaligned(struct fb_info *info, struct fb_pixmap *buf, - u8 *dst, u32 d_pitch, u8 *src, u32 idx, - u32 height, u32 shift_high, u32 shift_low, u32 mod); -extern void fb_iomove_buf_aligned(struct fb_info *info, struct fb_pixmap *buf, - u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, - u32 height); extern void fb_sysmove_buf_unaligned(struct fb_info *info, struct fb_pixmap *buf, u8 *dst, u32 d_pitch, u8 *src, u32 idx, u32 height, u32 shift_high, u32 shift_low, u32 mod); -- cgit v1.2.3 From 1154ea7dcd8eed758fb5ec47393a79d5a1f0bc43 Mon Sep 17 00:00:00 2001 From: Jaya Kumar Date: Tue, 21 Jun 2005 17:17:04 -0700 Subject: [PATCH] Framebuffer driver for Arc LCD board Add support for the Arc monochrome LCD board. The board uses KS108 controllers to drive individual 64x64 LCD matrices. The board can be paneled in a variety of setups such as 2x1=128x64, 4x4=256x256 and so on. The board/host interface is through GPIO. Signed-off-by: Jaya Kumar Cc: "Antonino A. Daplas" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/arcfb.h | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 include/linux/arcfb.h (limited to 'include/linux') diff --git a/include/linux/arcfb.h b/include/linux/arcfb.h new file mode 100644 index 000000000000..721e7654daeb --- /dev/null +++ b/include/linux/arcfb.h @@ -0,0 +1,8 @@ +#ifndef __LINUX_ARCFB_H__ +#define __LINUX_ARCFB_H__ + +#define FBIO_WAITEVENT _IO('F', 0x88) +#define FBIO_GETCONTROL2 _IOR('F', 0x89, size_t) + +#endif + -- cgit v1.2.3 From d5881eb4883ef7dd28a4dcea237714817c4b2f8e Mon Sep 17 00:00:00 2001 From: James Simmons Date: Tue, 21 Jun 2005 17:17:05 -0700 Subject: [PATCH] fbdev: new pci id for chipsfb Patch adds pci ID for CT 69000 chipset. Signed-off-by: James Simmons Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index b8b4ebf9abf1..63e89e47b8e9 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -575,6 +575,7 @@ #define PCI_DEVICE_ID_CT_65550 0x00e0 #define PCI_DEVICE_ID_CT_65554 0x00e4 #define PCI_DEVICE_ID_CT_65555 0x00e5 +#define PCI_DEVICE_ID_CT_69000 0x00c0 #define PCI_VENDOR_ID_MIRO 0x1031 #define PCI_DEVICE_ID_MIRO_36050 0x5601 -- cgit v1.2.3 From 303b86d9913eca0cbfc3c5cb41e7006f6e13b755 Mon Sep 17 00:00:00 2001 From: Jurriaan Date: Tue, 21 Jun 2005 17:17:06 -0700 Subject: [PATCH] New framebuffer fonts + updated 12x22 font available Improve the fonts for use with the framebuffer. I've added all the characters marked 'FIXME' in the sun12x22 font and created a 10x18 font (based on the sun12x22 font) and a 7x14 font (based on the vga8x16 font). This patch is non-intrusive, no options are enabled by default so most users won't notice a thing. I am placing my changes under the GPL, however, I've not seen any copyright notices on the sun12x22 font and the vga8x16 font which I derived my new fonts from so I don't know what the copyright status is. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/font.h | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index fc2d690c9d5f..8fc80a7d78ac 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -25,19 +25,23 @@ struct font_desc { #define VGA8x16_IDX 1 #define PEARL8x8_IDX 2 #define VGA6x11_IDX 3 -#define SUN8x16_IDX 4 -#define SUN12x22_IDX 5 -#define ACORN8x8_IDX 6 -#define MINI4x6_IDX 7 +#define FONT7x14_IDX 4 +#define FONT10x18_IDX 5 +#define SUN8x16_IDX 6 +#define SUN12x22_IDX 7 +#define ACORN8x8_IDX 8 +#define MINI4x6_IDX 9 extern struct font_desc font_vga_8x8, - font_vga_8x16, - font_pearl_8x8, - font_vga_6x11, - font_sun_8x16, - font_sun_12x22, - font_acorn_8x8, - font_mini_4x6; + font_vga_8x16, + font_pearl_8x8, + font_vga_6x11, + font_7x14, + font_10x18, + font_sun_8x16, + font_sun_12x22, + font_acorn_8x8, + font_mini_4x6; /* Find a font with a specific name */ -- cgit v1.2.3 From f1ab5dac251bb4514607918b0019a3b3f5f5fb48 Mon Sep 17 00:00:00 2001 From: James Simmons Date: Tue, 21 Jun 2005 17:17:07 -0700 Subject: [PATCH] fbdev: stack reduction Shrink the stack when calling the drawing alignment functions. Signed-off-by: James Simmons Cc: "Antonino A. Daplas" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fb.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index e14942805dbd..bc24beeed971 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -816,12 +816,9 @@ extern int unregister_framebuffer(struct fb_info *fb_info); extern int fb_prepare_logo(struct fb_info *fb_info); extern int fb_show_logo(struct fb_info *fb_info); extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size); -extern void fb_sysmove_buf_unaligned(struct fb_info *info, struct fb_pixmap *buf, - u8 *dst, u32 d_pitch, u8 *src, u32 idx, +extern void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx, u32 height, u32 shift_high, u32 shift_low, u32 mod); -extern void fb_sysmove_buf_aligned(struct fb_info *info, struct fb_pixmap *buf, - u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, - u32 height); +extern void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u32 height); extern void fb_set_suspend(struct fb_info *info, int state); extern int fb_get_color_depth(struct fb_var_screeninfo *var); extern int fb_get_options(char *name, char **option); -- cgit v1.2.3 From 06d91a5fe0b50c9060e70bdf7786f8a3c66249db Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:12 -0700 Subject: [PATCH] md: improve locking on 'safemode' and move superblock writes When md marks the superblock dirty before a write, it calls generic_make_request (to write the superblock) from within generic_make_request (to write the first dirty block), which could cause problems later. With this patch, the superblock write is always done by the helper thread, and write request are delayed until that write completes. Also, the locking around marking the array dirty and writing the superblock is improved to avoid possible races. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md.h | 2 +- include/linux/raid/md_k.h | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index a6a67d102bfa..cfde8f497d6d 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -69,7 +69,7 @@ extern mdk_thread_t * md_register_thread (void (*run) (mddev_t *mddev), extern void md_unregister_thread (mdk_thread_t *thread); extern void md_wakeup_thread(mdk_thread_t *thread); extern void md_check_recovery(mddev_t *mddev); -extern void md_write_start(mddev_t *mddev); +extern int md_write_start(mddev_t *mddev, struct bio *bi); extern void md_write_end(mddev_t *mddev); extern void md_handle_safemode(mddev_t *mddev); extern void md_done_sync(mddev_t *mddev, int blocks, int ok); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index c9a0d4013be7..d92db54255a3 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -15,6 +15,9 @@ #ifndef _MD_K_H #define _MD_K_H +/* and dm-bio-list.h is not under include/linux because.... ??? */ +#include "../../../drivers/md/dm-bio-list.h" + #define MD_RESERVED 0UL #define LINEAR 1UL #define RAID0 2UL @@ -252,6 +255,10 @@ struct mddev_s atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; sector_t recovery_cp; + + spinlock_t write_lock; + struct bio_list write_list; + unsigned int safemode; /* if set, update "clean" superblock * when no writes pending. */ -- cgit v1.2.3 From 57afd89f98a990747445f01c458ecae64263b2f8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:13 -0700 Subject: [PATCH] md: improve the interface to sync_request 1/ change the return value (which is number-of-sectors synced) from 'int' to 'sector_t'. The number of sectors is usually easily small enough to fit in an int, but if resync needs to abort, it may want to return the total number of remaining sectors, which could be large. Also errors cannot be returned as negative numbers now, so use 0 instead 2/ Add a 'skipped' return parameter to allow the array to report that it skipped the sectors. This allows md to take this into account in the speed calculations. Currently there is no important skipping, but the bitmap-based-resync that is coming will use this. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index d92db54255a3..bce0032decff 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -298,7 +298,7 @@ struct mdk_personality_s int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev); int (*hot_remove_disk) (mddev_t *mddev, int number); int (*spare_active) (mddev_t *mddev); - int (*sync_request)(mddev_t *mddev, sector_t sector_nr, int go_faster); + sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); int (*resize) (mddev_t *mddev, sector_t sectors); int (*reshape) (mddev_t *mddev, int raid_disks); int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); -- cgit v1.2.3 From 32a7627cf3a35396a8e834faf34e38ae9f3b1309 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:14 -0700 Subject: [PATCH] md: optimised resync using Bitmap based intent logging With this patch, the intent to write to some block in the array can be logged to a bitmap file. Each bit represents some number of sectors and is set before any update happens, and only cleared when all writes relating to all sectors are complete. After an unclean shutdown, information in this bitmap can be used to optimise resync - only sectors which could be out-of-sync need to be updated. Also if a drive is removed and then added back into an array, the recovery can make use of the bitmap to optimise reconstruction. This is not implemented in this patch. Currently the bitmap is stored in a file which must (obviously) be stored on a separate device. The patch only provided infrastructure. It does not update any personalities to bitmap intent logging. Md arrays can still be used with no bitmap file. This patch has minimal impact on such arrays. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/bitmap.h | 280 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/raid/md_k.h | 4 + include/linux/raid/md_u.h | 7 ++ 3 files changed, 291 insertions(+) create mode 100644 include/linux/raid/bitmap.h (limited to 'include/linux') diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h new file mode 100644 index 000000000000..f785cf26cbad --- /dev/null +++ b/include/linux/raid/bitmap.h @@ -0,0 +1,280 @@ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR 3 +#define BITMAP_MINOR 38 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared aswell, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SIZE 512 +#define BITMAP_BLOCK_SHIFT 9 + +/* how many blocks per chunk? (this is variable) */ +#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) + +/* when hijacked, the counters and bits represent even larger "chunks" */ +/* there will be 1024 chunks represented by each counter in the page pointers */ +#define PAGEPTR_BLOCK_RATIO(bitmap) \ + (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) +#define PAGEPTR_BLOCK_SHIFT(bitmap) \ + (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) +#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ + +/* map chunks (bits) to file pages - offset by the size of the superblock */ +#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_ACTIVE = 0x001, /* the bitmap is in use */ + BITMAP_STALE = 0x002 /* the bitmap file is out of date or had -EIO */ +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __u32 magic; /* 0 BITMAP_MAGIC */ + __u32 version; /* 4 the bitmap major for now, could change... */ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __u64 events; /* 24 event counter for the bitmap (1)*/ + __u64 events_cleared;/*32 event counter when last bit cleared (2) */ + __u64 sync_size; /* 40 the size of the md device's sync range(3) */ + __u32 state; /* 48 bitmap state information */ + __u32 chunksize; /* 52 the bitmap chunk size in bytes */ + __u32 daemon_sleep; /* 56 seconds between disk flushes */ + + __u8 pad[256 - 60]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. + */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * count of dirty bits on the page + */ + unsigned int count:31; +}; + +/* keep track of bitmap file pages that have pending writes on them */ +struct page_list { + struct list_head list; + struct page *page; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + struct bitmap_page *bp; + unsigned long pages; /* total number of pages in the bitmap */ + unsigned long missing_pages; /* number of pages not yet allocated */ + + mddev_t *mddev; /* the md device that the bitmap is for */ + + int counter_bits; /* how many bits per block counter */ + + /* bitmap chunksize -- how much data does each bit represent? */ + unsigned long chunksize; + unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunks; /* total number of data chunks for the array */ + + /* We hold a count on the chunk currently being synced, and drop + * it when the last block is started. If the resync is aborted + * midway, we need to be able to drop that count, so we remember + * the counted chunk.. + */ + unsigned long syncchunk; + + __u64 events_cleared; + + /* bitmap spinlock */ + spinlock_t lock; + + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap file superblock */ + struct page **filemap; /* list of cache pages for the file */ + unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file */ + + unsigned long flags; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + unsigned long daemon_sleep; /* how many seconds between updates? */ + + /* + * bitmap write daemon - this daemon performs writes to the bitmap file + * this thread is only needed because of a limitation in ext3 (jbd) + * that does not allow a task to have two journal transactions ongoing + * simultaneously (even if the transactions are for two different + * filesystems) -- in the case of bitmap, that would be the filesystem + * that the bitmap file resides on and the filesystem that is mounted + * on the md device -- see current->journal_info in jbd/transaction.c + */ + mdk_thread_t *writeback_daemon; + spinlock_t write_lock; + struct semaphore write_ready; + struct semaphore write_done; + unsigned long writes_pending; + wait_queue_head_t write_wait; + struct list_head write_pages; + struct list_head complete_pages; + mempool_t *write_pool; +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(mddev_t *mddev); +void bitmap_destroy(mddev_t *mddev); +int bitmap_active(struct bitmap *bitmap); + +char *file_path(struct file *file, char *buf, int count); +void bitmap_print_sb(struct bitmap *bitmap); +int bitmap_update_sb(struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); + +/* these are exported */ +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, + int success); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); + +int bitmap_unplug(struct bitmap *bitmap); +int bitmap_daemon_work(struct bitmap *bitmap); +#endif + +#endif diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index bce0032decff..16e94a9f0f8c 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -267,6 +267,9 @@ struct mddev_s atomic_t writes_pending; request_queue_t *queue; /* for plugging ... */ + struct bitmap *bitmap; /* the bitmap for the device */ + struct file *bitmap_file; /* the bitmap file */ + struct list_head all_mddevs; }; @@ -341,6 +344,7 @@ typedef struct mdk_thread_s { unsigned long flags; struct completion *event; struct task_struct *tsk; + unsigned long timeout; const char *name; } mdk_thread_t; diff --git a/include/linux/raid/md_u.h b/include/linux/raid/md_u.h index a2df5c2a42af..81da20ccec4d 100644 --- a/include/linux/raid/md_u.h +++ b/include/linux/raid/md_u.h @@ -23,6 +23,7 @@ #define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) #define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) #define RAID_AUTORUN _IO (MD_MAJOR, 0x14) +#define GET_BITMAP_FILE _IOR (MD_MAJOR, 0x15, mdu_bitmap_file_t) /* configuration */ #define CLEAR_ARRAY _IO (MD_MAJOR, 0x20) @@ -36,6 +37,7 @@ #define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) #define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) #define HOT_GENERATE_ERROR _IO (MD_MAJOR, 0x2a) +#define SET_BITMAP_FILE _IOW (MD_MAJOR, 0x2b, int) /* usage */ #define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) @@ -106,6 +108,11 @@ typedef struct mdu_start_info_s { } mdu_start_info_t; +typedef struct mdu_bitmap_file_s +{ + char pathname[4096]; +} mdu_bitmap_file_t; + typedef struct mdu_param_s { int personality; /* 1,2,3,4 */ -- cgit v1.2.3 From 77ad4bc706fe6c52ab953f31c287a6af712d080c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:21 -0700 Subject: [PATCH] md: enable the bitmap write-back daemon and wait for it. Currently we don't wait for updates to the bitmap to be flushed to disk properly. The infrastructure all there, but it isn't being used.... A separate kernel thread (bitmap_writeback_daemon) is needed to wait for each page as we cannot get callbacks when a page write completes. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/bitmap.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index f785cf26cbad..cfe60cfc8f3d 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -233,21 +233,12 @@ struct bitmap { unsigned long daemon_sleep; /* how many seconds between updates? */ /* - * bitmap write daemon - this daemon performs writes to the bitmap file - * this thread is only needed because of a limitation in ext3 (jbd) - * that does not allow a task to have two journal transactions ongoing - * simultaneously (even if the transactions are for two different - * filesystems) -- in the case of bitmap, that would be the filesystem - * that the bitmap file resides on and the filesystem that is mounted - * on the md device -- see current->journal_info in jbd/transaction.c + * bitmap_writeback_daemon waits for file-pages that have been written, + * as there is no way to get a call-back when a page write completes. */ mdk_thread_t *writeback_daemon; spinlock_t write_lock; - struct semaphore write_ready; - struct semaphore write_done; - unsigned long writes_pending; wait_queue_head_t write_wait; - struct list_head write_pages; struct list_head complete_pages; mempool_t *write_pool; }; -- cgit v1.2.3 From 191ea9b2c7cc3ebbe0678834ab710d7d95ad3f9a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:23 -0700 Subject: [PATCH] md: raid1 support for bitmap intent logging Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid1.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h index abbfdd9afe1e..9d93cf12e890 100644 --- a/include/linux/raid/raid1.h +++ b/include/linux/raid/raid1.h @@ -36,12 +36,21 @@ struct r1_private_data_s { spinlock_t device_lock; struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + /* queue of writes that have been unplugged */ + struct bio_list flushing_bio_list; + /* for use when syncing mirrors: */ spinlock_t resync_lock; - int nr_pending; - int barrier; + int nr_pending; + int barrier; sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ wait_queue_head_t wait_idle; wait_queue_head_t wait_resume; @@ -85,14 +94,17 @@ struct r1bio_s { int read_disk; struct list_head retry_list; + struct bitmap_update *bitmap_update; /* * if the IO is in WRITE direction, then multiple bios are used. * We choose the number when they are allocated. */ struct bio *bios[0]; + /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ }; /* bits for r1bio.state */ #define R1BIO_Uptodate 0 #define R1BIO_IsSync 1 +#define R1BIO_Degraded 2 #endif -- cgit v1.2.3 From 41158c7eb22312cfaa256744e1553bb4042ff085 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:25 -0700 Subject: [PATCH] md: optimise reconstruction when re-adding a recently failed drive. When an array is degraded, bit in the intent-bitmap are never cleared. So if a recently failed drive is re-added, we only need to reconstruct the block that are still reflected in the bitmap. This patch adds support for this re-adding. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 16e94a9f0f8c..6cdcb4434c6c 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -183,6 +183,10 @@ struct mdk_rdev_s int desc_nr; /* descriptor index in the superblock */ int raid_disk; /* role of device in array */ + int saved_raid_disk; /* role that device used to have in the + * array and could again if we did a partial + * resync from the bitmap + */ atomic_t nr_pending; /* number of pending requests. * only maintained for arrays that -- cgit v1.2.3 From 3d310eb7b3df1252e8595d059d982b0a9825a137 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:26 -0700 Subject: [PATCH] md: fix deadlock due to md thread processing delayed requests. Before completing a 'write' the md superblock might need to be updated. This is best done by the md_thread. The current code schedules this up and queues the write request for later handling by the md_thread. However some personalities (Raid5/raid6) will deadlock if the md_thread tries to submit requests to its own array. So this patch changes things so the processes submitting the request waits for the superblock to be written and then submits the request itself. This fixes a recently-created deadlock in raid5/raid6 Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md.h | 2 +- include/linux/raid/md_k.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index cfde8f497d6d..75f41d8faed2 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -69,7 +69,7 @@ extern mdk_thread_t * md_register_thread (void (*run) (mddev_t *mddev), extern void md_unregister_thread (mdk_thread_t *thread); extern void md_wakeup_thread(mdk_thread_t *thread); extern void md_check_recovery(mddev_t *mddev); -extern int md_write_start(mddev_t *mddev, struct bio *bi); +extern void md_write_start(mddev_t *mddev, struct bio *bi); extern void md_write_end(mddev_t *mddev); extern void md_handle_safemode(mddev_t *mddev); extern void md_done_sync(mddev_t *mddev, int blocks, int ok); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 6cdcb4434c6c..3e977025cf43 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -261,7 +261,7 @@ struct mddev_s sector_t recovery_cp; spinlock_t write_lock; - struct bio_list write_list; + wait_queue_head_t sb_wait; /* for waiting on superblock updates */ unsigned int safemode; /* if set, update "clean" superblock * when no writes pending. -- cgit v1.2.3 From a654b9d8f851f4ca02649d5825cbe6c608adb10c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:27 -0700 Subject: [PATCH] md: allow md intent bitmap to be stored near the superblock. This provides an alternate to storing the bitmap in a separate file. The bitmap can be stored at a given offset from the superblock. Obviously the creator of the array must make sure this doesn't intersect with data.... After is good for version-0.90 superblocks. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/bitmap.h | 2 ++ include/linux/raid/md.h | 15 ++++++++++++++- include/linux/raid/md_k.h | 4 ++++ include/linux/raid/md_p.h | 7 ++++++- 4 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index cfe60cfc8f3d..e24b74b11150 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -217,6 +217,7 @@ struct bitmap { /* bitmap spinlock */ spinlock_t lock; + long offset; /* offset from superblock if file is NULL */ struct file *file; /* backing disk file */ struct page *sb_page; /* cached copy of the bitmap file superblock */ struct page **filemap; /* list of cache pages for the file */ @@ -255,6 +256,7 @@ void bitmap_print_sb(struct bitmap *bitmap); int bitmap_update_sb(struct bitmap *bitmap); int bitmap_setallbits(struct bitmap *bitmap); +void bitmap_write_all(struct bitmap *bitmap); /* these are exported */ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors); diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 75f41d8faed2..ffa316ce4dc8 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -60,7 +60,14 @@ */ #define MD_MAJOR_VERSION 0 #define MD_MINOR_VERSION 90 -#define MD_PATCHLEVEL_VERSION 1 +/* + * MD_PATCHLEVEL_VERSION indicates kernel functionality. + * >=1 means different superblock formats are selectable using SET_ARRAY_INFO + * and major_version/minor_version accordingly + * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT + * in the super status byte + */ +#define MD_PATCHLEVEL_VERSION 2 extern int register_md_personality (int p_num, mdk_personality_t *p); extern int unregister_md_personality (int p_num); @@ -78,6 +85,12 @@ extern void md_unplug_mddev(mddev_t *mddev); extern void md_print_devices (void); +extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, + sector_t sector, int size, struct page *page); +extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw); + + #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } #endif diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 3e977025cf43..a3725b57fb7d 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -273,6 +273,10 @@ struct mddev_s struct bitmap *bitmap; /* the bitmap for the device */ struct file *bitmap_file; /* the bitmap file */ + long bitmap_offset; /* offset from superblock of + * start of bitmap. May be + * negative, but not '0' + */ struct list_head all_mddevs; }; diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 8ba95d67329f..8e592a25a8b5 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -96,6 +96,7 @@ typedef struct mdp_device_descriptor_s { #define MD_SB_CLEAN 0 #define MD_SB_ERRORS 1 +#define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ typedef struct mdp_superblock_s { /* * Constant generic information @@ -184,7 +185,7 @@ struct mdp_superblock_1 { /* constant array information - 128 bytes */ __u32 magic; /* MD_SB_MAGIC: 0xa92b4efc - little endian */ __u32 major_version; /* 1 */ - __u32 feature_map; /* 0 for now */ + __u32 feature_map; /* bit 0 set if 'bitmap_offset' is meaningful */ __u32 pad0; /* always set to 0 when writing */ __u8 set_uuid[16]; /* user-space generated. */ @@ -197,6 +198,10 @@ struct mdp_superblock_1 { __u32 chunksize; /* in 512byte sectors */ __u32 raid_disks; + __u32 bitmap_offset; /* sectors after start of superblock that bitmap starts + * NOTE: signed, so bitmap can be before superblock + * only meaningful of feature_map[0] is set. + */ __u8 pad1[128-96]; /* set to 0 when written */ /* constant this-device information - 64 bytes */ -- cgit v1.2.3 From 7bfa19f2748000d646dbdf8f48258cfe1d257b52 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:28 -0700 Subject: [PATCH] md: allow md to update multiple superblocks in parallel. currently, md updates all superblocks (one on each device) in series. It waits for one write to complete before starting the next. This isn't a big problem as superblock updates don't happen that often. However it is neater to do it in parallel, and if the drives in the array have gone to "sleep" after a period of idleness, then waking them is parallel is faster (and someone else should be worrying about power drain). Futher, we will need parallel superblock updates for a future patch which keeps the intent-logging bitmap near the superblock. Also remove the silly code that retired superblock updates 100 times. This simply never made sense. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index a3725b57fb7d..8c14ba565a45 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -262,6 +262,7 @@ struct mddev_s spinlock_t write_lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ + atomic_t pending_writes; /* number of active superblock writes */ unsigned int safemode; /* if set, update "clean" superblock * when no writes pending. -- cgit v1.2.3 From 39730960d94306d7be414e8d54f4e5c071af1278 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Jun 2005 17:17:28 -0700 Subject: [PATCH] Two small fixes for md verion-1 superblocks. 1/ Must typecast int to (sector_t) before inverting or we might not invert enough bits. 2/ When "bitmap_offset" was added to mdp_superblock_1, we didn't increase the count of words-used (96 to 100). Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_p.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 8e592a25a8b5..dc65cd435494 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -202,7 +202,7 @@ struct mdp_superblock_1 { * NOTE: signed, so bitmap can be before superblock * only meaningful of feature_map[0] is set. */ - __u8 pad1[128-96]; /* set to 0 when written */ + __u8 pad1[128-100]; /* set to 0 when written */ /* constant this-device information - 64 bytes */ __u64 data_offset; /* sector start of data, often 0 */ -- cgit v1.2.3