From 56079431b6ba163df8ba26b3eccc82379f0c0ce4 Mon Sep 17 00:00:00 2001 From: Denis Vlasenko Date: Wed, 29 Mar 2006 15:57:29 -0800 Subject: [NET]: Deinline some larger functions from netdevice.h On a allyesconfig'ured kernel: Size Uses Wasted Name and definition ===== ==== ====== ================================================ 95 162 12075 netif_wake_queue include/linux/netdevice.h 129 86 9265 dev_kfree_skb_any include/linux/netdevice.h 127 56 5885 netif_device_attach include/linux/netdevice.h 73 86 4505 dev_kfree_skb_irq include/linux/netdevice.h 46 60 1534 netif_device_detach include/linux/netdevice.h 119 16 1485 __netif_rx_schedule include/linux/netdevice.h 143 5 492 netif_rx_schedule include/linux/netdevice.h 81 7 366 netif_schedule include/linux/netdevice.h netif_wake_queue is big because __netif_schedule is a big inline: static inline void __netif_schedule(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) { unsigned long flags; struct softnet_data *sd; local_irq_save(flags); sd = &__get_cpu_var(softnet_data); dev->next_sched = sd->output_queue; sd->output_queue = dev; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } } static inline void netif_wake_queue(struct net_device *dev) { #ifdef CONFIG_NETPOLL_TRAP if (netpoll_trap()) return; #endif if (test_and_clear_bit(__LINK_STATE_XOFF, &dev->state)) __netif_schedule(dev); } By de-inlining __netif_schedule we are saving a lot of text at each callsite of netif_wake_queue and netif_schedule. __netif_rx_schedule is also big, and it makes more sense to keep both of them out of line. Patch also deinlines dev_kfree_skb_any. We can deinline dev_kfree_skb_irq instead... oh well. netif_device_attach/detach are not hot paths, we can deinline them too. Signed-off-by: Denis Vlasenko Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/netdevice.h | 55 +++++------------------------------------------ 1 file changed, 5 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 950dc55e5192..40ccf8cc4239 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -598,20 +598,7 @@ DECLARE_PER_CPU(struct softnet_data,softnet_data); #define HAVE_NETIF_QUEUE -static inline void __netif_schedule(struct net_device *dev) -{ - if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) { - unsigned long flags; - struct softnet_data *sd; - - local_irq_save(flags); - sd = &__get_cpu_var(softnet_data); - dev->next_sched = sd->output_queue; - sd->output_queue = dev; - raise_softirq_irqoff(NET_TX_SOFTIRQ); - local_irq_restore(flags); - } -} +extern void __netif_schedule(struct net_device *dev); static inline void netif_schedule(struct net_device *dev) { @@ -675,13 +662,7 @@ static inline void dev_kfree_skb_irq(struct sk_buff *skb) /* Use this variant in places where it could be invoked * either from interrupt or non-interrupt context. */ -static inline void dev_kfree_skb_any(struct sk_buff *skb) -{ - if (in_irq() || irqs_disabled()) - dev_kfree_skb_irq(skb); - else - dev_kfree_skb(skb); -} +extern void dev_kfree_skb_any(struct sk_buff *skb); #define HAVE_NETIF_RX 1 extern int netif_rx(struct sk_buff *skb); @@ -768,22 +749,9 @@ static inline int netif_device_present(struct net_device *dev) return test_bit(__LINK_STATE_PRESENT, &dev->state); } -static inline void netif_device_detach(struct net_device *dev) -{ - if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && - netif_running(dev)) { - netif_stop_queue(dev); - } -} +extern void netif_device_detach(struct net_device *dev); -static inline void netif_device_attach(struct net_device *dev) -{ - if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && - netif_running(dev)) { - netif_wake_queue(dev); - __netdev_watchdog_up(dev); - } -} +extern void netif_device_attach(struct net_device *dev); /* * Network interface message level settings @@ -851,20 +819,7 @@ static inline int netif_rx_schedule_prep(struct net_device *dev) * already been called and returned 1. */ -static inline void __netif_rx_schedule(struct net_device *dev) -{ - unsigned long flags; - - local_irq_save(flags); - dev_hold(dev); - list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list); - if (dev->quota < 0) - dev->quota += dev->weight; - else - dev->quota = dev->weight; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - local_irq_restore(flags); -} +extern void __netif_rx_schedule(struct net_device *dev); /* Try to reschedule poll. Called by irq handler. */ -- cgit v1.2.3 From 5274f052e7b3dbd81935772eb551dfd0325dfa9d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 Mar 2006 15:15:30 +0200 Subject: [PATCH] Introduce sys_splice() system call This adds support for the sys_splice system call. Using a pipe as a transport, it can connect to files or sockets (latter as output only). From the splice.c comments: "splice": joining two ropes together by interweaving their strands. This is the "extended pipe" functionality, where a pipe is used as an arbitrary in-memory buffer. Think of a pipe as a small kernel buffer that you can use to transfer data from one end to the other. The traditional unix read/write is extended with a "splice()" operation that transfers data buffers to or from a pipe buffer. Named by Larry McVoy, original implementation from Linus, extended by Jens to support splicing to files and fixing the initial implementation bugs. Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- include/linux/fs.h | 4 ++++ include/linux/syscalls.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 408fe89498f4..20fa5f6d7269 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1032,6 +1032,8 @@ struct file_operations { int (*check_flags)(int); int (*dir_notify)(struct file *filp, unsigned long arg); int (*flock) (struct file *, int, struct file_lock *); + ssize_t (*splice_write)(struct inode *, struct file *, size_t, unsigned int); + ssize_t (*splice_read)(struct file *, struct inode *, size_t, unsigned int); }; struct inode_operations { @@ -1609,6 +1611,8 @@ extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor extern void do_generic_mapping_read(struct address_space *mapping, struct file_ra_state *, struct file *, loff_t *, read_descriptor_t *, read_actor_t); +extern ssize_t generic_file_splice_read(struct file *, struct inode *, size_t, unsigned int); +extern ssize_t generic_file_splice_write(struct inode *, struct file *, size_t, unsigned int); extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e487e3b60f60..e78ffc7d5b56 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -569,5 +569,7 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user * filename, asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode); asmlinkage long sys_unshare(unsigned long unshare_flags); +asmlinkage long sys_splice(int fdin, int fdout, size_t len, + unsigned int flags); #endif -- cgit v1.2.3 From 5abc97aa25b2c41413b3a520faee83f2282d9f18 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 Mar 2006 15:16:46 +0200 Subject: [PATCH] splice: add support for SPLICE_F_MOVE flag This enables the caller to migrate pages from one address space page cache to another. In buzz word marketing, you can do zero-copy file copies! Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index b12e59c75752..75c7f55023ab 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -9,6 +9,7 @@ struct pipe_buffer { struct page *page; unsigned int offset, len; struct pipe_buf_operations *ops; + unsigned int stolen; }; struct pipe_buf_operations { @@ -16,6 +17,7 @@ struct pipe_buf_operations { void * (*map)(struct file *, struct pipe_inode_info *, struct pipe_buffer *); void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *); void (*release)(struct pipe_inode_info *, struct pipe_buffer *); + int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); }; struct pipe_inode_info { @@ -53,4 +55,10 @@ void pipe_wait(struct inode * inode); struct inode* pipe_new(struct inode* inode); void free_pipe_info(struct inode* inode); +/* + * splice is tied to pipes as a transport (at least for now), so we'll just + * add the splice flags here. + */ +#define SPLICE_F_MOVE (0x01) /* move pages instead of copying */ + #endif -- cgit v1.2.3 From 025be81e83043f20538dcced1e12c5f8d152fbdb Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 31 Mar 2006 02:27:06 -0800 Subject: [NET]: Allow skb headroom to be overridden Previously we added NET_IP_ALIGN so an architecture can override the padding done to align headers. The next step is to allow the skb headroom to be overridden. We currently always reserve 16 bytes to grow into, meaning all DMAs start 16 bytes into a cacheline. On ppc64 we really want DMA writes to start on a cacheline boundary, so we increase that headroom to one cacheline. Signed-off-by: Anton Blanchard Signed-off-by: David S. Miller --- include/linux/skbuff.h | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 613b9513f8b9..c4619a428d9b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -941,6 +941,25 @@ static inline void skb_reserve(struct sk_buff *skb, int len) #define NET_IP_ALIGN 2 #endif +/* + * The networking layer reserves some headroom in skb data (via + * dev_alloc_skb). This is used to avoid having to reallocate skb data when + * the header has to grow. In the default case, if the header has to grow + * 16 bytes or less we avoid the reallocation. + * + * Unfortunately this headroom changes the DMA alignment of the resulting + * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive + * on some architectures. An architecture can override this value, + * perhaps setting it to a cacheline in size (since that will maintain + * cacheline alignment of the DMA). It must be a power of 2. + * + * Various parts of the networking layer expect at least 16 bytes of + * headroom, you should not reduce this. + */ +#ifndef NET_SKB_PAD +#define NET_SKB_PAD 16 +#endif + extern int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc); static inline void __skb_trim(struct sk_buff *skb, unsigned int len) @@ -1030,9 +1049,9 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) static inline struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask) { - struct sk_buff *skb = alloc_skb(length + 16, gfp_mask); + struct sk_buff *skb = alloc_skb(length + NET_SKB_PAD, gfp_mask); if (likely(skb)) - skb_reserve(skb, 16); + skb_reserve(skb, NET_SKB_PAD); return skb; } #else @@ -1070,13 +1089,15 @@ static inline struct sk_buff *dev_alloc_skb(unsigned int length) */ static inline int skb_cow(struct sk_buff *skb, unsigned int headroom) { - int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb); + int delta = (headroom > NET_SKB_PAD ? headroom : NET_SKB_PAD) - + skb_headroom(skb); if (delta < 0) delta = 0; if (delta || skb_cloned(skb)) - return pskb_expand_head(skb, (delta + 15) & ~15, 0, GFP_ATOMIC); + return pskb_expand_head(skb, (delta + (NET_SKB_PAD-1)) & + ~(NET_SKB_PAD-1), 0, GFP_ATOMIC); return 0; } -- cgit v1.2.3 From 48b192686dd20cb1576ae1d8ccd17a07971ef24a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 31 Mar 2006 02:29:41 -0800 Subject: [PATCH] sem2mutex: drivers/mtd/ Semaphore to mutex conversion. The conversion was generated via scripts, and the result was validated automatically via a script as well. Signed-off-by: Ingo Molnar Cc: Thomas Gleixner Acked-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mtd/blktrans.h | 4 ++-- include/linux/mtd/doc2000.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/blktrans.h b/include/linux/mtd/blktrans.h index f46afec6fbf8..72fc68c5ee96 100644 --- a/include/linux/mtd/blktrans.h +++ b/include/linux/mtd/blktrans.h @@ -10,7 +10,7 @@ #ifndef __MTD_TRANS_H__ #define __MTD_TRANS_H__ -#include +#include struct hd_geometry; struct mtd_info; @@ -22,7 +22,7 @@ struct mtd_blktrans_dev { struct mtd_blktrans_ops *tr; struct list_head list; struct mtd_info *mtd; - struct semaphore sem; + struct mutex lock; int devnum; int blksize; unsigned long size; diff --git a/include/linux/mtd/doc2000.h b/include/linux/mtd/doc2000.h index 386a52cf8b1b..9addd073bf15 100644 --- a/include/linux/mtd/doc2000.h +++ b/include/linux/mtd/doc2000.h @@ -15,7 +15,7 @@ #define __MTD_DOC2000_H__ #include -#include +#include #define DoC_Sig1 0 #define DoC_Sig2 1 @@ -187,7 +187,7 @@ struct DiskOnChip { int numchips; struct Nand *chips; struct mtd_info *nextdoc; - struct semaphore lock; + struct mutex lock; }; int doc_decode_ecc(unsigned char sector[512], unsigned char ecc1[6]); -- cgit v1.2.3 From 0500abf52109d09bf60d740dec2e41d6cf265688 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 31 Mar 2006 02:29:42 -0800 Subject: [PATCH] drivers/mtd/: small cleanups - chips/sharp.c: make two needlessly global functions static - move some declarations to a header file where they belong to Signed-off-by: Adrian Bunk Acked-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mtd/inftl.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/inftl.h b/include/linux/mtd/inftl.h index 0268125a6271..d7eaa40e5ab0 100644 --- a/include/linux/mtd/inftl.h +++ b/include/linux/mtd/inftl.h @@ -52,6 +52,11 @@ struct INFTLrecord { int INFTL_mount(struct INFTLrecord *s); int INFTL_formatblock(struct INFTLrecord *s, int block); +extern char inftlmountrev[]; + +void INFTL_dumptables(struct INFTLrecord *s); +void INFTL_dumpVUchains(struct INFTLrecord *s); + #endif /* __KERNEL__ */ #endif /* __MTD_INFTL_H__ */ -- cgit v1.2.3 From 9bf9e89c3d147ca8cf9622d2d053684fba77a464 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 31 Mar 2006 02:29:56 -0800 Subject: [PATCH] migrate_pages_to() must be defined for the no swap case Fix migrate_pages_to() definition. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 7d09962c3c0b..ff0a64073ebc 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -12,7 +12,7 @@ extern void migrate_page_copy(struct page *, struct page *); extern int migrate_page_remove_references(struct page *, struct page *, int); extern int migrate_pages(struct list_head *l, struct list_head *t, struct list_head *moved, struct list_head *failed); -int migrate_pages_to(struct list_head *pagelist, +extern int migrate_pages_to(struct list_head *pagelist, struct vm_area_struct *vma, int dest); extern int fail_migrate_page(struct page *, struct page *); @@ -26,6 +26,9 @@ static inline int putback_lru_pages(struct list_head *l) { return 0; } static inline int migrate_pages(struct list_head *l, struct list_head *t, struct list_head *moved, struct list_head *failed) { return -ENOSYS; } +static inline int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest) { return 0; } + static inline int migrate_prep(void) { return -ENOSYS; } /* Possible settings for the migrate_page() method in address_operations */ -- cgit v1.2.3 From 93fac7041f082297b93655a0e49f659cd7520e40 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 31 Mar 2006 02:29:56 -0800 Subject: [PATCH] mm: schedule find_trylock_page() removal find_trylock_page() is an odd interface in that it doesn't take a reference like the others. Now that XFS no longer uses it, and its last remaining caller actually wants an elevated refcount, opencode that callsite and schedule find_trylock_page() for removal. Signed-off-by: Nick Piggin Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 839f0b3c23aa..9539efd4f7e6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -72,8 +72,8 @@ extern struct page * find_get_page(struct address_space *mapping, unsigned long index); extern struct page * find_lock_page(struct address_space *mapping, unsigned long index); -extern struct page * find_trylock_page(struct address_space *mapping, - unsigned long index); +extern __deprecated_for_modules struct page * find_trylock_page( + struct address_space *mapping, unsigned long index); extern struct page * find_or_create_page(struct address_space *mapping, unsigned long index, gfp_t gfp_mask); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, -- cgit v1.2.3 From 3691c5199e8a4be1c7a91b5ab925db5feb866e19 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 31 Mar 2006 02:30:30 -0800 Subject: [PATCH] kill __init_timer_base in favor of boot_tvec_bases Commit a4a6198b80cf82eb8160603c98da218d1bd5e104: [PATCH] tvec_bases too large for per-cpu data introduced "struct tvec_t_base_s boot_tvec_bases" which is visible at compile time. This means we can kill __init_timer_base and move timer_base_s's content into tvec_t_base_s. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index b5caabca553c..0a485beba9f5 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -6,7 +6,7 @@ #include #include -struct timer_base_s; +struct tvec_t_base_s; struct timer_list { struct list_head entry; @@ -15,16 +15,16 @@ struct timer_list { void (*function)(unsigned long); unsigned long data; - struct timer_base_s *base; + struct tvec_t_base_s *base; }; -extern struct timer_base_s __init_timer_base; +extern struct tvec_t_base_s boot_tvec_bases; #define TIMER_INITIALIZER(_function, _expires, _data) { \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ - .base = &__init_timer_base, \ + .base = &boot_tvec_bases, \ } #define DEFINE_TIMER(_name, _function, _expires, _data) \ -- cgit v1.2.3 From 68eef3b4791572ecb70249c7fb145bb3742dd899 Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Fri, 31 Mar 2006 02:30:32 -0800 Subject: [PATCH] Simplify proc/devices and fix early termination regression Make baby-simple the code for /proc/devices. Based on the proven design for /proc/interrupts. This also fixes the early-termination regression 2.6.16 introduced, as demonstrated by: # dd if=/proc/devices bs=1 Character devices: 1 mem 27+0 records in 27+0 records out This should also work (but is untested) when /proc/devices >4096 bytes, which I believe is what the original 2.6.16 rewrite fixed. [akpm@osdl.org: cleanups, simplifications] Signed-off-by: Joe Korty Cc: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 20fa5f6d7269..20a7afd4590c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1413,6 +1413,7 @@ extern void bd_release_from_disk(struct block_device *, struct gendisk *); #endif /* fs/char_dev.c */ +#define CHRDEV_MAJOR_HASH_SIZE 255 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); extern int register_chrdev_region(dev_t, unsigned, const char *); extern int register_chrdev(unsigned int, const char *, @@ -1420,25 +1421,17 @@ extern int register_chrdev(unsigned int, const char *, extern int unregister_chrdev(unsigned int, const char *); extern void unregister_chrdev_region(dev_t, unsigned); extern int chrdev_open(struct inode *, struct file *); -extern int get_chrdev_list(char *); -extern void *acquire_chrdev_list(void); -extern int count_chrdev_list(void); -extern void *get_next_chrdev(void *); -extern int get_chrdev_info(void *, int *, char **); -extern void release_chrdev_list(void *); +extern void chrdev_show(struct seq_file *,off_t); /* fs/block_dev.c */ +#define BLKDEV_MAJOR_HASH_SIZE 255 #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); extern struct block_device *open_bdev_excl(const char *, int, void *); extern void close_bdev_excl(struct block_device *); -extern void *acquire_blkdev_list(void); -extern int count_blkdev_list(void); -extern void *get_next_blkdev(void *); -extern int get_blkdev_info(void *, int *, char **); -extern void release_blkdev_list(void *); +extern void blkdev_show(struct seq_file *,off_t); extern void init_special_inode(struct inode *, umode_t, dev_t); -- cgit v1.2.3 From 453823ba08ba762b3d58934b6dce75edce37169e Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Fri, 31 Mar 2006 02:30:39 -0800 Subject: [PATCH] IPMI: fix startup race condition Matt Domsch noticed a startup race with the IPMI kernel thread, it was possible (though extraordinarly unlikely) that a message could come in before the upper layer was ready to handle it. This patch splits the startup processing of an IPMI interface into two parts, one to get ready and one to actually start the processes to receive messages from the interface. [akpm@osdl.org: cleanups] Signed-off-by: Corey Minyard Cc: Matt Domsch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipmi_smi.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 53571288a9fc..6d9c7e4da472 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -82,6 +82,13 @@ struct ipmi_smi_handlers { struct module *owner; + /* The low-level interface cannot start sending messages to + the upper layer until this function is called. This may + not be NULL, the lower layer must take the interface from + this call. */ + int (*start_processing)(void *send_info, + ipmi_smi_t new_intf); + /* Called to enqueue an SMI message to be sent. This operation is not allowed to fail. If an error occurs, it should report back the error in a received message. It may @@ -157,13 +164,16 @@ static inline void ipmi_demangle_device_id(unsigned char *data, } /* Add a low-level interface to the IPMI driver. Note that if the - interface doesn't know its slave address, it should pass in zero. */ + interface doesn't know its slave address, it should pass in zero. + The low-level interface should not deliver any messages to the + upper layer until the start_processing() function in the handlers + is called, and the lower layer must get the interface from that + call. */ int ipmi_register_smi(struct ipmi_smi_handlers *handlers, void *send_info, struct ipmi_device_id *device_id, struct device *dev, - unsigned char slave_addr, - ipmi_smi_t *intf); + unsigned char slave_addr); /* * Remove a low-level interface from the IPMI driver. This will -- cgit v1.2.3 From f79e2abb9bd452d97295f34376dedbec9686b986 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 31 Mar 2006 02:30:42 -0800 Subject: [PATCH] sys_sync_file_range() Remove the recently-added LINUX_FADV_ASYNC_WRITE and LINUX_FADV_WRITE_WAIT fadvise() additions, do it in a new sys_sync_file_range() syscall instead. Reasons: - It's more flexible. Things which would require two or three syscalls with fadvise() can be done in a single syscall. - Using fadvise() in this manner is something not covered by POSIX. The patch wires up the syscall for x86. The sycall is implemented in the new fs/sync.c. The intention is that we can move sys_fsync(), sys_fdatasync() and perhaps sys_sync() into there later. Documentation for the syscall is in fs/sync.c. A test app (sync_file_range.c) is in http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz. The available-to-GPL-modules do_sync_file_range() is for knfsd: "A COMMIT can say NFS_DATA_SYNC or NFS_FILE_SYNC. I can skip the ->fsync call for NFS_DATA_SYNC which is hopefully the more common." Note: the `async' writeout mode SYNC_FILE_RANGE_WRITE will turn synchronous if the queue is congested. This is trivial to fix: add a new flag bit, set wbc->nonblocking. But I'm not sure that we want to expose implementation details down to that level. Note: it's notable that we can sync an fd which wasn't opened for writing. Same with fsync() and fdatasync()). Note: the code takes some care to handle attempts to sync file contents outside the 16TB offset on 32-bit machines. It makes such attempts appear to succeed, for best 32-bit/64-bit compatibility. Perhaps it should make such requests fail... Cc: Nick Piggin Cc: Michael Kerrisk Cc: Ulrich Drepper Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fadvise.h | 6 ------ include/linux/fs.h | 7 +++++++ include/linux/syscalls.h | 2 ++ 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fadvise.h b/include/linux/fadvise.h index b2913bba35d8..e8e747139b9a 100644 --- a/include/linux/fadvise.h +++ b/include/linux/fadvise.h @@ -18,10 +18,4 @@ #define POSIX_FADV_NOREUSE 5 /* Data will be accessed once. */ #endif -/* - * Linux-specific fadvise() extensions: - */ -#define LINUX_FADV_ASYNC_WRITE 32 /* Start writeout on range */ -#define LINUX_FADV_WRITE_WAIT 33 /* Wait upon writeout to range */ - #endif /* FADVISE_H_INCLUDED */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 20a7afd4590c..4ed7e602d703 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -757,6 +757,13 @@ extern void send_sigio(struct fown_struct *fown, int fd, int band); extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); extern int fcntl_getlease(struct file *filp); +/* fs/sync.c */ +#define SYNC_FILE_RANGE_WAIT_BEFORE 1 +#define SYNC_FILE_RANGE_WRITE 2 +#define SYNC_FILE_RANGE_WAIT_AFTER 4 +extern int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, + int flags); + /* fs/locks.c */ extern void locks_init_lock(struct file_lock *); extern void locks_copy_lock(struct file_lock *, struct file_lock *); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e78ffc7d5b56..5717147596b6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -571,5 +571,7 @@ asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename, asmlinkage long sys_unshare(unsigned long unshare_flags); asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags); +asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, + int flags); #endif -- cgit v1.2.3 From 0ca07731e495584bd84dca15a0f065470d594ec4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 31 Mar 2006 02:30:58 -0800 Subject: [PATCH] vt: add TIOCL_GETKMSGREDIRECT Add TIOCL_GETKMSGREDIRECT needed by the userland suspend tool to get the current value of kmsg_redirect from the kernel so that it can save it and restore it after resume. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tiocl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tiocl.h b/include/linux/tiocl.h index 2c9e847f6ed1..4756862c4ed4 100644 --- a/include/linux/tiocl.h +++ b/include/linux/tiocl.h @@ -34,5 +34,6 @@ struct tiocl_selection { #define TIOCL_SCROLLCONSOLE 13 /* scroll console */ #define TIOCL_BLANKSCREEN 14 /* keep screen blank even if a key is pressed */ #define TIOCL_BLANKEDSCREEN 15 /* return which vt was blanked */ +#define TIOCL_GETKMSGREDIRECT 17 /* get the vt the kernel messages are restricted to */ #endif /* _LINUX_TIOCL_H */ -- cgit v1.2.3 From c72a1d608dd0eb3d553a08bfdf1c0041bebaa8a0 Mon Sep 17 00:00:00 2001 From: Richard Purdie Date: Fri, 31 Mar 2006 02:31:04 -0800 Subject: [PATCH] LED: add LED class Add the foundations of a new LEDs subsystem. This patch adds a class which presents LED devices within sysfs and allows their brightness to be controlled. Signed-off-by: Richard Purdie Cc: Russell King Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/leds.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 include/linux/leds.h (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h new file mode 100644 index 000000000000..6812640b39cc --- /dev/null +++ b/include/linux/leds.h @@ -0,0 +1,51 @@ +/* + * Driver model for leds and led triggers + * + * Copyright (C) 2005 John Lenz + * Copyright (C) 2005 Richard Purdie + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#ifndef __LINUX_LEDS_H_INCLUDED +#define __LINUX_LEDS_H_INCLUDED + +struct device; +struct class_device; +/* + * LED Core + */ + +enum led_brightness { + LED_OFF = 0, + LED_HALF = 127, + LED_FULL = 255, +}; + +struct led_classdev { + const char *name; + int brightness; + int flags; +#define LED_SUSPENDED (1 << 0) + + /* A function to set the brightness of the led */ + void (*brightness_set)(struct led_classdev *led_cdev, + enum led_brightness brightness); + + struct class_device *class_dev; + /* LED Device linked list */ + struct list_head node; + + /* Trigger data */ + char *default_trigger; +}; + +extern int led_classdev_register(struct device *parent, + struct led_classdev *led_cdev); +extern void led_classdev_unregister(struct led_classdev *led_cdev); +extern void led_classdev_suspend(struct led_classdev *led_cdev); +extern void led_classdev_resume(struct led_classdev *led_cdev); + +#endif /* __LINUX_LEDS_H_INCLUDED */ -- cgit v1.2.3 From c3bc9956ec52fb2c70f29aa894d8eec766116584 Mon Sep 17 00:00:00 2001 From: Richard Purdie Date: Fri, 31 Mar 2006 02:31:05 -0800 Subject: [PATCH] LED: add LED trigger tupport Add support for LED triggers to the LED subsystem. "Triggers" are events which change the state of an LED. Two kinds of trigger are available, simple ones which can be added to exising code with minimum disruption and complex ones for implementing new or more complex functionality. Signed-off-by: Richard Purdie Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/leds.h | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 6812640b39cc..404575c3dd5a 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -40,6 +40,14 @@ struct led_classdev { /* Trigger data */ char *default_trigger; +#ifdef CONFIG_LEDS_TRIGGERS + rwlock_t trigger_lock; + /* Protects the trigger data below */ + + struct led_trigger *trigger; + struct list_head trig_list; + void *trigger_data; +#endif }; extern int led_classdev_register(struct device *parent, @@ -48,4 +56,48 @@ extern void led_classdev_unregister(struct led_classdev *led_cdev); extern void led_classdev_suspend(struct led_classdev *led_cdev); extern void led_classdev_resume(struct led_classdev *led_cdev); +/* + * LED Triggers + */ +#ifdef CONFIG_LEDS_TRIGGERS + +#define TRIG_NAME_MAX 50 + +struct led_trigger { + /* Trigger Properties */ + const char *name; + void (*activate)(struct led_classdev *led_cdev); + void (*deactivate)(struct led_classdev *led_cdev); + + /* LEDs under control by this trigger (for simple triggers) */ + rwlock_t leddev_list_lock; + struct list_head led_cdevs; + + /* Link to next registered trigger */ + struct list_head next_trig; +}; + +/* Registration functions for complex triggers */ +extern int led_trigger_register(struct led_trigger *trigger); +extern void led_trigger_unregister(struct led_trigger *trigger); + +/* Registration functions for simple triggers */ +#define DEFINE_LED_TRIGGER(x) static struct led_trigger *x; +#define DEFINE_LED_TRIGGER_GLOBAL(x) struct led_trigger *x; +extern void led_trigger_register_simple(const char *name, + struct led_trigger **trigger); +extern void led_trigger_unregister_simple(struct led_trigger *trigger); +extern void led_trigger_event(struct led_trigger *trigger, + enum led_brightness event); + +#else + +/* Triggers aren't active - null macros */ +#define DEFINE_LED_TRIGGER(x) +#define DEFINE_LED_TRIGGER_GLOBAL(x) +#define led_trigger_register_simple(x, y) do {} while(0) +#define led_trigger_unregister_simple(x) do {} while(0) +#define led_trigger_event(x, y) do {} while(0) + +#endif #endif /* __LINUX_LEDS_H_INCLUDED */ -- cgit v1.2.3 From 2bfb646cdf348cb77c572f06d5b9d17ea205c7e2 Mon Sep 17 00:00:00 2001 From: Richard Purdie Date: Fri, 31 Mar 2006 02:31:16 -0800 Subject: [PATCH] LED: Add IDE disk activity LED trigger Add an LED trigger for IDE disk activity to the ide-disk driver. Signed-off-by: Richard Purdie Acked-by: Bartlomiej Zolnierkiewicz Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/leds.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 404575c3dd5a..4617e75903b0 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -100,4 +100,12 @@ extern void led_trigger_event(struct led_trigger *trigger, #define led_trigger_event(x, y) do {} while(0) #endif + +/* Trigger specific functions */ +#ifdef CONFIG_LEDS_TRIGGER_IDE_DISK +extern void ledtrig_ide_activity(void); +#else +#define ledtrig_ide_activity() do {} while(0) +#endif + #endif /* __LINUX_LEDS_H_INCLUDED */ -- cgit v1.2.3 From 00362e33f65f1cb5d15e62ea5509520ce2770360 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 31 Mar 2006 02:31:17 -0800 Subject: [PATCH] hrtimer: create generic sleeper The removal of the data field in the hrtimer structure enforces the embedding of the timer into another data structure. nanosleep now uses a private implementation of the most common used timer callback function (simple task wakeup). In order to avoid the reimplentation of such functionality all over the place a generic hrtimer_sleeper functionality is created. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hrtimer.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 93830158348e..b20939287613 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -57,6 +57,19 @@ struct hrtimer { struct hrtimer_base *base; }; +/** + * struct hrtimer_sleeper - simple sleeper structure + * + * @timer: embedded timer structure + * @task: task to wake up + * + * task is set to NULL, when the timer expires. + */ +struct hrtimer_sleeper { + struct hrtimer timer; + struct task_struct *task; +}; + /** * struct hrtimer_base - the timer base for a specific clock * @@ -127,6 +140,9 @@ extern long hrtimer_nanosleep(struct timespec *rqtp, const enum hrtimer_mode mode, const clockid_t clockid); +extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + struct task_struct *tsk); + /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); -- cgit v1.2.3 From db1b1fefc2cecbff2e4214062fa8c680cb6e7b7d Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Fri, 31 Mar 2006 02:31:21 -0800 Subject: [PATCH] sched: reduce overhead of calc_load Currently, count_active_tasks() calls both nr_running() & nr_interruptible(). Each of these functions does a "for_each_cpu" & reads values from the runqueue of each cpu. Although this is not a lot of instructions, each runqueue may be located on different node. Depending on the architecture, a unique TLB entry may be required to access each runqueue. Since there may be more runqueues than cpu TLB entries, a scan of all runqueues can trash the TLB. Each memory reference incurs a TLB miss & refill. In addition, the runqueue cacheline that contains nr_running & nr_uninterruptible may be evicted from the cache between the two passes. This causes unnecessary cache misses. Combining nr_running() & nr_interruptible() into a single function substantially reduces the TLB & cache misses on large systems. This should have no measureable effect on smaller systems. On a 128p IA64 system running a memory stress workload, the new function reduced the overhead of calc_load() from 605 usec/call to 324 usec/call. Signed-off-by: Jack Steiner Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d04186d8cc68..ab84adf5bb9a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -100,6 +100,7 @@ DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); +extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); #include -- cgit v1.2.3 From 3dee386e14045484a6c41c8f03a263f9d79de740 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 31 Mar 2006 02:31:23 -0800 Subject: [PATCH] sched: cleanup task_activated() The activated flag in task_struct is used to track different sleep types and its usage is somewhat obfuscated. Convert the variable to an enum with more descriptive names without altering the function. Signed-off-by: Con Kolivas Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ab84adf5bb9a..c4fd3fcd3feb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -684,6 +684,13 @@ static inline void prefetch_stack(struct task_struct *t) { } struct audit_context; /* See audit.c */ struct mempolicy; +enum sleep_type { + SLEEP_NORMAL, + SLEEP_NONINTERACTIVE, + SLEEP_INTERACTIVE, + SLEEP_INTERRUPTED, +}; + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -706,7 +713,7 @@ struct task_struct { unsigned long sleep_avg; unsigned long long timestamp, last_ran; unsigned long long sched_time; /* sched_clock time spent running */ - int activated; + enum sleep_type sleep_type; unsigned long policy; cpumask_t cpus_allowed; -- cgit v1.2.3 From d425b274ba83ba4e7746a40446ec0ba3267de51f Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 31 Mar 2006 02:31:29 -0800 Subject: [PATCH] sched: activate SCHED BATCH expired To increase the strength of SCHED_BATCH as a scheduling hint we can activate batch tasks on the expired array since by definition they are latency insensitive tasks. Signed-off-by: Con Kolivas Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c4fd3fcd3feb..78c40dd2e19a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -484,6 +484,7 @@ struct signal_struct { #define MAX_PRIO (MAX_RT_PRIO + 40) #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) +#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) /* * Some day this will be a full-fledged user tracking system.. -- cgit v1.2.3 From 158d9ebd19280582da172626ad3edda1a626dace Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 31 Mar 2006 02:31:34 -0800 Subject: [PATCH] resurrect __put_task_struct This just got nuked in mainline. Bring it back because Eric's patches use it. Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 78c40dd2e19a..95f248ba36c9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -906,6 +906,7 @@ extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) extern void __put_task_struct_cb(struct rcu_head *rhp); +extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) { -- cgit v1.2.3 From 8c7904a00b06d2ee51149794b619e07369fcf9d4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 31 Mar 2006 02:31:37 -0800 Subject: [PATCH] task: RCU protect task->usage A big problem with rcu protected data structures that are also reference counted is that you must jump through several hoops to increase the reference count. I think someone finally implemented atomic_inc_not_zero(&count) to automate the common case. Unfortunately this means you must special case the rcu access case. When data structures are only visible via rcu in a manner that is not determined by the reference count on the object (i.e. tasks are visible until their zombies are reaped) there is a much simpler technique we can employ. Simply delaying the decrement of the reference count until the rcu interval is over. What that means is that the proc code that looks up a task and later wants to sleep can now do: rcu_read_lock(); task = find_task_by_pid(some_pid); if (task) { get_task_struct(task); } rcu_read_unlock(); The effect on the rest of the kernel is that put_task_struct becomes cheaper and immediate, and in the case where the task has been reaped it frees the task immediate instead of unnecessarily waiting an until the rcu interval is over. Cleanup of task_struct does not happen when its reference count drops to zero, instead cleanup happens when release_task is called. Tasks can only be looked up via rcu before release_task is called. All rcu protected members of task_struct are freed by release_task. Therefore we can move call_rcu from put_task_struct into release_task. And we can modify release_task to not immediately release the reference count but instead have it call put_task_struct from the function it gives to call_rcu. The end result: - get_task_struct is safe in an rcu context where we have just looked up the task. - put_task_struct() simplifies into its old pre rcu self. This reorganization also makes put_task_struct uncallable from modules as it is not exported but it does not appear to be called from any modules so this should not be an issue, and is trivially fixed. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 95f248ba36c9..7e0ff5dba986 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -911,7 +911,7 @@ extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) { if (atomic_dec_and_test(&t->usage)) - call_rcu(&t->rcu, __put_task_struct_cb); + __put_task_struct(t); } /* -- cgit v1.2.3 From 92476d7fc0326a409ab1d3864a04093a6be9aca7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 31 Mar 2006 02:31:42 -0800 Subject: [PATCH] pidhash: Refactor the pid hash table Simplifies the code, reduces the need for 4 pid hash tables, and makes the code more capable. In the discussions I had with Oleg it was felt that to a large extent the cleanup itself justified the work. With struct pid being dynamically allocated meant we could create the hash table entry when the pid was allocated and free the hash table entry when the pid was freed. Instead of playing with the hash lists when ever a process would attach or detach to a process. For myself the fact that it gave what my previous task_ref patch gave for free with simpler code was a big win. The problem is that if you hold a reference to struct task_struct you lock in 10K of low memory. If you do that in a user controllable way like /proc does, with an unprivileged but hostile user space application with typical resource limits of 1000 fds and 100 processes I can trigger the OOM killer by consuming all of low memory with task structs, on a machine wight 1GB of low memory. If I instead hold a reference to struct pid which holds a pointer to my task_struct, I don't suffer from that problem because struct pid is 2 orders of magnitude smaller. In fact struct pid is small enough that most other kernel data structures dwarf it, so simply limiting the number of referring data structures is enough to prevent exhaustion of low memory. This splits the current struct pid into two structures, struct pid and struct pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one. struct pid_link is the per process linkage into the hash tables and lives in struct task_struct. struct pid is given an indepedent lifetime, and holds pointers to each of the pid types. The independent life of struct pid simplifies attach_pid, and detach_pid, because we are always manipulating the list of pids and not the hash table. In addition in giving struct pid an indpendent life it makes the concept much more powerful. Kernel data structures can now embed a struct pid * instead of a pid_t and not suffer from pid wrap around problems or from keeping unnecessarily large amounts of memory allocated. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pid.h | 96 +++++++++++++++++++++++++++++++++++++++++++-------- include/linux/sched.h | 4 +-- 2 files changed, 83 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index 5b9082cc600f..29960b03bef7 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -1,6 +1,8 @@ #ifndef _LINUX_PID_H #define _LINUX_PID_H +#include + enum pid_type { PIDTYPE_PID, @@ -9,45 +11,109 @@ enum pid_type PIDTYPE_MAX }; +/* + * What is struct pid? + * + * A struct pid is the kernel's internal notion of a process identifier. + * It refers to individual tasks, process groups, and sessions. While + * there are processes attached to it the struct pid lives in a hash + * table, so it and then the processes that it refers to can be found + * quickly from the numeric pid value. The attached processes may be + * quickly accessed by following pointers from struct pid. + * + * Storing pid_t values in the kernel and refering to them later has a + * problem. The process originally with that pid may have exited and the + * pid allocator wrapped, and another process could have come along + * and been assigned that pid. + * + * Referring to user space processes by holding a reference to struct + * task_struct has a problem. When the user space process exits + * the now useless task_struct is still kept. A task_struct plus a + * stack consumes around 10K of low kernel memory. More precisely + * this is THREAD_SIZE + sizeof(struct task_struct). By comparison + * a struct pid is about 64 bytes. + * + * Holding a reference to struct pid solves both of these problems. + * It is small so holding a reference does not consume a lot of + * resources, and since a new struct pid is allocated when the numeric + * pid value is reused we don't mistakenly refer to new processes. + */ + struct pid { + atomic_t count; /* Try to keep pid_chain in the same cacheline as nr for find_pid */ int nr; struct hlist_node pid_chain; - /* list of pids with the same nr, only one of them is in the hash */ - struct list_head pid_list; + /* lists of tasks that use this pid */ + struct hlist_head tasks[PIDTYPE_MAX]; + struct rcu_head rcu; }; -#define pid_task(elem, type) \ - list_entry(elem, struct task_struct, pids[type].pid_list) +struct pid_link +{ + struct hlist_node node; + struct pid *pid; +}; + +static inline struct pid *get_pid(struct pid *pid) +{ + if (pid) + atomic_inc(&pid->count); + return pid; +} + +extern void FASTCALL(put_pid(struct pid *pid)); +extern struct task_struct *FASTCALL(pid_task(struct pid *pid, enum pid_type)); +extern struct task_struct *FASTCALL(get_pid_task(struct pid *pid, + enum pid_type)); /* * attach_pid() and detach_pid() must be called with the tasklist_lock * write-held. */ -extern int FASTCALL(attach_pid(struct task_struct *task, enum pid_type type, int nr)); +extern int FASTCALL(attach_pid(struct task_struct *task, + enum pid_type type, int nr)); extern void FASTCALL(detach_pid(struct task_struct *task, enum pid_type)); /* * look up a PID in the hash table. Must be called with the tasklist_lock - * held. + * or rcu_read_lock() held. + */ +extern struct pid *FASTCALL(find_pid(int nr)); + +/* + * Lookup a PID in the hash table, and return with it's count elevated. */ -extern struct pid *FASTCALL(find_pid(enum pid_type, int)); +extern struct pid *find_get_pid(int nr); -extern int alloc_pidmap(void); -extern void FASTCALL(free_pidmap(int)); +extern struct pid *alloc_pid(void); +extern void FASTCALL(free_pid(struct pid *pid)); +#define pid_next(task, type) \ + ((task)->pids[(type)].node.next) + +#define pid_next_task(task, type) \ + hlist_entry(pid_next(task, type), struct task_struct, \ + pids[(type)].node) + + +/* We could use hlist_for_each_entry_rcu here but it takes more arguments + * than the do_each_task_pid/while_each_task_pid. So we roll our own + * to preserve the existing interface. + */ #define do_each_task_pid(who, type, task) \ if ((task = find_task_by_pid_type(type, who))) { \ - prefetch((task)->pids[type].pid_list.next); \ + prefetch(pid_next(task, type)); \ do { #define while_each_task_pid(who, type, task) \ - } while (task = pid_task((task)->pids[type].pid_list.next,\ - type), \ - prefetch((task)->pids[type].pid_list.next), \ - hlist_unhashed(&(task)->pids[type].pid_chain)); \ - } \ + } while (pid_next(task, type) && ({ \ + task = pid_next_task(task, type); \ + rcu_dereference(task); \ + prefetch(pid_next(task, type)); \ + 1; }) ); \ + } #endif /* _LINUX_PID_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 7e0ff5dba986..541f4828f5e7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -760,7 +760,7 @@ struct task_struct { struct task_struct *group_leader; /* threadgroup leader */ /* PID/PID hash table linkage. */ - struct pid pids[PIDTYPE_MAX]; + struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; struct completion *vfork_done; /* for vfork() */ @@ -899,7 +899,7 @@ static inline pid_t process_group(struct task_struct *tsk) */ static inline int pid_alive(struct task_struct *p) { - return p->pids[PIDTYPE_PID].nr != 0; + return p->pids[PIDTYPE_PID].pid != NULL; } extern void free_task(struct task_struct *tsk); -- cgit v1.2.3 From 3e7e241f8c5c87cc3685364feface081c9fa3648 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 31 Mar 2006 02:31:43 -0800 Subject: [PATCH] dcache: Add helper d_hash_and_lookup It is very common to hash a dentry and then to call lookup. If we take fs specific hash functions into account the full hash logic can get ugly. Further full_name_hash as an inline function is almost 100 bytes on x86 so having a non-inline choice in some cases can measurably decrease code size. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index d10bd30c337e..836325ee0931 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -275,6 +275,7 @@ extern void d_move(struct dentry *, struct dentry *); /* appendix may either be NULL or be used for transname suffixes */ extern struct dentry * d_lookup(struct dentry *, struct qstr *); extern struct dentry * __d_lookup(struct dentry *, struct qstr *); +extern struct dentry * d_hash_and_lookup(struct dentry *, struct qstr *); /* validate "insecure" dentry pointer */ extern int d_validate(struct dentry *, struct dentry *); -- cgit v1.2.3 From 6ca017658b1f902c9bba2cc1017e301581f7728d Mon Sep 17 00:00:00 2001 From: Richard Purdie Date: Fri, 31 Mar 2006 02:31:49 -0800 Subject: [PATCH] backlight: Backlight Class Improvements Backlight class attributes are currently easy to implement incorrectly. Moving certain handling into the backlight core prevents this whilst at the same time makes the drivers simpler and consistent. The following changes are included: The brightness attribute only sets and reads the brightness variable in the backlight_properties structure. The power attribute only sets and reads the power variable in the backlight_properties structure. Any framebuffer blanking events change a variable fb_blank in the backlight_properties structure. The backlight driver has only two functions to implement. One function is called when any of the above properties change (to update the backlight brightness), the second is called to return the current backlight brightness value. A new attribute "actual_brightness" is added to return this brightness as determined by the driver having combined all the above factors (and any driver/device specific factors). Additionally, the backlight core takes care of checking the maximum brightness is not exceeded and of turning off the backlight before device removal. The corgi backlight driver is updated to reflect these changes. Signed-off-by: Richard Purdie Signed-off-by: Antonino Daplas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backlight.h | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backlight.h b/include/linux/backlight.h index bb9e54322322..75e91f5b6a04 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -19,20 +19,25 @@ struct fb_info; struct backlight_properties { /* Owner module */ struct module *owner; - /* Get the backlight power status (0: full on, 1..3: power saving - modes; 4: full off), see FB_BLANK_XXX */ - int (*get_power)(struct backlight_device *); - /* Enable or disable power to the LCD (0: on; 4: off, see FB_BLANK_XXX) */ - int (*set_power)(struct backlight_device *, int power); - /* Maximal value for brightness (read-only) */ - int max_brightness; - /* Get current backlight brightness */ + + /* Notify the backlight driver some property has changed */ + int (*update_status)(struct backlight_device *); + /* Return the current backlight brightness (accounting for power, + fb_blank etc.) */ int (*get_brightness)(struct backlight_device *); - /* Set backlight brightness (0..max_brightness) */ - int (*set_brightness)(struct backlight_device *, int brightness); /* Check if given framebuffer device is the one bound to this backlight; return 0 if not, !=0 if it is. If NULL, backlight always matches the fb. */ int (*check_fb)(struct fb_info *); + + /* Current User requested brightness (0 - max_brightness) */ + int brightness; + /* Maximal value for brightness (read-only) */ + int max_brightness; + /* Current FB Power mode (0: full on, 1..3: power saving + modes; 4: full off), see FB_BLANK_XXX */ + int power; + /* FB Blanking active? (values as for power) */ + int fb_blank; }; struct backlight_device { -- cgit v1.2.3 From a536093a2f07007aa572e922752b7491b9ea8ff2 Mon Sep 17 00:00:00 2001 From: "Antonino A. Daplas" Date: Fri, 31 Mar 2006 02:31:54 -0800 Subject: [PATCH] fbcon: Fix big-endian bogosity in slow_imageblit() The monochrome->color expansion routine that handles bitmaps which have (widths % 8) != 0 (slow_imageblit) produces corrupt characters in big-endian. This is caused by a bogus bit test in slow_imageblit(). Fix. This patch may deserve to go to the stable tree. The code has already been well tested in little-endian machines. It's only in big-endian where there is uncertainty and Herbert confirmed that this is the correct way to go. It should not introduce regressions. Signed-off-by: Antonino Daplas Acked-by: Herbert Poetzl Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index d03fadfcafe3..315d89740ddf 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -839,12 +839,10 @@ struct fb_info { #define FB_LEFT_POS(bpp) (32 - bpp) #define FB_SHIFT_HIGH(val, bits) ((val) >> (bits)) #define FB_SHIFT_LOW(val, bits) ((val) << (bits)) -#define FB_BIT_NR(b) (7 - (b)) #else #define FB_LEFT_POS(bpp) (0) #define FB_SHIFT_HIGH(val, bits) ((val) << (bits)) #define FB_SHIFT_LOW(val, bits) ((val) >> (bits)) -#define FB_BIT_NR(b) (b) #endif /* -- cgit v1.2.3 From a244e1698ae3609cdfe24088e1293593cb7a5278 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 31 Mar 2006 02:32:11 -0800 Subject: [PATCH] fs/namei.c: make lookup_hash() static As announced, lookup_hash() can now become static. Signed-off-by: Adrian Bunk Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/namei.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index e6698013e4d0..58cb3d3d44b4 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -75,7 +75,6 @@ extern struct file *nameidata_to_filp(struct nameidata *nd, int flags); extern void release_open_intent(struct nameidata *); extern struct dentry * lookup_one_len(const char *, struct dentry *, int); -extern __deprecated_for_modules struct dentry * lookup_hash(struct nameidata *); extern int follow_down(struct vfsmount **, struct dentry **); extern int follow_up(struct vfsmount **, struct dentry **); -- cgit v1.2.3 From e1211e3fa7fd05ff0d4f597fd37e40de8acc6784 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 1 Apr 2006 01:38:18 +0900 Subject: [PATCH] libata: implement ata_dev_enabled and disabled() This patch renames ata_dev_present() to ata_dev_enabled() and adds ata_dev_disabled(). This is to discern the state where a device is present but disabled from not-present state. This disctinction is necessary when configuring transfer mode because device selection timing must not be violated even if a device fails to configure. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- include/linux/libata.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 0d61357604d5..c6883ba8cba9 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -672,14 +672,24 @@ static inline unsigned int ata_tag_valid(unsigned int tag) return (tag < ATA_MAX_QUEUE) ? 1 : 0; } -static inline unsigned int ata_class_present(unsigned int class) +static inline unsigned int ata_class_enabled(unsigned int class) { return class == ATA_DEV_ATA || class == ATA_DEV_ATAPI; } -static inline unsigned int ata_dev_present(const struct ata_device *dev) +static inline unsigned int ata_class_disabled(unsigned int class) { - return ata_class_present(dev->class); + return class == ATA_DEV_ATA_UNSUP || class == ATA_DEV_ATAPI_UNSUP; +} + +static inline unsigned int ata_dev_enabled(const struct ata_device *dev) +{ + return ata_class_enabled(dev->class); +} + +static inline unsigned int ata_dev_disabled(const struct ata_device *dev) +{ + return ata_class_disabled(dev->class); } static inline u8 ata_chk_status(struct ata_port *ap) -- cgit v1.2.3