From 70b50f94f1644e2aa7cb374819cfd93f3c28d725 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:36:59 -0700 Subject: mm: thp: tail page refcounting fix Michel while working on the working set estimation code, noticed that calling get_page_unless_zero() on a random pfn_to_page(random_pfn) wasn't safe, if the pfn ended up being a tail page of a transparent hugepage under splitting by __split_huge_page_refcount(). He then found the problem could also theoretically materialize with page_cache_get_speculative() during the speculative radix tree lookups that uses get_page_unless_zero() in SMP if the radix tree page is freed and reallocated and get_user_pages is called on it before page_cache_get_speculative has a chance to call get_page_unless_zero(). So the best way to fix the problem is to keep page_tail->_count zero at all times. This will guarantee that get_page_unless_zero() can never succeed on any tail page. page_tail->_mapcount is guaranteed zero and is unused for all tail pages of a compound page, so we can simply account the tail page references there and transfer them to tail_page->_count in __split_huge_page_refcount() (in addition to the head_page->_mapcount). While debugging this s/_count/_mapcount/ change I also noticed get_page is called by direct-io.c on pages returned by get_user_pages. That wasn't entirely safe because the two atomic_inc in get_page weren't atomic. As opposed to other get_user_page users like secondary-MMU page fault to establish the shadow pagetables would never call any superflous get_page after get_user_page returns. It's safer to make get_page universally safe for tail pages and to use get_page_foll() within follow_page (inside get_user_pages()). get_page_foll() is safe to do the refcounting for tail pages without taking any locks because it is run within PT lock protected critical sections (PT lock for pte and page_table_lock for pmd_trans_huge). The standard get_page() as invoked by direct-io instead will now take the compound_lock but still only for tail pages. The direct-io paths are usually I/O bound and the compound_lock is per THP so very finegrined, so there's no risk of scalability issues with it. A simple direct-io benchmarks with all lockdep prove locking and spinlock debugging infrastructure enabled shows identical performance and no overhead. So it's worth it. Ideally direct-io should stop calling get_page() on pages returned by get_user_pages(). The spinlock in get_page() is already optimized away for no-THP builds but doing get_page() on tail pages returned by GUP is generally a rare operation and usually only run in I/O paths. This new refcounting on page_tail->_mapcount in addition to avoiding new RCU critical sections will also allow the working set estimation code to work without any further complexity associated to the tail page refcounting with THP. Signed-off-by: Andrea Arcangeli Reported-by: Michel Lespinasse Reviewed-by: Michel Lespinasse Reviewed-by: Minchan Kim Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 56 +++++++++++++++++++----------------------------- include/linux/mm_types.h | 21 ++++++++++++++---- 2 files changed, 39 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b3e3b8bb706..f81b7b41930c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -356,36 +356,39 @@ static inline struct page *compound_head(struct page *page) return page; } +/* + * The atomic page->_mapcount, starts from -1: so that transitions + * both from it and to it can be tracked, using atomic_inc_and_test + * and atomic_add_negative(-1). + */ +static inline void reset_page_mapcount(struct page *page) +{ + atomic_set(&(page)->_mapcount, -1); +} + +static inline int page_mapcount(struct page *page) +{ + return atomic_read(&(page)->_mapcount) + 1; +} + static inline int page_count(struct page *page) { return atomic_read(&compound_head(page)->_count); } +extern bool __get_page_tail(struct page *page); + static inline void get_page(struct page *page) { + if (unlikely(PageTail(page))) + if (likely(__get_page_tail(page))) + return; /* * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. Only if - * we're getting a tail page, the elevated page->_count is - * required only in the head page, so for tail pages the - * bugcheck only verifies that the page->_count isn't - * negative. + * requires to already have an elevated page->_count. */ - VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); + VM_BUG_ON(atomic_read(&page->_count) <= 0); atomic_inc(&page->_count); - /* - * Getting a tail page will elevate both the head and tail - * page->_count(s). - */ - if (unlikely(PageTail(page))) { - /* - * This is safe only because - * __split_huge_page_refcount can't run under - * get_page(). - */ - VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); - atomic_inc(&page->first_page->_count); - } } static inline struct page *virt_to_head_page(const void *x) @@ -803,21 +806,6 @@ static inline pgoff_t page_index(struct page *page) return page->index; } -/* - * The atomic page->_mapcount, like _count, starts from -1: - * so that transitions both from it and to it can be tracked, - * using atomic_inc_and_test and atomic_add_negative(-1). - */ -static inline void reset_page_mapcount(struct page *page) -{ - atomic_set(&(page)->_mapcount, -1); -} - -static inline int page_mapcount(struct page *page) -{ - return atomic_read(&(page)->_mapcount) + 1; -} - /* * Return true if this page is mapped into pagetables. */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3e01a19a91e8..5b42f1b34eb7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -62,10 +62,23 @@ struct page { struct { union { - atomic_t _mapcount; /* Count of ptes mapped in mms, - * to show when page is mapped - * & limit reverse map searches. - */ + /* + * Count of ptes mapped in + * mms, to show when page is + * mapped & limit reverse map + * searches. + * + * Used also for tail pages + * refcounting instead of + * _count. Tail pages cannot + * be mapped and keeping the + * tail page _count zero at + * all times guarantees + * get_page_unless_zero() will + * never succeed on tail + * pages. + */ + atomic_t _mapcount; struct { unsigned inuse:16; -- cgit v1.2.3 From b35a35b556f5e6b7993ad0baf20173e75c09ce8c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:36 -0700 Subject: thp: share get_huge_page_tail() This avoids duplicating the function in every arch gup_fast. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f81b7b41930c..3dc3a8c2c485 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -376,6 +376,17 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + extern bool __get_page_tail(struct page *page); static inline void get_page(struct page *page) -- cgit v1.2.3 From b6eb48d02dc73d19bebc396a9e92dd64a65d3199 Mon Sep 17 00:00:00 2001 From: Sami Kerola Date: Wed, 2 Nov 2011 13:37:58 -0700 Subject: minix: describe usage of different magic numbers One can get this information from minix/inode.c, but adding the explanations at the definition sites is more appropriate. Signed-off-by: Sami Kerola Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/magic.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/magic.h b/include/linux/magic.h index 1e5df2af8d84..2d4beab0d5b7 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -30,11 +30,11 @@ #define ANON_INODE_FS_MAGIC 0x09041934 #define PSTOREFS_MAGIC 0x6165676C -#define MINIX_SUPER_MAGIC 0x137F /* original minix fs */ -#define MINIX_SUPER_MAGIC2 0x138F /* minix fs, 30 char names */ -#define MINIX2_SUPER_MAGIC 0x2468 /* minix V2 fs */ -#define MINIX2_SUPER_MAGIC2 0x2478 /* minix V2 fs, 30 char names */ -#define MINIX3_SUPER_MAGIC 0x4d5a /* minix V3 fs */ +#define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ +#define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */ +#define MINIX2_SUPER_MAGIC 0x2468 /* minix v2 fs, 14 char names */ +#define MINIX2_SUPER_MAGIC2 0x2478 /* minix v2 fs, 30 char names */ +#define MINIX3_SUPER_MAGIC 0x4d5a /* minix v3 fs, 60 char names */ #define MSDOS_SUPER_MAGIC 0x4d44 /* MD */ #define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */ -- cgit v1.2.3 From c0ff4b8540a5c158b8e5bafb7d767298b67b0b92 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Wed, 2 Nov 2011 13:38:15 -0700 Subject: memcg: rename mem variable to memcg The memcg code sometimes uses "struct mem_cgroup *mem" and sometimes uses "struct mem_cgroup *memcg". Rename all mem variables to memcg in source file. Signed-off-by: Raghavendra K T Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ac797fa03ef8..05206aac5965 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -78,8 +78,8 @@ extern void mem_cgroup_uncharge_end(void); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page); -extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); +extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask); +int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); @@ -88,19 +88,19 @@ extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm); static inline int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; rcu_read_lock(); - mem = mem_cgroup_from_task(rcu_dereference((mm)->owner)); + memcg = mem_cgroup_from_task(rcu_dereference((mm)->owner)); rcu_read_unlock(); - return cgroup == mem; + return cgroup == memcg; } -extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem); +extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); extern int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask); -extern void mem_cgroup_end_migration(struct mem_cgroup *mem, +extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok); /* @@ -148,7 +148,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask, unsigned long *total_scanned); -u64 mem_cgroup_get_limit(struct mem_cgroup *mem); +u64 mem_cgroup_get_limit(struct mem_cgroup *memcg); void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -244,18 +244,20 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm return NULL; } -static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem) +static inline int mm_match_cgroup(struct mm_struct *mm, + struct mem_cgroup *memcg) { return 1; } static inline int task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *mem) + const struct mem_cgroup *memcg) { return 1; } -static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) +static inline struct cgroup_subsys_state + *mem_cgroup_css(struct mem_cgroup *memcg) { return NULL; } @@ -267,22 +269,22 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, return 0; } -static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, +static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok) { } -static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) +static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg) { return 0; } -static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, +static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *memcg, int priority) { } -static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, +static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *memcg, int priority) { } @@ -348,7 +350,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, } static inline -u64 mem_cgroup_get_limit(struct mem_cgroup *mem) +u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) { return 0; } -- cgit v1.2.3 From 9b272977e3b99a8699361d214b51f98c8a9e0e7b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 2 Nov 2011 13:38:23 -0700 Subject: memcg: skip scanning active lists based on individual size Reclaim decides to skip scanning an active list when the corresponding inactive list is above a certain size in comparison to leave the assumed working set alone while there are still enough reclaim candidates around. The memcg implementation of comparing those lists instead reports whether the whole memcg is low on the requested type of inactive pages, considering all nodes and zones. This can lead to an oversized active list not being scanned because of the state of the other lists in the memcg, as well as an active list being scanned while its corresponding inactive list has enough pages. Not only is this wrong, it's also a scalability hazard, because the global memory state over all nodes and zones has to be gathered for each memcg and zone scanned. Make these calculations purely based on the size of the two LRU lists that are actually affected by the outcome of the decision. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: KOSAKI Motohiro Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Reviewed-by: Minchan Kim Reviewed-by: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 05206aac5965..b87068a1a09e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -106,8 +106,10 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, /* * For memory reclaim. */ -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); -int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg); +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, + struct zone *zone); +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, + struct zone *zone); int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, unsigned int lrumask); @@ -295,13 +297,13 @@ static inline bool mem_cgroup_disabled(void) } static inline int -mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) +mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) { return 1; } static inline int -mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) { return 1; } -- cgit v1.2.3 From e57940d719e9fc5223d133b631f8cb5232d6064e Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 2 Nov 2011 13:38:54 -0700 Subject: ipc/sem.c: remove private structures from public header file include/linux/sem.h contains several structures that are only used within ipc/sem.c. The patch moves them into ipc/sem.c - there is no need to expose the structures to the whole kernel. No functional changes, only whitespace cleanups and 80-char per line fixes. Signed-off-by: Manfred Spraul Acked-by: Peter Zijlstra Cc: Thomas Gleixner Cc: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sem.h | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sem.h b/include/linux/sem.h index 1feb2de2ee57..464842621a4a 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -83,13 +83,6 @@ struct seminfo { struct task_struct; -/* One semaphore structure for each semaphore in the system. */ -struct sem { - int semval; /* current value */ - int sempid; /* pid of last operation */ - struct list_head sem_pending; /* pending single-sop operations */ -}; - /* One sem_array data structure for each set of semaphores in the system. */ struct sem_array { struct kern_ipc_perm ____cacheline_aligned_in_smp @@ -103,41 +96,6 @@ struct sem_array { int complex_count; /* pending complex operations */ }; -/* One queue for each sleeping process in the system. */ -struct sem_queue { - struct list_head simple_list; /* queue of pending operations */ - struct list_head list; /* queue of pending operations */ - struct task_struct *sleeper; /* this process */ - struct sem_undo *undo; /* undo structure */ - int pid; /* process id of requesting process */ - int status; /* completion status of operation */ - struct sembuf *sops; /* array of pending operations */ - int nsops; /* number of operations */ - int alter; /* does the operation alter the array? */ -}; - -/* Each task has a list of undo requests. They are executed automatically - * when the process exits. - */ -struct sem_undo { - struct list_head list_proc; /* per-process list: all undos from one process. */ - /* rcu protected */ - struct rcu_head rcu; /* rcu struct for sem_undo() */ - struct sem_undo_list *ulp; /* sem_undo_list for the process */ - struct list_head list_id; /* per semaphore array list: all undos for one array */ - int semid; /* semaphore set identifier */ - short * semadj; /* array of adjustments, one per semaphore */ -}; - -/* sem_undo_list controls shared access to the list of sem_undo structures - * that may be shared among all a CLONE_SYSVSEM task group. - */ -struct sem_undo_list { - atomic_t refcnt; - spinlock_t lock; - struct list_head list_proc; -}; - struct sysv_sem { struct sem_undo_list *undo_list; }; -- cgit v1.2.3 From f567a18590742b811287b7512fb0908deac4eef7 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 2 Nov 2011 13:38:56 -0700 Subject: include/linux/sem.h: make sysv_sem empty if SYSVIPC is disabled For the sysvsem undo, each task struct contains a sysv_sem structure with a pointer to the undo information. This pointer is only necessary if sysvipc is enabled - thus the pointer can be made conditional on CONFIG_SYSVIPC. Signed-off-by: Manfred Spraul Acked-by: Peter Zijlstra Cc: Thomas Gleixner Cc: Mike Galbraith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sem.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sem.h b/include/linux/sem.h index 464842621a4a..10d6b226afc5 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -96,16 +96,21 @@ struct sem_array { int complex_count; /* pending complex operations */ }; +#ifdef CONFIG_SYSVIPC + struct sysv_sem { struct sem_undo_list *undo_list; }; -#ifdef CONFIG_SYSVIPC - extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); extern void exit_sem(struct task_struct *tsk); #else + +struct sysv_sem { + /* empty */ +}; + static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) { return 0; -- cgit v1.2.3 From 48618fb4e522d9d02e217ac05f52749545c1af20 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Wed, 2 Nov 2011 13:39:09 -0700 Subject: RapidIO: add mport driver for Tsi721 bridge Add RapidIO mport driver for IDT TSI721 PCI Express-to-SRIO bridge device. The driver provides full set of callback functions defined for mport devices in RapidIO subsystem. It also is compatible with current version of RIONET driver (Ethernet over RapidIO messaging services). This patch is applicable to kernel versions starting from 2.6.39. Signed-off-by: Alexandre Bounine Signed-off-by: Chul Kim Cc: Kumar Gala Cc: Matt Porter Cc: Li Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rio_ids.h b/include/linux/rio_ids.h index 0cee0152aca9..b66d13d1bdc0 100644 --- a/include/linux/rio_ids.h +++ b/include/linux/rio_ids.h @@ -39,5 +39,6 @@ #define RIO_DID_IDTCPS1616 0x0379 #define RIO_DID_IDTVPS1616 0x0377 #define RIO_DID_IDTSPS1616 0x0378 +#define RIO_DID_TSI721 0x80ab #endif /* LINUX_RIO_IDS_H */ -- cgit v1.2.3 From f1ecf06854a66ee663f4d4cf029c78cd62a15e04 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 2 Nov 2011 13:39:22 -0700 Subject: sysctl: add support for poll() Adding support for poll() in sysctl fs allows userspace to receive notifications of changes in sysctl entries. This adds a infrastructure to allow files in sysctl fs to be pollable and implements it for hostname and domainname. [akpm@linux-foundation.org: s/declare/define/ for definitions] Signed-off-by: Lucas De Marchi Cc: Greg KH Cc: Kay Sievers Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 22 ++++++++++++++++++++++ include/linux/utsname.h | 16 ++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 9a1ec10fd504..703cfa33a3ca 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -931,6 +931,7 @@ enum #ifdef __KERNEL__ #include #include +#include /* For the /proc/sys support */ struct ctl_table; @@ -1011,6 +1012,26 @@ extern int proc_do_large_bitmap(struct ctl_table *, int, * cover common cases. */ +/* Support for userspace poll() to watch for changes */ +struct ctl_table_poll { + atomic_t event; + wait_queue_head_t wait; +}; + +static inline void *proc_sys_poll_event(struct ctl_table_poll *poll) +{ + return (void *)(unsigned long)atomic_read(&poll->event); +} + +void proc_sys_poll_notify(struct ctl_table_poll *poll); + +#define __CTL_TABLE_POLL_INITIALIZER(name) { \ + .event = ATOMIC_INIT(0), \ + .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) } + +#define DEFINE_CTL_TABLE_POLL(name) \ + struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name) + /* A sysctl table is an array of struct ctl_table: */ struct ctl_table { @@ -1021,6 +1042,7 @@ struct ctl_table struct ctl_table *child; struct ctl_table *parent; /* Automatically set */ proc_handler *proc_handler; /* Callback for text formatting */ + struct ctl_table_poll *poll; void *extra1; void *extra2; }; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 4e5b0213fdc1..c714ed75eae2 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -37,6 +37,14 @@ struct new_utsname { #include #include +enum uts_proc { + UTS_PROC_OSTYPE, + UTS_PROC_OSRELEASE, + UTS_PROC_VERSION, + UTS_PROC_HOSTNAME, + UTS_PROC_DOMAINNAME, +}; + struct user_namespace; extern struct user_namespace init_user_ns; @@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags, } #endif +#ifdef CONFIG_PROC_SYSCTL +extern void uts_proc_notify(enum uts_proc proc); +#else +static inline void uts_proc_notify(enum uts_proc proc) +{ +} +#endif + static inline struct new_utsname *utsname(void) { return ¤t->nsproxy->uts_ns->name; -- cgit v1.2.3 From 842fa69f3e0c9a178b294e7af7c07f4c9d9e7af2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Nov 2011 13:39:33 -0700 Subject: include/linux/dma-mapping.h: add dma_zalloc_coherent() Lots of driver code does a dma_alloc_coherent() and then zeroes out the memory with a memset. Make it easy for them. Cc: Alexandre Bounine Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-mapping.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 347fdc32177a..be86ae13893f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -1,6 +1,7 @@ #ifndef _LINUX_DMA_MAPPING_H #define _LINUX_DMA_MAPPING_H +#include #include #include #include @@ -117,6 +118,15 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) return -EIO; } +static inline void *dma_zalloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret = dma_alloc_coherent(dev, size, dma_handle, flag); + if (ret) + memset(ret, 0, size); + return ret; +} + #ifdef CONFIG_HAS_DMA static inline int dma_get_cache_alignment(void) { -- cgit v1.2.3 From 161520451dfacd0eb79d501933f47d3fb7464938 Mon Sep 17 00:00:00 2001 From: James Nuss Date: Wed, 2 Nov 2011 13:39:38 -0700 Subject: pps: new client driver using GPIO This client driver allows you to use a GPIO pin as a source for PPS signals. Platform data [1] are used to specify the GPIO pin number, label, assert event edge type, and whether clear events are captured. This driver is based on the work by Ricardo Martins who submitted an initial implementation [2] of a PPS IRQ client driver to the linuxpps mailing-list on Dec 3 2010. [1] include/linux/pps-gpio.h [2] http://ml.enneenne.com/pipermail/linuxpps/2010-December/004155.html [akpm@linux-foundation.org: remove unneeded cast of void*] Signed-off-by: James Nuss Cc: Ricardo Martins Acked-by: Rodolfo Giometti Signed-off-by: Ricardo Martins Cc: Alexander Gordeev Cc: Igor Plyatov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pps-gpio.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 include/linux/pps-gpio.h (limited to 'include/linux') diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h new file mode 100644 index 000000000000..0035abe41b9a --- /dev/null +++ b/include/linux/pps-gpio.h @@ -0,0 +1,32 @@ +/* + * pps-gpio.h -- PPS client for GPIOs + * + * + * Copyright (C) 2011 James Nuss + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _PPS_GPIO_H +#define _PPS_GPIO_H + +struct pps_gpio_platform_data { + bool assert_falling_edge; + bool capture_clear; + unsigned int gpio_pin; + const char *gpio_label; +}; + +#endif -- cgit v1.2.3 From 080d676de095a14ecba14c0b9a91acb5bbb634df Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 2 Nov 2011 13:40:10 -0700 Subject: aio: allocate kiocbs in batches In testing aio on a fast storage device, I found that the context lock takes up a fair amount of cpu time in the I/O submission path. The reason is that we take it for every I/O submitted (see __aio_get_req). Since we know how many I/Os are passed to io_submit, we can preallocate the kiocbs in batches, reducing the number of times we take and release the lock. In my testing, I was able to reduce the amount of time spent in _raw_spin_lock_irq by .56% (average of 3 runs). The command I used to test this was: aio-stress -O -o 2 -o 3 -r 8 -d 128 -b 32 -i 32 -s 16384 I also tested the patch with various numbers of events passed to io_submit, and I ran the xfstests aio group of tests to ensure I didn't break anything. Signed-off-by: Jeff Moyer Cc: Daniel Ehrenberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/aio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/aio.h b/include/linux/aio.h index 2dcb72bff4b6..2314ad8b3c9c 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -117,6 +117,7 @@ struct kiocb { struct list_head ki_list; /* the aio core uses this * for cancellation */ + struct list_head ki_batch; /* batch allocation */ /* * If the aio_resfd field of the userspace iocb is not zero, -- cgit v1.2.3 From c1e2ee2dc436574880758b3836fc96935b774c32 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Wed, 2 Nov 2011 13:40:29 -0700 Subject: memcg: replace ss->id_lock with a rwlock While back-porting Johannes Weiner's patch "mm: memcg-aware global reclaim" for an internal effort, we noticed a significant performance regression during page-reclaim heavy workloads due to high contention of the ss->id_lock. This lock protects idr map, and serializes calls to idr_get_next() in css_get_next() (which is used during the memcg hierarchy walk). Since idr_get_next() is just doing a look up, we need only serialize it with respect to idr_remove()/idr_get_new(). By making the ss->id_lock a rwlock, contention is greatly reduced and performance improves. Tested: cat a 256m file from a ramdisk in a 128m container 50 times on each core (one file + container per core) in parallel on a NUMA machine. Result is the time for the test to complete in 1 of the containers. Both kernels included Johannes' memcg-aware global reclaim patches. Before rwlock patch: 1710.778s After rwlock patch: 152.227s Signed-off-by: Andrew Bresticker Cc: Paul Menage Cc: Li Zefan Acked-by: KAMEZAWA Hiroyuki Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index da7e4bc34e8c..1b7f9d525013 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -516,7 +516,7 @@ struct cgroup_subsys { struct list_head sibling; /* used when use_id == true */ struct idr idr; - spinlock_t id_lock; + rwlock_t id_lock; /* should be defined only by modular subsystems */ struct module *module; -- cgit v1.2.3