From 17d9ddc72fb8bba0d4f67868c9c612e472a594a9 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Wed, 10 Feb 2010 15:23:44 -0800 Subject: rbtree: Add support for augmented rbtrees Add support for augmented rbtrees in core rbtree code. This will be used in subsequent patches, in x86 PAT code, which needs interval trees to efficiently keep track of PAT ranges. Signed-off-by: Venkatesh Pallipadi LKML-Reference: <20100210232343.GA11465@linux-os.sc.intel.com> Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- include/linux/rbtree.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 9c295411d01f..8e33a256ea0e 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -110,6 +110,7 @@ struct rb_node struct rb_root { struct rb_node *rb_node; + void (*augment_cb)(struct rb_node *node); }; @@ -129,7 +130,9 @@ static inline void rb_set_color(struct rb_node *rb, int color) rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; } -#define RB_ROOT (struct rb_root) { NULL, } +#define RB_ROOT (struct rb_root) { NULL, NULL, } +#define RB_AUGMENT_ROOT(x) (struct rb_root) { NULL, x} + #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -- cgit v1.2.3 From 4abc14a733f9002c05623db755aaafdd27fa7a91 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 20 Jan 2010 14:52:23 +0100 Subject: iommu-api: Rename ->{un}map function pointers to ->{un}map_range The new function pointer names match better with the top-level functions of the iommu-api which are using them. Main intention of this change is to make the ->{un}map pointer names free for two new mapping functions. Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3af4ffd591b9..0f18f37a6503 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -36,10 +36,10 @@ struct iommu_ops { void (*domain_destroy)(struct iommu_domain *domain); int (*attach_dev)(struct iommu_domain *domain, struct device *dev); void (*detach_dev)(struct iommu_domain *domain, struct device *dev); - int (*map)(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot); - void (*unmap)(struct iommu_domain *domain, unsigned long iova, - size_t size); + int (*map_range)(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot); + void (*unmap_range)(struct iommu_domain *domain, unsigned long iova, + size_t size); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, unsigned long iova); int (*domain_has_cap)(struct iommu_domain *domain, -- cgit v1.2.3 From cefc53c7f494240d4813c80154c7617452d1904d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 8 Jan 2010 13:35:09 +0100 Subject: iommu-api: Add iommu_map and iommu_unmap functions These two functions provide support for mapping and unmapping physical addresses to io virtual addresses. The difference to the iommu_(un)map_range() is that the new functions take a gfp_order parameter instead of a size. This allows the IOMMU backend implementations to detect easier if a given range can be mapped by larger page sizes. These new functions should replace the old ones in the long term. Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0f18f37a6503..6d0035bb1a0c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -60,6 +60,10 @@ extern int iommu_map_range(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot); extern void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova, size_t size); +extern int iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, int gfp_order, int prot); +extern int iommu_unmap(struct iommu_domain *domain, unsigned long iova, + int gfp_order); extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, unsigned long iova); extern int iommu_domain_has_cap(struct iommu_domain *domain, @@ -108,6 +112,18 @@ static inline void iommu_unmap_range(struct iommu_domain *domain, { } +static inline int iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, int gfp_order, int prot) +{ + return -ENODEV; +} + +static inline int iommu_unmap(struct iommu_domain *domain, unsigned long iova, + int gfp_order) +{ + return -ENODEV; +} + static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, unsigned long iova) { -- cgit v1.2.3 From 67651786948c360c3122b8a17cb1e59209d50880 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 21 Jan 2010 16:32:27 +0100 Subject: iommu-api: Add ->{un}map callbacks to iommu_ops This patch adds new callbacks for mapping and unmapping pages to the iommu_ops structure. These callbacks are aware of page sizes which makes them different to the ->{un}map_range callbacks. Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6d0035bb1a0c..5a7a3d888dac 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -36,6 +36,10 @@ struct iommu_ops { void (*domain_destroy)(struct iommu_domain *domain); int (*attach_dev)(struct iommu_domain *domain, struct device *dev); void (*detach_dev)(struct iommu_domain *domain, struct device *dev); + int (*map)(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, int gfp_order, int prot); + int (*unmap)(struct iommu_domain *domain, unsigned long iova, + int gfp_order); int (*map_range)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot); void (*unmap_range)(struct iommu_domain *domain, unsigned long iova, -- cgit v1.2.3 From 12c7389abe5786349d3ea6da1961cf78d0c1c7cd Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 21 Jan 2010 11:50:28 +0100 Subject: iommu-api: Remove iommu_{un}map_range functions These functions are not longer used and can be removed savely. There functionality is now provided by the iommu_{un}map functions which are also capable of multiple page sizes. Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5a7a3d888dac..be22ad83689c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -40,10 +40,6 @@ struct iommu_ops { phys_addr_t paddr, int gfp_order, int prot); int (*unmap)(struct iommu_domain *domain, unsigned long iova, int gfp_order); - int (*map_range)(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot); - void (*unmap_range)(struct iommu_domain *domain, unsigned long iova, - size_t size); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, unsigned long iova); int (*domain_has_cap)(struct iommu_domain *domain, @@ -60,10 +56,6 @@ extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); extern void iommu_detach_device(struct iommu_domain *domain, struct device *dev); -extern int iommu_map_range(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot); -extern void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova, - size_t size); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, int gfp_order, int prot); extern int iommu_unmap(struct iommu_domain *domain, unsigned long iova, @@ -104,18 +96,6 @@ static inline void iommu_detach_device(struct iommu_domain *domain, { } -static inline int iommu_map_range(struct iommu_domain *domain, - unsigned long iova, phys_addr_t paddr, - size_t size, int prot) -{ - return -ENODEV; -} - -static inline void iommu_unmap_range(struct iommu_domain *domain, - unsigned long iova, size_t size) -{ -} - static inline int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, int gfp_order, int prot) { -- cgit v1.2.3 From ca037701a025334e724e5c61b3b1082940c8b981 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Mar 2010 19:52:12 +0100 Subject: perf, x86: Add PEBS infrastructure This patch implements support for Intel Precise Event Based Sampling, which is an alternative counter mode in which the counter triggers a hardware assist to collect information on events. The hardware assist takes a trap like snapshot of a subset of the machine registers. This data is written to the Intel Debug-Store, which can be programmed with a data threshold at which to raise a PMI. With the PEBS hardware assist being trap like, the reported IP is always one instruction after the actual instruction that triggered the event. This implements a simple PEBS model that always takes a single PEBS event at a time. This is done so that the interaction with the rest of the system is as expected (freq adjust, period randomization, lbr, callchains, etc.). It adds an ABI element: perf_event_attr::precise, which indicates that we wish to use this (constrained, but precise) mode. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: <20100304140100.392111285@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 80acbf3d5de1..42307b50c787 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -203,8 +203,9 @@ struct perf_event_attr { enable_on_exec : 1, /* next exec enables */ task : 1, /* trace fork/exit */ watermark : 1, /* wakeup_watermark */ + precise : 1, /* OoO invariant counter */ - __reserved_1 : 49; + __reserved_1 : 48; union { __u32 wakeup_events; /* wakeup every n events */ -- cgit v1.2.3 From caff2befffe899e63df5cc760b7ed01cfd902685 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Mar 2010 12:02:30 +0100 Subject: perf, x86: Implement simple LBR support Implement simple suport Intel Last-Branch-Record, it supports all hardware that implements FREEZE_LBRS_ON_PMI, but does not (yet) implement the LBR config register. The Intel LBR is a FIFO of From,To addresses describing the last few branches the hardware took. This patch does not add perf interface to the LBR, but merely provides an interface for internal use. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: <20100304140100.544191154@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 42307b50c787..ab4fd9ede264 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -467,6 +467,17 @@ struct perf_raw_record { void *data; }; +struct perf_branch_entry { + __u64 from; + __u64 to; + __u64 flags; +}; + +struct perf_branch_stack { + __u64 nr; + struct perf_branch_entry entries[0]; +}; + struct task_struct; /** -- cgit v1.2.3 From ef21f683a045a79b6aa86ad81e5fdfc0d5ddd250 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Mar 2010 13:12:23 +0100 Subject: perf, x86: use LBR for PEBS IP+1 fixup Use the LBR to fix up the PEBS IP+1 issue. As said, PEBS reports the next instruction, here we use the LBR to find the last branch and from that construct the actual IP. If the IP matches the LBR-TO, we use LBR-FROM, otherwise we use the LBR-TO address as the beginning of the last basic block and decode forward. Once we find a match to the current IP, we use the previous location. This patch introduces a new ABI element: PERF_RECORD_MISC_EXACT, which conveys that the reported IP (PERF_SAMPLE_IP) is the exact instruction that caused the event (barring CPU errata). The fixup can fail due to various reasons: 1) LBR contains invalid data (quite possible) 2) part of the basic block got paged out 3) the reported IP isn't part of the basic block (see 1) Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: "Zhang, Yanmin" Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: <20100304140100.619375431@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ab4fd9ede264..be85f7c4a94f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -294,6 +294,12 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_USER (2 << 0) #define PERF_RECORD_MISC_HYPERVISOR (3 << 0) +#define PERF_RECORD_MISC_EXACT (1 << 14) +/* + * Reserve the last bit to indicate some extended misc field + */ +#define PERF_RECORD_MISC_EXT_RESERVED (1 << 15) + struct perf_event_header { __u32 type; __u16 misc; -- cgit v1.2.3 From 41acab8851a0408c1d5ad6c21a07456f88b54d40 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 10 Mar 2010 23:37:45 -0300 Subject: sched: Implement group scheduler statistics in one struct Put all statistic fields of sched_entity in one struct, sched_statistics, and embed it into sched_entity. This change allows to memset the sched_statistics to 0 when needed (for instance when forking), avoiding bugs of non initialized fields. Signed-off-by: Lucas De Marchi Signed-off-by: Peter Zijlstra LKML-Reference: <1268275065-18542-1-git-send-email-lucas.de.marchi@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 54 ++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b1753f7e48e..8cc863d66477 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1127,36 +1127,8 @@ struct load_weight { unsigned long weight, inv_weight; }; -/* - * CFS stats for a schedulable entity (task, task-group etc) - * - * Current field usage histogram: - * - * 4 se->block_start - * 4 se->run_node - * 4 se->sleep_start - * 6 se->load.weight - */ -struct sched_entity { - struct load_weight load; /* for load-balancing */ - struct rb_node run_node; - struct list_head group_node; - unsigned int on_rq; - - u64 exec_start; - u64 sum_exec_runtime; - u64 vruntime; - u64 prev_sum_exec_runtime; - - u64 last_wakeup; - u64 avg_overlap; - - u64 nr_migrations; - - u64 start_runtime; - u64 avg_wakeup; - #ifdef CONFIG_SCHEDSTATS +struct sched_statistics { u64 wait_start; u64 wait_max; u64 wait_count; @@ -1188,6 +1160,30 @@ struct sched_entity { u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; +}; +#endif + +struct sched_entity { + struct load_weight load; /* for load-balancing */ + struct rb_node run_node; + struct list_head group_node; + unsigned int on_rq; + + u64 exec_start; + u64 sum_exec_runtime; + u64 vruntime; + u64 prev_sum_exec_runtime; + + u64 last_wakeup; + u64 avg_overlap; + + u64 nr_migrations; + + u64 start_runtime; + u64 avg_wakeup; + +#ifdef CONFIG_SCHEDSTATS + struct sched_statistics statistics; #endif #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v1.2.3 From 39c0cbe2150cbd848a25ba6cdb271d1ad46818ad Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:17:13 +0100 Subject: sched: Rate-limit nohz Entering nohz code on every micro-idle is costing ~10% throughput for netperf TCP_RR when scheduling cross-cpu. Rate limiting entry fixes this, but raises ticks a bit. On my Q6600, an idle box goes from ~85 interrupts/sec to 128. The higher the context switch rate, the more nohz entry costs. With this patch and some cycle recovery patches in my tree, max cross cpu context switch rate is improved by ~16%, a large portion of which of which is this ratelimiting. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301003.6785.28.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8cc863d66477..13efe7dac5fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -271,11 +271,17 @@ extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); extern int get_nohz_load_balancer(void); +extern int nohz_ratelimit(int cpu); #else static inline int select_nohz_load_balancer(int cpu) { return 0; } + +static inline int nohz_ratelimit(int cpu) +{ + return 0; +} #endif /* -- cgit v1.2.3 From b42e0c41a422a212ddea0666d5a3a0e3c35206db Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:15:38 +0100 Subject: sched: Remove avg_wakeup Testing the load which led to this heuristic (nfs4 kbuild) shows that it has outlived it's usefullness. With intervening load balancing changes, I cannot see any difference with/without, so recover there fastpath cycles. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301062.6785.29.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 13efe7dac5fa..70c560f5ada0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1185,9 +1185,6 @@ struct sched_entity { u64 nr_migrations; - u64 start_runtime; - u64 avg_wakeup; - #ifdef CONFIG_SCHEDSTATS struct sched_statistics statistics; #endif -- cgit v1.2.3 From e12f31d3e5d36328c7fbd0fce40a95e70b59152c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:15:51 +0100 Subject: sched: Remove avg_overlap Both avg_overlap and avg_wakeup had an inherent problem in that their accuracy was detrimentally affected by cross-cpu wakeups, this because we are missing the necessary call to update_curr(). This can't be fixed without increasing overhead in our already too fat fastpath. Additionally, with recent load balancing changes making us prefer to place tasks in an idle cache domain (which is good for compute bound loads), communicating tasks suffer when a sync wakeup, which would enable affine placement, is turned into a non-sync wakeup by SYNC_LESS. With one task on the runqueue, wake_affine() rejects the affine wakeup request, leaving the unfortunate where placed, taking frequent cache misses. Remove it, and recover some fastpath cycles. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301121.6785.30.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 70c560f5ada0..8604884cee87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1180,9 +1180,6 @@ struct sched_entity { u64 vruntime; u64 prev_sum_exec_runtime; - u64 last_wakeup; - u64 avg_overlap; - u64 nr_migrations; #ifdef CONFIG_SCHEDSTATS -- cgit v1.2.3 From faa4602e47690fb11221e00f9b9697c8dc0d4b19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Mar 2010 14:51:50 +0100 Subject: x86, perf, bts, mm: Delete the never used BTS-ptrace code Support for the PMU's BTS features has been upstreamed in v2.6.32, but we still have the old and disabled ptrace-BTS, as Linus noticed it not so long ago. It's buggy: TIF_DEBUGCTLMSR is trampling all over that MSR without regard for other uses (perf) and doesn't provide the flexibility needed for perf either. Its users are ptrace-block-step and ptrace-bts, since ptrace-bts was never used and ptrace-block-step can be implemented using a much simpler approach. So axe all 3000 lines of it. That includes the *locked_memory*() APIs in mm/mlock.c as well. Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra Cc: Roland McGrath Cc: Oleg Nesterov Cc: Markus Metzger Cc: Steven Rostedt Cc: Andrew Morton LKML-Reference: <20100325135413.938004390@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 12 ------------ include/linux/mm.h | 4 ---- include/linux/ptrace.h | 12 ------------ include/linux/sched.h | 9 --------- 4 files changed, 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 01e6adea07ec..cc12b3c556b3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -504,18 +504,6 @@ extern int ftrace_dump_on_oops; #define INIT_TRACE_RECURSION #endif -#ifdef CONFIG_HW_BRANCH_TRACER - -void trace_hw_branch(u64 from, u64 to); -void trace_hw_branch_oops(void); - -#else /* CONFIG_HW_BRANCH_TRACER */ - -static inline void trace_hw_branch(u64 from, u64 to) {} -static inline void trace_hw_branch_oops(void) {} - -#endif /* CONFIG_HW_BRANCH_TRACER */ - #ifdef CONFIG_FTRACE_SYSCALLS unsigned long arch_syscall_addr(int nr); diff --git a/include/linux/mm.h b/include/linux/mm.h index e70f21beb4b4..c8442b655111 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -19,7 +19,6 @@ struct anon_vma; struct file_ra_state; struct user_struct; struct writeback_control; -struct rlimit; #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -1449,9 +1448,6 @@ int vmemmap_populate_basepages(struct page *start_page, int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); -extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, - size_t size); -extern void refund_locked_memory(struct mm_struct *mm, size_t size); enum mf_flags { MF_COUNT_INCREASED = 1 << 0, diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index e1fb60729979..4272521e29e9 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -345,18 +345,6 @@ static inline void user_single_step_siginfo(struct task_struct *tsk, #define arch_ptrace_stop(code, info) do { } while (0) #endif -#ifndef arch_ptrace_untrace -/* - * Do machine-specific work before untracing child. - * - * This is called for a normal detach as well as from ptrace_exit() - * when the tracing task dies. - * - * Called with write_lock(&tasklist_lock) held. - */ -#define arch_ptrace_untrace(task) do { } while (0) -#endif - extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); diff --git a/include/linux/sched.h b/include/linux/sched.h index dad7f668ebf7..e0447c64af6a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -99,7 +99,6 @@ struct futex_pi_state; struct robust_list_head; struct bio_list; struct fs_struct; -struct bts_context; struct perf_event_context; /* @@ -1272,12 +1271,6 @@ struct task_struct { struct list_head ptraced; struct list_head ptrace_entry; - /* - * This is the tracer handle for the ptrace BTS extension. - * This field actually belongs to the ptracer task. - */ - struct bts_context *bts; - /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; @@ -2123,10 +2116,8 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP -extern void wait_task_context_switch(struct task_struct *p); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else -static inline void wait_task_context_switch(struct task_struct *p) {} static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { -- cgit v1.2.3 From ae832d1e03ac9bf09fb8a07fb37908ab40c7cd0e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Mar 2010 10:57:43 +0800 Subject: tracing: Remove side effect from module tracepoints that caused a GPF Remove the @refcnt argument, because it has side-effects, and arguments with side-effects are not skipped by the jump over disabled instrumentation and are executed even when the tracepoint is disabled. This was also causing a GPF as found by Randy Dunlap: Subject: 2.6.33 GP fault only when built with tracing LKML-Reference: <4BA2B69D.3000309@oracle.com> Note, the current 2.6.34-rc has a fix for the actual cause of the GPF, but this fixes one of its triggers. Tested-by: Randy Dunlap Acked-by: Mathieu Desnoyers Signed-off-by: Li Zefan LKML-Reference: <4BA97FA7.6040406@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/module.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 5e869ffd34aa..393ec39b580a 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -460,8 +460,7 @@ static inline void __module_get(struct module *module) if (module) { preempt_disable(); __this_cpu_inc(module->refptr->count); - trace_module_get(module, _THIS_IP_, - __this_cpu_read(module->refptr->count)); + trace_module_get(module, _THIS_IP_); preempt_enable(); } } @@ -475,8 +474,7 @@ static inline int try_module_get(struct module *module) if (likely(module_is_live(module))) { __this_cpu_inc(module->refptr->count); - trace_module_get(module, _THIS_IP_, - __this_cpu_read(module->refptr->count)); + trace_module_get(module, _THIS_IP_); } else ret = 0; -- cgit v1.2.3 From 66a8cb95ed04025664d1db4e952155ee1dccd048 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 31 Mar 2010 13:21:56 -0400 Subject: ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter Cc: Andi Kleen Cc: Li Zefan Cc: Arnaldo Carvalho de Melo Cc: "Luis Claudio R. Goncalves" Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 5fcc31ed5771..c8297761e414 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -120,9 +120,11 @@ int ring_buffer_write(struct ring_buffer *buffer, unsigned long length, void *data); struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts); +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events); struct ring_buffer_event * -ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts); +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events); struct ring_buffer_iter * ring_buffer_read_start(struct ring_buffer *buffer, int cpu); -- cgit v1.2.3 From bc21b478425ac73f66a5ec0b375a5e0d12d609ce Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 31 Mar 2010 19:49:26 -0400 Subject: tracing: Show the lost events in the trace_pipe output Now that the ring buffer can keep track of where events are lost. Use this information to the output of trace_pipe: hackbench-3588 [001] 1326.701660: lock_acquire: ffffffff816591e0 read rcu_read_lock hackbench-3588 [001] 1326.701661: lock_acquire: ffff88003f4091f0 &(&dentry->d_lock)->rlock hackbench-3588 [001] 1326.701664: lock_release: ffff88003f4091f0 &(&dentry->d_lock)->rlock CPU:1 [LOST 673 EVENTS] hackbench-3588 [001] 1326.702711: kmem_cache_free: call_site=ffffffff81102b85 ptr=ffff880026d96738 hackbench-3588 [001] 1326.702712: lock_release: ffff88003e1480a8 &mm->mmap_sem hackbench-3588 [001] 1326.702713: lock_acquire: ffff88003e1480a8 &mm->mmap_sem Even works with the function graph tracer: 2) ! 170.098 us | } 2) 4.036 us | rcu_irq_exit(); 2) 3.657 us | idle_cpu(); 2) ! 190.301 us | } CPU:2 [LOST 2196 EVENTS] 2) 0.853 us | } /* cancel_dirty_page */ 2) | remove_from_page_cache() { 2) 1.578 us | _raw_spin_lock_irq(); 2) | __remove_from_page_cache() { Note, it does not work with the iterator "trace" file, since it requires the use of consuming the page from the ring buffer to determine how many events were lost, which the iterator does not do. Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c0f4b364c711..39e71b0a3bfd 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -58,6 +58,7 @@ struct trace_iterator { /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; + unsigned long lost_events; int leftover; int cpu; u64 ts; -- cgit v1.2.3 From 897f0b3c3ff40b443c84e271bef19bd6ae885195 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:03 +0100 Subject: sched: Kill the broken and deadlockable cpuset_lock/cpuset_cpus_allowed_locked code This patch just states the fact the cpusets/cpuhotplug interaction is broken and removes the deadlockable code which only pretends to work. - cpuset_lock() doesn't really work. It is needed for cpuset_cpus_allowed_locked() but we can't take this lock in try_to_wake_up()->select_fallback_rq() path. - cpuset_lock() is deadlockable. Suppose that a task T bound to CPU takes callback_mutex. If cpu_down(CPU) happens before T drops callback_mutex stop_machine() preempts T, then migration_call(CPU_DEAD) tries to take cpuset_lock() and hangs forever because CPU is already dead and thus T can't be scheduled. - cpuset_cpus_allowed_locked() is deadlockable too. It takes task_lock() which is not irq-safe, but try_to_wake_up() can be called from irq. Kill them, and change select_fallback_rq() to use cpu_possible_mask, like we currently do without CONFIG_CPUSETS. Also, with or without this patch, with or without CONFIG_CPUSETS, the callers of select_fallback_rq() can race with each other or with set_cpus_allowed() pathes. The subsequent patches try to to fix these problems. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091003.GA9123@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index a5740fc4d04b..eeaaee746bee 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -21,8 +21,6 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); -extern void cpuset_cpus_allowed_locked(struct task_struct *p, - struct cpumask *mask); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -69,9 +67,6 @@ struct seq_file; extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); -extern void cpuset_lock(void); -extern void cpuset_unlock(void); - extern int cpuset_mem_spread_node(void); static inline int cpuset_do_page_mem_spread(void) @@ -105,11 +100,6 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, { cpumask_copy(mask, cpu_possible_mask); } -static inline void cpuset_cpus_allowed_locked(struct task_struct *p, - struct cpumask *mask) -{ - cpumask_copy(mask, cpu_possible_mask); -} static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { @@ -157,9 +147,6 @@ static inline void cpuset_task_status_allowed(struct seq_file *m, { } -static inline void cpuset_lock(void) {} -static inline void cpuset_unlock(void) {} - static inline int cpuset_mem_spread_node(void) { return 0; -- cgit v1.2.3 From 6a1bdc1b577ebcb65f6603c57f8347309bc4ab13 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:23 +0100 Subject: sched: _cpu_down(): Don't play with current->cpus_allowed _cpu_down() changes the current task's affinity and then recovers it at the end. The problems are well known: we can't restore old_allowed if it was bound to the now-dead-cpu, and we can race with the userspace which can change cpu-affinity during unplug. _cpu_down() should not play with current->cpus_allowed at all. Instead, take_cpu_down() can migrate the caller of _cpu_down() after __cpu_disable() removes the dying cpu from cpu_online_mask. Signed-off-by: Oleg Nesterov Acked-by: Rafael J. Wysocki Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091023.GA9148@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 43c945152732..8bea40725c76 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1843,6 +1843,7 @@ extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #ifdef CONFIG_HOTPLUG_CPU +extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p); extern void idle_task_exit(void); #else static inline void idle_task_exit(void) {} -- cgit v1.2.3 From 9084bb8246ea935b98320554229e2f371f7f52fa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:27 +0100 Subject: sched: Make select_fallback_rq() cpuset friendly Introduce cpuset_cpus_allowed_fallback() helper to fix the cpuset problems with select_fallback_rq(). It can be called from any context and can't use any cpuset locks including task_lock(). It is called when the task doesn't have online cpus in ->cpus_allowed but ttwu/etc must be able to find a suitable cpu. I am not proud of this patch. Everything which needs such a fat comment can't be good even if correct. But I'd prefer to not change the locking rules in the code I hardly understand, and in any case I believe this simple change make the code much more correct compared to deadlocks we currently have. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091027.GA9155@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index eeaaee746bee..a73454aec333 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -21,6 +21,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); +extern int cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -101,6 +102,12 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_copy(mask, cpu_possible_mask); } +static inline int cpuset_cpus_allowed_fallback(struct task_struct *p) +{ + cpumask_copy(&p->cpus_allowed, cpu_possible_mask); + return cpumask_any(cpu_active_mask); +} + static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; -- cgit v1.2.3 From 0017d735092844118bef006696a750a0e4ef6ebd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Mar 2010 18:34:10 +0100 Subject: sched: Fix TASK_WAKING vs fork deadlock Oleg noticed a few races with the TASK_WAKING usage on fork. - since TASK_WAKING is basically a spinlock, it should be IRQ safe - since we set TASK_WAKING (*) without holding rq->lock it could be there still is a rq->lock holder, thereby not actually providing full serialization. (*) in fact we clear PF_STARTING, which in effect enables TASK_WAKING. Cure the second issue by not setting TASK_WAKING in sched_fork(), but only temporarily in wake_up_new_task() while calling select_task_rq(). Cure the first by holding rq->lock around the select_task_rq() call, this will disable IRQs, this however requires that we push down the rq->lock release into select_task_rq_fair()'s cgroup stuff. Because select_task_rq_fair() still needs to drop the rq->lock we cannot fully get rid of TASK_WAKING. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8bea40725c76..fb6c18843ee8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1046,7 +1046,8 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + int (*select_task_rq)(struct rq *rq, struct task_struct *p, + int sd_flag, int flags); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); -- cgit v1.2.3 From 371fd7e7a56a5c136d31aa980011bd2f131c3ef5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Mar 2010 16:38:48 +0100 Subject: sched: Add enqueue/dequeue flags In order to reduce the dependency on TASK_WAKING rework the enqueue interface to support a proper flags field. Replace the int wakeup, bool head arguments with an int flags argument and create the following flags: ENQUEUE_WAKEUP - the enqueue is a wakeup of a sleeping task, ENQUEUE_WAKING - the enqueue has relative vruntime due to having sched_class::task_waking() called, ENQUEUE_HEAD - the waking task should be places on the head of the priority queue (where appropriate). For symmetry also convert sched_class::dequeue() to a flags scheme. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index fb6c18843ee8..e3e900f318d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1032,12 +1032,17 @@ struct sched_domain; #define WF_SYNC 0x01 /* waker goes to sleep after wakup */ #define WF_FORK 0x02 /* child wakeup after fork */ +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_WAKING 2 +#define ENQUEUE_HEAD 4 + +#define DEQUEUE_SLEEP 1 + struct sched_class { const struct sched_class *next; - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup, - bool head); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); -- cgit v1.2.3 From 1527bc8b928dd1399c3d3467dd47d9ede210978a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 Feb 2010 15:03:07 +0100 Subject: bitops: Optimize hweight() by making use of compile-time evaluation Rename the extisting runtime hweight() implementations to __arch_hweight(), rename the compile-time versions to __const_hweight() and then have hweight() pick between them. Suggested-by: H. Peter Anvin Signed-off-by: Peter Zijlstra LKML-Reference: <20100318111929.GB11152@aftab> Acked-by: H. Peter Anvin LKML-Reference: <1265028224.24455.154.camel@laptop> Signed-off-by: H. Peter Anvin --- include/linux/bitops.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index b79389879238..c55d5bc4ee58 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -47,31 +47,6 @@ static inline unsigned long hweight_long(unsigned long w) return sizeof(w) == 4 ? hweight32(w) : hweight64(w); } -/* - * Clearly slow versions of the hweightN() functions, their benefit is - * of course compile time evaluation of constant arguments. - */ -#define HWEIGHT8(w) \ - ( BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + \ - (!!((w) & (1ULL << 0))) + \ - (!!((w) & (1ULL << 1))) + \ - (!!((w) & (1ULL << 2))) + \ - (!!((w) & (1ULL << 3))) + \ - (!!((w) & (1ULL << 4))) + \ - (!!((w) & (1ULL << 5))) + \ - (!!((w) & (1ULL << 6))) + \ - (!!((w) & (1ULL << 7))) ) - -#define HWEIGHT16(w) (HWEIGHT8(w) + HWEIGHT8((w) >> 8)) -#define HWEIGHT32(w) (HWEIGHT16(w) + HWEIGHT16((w) >> 16)) -#define HWEIGHT64(w) (HWEIGHT32(w) + HWEIGHT32((w) >> 32)) - -/* - * Type invariant version that simply casts things to the - * largest type. - */ -#define HWEIGHT(w) HWEIGHT64((u64)(w)) - /** * rol32 - rotate a 32-bit value left * @word: value to rotate -- cgit v1.2.3 From 6dad2a29646ce3792c40cfc52d77e9b65a7bb143 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 31 Mar 2010 21:56:46 +0200 Subject: cpufreq: Unify sysfs attribute definition macros Multiple modules used to define those which are with identical functionality and were needlessly replicated among the different cpufreq drivers. Push them into the header and remove duplication. Signed-off-by: Borislav Petkov LKML-Reference: <1270065406-1814-7-git-send-email-bp@amd64.org> Reviewed-by: Thomas Renninger Signed-off-by: H. Peter Anvin --- include/linux/cpufreq.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 4de02b10007f..9f15150ce8d6 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -278,6 +278,27 @@ struct freq_attr { ssize_t (*store)(struct cpufreq_policy *, const char *, size_t count); }; +#define cpufreq_freq_attr_ro(_name) \ +static struct freq_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define cpufreq_freq_attr_ro_perm(_name, _perm) \ +static struct freq_attr _name = \ +__ATTR(_name, _perm, show_##_name, NULL) + +#define cpufreq_freq_attr_ro_old(_name) \ +static struct freq_attr _name##_old = \ +__ATTR(_name, 0444, show_##_name##_old, NULL) + +#define cpufreq_freq_attr_rw(_name) \ +static struct freq_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +#define cpufreq_freq_attr_rw_old(_name) \ +static struct freq_attr _name##_old = \ +__ATTR(_name, 0644, show_##_name##_old, store_##_name##_old) + + struct global_attr { struct attribute attr; ssize_t (*show)(struct kobject *kobj, @@ -286,6 +307,15 @@ struct global_attr { const char *c, size_t count); }; +#define define_one_global_ro(_name) \ +static struct global_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define define_one_global_rw(_name) \ +static struct global_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + + /********************************************************************* * CPUFREQ 2.6. INTERFACE * *********************************************************************/ -- cgit v1.2.3 From 76e1d9047e4edefb8ada20aa90d5762306082bd6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 5 Apr 2010 15:35:57 +0200 Subject: perf: Store active software events in a hashlist Each time a software event triggers, we need to walk through the entire list of events from the current cpu and task contexts to retrieve a running perf event that matches. We also need to check a matching perf event is actually counting. This walk is wasteful and makes the event fast path scaling down with a growing number of events running on the same contexts. To solve this, we store the running perf events in a hashlist to get an immediate access to them against their type:event_id when they trigger. v2: - Fix SWEVENT_HLIST_SIZE definition (and re-learn some basic maths along the way) - Only allocate hlist for online cpus, but keep track of the refcount on offline possible cpus too, so that we allocate it if needed when it becomes online. - Drop the kref use as it's not adapted to our tricks anymore. v3: - Fix bad refcount check (address instead of value). Thanks to Eric Dumazet who spotted this. - While exiting cpu, move the hlist release out of the IPI path to lock the hlist mutex sanely. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar --- include/linux/perf_event.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6e96cc8225d4..bf896d0b2e9c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -589,6 +589,14 @@ enum perf_group_flag { PERF_GROUP_SOFTWARE = 0x1, }; +#define SWEVENT_HLIST_BITS 8 +#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) + +struct swevent_hlist { + struct hlist_head heads[SWEVENT_HLIST_SIZE]; + struct rcu_head rcu_head; +}; + /** * struct perf_event - performance event kernel representation: */ @@ -597,6 +605,7 @@ struct perf_event { struct list_head group_entry; struct list_head event_entry; struct list_head sibling_list; + struct hlist_node hlist_entry; int nr_siblings; int group_flags; struct perf_event *group_leader; @@ -744,6 +753,9 @@ struct perf_cpu_context { int active_oncpu; int max_pertask; int exclusive; + struct swevent_hlist *swevent_hlist; + struct mutex hlist_mutex; + int hlist_refcount; /* * Recursion avoidance: -- cgit v1.2.3 From 39447b386c846bbf1c56f6403c5282837486200f Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Mon, 19 Apr 2010 13:32:41 +0800 Subject: perf: Enhance perf to allow for guest statistic collection from host Below patch introduces perf_guest_info_callbacks and related register/unregister functions. Add more PERF_RECORD_MISC_XXX bits meaning guest kernel and guest user space. Signed-off-by: Zhang Yanmin Signed-off-by: Avi Kivity --- include/linux/perf_event.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index bf896d0b2e9c..24de5f181a41 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -288,11 +288,13 @@ struct perf_event_mmap_page { __u64 data_tail; /* user-space written tail */ }; -#define PERF_RECORD_MISC_CPUMODE_MASK (3 << 0) +#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_RECORD_MISC_KERNEL (1 << 0) #define PERF_RECORD_MISC_USER (2 << 0) #define PERF_RECORD_MISC_HYPERVISOR (3 << 0) +#define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) +#define PERF_RECORD_MISC_GUEST_USER (5 << 0) #define PERF_RECORD_MISC_EXACT (1 << 14) /* @@ -446,6 +448,12 @@ enum perf_callchain_context { # include #endif +struct perf_guest_info_callbacks { + int (*is_in_guest) (void); + int (*is_user_mode) (void); + unsigned long (*get_guest_ip) (void); +}; + #ifdef CONFIG_HAVE_HW_BREAKPOINT #include #endif @@ -932,6 +940,12 @@ static inline void perf_event_mmap(struct vm_area_struct *vma) __perf_event_mmap(vma); } +extern struct perf_guest_info_callbacks *perf_guest_cbs; +extern int perf_register_guest_info_callbacks( + struct perf_guest_info_callbacks *); +extern int perf_unregister_guest_info_callbacks( + struct perf_guest_info_callbacks *); + extern void perf_event_comm(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); @@ -1001,6 +1015,11 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, static inline void perf_bp_event(struct perf_event *event, void *data) { } +static inline int perf_register_guest_info_callbacks +(struct perf_guest_info_callbacks *) {return 0; } +static inline int perf_unregister_guest_info_callbacks +(struct perf_guest_info_callbacks *) {return 0; } + static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_comm(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } -- cgit v1.2.3 From dcf46b9443ad48a227a61713adea001228925adf Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Tue, 20 Apr 2010 10:13:58 +0800 Subject: perf & kvm: Clean up some of the guest profiling callback API details Fix some build bug and programming style issues: - use valid C - fix up various style details Signed-off-by: Zhang Yanmin Cc: Avi Kivity Cc: Peter Zijlstra Cc: Sheng Yang Cc: Marcelo Tosatti Cc: oerg Roedel Cc: Jes Sorensen Cc: Gleb Natapov Cc: Zachary Amsden Cc: zhiteng.huang@intel.com Cc: tim.c.chen@intel.com Cc: Arnaldo Carvalho de Melo LKML-Reference: <1271729638.2078.624.camel@ymzhang.sh.intel.com> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 24de5f181a41..ace31fbac513 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -941,10 +941,8 @@ static inline void perf_event_mmap(struct vm_area_struct *vma) } extern struct perf_guest_info_callbacks *perf_guest_cbs; -extern int perf_register_guest_info_callbacks( - struct perf_guest_info_callbacks *); -extern int perf_unregister_guest_info_callbacks( - struct perf_guest_info_callbacks *); +extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); +extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern void perf_event_comm(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); @@ -1016,9 +1014,9 @@ static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline int perf_register_guest_info_callbacks -(struct perf_guest_info_callbacks *) {return 0; } +(struct perf_guest_info_callbacks *callbacks) { return 0; } static inline int perf_unregister_guest_info_callbacks -(struct perf_guest_info_callbacks *) {return 0; } +(struct perf_guest_info_callbacks *callbacks) { return 0; } static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_comm(struct task_struct *tsk) { } -- cgit v1.2.3 From cecbca96da387428e220e307a9c945e37e2f4d9e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 18 Apr 2010 19:08:41 +0200 Subject: tracing: Dump either the oops's cpu source or all cpus buffers The ftrace_dump_on_oops kernel parameter, sysctl and sysrq let one dump every cpu buffers when an oops or panic happens. It's nice when you have few cpus but it may take ages if have many, plus you miss the real origin of the problem in all the cpu traces. Sometimes, all you need is to dump the cpu buffer that triggered the opps, most of the time it is our main interest. This patch modifies ftrace_dump_on_oops to handle this choice. The ftrace_dump_on_oops kernel parameter, when it comes alone, has the same behaviour than before. But ftrace_dump_on_oops=orig_cpu will only dump the buffer of the cpu that oops'ed. Similarly, sysctl kernel.ftrace_dump_on_oops=1 and echo 1 > /proc/sys/kernel/ftrace_dump_on_oops keep their previous behaviour. But setting 2 jumps into cpu origin dump mode. v2: Fix double setup v3: Fix spelling issues reported by Randy Dunlap v4: Also update __ftrace_dump in the selftests Signed-off-by: Frederic Weisbecker Acked-by: David S. Miller Acked-by: Steven Rostedt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Li Zefan Cc: Lai Jiangshan --- include/linux/ftrace.h | 4 +++- include/linux/kernel.h | 11 +++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 01e6adea07ec..ea5b1aae0e8b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -492,7 +492,9 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) return tsk->trace & TSK_TRACE_FL_GRAPH; } -extern int ftrace_dump_on_oops; +enum ftrace_dump_mode; + +extern enum ftrace_dump_mode ftrace_dump_on_oops; #ifdef CONFIG_PREEMPT #define INIT_TRACE_RECURSION .trace_recursion = 0, diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 9365227dbaf6..9fb1c1299032 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -490,6 +490,13 @@ static inline void tracing_off(void) { } static inline void tracing_off_permanent(void) { } static inline int tracing_is_on(void) { return 0; } #endif + +enum ftrace_dump_mode { + DUMP_NONE, + DUMP_ALL, + DUMP_ORIG, +}; + #ifdef CONFIG_TRACING extern void tracing_start(void); extern void tracing_stop(void); @@ -571,7 +578,7 @@ __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); -extern void ftrace_dump(void); +extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode); #else static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } @@ -592,7 +599,7 @@ ftrace_vprintk(const char *fmt, va_list ap) { return 0; } -static inline void ftrace_dump(void) { } +static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #endif /* CONFIG_TRACING */ /* -- cgit v1.2.3 From 669c55e9f99b90e46eaa0f98a67ec53d46dc969a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 16 Apr 2010 14:59:29 +0200 Subject: sched: Pre-compute cpumask_weight(sched_domain_span(sd)) Dave reported that his large SPARC machines spend lots of time in hweight64(), try and optimize some of those needless cpumask_weight() invocations (esp. with the large offstack cpumasks these are very expensive indeed). Reported-by: David Miller Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e3e900f318d7..dfea40574b2a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -960,6 +960,7 @@ struct sched_domain { char *name; #endif + unsigned int span_weight; /* * Span of all CPUs in this domain. * -- cgit v1.2.3 From 62b915f1060996a8e1f69be50e3b8e9e43b710cb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 2 Apr 2010 19:01:22 +0200 Subject: tracing: Add graph output support for irqsoff tracer Add function graph output to irqsoff tracer. The graph output is enabled by setting new 'display-graph' trace option. Signed-off-by: Jiri Olsa LKML-Reference: <1270227683-14631-4-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ea5b1aae0e8b..8415a522f430 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -352,6 +352,10 @@ struct ftrace_graph_ret { int depth; }; +/* Type of the callback handlers for tracing function graph*/ +typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ +typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ + #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* for init task */ @@ -400,10 +404,6 @@ extern char __irqentry_text_end[]; #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 -/* Type of the callback handlers for tracing function graph*/ -typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */ -typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */ - extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc); @@ -441,6 +441,13 @@ static inline void unpause_graph_tracing(void) static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } +static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc) +{ + return -1; +} +static inline void unregister_ftrace_graph(void) { } + static inline int task_curr_ret_stack(struct task_struct *tsk) { return -1; -- cgit v1.2.3 From 72c9ddfd4c5bf54ef03cfdf57026416cb678eeba Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 20 Apr 2010 15:47:11 -0700 Subject: ring-buffer: Make non-consuming read less expensive with lots of cpus. When performing a non-consuming read, a synchronize_sched() is performed once for every cpu which is actively tracing. This is very expensive, and can make it take several seconds to open up the 'trace' file with lots of cpus. Only one synchronize_sched() call is actually necessary. What is desired is for all cpus to see the disabling state change. So we transform the existing sequence: for_each_cpu() { ring_buffer_read_start(); } where each ring_buffer_start() call performs a synchronize_sched(), into the following: for_each_cpu() { ring_buffer_read_prepare(); } ring_buffer_read_prepare_sync(); for_each_cpu() { ring_buffer_read_start(); } wherein only the single ring_buffer_read_prepare_sync() call needs to do the synchronize_sched(). The first phase, via ring_buffer_read_prepare(), allocates the 'iter' memory and increments ->record_disabled. In the second phase, ring_buffer_read_prepare_sync() makes sure this ->record_disabled state is visible fully to all cpus. And in the final third phase, the ring_buffer_read_start() calls reset the 'iter' objects allocated in the first phase since we now know that none of the cpus are adding trace entries any more. This makes openning the 'trace' file nearly instantaneous on a sparc64 Niagara2 box with 128 cpus tracing. Signed-off-by: David S. Miller LKML-Reference: <20100420.154711.11246950.davem@davemloft.net> Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index c8297761e414..25b4f686d918 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -127,7 +127,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, unsigned long *lost_events); struct ring_buffer_iter * -ring_buffer_read_start(struct ring_buffer *buffer, int cpu); +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu); +void ring_buffer_read_prepare_sync(void); +void ring_buffer_read_start(struct ring_buffer_iter *iter); void ring_buffer_read_finish(struct ring_buffer_iter *iter); struct ring_buffer_event * -- cgit v1.2.3 From 73266fc1df2f94cf72b3beba3eee3b88ed0b0664 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 22 Apr 2010 05:05:45 +0200 Subject: hw-breakpoints: Tag ptrace breakpoint as exclude_kernel Tag ptrace breakpoints with the exclude_kernel attribute set. This will make it easier to set generic policies on breakpoints, when it comes to ensure nobody unpriviliged try to breakpoint on the kernel. Signed-off-by: Frederic Weisbecker Acked-by: Paul Mundt Cc: Will Deacon Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Ingo Molnar --- include/linux/hw_breakpoint.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index c70d27af03f9..a0aa5a9cfb0e 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -34,6 +34,12 @@ static inline void hw_breakpoint_init(struct perf_event_attr *attr) attr->sample_period = 1; } +static inline void ptrace_breakpoint_init(struct perf_event_attr *attr) +{ + hw_breakpoint_init(attr); + attr->exclude_kernel = 1; +} + static inline unsigned long hw_breakpoint_addr(struct perf_event *bp) { return bp->attr.bp_addr; -- cgit v1.2.3 From 0102752e4c9e0655b39734550d4c35327954f7f9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 11 Apr 2010 18:55:56 +0200 Subject: hw-breakpoints: Separate constraint space for data and instruction breakpoints There are two outstanding fashions for archs to implement hardware breakpoints. The first is to separate breakpoint address pattern definition space between data and instruction breakpoints. We then have typically distinct instruction address breakpoint registers and data address breakpoint registers, delivered with separate control registers for data and instruction breakpoints as well. This is the case of PowerPc and ARM for example. The second consists in having merged breakpoint address space definition between data and instruction breakpoint. Address registers can host either instruction or data address and the access mode for the breakpoint is defined in a control register. This is the case of x86 and Super H. This patch adds a new CONFIG_HAVE_MIXED_BREAKPOINTS_REGS config that archs can select if they belong to the second case. Those will have their slot allocation merged for instructions and data breakpoints. The others will have a separate slot tracking between data and instruction breakpoints. Signed-off-by: Frederic Weisbecker Acked-by: Paul Mundt Cc: Will Deacon Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Ingo Molnar --- include/linux/hw_breakpoint.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index a0aa5a9cfb0e..7e8899093098 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -9,9 +9,12 @@ enum { }; enum { - HW_BREAKPOINT_R = 1, - HW_BREAKPOINT_W = 2, - HW_BREAKPOINT_X = 4, + HW_BREAKPOINT_EMPTY = 0, + HW_BREAKPOINT_R = 1, + HW_BREAKPOINT_W = 2, + HW_BREAKPOINT_RW = HW_BREAKPOINT_R | HW_BREAKPOINT_W, + HW_BREAKPOINT_X = 4, + HW_BREAKPOINT_INVALID = HW_BREAKPOINT_RW | HW_BREAKPOINT_X, }; #ifdef __KERNEL__ -- cgit v1.2.3 From feef47d0cb530e8419dfa0b48141b538b89b1b1a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 23 Apr 2010 05:59:55 +0200 Subject: hw-breakpoints: Get the number of available registers on boot dynamically The breakpoint generic layer assumes that archs always know in advance the static number of address registers available to host breakpoints through the HBP_NUM macro. However this is not true for every archs. For example Arm needs to get this information dynamically to handle the compatiblity between different versions. To solve this, this patch proposes to drop the static HBP_NUM macro and let the arch provide the number of available slots through a new hw_breakpoint_slots() function. For archs that have CONFIG_HAVE_MIXED_BREAKPOINTS_REGS selected, it will be called once as the number of registers fits for instruction and data breakpoints together. For the others it will be called first to get the number of instruction breakpoint registers and another time to get the data breakpoint registers, the targeted type is given as a parameter of hw_breakpoint_slots(). Reported-by: Will Deacon Signed-off-by: Frederic Weisbecker Acked-by: Paul Mundt Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jason Wessel Cc: Ingo Molnar --- include/linux/hw_breakpoint.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 7e8899093098..a2d6ea49ec56 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -17,6 +17,16 @@ enum { HW_BREAKPOINT_INVALID = HW_BREAKPOINT_RW | HW_BREAKPOINT_X, }; +enum bp_type_idx { + TYPE_INST = 0, +#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS + TYPE_DATA = 0, +#else + TYPE_DATA = 1, +#endif + TYPE_MAX +}; + #ifdef __KERNEL__ #include -- cgit v1.2.3 From 4dbf6bc239c169b032777616806ecc648058f6b2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 4 May 2010 11:24:01 -0400 Subject: tracing: Convert nop macros to static inlines The ftrace.h file contains several functions as macros when the functions are disabled due to config options. This patch converts most of them to static inlines. There are two exceptions: register_ftrace_function() and unregister_ftrace_function() This is because their parameter "ops" must not be evaluated since code using the function is allowed to #ifdef out the creation of the parameter. This also fixes an error caused by recent changes: kernel/trace/trace_irqsoff.c: In function 'start_irqsoff_tracer': kernel/trace/trace_irqsoff.c:571: error: expected expression before 'do' Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8415a522f430..e0ae83bbd9cc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -82,9 +82,13 @@ void clear_ftrace_function(void); extern void ftrace_stub(unsigned long a0, unsigned long a1); #else /* !CONFIG_FUNCTION_TRACER */ -# define register_ftrace_function(ops) do { } while (0) -# define unregister_ftrace_function(ops) do { } while (0) -# define clear_ftrace_function(ops) do { } while (0) +/* + * (un)register_ftrace_function must be a macro since the ops parameter + * must not be evaluated. + */ +#define register_ftrace_function(ops) ({ 0; }) +#define unregister_ftrace_function(ops) ({ 0; }) +static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } static inline void ftrace_stop(void) { } static inline void ftrace_start(void) { } @@ -237,11 +241,13 @@ extern int skip_trace(unsigned long ip); extern void ftrace_disable_daemon(void); extern void ftrace_enable_daemon(void); #else -# define skip_trace(ip) ({ 0; }) -# define ftrace_force_update() ({ 0; }) -# define ftrace_set_filter(buf, len, reset) do { } while (0) -# define ftrace_disable_daemon() do { } while (0) -# define ftrace_enable_daemon() do { } while (0) +static inline int skip_trace(unsigned long ip) { return 0; } +static inline int ftrace_force_update(void) { return 0; } +static inline void ftrace_set_filter(unsigned char *buf, int len, int reset) +{ +} +static inline void ftrace_disable_daemon(void) { } +static inline void ftrace_enable_daemon(void) { } static inline void ftrace_release_mod(struct module *mod) {} static inline int register_ftrace_command(struct ftrace_func_command *cmd) { @@ -314,16 +320,16 @@ static inline void __ftrace_enabled_restore(int enabled) extern void time_hardirqs_on(unsigned long a0, unsigned long a1); extern void time_hardirqs_off(unsigned long a0, unsigned long a1); #else -# define time_hardirqs_on(a0, a1) do { } while (0) -# define time_hardirqs_off(a0, a1) do { } while (0) + static inline void time_hardirqs_on(unsigned long a0, unsigned long a1) { } + static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { } #endif #ifdef CONFIG_PREEMPT_TRACER extern void trace_preempt_on(unsigned long a0, unsigned long a1); extern void trace_preempt_off(unsigned long a0, unsigned long a1); #else -# define trace_preempt_on(a0, a1) do { } while (0) -# define trace_preempt_off(a0, a1) do { } while (0) + static inline void trace_preempt_on(unsigned long a0, unsigned long a1) { } + static inline void trace_preempt_off(unsigned long a0, unsigned long a1) { } #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD -- cgit v1.2.3 From 4677d4a53e0d565742277e8913e91c821453e63e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 3 May 2010 14:57:11 +0200 Subject: arch, hweight: Fix compilation errors Fix function prototype visibility issues when compiling for non-x86 architectures. Tested with crosstool (ftp://ftp.kernel.org/pub/tools/crosstool/) with alpha, ia64 and sparc targets. Signed-off-by: Borislav Petkov LKML-Reference: <20100503130736.GD26107@aftab> Signed-off-by: H. Peter Anvin --- include/linux/bitops.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index c55d5bc4ee58..26caa608ccd9 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -10,6 +10,11 @@ #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) #endif +extern unsigned int __sw_hweight8(unsigned int w); +extern unsigned int __sw_hweight16(unsigned int w); +extern unsigned int __sw_hweight32(unsigned int w); +extern unsigned long __sw_hweight64(__u64 w); + /* * Include this here because some architectures need generic_ffs/fls in * scope -- cgit v1.2.3 From 2c2df8418ac7908eec4558407b83f16739006c54 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 30 Mar 2010 01:07:02 -0700 Subject: x86, acpi/irq: Introduce apci_isa_irq_to_gsi There are a number of cases where the current code makes the assumption that isa irqs identity map to the first 16 acpi global system intereupts. In most instances that assumption is correct as that is the required behaviour in dual i8259 mode and the default behavior in ioapic mode. However there are some systems out there that take advantage of acpis interrupt remapping for the isa irqs to have a completely different mapping of isa_irq to gsi. Introduce acpi_isa_irq_to_gsi to perform this mapping explicitly in the code that needs it. Initially this will be just the current assumed identity mapping to ensure it's introduction does not cause regressions. Signed-off-by: Eric W. Biederman LKML-Reference: <1269936436-7039-1-git-send-email-ebiederm@xmission.com> Signed-off-by: H. Peter Anvin --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index b926afe8c03e..7a937dabcc4a 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -116,6 +116,7 @@ extern unsigned long acpi_realmode_flags; int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); +int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi); #ifdef CONFIG_X86_IO_APIC extern int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity); -- cgit v1.2.3 From 9a0a91bb56d2915cdb8585717de38376ad20fef9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 30 Mar 2010 01:07:03 -0700 Subject: x86, acpi/irq: Teach acpi_get_override_irq to take a gsi not an isa_irq In perverse acpi implementations the isa irqs are not identity mapped to the first 16 gsi. Furthermore at least the extended interrupt resource capability may return gsi's and not isa irqs. So since what we get from acpi is a gsi teach acpi_get_overrride_irq to operate on a gsi instead of an isa_irq. Signed-off-by: Eric W. Biederman LKML-Reference: <1269936436-7039-2-git-send-email-ebiederm@xmission.com> Signed-off-by: H. Peter Anvin --- include/linux/acpi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 7a937dabcc4a..3da73f5f0ae9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -119,9 +119,9 @@ int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi); #ifdef CONFIG_X86_IO_APIC -extern int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity); +extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); #else -#define acpi_get_override_irq(bus, trigger, polarity) (-1) +#define acpi_get_override_irq(gsi, trigger, polarity) (-1) #endif /* * This function undoes the effect of one call to acpi_register_gsi(). -- cgit v1.2.3 From 2e26ca7150a4f2ab3e69471dfc65f131e7dd7a05 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 May 2010 10:52:31 -0400 Subject: tracing: Fix tracepoint.h DECLARE_TRACE() to allow more than one header When more than one header is included under CREATE_TRACE_POINTS the DECLARE_TRACE() macro is not defined back to its original meaning and the second include will fail to initialize the TRACE_EVENT() and DECLARE_TRACE() correctly. To fix this the tracepoint.h file moves the define of DECLARE_TRACE() out of the #ifdef _LINUX_TRACEPOINT_H protection (just like the define of the TRACE_EVENT()). This way the define_trace.h will undef the DECLARE_TRACE() at the end and allow new headers to start from scratch. This patch also requires fixing the include/events/napi.h It currently uses DECLARE_TRACE() and should be converted to a TRACE_EVENT() format. But I'll leave that change to the authors of that file. But since the napi.h file depends on using the CREATE_TRACE_POINTS and does not define its own DEFINE_TRACE() it must use the define_trace.h method instead. Cc: Neil Horman Cc: David S. Miller Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 114 +++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 78b4bd3be496..1d85f9a6a199 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -33,6 +33,65 @@ struct tracepoint { * Keep in sync with vmlinux.lds.h. */ +/* + * Connect a probe to a tracepoint. + * Internal API, should not be used directly. + */ +extern int tracepoint_probe_register(const char *name, void *probe); + +/* + * Disconnect a probe from a tracepoint. + * Internal API, should not be used directly. + */ +extern int tracepoint_probe_unregister(const char *name, void *probe); + +extern int tracepoint_probe_register_noupdate(const char *name, void *probe); +extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe); +extern void tracepoint_probe_update_all(void); + +struct tracepoint_iter { + struct module *module; + struct tracepoint *tracepoint; +}; + +extern void tracepoint_iter_start(struct tracepoint_iter *iter); +extern void tracepoint_iter_next(struct tracepoint_iter *iter); +extern void tracepoint_iter_stop(struct tracepoint_iter *iter); +extern void tracepoint_iter_reset(struct tracepoint_iter *iter); +extern int tracepoint_get_iter_range(struct tracepoint **tracepoint, + struct tracepoint *begin, struct tracepoint *end); + +/* + * tracepoint_synchronize_unregister must be called between the last tracepoint + * probe unregistration and the end of module exit to make sure there is no + * caller executing a probe when it is freed. + */ +static inline void tracepoint_synchronize_unregister(void) +{ + synchronize_sched(); +} + +#define PARAMS(args...) args + +#ifdef CONFIG_TRACEPOINTS +extern void tracepoint_update_probe_range(struct tracepoint *begin, + struct tracepoint *end); +#else +static inline void tracepoint_update_probe_range(struct tracepoint *begin, + struct tracepoint *end) +{ } +#endif /* CONFIG_TRACEPOINTS */ + +#endif /* _LINUX_TRACEPOINT_H */ + +/* + * Note: we keep the TRACE_EVENT and DECLARE_TRACE outside the include + * file ifdef protection. + * This is due to the way trace events work. If a file includes two + * trace event headers under one "CREATE_TRACE_POINTS" the first include + * will override the TRACE_EVENT and break the second include. + */ + #ifndef DECLARE_TRACE #define TP_PROTO(args...) args @@ -96,9 +155,6 @@ struct tracepoint { #define EXPORT_TRACEPOINT_SYMBOL(name) \ EXPORT_SYMBOL(__tracepoint_##name) -extern void tracepoint_update_probe_range(struct tracepoint *begin, - struct tracepoint *end); - #else /* !CONFIG_TRACEPOINTS */ #define DECLARE_TRACE(name, proto, args) \ static inline void _do_trace_##name(struct tracepoint *tp, proto) \ @@ -119,61 +175,9 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin, #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) #define EXPORT_TRACEPOINT_SYMBOL(name) -static inline void tracepoint_update_probe_range(struct tracepoint *begin, - struct tracepoint *end) -{ } #endif /* CONFIG_TRACEPOINTS */ #endif /* DECLARE_TRACE */ -/* - * Connect a probe to a tracepoint. - * Internal API, should not be used directly. - */ -extern int tracepoint_probe_register(const char *name, void *probe); - -/* - * Disconnect a probe from a tracepoint. - * Internal API, should not be used directly. - */ -extern int tracepoint_probe_unregister(const char *name, void *probe); - -extern int tracepoint_probe_register_noupdate(const char *name, void *probe); -extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe); -extern void tracepoint_probe_update_all(void); - -struct tracepoint_iter { - struct module *module; - struct tracepoint *tracepoint; -}; - -extern void tracepoint_iter_start(struct tracepoint_iter *iter); -extern void tracepoint_iter_next(struct tracepoint_iter *iter); -extern void tracepoint_iter_stop(struct tracepoint_iter *iter); -extern void tracepoint_iter_reset(struct tracepoint_iter *iter); -extern int tracepoint_get_iter_range(struct tracepoint **tracepoint, - struct tracepoint *begin, struct tracepoint *end); - -/* - * tracepoint_synchronize_unregister must be called between the last tracepoint - * probe unregistration and the end of module exit to make sure there is no - * caller executing a probe when it is freed. - */ -static inline void tracepoint_synchronize_unregister(void) -{ - synchronize_sched(); -} - -#define PARAMS(args...) args - -#endif /* _LINUX_TRACEPOINT_H */ - -/* - * Note: we keep the TRACE_EVENT outside the include file ifdef protection. - * This is due to the way trace events work. If a file includes two - * trace event headers under one "CREATE_TRACE_POINTS" the first include - * will override the TRACE_EVENT and break the second include. - */ - #ifndef TRACE_EVENT /* * For use with the TRACE_EVENT macro: -- cgit v1.2.3 From 1142d810298e694754498dbb4983fcb6cb7fd884 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 May 2010 18:49:20 +0200 Subject: cpu_stop: implement stop_cpu[s]() Implement a simplistic per-cpu maximum priority cpu monopolization mechanism. A non-sleeping callback can be scheduled to run on one or multiple cpus with maximum priority monopolozing those cpus. This is primarily to replace and unify RT workqueue usage in stop_machine and scheduler migration_thread which currently is serving multiple purposes. Four functions are provided - stop_one_cpu(), stop_one_cpu_nowait(), stop_cpus() and try_stop_cpus(). This is to allow clean sharing of resources among stop_cpu and all the migration thread users. One stopper thread per cpu is created which is currently named "stopper/CPU". This will eventually replace the migration thread and take on its name. * This facility was originally named cpuhog and lived in separate files but Peter Zijlstra nacked the name and thus got renamed to cpu_stop and moved into stop_machine.c. * Better reporting of preemption leak as per Peter's suggestion. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Dimitri Sivanich --- include/linux/stop_machine.h | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index baba3a23a814..efcbd6c37947 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -1,15 +1,46 @@ #ifndef _LINUX_STOP_MACHINE #define _LINUX_STOP_MACHINE -/* "Bogolock": stop the entire machine, disable interrupts. This is a - very heavy lock, which is equivalent to grabbing every spinlock - (and more). So the "read" side to such a lock is anything which - disables preeempt. */ + #include #include +#include #include #if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) +/* + * stop_cpu[s]() is simplistic per-cpu maximum priority cpu + * monopolization mechanism. The caller can specify a non-sleeping + * function to be executed on a single or multiple cpus preempting all + * other processes and monopolizing those cpus until it finishes. + * + * Resources for this mechanism are preallocated when a cpu is brought + * up and requests are guaranteed to be served as long as the target + * cpus are online. + */ + +typedef int (*cpu_stop_fn_t)(void *arg); + +struct cpu_stop_work { + struct list_head list; /* cpu_stopper->works */ + cpu_stop_fn_t fn; + void *arg; + struct cpu_stop_done *done; +}; + +int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); +void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, + struct cpu_stop_work *work_buf); +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); +int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); + +/* + * stop_machine "Bogolock": stop the entire machine, disable + * interrupts. This is a very heavy lock, which is equivalent to + * grabbing every spinlock (and more). So the "read" side to such a + * lock is anything which disables preeempt. + */ + /** * stop_machine: freeze the machine on all CPUs and run this function * @fn: the function to run -- cgit v1.2.3 From 3fc1f1e27a5b807791d72e5d992aa33b668a6626 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 May 2010 18:49:20 +0200 Subject: stop_machine: reimplement using cpu_stop Reimplement stop_machine using cpu_stop. As cpu stoppers are guaranteed to be available for all online cpus, stop_machine_create/destroy() are no longer necessary and removed. With resource management and synchronization handled by cpu_stop, the new implementation is much simpler. Asking the cpu_stop to execute the stop_cpu() state machine on all online cpus with cpu hotplug disabled is enough. stop_machine itself doesn't need to manage any global resources anymore, so all per-instance information is rolled into struct stop_machine_data and the mutex and all static data variables are removed. The previous implementation created and destroyed RT workqueues as necessary which made stop_machine() calls highly expensive on very large machines. According to Dimitri Sivanich, preventing the dynamic creation/destruction makes booting faster more than twice on very large machines. cpu_stop resources are preallocated for all online cpus and should have the same effect. Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Dimitri Sivanich --- include/linux/stop_machine.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index efcbd6c37947..0e552e72a4c4 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -67,23 +67,6 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); */ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); -/** - * stop_machine_create: create all stop_machine threads - * - * Description: This causes all stop_machine threads to be created before - * stop_machine actually gets called. This can be used by subsystems that - * need a non failing stop_machine infrastructure. - */ -int stop_machine_create(void); - -/** - * stop_machine_destroy: destroy all stop_machine threads - * - * Description: This causes all stop_machine threads which were created with - * stop_machine_create to be destroyed again. - */ -void stop_machine_destroy(void); - #else static inline int stop_machine(int (*fn)(void *), void *data, @@ -96,8 +79,5 @@ static inline int stop_machine(int (*fn)(void *), void *data, return ret; } -static inline int stop_machine_create(void) { return 0; } -static inline void stop_machine_destroy(void) { } - #endif /* CONFIG_SMP */ #endif /* _LINUX_STOP_MACHINE */ -- cgit v1.2.3 From 969c79215a35b06e5e3efe69b9412f858df7856c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 May 2010 18:49:21 +0200 Subject: sched: replace migration_thread with cpu_stop Currently migration_thread is serving three purposes - migration pusher, context to execute active_load_balance() and forced context switcher for expedited RCU synchronize_sched. All three roles are hardcoded into migration_thread() and determining which job is scheduled is slightly messy. This patch kills migration_thread and replaces all three uses with cpu_stop. The three different roles of migration_thread() are splitted into three separate cpu_stop callbacks - migration_cpu_stop(), active_load_balance_cpu_stop() and synchronize_sched_expedited_cpu_stop() - and each use case now simply asks cpu_stop to execute the callback as necessary. synchronize_sched_expedited() was implemented with private preallocated resources and custom multi-cpu queueing and waiting logic, both of which are provided by cpu_stop. synchronize_sched_expedited_count is made atomic and all other shared resources along with the mutex are dropped. synchronize_sched_expedited() also implemented a check to detect cases where not all the callback got executed on their assigned cpus and fall back to synchronize_sched(). If called with cpu hotplug blocked, cpu_stop already guarantees that and the condition cannot happen; otherwise, stop_machine() would break. However, this patch preserves the paranoid check using a cpumask to record on which cpus the stopper ran so that it can serve as a bisection point if something actually goes wrong theree. Because the internal execution state is no longer visible, rcu_expedited_torture_stats() is removed. This patch also renames cpu_stop threads to from "stopper/%d" to "migration/%d". The names of these threads ultimately don't matter and there's no reason to make unnecessary userland visible changes. With this patch applied, stop_machine() and sched now share the same resources. stop_machine() is faster without wasting any resources and sched migration users are much cleaner. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Dipankar Sarma Cc: Josh Triplett Cc: Paul E. McKenney Cc: Oleg Nesterov Cc: Dimitri Sivanich --- include/linux/rcutiny.h | 2 -- include/linux/rcutree.h | 1 - 2 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index a5195875480a..0006b2df00e1 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -60,8 +60,6 @@ static inline long rcu_batches_completed_bh(void) return 0; } -extern int rcu_expedited_torture_stats(char *page); - static inline void rcu_force_quiescent_state(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 42cc3a04779e..24e467e526b8 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -35,7 +35,6 @@ struct notifier_block; extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern int rcu_needs_cpu(int cpu); -extern int rcu_expedited_torture_stats(char *page); #ifdef CONFIG_TREE_PREEMPT_RCU -- cgit v1.2.3 From 4fd38e4595e2f6c9d27732c042a0e16b2753049c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2010 17:31:38 +0200 Subject: perf: Fix exit() vs PERF_FORMAT_GROUP Both Stephane and Corey reported that PERF_FORMAT_GROUP didn't work as expected if the task the counters were attached to quit before the read() call. The cause is that we unconditionally destroy the grouping when we remove counters from their context. Fix this by only doing this when we free the counter itself. Reported-by: Corey Ashford Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1273160566.5605.404.camel@twins> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c8e375440403..bf8f3c003297 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -522,6 +522,7 @@ struct pmu { * enum perf_event_active_state - the states of a event */ enum perf_event_active_state { + PERF_EVENT_STATE_FREE = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, -- cgit v1.2.3 From ab608344bcbde4f55ec4cd911b686b0ce3eae076 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 8 Apr 2010 23:03:20 +0200 Subject: perf, x86: Improve the PEBS ABI Rename perf_event_attr::precise to perf_event_attr::precise_ip and widen it to 2 bits. This new field describes the required precision of the PERF_SAMPLE_IP field: 0 - SAMPLE_IP can have arbitrary skid 1 - SAMPLE_IP must have constant skid 2 - SAMPLE_IP requested to have 0 skid 3 - SAMPLE_IP must have 0 skid And modify the Intel PEBS code accordingly. The PEBS implementation now supports up to precise_ip == 2, where we perform the IP fixup. Also s/PERF_RECORD_MISC_EXACT/&_IP/ to clarify its meaning, this bit should be set for each PERF_SAMPLE_IP field known to match the actual instruction triggering the event. This new scheme allows for a PEBS mode that uses the buffer for more than a single event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6be4a0f9137c..23cd0057a681 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -203,9 +203,19 @@ struct perf_event_attr { enable_on_exec : 1, /* next exec enables */ task : 1, /* trace fork/exit */ watermark : 1, /* wakeup_watermark */ - precise : 1, /* OoO invariant counter */ - - __reserved_1 : 48; + /* + * precise_ip: + * + * 0 - SAMPLE_IP can have arbitrary skid + * 1 - SAMPLE_IP must have constant skid + * 2 - SAMPLE_IP requested to have 0 skid + * 3 - SAMPLE_IP must have 0 skid + * + * See also PERF_RECORD_MISC_EXACT_IP + */ + precise_ip : 2, /* skid constraint */ + + __reserved_1 : 47; union { __u32 wakeup_events; /* wakeup every n events */ @@ -296,7 +306,12 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) #define PERF_RECORD_MISC_GUEST_USER (5 << 0) -#define PERF_RECORD_MISC_EXACT (1 << 14) +/* + * Indicates that the content of PERF_SAMPLE_IP points to + * the actual instruction that triggered the event. See also + * perf_event_attr::precise_ip. + */ +#define PERF_RECORD_MISC_EXACT_IP (1 << 14) /* * Reserve the last bit to indicate some extended misc field */ -- cgit v1.2.3 From 6bde9b6ce0127e2a56228a2071536d422be31336 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 23 Apr 2010 13:56:00 +0800 Subject: perf: Add group scheduling transactional APIs Add group scheduling transactional APIs to struct pmu. These APIs will be implemented in arch code, based on Peter's idea as below. > the idea behind hw_perf_group_sched_in() is to not perform > schedulability tests on each event in the group, but to add the group > as a whole and then perform one test. > > Of course, when that test fails, you'll have to roll-back the whole > group again. > > So start_txn (or a better name) would simply toggle a flag in the pmu > implementation that will make pmu::enable() not perform the > schedulablilty test. > > Then commit_txn() will perform the schedulability test (so note the > method has to have a !void return value. > > This will allow us to use the regular > kernel/perf_event.c::group_sched_in() and all the rollback code. > Currently each hw_perf_group_sched_in() implementation duplicates all > the rolllback code (with various bugs). ->start_txn: Start group events scheduling transaction, set a flag to make pmu::enable() not perform the schedulability test, it will be performed at commit time. ->commit_txn: Commit group events scheduling transaction, perform the group schedulability as a whole ->cancel_txn: Stop group events scheduling transaction, clear the flag so pmu::enable() will perform the schedulability test. Reviewed-by: Stephane Eranian Reviewed-by: Frederic Weisbecker Signed-off-by: Lin Ming Cc: David Miller Cc: Paul Mackerras Signed-off-by: Peter Zijlstra LKML-Reference: <1272002160.5707.60.camel@minggr.sh.intel.com> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 23cd0057a681..4924c96d7e2d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -547,6 +547,8 @@ struct hw_perf_event { struct perf_event; +#define PERF_EVENT_TXN_STARTED 1 + /** * struct pmu - generic performance monitoring unit */ @@ -557,6 +559,16 @@ struct pmu { void (*stop) (struct perf_event *event); void (*read) (struct perf_event *event); void (*unthrottle) (struct perf_event *event); + + /* + * group events scheduling is treated as a transaction, + * add group events as a whole and perform one schedulability test. + * If test fails, roll back the whole group + */ + + void (*start_txn) (const struct pmu *pmu); + void (*cancel_txn) (const struct pmu *pmu); + int (*commit_txn) (const struct pmu *pmu); }; /** @@ -823,9 +835,6 @@ extern void perf_disable(void); extern void perf_enable(void); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); -extern int hw_perf_group_sched_in(struct perf_event *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * -- cgit v1.2.3 From bbf1bb3eee86f2eef2baa14e600be454d09109ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 May 2010 16:20:53 +0200 Subject: cpu_stop: add dummy implementation for UP When !CONFIG_SMP, cpu_stop functions weren't defined at all which could lead to build failures if UP code uses cpu_stop facility. Add dummy cpu_stop implementation for UP. The waiting variants execute the work function directly with preempt disabled and stop_one_cpu_nowait() schedules a workqueue work. Makefile and ifdefs around stop_machine implementation are updated to accomodate CONFIG_SMP && !CONFIG_STOP_MACHINE case. Signed-off-by: Tejun Heo Reported-by: Ingo Molnar --- include/linux/stop_machine.h | 69 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 0e552e72a4c4..6b524a0d02e4 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -6,8 +6,6 @@ #include #include -#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) - /* * stop_cpu[s]() is simplistic per-cpu maximum priority cpu * monopolization mechanism. The caller can specify a non-sleeping @@ -18,9 +16,10 @@ * up and requests are guaranteed to be served as long as the target * cpus are online. */ - typedef int (*cpu_stop_fn_t)(void *arg); +#ifdef CONFIG_SMP + struct cpu_stop_work { struct list_head list; /* cpu_stopper->works */ cpu_stop_fn_t fn; @@ -34,12 +33,70 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); +#else /* CONFIG_SMP */ + +#include + +struct cpu_stop_work { + struct work_struct work; + cpu_stop_fn_t fn; + void *arg; +}; + +static inline int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) +{ + int ret = -ENOENT; + preempt_disable(); + if (cpu == smp_processor_id()) + ret = fn(arg); + preempt_enable(); + return ret; +} + +static void stop_one_cpu_nowait_workfn(struct work_struct *work) +{ + struct cpu_stop_work *stwork = + container_of(work, struct cpu_stop_work, work); + preempt_disable(); + stwork->fn(stwork->arg); + preempt_enable(); +} + +static inline void stop_one_cpu_nowait(unsigned int cpu, + cpu_stop_fn_t fn, void *arg, + struct cpu_stop_work *work_buf) +{ + if (cpu == smp_processor_id()) { + INIT_WORK(&work_buf->work, stop_one_cpu_nowait_workfn); + work_buf->fn = fn; + work_buf->arg = arg; + schedule_work(&work_buf->work); + } +} + +static inline int stop_cpus(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg) +{ + if (cpumask_test_cpu(raw_smp_processor_id(), cpumask)) + return stop_one_cpu(raw_smp_processor_id(), fn, arg); + return -ENOENT; +} + +static inline int try_stop_cpus(const struct cpumask *cpumask, + cpu_stop_fn_t fn, void *arg) +{ + return stop_cpus(cpumask, fn, arg); +} + +#endif /* CONFIG_SMP */ + /* * stop_machine "Bogolock": stop the entire machine, disable * interrupts. This is a very heavy lock, which is equivalent to * grabbing every spinlock (and more). So the "read" side to such a * lock is anything which disables preeempt. */ +#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) /** * stop_machine: freeze the machine on all CPUs and run this function @@ -67,7 +124,7 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); */ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); -#else +#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */ static inline int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) @@ -79,5 +136,5 @@ static inline int stop_machine(int (*fn)(void *), void *data, return ret; } -#endif /* CONFIG_SMP */ -#endif /* _LINUX_STOP_MACHINE */ +#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#endif /* _LINUX_STOP_MACHINE */ -- cgit v1.2.3 From e0e37c200f1357db0dd986edb359c41c57d24f6e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 9 May 2010 08:24:39 -0700 Subject: sched: Eliminate the ts->idle_lastupdate field Now that the only user of ts->idle_lastupdate is update_ts_time_stats(), the entire field can be eliminated. In update_ts_time_stats(), idle_lastupdate is first set to "now", and a few lines later, the only user is an if() statement that assigns a variable either to "now" or to ts->idle_lastupdate, which has the value of "now" at that point. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: davej@redhat.com LKML-Reference: <20100509082439.2fab0b4f@infradead.org> Signed-off-by: Ingo Molnar --- include/linux/tick.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index d2ae79e21be3..0343eed40619 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -60,7 +60,6 @@ struct tick_sched { ktime_t idle_waketime; ktime_t idle_exittime; ktime_t idle_sleeptime; - ktime_t idle_lastupdate; ktime_t sleep_length; unsigned long last_jiffies; unsigned long next_jiffies; -- cgit v1.2.3 From 0224cf4c5ee0d7faec83956b8e21f7d89e3df3bd Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 9 May 2010 08:25:23 -0700 Subject: sched: Intoduce get_cpu_iowait_time_us() For the ondemand cpufreq governor, it is desired that the iowait time is microaccounted in a similar way as idle time is. This patch introduces the infrastructure to account and expose this information via the get_cpu_iowait_time_us() function. [akpm@linux-foundation.org: fix CONFIG_NO_HZ=n build] Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: davej@redhat.com LKML-Reference: <20100509082523.284feab6@infradead.org> Signed-off-by: Ingo Molnar --- include/linux/tick.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index 0343eed40619..b232ccc0ee29 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -42,6 +42,7 @@ enum tick_nohz_mode { * @idle_waketime: Time when the idle was interrupted * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped + * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding * @sleep_length: Duration of the current idle sleep * @do_timer_lst: CPU was the last one doing do_timer before going idle */ @@ -60,6 +61,7 @@ struct tick_sched { ktime_t idle_waketime; ktime_t idle_exittime; ktime_t idle_sleeptime; + ktime_t iowait_sleeptime; ktime_t sleep_length; unsigned long last_jiffies; unsigned long next_jiffies; @@ -123,6 +125,7 @@ extern void tick_nohz_stop_sched_tick(int inidle); extern void tick_nohz_restart_sched_tick(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); +extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); # else static inline void tick_nohz_stop_sched_tick(int inidle) { } static inline void tick_nohz_restart_sched_tick(void) { } @@ -133,6 +136,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void) return len; } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } +static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ #endif -- cgit v1.2.3 From 2b3fc35f6919344e3cf722dde8308f47235c0b70 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 20 Apr 2010 16:23:07 +0800 Subject: rcu: optionally leave lockdep enabled after RCU lockdep splat There is no need to disable lockdep after an RCU lockdep splat, so remove the debug_lockdeps_off() from lockdep_rcu_dereference(). To avoid repeated lockdep splats, use a static variable in the inlined rcu_dereference_check() and rcu_dereference_protected() macros so that a given instance splats only once, but so that multiple instances can be detected per boot. This is controlled by a new config variable CONFIG_PROVE_RCU_REPEATEDLY, which is disabled by default. This provides the normal lockdep behavior by default, but permits people who want to find multiple RCU-lockdep splats per boot to easily do so. Requested-by: Eric Paris Signed-off-by: Lai Jiangshan Tested-by: Eric Paris Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index db266bbed23f..4dca2752cfde 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -192,6 +192,15 @@ static inline int rcu_read_lock_sched_held(void) extern int rcu_my_thread_group_empty(void); +#define __do_rcu_dereference_check(c) \ + do { \ + static bool __warned; \ + if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \ + __warned = true; \ + lockdep_rcu_dereference(__FILE__, __LINE__); \ + } \ + } while (0) + /** * rcu_dereference_check - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing @@ -221,8 +230,7 @@ extern int rcu_my_thread_group_empty(void); */ #define rcu_dereference_check(p, c) \ ({ \ - if (debug_lockdep_rcu_enabled() && !(c)) \ - lockdep_rcu_dereference(__FILE__, __LINE__); \ + __do_rcu_dereference_check(c); \ rcu_dereference_raw(p); \ }) @@ -239,8 +247,7 @@ extern int rcu_my_thread_group_empty(void); */ #define rcu_dereference_protected(p, c) \ ({ \ - if (debug_lockdep_rcu_enabled() && !(c)) \ - lockdep_rcu_dereference(__FILE__, __LINE__); \ + __do_rcu_dereference_check(c); \ (p); \ }) -- cgit v1.2.3 From d20200b591f59847ab6a5c23507084a7d29e23c5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Mar 2010 10:52:21 -0700 Subject: rcu: Fix bogus CONFIG_PROVE_LOCKING in comments to reflect reality. It is CONFIG_DEBUG_LOCK_ALLOC rather than CONFIG_PROVE_LOCKING, so fix it. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 15 ++++++++------- include/linux/srcu.h | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 4dca2752cfde..a150af0e5cd5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -106,8 +106,8 @@ extern int debug_lockdep_rcu_enabled(void); /** * rcu_read_lock_held - might we be in RCU read-side critical section? * - * If CONFIG_PROVE_LOCKING is selected and enabled, returns nonzero iff in - * an RCU read-side critical section. In absence of CONFIG_PROVE_LOCKING, + * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU + * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an RCU read-side critical section unless it can * prove otherwise. * @@ -129,11 +129,12 @@ extern int rcu_read_lock_bh_held(void); /** * rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section? * - * If CONFIG_PROVE_LOCKING is selected and enabled, returns nonzero iff in an - * RCU-sched read-side critical section. In absence of CONFIG_PROVE_LOCKING, - * this assumes we are in an RCU-sched read-side critical section unless it - * can prove otherwise. Note that disabling of preemption (including - * disabling irqs) counts as an RCU-sched read-side critical section. + * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an + * RCU-sched read-side critical section. In absence of + * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side + * critical section unless it can prove otherwise. Note that disabling + * of preemption (including disabling irqs) counts as an RCU-sched + * read-side critical section. * * Check rcu_scheduler_active to prevent false positives during boot. */ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 4d5ecb222af9..9c01f1022428 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -84,8 +84,8 @@ long srcu_batches_completed(struct srcu_struct *sp); /** * srcu_read_lock_held - might we be in SRCU read-side critical section? * - * If CONFIG_PROVE_LOCKING is selected and enabled, returns nonzero iff in - * an SRCU read-side critical section. In absence of CONFIG_PROVE_LOCKING, + * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU + * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an SRCU read-side critical section unless it can * prove otherwise. */ -- cgit v1.2.3 From 32c141a0a1dfa29e0a07d78bec0c0919fc4b9f88 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Mar 2010 10:59:28 -0700 Subject: rcu: fix now-bogus rcu_scheduler_active comments. The rcu_scheduler_active check has been wrapped into the new debug_lockdep_rcu_enabled() function, so update the comments to reflect this new reality. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a150af0e5cd5..02537a72aaa4 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -111,7 +111,8 @@ extern int debug_lockdep_rcu_enabled(void); * this assumes we are in an RCU read-side critical section unless it can * prove otherwise. * - * Check rcu_scheduler_active to prevent false positives during boot. + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. */ static inline int rcu_read_lock_held(void) { @@ -136,7 +137,8 @@ extern int rcu_read_lock_bh_held(void); * of preemption (including disabling irqs) counts as an RCU-sched * read-side critical section. * - * Check rcu_scheduler_active to prevent false positives during boot. + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. */ #ifdef CONFIG_PREEMPT static inline int rcu_read_lock_sched_held(void) -- cgit v1.2.3 From da848c47bc6e873a54a445ea1960423a495b6b32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Mar 2010 15:46:01 -0700 Subject: rcu: shrink rcutiny by making synchronize_rcu_bh() be inline Because synchronize_rcu_bh() is identical to synchronize_sched(), make the former a static inline invoking the latter, saving the overhead of an EXPORT_SYMBOL_GPL() and the duplicate code. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 -- include/linux/rcutiny.h | 12 +++++++++++- include/linux/rcutree.h | 2 ++ 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 02537a72aaa4..d8fb2abcf303 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -56,8 +56,6 @@ struct rcu_head { }; /* Exported common interfaces */ -extern void synchronize_rcu_bh(void); -extern void synchronize_sched(void); extern void rcu_barrier(void); extern void rcu_barrier_bh(void); extern void rcu_barrier_sched(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index a5195875480a..bbeb55b7709b 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -74,7 +74,17 @@ static inline void rcu_sched_force_quiescent_state(void) { } -#define synchronize_rcu synchronize_sched +extern void synchronize_sched(void); + +static inline void synchronize_rcu(void) +{ + synchronize_sched(); +} + +static inline void synchronize_rcu_bh(void) +{ + synchronize_sched(); +} static inline void synchronize_rcu_expedited(void) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 42cc3a04779e..7484fe66a3aa 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -86,6 +86,8 @@ static inline void __rcu_read_unlock_bh(void) extern void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +extern void synchronize_rcu_bh(void); +extern void synchronize_sched(void); extern void synchronize_rcu_expedited(void); static inline void synchronize_rcu_bh_expedited(void) -- cgit v1.2.3 From 25502a6c13745f4650cc59322bd198194f55e796 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Apr 2010 17:37:01 -0700 Subject: rcu: refactor RCU's context-switch handling The addition of preemptible RCU to treercu resulted in a bit of confusion and inefficiency surrounding the handling of context switches for RCU-sched and for RCU-preempt. For RCU-sched, a context switch is a quiescent state, pure and simple, just like it always has been. For RCU-preempt, a context switch is in no way a quiescent state, but special handling is required when a task blocks in an RCU read-side critical section. However, the callout from the scheduler and the outer loop in ksoftirqd still calls something named rcu_sched_qs(), whose name is no longer accurate. Furthermore, when rcu_check_callbacks() notes an RCU-sched quiescent state, it ends up unnecessarily (though harmlessly, aside from the performance hit) enqueuing the current task if it happens to be running in an RCU-preempt read-side critical section. This not only increases the maximum latency of scheduler_tick(), it also needlessly increases the overhead of the next outermost rcu_read_unlock() invocation. This patch addresses this situation by separating the notion of RCU's context-switch handling from that of RCU-sched's quiescent states. The context-switch handling is covered by rcu_note_context_switch() in general and by rcu_preempt_note_context_switch() for preemptible RCU. This permits rcu_sched_qs() to handle quiescent states and only quiescent states. It also reduces the maximum latency of scheduler_tick(), though probably by much less than a microsecond. Finally, it means that tasks within preemptible-RCU read-side critical sections avoid incurring the overhead of queuing unless there really is a context switch. Suggested-by: Lai Jiangshan Acked-by: Lai Jiangshan Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra --- include/linux/rcutiny.h | 4 ++++ include/linux/rcutree.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index bbeb55b7709b..ff22b97fb979 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -29,6 +29,10 @@ void rcu_sched_qs(int cpu); void rcu_bh_qs(int cpu); +static inline void rcu_note_context_switch(int cpu) +{ + rcu_sched_qs(cpu); +} #define __rcu_read_lock() preempt_disable() #define __rcu_read_unlock() preempt_enable() diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 7484fe66a3aa..b9f74606f320 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -34,6 +34,7 @@ struct notifier_block; extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); +extern void rcu_note_context_switch(int cpu); extern int rcu_needs_cpu(int cpu); extern int rcu_expedited_torture_stats(char *page); -- cgit v1.2.3 From bbad937983147c017c25406860287cb94da9af7c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Apr 2010 16:17:17 -0700 Subject: rcu: slim down rcutiny by removing rcu_scheduler_active and friends TINY_RCU does not need rcu_scheduler_active unless CONFIG_DEBUG_LOCK_ALLOC. So conditionally compile rcu_scheduler_active in order to slim down rcutiny a bit more. Also gets rid of an EXPORT_SYMBOL_GPL, which is responsible for most of the slimming. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 4 +--- include/linux/rcutiny.h | 13 +++++++++++++ include/linux/rcutree.h | 3 +++ 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d8fb2abcf303..23be3a702516 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -64,8 +64,6 @@ extern int sched_expedited_torture_stats(char *page); /* Internal to kernel */ extern void rcu_init(void); -extern int rcu_scheduler_active; -extern void rcu_scheduler_starting(void); #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) #include @@ -178,7 +176,7 @@ static inline int rcu_read_lock_bh_held(void) #ifdef CONFIG_PREEMPT static inline int rcu_read_lock_sched_held(void) { - return !rcu_scheduler_active || preempt_count() != 0 || irqs_disabled(); + return preempt_count() != 0 || irqs_disabled(); } #else /* #ifdef CONFIG_PREEMPT */ static inline int rcu_read_lock_sched_held(void) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index ff22b97fb979..14e5a76b2c06 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -128,4 +128,17 @@ static inline int rcu_preempt_depth(void) return 0; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +extern int rcu_scheduler_active __read_mostly; +extern void rcu_scheduler_starting(void); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +static inline void rcu_scheduler_starting(void) +{ +} + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index b9f74606f320..48282055e83d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -123,4 +123,7 @@ static inline int rcu_blocking_is_gp(void) return num_online_cpus() == 1; } +extern void rcu_scheduler_starting(void); +extern int rcu_scheduler_active __read_mostly; + #endif /* __LINUX_RCUTREE_H */ -- cgit v1.2.3 From d14aada8e20bdf81ffd43f433b123972cf575b32 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Apr 2010 22:24:22 -0700 Subject: rcu: make SRCU usable in modules Add a #include for mutex.h to allow SRCU to be more easily used in kernel modules. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 9c01f1022428..4d5d2f546dbf 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -27,6 +27,8 @@ #ifndef _LINUX_SRCU_H #define _LINUX_SRCU_H +#include + struct srcu_struct_array { int c[2]; }; -- cgit v1.2.3 From a5d8e467f83f6672104f276223a88e3b50cbd375 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 17 Apr 2010 08:48:38 -0400 Subject: Debugobjects transition check Implement a basic state machine checker in the debugobjects. This state machine checker detects races and inconsistencies within the "active" life of a debugobject. The checker only keeps track of the current state; all the state machine logic is kept at the object instance level. The checker works by adding a supplementary "unsigned int astate" field to the debug_obj structure. It keeps track of the current "active state" of the object. The only constraints that are imposed on the states by the debugobjects system is that: - activation of an object sets the current active state to 0, - deactivation of an object expects the current active state to be 0. For the rest of the states, the state mapping is determined by the specific object instance. Therefore, the logic keeping track of the state machine is within the specialized instance, without any need to know about it at the debugobject level. The current object active state is changed by calling: debug_object_active_state(addr, descr, expect, next) where "expect" is the expected state and "next" is the next state to move to if the expected state is found. A warning is generated if the expected is not found. Signed-off-by: Mathieu Desnoyers Reviewed-by: Thomas Gleixner Acked-by: David S. Miller CC: "Paul E. McKenney" CC: akpm@linux-foundation.org CC: mingo@elte.hu CC: laijs@cn.fujitsu.com CC: dipankar@in.ibm.com CC: josh@joshtriplett.org CC: dvhltc@us.ibm.com CC: niv@us.ibm.com CC: peterz@infradead.org CC: rostedt@goodmis.org CC: Valdis.Kletnieks@vt.edu CC: dhowells@redhat.com CC: eric.dumazet@gmail.com CC: Alexey Dobriyan Signed-off-by: Paul E. McKenney --- include/linux/debugobjects.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 8c243aaa86a7..597692f1fc8d 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -20,12 +20,14 @@ struct debug_obj_descr; * struct debug_obj - representaion of an tracked object * @node: hlist node to link the object into the tracker list * @state: tracked object state + * @astate: current active state * @object: pointer to the real object * @descr: pointer to an object type specific debug description structure */ struct debug_obj { struct hlist_node node; enum debug_obj_state state; + unsigned int astate; void *object; struct debug_obj_descr *descr; }; @@ -60,6 +62,15 @@ extern void debug_object_deactivate(void *addr, struct debug_obj_descr *descr); extern void debug_object_destroy (void *addr, struct debug_obj_descr *descr); extern void debug_object_free (void *addr, struct debug_obj_descr *descr); +/* + * Active state: + * - Set at 0 upon initialization. + * - Must return to 0 before deactivation. + */ +extern void +debug_object_active_state(void *addr, struct debug_obj_descr *descr, + unsigned int expect, unsigned int next); + extern void debug_objects_early_init(void); extern void debug_objects_mem_init(void); #else -- cgit v1.2.3 From 4376030a54860dedab9d848dfa7cc700a6025c0b Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 17 Apr 2010 08:48:39 -0400 Subject: rcu head introduce rcu head init on stack PEM: o Would it be possible to make this bisectable as follows? a. Insert a new patch after current patch 4/6 that defines destroy_rcu_head_on_stack(), init_rcu_head_on_stack(), and init_rcu_head() with their !CONFIG_DEBUG_OBJECTS_RCU_HEAD definitions. This patch performs this transition. Signed-off-by: Mathieu Desnoyers CC: "Paul E. McKenney" CC: David S. Miller CC: akpm@linux-foundation.org CC: mingo@elte.hu CC: laijs@cn.fujitsu.com CC: dipankar@in.ibm.com CC: josh@joshtriplett.org CC: dvhltc@us.ibm.com CC: niv@us.ibm.com CC: tglx@linutronix.de CC: peterz@infradead.org CC: rostedt@goodmis.org CC: Valdis.Kletnieks@vt.edu CC: dhowells@redhat.com CC: eric.dumazet@gmail.com CC: Alexey Dobriyan Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 23be3a702516..b653b4aaa8a6 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -79,6 +79,14 @@ extern void rcu_init(void); (ptr)->next = NULL; (ptr)->func = NULL; \ } while (0) +static inline void init_rcu_head_on_stack(struct rcu_head *head) +{ +} + +static inline void destroy_rcu_head_on_stack(struct rcu_head *head) +{ +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC extern struct lockdep_map rcu_lock_map; -- cgit v1.2.3 From e3174cfd2a1e28fff774681f00a0eef3d31da970 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 May 2010 08:31:49 +0200 Subject: Revert "perf: Fix exit() vs PERF_FORMAT_GROUP" This reverts commit 4fd38e4595e2f6c9d27732c042a0e16b2753049c. It causes various crashes and hangs when events are activated. The cause is not fully understood yet but we need to revert it because the effects are severe. Reported-by: Stephane Eranian Reported-by: Lin Ming Cc: Peter Zijlstra Cc: Corey Ashford Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4924c96d7e2d..3fd5c82e0e18 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -575,7 +575,6 @@ struct pmu { * enum perf_event_active_state - the states of a event */ enum perf_event_active_state { - PERF_EVENT_STATE_FREE = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, -- cgit v1.2.3 From a93d2f1744206827ccf416e2cdc5018aa503314e Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Fri, 7 May 2010 14:33:26 +0800 Subject: sched, wait: Use wrapper functions epoll should not touch flags in wait_queue_t. This patch introduces a new function __add_wait_queue_exclusive(), for the users, who use wait queue as a LIFO queue. __add_wait_queue_tail_exclusive() is introduced too instead of add_wait_queue_exclusive_locked(). remove_wait_queue_locked() is removed, as it is a duplicate of __remove_wait_queue(), disliked by users, and with less users. Signed-off-by: Changli Gao Signed-off-by: Peter Zijlstra Cc: Alexander Viro Cc: Paul Menage Cc: Li Zefan Cc: Davide Libenzi Cc: LKML-Reference: <1273214006-2979-1-git-send-email-xiaosuo@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/wait.h | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index a48e16b77d5e..76d96d035ea0 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -127,12 +127,26 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) /* * Used for wake-one threads: */ +static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, + wait_queue_t *wait) +{ + wait->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wait); +} + static inline void __add_wait_queue_tail(wait_queue_head_t *head, - wait_queue_t *new) + wait_queue_t *new) { list_add_tail(&new->task_list, &head->task_list); } +static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q, + wait_queue_t *wait) +{ + wait->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(q, wait); +} + static inline void __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) { @@ -403,25 +417,6 @@ do { \ __ret; \ }) -/* - * Must be called with the spinlock in the wait_queue_head_t held. - */ -static inline void add_wait_queue_exclusive_locked(wait_queue_head_t *q, - wait_queue_t * wait) -{ - wait->flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(q, wait); -} - -/* - * Must be called with the spinlock in the wait_queue_head_t held. - */ -static inline void remove_wait_queue_locked(wait_queue_head_t *q, - wait_queue_t * wait) -{ - __remove_wait_queue(q, wait); -} - /* * These are the old interfaces to sleep waiting for an event. * They are racy. DO NOT use them, use the wait_event* interfaces above. -- cgit v1.2.3 From 72d5a9f7a9542f88397558c65bcfc3b115a65e34 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 May 2010 17:12:17 -0700 Subject: rcu: remove all rcu head initializations, except on_stack initializations Remove all rcu head inits. We don't care about the RCU head state before passing it to call_rcu() anyway. Only leave the "on_stack" variants so debugobjects can keep track of objects on stack. Signed-off-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney --- include/linux/init_task.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index b1ed1cd8e2a8..7996fc2c9ba9 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -49,7 +49,6 @@ extern struct group_info init_groups; { .first = &init_task.pids[PIDTYPE_PGID].node }, \ { .first = &init_task.pids[PIDTYPE_SID].node }, \ }, \ - .rcu = RCU_HEAD_INIT, \ .level = 0, \ .numbers = { { \ .nr = 0, \ -- cgit v1.2.3 From d83c49f3e36cecd2e8823b6c48ffba083b8a5704 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Apr 2010 17:17:09 -0400 Subject: Fix the regression created by "set S_DEAD on unlink()..." commit 1) i_flags simply doesn't work for mount/unlink race prevention; we may have many links to file and rm on one of those obviously shouldn't prevent bind on top of another later on. To fix it right way we need to mark _dentry_ as unsuitable for mounting upon; new flag (DCACHE_CANT_MOUNT) is protected by d_flags and i_mutex on the inode in question. Set it (with dont_mount(dentry)) in unlink/rmdir/etc., check (with cant_mount(dentry)) in places in namespace.c that used to check for S_DEAD. Setting S_DEAD is still needed in places where we used to set it (for directories getting killed), since we rely on it for readdir/rmdir race prevention. 2) rename()/mount() protection has another bogosity - we unhash the target before we'd checked that it's not a mountpoint. Fixed. 3) ancient bogosity in pivot_root() - we locked i_mutex on the right directory, but checked S_DEAD on the different (and wrong) one. Noticed and fixed. Signed-off-by: Al Viro --- include/linux/dcache.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 30b93b2a01a4..eebb617c17d8 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -186,6 +186,8 @@ d_iput: no no no yes #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */ +#define DCACHE_CANT_MOUNT 0x0100 + extern spinlock_t dcache_lock; extern seqlock_t rename_lock; @@ -358,6 +360,18 @@ static inline int d_unlinked(struct dentry *dentry) return d_unhashed(dentry) && !IS_ROOT(dentry); } +static inline int cant_mount(struct dentry *dentry) +{ + return (dentry->d_flags & DCACHE_CANT_MOUNT); +} + +static inline void dont_mount(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_CANT_MOUNT; + spin_unlock(&dentry->d_lock); +} + static inline struct dentry *dget_parent(struct dentry *dentry) { struct dentry *ret; -- cgit v1.2.3 From 81880d603d00c645e0890d0a44d50711c503b72b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 17 May 2010 14:34:57 +1000 Subject: atomic_t: Remove volatile from atomic_t definition When looking at a performance problem on PowerPC, I noticed some awful code generation: c00000000051fc98: 3b 60 00 01 li r27,1 ... c00000000051fca0: 3b 80 00 00 li r28,0 ... c00000000051fcdc: 93 61 00 70 stw r27,112(r1) c00000000051fce0: 93 81 00 74 stw r28,116(r1) c00000000051fce4: 81 21 00 70 lwz r9,112(r1) c00000000051fce8: 80 01 00 74 lwz r0,116(r1) c00000000051fcec: 7d 29 07 b4 extsw r9,r9 c00000000051fcf0: 7c 00 07 b4 extsw r0,r0 c00000000051fcf4: 7c 20 04 ac lwsync c00000000051fcf8: 7d 60 f8 28 lwarx r11,0,r31 c00000000051fcfc: 7c 0b 48 00 cmpw r11,r9 c00000000051fd00: 40 c2 00 10 bne- c00000000051fd10 c00000000051fd04: 7c 00 f9 2d stwcx. r0,0,r31 c00000000051fd08: 40 c2 ff f0 bne+ c00000000051fcf8 c00000000051fd0c: 4c 00 01 2c isync We create two constants, write them out to the stack, read them straight back in and sign extend them. What a mess. It turns out this bad code is a result of us defining atomic_t as a volatile int. We removed the volatile attribute from the powerpc atomic_t definition years ago, but commit ea435467500612636f8f4fb639ff6e76b2496e4b (atomic_t: unify all arch definitions) added it back in. To dig up an old quote from Linus: > The fact is, volatile on data structures is a bug. It's a wart in the C > language. It shouldn't be used. > > Volatile accesses in *code* can be ok, and if we have "atomic_read()" > expand to a "*(volatile int *)&(x)->value", then I'd be ok with that. > > But marking data structures volatile just makes the compiler screw up > totally, and makes code for initialization sequences etc much worse. And screw up it does :) With the volatile removed, we see much more reasonable code generation: c00000000051f5b8: 3b 60 00 01 li r27,1 ... c00000000051f5c0: 3b 80 00 00 li r28,0 ... c00000000051fc7c: 7c 20 04 ac lwsync c00000000051fc80: 7c 00 f8 28 lwarx r0,0,r31 c00000000051fc84: 7c 00 d8 00 cmpw r0,r27 c00000000051fc88: 40 c2 00 10 bne- c00000000051fc98 c00000000051fc8c: 7f 80 f9 2d stwcx. r28,0,r31 c00000000051fc90: 40 c2 ff f0 bne+ c00000000051fc80 c00000000051fc94: 4c 00 01 2c isync Six instructions less. Signed-off-by: Anton Blanchard Signed-off-by: Linus Torvalds --- include/linux/types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index c42724f8c802..23d237a075e2 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -188,12 +188,12 @@ typedef u32 phys_addr_t; typedef phys_addr_t resource_size_t; typedef struct { - volatile int counter; + int counter; } atomic_t; #ifdef CONFIG_64BIT typedef struct { - volatile long counter; + long counter; } atomic64_t; #endif -- cgit v1.2.3 From 0b7f1a7efb38b551f5948a13d0b36e876ba536db Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 28 Jan 2009 21:01:02 +0100 Subject: platform: Make platform resource input parameters const Make the platform resource input parameters of platform_device_add_resources() and platform_device_register_simple() const, as the resources are copied and never modified. Signed-off-by: Geert Uytterhoeven Acked-by: Greg Kroah-Hartman --- include/linux/platform_device.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 212da17d06af..5417944d3687 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -44,12 +44,14 @@ extern int platform_get_irq_byname(struct platform_device *, const char *); extern int platform_add_devices(struct platform_device **, int); extern struct platform_device *platform_device_register_simple(const char *, int id, - struct resource *, unsigned int); + const struct resource *, unsigned int); extern struct platform_device *platform_device_register_data(struct device *, const char *, int, const void *, size_t); extern struct platform_device *platform_device_alloc(const char *name, int id); -extern int platform_device_add_resources(struct platform_device *pdev, struct resource *res, unsigned int num); +extern int platform_device_add_resources(struct platform_device *pdev, + const struct resource *res, + unsigned int num); extern int platform_device_add_data(struct platform_device *pdev, const void *data, size_t size); extern int platform_device_add(struct platform_device *pdev); extern void platform_device_del(struct platform_device *pdev); -- cgit v1.2.3 From bf54a2b3c0dbf76136f00ff785bf6d8f6291311d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 18 Nov 2008 21:13:53 +0100 Subject: m68k: amiga - Zorro bus modalias support Add Amiga Zorro bus modalias and uevent support Signed-off-by: Geert Uytterhoeven --- include/linux/mod_devicetable.h | 9 +++++++++ include/linux/zorro.h | 13 +------------ 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index f58e9d836f32..56fde4364e4c 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -474,4 +474,13 @@ struct platform_device_id { __attribute__((aligned(sizeof(kernel_ulong_t)))); }; +struct zorro_device_id { + __u32 id; /* Device ID or ZORRO_WILDCARD */ + kernel_ulong_t driver_data; /* Data private to the driver */ +}; + +#define ZORRO_WILDCARD (0xffffffff) /* not official */ + +#define ZORRO_DEVICE_MODALIAS_FMT "zorro:i%08X" + #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/include/linux/zorro.h b/include/linux/zorro.h index 913bfc226dda..908db1b36d6c 100644 --- a/include/linux/zorro.h +++ b/include/linux/zorro.h @@ -38,8 +38,6 @@ typedef __u32 zorro_id; -#define ZORRO_WILDCARD (0xffffffff) /* not official */ - /* Include the ID list */ #include @@ -116,6 +114,7 @@ struct ConfigDev { #include #include +#include #include @@ -154,16 +153,6 @@ extern struct zorro_bus zorro_bus; /* single Zorro bus */ extern struct bus_type zorro_bus_type; - /* - * Zorro device IDs - */ - -struct zorro_device_id { - zorro_id id; /* Device ID or ZORRO_WILDCARD */ - unsigned long driver_data; /* Data private to the driver */ -}; - - /* * Zorro device drivers */ -- cgit v1.2.3 From 0d305464aefff342c85b4db8b3d7a4345246e5a1 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 5 Apr 2009 12:40:41 +0200 Subject: m68k: amiga - Zorro host bridge platform device conversion Signed-off-by: Geert Uytterhoeven --- include/linux/zorro.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zorro.h b/include/linux/zorro.h index 908db1b36d6c..7bf9db525e9e 100644 --- a/include/linux/zorro.h +++ b/include/linux/zorro.h @@ -141,15 +141,6 @@ struct zorro_dev { * Zorro bus */ -struct zorro_bus { - struct list_head devices; /* list of devices on this bus */ - unsigned int num_resources; /* number of resources */ - struct resource resources[4]; /* address space routed to this bus */ - struct device dev; - char name[10]; -}; - -extern struct zorro_bus zorro_bus; /* single Zorro bus */ extern struct bus_type zorro_bus_type; -- cgit v1.2.3