From aff7385b5a16bca6b8d9243f01a9ea5a5b411e1d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 21 Jan 2014 15:35:53 -0800 Subject: locking/mutexes/mcs: Correct barrier usage This patch corrects the way memory barriers are used in the MCS lock with smp_load_acquire and smp_store_release functions. The previous barriers could leak critical sections if mcs lock is used by itself. It is not a problem when mcs lock is embedded in mutex but will be an issue when the mcs_lock is used elsewhere. The patch removes the incorrect barriers and puts in correct barriers with the pair of functions smp_load_acquire and smp_store_release. Suggested-by: Michel Lespinasse Reviewed-by: Paul E. McKenney Signed-off-by: Waiman Long Signed-off-by: Jason Low Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1390347353.3138.62.camel@schen9-DESK Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4dd6e4c219de..fbbd2eda867e 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -136,9 +136,12 @@ void mspin_lock(struct mspin_node **lock, struct mspin_node *node) return; } ACCESS_ONCE(prev->next) = node; - smp_wmb(); - /* Wait until the lock holder passes the lock down */ - while (!ACCESS_ONCE(node->locked)) + /* + * Wait until the lock holder passes the lock down. + * Using smp_load_acquire() provides a memory barrier that + * ensures subsequent operations happen after the lock is acquired. + */ + while (!(smp_load_acquire(&node->locked))) arch_mutex_cpu_relax(); } @@ -156,8 +159,13 @@ static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) while (!(next = ACCESS_ONCE(node->next))) arch_mutex_cpu_relax(); } - ACCESS_ONCE(next->locked) = 1; - smp_wmb(); + /* + * Pass lock to next waiter. 
+ * smp_store_release() provides a memory barrier to ensure + * all operations in the critical section has been completed + * before unlocking. + */ + smp_store_release(&next->locked, 1); } /* -- cgit v1.2.3 From e72246748ff006ab928bc774e276e6ef5542f9c5 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Tue, 21 Jan 2014 15:36:00 -0800 Subject: locking/mutexes/mcs: Restructure the MCS lock defines and locking code into its own file We will need the MCS lock code for doing optimistic spinning for rwsem and queued rwlock. Extracting the MCS code from mutex.c and putting it into its own file allows us to reuse this code easily. We also inline mcs_spin_lock and mcs_spin_unlock functions for better efficiency. Note that the smp_load_acquire/smp_store_release pair used in mcs_lock and mcs_unlock is not sufficient to form a full memory barrier across cpus for many architectures (except x86). For applications that absolutely need a full barrier across multiple cpus with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be used after mcs_lock. Reviewed-by: Paul E. 
McKenney Signed-off-by: Tim Chen Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1390347360.3138.63.camel@schen9-DESK Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 68 ++++++-------------------------------------------- 1 file changed, 7 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index fbbd2eda867e..45fe1b5293d6 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, @@ -52,7 +53,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->spin_mlock = NULL; + lock->mcs_lock = NULL; #endif debug_mutex_init(lock, name, key); @@ -111,62 +112,7 @@ EXPORT_SYMBOL(mutex_lock); * more or less simultaneously, the spinners need to acquire a MCS lock * first before spinning on the owner field. * - * We don't inline mspin_lock() so that perf can correctly account for the - * time spent in this lock function. */ -struct mspin_node { - struct mspin_node *next ; - int locked; /* 1 if lock acquired */ -}; -#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) - -static noinline -void mspin_lock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *prev; - - /* Init node */ - node->locked = 0; - node->next = NULL; - - prev = xchg(lock, node); - if (likely(prev == NULL)) { - /* Lock acquired */ - node->locked = 1; - return; - } - ACCESS_ONCE(prev->next) = node; - /* - * Wait until the lock holder passes the lock down. - * Using smp_load_acquire() provides a memory barrier that - * ensures subsequent operations happen after the lock is acquired. 
- */ - while (!(smp_load_acquire(&node->locked))) - arch_mutex_cpu_relax(); -} - -static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *next = ACCESS_ONCE(node->next); - - if (likely(!next)) { - /* - * Release the lock by setting it to NULL - */ - if (cmpxchg(lock, node, NULL) == node) - return; - /* Wait until the next pointer is set */ - while (!(next = ACCESS_ONCE(node->next))) - arch_mutex_cpu_relax(); - } - /* - * Pass lock to next waiter. - * smp_store_release() provides a memory barrier to ensure - * all operations in the critical section has been completed - * before unlocking. - */ - smp_store_release(&next->locked, 1); -} /* * Mutex spinning code migrated from kernel/sched/core.c @@ -456,7 +402,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, for (;;) { struct task_struct *owner; - struct mspin_node node; + struct mcs_spinlock node; if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; @@ -478,10 +424,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * If there's an owner, wait for it to either * release the lock or go to sleep. 
*/ - mspin_lock(MLOCK(lock), &node); + mcs_spin_lock(&lock->mcs_lock, &node); owner = ACCESS_ONCE(lock->owner); if (owner && !mutex_spin_on_owner(lock, owner)) { - mspin_unlock(MLOCK(lock), &node); + mcs_spin_unlock(&lock->mcs_lock, &node); goto slowpath; } @@ -496,11 +442,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } mutex_set_owner(lock); - mspin_unlock(MLOCK(lock), &node); + mcs_spin_unlock(&lock->mcs_lock, &node); preempt_enable(); return 0; } - mspin_unlock(MLOCK(lock), &node); + mcs_spin_unlock(&lock->mcs_lock, &node); /* * When there's no owner, we might have preempted between the -- cgit v1.2.3 From 52bf84aa206cd2c2516dfa3e03b578edf8a3242f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:40 -0500 Subject: sched/numa, mm: Remove p->numa_migrate_deferred Excessive migration of pages can hurt the performance of workloads that span multiple NUMA nodes. However, it turns out that the p->numa_migrate_deferred knob is a really big hammer, which does reduce migration rates, but does not actually help performance. Now that the second stage of the automatic numa balancing code has stabilized, it is time to replace the simplistic migration deferral code with something smarter. 
Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 -------- kernel/sysctl.c | 7 ------- 2 files changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index efe6457ac5c8..7cdde913b4dc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -819,14 +819,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; -/* - * After skipping a page migration on a shared page, skip N more numa page - * migrations unconditionally. This reduces the number of NUMA migrations - * in shared memory workloads, and has the effect of pulling tasks towards - * where their memory lives, over pulling the memory towards the task. - */ -unsigned int sysctl_numa_balancing_migrate_deferred = 16; - static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8da99f905cf..b41d61d95c14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -384,13 +384,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "numa_balancing_migrate_deferred", - .data = &sysctl_numa_balancing_migrate_deferred, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { -- cgit v1.2.3 From ff1df896aef8e0ec1556a5c44f424bd45bfa2cbe Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:41 -0500 Subject: sched/numa: Rename p->numa_faults to numa_faults_memory In order to get a more consistent naming scheme, making it clear which fault statistics track memory locality, and which track CPU locality, rename the 
memory fault statistics. Suggested-by: Mel Gorman Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- kernel/sched/debug.c | 6 +++--- kernel/sched/fair.c | 56 ++++++++++++++++++++++++++-------------------------- 3 files changed, 33 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 81343d6bd9cb..bc708c53bf03 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1744,8 +1744,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index dd52e7ffb10e..31b908daaa1b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -533,15 +533,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults) - nr_faults = p->numa_faults[2*node + i]; + if (p->numa_faults_memory) + nr_faults = p->numa_faults_memory[2*node + i]; cpu_current = !i ? 
(task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); home_node = (p->numa_preferred_nid == node); - SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", + SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", i, node, cpu_current, home_node, nr_faults); } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7cdde913b4dc..3e616d704f67 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -901,11 +901,11 @@ static inline int task_faults_idx(int nid, int priv) static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; - return p->numa_faults[task_faults_idx(nid, 0)] + - p->numa_faults[task_faults_idx(nid, 1)]; + return p->numa_faults_memory[task_faults_idx(nid, 0)] + + p->numa_faults_memory[task_faults_idx(nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -927,7 +927,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) { unsigned long total_faults; - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; total_faults = p->total_numa_faults; @@ -1255,7 +1255,7 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; /* Periodically retry migrating the task to the preferred node */ @@ -1371,16 +1371,16 @@ static void task_numa_placement(struct task_struct *p) long diff; i = task_faults_idx(nid, priv); - diff = -p->numa_faults[i]; + diff = -p->numa_faults_memory[i]; /* Decay existing window, copy faults since last scan */ - p->numa_faults[i] >>= 1; - p->numa_faults[i] += p->numa_faults_buffer[i]; - fault_types[priv] += p->numa_faults_buffer[i]; - p->numa_faults_buffer[i] = 0; + p->numa_faults_memory[i] >>= 1; + p->numa_faults_memory[i] += 
p->numa_faults_buffer_memory[i]; + fault_types[priv] += p->numa_faults_buffer_memory[i]; + p->numa_faults_buffer_memory[i] = 0; - faults += p->numa_faults[i]; - diff += p->numa_faults[i]; + faults += p->numa_faults_memory[i]; + diff += p->numa_faults_memory[i]; p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ @@ -1465,7 +1465,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->gid = p->pid; for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] = p->numa_faults[i]; + grp->faults[i] = p->numa_faults_memory[i]; grp->total_faults = p->total_numa_faults; @@ -1523,8 +1523,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock(&my_grp->lock, &grp->lock); for (i = 0; i < 2*nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults[i]; - grp->faults[i] += p->numa_faults[i]; + my_grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] += p->numa_faults_memory[i]; } my_grp->total_faults -= p->total_numa_faults; grp->total_faults += p->total_numa_faults; @@ -1550,12 +1550,12 @@ void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; int i; - void *numa_faults = p->numa_faults; + void *numa_faults = p->numa_faults_memory; if (grp) { spin_lock(&grp->lock); for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] -= p->numa_faults[i]; + grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; list_del(&p->numa_entry); @@ -1565,8 +1565,8 @@ void task_numa_free(struct task_struct *p) put_numa_group(grp); } - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; kfree(numa_faults); } @@ -1591,16 +1591,16 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) return; /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults)) { - int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; + if 
(unlikely(!p->numa_faults_memory)) { + int size = sizeof(*p->numa_faults_memory) * 2 * nr_node_ids; /* numa_faults and numa_faults_buffer share the allocation */ - p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults) + p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults_memory) return; - BUG_ON(p->numa_faults_buffer); - p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); + BUG_ON(p->numa_faults_buffer_memory); + p->numa_faults_buffer_memory = p->numa_faults_memory + (2 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1629,7 +1629,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; + p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages; p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } @@ -4771,7 +4771,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) { return false; } @@ -4802,7 +4802,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) return false; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) return false; src_nid = cpu_to_node(env->src_cpu); -- cgit v1.2.3 From 50ec8a401fed6d246ab65e6011d61ac91c34af70 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:42 -0500 Subject: sched/numa: Track from which nodes NUMA faults are triggered Track which nodes NUMA faults are triggered from, in other words the CPUs on which the NUMA faults happened. 
This uses a similar mechanism to what is used to track the memory involved in numa faults. The next patches use this to build up a bitmap of which nodes a workload is actively running on. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e616d704f67..4841aaff7394 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -886,6 +886,7 @@ struct numa_group { struct rcu_head rcu; unsigned long total_faults; + unsigned long *faults_cpu; unsigned long faults[0]; }; @@ -1368,10 +1369,11 @@ static void task_numa_placement(struct task_struct *p) int priv, i; for (priv = 0; priv < 2; priv++) { - long diff; + long diff, f_diff; i = task_faults_idx(nid, priv); diff = -p->numa_faults_memory[i]; + f_diff = -p->numa_faults_cpu[i]; /* Decay existing window, copy faults since last scan */ p->numa_faults_memory[i] >>= 1; @@ -1379,12 +1381,18 @@ static void task_numa_placement(struct task_struct *p) fault_types[priv] += p->numa_faults_buffer_memory[i]; p->numa_faults_buffer_memory[i] = 0; + p->numa_faults_cpu[i] >>= 1; + p->numa_faults_cpu[i] += p->numa_faults_buffer_cpu[i]; + p->numa_faults_buffer_cpu[i] = 0; + faults += p->numa_faults_memory[i]; diff += p->numa_faults_memory[i]; + f_diff += p->numa_faults_cpu[i]; p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ p->numa_group->faults[i] += diff; + p->numa_group->faults_cpu[i] += f_diff; p->numa_group->total_faults += diff; group_faults += p->numa_group->faults[i]; } @@ -1453,7 +1461,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (unlikely(!p->numa_group)) { unsigned int size = sizeof(struct 
numa_group) + - 2*nr_node_ids*sizeof(unsigned long); + 4*nr_node_ids*sizeof(unsigned long); grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!grp) @@ -1463,8 +1471,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, spin_lock_init(&grp->lock); INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; + /* Second half of the array tracks nids where faults happen */ + grp->faults_cpu = grp->faults + 2 * nr_node_ids; - for (i = 0; i < 2*nr_node_ids; i++) + for (i = 0; i < 4*nr_node_ids; i++) grp->faults[i] = p->numa_faults_memory[i]; grp->total_faults = p->total_numa_faults; @@ -1522,7 +1532,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock(&my_grp->lock, &grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) { + for (i = 0; i < 4*nr_node_ids; i++) { my_grp->faults[i] -= p->numa_faults_memory[i]; grp->faults[i] += p->numa_faults_memory[i]; } @@ -1554,7 +1564,7 @@ void task_numa_free(struct task_struct *p) if (grp) { spin_lock(&grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) + for (i = 0; i < 4*nr_node_ids; i++) grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; @@ -1567,6 +1577,8 @@ void task_numa_free(struct task_struct *p) p->numa_faults_memory = NULL; p->numa_faults_buffer_memory = NULL; + p->numa_faults_cpu= NULL; + p->numa_faults_buffer_cpu = NULL; kfree(numa_faults); } @@ -1577,6 +1589,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) { struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; + int this_node = task_node(current); int priv; if (!numabalancing_enabled) @@ -1592,7 +1605,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults_memory)) { - int size = sizeof(*p->numa_faults_memory) * 2 * nr_node_ids; + int size = sizeof(*p->numa_faults_memory) * 4 * nr_node_ids; /* numa_faults and numa_faults_buffer share the 
allocation */ p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); @@ -1600,7 +1613,9 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) return; BUG_ON(p->numa_faults_buffer_memory); - p->numa_faults_buffer_memory = p->numa_faults_memory + (2 * nr_node_ids); + p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); + p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); + p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1630,6 +1645,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) p->numa_pages_migrated += pages; p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages; + p->numa_faults_buffer_cpu[task_faults_idx(this_node, priv)] += pages; p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } -- cgit v1.2.3 From 20e07dea286a90f096a779706861472d296397c6 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:43 -0500 Subject: sched/numa: Build per numa_group active node mask from numa_faults_cpu statistics The numa_faults_cpu statistics are used to maintain an active_nodes nodemask per numa_group. This allows us to be smarter about when to do numa migrations. 
Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-5-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4841aaff7394..1ee921f1ec35 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -885,6 +885,7 @@ struct numa_group { struct list_head task_list; struct rcu_head rcu; + nodemask_t active_nodes; unsigned long total_faults; unsigned long *faults_cpu; unsigned long faults[0]; @@ -918,6 +919,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) p->numa_group->faults[task_faults_idx(nid, 1)]; } +static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) +{ + return group->faults_cpu[task_faults_idx(nid, 0)] + + group->faults_cpu[task_faults_idx(nid, 1)]; +} + /* * These return the fraction of accesses done by a particular task, or * task group, on a particular numa node. The group weight is given a @@ -1270,6 +1277,38 @@ static void numa_migrate_preferred(struct task_struct *p) task_numa_migrate(p); } +/* + * Find the nodes on which the workload is actively running. We do this by + * tracking the nodes from which NUMA hinting faults are triggered. This can + * be different from the set of nodes where the workload's memory is currently + * located. + * + * The bitmask is used to make smarter decisions on when to do NUMA page + * migrations, To prevent flip-flopping, and excessive page migrations, nodes + * are added when they cause over 6/16 of the maximum number of faults, but + * only removed when they drop below 3/16. 
+ */ +static void update_numa_active_node_mask(struct numa_group *numa_group) +{ + unsigned long faults, max_faults = 0; + int nid; + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (faults > max_faults) + max_faults = faults; + } + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (!node_isset(nid, numa_group->active_nodes)) { + if (faults > max_faults * 6 / 16) + node_set(nid, numa_group->active_nodes); + } else if (faults < max_faults * 3 / 16) + node_clear(nid, numa_group->active_nodes); + } +} + /* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan @@ -1412,6 +1451,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { + update_numa_active_node_mask(p->numa_group); /* * If the preferred task and group nids are different, * iterate over the nodes again to find the best place. @@ -1474,6 +1514,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + 2 * nr_node_ids; + node_set(task_node(current), grp->active_nodes); + for (i = 0; i < 4*nr_node_ids; i++) grp->faults[i] = p->numa_faults_memory[i]; -- cgit v1.2.3 From 10f39042711ba21773763f267b4943a2c66c8bef Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:44 -0500 Subject: sched/numa, mm: Use active_nodes nodemask to limit numa migrations Use the active_nodes nodemask to make smarter decisions on NUMA migrations. 
In order to maximize performance of workloads that do not fit in one NUMA node, we want to satisfy the following criteria: 1) keep private memory local to each thread 2) avoid excessive NUMA migration of pages 3) distribute shared memory across the active nodes, to maximize memory bandwidth available to the workload This patch accomplishes that by implementing the following policy for NUMA migrations: 1) always migrate on a private fault 2) never migrate to a node that is not in the set of active nodes for the numa_group 3) always migrate from a node outside of the set of active nodes, to a node that is in that set 4) within the set of active nodes in the numa_group, only migrate from a node with more NUMA page faults, to a node with fewer NUMA page faults, with a 25% margin to avoid ping-ponging This results in most pages of a workload ending up on the actively used nodes, with reduced ping-ponging of pages between those nodes. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-6-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1ee921f1ec35..eeabb33f349e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -954,6 +954,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid) return 1000 * group_faults(p, nid) / p->numa_group->total_faults; } +bool should_numa_migrate_memory(struct task_struct *p, struct page * page, + int src_nid, int dst_cpu) +{ + struct numa_group *ng = p->numa_group; + int dst_nid = cpu_to_node(dst_cpu); + int last_cpupid, this_cpupid; + + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + + /* + * Multi-stage node selection is used in conjunction with a periodic + * migration fault to build a temporal task<->page 
relation. By using + * a two-stage filter we remove short/unlikely relations. + * + * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate + * a task's usage of a particular page (n_p) per total usage of this + * page (n_t) (in a given time-span) to a probability. + * + * Our periodic faults will sample this probability and getting the + * same result twice in a row, given these samples are fully + * independent, is then given by P(n)^2, provided our sample period + * is sufficiently short compared to the usage pattern. + * + * This quadric squishes small probabilities, making it less likely we + * act on an unlikely task<->page relation. + */ + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && + cpupid_to_nid(last_cpupid) != dst_nid) + return false; + + /* Always allow migrate on private faults */ + if (cpupid_match_pid(p, last_cpupid)) + return true; + + /* A shared fault, but p->numa_group has not been set up yet. */ + if (!ng) + return true; + + /* + * Do not migrate if the destination is not a node that + * is actively used by this numa group. + */ + if (!node_isset(dst_nid, ng->active_nodes)) + return false; + + /* + * Source is a node that is not actively used by this + * numa group, while the destination is. Migrate. + */ + if (!node_isset(src_nid, ng->active_nodes)) + return true; + + /* + * Both source and destination are nodes in active + * use by this numa group. Maximize memory bandwidth + * by migrating from more heavily used groups, to less + * heavily used ones, spreading the load around. + * Use a 1/4 hysteresis to avoid spurious page movement. 
+ */ + return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); +} + static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -- cgit v1.2.3 From 7e2703e6099609adc93679c4d45cd6247f565971 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:45 -0500 Subject: sched/numa: Normalize faults_cpu stats and weigh by CPU use Tracing the code that decides the active nodes has made it abundantly clear that the naive implementation of the faults_from code has issues. Specifically, the garbage collector in some workloads will access orders of magnitude more memory than the threads that do all the active work. This resulted in the node with the garbage collector being marked the only active node in the group. This issue is avoided if we weigh the statistics by CPU use of each task in the numa group, instead of by how many faults each thread has incurred. To achieve this, we normalize the number of faults to the fraction of faults that occurred on each node, and then multiply that fraction by the fraction of CPU time the task has used since the last time task_numa_placement was invoked. This way the nodes in the active node mask will be the ones where the tasks from the numa group are most actively running, and the influence of e.g. the garbage collector and other do-little threads is properly minimized. On a 4 node system, using CPU use statistics calculated over a longer interval results in about 1% fewer page migrations with two 32-warehouse specjbb runs on a 4 node system, and about 5% fewer page migrations, as well as 1% better throughput, with two 8-warehouse specjbb runs, as compared with the shorter term statistics kept by the scheduler. 
Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-7-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc708c53bf03..a561c9e8e382 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1746,6 +1746,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_work.next = &p->numa_work; p->numa_faults_memory = NULL; p->numa_faults_buffer_memory = NULL; + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eeabb33f349e..8fc3a8234817 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -887,6 +887,11 @@ struct numa_group { struct rcu_head rcu; nodemask_t active_nodes; unsigned long total_faults; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ unsigned long *faults_cpu; unsigned long faults[0]; }; @@ -1446,11 +1451,41 @@ static void update_task_scan_period(struct task_struct *p, memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } +/* + * Get the fraction of time the task has been running since the last + * NUMA placement cycle. The scheduler keeps similar statistics, but + * decays those on a 32ms period, which is orders of magnitude off + * from the dozens-of-seconds NUMA balancing period. Use the scheduler + * stats only if the task is so new there are no NUMA statistics yet. 
+ */ +static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) +{ + u64 runtime, delta, now; + /* Use the start of this time slice to avoid calculations. */ + now = p->se.exec_start; + runtime = p->se.sum_exec_runtime; + + if (p->last_task_numa_placement) { + delta = runtime - p->last_sum_exec_runtime; + *period = now - p->last_task_numa_placement; + } else { + delta = p->se.avg.runnable_avg_sum; + *period = p->se.avg.runnable_avg_period; + } + + p->last_sum_exec_runtime = runtime; + p->last_task_numa_placement = now; + + return delta; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; unsigned long max_faults = 0, max_group_faults = 0; unsigned long fault_types[2] = { 0, 0 }; + unsigned long total_faults; + u64 runtime, period; spinlock_t *group_lock = NULL; seq = ACCESS_ONCE(p->mm->numa_scan_seq); @@ -1459,6 +1494,10 @@ static void task_numa_placement(struct task_struct *p) p->numa_scan_seq = seq; p->numa_scan_period_max = task_scan_max(p); + total_faults = p->numa_faults_locality[0] + + p->numa_faults_locality[1]; + runtime = numa_get_avg_runtime(p, &period); + /* If the task is part of a group prevent parallel updates to group stats */ if (p->numa_group) { group_lock = &p->numa_group->lock; @@ -1471,7 +1510,7 @@ static void task_numa_placement(struct task_struct *p) int priv, i; for (priv = 0; priv < 2; priv++) { - long diff, f_diff; + long diff, f_diff, f_weight; i = task_faults_idx(nid, priv); diff = -p->numa_faults_memory[i]; @@ -1483,8 +1522,18 @@ static void task_numa_placement(struct task_struct *p) fault_types[priv] += p->numa_faults_buffer_memory[i]; p->numa_faults_buffer_memory[i] = 0; + /* + * Normalize the faults_from, so all tasks in a group + * count according to CPU use, instead of by the raw + * number of faults. Tasks with little runtime have + * little over-all impact on throughput, and thus their + * faults are less important. 
+ */ + f_weight = div64_u64(runtime << 16, period + 1); + f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + (total_faults + 1); p->numa_faults_cpu[i] >>= 1; - p->numa_faults_cpu[i] += p->numa_faults_buffer_cpu[i]; + p->numa_faults_cpu[i] += f_weight; p->numa_faults_buffer_cpu[i] = 0; faults += p->numa_faults_memory[i]; -- cgit v1.2.3 From 35664fd41e1c8cc4f0b89f6a51db5af39ba50640 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:46 -0500 Subject: sched/numa: Do statistics calculation using local variables only The current code in task_numa_placement calculates the difference between the old and the new value, but also temporarily stores half of the old value in the per-process variables. The NUMA balancing code looks at those per-process variables, and having other tasks temporarily see halved statistics could lead to unwanted numa migrations. This can be avoided by doing all the math in local variables. This change also simplifies the code a little. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-8-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8fc3a8234817..4c449907a10e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1513,12 +1513,9 @@ static void task_numa_placement(struct task_struct *p) long diff, f_diff, f_weight; i = task_faults_idx(nid, priv); - diff = -p->numa_faults_memory[i]; - f_diff = -p->numa_faults_cpu[i]; /* Decay existing window, copy faults since last scan */ - p->numa_faults_memory[i] >>= 1; - p->numa_faults_memory[i] += p->numa_faults_buffer_memory[i]; + diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; fault_types[priv] += p->numa_faults_buffer_memory[i]; p->numa_faults_buffer_memory[i] = 0; @@ -1532,13 
+1529,12 @@ static void task_numa_placement(struct task_struct *p) f_weight = div64_u64(runtime << 16, period + 1); f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / (total_faults + 1); - p->numa_faults_cpu[i] >>= 1; - p->numa_faults_cpu[i] += f_weight; + f_diff = f_weight - p->numa_faults_cpu[i] / 2; p->numa_faults_buffer_cpu[i] = 0; + p->numa_faults_memory[i] += diff; + p->numa_faults_cpu[i] += f_diff; faults += p->numa_faults_memory[i]; - diff += p->numa_faults_memory[i]; - f_diff += p->numa_faults_cpu[i]; p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ -- cgit v1.2.3 From 58b46da336a9312b2e21bb576d1c2c484dbf6257 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:47 -0500 Subject: sched/numa: Rename variables in task_numa_fault() We track both the node of the memory after a NUMA fault, and the node of the CPU on which the fault happened. Rename the local variables in task_numa_fault to make things more explicit. Suggested-by: Mel Gorman Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-9-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4c449907a10e..d5832c367d87 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1735,11 +1735,11 @@ void task_numa_free(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. 
*/ -void task_numa_fault(int last_cpupid, int node, int pages, int flags) +void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) { struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; - int this_node = task_node(current); + int cpu_node = task_node(current); int priv; if (!numabalancing_enabled) @@ -1794,8 +1794,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages; - p->numa_faults_buffer_cpu[task_faults_idx(this_node, priv)] += pages; + p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; + p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } -- cgit v1.2.3 From be1e4e760d940c14d119bffef5eb007dfdf29046 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:48 -0500 Subject: sched/numa: Turn some magic numbers into #defines Cleanup suggested by Mel Gorman. Now the code contains some more hints on what statistics go where. Suggested-by: Mel Gorman Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-10-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d5832c367d87..1f41b122198e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -896,6 +896,15 @@ struct numa_group { unsigned long faults[0]; }; +/* Shared or private faults. */ +#define NR_NUMA_HINT_FAULT_TYPES 2 + +/* Memory and CPU locality */ +#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) + +/* Averaged statistics, and temporary buffers. 
*/ +#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) + pid_t task_numa_group_id(struct task_struct *p) { return p->numa_group ? p->numa_group->gid : 0; @@ -903,7 +912,7 @@ pid_t task_numa_group_id(struct task_struct *p) static inline int task_faults_idx(int nid, int priv) { - return 2 * nid + priv; + return NR_NUMA_HINT_FAULT_TYPES * nid + priv; } static inline unsigned long task_faults(struct task_struct *p, int nid) @@ -1509,7 +1518,7 @@ static void task_numa_placement(struct task_struct *p) unsigned long faults = 0, group_faults = 0; int priv, i; - for (priv = 0; priv < 2; priv++) { + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { long diff, f_diff, f_weight; i = task_faults_idx(nid, priv); @@ -1620,11 +1629,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ - grp->faults_cpu = grp->faults + 2 * nr_node_ids; + grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * + nr_node_ids; node_set(task_node(current), grp->active_nodes); - for (i = 0; i < 4*nr_node_ids; i++) + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults_memory[i]; grp->total_faults = p->total_numa_faults; @@ -1682,7 +1692,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock(&my_grp->lock, &grp->lock); - for (i = 0; i < 4*nr_node_ids; i++) { + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { my_grp->faults[i] -= p->numa_faults_memory[i]; grp->faults[i] += p->numa_faults_memory[i]; } @@ -1714,7 +1724,7 @@ void task_numa_free(struct task_struct *p) if (grp) { spin_lock(&grp->lock); - for (i = 0; i < 4*nr_node_ids; i++) + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; @@ -1755,14 +1765,20 @@ void task_numa_fault(int last_cpupid, int 
mem_node, int pages, int flags) /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults_memory)) { - int size = sizeof(*p->numa_faults_memory) * 4 * nr_node_ids; + int size = sizeof(*p->numa_faults_memory) * + NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; - /* numa_faults and numa_faults_buffer share the allocation */ - p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); + p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); if (!p->numa_faults_memory) return; BUG_ON(p->numa_faults_buffer_memory); + /* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. + */ p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); -- cgit v1.2.3 From 81993e81a994504f4c8b97d3410c9a052cdbcc9d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 1 Feb 2014 18:54:11 -0800 Subject: compat: Get rid of (get|put)_compat_time(val|spec) We have two APIs for compatibility timespec/val, with confusingly similar names. compat_(get|put)_time(val|spec) *do* handle the case where COMPAT_USE_64BIT_TIME is set, whereas (get|put)_compat_time(val|spec) do not. This is an accident waiting to happen. Clean it up by favoring the full-service version; the limited version is replaced with double-underscore versions static to kernel/compat.c. A common pattern is to convert a struct timespec to kernel format in an allocation on the user stack. Unfortunately it is open-coded in several places. Since this allocation isn't actually needed if COMPAT_USE_64BIT_TIME is true (since user format == kernel format) encapsulate that whole pattern into the function compat_convert_timespec().
An equivalent function should be written for struct timeval if it is needed in the future. Finally, get rid of compat_(get|put)_timeval_convert(): each was only used once, and the latter was not even doing what the function said (no conversion actually was being done.) Moving the conversion into compat_sys_settimeofday() itself makes the code much more similar to sys_settimeofday() itself. v3: Remove unused compat_convert_timeval(). v2: Drop bogus "const" in the destination argument for compat_convert_time*(). Cc: Mauro Carvalho Chehab Cc: Alexander Viro Cc: Hans Verkuil Cc: Andrew Morton Cc: Heiko Carstens Cc: Manfred Spraul Cc: Mateusz Guzik Cc: Rafael Aquini Cc: Davidlohr Bueso Cc: Stephen Rothwell Cc: Dan Carpenter Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Tested-by: H.J. Lu Signed-off-by: H. Peter Anvin --- kernel/compat.c | 108 +++++++++++++++++++++++++------------------------- kernel/futex_compat.c | 2 +- 2 files changed, 55 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 0a09e481b70b..3afc524a57ad 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -30,28 +30,6 @@ #include -/* - * Get/set struct timeval with struct timespec on the native side - */ -static int compat_get_timeval_convert(struct timespec *o, - struct compat_timeval __user *i) -{ - long usec; - - if (get_user(o->tv_sec, &i->tv_sec) || - get_user(usec, &i->tv_usec)) - return -EFAULT; - o->tv_nsec = usec * 1000; - return 0; -} - -static int compat_put_timeval_convert(struct compat_timeval __user *o, - struct timeval *i) -{ - return (put_user(i->tv_sec, &o->tv_sec) || - put_user(i->tv_usec, &o->tv_usec)) ? 
-EFAULT : 0; -} - static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) { memset(txc, 0, sizeof(struct timex)); @@ -116,7 +94,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, if (tv) { struct timeval ktv; do_gettimeofday(&ktv); - if (compat_put_timeval_convert(tv, &ktv)) + if (compat_put_timeval(&ktv, tv)) return -EFAULT; } if (tz) { @@ -130,59 +108,58 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) { - struct timespec kts; - struct timezone ktz; + struct timeval user_tv; + struct timespec new_ts; + struct timezone new_tz; if (tv) { - if (compat_get_timeval_convert(&kts, tv)) + if (compat_get_timeval(&user_tv, tv)) return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) + if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); + return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } -int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) { return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || __get_user(tv->tv_sec, &ctv->tv_sec) || __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(get_compat_timeval); -int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) { return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || __put_user(tv->tv_sec, &ctv->tv_sec) || __put_user(tv->tv_usec, &ctv->tv_usec)) ? 
-EFAULT : 0; } -EXPORT_SYMBOL_GPL(put_compat_timeval); -int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) +static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(get_compat_timespec); -int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) +static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts) { return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || __put_user(ts->tv_sec, &cts->tv_sec) || __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(put_compat_timespec); int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; else - return get_compat_timeval(tv, utv); + return __compat_get_timeval(tv, utv); } EXPORT_SYMBOL_GPL(compat_get_timeval); @@ -191,7 +168,7 @@ int compat_put_timeval(const struct timeval *tv, void __user *utv) if (COMPAT_USE_64BIT_TIME) return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; else - return put_compat_timeval(tv, utv); + return __compat_put_timeval(tv, utv); } EXPORT_SYMBOL_GPL(compat_put_timeval); @@ -200,7 +177,7 @@ int compat_get_timespec(struct timespec *ts, const void __user *uts) if (COMPAT_USE_64BIT_TIME) return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; else - return get_compat_timespec(ts, uts); + return __compat_get_timespec(ts, uts); } EXPORT_SYMBOL_GPL(compat_get_timespec); @@ -209,10 +186,33 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts) if (COMPAT_USE_64BIT_TIME) return copy_to_user(uts, ts, sizeof *ts) ? 
-EFAULT : 0; else - return put_compat_timespec(ts, uts); + return __compat_put_timespec(ts, uts); } EXPORT_SYMBOL_GPL(compat_put_timespec); +int compat_convert_timespec(struct timespec __user **kts, + const void __user *cts) +{ + struct timespec ts; + struct timespec __user *uts; + + if (!cts || COMPAT_USE_64BIT_TIME) { + *kts = (struct timespec __user *)cts; + return 0; + } + + uts = compat_alloc_user_space(sizeof(ts)); + if (!uts) + return -EFAULT; + if (compat_get_timespec(&ts, cts)) + return -EFAULT; + if (copy_to_user(uts, &ts, sizeof(ts))) + return -EFAULT; + + *kts = uts; + return 0; +} + static long compat_nanosleep_restart(struct restart_block *restart) { struct compat_timespec __user *rmtp; @@ -229,7 +229,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) if (ret) { rmtp = restart->nanosleep.compat_rmtp; - if (rmtp && put_compat_timespec(&rmt, rmtp)) + if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } @@ -243,7 +243,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, mm_segment_t oldfs; long ret; - if (get_compat_timespec(&tu, rqtp)) + if (compat_get_timespec(&tu, rqtp)) return -EFAULT; if (!timespec_valid(&tu)) @@ -263,7 +263,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, restart->fn = compat_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; - if (rmtp && put_compat_timespec(&rmt, rmtp)) + if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } @@ -647,8 +647,8 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, int get_compat_itimerspec(struct itimerspec *dst, const struct compat_itimerspec __user *src) { - if (get_compat_timespec(&dst->it_interval, &src->it_interval) || - get_compat_timespec(&dst->it_value, &src->it_value)) + if (__compat_get_timespec(&dst->it_interval, &src->it_interval) || + __compat_get_timespec(&dst->it_value, &src->it_value)) return -EFAULT; return 0; } @@ -656,8 +656,8 @@ int 
get_compat_itimerspec(struct itimerspec *dst, int put_compat_itimerspec(struct compat_itimerspec __user *dst, const struct itimerspec *src) { - if (put_compat_timespec(&src->it_interval, &dst->it_interval) || - put_compat_timespec(&src->it_value, &dst->it_value)) + if (__compat_put_timespec(&src->it_interval, &dst->it_interval) || + __compat_put_timespec(&src->it_value, &dst->it_value)) return -EFAULT; return 0; } @@ -727,7 +727,7 @@ long compat_sys_clock_settime(clockid_t which_clock, mm_segment_t oldfs; struct timespec ts; - if (get_compat_timespec(&ts, tp)) + if (compat_get_timespec(&ts, tp)) return -EFAULT; oldfs = get_fs(); set_fs(KERNEL_DS); @@ -749,7 +749,7 @@ long compat_sys_clock_gettime(clockid_t which_clock, err = sys_clock_gettime(which_clock, (struct timespec __user *) &ts); set_fs(oldfs); - if (!err && put_compat_timespec(&ts, tp)) + if (!err && compat_put_timespec(&ts, tp)) return -EFAULT; return err; } @@ -789,7 +789,7 @@ long compat_sys_clock_getres(clockid_t which_clock, err = sys_clock_getres(which_clock, (struct timespec __user *) &ts); set_fs(oldfs); - if (!err && tp && put_compat_timespec(&ts, tp)) + if (!err && tp && compat_put_timespec(&ts, tp)) return -EFAULT; return err; } @@ -808,7 +808,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) set_fs(oldfs); if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - put_compat_timespec(&tu, rmtp)) + compat_put_timespec(&tu, rmtp)) return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { @@ -827,7 +827,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, struct timespec in, out; struct restart_block *restart; - if (get_compat_timespec(&in, rqtp)) + if (compat_get_timespec(&in, rqtp)) return -EFAULT; oldfs = get_fs(); @@ -838,7 +838,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, set_fs(oldfs); if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - put_compat_timespec(&out, rmtp)) + compat_put_timespec(&out, rmtp)) return -EFAULT; if (err == 
-ERESTART_RESTARTBLOCK) { @@ -1130,7 +1130,7 @@ COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, set_fs(KERNEL_DS); ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); set_fs(old_fs); - if (put_compat_timespec(&t, interval)) + if (compat_put_timespec(&t, interval)) return -EFAULT; return ret; } diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index f9f44fd4d34d..55c8c9349cfe 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -183,7 +183,7 @@ COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI)) { - if (get_compat_timespec(&ts, utime)) + if (compat_get_timespec(&ts, utime)) return -EFAULT; if (!timespec_valid(&ts)) return -EINVAL; -- cgit v1.2.3 From dce44e03b0a3448ad11ac6c6e0cbe299e0400791 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 2 Feb 2014 17:57:28 -0800 Subject: compat: Fix sparse address space warnings In compat_sys_old_getrlimit() we pass a kernel pointer to sys_old_getrlimit() inside a set_fs() bracket. This is okay, so we can safely cast the affected pointer to __user. In compat_clock_nanosleep_restart(), the variable "rmtp" holds a user pointer. Annotate it as such. Both of these warnings are ancient, but were reported by Fengguang Wu's test system due to other changes. Signed-off-by: H. 
Peter Anvin Cc: Toyo Abe Link: http://lkml.kernel.org/n/tip-507h7cq5e45eg6ygtykon3bf@git.kernel.org --- kernel/compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 3afc524a57ad..7076b57fa52e 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -451,7 +451,7 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - ret = sys_old_getrlimit(resource, &r); + ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r); set_fs(old_fs); if (!ret) { @@ -799,7 +799,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) long err; mm_segment_t oldfs; struct timespec tu; - struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp; + struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp; restart->nanosleep.rmtp = (struct timespec __user *) &tu; oldfs = get_fs(); -- cgit v1.2.3 From 1ff6bbfd13ca2c114a5cb58e1a92d1e5d68ce0b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Jan 2014 18:10:37 -0500 Subject: arm, pm, vmpressure: add missing slab.h includes arch/arm/mach-tegra/pm.c, kernel/power/console.c and mm/vmpressure.c were somehow getting slab.h indirectly through cgroup.h which in turn was getting it indirectly through xattr.h. A scheduled cgroup change drops xattr.h inclusion from cgroup.h and breaks compilation of these three files. Add explicit slab.h includes to the three files. A pending cgroup patch depends on this change and it'd be great if this can be routed through cgroup/for-3.14-fixes branch. Signed-off-by: Tejun Heo Acked-by: Stephen Warren Cc: Thierry Reding Cc: linux-tegra@vger.kernel.org Cc: "Rafael J. 
Wysocki" Cc: linux-pm@vger.kernel.org Cc: Johannes Weiner Cc: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: cgroups@vger.kernel.org --- kernel/power/console.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/console.c b/kernel/power/console.c index eacb8bd8cab4..aba9c545a0e3 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "power.h" #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) -- cgit v1.2.3 From 80d767d770fd9c697e434fd080c2db7b5c60c6dd Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 24 Jan 2014 16:41:36 -0500 Subject: time: Fix overflow when HZ is smaller than 60 When compiling for the IA-64 ski emulator, HZ is set to 32 because the emulation is slow and we don't want to waste too many cycles processing timers. Alpha also has an option to set HZ to 32. This causes integer underflow in kernel/time/jiffies.c: kernel/time/jiffies.c:66:2: warning: large integer implicitly truncated to unsigned type [-Woverflow] .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ ^ This patch reduces the JIFFIES_SHIFT value to avoid the overflow. Signed-off-by: Mikulas Patocka Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1401241639100.23871@file01.intranet.prod.int.rdu2.redhat.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/jiffies.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 7a925ba456fb..a6a5bf53e86d 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -51,7 +51,13 @@ * HZ shrinks, so values greater than 8 overflow 32bits when * HZ=100. 
*/ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else #define JIFFIES_SHIFT 8 +#endif static cycle_t jiffies_read(struct clocksource *cs) { -- cgit v1.2.3 From e8b175946c16d7001b22620f52d78ab497efc9d0 Mon Sep 17 00:00:00 2001 From: Shaibal Dutta Date: Fri, 31 Jan 2014 11:18:24 -0800 Subject: timekeeping: Move clock sync work to power efficient workqueue For better use of CPU idle time, allow the scheduler to select the CPU on which the CMOS clock sync work would be scheduled. This improves idle residency time and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Signed-off-by: Shaibal Dutta [zoran.markovic@linaro.org: Added commit message. Aligned code.] Signed-off-by: Zoran Markovic Cc: John Stultz Link: http://lkml.kernel.org/r/1391195904-12497-1-git-send-email-zoran.markovic@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index af8d1d4f3d55..419a52cecd20 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -514,12 +514,13 @@ static void sync_cmos_clock(struct work_struct *work) next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } - schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); + queue_delayed_work(system_power_efficient_wq, + &sync_cmos_work, timespec_to_jiffies(&next)); } void ntp_notify_cmos_timer(void) { - schedule_delayed_work(&sync_cmos_work, 0); + queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); } #else -- cgit v1.2.3 From 627ee7947e2e83ba565c31c5c9373d6e364b1ecd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 3 Feb 2014 14:34:31 -0800 Subject: clockevents: Serialize calls to clockevents_update_freq() in the core We can identify the broadcast device in the core and serialize all callers including interrupts on a different CPU against the update.
Also, disabling interrupts is moved into the core allowing callers to leave interrupts enabled when calling clockevents_update_freq(). Signed-off-by: Soren Brinkmann Cc: linux-arm-kernel@lists.infradead.org Cc: Soeren Brinkmann Cc: Daniel Lezcano Cc: Michal Simek Link: http://lkml.kernel.org/r/1391466877-28908-2-git-send-email-soren.brinkmann@xilinx.com Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 29 ++++++++++++++++++++++------- kernel/time/tick-broadcast.c | 25 +++++++++++++++++++------ kernel/time/tick-internal.h | 4 ++++ 3 files changed, 45 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 086ad6043bcb..641d91003a45 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -439,6 +439,16 @@ void clockevents_config_and_register(struct clock_event_device *dev, } EXPORT_SYMBOL_GPL(clockevents_config_and_register); +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + clockevents_config(dev, freq); + + if (dev->mode != CLOCK_EVT_MODE_ONESHOT) + return 0; + + return clockevents_program_event(dev, dev->next_event, false); +} + /** * clockevents_update_freq - Update frequency and reprogram a clock event device. * @dev: device to modify @@ -446,17 +456,22 @@ EXPORT_SYMBOL_GPL(clockevents_config_and_register); * * Reconfigure and reprogram a clock event device in oneshot * mode. Must be called on the cpu for which the device delivers per - * cpu timer events with interrupts disabled! Returns 0 on success, - * -ETIME when the event is in the past. + * cpu timer events. If called for the broadcast device the core takes + * care of serialization. + * + * Returns 0 on success, -ETIME when the event is in the past.
*/ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) { - clockevents_config(dev, freq); - - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) - return 0; + unsigned long flags; + int ret; - return clockevents_program_event(dev, dev->next_event, false); + local_irq_save(flags); + ret = tick_broadcast_update_freq(dev, freq); + if (ret == -ENODEV) + ret = __clockevents_update_freq(dev, freq); + local_irq_restore(flags); + return ret; } /* diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 43780ab5e279..003e6c3663b1 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -120,6 +120,19 @@ int tick_is_broadcast_device(struct clock_event_device *dev) return (dev && tick_broadcast_device.evtdev == dev); } +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) +{ + int ret = -ENODEV; + + if (tick_is_broadcast_device(dev)) { + raw_spin_lock(&tick_broadcast_lock); + ret = __clockevents_update_freq(dev, freq); + raw_spin_unlock(&tick_broadcast_lock); + } + return ret; +} + + static void err_broadcast(const struct cpumask *mask) { pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); @@ -272,12 +285,8 @@ static void tick_do_broadcast(struct cpumask *mask) */ static void tick_do_periodic_broadcast(void) { - raw_spin_lock(&tick_broadcast_lock); - cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); tick_do_broadcast(tmpmask); - - raw_spin_unlock(&tick_broadcast_lock); } /* @@ -287,13 +296,15 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { ktime_t next; + raw_spin_lock(&tick_broadcast_lock); + tick_do_periodic_broadcast(); /* * The device is in periodic mode. 
No reprogramming necessary: */ if (dev->mode == CLOCK_EVT_MODE_PERIODIC) - return; + goto unlock; /* * Setup the next period for devices, which do not have @@ -306,9 +317,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) next = ktime_add(next, tick_period); if (!clockevents_program_event(dev, next, false)) - return; + goto unlock; tick_do_periodic_broadcast(); } +unlock: + raw_spin_unlock(&tick_broadcast_lock); } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 8329669b51ec..26f1c0ba9d81 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -111,6 +111,7 @@ extern int tick_resume_broadcast(void); extern void tick_broadcast_init(void); extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); #else /* !BROADCAST */ @@ -133,6 +134,8 @@ static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } static inline int tick_resume_broadcast(void) { return 0; } static inline void tick_broadcast_init(void) { } +static inline int tick_broadcast_update_freq(struct clock_event_device *dev, + u32 freq) { return -ENODEV; } /* * Set the periodic handler in non broadcast mode @@ -154,5 +157,6 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) #endif +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -- cgit v1.2.3 From fe79a9ba11962a603fb6af68fcb476e64031e46c Mon Sep 17 00:00:00 2001 From: Soren Brinkmann Date: Mon, 3 Feb 2014 14:34:32 -0800 Subject: clockevents: Adjust timer interval when frequency changes clockevent devices in periodic mode are not updated when the frequency of the device changes. Issue a dev->set_mode() callback which forces the device to reevaluate the timer settings. 
Signed-off-by: Soren Brinkmann Cc: linux-arm-kernel@lists.infradead.org Cc: Daniel Lezcano Cc: Michal Simek Link: http://lkml.kernel.org/r/1391466877-28908-3-git-send-email-soren.brinkmann@xilinx.com Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 641d91003a45..f85e5fda9c66 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -443,10 +443,13 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) - return 0; + if (dev->mode == CLOCK_EVT_MODE_ONESHOT) + return clockevents_program_event(dev, dev->next_event, false); + + if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); - return clockevents_program_event(dev, dev->next_event, false); + return 0; } /** -- cgit v1.2.3 From da7e6f45c34d39186b72328bacc4dd86bff60e0a Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Fri, 7 Feb 2014 13:36:06 +0530 Subject: time: Change the return type of clockevents_notify() to integer The broadcast framework can potentially be made use of by archs which do not have an external clock device as well. Then, it is required that one of the CPUs needs to handle the broadcasting of wakeup IPIs to the CPUs in deep idle. As a result its local timers should remain functional all the time. For such a CPU, the BROADCAST_ENTER notification has to fail indicating that its clock device cannot be shut down. To make way for this support, change the return type of tick_broadcast_oneshot_control() and hence clockevents_notify() to indicate such scenarios.
Signed-off-by: Preeti U Murthy Cc: deepthi@linux.vnet.ibm.com Cc: paulmck@linux.vnet.ibm.com Cc: fweisbec@gmail.com Cc: paulus@samba.org Cc: srivatsa.bhat@linux.vnet.ibm.com Cc: svaidy@linux.vnet.ibm.com Cc: peterz@infradead.org Cc: benh@kernel.crashing.org Cc: rafael.j.wysocki@intel.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20140207080606.17187.78306.stgit@preeti.in.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 8 +++++--- kernel/time/tick-broadcast.c | 6 ++++-- kernel/time/tick-internal.h | 6 +++--- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f85e5fda9c66..ad362c260ef4 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -542,12 +542,13 @@ void clockevents_resume(void) #ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events + * Returns 0 on success, any other value on error */ -void clockevents_notify(unsigned long reason, void *arg) +int clockevents_notify(unsigned long reason, void *arg) { struct clock_event_device *dev, *tmp; unsigned long flags; - int cpu; + int cpu, ret = 0; raw_spin_lock_irqsave(&clockevents_lock, flags); @@ -560,7 +561,7 @@ void clockevents_notify(unsigned long reason, void *arg) case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); + ret = tick_broadcast_oneshot_control(reason); break; case CLOCK_EVT_NOTIFY_CPU_DYING: @@ -603,6 +604,7 @@ void clockevents_notify(unsigned long reason, void *arg) break; } raw_spin_unlock_irqrestore(&clockevents_lock, flags); + return ret; } EXPORT_SYMBOL_GPL(clockevents_notify); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 003e6c3663b1..84c8fd91d744 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -646,14 +646,15 @@ again: /* * Powerstate information: The system 
enters/leaves a state, where * affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. */ -void tick_broadcast_oneshot_control(unsigned long reason) +int tick_broadcast_oneshot_control(unsigned long reason) { struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags; ktime_t now; - int cpu; + int cpu, ret = 0; /* * Periodic mode does not care about the enter/exit of power @@ -759,6 +760,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) } out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 26f1c0ba9d81..0756c62c219a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -46,7 +46,7 @@ extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); extern void tick_resume_oneshot(void); # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); -extern void tick_broadcast_oneshot_control(unsigned long reason); +extern int tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); @@ -58,7 +58,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } @@ -87,7 +87,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline void 
tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { -- cgit v1.2.3 From 5d1638acb9f62fa7eb0c07cb85318bbe1f13b227 Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Fri, 7 Feb 2014 13:36:32 +0530 Subject: tick: Introduce hrtimer based broadcast On some architectures, in certain CPU deep idle states the local timers stop. An external clock device is used to wakeup these CPUs. The kernel support for the wakeup of these CPUs is provided by the tick broadcast framework by using the external clock device as the wakeup source. However not all implementations of architectures provide such an external clock device. This patch includes support in the broadcast framework to handle the wakeup of the CPUs in deep idle states on such systems by queuing a hrtimer on one of the CPUs, which is meant to handle the wakeup of CPUs in deep idle states. This patchset introduces a pseudo clock device which can be registered by the archs as tick_broadcast_device in the absence of a real external clock device. Once registered, the broadcast framework will work as is for these architectures as long as the archs take care of the BROADCAST_ENTER notification failing for one of the CPUs. This CPU is made the stand by CPU to handle wakeup of the CPUs in deep idle and it *must not enter deep idle states*. The CPU with the earliest wakeup is chosen to be this CPU. Hence this way the stand by CPU dynamically moves around and so does the hrtimer which is queued to trigger at the next earliest wakeup time. This is consistent with the case where an external clock device is present. The smp affinity of this clock device is set to the CPU with the earliest wakeup. 
This patchset handles the hotplug of the stand by CPU as well by moving the hrtimer on to the CPU handling the CPU_DEAD notification. Originally-from: Thomas Gleixner Signed-off-by: Preeti U Murthy Cc: deepthi@linux.vnet.ibm.com Cc: paulmck@linux.vnet.ibm.com Cc: fweisbec@gmail.com Cc: paulus@samba.org Cc: srivatsa.bhat@linux.vnet.ibm.com Cc: svaidy@linux.vnet.ibm.com Cc: peterz@infradead.org Cc: benh@kernel.crashing.org Cc: rafael.j.wysocki@intel.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20140207080632.17187.80532.stgit@preeti.in.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/Makefile | 2 +- kernel/time/tick-broadcast-hrtimer.c | 106 +++++++++++++++++++++++++++++++++++ kernel/time/tick-broadcast.c | 54 +++++++++++++++++- 3 files changed, 158 insertions(+), 4 deletions(-) create mode 100644 kernel/time/tick-broadcast-hrtimer.c (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 9250130646f5..06151ef4a744 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -3,7 +3,7 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o tick-broadcast-hrtimer.o obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c new file mode 100644 index 000000000000..92425279312b --- /dev/null +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -0,0 +1,106 @@ +/* + * linux/kernel/time/tick-broadcast-hrtimer.c + * This file emulates a local clock event device + * via a pseudo clock device. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tick-internal.h" + +static struct hrtimer bctimer; + +static void bc_set_mode(enum clock_event_mode mode, + struct clock_event_device *bc) +{ + switch (mode) { + case CLOCK_EVT_MODE_SHUTDOWN: + /* + * Note, we cannot cancel the timer here as we might + * run into the following live lock scenario: + * + * cpu 0 cpu1 + * lock(broadcast_lock); + * hrtimer_interrupt() + * bc_handler() + * tick_handle_oneshot_broadcast(); + * lock(broadcast_lock); + * hrtimer_cancel() + * wait_for_callback() + */ + hrtimer_try_to_cancel(&bctimer); + break; + default: + break; + } +} + +/* + * This is called from the guts of the broadcast code when the cpu + * which is about to enter idle has the earliest broadcast timer event. + */ +static int bc_set_next(ktime_t expires, struct clock_event_device *bc) +{ + /* + * We try to cancel the timer first. If the callback is on + * flight on some other cpu then we let it handle it. If we + * were able to cancel the timer nothing can rearm it as we + * own broadcast_lock. + * + * However we can also be called from the event handler of + * ce_broadcast_hrtimer itself when it expires. We cannot + * restart the timer because we are in the callback, but we + * can set the expiry time and let the callback return + * HRTIMER_RESTART. 
+ */ + if (hrtimer_try_to_cancel(&bctimer) >= 0) { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); + /* Bind the "device" to the cpu */ + bc->bound_on = smp_processor_id(); + } else if (bc->bound_on == smp_processor_id()) { + hrtimer_set_expires(&bctimer, expires); + } + return 0; +} + +static struct clock_event_device ce_broadcast_hrtimer = { + .set_mode = bc_set_mode, + .set_next_ktime = bc_set_next, + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_KTIME | + CLOCK_EVT_FEAT_HRTIMER, + .rating = 0, + .bound_on = -1, + .min_delta_ns = 1, + .max_delta_ns = KTIME_MAX, + .min_delta_ticks = 1, + .max_delta_ticks = KTIME_MAX, + .mult = 1, + .shift = 0, + .cpumask = cpu_all_mask, +}; + +static enum hrtimer_restart bc_handler(struct hrtimer *t) +{ + ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); + + if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX) + return HRTIMER_NORESTART; + + return HRTIMER_RESTART; +} + +void tick_setup_hrtimer_broadcast(void) +{ + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + bctimer.function = bc_handler; + clockevents_register_device(&ce_broadcast_hrtimer); +} diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 84c8fd91d744..63c7b2d9ed8e 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -643,6 +643,42 @@ again: raw_spin_unlock(&tick_broadcast_lock); } +static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) +{ + if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return 0; + if (bc->next_event.tv64 == KTIME_MAX) + return 0; + return bc->bound_on == cpu ? -EBUSY : 0; +} + +static void broadcast_shutdown_local(struct clock_event_device *bc, + struct clock_event_device *dev) +{ + /* + * For hrtimer based broadcasting we cannot shutdown the cpu + * local device if our own event is the first one to expire or + * if we own the broadcast timer. 
+ */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { + if (broadcast_needs_cpu(bc, smp_processor_id())) + return; + if (dev->next_event.tv64 < bc->next_event.tv64) + return; + } + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); +} + +static void broadcast_move_bc(int deadcpu) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || !broadcast_needs_cpu(bc, deadcpu)) + return; + /* This moves the broadcast assignment to this cpu */ + clockevents_program_event(bc, bc->next_event, 1); +} + /* * Powerstate information: The system enters/leaves a state, where * affected devices might stop @@ -661,7 +697,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) * states */ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - return; + return 0; /* * We are called with preemtion disabled from the depth of the @@ -672,7 +708,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) dev = td->evtdev; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return; + return 0; bc = tick_broadcast_device.evtdev; @@ -680,7 +716,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + broadcast_shutdown_local(bc, dev); /* * We only reprogram the broadcast timer if we * did not mark ourself in the force mask and @@ -693,6 +729,16 @@ int tick_broadcast_oneshot_control(unsigned long reason) dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we remove the + * CPU from the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. 
+ */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); @@ -866,6 +912,8 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); + broadcast_move_bc(cpu); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.2.3 From f1689bb7abec8e2e670d8ad11eaa86d54bad8cfd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 16:00:46 +0100 Subject: time: Fixup fallout from recent clockevent/tick changes Make the stub function static inline instead of static and move the clockevents related function into the proper ifdeffed section. Reported-by: Fengguang Wu Signed-off-by: Thomas Gleixner Cc: Soren Brinkmann Cc: Preeti U Murthy --- kernel/time/tick-internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 0756c62c219a..7ab92b19965a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -155,8 +155,9 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); + #endif -int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -- cgit v1.2.3 From ab3f5faa6255a0eb4f832675507d9e295ca7e9ba Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 6 Feb 2014 15:56:01 -0800 Subject: cgroup: use an ordered workqueue for cgroup destruction Sometimes the cleanup after memcg hierarchy testing gets stuck in mem_cgroup_reparent_charges(), unable to bring non-kmem usage down to 0. 
There may turn out to be several causes, but a major cause is this: the workitem to offline parent can get run before workitem to offline child; parent's mem_cgroup_reparent_charges() circles around waiting for the child's pages to be reparented to its lrus, but it's holding cgroup_mutex which prevents the child from reaching its mem_cgroup_reparent_charges(). Just use an ordered workqueue for cgroup_destroy_wq. tj: Committing as the temporary fix until the reverse dependency can be removed from memcg. Comment updated accordingly. Fixes: e5fca243abae ("cgroup: use a dedicated workqueue for cgroup destruction") Suggested-by: Filipe Brandenburger Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2f46ba37f72..aa95591c1430 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4845,12 +4845,16 @@ static int __init cgroup_wq_init(void) /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. - * Use 1 for @max_active. + * + * XXX: Must be ordered to make sure parent is offlined after + * children. The ordering requirement is for memcg where a + * parent's offline may wait for a child's leading to deadlock. In + * the long term, this should be fixed from memcg side. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. 
*/ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0); BUG_ON(!cgroup_destroy_wq); /* -- cgit v1.2.3 From eb46bf89696972b856a9adb6aebd5c7b65c266e4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:26:33 -0500 Subject: cgroup: fix error return value in cgroup_mount() When cgroup_mount() fails to allocate an id for the root, it didn't set ret before jumping to unlock_drop ending up returning 0 after a failure. Fix it. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index aa95591c1430..793f37176077 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1566,10 +1566,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); - root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, - 0, 1, GFP_KERNEL); - if (root_cgrp->id < 0) + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + if (ret < 0) goto unlock_drop; + root_cgrp->id = ret; /* Check for name clashes with existing mounts */ ret = -EBUSY; -- cgit v1.2.3 From b58c89986a77a23658682a100eb15d8edb571ebb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:26:33 -0500 Subject: cgroup: fix error return from cgroup_create() cgroup_create() was returning 0 after allocation failures. Fix it. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 793f37176077..0eb7b868e1ab 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4158,7 +4158,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; - int ssid, err = 0; + int ssid, err; struct cgroup_subsys *ss; struct super_block *sb = root->sb; @@ -4168,8 +4168,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return -ENOMEM; name = cgroup_alloc_name(dentry); - if (!name) + if (!name) { + err = -ENOMEM; goto err_free_cgrp; + } rcu_assign_pointer(cgrp->name, name); /* @@ -4177,8 +4179,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * a half-baked cgroup. */ cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); - if (cgrp->id < 0) + if (cgrp->id < 0) { + err = -ENOMEM; goto err_free_name; + } /* * Only live parents can have children. Note that the liveliness -- cgit v1.2.3 From 48573a893303986e3b0b2974d6fb11f3d1bb7064 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:26:34 -0500 Subject: cgroup: fix locking in cgroup_cfts_commit() cgroup_cfts_commit() walks the cgroup hierarchy that the target subsystem is attached to and tries to apply the file changes. Due to the convolution with inode locking, it can't keep cgroup_mutex locked while iterating. It currently holds only RCU read lock around the actual iteration and then pins the found cgroup using dget(). Unfortunately, this is incorrect. Although the iteration does check cgroup_is_dead() before invoking dget(), there's nothing which prevents the dentry from going away in between.
Note that this is different from the usual css iterations where css_tryget() is used to pin the css - css_tryget() tests whether the css can be pinned and fails if not. The problem can be solved by simply holding cgroup_mutex instead of RCU read lock around the iteration, which actually reduces LOC. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0eb7b868e1ab..3edf7163b84f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2763,10 +2763,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) */ update_before = cgroup_serial_nr_next; - mutex_unlock(&cgroup_mutex); - /* add/rm files for all cgroups created before */ - rcu_read_lock(); css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; @@ -2775,23 +2772,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) inode = cgrp->dentry->d_inode; dget(cgrp->dentry); - rcu_read_unlock(); - dput(prev); prev = cgrp->dentry; + mutex_unlock(&cgroup_mutex); mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) ret = cgroup_addrm_files(cgrp, cfts, is_add); - mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - - rcu_read_lock(); if (ret) break; } - rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); dput(prev); deactivate_super(sb); return ret; -- cgit v1.2.3 From 3ed80a62bf959d34ebd4d553b026fbe7e6fbcc54 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:36:58 -0500 Subject: cgroup: drop module support With module support dropped from net_prio, no controller is using cgroup module support. None of the actual resource controllers can be built as a module and we aren't gonna add new controllers which don't control resources. This patch drops module support from cgroup.
* cgroup_[un]load_subsys() and cgroup_subsys->module removed. * As there's no point in distinguishing IS_BUILTIN() and IS_MODULE(), cgroup_subsys.h now uses IS_ENABLED() directly. * enum cgroup_subsys_id now exactly matches the list of enabled controllers as ordered in cgroup_subsys.h. * cgroup_subsys[] is now a contiguously occupied array. Size specification is no longer necessary and dropped. * for_each_builtin_subsys() is removed and for_each_subsys() is updated to not require any locking. * module ref handling is removed from rebind_subsystems(). * Module related comments dropped. v2: Rebased on top of fe1217c4f3f7 ("net: net_cls: move cgroupfs classid handling into core"). v3: Added {} around the if (need_forkexit_callback) block in cgroup_post_fork() for readability as suggested by Li. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 284 ++++---------------------------------------------------- 1 file changed, 16 insertions(+), 268 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2f46ba37f72..ccb16b47e293 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include @@ -120,15 +119,9 @@ static struct workqueue_struct *cgroup_destroy_wq; */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; -/* - * Generate an array of cgroup subsystem pointers. At boot time, this is - * populated with the built in subsystems, and modular subsystems are - * registered after that. The mutable section of this array is protected by - * cgroup_mutex. 
- */ +/* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) -static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { +static struct cgroup_subsys *cgroup_subsys[] = { #include }; @@ -258,30 +251,13 @@ static int notify_on_release(const struct cgroup *cgrp) else /** - * for_each_subsys - iterate all loaded cgroup subsystems + * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - * - * Iterates through all loaded subsystems. Should be called under - * cgroup_mutex or cgroup_root_mutex. */ #define for_each_subsys(ss, ssid) \ - for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ - (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((ss) = cgroup_subsys[(ssid)])) { } \ - else - -/** - * for_each_builtin_subsys - iterate all built-in cgroup subsystems - * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end - * - * Bulit-in subsystems are always present and iteration itself doesn't - * require any synchronization. - */ -#define for_each_builtin_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[i]) || true); (i)++) + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) /* iterate across the active hierarchies */ #define for_each_active_root(root) \ @@ -975,50 +951,24 @@ static void cgroup_d_remove_dir(struct dentry *dentry) remove_dir(dentry); } -/* - * Call with cgroup_mutex held. Drops reference counts on modules, including - * any duplicate ones that parse_cgroupfs_options took. If this function - * returns an error, no reference counts are touched. 
- */ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long added_mask, unsigned removed_mask) { struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; - unsigned long pinned = 0; int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); /* Check that any added subsystems are currently free */ - for_each_subsys(ss, i) { - if (!(added_mask & (1 << i))) - continue; - - /* is the subsystem mounted elsewhere? */ - if (ss->root != &cgroup_dummy_root) { - ret = -EBUSY; - goto out_put; - } - - /* pin the module */ - if (!try_module_get(ss->module)) { - ret = -ENOENT; - goto out_put; - } - pinned |= 1 << i; - } - - /* subsys could be missing if unloaded between parsing and here */ - if (added_mask != pinned) { - ret = -ENOENT; - goto out_put; - } + for_each_subsys(ss, i) + if ((added_mask & (1 << i)) && ss->root != &cgroup_dummy_root) + return -EBUSY; ret = cgroup_populate_dir(cgrp, added_mask); if (ret) - goto out_put; + return ret; /* * Nothing can fail from this point on. 
Remove files for the @@ -1057,9 +1007,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; - - /* subsystem is now free - drop reference on module */ - module_put(ss->module); root->subsys_mask &= ~bit; } } @@ -1071,12 +1018,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, root->flags |= CGRP_ROOT_SUBSYS_BOUND; return 0; - -out_put: - for_each_subsys(ss, i) - if (pinned & (1 << i)) - module_put(ss->module); - return ret; } static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) @@ -4506,7 +4447,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) return ret; } -static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) +static void __init cgroup_init_cftsets(struct cgroup_subsys *ss) { INIT_LIST_HEAD(&ss->cftsets); @@ -4559,185 +4500,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); - - /* this function shouldn't be used with modular subsystems, since they - * need to register a subsys_id, among other things */ - BUG_ON(ss->module); } -/** - * cgroup_load_subsys: load and register a modular subsystem at runtime - * @ss: the subsystem to load - * - * This function should be called in a modular subsystem's initcall. If the - * subsystem is built as a module, it will be assigned a new subsys_id and set - * up for use. If the subsystem is built-in anyway, work is delegated to the - * simpler cgroup_init_subsys. 
- */ -int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - int i, ret; - struct hlist_node *tmp; - struct css_set *cset; - unsigned long key; - - /* check name and function validity */ - if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || - ss->css_alloc == NULL || ss->css_free == NULL) - return -EINVAL; - - /* - * we don't support callbacks in modular subsystems. this check is - * before the ss->module check for consistency; a subsystem that could - * be a module should still have no callbacks even if the user isn't - * compiling it as one. - */ - if (ss->fork || ss->exit) - return -EINVAL; - - /* - * an optionally modular subsystem is built-in: we want to do nothing, - * since cgroup_init_subsys will have already taken care of it. - */ - if (ss->module == NULL) { - /* a sanity check */ - BUG_ON(cgroup_subsys[ss->subsys_id] != ss); - return 0; - } - - /* init base cftset */ - cgroup_init_cftsets(ss); - - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - cgroup_subsys[ss->subsys_id] = ss; - - /* - * no ss->css_alloc seems to need anything important in the ss - * struct, so this can happen first (i.e. before the dummy root - * attachment). - */ - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); - if (IS_ERR(css)) { - /* failure case - need to deassign the cgroup_subsys[] slot. */ - cgroup_subsys[ss->subsys_id] = NULL; - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - return PTR_ERR(css); - } - - ss->root = &cgroup_dummy_root; - - /* our new subsystem will be attached to the dummy hierarchy. */ - init_css(css, ss, cgroup_dummy_top); - - /* - * Now we need to entangle the css into the existing css_sets. unlike - * in cgroup_init_subsys, there are now multiple css_sets, so each one - * will need a new pointer to it; done by iterating the css_set_table. 
- * furthermore, modifying the existing css_sets will corrupt the hash - * table state, so each changed css_set will need its hash recomputed. - * this is all done under the css_set_lock. - */ - write_lock(&css_set_lock); - hash_for_each_safe(css_set_table, i, tmp, cset, hlist) { - /* skip entries that we already rehashed */ - if (cset->subsys[ss->subsys_id]) - continue; - /* remove existing entry */ - hash_del(&cset->hlist); - /* set new value */ - cset->subsys[ss->subsys_id] = css; - /* recompute hash and restore entry */ - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - } - write_unlock(&css_set_lock); - - ret = online_css(css); - if (ret) { - ss->css_free(css); - goto err_unload; - } - - /* success! */ - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - return 0; - -err_unload: - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - /* @ss can't be mounted here as try_module_get() would fail */ - cgroup_unload_subsys(ss); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_load_subsys); - -/** - * cgroup_unload_subsys: unload a modular subsystem - * @ss: the subsystem to unload - * - * This function should be called in a modular subsystem's exitcall. When this - * function is invoked, the refcount on the subsystem's module will be 0, so - * the subsystem will not be attached to any hierarchy. - */ -void cgroup_unload_subsys(struct cgroup_subsys *ss) -{ - struct cgrp_cset_link *link; - struct cgroup_subsys_state *css; - - BUG_ON(ss->module == NULL); - - /* - * we shouldn't be called if the subsystem is in use, and the use of - * try_module_get() in rebind_subsystems() should ensure that it - * doesn't start being used while we're killing it off. 
- */ - BUG_ON(ss->root != &cgroup_dummy_root); - - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - - css = cgroup_css(cgroup_dummy_top, ss); - if (css) - offline_css(css); - - /* deassign the subsys_id */ - cgroup_subsys[ss->subsys_id] = NULL; - - /* - * disentangle the css from all css_sets attached to the dummy - * top. as in loading, we need to pay our respects to the hashtable - * gods. - */ - write_lock(&css_set_lock); - list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) { - struct css_set *cset = link->cset; - unsigned long key; - - hash_del(&cset->hlist); - cset->subsys[ss->subsys_id] = NULL; - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - } - write_unlock(&css_set_lock); - - /* - * remove subsystem's css from the cgroup_dummy_top and free it - - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. - */ - if (css) - ss->css_free(css); - RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); - - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_unload_subsys); - /** * cgroup_init_early - cgroup initialization at system boot * @@ -4763,8 +4527,7 @@ int __init cgroup_init_early(void) list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); - /* at bootup time, we don't worry about modular subsystems */ - for_each_builtin_subsys(ss, i) { + for_each_subsys(ss, i) { BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); BUG_ON(!ss->css_alloc); @@ -4797,7 +4560,7 @@ int __init cgroup_init(void) if (err) return err; - for_each_builtin_subsys(ss, i) { + for_each_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); } @@ -5032,15 +4795,7 @@ void cgroup_post_fork(struct task_struct *child) * and addition to css_set. 
*/ if (need_forkexit_callback) { - /* - * fork/exit callbacks are supported only for builtin - * subsystems, and the builtin section of the subsys - * array is immutable, so we don't need to lock the - * subsys array here. On the other hand, modular section - * of the array can be freed at module unload, so we - * can't touch that. - */ - for_each_builtin_subsys(ss, i) + for_each_subsys(ss, i) if (ss->fork) ss->fork(child); } @@ -5105,11 +4860,8 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) RCU_INIT_POINTER(tsk->cgroups, &init_css_set); if (run_callbacks && need_forkexit_callback) { - /* - * fork/exit callbacks are supported only for builtin - * subsystems, see cgroup_post_fork() for details. - */ - for_each_builtin_subsys(ss, i) { + /* see cgroup_post_fork() for details */ + for_each_subsys(ss, i) { if (ss->exit) { struct cgroup_subsys_state *old_css = cset->subsys[i]; struct cgroup_subsys_state *css = task_css(tsk, i); @@ -5228,11 +4980,7 @@ static int __init cgroup_disable(char *str) if (!*token) continue; - /* - * cgroup_disable, being at boot time, can't know about - * module subsystems, so we don't worry about them. - */ - for_each_builtin_subsys(ss, i) { + for_each_subsys(ss, i) { if (!strcmp(token, ss->name)) { ss->disabled = 1; printk(KERN_INFO "Disabling %s control group" -- cgit v1.2.3 From 073219e995b4a3f8cf1ce8228b7ef440b6994ac0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:36:58 -0500 Subject: cgroup: clean up cgroup_subsys names and initialization cgroup_subsys is a bit messier than it needs to be. * The name of a subsys can be different from its internal identifier defined in cgroup_subsys.h. Most subsystems use the matching name but three - cpu, memory and perf_event - use different ones. * cgroup_subsys_id enums are postfixed with _subsys_id and each cgroup_subsys is postfixed with _subsys. 
cgroup.h is widely included throughout various subsystems, it doesn't and shouldn't have claim on such generic names which don't have any qualifier indicating that they belong to cgroup. * cgroup_subsys->subsys_id should always equal the matching cgroup_subsys_id enum; however, we require each controller to initialize it and then BUG if they don't match, which is a bit silly. This patch cleans up cgroup_subsys names and initialization by doing the followings. * cgroup_subsys_id enums are now postfixed with _cgrp_id, and each cgroup_subsys with _cgrp_subsys. * With the above, renaming subsys identifiers to match the userland visible names doesn't cause any naming conflicts. All non-matching identifiers are renamed to match the official names. cpu_cgroup -> cpu mem_cgroup -> memory perf -> perf_event * controllers no longer need to initialize ->subsys_id and ->name. They're generated in cgroup core and set automatically during boot. * Redundant cgroup_subsys declarations removed. * While updating BUG_ON()s in cgroup_init_early(), convert them to WARN()s. BUGging that early during boot is stupid - the kernel can't print anything, even through serial console and the trap handler doesn't even link stack frame properly for back-tracing. This patch doesn't introduce any behavior changes. v2: Rebased on top of fe1217c4f3f7 ("net: net_cls: move cgroupfs classid handling into core"). Signed-off-by: Tejun Heo Acked-by: Neil Horman Acked-by: "David S. Miller" Acked-by: "Rafael J. Wysocki" Acked-by: Michal Hocko Acked-by: Peter Zijlstra Acked-by: Aristeu Rozanski Acked-by: Ingo Molnar Acked-by: Li Zefan Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Serge E. 
Hallyn Cc: Vivek Goyal Cc: Thomas Graf --- kernel/cgroup.c | 34 ++++++++++++++++++++-------------- kernel/cgroup_freezer.c | 8 ++------ kernel/cpuset.c | 10 ++++------ kernel/events/core.c | 8 +++----- kernel/sched/core.c | 6 ++---- kernel/sched/cpuacct.c | 6 ++---- 6 files changed, 33 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ccb16b47e293..fe3f7253aa90 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -120,10 +120,18 @@ static struct workqueue_struct *cgroup_destroy_wq; static struct workqueue_struct *cgroup_pidlist_destroy_wq; /* generate an array of cgroup subsystem pointers */ -#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, static struct cgroup_subsys *cgroup_subsys[] = { #include }; +#undef SUBSYS + +/* array of cgroup subsystem names */ +#define SUBSYS(_x) [_x ## _cgrp_id] = #_x, +static const char *cgroup_subsys_name[] = { +#include +}; +#undef SUBSYS /* * The dummy hierarchy, reserved for the subsystems that are otherwise @@ -1076,7 +1084,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) BUG_ON(!mutex_is_locked(&cgroup_mutex)); #ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_subsys_id); + mask = ~(1UL << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -4528,15 +4536,15 @@ int __init cgroup_init_early(void) list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); for_each_subsys(ss, i) { - BUG_ON(!ss->name); - BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); - BUG_ON(!ss->css_alloc); - BUG_ON(!ss->css_free); - if (ss->subsys_id != i) { - printk(KERN_ERR "cgroup: Subsys %s id == %d\n", - ss->name, ss->subsys_id); - BUG(); - } + WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->subsys_id, + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", + i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, + ss->subsys_id, ss->name); + 
WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, + "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); + + ss->subsys_id = i; + ss->name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss); @@ -5167,11 +5175,9 @@ static struct cftype debug_files[] = { { } /* terminate */ }; -struct cgroup_subsys debug_subsys = { - .name = "debug", +struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .subsys_id = debug_subsys_id, .base_cftypes = debug_files, }; #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 6c3154e477f6..98ea26a99076 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -52,7 +52,7 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) static inline struct freezer *task_freezer(struct task_struct *task) { - return css_freezer(task_css(task, freezer_subsys_id)); + return css_freezer(task_css(task, freezer_cgrp_id)); } static struct freezer *parent_freezer(struct freezer *freezer) @@ -84,8 +84,6 @@ static const char *freezer_state_strs(unsigned int state) return "THAWED"; }; -struct cgroup_subsys freezer_subsys; - static struct cgroup_subsys_state * freezer_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -473,13 +471,11 @@ static struct cftype files[] = { { } /* terminate */ }; -struct cgroup_subsys freezer_subsys = { - .name = "freezer", +struct cgroup_subsys freezer_cgrp_subsys = { .css_alloc = freezer_css_alloc, .css_online = freezer_css_online, .css_offline = freezer_css_offline, .css_free = freezer_css_free, - .subsys_id = freezer_subsys_id, .attach = freezer_attach, .fork = freezer_fork, .base_cftypes = files, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4410ac6a55f1..2d018c795fea 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -119,7 +119,7 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) /* Retrieve the cpuset for a task */ static inline 
struct cpuset *task_cs(struct task_struct *task) { - return css_cs(task_css(task, cpuset_subsys_id)); + return css_cs(task_css(task, cpuset_cgrp_id)); } static inline struct cpuset *parent_cs(struct cpuset *cs) @@ -1521,7 +1521,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, - cpuset_subsys_id); + cpuset_cgrp_id); struct cpuset *cs = css_cs(css); struct cpuset *oldcs = css_cs(oldcss); struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); @@ -2024,8 +2024,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) kfree(cs); } -struct cgroup_subsys cpuset_subsys = { - .name = "cpuset", +struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, @@ -2033,7 +2032,6 @@ struct cgroup_subsys cpuset_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .subsys_id = cpuset_subsys_id, .base_cftypes = files, .early_init = 1, }; @@ -2699,7 +2697,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) goto out_free; rcu_read_lock(); - css = task_css(tsk, cpuset_subsys_id); + css = task_css(tsk, cpuset_cgrp_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); rcu_read_unlock(); if (retval < 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index 56003c6edfd3..64903731d834 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -342,7 +342,7 @@ struct perf_cgroup { static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { - return container_of(task_css(task, perf_subsys_id), + return container_of(task_css(task, perf_event_cgrp_id), struct perf_cgroup, css); } @@ -595,7 +595,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, rcu_read_lock(); - css = css_from_dir(f.file->f_dentry, &perf_subsys); 
+ css = css_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -8055,9 +8055,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css, task_function_call(task, __perf_cgroup_move, task); } -struct cgroup_subsys perf_subsys = { - .name = "perf_event", - .subsys_id = perf_subsys_id, +struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .exit = perf_cgroup_exit, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..d4cfc5561830 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7176,7 +7176,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, + tg = container_of(task_css_check(tsk, cpu_cgrp_id, lockdep_is_held(&tsk->sighand->siglock)), struct task_group, css); tg = autogroup_task_group(tsk, tg); @@ -7957,8 +7957,7 @@ static struct cftype cpu_files[] = { { } /* terminate */ }; -struct cgroup_subsys cpu_cgroup_subsys = { - .name = "cpu", +struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, @@ -7966,7 +7965,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .subsys_id = cpu_cgroup_subsys_id, .base_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 622e0818f905..c143ee380e3a 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return css_ca(task_css(tsk, cpuacct_subsys_id)); + return css_ca(task_css(tsk, cpuacct_cgrp_id)); } static 
inline struct cpuacct *parent_ca(struct cpuacct *ca) @@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) rcu_read_unlock(); } -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", +struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, .base_cftypes = files, .early_init = 1, }; -- cgit v1.2.3 From aec25020f5d4b69aea5317551d1cb7043f6b04fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:36:58 -0500 Subject: cgroup: rename cgroup_subsys->subsys_id to ->id It's no longer referenced outside cgroup core, so renaming is easy. Let's rename it for consistency & brevity. This patch is pure rename. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fe3f7253aa90..5a77ca0784a6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -198,7 +198,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (ss) - return rcu_dereference_check(cgrp->subsys[ss->subsys_id], + return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else return &cgrp->dummy_css; @@ -3982,7 +3982,7 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); + rcu_assign_pointer(css->cgroup->subsys[css->ss->id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4014,7 +4014,7 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; css->cgroup->nr_css++; - rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); + rcu_assign_pointer(css->cgroup->subsys[ss->id], css); } return ret; } @@ -4034,7 +4034,7 @@ static void offline_css(struct 
cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; - RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); } /** @@ -4065,7 +4065,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) init_css(css, ss, cgrp); - err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); + err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) goto err_free; @@ -4292,7 +4292,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) */ static void kill_css(struct cgroup_subsys_state *css) { - cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); + cgroup_clear_dir(css->cgroup, 1 << css->ss->id); /* * Killing would put the base ref, but we need to keep it alive @@ -4496,7 +4496,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's top cgroup. */ - init_css_set.subsys[ss->subsys_id] = css; + init_css_set.subsys[ss->id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4536,14 +4536,14 @@ int __init cgroup_init_early(void) list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); for_each_subsys(ss, i) { - WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->subsys_id, + WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, - ss->subsys_id, ss->name); + ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); - ss->subsys_id = i; + ss->id = i; ss->name = cgroup_subsys_name[i]; if (ss->early_init) -- cgit v1.2.3 From 69e943b7d3c2dcca1087e03e556ac6cb0d4433b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:36:58 -0500 Subject: cgroup: update locking in cgroup_show_options() cgroup_show_options() 
grabs cgroup_root_mutex to protect the options changing while printing; however, holding root_mutex or not doesn't really make much difference for the function. subsys_mask can be atomically tested and most of the options aren't allowed to change anyway once mounted. The only field which needs synchronization is ->release_agent_path. This patch introduces a dedicated spinlock to synchronize accesses to the field and drops cgroup_root_mutex locking from cgroup_show_options(). The next patch will remove cgroup_root_mutex. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5a77ca0784a6..b15058602120 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -92,6 +92,12 @@ static DEFINE_MUTEX(cgroup_mutex); static DEFINE_MUTEX(cgroup_root_mutex); +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 
+ */ +static DEFINE_SPINLOCK(release_agent_path_lock); + #define cgroup_assert_mutex_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ lockdep_is_held(&cgroup_mutex), \ @@ -1034,7 +1040,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) struct cgroup_subsys *ss; int ssid; - mutex_lock(&cgroup_root_mutex); for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); @@ -1044,13 +1049,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + spin_unlock(&release_agent_path_lock); + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); - mutex_unlock(&cgroup_root_mutex); return 0; } @@ -1272,8 +1280,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - if (opts.release_agent) + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } out_unlock: kfree(opts.release_agent); kfree(opts.name); @@ -2183,7 +2194,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css, if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; mutex_lock(&cgroup_root_mutex); + spin_lock(&release_agent_path_lock); strcpy(css->cgroup->root->release_agent_path, buffer); + spin_unlock(&release_agent_path_lock); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; -- cgit v1.2.3 From 3417ae1f5f59bbf36c3defbbf2a76c5ca498db2a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:37:01 -0500 Subject: cgroup: remove cgroup_root_mutex cgroup_root_mutex 
was added to avoid deadlock involving namespace_sem via cgroup_show_options(). It added a lot of overhead for the small purpose of it and, because it's nested under cgroup_mutex, it has very limited usefulness. The previous patch made cgroup_show_options() not use cgroup_root_mutex, so nobody needs it anymore. Remove it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 42 +----------------------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b15058602120..0e7829078049 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -70,18 +70,6 @@ /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. - * - * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify - * cgroupfs_root of any cgroup hierarchy - subsys list, flags, - * release_agent_path and so on. Modifying requires both cgroup_mutex and - * cgroup_root_mutex. Readers can acquire either of the two. This is to - * break the following locking order cycle. - * - * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem - * B. namespace_sem -> cgroup_mutex - * - * B happens only through cgroup_show_options() and using cgroup_root_mutex - * breaks it. */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); @@ -90,8 +78,6 @@ EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ static DEFINE_MUTEX(cgroup_mutex); #endif -static DEFINE_MUTEX(cgroup_root_mutex); - /* * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 
@@ -103,14 +89,6 @@ static DEFINE_SPINLOCK(release_agent_path_lock); lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); -#ifdef CONFIG_LOCKDEP -#define cgroup_assert_mutex_or_root_locked() \ - WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \ - !lockdep_is_held(&cgroup_root_mutex))) -#else -#define cgroup_assert_mutex_or_root_locked() do { } while (0) -#endif - /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup @@ -154,11 +132,7 @@ static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; static LIST_HEAD(cgroup_roots); static int cgroup_root_count; -/* - * Hierarchy ID allocation and mapping. It follows the same exclusion - * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for - * writes, either for reads. - */ +/* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); static struct cgroup_name root_cgroup_name = { .name = "/" }; @@ -973,7 +947,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); - BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); /* Check that any added subsystems are currently free */ for_each_subsys(ss, i) @@ -1246,7 +1219,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1288,7 +1260,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) out_unlock: kfree(opts.release_agent); kfree(opts.name); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; @@ -1331,7 +1302,6 @@ static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) int id; 
lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&cgroup_root_mutex); id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, GFP_KERNEL); @@ -1345,7 +1315,6 @@ static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) static void cgroup_exit_root_id(struct cgroupfs_root *root) { lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&cgroup_root_mutex); if (root->hierarchy_id) { idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); @@ -1524,7 +1493,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); @@ -1597,7 +1565,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); } else { @@ -1628,7 +1595,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, revert_creds(cred); unlock_drop: cgroup_exit_root_id(root); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); drop_new_super: @@ -1653,7 +1619,6 @@ static void cgroup_kill_sb(struct super_block *sb) mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* Rebind all subsystems back to the default hierarchy */ if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { @@ -1682,7 +1647,6 @@ static void cgroup_kill_sb(struct super_block *sb) cgroup_exit_root_id(root); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); @@ -2193,11 +2157,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css, return -EINVAL; if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; - mutex_lock(&cgroup_root_mutex); 
spin_lock(&release_agent_path_lock); strcpy(css->cgroup->root->release_agent_path, buffer); spin_unlock(&release_agent_path_lock); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; } @@ -4588,7 +4550,6 @@ int __init cgroup_init(void) /* allocate id for the dummy hierarchy */ mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* Add init_css_set to the hash table */ key = css_set_hash(init_css_set.subsys); @@ -4600,7 +4561,6 @@ int __init cgroup_init(void) 0, 1, GFP_KERNEL); BUG_ON(err < 0); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); -- cgit v1.2.3 From 6a02ad66b2c44155d529f430d4fa5c6c66321077 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2014 18:11:08 +0100 Subject: perf/x86: Push the duration-logging printk() to IRQ context Calling printk() from NMI context is bad (TM), so move it to IRQ context. This also avoids the problem where the printk() time is measured by the generic NMI duration goo and triggers a second warning. 
Signed-off-by: Peter Zijlstra Cc: Don Zickus Cc: Dave Hansen Link: http://lkml.kernel.org/n/tip-75dv35xf6dhhmeb7nq6fua31@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 56003c6edfd3..2067cbb378eb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -231,11 +231,29 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); -void perf_sample_event_took(u64 sample_len_ns) +static void perf_duration_warn(struct irq_work *w) { + u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); u64 avg_local_sample_len; u64 local_samples_len; + + local_samples_len = __get_cpu_var(running_sample_length); + avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; + + printk_ratelimited(KERN_WARNING + "perf interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, allowed_ns, + sysctl_perf_event_sample_rate); +} + +static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); + +void perf_sample_event_took(u64 sample_len_ns) +{ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); + u64 avg_local_sample_len; + u64 local_samples_len; if (allowed_ns == 0) return; @@ -263,13 +281,9 @@ void perf_sample_event_took(u64 sample_len_ns) sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; - printk_ratelimited(KERN_WARNING - "perf samples too long (%lld > %lld), lowering " - "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns, - sysctl_perf_event_sample_rate); - update_perf_cpu_limits(); + + irq_work_queue(&perf_duration_work); } static atomic64_t perf_event_id; -- cgit v1.2.3 From 390f3258cb2d031f1c17aa32e771ebd336e89073 Mon 
Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 28 Jan 2014 11:26:14 +0400 Subject: sched/deadline: Skip in switched_to_dl() if task is current When p is current and it's not of dl class, then there are no other dl tasks in the rq. If we had had pushable tasks in some other rq, they would have been pushed earlier. So, skip "p == rq->curr" case. Signed-off-by: Kirill Tkhai Acked-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140128072421.32315.25300.stgit@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0dd5e0971a07..b5700bceee55 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1560,7 +1560,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (unlikely(p->dl.dl_throttled)) return; - if (p->on_rq || rq->curr != p) { + if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ -- cgit v1.2.3 From 6b6350f155afdfdf888e18c7bf26950a6d10b0c2 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 27 Jan 2014 17:15:38 -0500 Subject: sched: Expose some macros related to priority Some macros in kernel/sched/sched.h about priority are private to kernel/sched. But they are useful to other parts of the core kernel. This patch moves these macros from kernel/sched/sched.h to include/linux/sched/prio.h so that they are available to other subsystems.
Signed-off-by: Dongsheng Yang Cc: raistlin@linux.it Cc: juri.lelli@gmail.com Cc: clark.williams@gmail.com Cc: rostedt@goodmis.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/2b022810905b52d13238466807f4b2a691577180.1390859827.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2119fd20f8b..b44720d38ae9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -23,24 +23,6 @@ extern atomic_long_t calc_load_tasks; extern long calc_load_fold_active(struct rq *this_rq); extern void update_cpu_load_active(struct rq *this_rq); -/* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - /* * Helpers for converting nanosecond timing to jiffy resolution */ -- cgit v1.2.3 From 849401b66d305f3feb75b6db7459b95ad190552a Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Sun, 9 Feb 2014 11:32:22 +0530 Subject: tick: Fixup more fallout from hrtimer broadcast mode The hrtimer mode of broadcast is supported only when GENERIC_CLOCKEVENTS_BROADCAST and TICK_ONESHOT config options are enabled. Hence compile in the functions for hrtimer mode of broadcast only when these options are selected. Also fix max_delta_ticks value for the pseudo clock device. 
Reported-by: Fengguang Wu Reported-by: Ingo Molnar Signed-off-by: Preeti U Murthy Link: http://lkml.kernel.org/r/52F719EE.9010304@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/Makefile | 5 ++++- kernel/time/tick-broadcast-hrtimer.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 06151ef4a744..57a413fd0ebf 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -3,7 +3,10 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o tick-broadcast-hrtimer.o +ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) + obj-y += tick-broadcast.o + obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o +endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 92425279312b..eb682d5c697c 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -82,7 +82,7 @@ static struct clock_event_device ce_broadcast_hrtimer = { .min_delta_ns = 1, .max_delta_ns = KTIME_MAX, .min_delta_ticks = 1, - .max_delta_ticks = KTIME_MAX, + .max_delta_ticks = ULONG_MAX, .mult = 1, .shift = 0, .cpumask = cpu_all_mask, -- cgit v1.2.3 From 0668d3065128d39449c097e62dbdb5707820137d Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 2 Jan 2014 16:37:32 -0800 Subject: genirq: Add devm_request_any_context_irq() Some drivers use request_any_context_irq() but there isn't a devm_* function for it. Add one so that these drivers don't need to explicitly free the irq on driver detach. 
Signed-off-by: Stephen Boyd Cc: linux-arm-kernel@lists.infradead.org Cc: Dmitry Torokhov Link: http://lkml.kernel.org/r/1388709460-19222-3-git-send-email-sboyd@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/irq/devres.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index bd8e788d71e0..1ef0606797c9 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -72,6 +72,51 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, } EXPORT_SYMBOL(devm_request_threaded_irq); +/** + * devm_request_any_context_irq - allocate an interrupt line for a managed device + * @dev: device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @thread_fn: function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as + * request_any_context_irq(). IRQs requested with this function will be + * automatically freed on driver detach. + * + * If an IRQ allocated with this function needs to be freed + * separately, devm_free_irq() must be used. 
+ */ +int devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + struct irq_devres *dr; + int rc; + + dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), + GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id); + if (rc) { + devres_free(dr); + return rc; + } + + dr->irq = irq; + dr->dev_id = dev_id; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL(devm_request_any_context_irq); + /** * devm_free_irq - free an interrupt * @dev: device to free interrupt for -- cgit v1.2.3 From d0ea026808ad81de2af14938448419a95211b938 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 27 Jan 2014 22:00:45 -0500 Subject: sched: Implement task_nice() as static inline function As patch "sched: Move the priority specific bits into a new header file" exposes the priority related macros in linux/sched/prio.h, we don't have to implement task_nice() in kernel/sched/core.c any more. This patch implements it in linux/sched/sched.h as static inline function, saving the kernel stack and enhancing performance a bit. 
Signed-off-by: Dongsheng Yang Cc: clark.williams@gmail.com Cc: rostedt@goodmis.org Cc: raistlin@linux.it Cc: juri.lelli@gmail.com Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1390878045-7096-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 26 +++++++------------------- kernel/sched/cputime.c | 4 ++-- 2 files changed, 9 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 210a12acf2cd..104c8164e04f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3000,7 +3000,7 @@ void set_user_nice(struct task_struct *p, long nice) unsigned long flags; struct rq *rq; - if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + if (task_nice(p) == nice || nice < -20 || nice > 19) return; /* * We have to be careful, if called from sys_setpriority(), @@ -3078,7 +3078,7 @@ SYSCALL_DEFINE1(nice, int, increment) if (increment > 40) increment = 40; - nice = TASK_NICE(current) + increment; + nice = task_nice(current) + increment; if (nice < -20) nice = -20; if (nice > 19) @@ -3110,18 +3110,6 @@ int task_prio(const struct task_struct *p) return p->prio - MAX_RT_PRIO; } -/** - * task_nice - return the nice value of a given task. - * @p: the task in question. - * - * Return: The nice value [ -20 ... 0 ... 19 ]. - */ -int task_nice(const struct task_struct *p) -{ - return TASK_NICE(p); -} -EXPORT_SYMBOL(task_nice); - /** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. @@ -3321,7 +3309,7 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (fair_policy(policy)) { - if (attr->sched_nice < TASK_NICE(p) && + if (attr->sched_nice < task_nice(p) && !can_nice(p, attr->sched_nice)) return -EPERM; } @@ -3345,7 +3333,7 @@ recheck: * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 
*/ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { - if (!can_nice(p, TASK_NICE(p))) + if (!can_nice(p, task_nice(p))) return -EPERM; } @@ -3385,7 +3373,7 @@ recheck: * If not changing anything there's no need to proceed further: */ if (unlikely(policy == p->policy)) { - if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) + if (fair_policy(policy) && attr->sched_nice != task_nice(p)) goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; @@ -3837,7 +3825,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, else if (task_has_rt_policy(p)) attr.sched_priority = p->rt_priority; else - attr.sched_nice = TASK_NICE(p); + attr.sched_nice = task_nice(p); rcu_read_unlock(); @@ -7010,7 +6998,7 @@ void normalize_rt_tasks(void) * Renice negative nice level userspace * tasks back to 0: */ - if (TASK_NICE(p) < 0 && p->mm) + if (task_nice(p) < 0 && p->mm) set_user_nice(p, 0); continue; } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 99947919e30b..58624a65f124 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; /* Add user time to cpustat. */ task_group_account_field(p, index, (__force u64) cputime); @@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime += cputime; /* Add guest time to cpustat. 
*/ - if (TASK_NICE(p) > 0) { + if (task_nice(p) > 0) { cpustat[CPUTIME_NICE] += (__force u64) cputime; cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; } else { -- cgit v1.2.3 From fb9edbe98493fcd9df66de926ae9157cbe0e4dcd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jan 2014 19:20:06 +0100 Subject: lockdep: Make held_lock->check and "int check" argument bool The "int check" argument of lock_acquire() and held_lock->check are misleading. This is actually a boolean: 2 means "true", everything else is "false". And there is no need to pass 1 or 0 to lock_acquire() depending on CONFIG_PROVE_LOCKING, __lock_acquire() checks prove_locking at the start and clears "check" if !CONFIG_PROVE_LOCKING. Note: probably we can simply kill this member/arg. The only explicit user of check => 0 is rcu_lock_acquire(), perhaps we can change it to use lock_acquire(trylock =>, read => 2). __lockdep_no_validate means check => 0 implicitly, but we can change validate_chain() to check hlock->instance->key instead. Not to mention it would be nice to get rid of lockdep_set_novalidate_class(). 
Signed-off-by: Oleg Nesterov Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul McKenney Cc: Steven Rostedt Cc: Alan Stern Cc: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140120182006.GA26495@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index eb8a54783fa0..8c85a0da5a38 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2098,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * (If lookup_chain_cache() returns with 1 it acquires * graph_lock for us) */ - if (!hlock->trylock && (hlock->check == 2) && + if (!hlock->trylock && hlock->check && lookup_chain_cache(curr, hlock, chain_key)) { /* * Check whether last held lock: @@ -3055,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int class_idx; u64 chain_key; - if (!prove_locking) - check = 1; - if (unlikely(!debug_locks)) return 0; @@ -3069,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (lock->key == &__lockdep_no_validate__) - check = 1; + if (!prove_locking || lock->key == &__lockdep_no_validate__) + check = 0; if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -3138,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->holdtime_stamp = lockstat_clock(); #endif - if (check == 2 && !mark_irqflags(curr, hlock)) + if (check && !mark_irqflags(curr, hlock)) return 0; /* mark it as used: */ -- cgit v1.2.3 From 1b5ff816cab708ba44c7d7b56b613516269eb577 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jan 2014 19:20:10 +0100 Subject: lockdep: Don't create the wrong dependency on hlock->check == 0 Test-case: DEFINE_MUTEX(m1); 
DEFINE_MUTEX(m2); DEFINE_MUTEX(mx); void lockdep_should_complain(void) { lockdep_set_novalidate_class(&mx); // m1 -> mx -> m2 mutex_lock(&m1); mutex_lock(&mx); mutex_lock(&m2); mutex_unlock(&m2); mutex_unlock(&mx); mutex_unlock(&m1); // m2 -> m1 ; should trigger the warning mutex_lock(&m2); mutex_lock(&m1); mutex_unlock(&m1); mutex_unlock(&m2); } this doesn't trigger any warning, lockdep can't detect the trivial deadlock. This is because lock(&mx) correctly avoids m1 -> mx dependency, it skips validate_chain() due to mx->check == 0. But lock(&m2) wrongly adds mx -> m2 and thus m1 -> m2 is not created. rcu_lock_acquire()->lock_acquire(check => 0) is fine due to read == 2, so currently only __lockdep_no_validate__ can trigger this problem. Signed-off-by: Oleg Nesterov Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul McKenney Cc: Steven Rostedt Cc: Alan Stern Cc: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140120182010.GA26498@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8c85a0da5a38..f7eba92cb97c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1936,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; - hlock = curr->held_locks + depth-1; + hlock = curr->held_locks + depth - 1; /* * Only non-recursive-read entries get new dependencies * added: */ - if (hlock->read != 2) { + if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, distance, trylock_loop)) return 0; -- cgit v1.2.3 From 34d0ed5ea7a72d5961552fb1758a94f0d3f8f3dc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jan 2014 19:20:13 +0100 Subject: lockdep: Change mark_held_locks() to check hlock->check instead of lockdep_no_validate The 
__lockdep_no_validate check in mark_held_locks() adds the subtle and (afaics) unnecessary difference between no-validate and check==0. And this looks even more inconsistent because __lock_acquire() skips mark_irqflags()->mark_lock() if !check. Change mark_held_locks() to check hlock->check instead. Signed-off-by: Oleg Nesterov Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul McKenney Cc: Steven Rostedt Cc: Alan Stern Cc: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140120182013.GA26505@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f7eba92cb97c..bf0c6b0dd9c5 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2517,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) BUG_ON(usage_bit >= LOCK_USAGE_STATES); - if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) + if (!hlock->check) continue; if (!mark_lock(curr, hlock, usage_bit)) -- cgit v1.2.3 From b4f2ab43615e5b36c48fffa99f26aca381839ac6 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 17 Jan 2014 10:04:01 +0100 Subject: sched: Remove 'cpu' parameter from idle_balance() The cpu parameter passed to idle_balance() is not needed as it could be retrieved from 'struct rq.' 
Signed-off-by: Daniel Lezcano Cc: alex.shi@linaro.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389949444-14821-1-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 3 ++- kernel/sched/sched.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 104c8164e04f..74dd565c2e1b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2705,7 +2705,7 @@ need_resched: pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) - idle_balance(cpu, rq); + idle_balance(rq); put_prev_task(rq, prev); next = pick_next_task(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4caa8030824d..428bc9d2c383 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6531,12 +6531,13 @@ out: * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -void idle_balance(int this_cpu, struct rq *this_rq) +void idle_balance(struct rq *this_rq) { struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; u64 curr_cost = 0; + int this_cpu = this_rq->cpu; this_rq->idle_stamp = rq_clock(this_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b44720d38ae9..82c0e02f2a58 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1158,7 +1158,7 @@ extern const struct sched_class idle_sched_class; extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void idle_balance(int this_cpu, struct rq *this_rq); +extern void idle_balance(struct rq *this_rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -- cgit v1.2.3 From e5fc66119ec97054eefc83f173a7ee9e133c3c3a Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 17 Jan 2014 10:04:02 +0100 Subject: sched: Fix race in idle_balance() 
The scheduler main function 'schedule()' checks if there are no more tasks on the runqueue. Then it checks if a task should be pulled in the current runqueue in idle_balance() assuming it will go to idle otherwise. But idle_balance() releases the rq->lock in order to look up the sched domains and takes the lock again right after. That opens a window where another cpu may put a task in our runqueue, so we won't go to idle but we have filled the idle_stamp, thinking we will. This patch closes the window by checking if the runqueue has been modified but without pulling a task after taking the lock again, so we won't go to idle right after in the __schedule() function. Signed-off-by: Daniel Lezcano Cc: alex.shi@linaro.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389949444-14821-2-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 428bc9d2c383..5ebc6817c036 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6589,6 +6589,13 @@ void idle_balance(struct rq *this_rq) raw_spin_lock(&this_rq->lock); + /* + * While browsing the domains, we released the rq lock. + * A task could have be enqueued in the meantime + */ + if (this_rq->nr_running && !pulled_task) + return; + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* * We are going idle. next_balance may be set based on -- cgit v1.2.3 From 3c4017c13f91069194fce3160944efec50f15a6e Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 17 Jan 2014 10:04:03 +0100 Subject: sched: Move rq->idle_stamp up to the core idle_balance() modifies the rq->idle_stamp field, making this information shared across core.c and fair.c. As we know if the cpu is going to idle or not with the previous patch, let's encapsulate the rq->idle_stamp information in core.c by moving it up to the caller. 
The idle_balance() function returns true in case a balancing occurred and the cpu won't be idle, false if no balance happened and the cpu is going idle. Signed-off-by: Daniel Lezcano Cc: alex.shi@linaro.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389949444-14821-3-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 +++++++++-- kernel/sched/fair.c | 14 ++++++-------- kernel/sched/sched.h | 2 +- 3 files changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 74dd565c2e1b..417cf657a606 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2704,8 +2704,15 @@ need_resched: pre_schedule(rq, prev); - if (unlikely(!rq->nr_running)) - idle_balance(rq); + if (unlikely(!rq->nr_running)) { + /* + * We must set idle_stamp _before_ calling idle_balance(), such + * that we measure the duration of idle_balance() as idle time. + */ + rq->idle_stamp = rq_clock(rq); + if (idle_balance(rq)) + rq->idle_stamp = 0; + } put_prev_task(rq, prev); next = pick_next_task(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5ebc6817c036..04fea7744a9f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6531,7 +6531,7 @@ out: * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -void idle_balance(struct rq *this_rq) +int idle_balance(struct rq *this_rq) { struct sched_domain *sd; int pulled_task = 0; @@ -6539,10 +6539,8 @@ void idle_balance(struct rq *this_rq) u64 curr_cost = 0; int this_cpu = this_rq->cpu; - this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) - return; + return 0; /* * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6580,10 +6578,8 @@ void idle_balance(struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; + if (pulled_task) break; - } } rcu_read_unlock(); @@ -6594,7 +6590,7 @@ void idle_balance(struct rq *this_rq) * A task could have be enqueued in the meantime */ if (this_rq->nr_running && !pulled_task) - return; + return 1; if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* @@ -6606,6 +6602,8 @@ void idle_balance(struct rq *this_rq) if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; + + return pulled_task; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 82c0e02f2a58..bb89991ee409 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1158,7 +1158,7 @@ extern const struct sched_class idle_sched_class; extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void idle_balance(struct rq *this_rq); +extern int idle_balance(struct rq *this_rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -- cgit v1.2.3 From fed14d45f945042a15b09de48d7d3d58d9455fc4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 11 Feb 2012 06:05:00 +0100 Subject: sched/fair: Track cgroup depth Track depth in cgroup tree, this is useful for things like find_matching_se() where you need to get to a common parent of two sched entities. Keeping the depth avoids having to calculate it on the spot, which saves a number of possible cache-misses. 
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04fea7744a9f..748a7ac3388f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ -static inline int +static inline struct cfs_rq * is_same_group(struct sched_entity *se, struct sched_entity *pse) { if (se->cfs_rq == pse->cfs_rq) - return 1; + return se->cfs_rq; - return 0; + return NULL; } static inline struct sched_entity *parent_entity(struct sched_entity *se) @@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - static void find_matching_se(struct sched_entity **se, struct sched_entity **pse) { @@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) */ /* First walk up until both entities are at same depth */ - se_depth = depth_se(*se); - pse_depth = depth_se(*pse); + se_depth = (*se)->depth; + pse_depth = (*pse)->depth; while (se_depth > pse_depth) { se_depth--; @@ -426,10 +415,10 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int +static inline struct cfs_rq * is_same_group(struct sched_entity *se, struct sched_entity *pse) { - return 1; + return cfs_rq_of(se); /* always the same rq */ } 
static inline struct sched_entity *parent_entity(struct sched_entity *se) @@ -7262,7 +7251,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; + /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -7288,23 +7279,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) + if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) on_rq = 1; if (!on_rq) - p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; if (!on_rq) { - cfs_rq = cfs_rq_of(&p->se); - p->se.vruntime += cfs_rq->min_vruntime; + cfs_rq = cfs_rq_of(se); + se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP /* * migrate_task_rq_fair() will have removed our previous * contribution, but we must synchronize for ongoing future * decay. 
*/ - p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; #endif } } @@ -7400,10 +7392,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, if (!se) return; - if (!parent) + if (!parent) { se->cfs_rq = &rq->cfs; - else + se->depth = 0; + } else { se->cfs_rq = parent->my_q; + se->depth = parent->depth + 1; + } se->my_q = cfs_rq; /* guarantee group entities always have weight */ -- cgit v1.2.3 From 606dba2e289446600a0b68422ed2019af5355c12 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 11 Feb 2012 06:05:00 +0100 Subject: sched: Push put_prev_task() into pick_next_task() In order to avoid having to do put/set on a whole cgroup hierarchy when we context switch, push the put into pick_next_task() so that both operations are in the same function. Further changes then allow us to possibly optimize away redundant work. 
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 21 ++++++++------------- kernel/sched/deadline.c | 5 ++++- kernel/sched/fair.c | 6 +++++- kernel/sched/idle_task.c | 6 +++++- kernel/sched/rt.c | 27 ++++++++++++++++----------- kernel/sched/sched.h | 8 +++++++- kernel/sched/stop_task.c | 16 ++++++++++------ 7 files changed, 55 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 417cf657a606..dedb5f07666e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2579,18 +2579,11 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); } -static void put_prev_task(struct rq *rq, struct task_struct *prev) -{ - if (prev->on_rq || rq->skip_clock_update < 0) - update_rq_clock(rq); - prev->sched_class->put_prev_task(rq, prev); -} - /* * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq) +pick_next_task(struct rq *rq, struct task_struct *prev) { const struct sched_class *class; struct task_struct *p; @@ -2600,13 +2593,13 @@ pick_next_task(struct rq *rq) * the fair class we can call that function directly: */ if (likely(rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq); + p = fair_sched_class.pick_next_task(rq, prev); if (likely(p)) return p; } for_each_class(class) { - p = class->pick_next_task(rq); + p = class->pick_next_task(rq, prev); if (p) return p; } @@ -2714,8 +2707,10 @@ need_resched: rq->idle_stamp = 0; } - put_prev_task(rq, prev); - next = pick_next_task(rq); + if (prev->on_rq || rq->skip_clock_update < 0) + update_rq_clock(rq); + + next = pick_next_task(rq, prev); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->skip_clock_update = 0; @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; - next = 
pick_next_task(rq); + next = pick_next_task(rq, NULL); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b5700bceee55..50797d576080 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -990,7 +990,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct *pick_next_task_dl(struct rq *rq) +struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -1001,6 +1001,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq) if (unlikely(!dl_rq->dl_nr_running)) return NULL; + if (prev) + prev->sched_class->put_prev_task(rq, prev); + dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 748a7ac3388f..c4bb0ac26a7c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4655,7 +4655,8 @@ preempt: set_last_buddy(se); } -static struct task_struct *pick_next_task_fair(struct rq *rq) +static struct task_struct * +pick_next_task_fair(struct rq *rq, struct task_struct *prev) { struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; @@ -4664,6 +4665,9 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) if (!cfs_rq->nr_running) return NULL; + if (prev) + prev->sched_class->put_prev_task(rq, prev); + do { se = pick_next_entity(cfs_rq); set_next_entity(cfs_rq, se); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 516c3d9ceea1..e5c922ac40ce 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -33,8 +33,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_task(rq->idle); } -static struct task_struct *pick_next_task_idle(struct rq *rq) +static struct task_struct * +pick_next_task_idle(struct rq *rq, struct task_struct *prev) { + if (prev) + 
prev->sched_class->put_prev_task(rq, prev); + schedstat_inc(rq, sched_goidle); #ifdef CONFIG_SMP /* Trigger the post schedule to do an idle_enter for CFS */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a2740b775b45..a15ca1c0c7bf 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1310,15 +1310,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; - struct rt_rq *rt_rq; - - rt_rq = &rq->rt; - - if (!rt_rq->rt_nr_running) - return NULL; - - if (rt_rq_throttled(rt_rq)) - return NULL; + struct rt_rq *rt_rq = &rq->rt; do { rt_se = pick_next_rt_entity(rq, rt_rq); @@ -1332,9 +1324,22 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return p; } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct * +pick_next_task_rt(struct rq *rq, struct task_struct *prev) { - struct task_struct *p = _pick_next_task_rt(rq); + struct task_struct *p; + struct rt_rq *rt_rq = &rq->rt; + + if (!rt_rq->rt_nr_running) + return NULL; + + if (rt_rq_throttled(rt_rq)) + return NULL; + + if (prev) + prev->sched_class->put_prev_task(rq, prev); + + p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ if (p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bb89991ee409..c534cf4181ab 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1105,7 +1105,13 @@ struct sched_class { void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); - struct task_struct * (*pick_next_task) (struct rq *rq); + /* + * It is the responsibility of the pick_next_task() method that will + * return the next task to call put_prev_task() on the @prev task or + * something equivalent. 
+ */ + struct task_struct * (*pick_next_task) (struct rq *rq, + struct task_struct *prev); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index fdb6bb0b3356..a4147c9d2017 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) /* we're never preempted */ } -static struct task_struct *pick_next_task_stop(struct rq *rq) +static struct task_struct * +pick_next_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *stop = rq->stop; - if (stop && stop->on_rq) { - stop->se.exec_start = rq_clock_task(rq); - return stop; - } + if (!stop || !stop->on_rq) + return NULL; - return NULL; + if (prev) + prev->sched_class->put_prev_task(rq, prev); + + stop->se.exec_start = rq_clock_task(rq); + + return stop; } static void -- cgit v1.2.3 From f10447998a59b97747c16258a9c6e6a1512f27f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 11 Feb 2012 06:05:00 +0100 Subject: sched/fair: Clean up the __clear_buddies_*() functions Slightly easier code flow, no functional changes. 
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c4bb0ac26a7c..846172107ba5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2739,10 +2739,10 @@ static void __clear_buddies_last(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last == se) - cfs_rq->last = NULL; - else + if (cfs_rq->last != se) break; + + cfs_rq->last = NULL; } } @@ -2750,10 +2750,10 @@ static void __clear_buddies_next(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->next == se) - cfs_rq->next = NULL; - else + if (cfs_rq->next != se) break; + + cfs_rq->next = NULL; } } @@ -2761,10 +2761,10 @@ static void __clear_buddies_skip(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip == se) - cfs_rq->skip = NULL; - else + if (cfs_rq->skip != se) break; + + cfs_rq->skip = NULL; } } -- cgit v1.2.3 From 678d5718d8d099421b0dd54c01b0528f4aaf5919 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 11 Feb 2012 06:05:00 +0100 Subject: sched/fair: Optimize cgroup pick_next_task_fair() Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt path") it is likely we pick a new task from the same cgroup, doing a put and then set on all intermediate entities is a waste of time, so try to avoid this. 
Measured using: mount nodev /cgroup -t cgroup -o cpu cd /cgroup mkdir a; cd a mkdir b; cd b mkdir c; cd c echo $$ > tasks perf stat --repeat 10 -- taskset 1 perf bench sched pipe PRE : 4.542422684 seconds time elapsed ( +- 0.33% ) POST: 4.389409991 seconds time elapsed ( +- 0.32% ) Which shows a significant improvement of ~3.5% Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Tejun Heo Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 110 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 846172107ba5..a81b241ff70f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2907,17 +2907,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); * 3) pick the "last" process, for cache locality * 4) do not run the "skip" process, if something else is available */ -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *se = __pick_first_entity(cfs_rq); - struct sched_entity *left = se; + struct sched_entity *left = __pick_first_entity(cfs_rq); + struct sched_entity *se; + + /* + * If curr is set we have to see if its left of the leftmost entity + * still in the tree, provided there was anything in the tree at all. + */ + if (!left || (curr && entity_before(curr, left))) + left = curr; + + se = left; /* ideally we run the leftmost entity */ /* * Avoid running the skip buddy, if running something else can * be done without getting too unfair. 
*/ if (cfs_rq->skip == se) { - struct sched_entity *second = __pick_next_entity(se); + struct sched_entity *second; + + if (se == curr) { + second = __pick_first_entity(cfs_rq); + } else { + second = __pick_next_entity(se); + if (!second || (curr && entity_before(curr, second))) + second = curr; + } + if (second && wakeup_preempt_entity(second, left) < 1) se = second; } @@ -2939,7 +2958,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) return se; } -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { @@ -3594,22 +3613,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) } /* conditionally throttle active cfs_rq's from put_prev_entity() */ -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { if (!cfs_bandwidth_used()) - return; + return false; if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) - return; + return false; /* * it's possible for a throttled entity to be forced into a running * state (e.g. set_curr_task), in this case we're finished. 
*/ if (cfs_rq_throttled(cfs_rq)) - return; + return true; throttle_cfs_rq(cfs_rq); + return true; } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -3719,7 +3739,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) } static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -4658,9 +4678,86 @@ preempt: static struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev) { - struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; + struct task_struct *p; + +#ifdef CONFIG_FAIR_GROUP_SCHED + if (!cfs_rq->nr_running) + return NULL; + + if (!prev || prev->sched_class != &fair_sched_class) + goto simple; + + /* + * Because of the set_next_buddy() in dequeue_task_fair() it is rather + * likely that a next task is from the same cgroup as the current. + * + * Therefore attempt to avoid putting and setting the entire cgroup + * hierarchy, only change the part that actually changes. + */ + + do { + struct sched_entity *curr = cfs_rq->curr; + + /* + * Since we got here without doing put_prev_entity() we also + * have to consider cfs_rq->curr. If it is still a runnable + * entity, update_curr() will update its vruntime, otherwise + * forget we've ever seen it. + */ + if (curr && curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; + + /* + * This call to check_cfs_rq_runtime() will do the throttle and + * dequeue its entity in the parent(s). Therefore the 'simple' + * nr_running test will indeed be correct. 
+ */ + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto simple; + + se = pick_next_entity(cfs_rq, curr); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + p = task_of(se); + + /* + * Since we haven't yet done put_prev_entity and if the selected task + * is a different task than we started out with, try and touch the + * least amount of cfs_rqs. + */ + if (prev != p) { + struct sched_entity *pse = &prev->se; + + while (!(cfs_rq = is_same_group(se, pse))) { + int se_depth = se->depth; + int pse_depth = pse->depth; + + if (se_depth <= pse_depth) { + put_prev_entity(cfs_rq_of(pse), pse); + pse = parent_entity(pse); + } + if (se_depth >= pse_depth) { + set_next_entity(cfs_rq_of(se), se); + se = parent_entity(se); + } + } + + put_prev_entity(cfs_rq, pse); + set_next_entity(cfs_rq, se); + } + + if (hrtick_enabled(rq)) + hrtick_start_fair(rq, p); + + return p; +simple: + cfs_rq = &rq->cfs; +#endif if (!cfs_rq->nr_running) return NULL; @@ -4669,12 +4766,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) prev->sched_class->put_prev_task(rq, prev); do { - se = pick_next_entity(cfs_rq); + se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); p = task_of(se); + if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); -- cgit v1.2.3 From 6c3b4d44ba2838f00614a5a2d777d4401e0bfd71 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Jan 2014 15:09:39 +0100 Subject: sched: Clean up idle task SMP logic The idle post_schedule flag is just a vile waste of time, furthermore it appears unneeded, move the idle_enter_fair() call into pick_next_task_idle(). 
Signed-off-by: Peter Zijlstra Cc: Daniel Lezcano Cc: Vincent Guittot Cc: alex.shi@linaro.org Cc: mingo@kernel.org Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-aljykihtxJt3mkokxi0qZurb@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/idle_task.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index e5c922ac40ce..721371bf03bd 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -19,11 +19,6 @@ static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) idle_exit_fair(rq); rq_last_tick_reset(rq); } - -static void post_schedule_idle(struct rq *rq) -{ - idle_enter_fair(rq); -} #endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: @@ -41,8 +36,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev) schedstat_inc(rq, sched_goidle); #ifdef CONFIG_SMP - /* Trigger the post schedule to do an idle_enter for CFS */ - rq->post_schedule = 1; + idle_enter_fair(rq); #endif return rq->idle; } @@ -106,7 +100,6 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, .pre_schedule = pre_schedule_idle, - .post_schedule = post_schedule_idle, #endif .set_curr_task = set_curr_task_idle, -- cgit v1.2.3 From 327adaedf2218b0e318eb393aa79cf2be64c199f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Feb 2014 00:35:29 +0100 Subject: PM / QoS: Add no_constraint_value field to struct pm_qos_constraints Add a new field, no_constraint_value, to struct pm_qos_constraints (the structure representing a list of PM QoS constraint requests). It holds the value to be returned by pm_qos_get_value() when that list of requests is empty. That field will be equal to default_value for all of the existing global PM QoS classes and for the resume latency device PM QoS type, but it will be different from default_value for the new latency tolerance device PM QoS type introduced by the next changeset.
Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 8dff9b48075a..e23ae38e647f 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -66,6 +66,7 @@ static struct pm_qos_constraints cpu_dma_constraints = { .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, .notifiers = &cpu_dma_lat_notifier, }; @@ -79,6 +80,7 @@ static struct pm_qos_constraints network_lat_constraints = { .list = PLIST_HEAD_INIT(network_lat_constraints.list), .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, .notifiers = &network_lat_notifier, }; @@ -93,6 +95,7 @@ static struct pm_qos_constraints network_tput_constraints = { .list = PLIST_HEAD_INIT(network_tput_constraints.list), .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .type = PM_QOS_MAX, .notifiers = &network_throughput_notifier, }; @@ -128,7 +131,7 @@ static const struct file_operations pm_qos_power_fops = { static inline int pm_qos_get_value(struct pm_qos_constraints *c) { if (plist_head_empty(&c->list)) - return c->default_value; + return c->no_constraint_value; switch (c->type) { case PM_QOS_MIN: -- cgit v1.2.3 From 2d984ad132a87ca2112f81f21039493176a8bca0 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 11 Feb 2014 00:35:38 +0100 Subject: PM / QoS: Introduce latency tolerance device PM QoS type Add a new latency tolerance device PM QoS type to be used for specifying active state (RPM_ACTIVE) memory access (DMA) latency tolerance requirements for devices. It may be used to prevent hardware from choosing overly aggressive energy-saving operation modes (causing too much latency to appear) for the whole platform. This feature requires hardware support, so it will only be available for devices having a new .set_latency_tolerance() callback in struct dev_pm_info populated, in which case the routine pointed to by it should implement whatever is necessary to transfer the effective requirement value to the hardware. Whenever the effective latency tolerance changes for the device, its .set_latency_tolerance() callback will be executed and the effective value will be passed to it. If that value is negative, which means that the list of latency tolerance requirements for the device is empty, the callback is expected to switch the underlying hardware latency tolerance control mechanism to an autonomous mode if available. If that value is PM_QOS_LATENCY_ANY, in turn, and the hardware supports a special "no requirement" setting, the callback is expected to use it. That allows software to prevent the hardware from automatically updating the device's latency tolerance in response to its power state changes (e.g. during transitions from D3cold to D0), which generally may be done in the autonomous latency tolerance control mode. If .set_latency_tolerance() is present for the device, a new pm_qos_latency_tolerance_us attribute will be present in the device's power directory in sysfs. Then, user space can use that attribute to specify its latency tolerance requirement for the device, if any.
Writing "any" to it means "no requirement, but do not let the hardware control latency tolerance" and writing "auto" to it allows the hardware to be switched to the autonomous mode if there are no other requirements from the kernel side in the device's list. This changeset includes a fix from Mika Westerberg. Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index e23ae38e647f..884b77058864 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -173,6 +173,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, { unsigned long flags; int prev_value, curr_value, new_value; + int ret; spin_lock_irqsave(&pm_qos_lock, flags); prev_value = pm_qos_get_value(c); @@ -208,13 +209,15 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, trace_pm_qos_update_target(action, prev_value, curr_value); if (prev_value != curr_value) { - blocking_notifier_call_chain(c->notifiers, - (unsigned long)curr_value, - NULL); - return 1; + ret = 1; + if (c->notifiers) + blocking_notifier_call_chain(c->notifiers, + (unsigned long)curr_value, + NULL); } else { - return 0; + ret = 0; } + return ret; } /** -- cgit v1.2.3 From 38033c37faab850ed5d33bb675c4de6c66be84d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jan 2014 20:32:21 +0100 Subject: sched: Push down pre_schedule() and idle_balance() This patch both merges idle_balance() and pre_schedule() and pushes both of them into pick_next_task(). Conceptually pre_schedule() and idle_balance() are rather similar, both are used to pull more work onto the current CPU. We cannot however first move idle_balance() into pre_schedule_fair() since there is no guarantee the last runnable task is a fair task, and thus we would miss newidle balances.
Similarly, the dl and rt pre_schedule calls must be run before idle_balance() since their respective tasks have higher priority and it would not do to delay their execution searching for less important tasks first. However, by noticing that pick_next_task() already traverses the sched_class hierarchy in the right order, we can get the right behaviour and do away with both calls. We must however change the special case optimization to also require that prev is of fair_sched_class, otherwise we can miss doing a dl or rt pull where we needed one. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-a8k6vvaebtn64nie345kx1je@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 26 ++------------------------ kernel/sched/deadline.c | 15 +++++++-------- kernel/sched/fair.c | 26 ++++++++++++++++++++++---- kernel/sched/idle_task.c | 12 +++++------- kernel/sched/rt.c | 16 ++++++++-------- kernel/sched/sched.h | 1 - 6 files changed, 44 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dedb5f07666e..3068f37f7c5f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2169,13 +2169,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) #ifdef CONFIG_SMP -/* assumes rq->lock is held */ -static inline void pre_schedule(struct rq *rq, struct task_struct *prev) -{ - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -} - /* rq->lock is NOT held, but preemption is disabled */ static inline void post_schedule(struct rq *rq) { @@ -2193,10 +2186,6 @@ static inline void post_schedule(struct rq *rq) #else -static inline void pre_schedule(struct rq *rq, struct task_struct *p) -{ -} - static inline void post_schedule(struct rq *rq) { } @@ -2592,7 +2581,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev) * Optimization: we know that if all tasks are in * the
fair class we can call that function directly: */ - if (likely(rq->nr_running == rq->cfs.h_nr_running)) { + if (likely(prev->sched_class == &fair_sched_class && + rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq, prev); if (likely(p)) return p; @@ -2695,18 +2685,6 @@ need_resched: switch_count = &prev->nvcsw; } - pre_schedule(rq, prev); - - if (unlikely(!rq->nr_running)) { - /* - * We must set idle_stamp _before_ calling idle_balance(), such - * that we measure the duration of idle_balance() as idle time. - */ - rq->idle_stamp = rq_clock(rq); - if (idle_balance(rq)) - rq->idle_stamp = 0; - } - if (prev->on_rq || rq->skip_clock_update < 0) update_rq_clock(rq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 50797d576080..ed31ef66ab9d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -944,6 +944,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } +static int pull_dl_task(struct rq *this_rq); + #endif /* CONFIG_SMP */ /* @@ -998,6 +1000,11 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) dl_rq = &rq->dl; +#ifdef CONFIG_SMP + if (dl_task(prev)) + pull_dl_task(rq); +#endif + if (unlikely(!dl_rq->dl_nr_running)) return NULL; @@ -1429,13 +1436,6 @@ skip: return ret; } -static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull other tasks here */ - if (dl_task(prev)) - pull_dl_task(rq); -} - static void post_schedule_dl(struct rq *rq) { push_dl_tasks(rq); @@ -1628,7 +1628,6 @@ const struct sched_class dl_sched_class = { .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, - .pre_schedule = pre_schedule_dl, .post_schedule = post_schedule_dl, .task_woken = task_woken_dl, #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a81b241ff70f..43b49fe077ab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2577,7 +2577,8 @@ 
void idle_exit_fair(struct rq *this_rq) update_rq_runnable_avg(this_rq, 0); } -#else +#else /* CONFIG_SMP */ + static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} @@ -2589,7 +2590,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, int sleep) {} static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) {} -#endif +#endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -4682,9 +4683,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) struct sched_entity *se; struct task_struct *p; +again: __maybe_unused #ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) - return NULL; + goto idle; if (!prev || prev->sched_class != &fair_sched_class) goto simple; @@ -4760,7 +4762,7 @@ simple: #endif if (!cfs_rq->nr_running) - return NULL; + goto idle; if (prev) prev->sched_class->put_prev_task(rq, prev); @@ -4777,6 +4779,22 @@ simple: hrtick_start_fair(rq, p); return p; + +idle: +#ifdef CONFIG_SMP + idle_enter_fair(rq); + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. 
+ */ + rq->idle_stamp = rq_clock(rq); + if (idle_balance(rq)) { /* drops rq->lock */ + rq->idle_stamp = 0; + goto again; + } +#endif + + return NULL; } /* diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 721371bf03bd..f7d03af79a5b 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,13 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } - -static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) -{ - idle_exit_fair(rq); - rq_last_tick_reset(rq); -} #endif /* CONFIG_SMP */ + /* * Idle tasks are unconditionally rescheduled: */ @@ -56,6 +51,10 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + idle_exit_fair(rq); + rq_last_tick_reset(rq); +#endif } static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) @@ -99,7 +98,6 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, - .pre_schedule = pre_schedule_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a15ca1c0c7bf..72f9ec759972 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -229,6 +229,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP +static int pull_rt_task(struct rq *this_rq); + static inline int rt_overloaded(struct rq *rq) { return atomic_read(&rq->rd->rto_count); @@ -1330,6 +1332,12 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; +#ifdef CONFIG_SMP + /* Try to pull RT tasks here if we lower this rq's prio */ + if (rq->rt.highest_prio.curr > prev->prio) + pull_rt_task(rq); +#endif + if (!rt_rq->rt_nr_running) return NULL; @@ -1721,13 +1729,6 @@ skip: return ret; } -static void pre_schedule_rt(struct 
rq *rq, struct task_struct *prev) -{ - /* Try to pull RT tasks here if we lower this rq's prio */ - if (rq->rt.highest_prio.curr > prev->prio) - pull_rt_task(rq); -} - static void post_schedule_rt(struct rq *rq) { push_rt_tasks(rq); @@ -2004,7 +2005,6 @@ const struct sched_class rt_sched_class = { .set_cpus_allowed = set_cpus_allowed_rt, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, - .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c534cf4181ab..1bf34c257d3b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1118,7 +1118,6 @@ struct sched_class { int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int next_cpu); - void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); -- cgit v1.2.3 From 27f17580fd2c7514c8f5cce22ab903c6f3ddf458 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 29 Jan 2014 14:29:33 +0000 Subject: sched: Delete is_same_group() outside CONFIG_FAIR_GROUP_SCHED Since is_same_group() is only used in the group scheduling code, there is no need to define it outside CONFIG_FAIR_GROUP_SCHED. 
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391005773-29493-1-git-send-email-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 43b49fe077ab..235cfa7ad8fc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -415,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline struct cfs_rq * -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return cfs_rq_of(se); /* always the same rq */ -} - static inline struct sched_entity *parent_entity(struct sched_entity *se) { return NULL; -- cgit v1.2.3 From 37e6bae8395a94b4dd934c92b02b9408be992365 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 23 Jan 2014 18:39:54 +0800 Subject: sched: Add statistic for newidle load balance cost Tracking rq->max_idle_balance_cost and sd->max_newidle_lb_cost. It's useful to know these values in debug mode. 
Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/52E0F3BF.5020904@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++--- kernel/sched/debug.c | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3068f37f7c5f..fb9764fbc537 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4811,7 +4811,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(13); + struct ctl_table *table = sd_alloc_ctl_entry(14); if (table == NULL) return NULL; @@ -4839,9 +4839,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "name", sd->name, + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[12] is terminator */ + /* &table[13] is terminator */ return table; } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 31b908daaa1b..f3344c31632a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -321,6 +321,7 @@ do { \ P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); + P64(max_idle_balance_cost); #endif P(ttwu_count); -- cgit v1.2.3 From af8cd8ef726f335815233d03b8723e9c52041edd Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 29 Jan 2014 15:31:36 -0500 Subject: sched/idle: Move the cpuidle entry point to the generic idle loop In order to integrate cpuidle with the scheduler, we must have a better proximity in the core code with what cpuidle is doing and not delegate such interaction to arch code. 
Architectures implementing arch_cpu_idle() should simply enter a cheap idle mode in the absence of a proper cpuidle driver. In both cases, i.e. whether it is a cpuidle driver or the default arch_cpu_idle(), the calling convention expects IRQs to be disabled on entry and enabled on exit. There is a warning in place already but let's add a forced IRQ enable here as well. This will allow for removing the forced IRQ enable some implementations do locally and allow the warning to trigger. Signed-off-by: Nicolas Pitre Acked-by: Daniel Lezcano Cc: Benjamin Herrenschmidt Cc: Preeti U Murthy Cc: Paul Mundt Cc: "Rafael J. Wysocki" Cc: Olof Johansson Cc: Russell King Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.LFD.2.11.1401291526320.1652@knanqh.ubzr Signed-off-by: Ingo Molnar --- kernel/cpu/idle.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 277f494c2a9a..b7976a127178 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -3,6 +3,7 @@ */ #include #include +#include #include #include #include @@ -95,8 +96,10 @@ static void cpu_idle_loop(void) if (!current_clr_polling_and_test()) { stop_critical_timings(); rcu_idle_enter(); - arch_cpu_idle(); - WARN_ON_ONCE(irqs_disabled()); + if (cpuidle_idle_call()) + arch_cpu_idle(); + if (WARN_ON_ONCE(irqs_disabled())) + local_irq_enable(); rcu_idle_exit(); start_critical_timings(); } else { -- cgit v1.2.3 From cf37b6b48428d6be8f8762b3599d529c44644fb2 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Sun, 26 Jan 2014 23:42:01 -0500 Subject: sched/idle: Move cpu/idle.c to sched/idle.c Integration of cpuidle with the scheduler requires that the idle loop be closely integrated with the scheduler proper. Moving cpu/idle.c into the sched directory will allow for a smoother integration, and eliminate a subdirectory which contained only one source file.
Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.LFD.2.11.1401301102210.1652@knanqh.ubzr Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 - kernel/cpu/Makefile | 1 - kernel/sched/Makefile | 2 +- kernel/sched/idle.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 145 insertions(+), 3 deletions(-) delete mode 100644 kernel/cpu/Makefile create mode 100644 kernel/sched/idle.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index bc010ee272b6..6f1c7e5cfca1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -22,7 +22,6 @@ obj-y += sched/ obj-y += locking/ obj-y += power/ obj-y += printk/ -obj-y += cpu/ obj-y += irq/ obj-y += rcu/ diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile deleted file mode 100644 index 59ab052ef7a0..000000000000 --- a/kernel/cpu/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-y = idle.o diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a95c8c2af2a..ab32b7b0db5c 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -13,7 +13,7 @@ endif obj-y += core.o proc.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o +obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c new file mode 100644 index 000000000000..14ca43430aee --- /dev/null +++ b/kernel/sched/idle.c @@ -0,0 +1,144 @@ +/* + * Generic entry point for the idle threads + */ +#include +#include +#include +#include +#include +#include + +#include + +#include + +static int __read_mostly cpu_idle_force_poll; + +void cpu_idle_poll_ctrl(bool enable) +{ + if (enable) { + cpu_idle_force_poll++; + } else { + cpu_idle_force_poll--; + WARN_ON_ONCE(cpu_idle_force_poll < 0); + } +} + +#ifdef 
CONFIG_GENERIC_IDLE_POLL_SETUP +static int __init cpu_idle_poll_setup(char *__unused) +{ + cpu_idle_force_poll = 1; + return 1; +} +__setup("nohlt", cpu_idle_poll_setup); + +static int __init cpu_idle_nopoll_setup(char *__unused) +{ + cpu_idle_force_poll = 0; + return 1; +} +__setup("hlt", cpu_idle_nopoll_setup); +#endif + +static inline int cpu_idle_poll(void) +{ + rcu_idle_enter(); + trace_cpu_idle_rcuidle(0, smp_processor_id()); + local_irq_enable(); + while (!tif_need_resched()) + cpu_relax(); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); + return 1; +} + +/* Weak implementations for optional arch specific functions */ +void __weak arch_cpu_idle_prepare(void) { } +void __weak arch_cpu_idle_enter(void) { } +void __weak arch_cpu_idle_exit(void) { } +void __weak arch_cpu_idle_dead(void) { } +void __weak arch_cpu_idle(void) +{ + cpu_idle_force_poll = 1; + local_irq_enable(); +} + +/* + * Generic idle loop implementation + */ +static void cpu_idle_loop(void) +{ + while (1) { + tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(smp_processor_id())) + arch_cpu_idle_dead(); + + local_irq_disable(); + arch_cpu_idle_enter(); + + /* + * In poll mode we reenable interrupts and spin. 
+ * + * Also if we detected in the wakeup from idle + * path that the tick broadcast device expired + * for us, we don't want to go deep idle as we + * know that the IPI is going to arrive right + * away + */ + if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + cpu_idle_poll(); + } else { + if (!current_clr_polling_and_test()) { + stop_critical_timings(); + rcu_idle_enter(); + if (cpuidle_idle_call()) + arch_cpu_idle(); + if (WARN_ON_ONCE(irqs_disabled())) + local_irq_enable(); + rcu_idle_exit(); + start_critical_timings(); + } else { + local_irq_enable(); + } + __current_set_polling(); + } + arch_cpu_idle_exit(); + /* + * We need to test and propagate the TIF_NEED_RESCHED + * bit here because we might not have send the + * reschedule IPI to idle tasks. + */ + if (tif_need_resched()) + set_preempt_need_resched(); + } + tick_nohz_idle_exit(); + schedule_preempt_disabled(); + } +} + +void cpu_startup_entry(enum cpuhp_state state) +{ + /* + * This #ifdef needs to die, but it's too late in the cycle to + * make this generic (arm and sh have never invoked the canary + * init for the non boot cpus!). Will be fixed in 3.11 + */ +#ifdef CONFIG_X86 + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. The boot CPU already has it initialized but no harm + * in doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). 
+ */ + boot_init_stack_canary(); +#endif + __current_set_polling(); + arch_cpu_idle_prepare(); + cpu_idle_loop(); +} -- cgit v1.2.3 From 2c45aada341121438affc4cb8d5b4cfaa2813d3d Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 10 Feb 2014 13:39:53 -0500 Subject: genirq: Add missing irq_to_desc export for CONFIG_SPARSE_IRQ=n In allmodconfig builds for sparc and any other arch which does not set CONFIG_SPARSE_IRQ, the following will be seen at modpost: CC [M] lib/cpu-notifier-error-inject.o CC [M] lib/pm-notifier-error-inject.o ERROR: "irq_to_desc" [drivers/gpio/gpio-mcp23s08.ko] undefined! make[2]: *** [__modpost] Error 1 This happens because commit 3911ff30f5 ("genirq: export handle_edge_irq() and irq_to_desc()") added one export for it, but there were actually two instances of it, in an if/else clause for CONFIG_SPARSE_IRQ. Add the second one. Signed-off-by: Paul Gortmaker Cc: Jiri Kosina Cc: stable@vger.kernel.org # 3.4+ Link: http://lkml.kernel.org/r/1392057610-11514-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 192a302d6cfd..8ab8e9390297 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -274,6 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? 
irq_desc + irq : NULL; } +EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { -- cgit v1.2.3 From 0ab02ca8f887908152d1a96db5130fc661d36a1e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 11 Feb 2014 16:05:46 +0800 Subject: cgroup: protect modifications to cgroup_idr with cgroup_mutex Setup cgroupfs like this: # mount -t cgroup -o cpuacct xxx /cgroup # mkdir /cgroup/sub1 # mkdir /cgroup/sub2 Then run these two commands: # for ((; ;)) { mkdir /cgroup/sub1/tmp && rmdir /mnt/sub1/tmp; } & # for ((; ;)) { mkdir /cgroup/sub2/tmp && rmdir /mnt/sub2/tmp; } & After seconds you may see this warning: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 25243 at lib/idr.c:527 sub_remove+0x87/0x1b0() idr_remove called for id=6 which is not allocated. ... Call Trace: [] dump_stack+0x7a/0x96 [] warn_slowpath_common+0x8c/0xc0 [] warn_slowpath_fmt+0x46/0x50 [] sub_remove+0x87/0x1b0 [] ? css_killed_work_fn+0x32/0x1b0 [] idr_remove+0x25/0xd0 [] cgroup_destroy_css_killed+0x5b/0xc0 [] css_killed_work_fn+0x130/0x1b0 [] process_one_work+0x26c/0x550 [] worker_thread+0x12e/0x3b0 [] kthread+0xe6/0xf0 [] ret_from_fork+0x7c/0xb0 ---[ end trace 2d1577ec10cf80d0 ]--- It's because allocating/removing cgroup ID is not properly synchronized. The bug was introduced when we converted cgroup_ida to cgroup_idr. While synchronization is already done inside ida_simple_{get,remove}(), users are responsible for concurrent calls to idr_{alloc,remove}(). tj: Refreshed on top of b58c89986a77 ("cgroup: fix error return from cgroup_create()"). 
Fixes: 4e96ee8e981b ("cgroup: convert cgroup_ida to cgroup_idr") Cc: #3.12+ Reported-by: Michal Hocko Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3edf7163b84f..52719ce55dd3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * per-subsystem and moved to css->id so that lookups are * successful until the target css is released. */ + mutex_lock(&cgroup_mutex); idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + mutex_unlock(&cgroup_mutex); cgrp->id = -1; call_rcu(&cgrp->rcu_head, cgroup_free_rcu); @@ -4167,16 +4169,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } rcu_assign_pointer(cgrp->name, name); - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. - */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); - if (cgrp->id < 0) { - err = -ENOMEM; - goto err_free_name; - } - /* * Only live parents can have children. Note that the liveliness * check isn't strictly necessary because cgroup_mkdir() and @@ -4186,7 +4178,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free_id; + goto err_free_name; + } + + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. 
+ */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); + if (cgrp->id < 0) { + err = -ENOMEM; + goto err_unlock; } /* Grab a reference on the superblock so the hierarchy doesn't @@ -4218,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_unlock; + goto err_free_id; lockdep_assert_held(&dentry->d_inode->i_mutex); cgrp->serial_nr = cgroup_serial_nr_next++; @@ -4254,12 +4256,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return 0; -err_unlock: - mutex_unlock(&cgroup_mutex); - /* Release the reference count that we took on the superblock */ - deactivate_super(sb); err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); + /* Release the reference count that we took on the superblock */ + deactivate_super(sb); +err_unlock: + mutex_unlock(&cgroup_mutex); err_free_name: kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: -- cgit v1.2.3 From 5a17f543ed6808e9085063277fe46795dea484bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:47 -0500 Subject: cgroup: improve css_from_dir() into css_tryget_from_dir() css_from_dir() returns the matching css (cgroup_subsys_state) given a dentry and subsystem. The function doesn't pin the css before returning and requires the caller to be holding RCU read lock or cgroup_mutex and handling pinning on the caller side. Given that users of the function are likely to want to pin the returned css (both existing users do) and that getting and putting css's are very cheap, there's no reason for the interface to be tricky like this. Rename css_from_dir() to css_tryget_from_dir() and make it try to pin the found css and return it only if pinning succeeded. The callers are updated so that they no longer do RCU locking and pinning around the function and just use the returned css. This will also ease converting cgroup to kernfs. 
Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- kernel/cgroup.c | 25 ++++++++++++++++--------- kernel/events/core.c | 17 +---------------- 2 files changed, 17 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2de8decfd99f..fc2db071d95e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4978,28 +4978,35 @@ static int __init cgroup_disable(char *str) __setup("cgroup_disable=", cgroup_disable); /** - * css_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir * @dentry: directory dentry of interest * @ss: subsystem of interest * - * Must be called under cgroup_mutex or RCU read lock. The caller is - * responsible for pinning the returned css if it needs to be accessed - * outside the critical section. + * If @dentry is a directory for a cgroup which has @ss enabled on it, try + * to get the corresponding css and return it. If such css doesn't exist + * or can't be pinned, an ERR_PTR value is returned. */ -struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct cgroup *cgrp; - - cgroup_assert_mutex_or_rcu_locked(); + struct cgroup_subsys_state *css; /* is @dentry a cgroup dir? 
*/ if (!dentry->d_inode || dentry->d_inode->i_op != &cgroup_dir_inode_operations) return ERR_PTR(-EBADF); + rcu_read_lock(); + cgrp = __d_cgrp(dentry); - return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); + css = cgroup_css(cgrp, ss); + + if (!css || !css_tryget(css)) + css = ERR_PTR(-ENOENT); + + rcu_read_unlock(); + return css; } /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 64903731d834..a3c3ab50271a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -370,11 +370,6 @@ perf_cgroup_match(struct perf_event *event) event->cgrp->css.cgroup); } -static inline bool perf_tryget_cgroup(struct perf_event *event) -{ - return css_tryget(&event->cgrp->css); -} - static inline void perf_put_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); @@ -593,9 +588,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - rcu_read_lock(); - - css = css_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); + css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -604,13 +597,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; - /* must be done before we fput() the file */ - if (!perf_tryget_cgroup(event)) { - event->cgrp = NULL; - ret = -ENOENT; - goto out; - } - /* * all events in a group must monitor * the same cgroup because a task belongs @@ -621,7 +607,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: - rcu_read_unlock(); fdput(f); return ret; } -- cgit v1.2.3 From ace2bee8135a3dc725958b8d08c55ee9df813d39 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:47 -0500 Subject: cgroup: introduce cgroup_tree_mutex Currently cgroup uses combination of inode->i_mutex'es and cgroup_mutex for synchronization. With the scheduled kernfs conversion, i_mutex'es will be removed. 
Unfortunately, just using cgroup_mutex isn't possible. All kernfs file and syscall operations, most of which require grabbing cgroup_mutex, will be called with kernfs active ref held and, if we try to perform kernfs removals under cgroup_mutex, it can deadlock as kernfs_remove() tries to drain the target node. Let's introduce a new outer mutex, cgroup_tree_mutex, which protects stuff used during hierarchy changing operations - cftypes and all the operations which may affect the cgroupfs. It also covers css association and iteration. This allows cgroup_css(), for_each_css() and other css iterators to be called under cgroup_tree_mutex. The new mutex will nest above both kernfs's active ref protection and cgroup_mutex. By protecting tree modifications with a separate outer mutex, we can get rid of the aforementioned deadlock condition. Actual file additions and removals now require cgroup_tree_mutex instead of cgroup_mutex. Currently, cgroup_tree_mutex is never used without cgroup_mutex; however, we'll soon add hierarchy modification sections which are only protected by cgroup_tree_mutex. In the future, we might want to make the locking more granular by better splitting the coverages of the two mutexes. For now, this should do. v2: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to cgroup_idr with cgroup_mutex"). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 66 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fc2db071d95e..cb20d12cb096 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -67,6 +67,15 @@ */ #define CGROUP_PIDLIST_DESTROY_DELAY HZ +/* + * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file + * creation/removal and hierarchy changing operations including cgroup + * creation, removal, css association and controller rebinding.
This outer + * lock is needed mainly to resolve the circular dependency between kernfs + * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. + */ +static DEFINE_MUTEX(cgroup_tree_mutex); + /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -84,10 +93,11 @@ static DEFINE_MUTEX(cgroup_mutex); */ static DEFINE_SPINLOCK(release_agent_path_lock); -#define cgroup_assert_mutex_or_rcu_locked() \ +#define cgroup_assert_mutexes_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ + lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex), \ - "cgroup_mutex or RCU read lock required"); + "cgroup_[tree_]mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot @@ -179,7 +189,8 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, { if (ss) return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_mutex)); + lockdep_is_held(&cgroup_tree_mutex) || + lockdep_is_held(&cgroup_mutex)); else return &cgrp->dummy_css; } @@ -235,6 +246,7 @@ static int notify_on_release(const struct cgroup *cgrp) for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ + lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex)))) { } \ else @@ -883,7 +895,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) struct cfent *cfe; lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); - lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_tree_mutex); /* * If we're doing cleanup due to failure of cgroup_create(), @@ -948,7 +960,8 @@ static int rebind_subsystems(struct cgroupfs_root *root, struct cgroup_subsys *ss; int i, ret; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* Check that any added subsystems are 
currently free */ for_each_subsys(ss, i) @@ -1220,6 +1233,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) } mutex_lock(&cgrp->dentry->d_inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* See what subsystems are wanted */ @@ -1263,6 +1277,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) kfree(opts.release_agent); kfree(opts.name); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; } @@ -1494,6 +1509,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, inode = sb->s_root->d_inode; mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); @@ -1568,6 +1584,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_unlock(&inode->i_mutex); } else { /* @@ -1598,6 +1615,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, unlock_drop: cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_unlock(&inode->i_mutex); drop_new_super: deactivate_locked_super(sb); @@ -1620,6 +1638,7 @@ static void cgroup_kill_sb(struct super_block *sb) BUG_ON(!list_empty(&cgrp->children)); mutex_lock(&cgrp->dentry->d_inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* Rebind all subsystems back to the default hierarchy */ @@ -1650,6 +1669,7 @@ static void cgroup_kill_sb(struct super_block *sb) cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); simple_xattrs_free(&cgrp->xattrs); @@ -2625,7 +2645,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], int ret; 
lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); - lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_tree_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ @@ -2659,6 +2679,7 @@ static void cgroup_cfts_prepare(void) * Instead, we use css_for_each_descendant_pre() and drop RCU read * lock before calling cgroup_addrm_files(). */ + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); } @@ -2679,6 +2700,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) if (!cfts || ss->root == &cgroup_dummy_root || !atomic_inc_not_zero(&sb->s_active)) { mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); return 0; } @@ -2702,7 +2724,9 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) prev = cgrp->dentry; mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) ret = cgroup_addrm_files(cgrp, cfts, is_add); @@ -2711,6 +2735,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) break; } mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); dput(prev); deactivate_super(sb); return ret; @@ -2856,7 +2881,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); /* * @pos could already have been removed. 
Once a cgroup is removed, @@ -2914,7 +2939,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -2955,7 +2980,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); do { last = pos; @@ -3003,7 +3028,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -3977,6 +4002,7 @@ static int online_css(struct cgroup_subsys_state *css) struct cgroup_subsys *ss = css->ss; int ret = 0; + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (ss->css_online) @@ -3994,6 +4020,7 @@ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) @@ -4093,6 +4120,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } rcu_assign_pointer(cgrp->name, name); + mutex_lock(&cgroup_tree_mutex); + /* * Only live parents can have children. 
Note that the liveliness * check isn't strictly necessary because cgroup_mkdir() and @@ -4102,7 +4131,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free_name; + goto err_unlock_tree; } /* @@ -4176,6 +4205,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; @@ -4186,7 +4216,8 @@ err_free_id: deactivate_super(sb); err_unlock: mutex_unlock(&cgroup_mutex); -err_free_name: +err_unlock_tree: + mutex_unlock(&cgroup_tree_mutex); kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: kfree(cgrp); @@ -4195,6 +4226,7 @@ err_free_cgrp: err_destroy: cgroup_destroy_locked(cgrp); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_unlock(&dentry->d_inode->i_mutex); return err; } @@ -4217,6 +4249,7 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* @@ -4234,6 +4267,7 @@ static void css_killed_work_fn(struct work_struct *work) cgroup_destroy_css_killed(cgrp); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); /* * Put the css refs from kill_css(). 
Each css holds an extra @@ -4321,6 +4355,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) int ssid; lockdep_assert_held(&d->d_inode->i_mutex); + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* @@ -4407,6 +4442,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) struct cgroup *parent = cgrp->parent; struct dentry *d = cgrp->dentry; + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* delete this cgroup from parent->children */ @@ -4422,9 +4458,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { int ret; + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = cgroup_destroy_locked(dentry->d_fsdata); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -4454,6 +4492,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* init base cftset */ @@ -4482,6 +4521,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); } /** @@ -5021,7 +5061,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { struct cgroup *cgrp; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) -- cgit v1.2.3 From 4ac0601744eb86e982fbdadde35f1945f7ce5882 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:47 -0500 Subject: cgroup: release cgroup_mutex over file removals Now that cftypes and all tree modification operations are protected by cgroup_tree_mutex, we can drop cgroup_mutex while deleting files and directories. Drop cgroup_mutex over removals. This doesn't make any noticeable difference now but is to help kernfs conversion. 
In kernfs, removals are sync points which drain in-flight operations; as those operations would grab cgroup_mutex, trying to delete under cgroup_mutex would deadlock. This can be resolved by just holding the outer cgroup_tree_mutex which nests outside both kernfs active reference and cgroup_mutex. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb20d12cb096..d28cf75f33c1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -976,7 +976,9 @@ static int rebind_subsystems(struct cgroupfs_root *root, * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ + mutex_unlock(&cgroup_mutex); cgroup_clear_dir(cgrp, removed_mask); + mutex_lock(&cgroup_mutex); for_each_subsys(ss, i) { unsigned long bit = 1UL << i; @@ -2696,10 +2698,11 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) u64 update_before; int ret = 0; + mutex_unlock(&cgroup_mutex); + /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ if (!cfts || ss->root == &cgroup_dummy_root || !atomic_inc_not_zero(&sb->s_active)) { - mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); return 0; } @@ -2723,18 +2726,15 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) dput(prev); prev = cgrp->dentry; - mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) ret = cgroup_addrm_files(cgrp, cfts, is_add); mutex_unlock(&inode->i_mutex); if (ret) break; } - mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); dput(prev); deactivate_super(sb); @@ -4387,10 +4387,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Initiate massacre of all css's.
cgroup_destroy_css_killed() * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. + * percpu refs of all css's are confirmed to be killed. This + * involves removing the subsystem's files, drop cgroup_mutex. */ + mutex_unlock(&cgroup_mutex); for_each_css(css, ssid, cgrp) kill_css(css); + mutex_lock(&cgroup_mutex); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4421,9 +4424,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * puts the base ref but we aren't quite done with @cgrp yet, so * hold onto it. */ + mutex_unlock(&cgroup_mutex); cgroup_addrm_files(cgrp, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); + mutex_lock(&cgroup_mutex); return 0; }; -- cgit v1.2.3 From 8e30e2b8ba0ee58aa0f442d0b4a3cac1a4f2efb5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: restructure locking and error handling in cgroup_mount() cgroup is scheduled to be converted to kernfs. After conversion, cgroup_mount() won't use the sget() machinery for finding out existing super_blocks but instead would do that directly. It'll search the existing cgroupfs_roots for a matching one and create a new one iff a match doesn't exist. To ease such conversion, this patch restructures locking and error handling of the function. cgroup_tree_mutex and cgroup_mutex are grabbed from the get-go and held until return. For now, due to the way vfs locks nest outside cgroup mutexes, the two cgroup mutexes are temporarily dropped across sget() and inode mutex locking, which looks quite ridiculous; however, these will be removed through kernfs conversion and structuring the code this way makes the conversion less painful. The error goto labels are consolidated to two. This looks unwieldy now but the next patch will factor out creation of new root into a separate function with accompanying error handling and it'll look a lot better. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 73 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d28cf75f33c1..083b53d79d6f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1459,21 +1459,22 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + LIST_HEAD(tmp_links); + struct super_block *sb = NULL; + struct inode *inode = NULL; + struct cgroupfs_root *root = NULL; struct cgroup_sb_opts opts; - struct cgroupfs_root *root; - int ret = 0; - struct super_block *sb; struct cgroupfs_root *new_root; - struct list_head tmp_links; - struct inode *inode; const struct cred *cred; + int ret; - /* First find the desired set of subsystems */ + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); + + /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); - mutex_unlock(&cgroup_mutex); if (ret) - goto out_err; + goto out_unlock; /* * Allocate a new cgroup root. 
We may not need it if we're @@ -1482,16 +1483,20 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, new_root = cgroup_root_from_opts(&opts); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - goto out_err; + goto out_unlock; } opts.new_root = new_root; /* Locate an existing or new sb for this hierarchy */ + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_free_root(opts.new_root); - goto out_err; + goto out_unlock; } root = sb->s_fs_info; @@ -1505,9 +1510,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(sb->s_root != NULL); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + ret = cgroup_get_rootdir(sb); if (ret) - goto drop_new_super; + goto out_unlock; inode = sb->s_root->d_inode; mutex_lock(&inode->i_mutex); @@ -1516,7 +1524,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); if (ret < 0) - goto unlock_drop; + goto out_unlock; root_cgrp->id = ret; /* Check for name clashes with existing mounts */ @@ -1524,7 +1532,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (strlen(root->name)) for_each_active_root(existing_root) if (!strcmp(existing_root->name, root->name)) - goto unlock_drop; + goto out_unlock; /* * We're accessing css_set_count without locking @@ -1535,12 +1543,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) - goto unlock_drop; + goto out_unlock; /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ ret = cgroup_init_root_id(root, 2, 0); if (ret) - goto unlock_drop; + goto out_unlock; sb->s_root->d_fsdata = root_cgrp; root_cgrp->dentry = sb->s_root; @@ -1580,14 +1588,8 @@ static 
struct dentry *cgroup_mount(struct file_system_type *fs_type, link_css_set(&tmp_links, cset, root_cgrp); write_unlock(&css_set_lock); - free_cgrp_cset_links(&tmp_links); - BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&inode->i_mutex); } else { /* * We re-used an existing hierarchy - the new root (if @@ -1599,32 +1601,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); ret = -EINVAL; - goto drop_new_super; + goto out_unlock; } else { pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } } - kfree(opts.release_agent); - kfree(opts.name); - return dget(sb->s_root); + ret = 0; + goto out_unlock; - rm_base_files: - free_cgrp_cset_links(&tmp_links); +rm_base_files: cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); revert_creds(cred); - unlock_drop: cgroup_exit_root_id(root); +out_unlock: mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&inode->i_mutex); - drop_new_super: - deactivate_locked_super(sb); - out_err: + if (inode) + mutex_unlock(&inode->i_mutex); + + if (ret && !IS_ERR_OR_NULL(sb)) + deactivate_locked_super(sb); + + free_cgrp_cset_links(&tmp_links); kfree(opts.release_agent); kfree(opts.name); - return ERR_PTR(ret); + + if (!ret) + return dget(sb->s_root); + else + return ERR_PTR(ret); } static void cgroup_kill_sb(struct super_block *sb) -- cgit v1.2.3 From d427dfeb120b92c0c5e2ca9d1ec6952de67ebad9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: factor out cgroup_setup_root() from cgroup_mount() Factor out new root initialization into cgroup_setup_root() from cgroup_mount(). 
This makes it easier to follow and will ease kernfs conversion. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 211 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 113 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 083b53d79d6f..0a178cd1f836 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1455,17 +1455,126 @@ static int cgroup_get_rootdir(struct super_block *sb) return 0; } +static int cgroup_setup_root(struct cgroupfs_root *root) +{ + LIST_HEAD(tmp_links); + struct super_block *sb = root->sb; + struct cgroup *root_cgrp = &root->top_cgroup; + struct cgroupfs_root *existing_root; + struct css_set *cset; + struct inode *inode; + const struct cred *cred; + int i, ret; + + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); + BUG_ON(sb->s_root != NULL); + + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + + ret = cgroup_get_rootdir(sb); + if (ret) { + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + return ret; + } + inode = sb->s_root->d_inode; + + mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + if (ret < 0) + goto out_unlock; + root_cgrp->id = ret; + + /* check for name clashes with existing mounts */ + ret = -EBUSY; + if (strlen(root->name)) + for_each_active_root(existing_root) + if (!strcmp(existing_root->name, root->name)) + goto out_unlock; + + /* + * We're accessing css_set_count without locking css_set_lock here, + * but that's OK - it can only be increased by someone holding + * cgroup_lock, and that's us. 
The worst that can happen is that we + * have some link structures left over + */ + ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); + if (ret) + goto out_unlock; + + /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ + ret = cgroup_init_root_id(root, 2, 0); + if (ret) + goto out_unlock; + + sb->s_root->d_fsdata = root_cgrp; + root_cgrp->dentry = sb->s_root; + + /* + * We're inside get_sb() and will call lookup_one_len() to create + * the root files, which doesn't work if SELinux is in use. The + * following cred dancing somehow works around it. See 2ce9738ba + * ("cgroupfs: use init_cred when populating new cgroupfs mount") + * for more details. + */ + cred = override_creds(&init_cred); + + ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (ret) + goto rm_base_files; + + ret = rebind_subsystems(root, root->subsys_mask, 0); + if (ret) + goto rm_base_files; + + revert_creds(cred); + + /* + * There must be no failure case after here, since rebinding takes + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ + list_add(&root->root_list, &cgroup_roots); + cgroup_root_count++; + + /* + * Link the top cgroup in this hierarchy into all the css_set + * objects. 
+ */ + write_lock(&css_set_lock); + hash_for_each(css_set_table, i, cset, hlist) + link_css_set(&tmp_links, cset, root_cgrp); + write_unlock(&css_set_lock); + + BUG_ON(!list_empty(&root_cgrp->children)); + BUG_ON(root->number_of_cgroups != 1); + + ret = 0; + goto out_unlock; + +rm_base_files: + cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); + revert_creds(cred); + cgroup_exit_root_id(root); +out_unlock: + mutex_unlock(&inode->i_mutex); + free_cgrp_cset_links(&tmp_links); + return ret; +} + static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { - LIST_HEAD(tmp_links); struct super_block *sb = NULL; - struct inode *inode = NULL; struct cgroupfs_root *root = NULL; struct cgroup_sb_opts opts; struct cgroupfs_root *new_root; - const struct cred *cred; int ret; mutex_lock(&cgroup_tree_mutex); @@ -1502,94 +1611,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, root = sb->s_fs_info; BUG_ON(!root); if (root == opts.new_root) { - /* We used the new root structure, so this is a new hierarchy */ - struct cgroup *root_cgrp = &root->top_cgroup; - struct cgroupfs_root *existing_root; - int i; - struct css_set *cset; - - BUG_ON(sb->s_root != NULL); - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - - ret = cgroup_get_rootdir(sb); + ret = cgroup_setup_root(root); if (ret) goto out_unlock; - inode = sb->s_root->d_inode; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - - ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); - if (ret < 0) - goto out_unlock; - root_cgrp->id = ret; - - /* Check for name clashes with existing mounts */ - ret = -EBUSY; - if (strlen(root->name)) - for_each_active_root(existing_root) - if (!strcmp(existing_root->name, root->name)) - goto out_unlock; - - /* - * We're accessing css_set_count without locking - * css_set_lock here, but that's OK - it can only be - * 
increased by someone holding cgroup_lock, and - * that's us. The worst that can happen is that we - * have some link structures left over - */ - ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); - if (ret) - goto out_unlock; - - /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ - ret = cgroup_init_root_id(root, 2, 0); - if (ret) - goto out_unlock; - - sb->s_root->d_fsdata = root_cgrp; - root_cgrp->dentry = sb->s_root; - - /* - * We're inside get_sb() and will call lookup_one_len() to - * create the root files, which doesn't work if SELinux is - * in use. The following cred dancing somehow works around - * it. See 2ce9738ba ("cgroupfs: use init_cred when - * populating new cgroupfs mount") for more details. - */ - cred = override_creds(&init_cred); - - ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); - if (ret) - goto rm_base_files; - - ret = rebind_subsystems(root, root->subsys_mask, 0); - if (ret) - goto rm_base_files; - - revert_creds(cred); - - /* - * There must be no failure case after here, since rebinding - * takes care of subsystems' refcounts, which are explicitly - * dropped in the failure exit path. 
- */ - - list_add(&root->root_list, &cgroup_roots); - cgroup_root_count++; - - /* Link the top cgroup in this hierarchy into all - * the css_set objects */ - write_lock(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) - link_css_set(&tmp_links, cset, root_cgrp); - write_unlock(&css_set_lock); - - BUG_ON(!list_empty(&root_cgrp->children)); - BUG_ON(root->number_of_cgroups != 1); } else { /* * We re-used an existing hierarchy - the new root (if @@ -1609,22 +1633,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, } ret = 0; - goto out_unlock; - -rm_base_files: - cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); - revert_creds(cred); - cgroup_exit_root_id(root); out_unlock: mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - if (inode) - mutex_unlock(&inode->i_mutex); if (ret && !IS_ERR_OR_NULL(sb)) deactivate_locked_super(sb); - free_cgrp_cset_links(&tmp_links); kfree(opts.release_agent); kfree(opts.name); -- cgit v1.2.3 From 8d7e6fb0a1db970ac3589f87af0f2a20ef46654b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: update cgroup name handling Straightforward updates to cgroup name handling in preparation of kernfs conversion. * cgroup_alloc_name() is updated to take const char * instead of dentry * for name source. * cgroup name formatting is separated out into cgroup_file_name(). While at it, buffer length protection is added. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0a178cd1f836..3f204429d108 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -67,6 +67,9 @@ */ #define CGROUP_PIDLIST_DESTROY_DELAY HZ +#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ + MAX_CFTYPE_NAME + 2) + /* * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file * creation/removal and hierarchy changing operations including cgroup @@ -799,17 +802,29 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } -static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) +static struct cgroup_name *cgroup_alloc_name(const char *name_str) { struct cgroup_name *name; - name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); + name = kmalloc(sizeof(*name) + strlen(name_str) + 1, GFP_KERNEL); if (!name) return NULL; - strcpy(name->name, dentry->d_name.name); + strcpy(name->name, name_str); return name; } +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) +{ + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) + snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", + cft->ss->name, cft->name); + else + strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + return buf; +} + static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); @@ -2437,7 +2452,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, if (cgroup_sane_behavior(cgrp)) return -EPERM; - name = cgroup_alloc_name(new_dentry); + name = cgroup_alloc_name(new_dentry->d_name.name); if (!name) return -ENOMEM; @@ -2613,14 +2628,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) struct cfent *cfe; int error; umode_t mode; - 
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - - if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && - !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { - strcpy(name, cft->ss->name); - strcat(name, "."); - } - strcat(name, cft->name); + char name[CGROUP_FILE_NAME_MAX]; BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); @@ -2628,6 +2636,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) if (!cfe) return -ENOMEM; + cgroup_file_name(cgrp, cft, name); dentry = lookup_one_len(name, dir, strlen(name)); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); @@ -4135,7 +4144,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (!cgrp) return -ENOMEM; - name = cgroup_alloc_name(dentry); + name = cgroup_alloc_name(dentry->d_name.name); if (!name) { err = -ENOMEM; goto err_free_cgrp; -- cgit v1.2.3 From de00ffa56ea3132c6013fc8f07133b8a1014cf53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: make cgroup_subsys->base_cftypes use cgroup_add_cftypes() Currently, cgroup_subsys->base_cftypes registration is different from dynamic cftypes registration. Instead of going through cgroup_add_cftypes(), cgroup_init_subsys() invokes cgroup_init_cftsets() which makes use of cgroup_subsys->base_cftset which doesn't involve dynamic allocation. While avoiding dynamic allocation is somewhat nice, having two separate paths for cftypes registration is nasty, especially as we're planning to add more operations during cftypes registration. This patch drops cgroup_init_cftsets() and cgroup_subsys->base_cftset and registers base_cftypes using cgroup_add_cftypes(). This is done as a separate step in cgroup_init() instead of a part of cgroup_init_subsys(). This is because cgroup_init_subsys() can be called very early during boot when kmalloc() isn't available yet. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f204429d108..eb002c622cd6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4503,25 +4503,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) return ret; } -static void __init cgroup_init_cftsets(struct cgroup_subsys *ss) -{ - INIT_LIST_HEAD(&ss->cftsets); - - /* - * base_cftset is embedded in subsys itself, no need to worry about - * deregistration. - */ - if (ss->base_cftypes) { - struct cftype *cft; - - for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) - cft->ss = ss; - - ss->base_cftset.cfts = ss->base_cftypes; - list_add_tail(&ss->base_cftset.node, &ss->cftsets); - } -} - static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; @@ -4531,8 +4512,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - /* init base cftset */ - cgroup_init_cftsets(ss); + INIT_LIST_HEAD(&ss->cftsets); /* Create the top cgroup state for this subsystem */ ss->root = &cgroup_dummy_root; @@ -4621,6 +4601,13 @@ int __init cgroup_init(void) for_each_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); + + /* + * cftype registration needs kmalloc and can't be done + * during early_init. Register base cftypes separately. + */ + if (ss->base_cftypes) + WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); } /* allocate id for the dummy hierarchy */ -- cgit v1.2.3 From 5f46990787e2721b4db190ddc8af6fdbe8f010d7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: update the meaning of cftype->max_write_len cftype->max_write_len is used to extend the maximum size of writes. It's interpreted in such a way that the actual maximum size is one less than the specified value. 
The default size is defined by CGROUP_LOCAL_BUFFER_SIZE. Its interpretation is quite confusing - its value is decremented by 1 and then compared for equality with max size, which means that the actual default size is CGROUP_LOCAL_BUFFER_SIZE - 2, which is 62 chars. There's no point in having a limit that low. Update its definition so that it means the actual string length sans termination and anything below PAGE_SIZE-1 is treated as PAGE_SIZE-1. .max_write_len for "release_agent" is updated to PATH_MAX-1 and cgroup_release_agent_write() is updated so that the redundant strlen() check is removed and it uses strlcpy() instead of strcpy(). .max_write_len initializations in blk-throttle.c and cfq-iosched.c are no longer necessary and removed. The one in cpuset is kept unchanged as it's an approximated value to begin with. This will also make transition to kernfs smoother. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index eb002c622cd6..fde3633ef389 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2213,13 +2213,14 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, static int cgroup_release_agent_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); - if (strlen(buffer) >= PATH_MAX) - return -EINVAL; + struct cgroupfs_root *root = css->cgroup->root; + + BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; spin_lock(&release_agent_path_lock); - strcpy(css->cgroup->root->release_agent_path, buffer); + strlcpy(root->release_agent_path, buffer, + sizeof(root->release_agent_path)); spin_unlock(&release_agent_path_lock); mutex_unlock(&cgroup_mutex); return 0; @@ -2245,20 +2246,17 @@ static int cgroup_sane_behavior_show(struct seq_file 
*seq, void *v) return 0; } -/* A buffer size big enough for numbers or short strings */ -#define CGROUP_LOCAL_BUFFER_SIZE 64 - static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, size_t nbytes, loff_t *ppos) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); struct cgroup_subsys_state *css = cfe->css; - size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1; + size_t max_bytes = max(cft->max_write_len, PAGE_SIZE); char *buf; int ret; - if (nbytes >= max_bytes) + if (nbytes > max_bytes) return -E2BIG; buf = kmalloc(nbytes + 1, GFP_KERNEL); @@ -3919,7 +3917,7 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, - .max_write_len = PATH_MAX, + .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; -- cgit v1.2.3 From 2da440a26ce4743bd3e71ba964ba3f983d09bba5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: introduce cgroup_init/exit_cftypes() Factor out cft->ss initialization into cgroup_init_cftypes() from cgroup_add_cftypes() and add cft->ss clearing to cgroup_rm_cftypes() through cgroup_exit_cftypes(). This doesn't make any meaningful difference now but the two new functions will be expanded during kernfs transition. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fde3633ef389..42e588ef62d1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2770,6 +2770,22 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) return ret; } +static void cgroup_exit_cftypes(struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft->name[0] != '\0'; cft++) + cft->ss = NULL; +} + +static void cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft->name[0] != '\0'; cft++) + cft->ss = ss; +} + /** * cgroup_add_cftypes - add an array of cftypes to a subsystem * @ss: target cgroup subsystem @@ -2787,15 +2803,13 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; - struct cftype *cft; int ret; set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; - for (cft = cfts; cft->name[0] != '\0'; cft++) - cft->ss = ss; + cgroup_init_cftypes(ss, cfts); cgroup_cfts_prepare(); set->cfts = cfts; @@ -2820,6 +2834,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); */ int cgroup_rm_cftypes(struct cftype *cfts) { + struct cftype *found = NULL; struct cftype_set *set; if (!cfts || !cfts[0].ss) @@ -2831,13 +2846,14 @@ int cgroup_rm_cftypes(struct cftype *cfts) if (set->cfts == cfts) { list_del(&set->node); kfree(set); - cgroup_cfts_commit(cfts, false); - return 0; + found = cfts; + break; } } - cgroup_cfts_commit(NULL, false); - return -ENOENT; + cgroup_cfts_commit(found, false); + cgroup_exit_cftypes(cfts); + return found ? 
0 : -ENOENT; } /** @@ -4596,6 +4612,8 @@ int __init cgroup_init(void) if (err) return err; + cgroup_init_cftypes(NULL, cgroup_base_files); + for_each_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); -- cgit v1.2.3 From b1664924062393bb048203bd4622e0b1c9e1d328 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:49 -0500 Subject: cgroup: introduce cgroup_ino() mm/memory-failure.c::hwpoison_filter_task() has been reaching into cgroup to extract the associated ino to be used as a filtering criterion. This is an implementation detail which shouldn't be depended upon from outside cgroup proper and is about to change with the scheduled kernfs conversion. This patch introduces a proper interface to determine the associated ino, cgroup_ino(), and updates hwpoison_filter_task() to use it instead of reaching directly into cgroup. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Andi Kleen Cc: Wu Fengguang --- kernel/cgroup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 42e588ef62d1..11f7a05e791e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -792,7 +792,10 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { - inode->i_ino = get_next_ino(); + do { + /* ino 0 is reserved for dummy_root */ + inode->i_ino = get_next_ino(); + } while (!inode->i_ino); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); -- cgit v1.2.3 From 59f5296b51b86718dd6eecf0a268b2f1a1ec0a2d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:49 -0500 Subject: cgroup: misc preps for kernfs conversion * Un-inline seq_css(). After kernfs conversion, the function will need to dereference internal data structures. * Add cgroup_get/put_root() and replace direct super_block->s_active manipulations with them. These will be converted to kernfs_root refcnting. 
* Add cgroup_get/put() and replace dget/put() on cgrp->dentry with them. These will be converted to kernfs refcnting. * Update current_css_set_cg_links_read() to use cgroup_name() instead of reaching into the dentry name. The end result is the same. These changes don't make functional differences but will make transition to kernfs easier. v2: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to cgroup_idr with cgroup_mutex"). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 85 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 11f7a05e791e..9e9e8fd632d8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -169,6 +169,7 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; +static void cgroup_put(struct cgroup *cgrp); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], @@ -204,6 +205,13 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return test_bit(CGRP_DEAD, &cgrp->flags); } +struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + struct cgroup_open_file *of = seq->private; + return of->cfe->css; +} +EXPORT_SYMBOL_GPL(seq_css); + /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested @@ -682,6 +690,16 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } +static void cgroup_get_root(struct cgroupfs_root *root) +{ + atomic_inc(&root->sb->s_active); +} + +static void cgroup_put_root(struct cgroupfs_root *root) +{ + deactivate_super(root->sb); +} + /* * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex held. 
@@ -837,18 +855,14 @@ static void cgroup_free_fn(struct work_struct *work) mutex_unlock(&cgroup_mutex); /* - * We get a ref to the parent's dentry, and put the ref when - * this cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. + * We get a ref to the parent, and put the ref when this cgroup is + * being freed, so it's guaranteed that the parent won't be + * destroyed before its children. */ - dput(cgrp->parent->dentry); + cgroup_put(cgrp->parent); - /* - * Drop the active superblock reference that we took when we - * created the cgroup. This will free cgrp->root, if we are - * holding the last reference to @sb. - */ - deactivate_super(cgrp->root->sb); + /* put the root reference that we took when we created the cgroup */ + cgroup_put_root(cgrp->root); cgroup_pidlist_destroy_all(cgrp); @@ -866,6 +880,11 @@ static void cgroup_free_rcu(struct rcu_head *head) queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } +static void cgroup_get(struct cgroup *cgrp) +{ + dget(cgrp->dentry); +} + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? 
if so, kfree() associated cgroup */ @@ -899,6 +918,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } +static void cgroup_put(struct cgroup *cgrp) +{ + dput(cgrp->dentry); +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -2724,7 +2748,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; - struct dentry *prev = NULL; + struct cgroup *prev = NULL; struct inode *inode; struct cgroup_subsys_state *css; u64 update_before; @@ -2754,9 +2778,10 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) continue; inode = cgrp->dentry->d_inode; - dget(cgrp->dentry); - dput(prev); - prev = cgrp->dentry; + cgroup_get(cgrp); + if (prev) + cgroup_put(prev); + prev = cgrp; mutex_unlock(&cgroup_tree_mutex); mutex_lock(&inode->i_mutex); @@ -2768,8 +2793,8 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) break; } mutex_unlock(&cgroup_tree_mutex); - dput(prev); - deactivate_super(sb); + cgroup_put(prev); + cgroup_put_root(ss->root); return ret; } @@ -3863,11 +3888,9 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, */ static void cgroup_dput(struct cgroup *cgrp) { - struct super_block *sb = cgrp->root->sb; - - atomic_inc(&sb->s_active); - dput(cgrp->dentry); - deactivate_super(sb); + cgroup_get_root(cgrp->root); + cgroup_put(cgrp); + cgroup_put_root(cgrp->root); } static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, @@ -4118,7 +4141,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (err) goto err_free; - dget(cgrp->dentry); + cgroup_get(cgrp); css_get(css->parent); if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && @@ -4197,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * can be done outside cgroup_mutex, since the sb 
can't * disappear while someone has an open control file on the * fs */ - atomic_inc(&sb->s_active); + cgroup_get_root(root); init_cgroup_housekeeping(cgrp); @@ -4231,7 +4254,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, root->number_of_cgroups++; /* hold a ref to the parent's dentry */ - dget(parent->dentry); + cgroup_get(parent); /* * @cgrp is now fully operational. If something fails after this @@ -4261,7 +4284,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); /* Release the reference count that we took on the superblock */ - deactivate_super(sb); + cgroup_put_root(root); err_unlock: mutex_unlock(&cgroup_mutex); err_unlock_tree: @@ -4493,7 +4516,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) static void cgroup_destroy_css_killed(struct cgroup *cgrp) { struct cgroup *parent = cgrp->parent; - struct dentry *d = cgrp->dentry; lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4501,7 +4523,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); - dput(d); + cgroup_put(cgrp); set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); @@ -5161,12 +5183,11 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; - const char *name; + const char *name = "?"; + + if (c != cgroup_dummy_top) + name = cgroup_name(c); - if (c->dentry) - name = c->dentry->d_name.name; - else - name = "?"; seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name); } -- cgit v1.2.3 From f2e85d574e881ff3c597518c1ab48c86f9109880 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:49 -0500 Subject: cgroup: relocate functions in preparation of kernfs conversion Relocate 
cgroup_init/exit_root_id(), cgroup_free_root(), cgroup_kill_sb() and cgroup_file_name() in preparation of kernfs conversion. These are pure relocations to make kernfs conversion easier to follow. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 232 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 117 insertions(+), 115 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9e9e8fd632d8..d8efca44de5f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -170,6 +170,8 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); +static int rebind_subsystems(struct cgroupfs_root *root, + unsigned long added_mask, unsigned removed_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], @@ -690,6 +692,42 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } +static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) +{ + int id; + + lockdep_assert_held(&cgroup_mutex); + + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, + GFP_KERNEL); + if (id < 0) + return id; + + root->hierarchy_id = id; + return 0; +} + +static void cgroup_exit_root_id(struct cgroupfs_root *root) +{ + lockdep_assert_held(&cgroup_mutex); + + if (root->hierarchy_id) { + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); + root->hierarchy_id = 0; + } +} + +static void cgroup_free_root(struct cgroupfs_root *root) +{ + if (root) { + /* hierarhcy ID shoulid already have been released */ + WARN_ON_ONCE(root->hierarchy_id); + + idr_destroy(&root->cgroup_idr); + kfree(root); + } +} + static void cgroup_get_root(struct cgroupfs_root *root) { atomic_inc(&root->sb->s_active); @@ -700,6 +738,59 @@ static void cgroup_put_root(struct cgroupfs_root *root) 
deactivate_super(root->sb); } +static void cgroup_kill_sb(struct super_block *sb) +{ + struct cgroupfs_root *root = sb->s_fs_info; + struct cgroup *cgrp = &root->top_cgroup; + struct cgrp_cset_link *link, *tmp_link; + int ret; + + BUG_ON(!root); + + BUG_ON(root->number_of_cgroups != 1); + BUG_ON(!list_empty(&cgrp->children)); + + mutex_lock(&cgrp->dentry->d_inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + /* Rebind all subsystems back to the default hierarchy */ + if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { + ret = rebind_subsystems(root, 0, root->subsys_mask); + /* Shouldn't be able to fail ... */ + BUG_ON(ret); + } + + /* + * Release all the links from cset_links to this hierarchy's + * root cgroup + */ + write_lock(&css_set_lock); + + list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); + kfree(link); + } + write_unlock(&css_set_lock); + + if (!list_empty(&root->root_list)) { + list_del(&root->root_list); + cgroup_root_count--; + } + + cgroup_exit_root_id(root); + + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + + simple_xattrs_free(&cgrp->xattrs); + + kill_litter_super(sb); + cgroup_free_root(root); +} + /* * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex held. 
@@ -846,6 +937,32 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, return buf; } +/** + * cgroup_file_mode - deduce file mode of a control file + * @cft: the control file in question + * + * returns cft->mode if ->mode is not 0 + * returns S_IRUGO|S_IWUSR if it has both a read and a write handler + * returns S_IRUGO if it has only a read handler + * returns S_IWUSR if it has only a write hander + */ +static umode_t cgroup_file_mode(const struct cftype *cft) +{ + umode_t mode = 0; + + if (cft->mode) + return cft->mode; + + if (cft->read_u64 || cft->read_s64 || cft->seq_show) + mode |= S_IRUGO; + + if (cft->write_u64 || cft->write_s64 || cft->write_string || + cft->trigger) + mode |= S_IWUSR; + + return mode; +} + static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); @@ -1358,31 +1475,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) idr_init(&root->cgroup_idr); } -static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) -{ - int id; - - lockdep_assert_held(&cgroup_mutex); - - id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, - GFP_KERNEL); - if (id < 0) - return id; - - root->hierarchy_id = id; - return 0; -} - -static void cgroup_exit_root_id(struct cgroupfs_root *root) -{ - lockdep_assert_held(&cgroup_mutex); - - if (root->hierarchy_id) { - idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); - root->hierarchy_id = 0; - } -} - static int cgroup_test_super(struct super_block *sb, void *data) { struct cgroup_sb_opts *opts = data; @@ -1435,17 +1527,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static void cgroup_free_root(struct cgroupfs_root *root) -{ - if (root) { - /* hierarhcy ID shoulid already have been released */ - WARN_ON_ONCE(root->hierarchy_id); - - idr_destroy(&root->cgroup_idr); - kfree(root); - } -} - static int cgroup_set_super(struct super_block 
*sb, void *data) { int ret; @@ -1691,59 +1772,6 @@ out_unlock: return ERR_PTR(ret); } -static void cgroup_kill_sb(struct super_block *sb) -{ - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; - struct cgrp_cset_link *link, *tmp_link; - int ret; - - BUG_ON(!root); - - BUG_ON(root->number_of_cgroups != 1); - BUG_ON(!list_empty(&cgrp->children)); - - mutex_lock(&cgrp->dentry->d_inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - - /* Rebind all subsystems back to the default hierarchy */ - if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { - ret = rebind_subsystems(root, 0, root->subsys_mask); - /* Shouldn't be able to fail ... */ - BUG_ON(ret); - } - - /* - * Release all the links from cset_links to this hierarchy's - * root cgroup - */ - write_lock(&css_set_lock); - - list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { - list_del(&link->cset_link); - list_del(&link->cgrp_link); - kfree(link); - } - write_unlock(&css_set_lock); - - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - cgroup_root_count--; - } - - cgroup_exit_root_id(root); - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - - simple_xattrs_free(&cgrp->xattrs); - - kill_litter_super(sb); - cgroup_free_root(root); -} - static struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, @@ -2619,32 +2647,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, return 0; } -/** - * cgroup_file_mode - deduce file mode of a control file - * @cft: the control file in question - * - * returns cft->mode if ->mode is not 0 - * returns S_IRUGO|S_IWUSR if it has both a read and a write handler - * returns S_IRUGO if it has only a read handler - * returns S_IWUSR if it has only a write hander - */ -static umode_t cgroup_file_mode(const struct cftype *cft) -{ - umode_t mode = 0; - - if (cft->mode) - return 
cft->mode; - - if (cft->read_u64 || cft->read_s64 || cft->seq_show) - mode |= S_IRUGO; - - if (cft->write_u64 || cft->write_s64 || cft->write_string || - cft->trigger) - mode |= S_IWUSR; - - return mode; -} - static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { struct dentry *dir = cgrp->dentry; -- cgit v1.2.3 From 2bd59d48ebfb3df41ee56938946ca0dd30887312 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:49 -0500 Subject: cgroup: convert to kernfs cgroup filesystem code was derived from the original sysfs implementation which was heavily intertwined with vfs objects and locking with the goal of re-using the existing vfs infrastructure. That experiment turned out rather disastrous and sysfs switched, a long time ago, to distributed filesystem model where a separate representation is maintained which is queried by vfs. Unfortunately, cgroup stuck with the failed experiment all these years and accumulated even more problems over time. Locking and object lifetime management being entangled with vfs is probably the most egregious. vfs is never designed to be misused like this and cgroup ends up jumping through various convoluted dancing to make things work. Even then, operations across multiple cgroups can't be done safely as it'll deadlock with rename locking. Recently, kernfs is separated out from sysfs so that it can be used by users other than sysfs. This patch converts cgroup to use kernfs, which will bring the following benefits. * Separation from vfs internals. Locking and object lifetime management is contained in cgroup proper making things a lot simpler. This removes significant amount of locking convolutions, hairy object lifetime rules and the restriction on multi-cgroup operations. * Can drop a lot of code to implement filesystem interface as most are provided by kernfs. * Proper "severing" semantics, which allows controllers to not worry about lingering file accesses after offline. 
While the preceding patches did as much as possible to make the transition less painful, a large part of the conversion has to be one discrete step making this patch rather large. The rest of the commit message lists notable changes in different areas. Overall ------- * vfs constructs replaced with kernfs ones. cgroup->dentry w/ ->kn, cgroupfs_root->sb w/ ->kf_root. * All dentry accessors are removed. Helpers to map from kernfs constructs are added. * All vfs plumbing around dentry, inode and bdi removed. * cgroup_mount() now directly looks for matching root and then proceeds to create a new one if not found. Synchronization and object lifetime ----------------------------------- * vfs inode locking removed. Among other things, this removes the need for the convolution in cgroup_cfts_commit(). Future patches will further simplify it. * vfs refcnting replaced with cgroup internal ones. cgroup->refcnt, cgroupfs_root->refcnt added. cgroup_put_root() now directly puts root->refcnt and when it reaches zero proceeds to destroy it thus merging cgroup_put_root() and the former cgroup_kill_sb(). Similarly, cgroup_put() now directly schedules cgroup_free_rcu() when refcnt reaches zero. * Unlike before, kernfs objects don't hold onto cgroup objects. When cgroup destroys a kernfs node, all existing operations are drained and the association is broken immediately. The same for cgroupfs_roots and mounts. * All operations which come through kernfs guarantee that the associated cgroup is and stays valid for the duration of operation; however, there are two paths which need to find out the associated cgroup from dentry without going through kernfs - css_tryget_from_dir() and cgroupstats_build(). For these two, kernfs_node->priv is RCU managed so that they can dereference it under RCU read lock. File and directory handling --------------------------- * File and directory operations converted to kernfs_ops and kernfs_syscall_ops. * xattrs is implicitly supported by kernfs.
No need to worry about it from cgroup. This means that "xattr" mount option is no longer necessary. A future patch will add a deprecated warning message when sane_behavior. * When cftype->max_write_len > PAGE_SIZE, it's necessary to make a private copy of one of the kernfs_ops to set its atomic_write_len. cftype->kf_ops is added and cgroup_init/exit_cftypes() are updated to handle it. * cftype->lockdep_key added so that kernfs lockdep annotation can be per cftype. * Individual file entries and open states are now managed by kernfs. No need to worry about them from cgroup. cfent, cgroup_open_file and their friends are removed. * kernfs_nodes are created deactivated and kernfs_activate() invocations added to places where creation of new nodes is committed. * cgroup_rmdir() uses kernfs_[un]break_active_protection() for self-removal. v2: - Li pointed out in an earlier patch that specifying "name=" during mount without subsystem specification should succeed if there's an existing hierarchy with a matching name although it should fail with -EINVAL if a new hierarchy should be created. Prior to the conversion, this used to be handled by deferring failure from NULL return from cgroup_root_from_opts(), which was necessary because root was being created before checking for existing ones. Note that cgroup_root_from_opts() returned an ERR_PTR() value for error conditions which require immediate mount failure. As we now have separate search and creation steps, deferring failure from cgroup_root_from_opts() is no longer necessary. cgroup_root_from_opts() is updated to always return ERR_PTR() value on failure. - The logic to match existing roots is updated so that a mount attempt with a matching name but different subsys_mask is rejected. This was handled by a separate matching loop under the comment "Check for name clashes with existing mounts" but got lost during conversion. Merge the check into the main search loop.
- Add __rcu __force casting in RCU_INIT_POINTER() in cgroup_destroy_locked() to avoid the sparse address space warning reported by kbuild test bot. Maybe we want an explicit interface to use kn->priv as RCU protected pointer? v3: Make CONFIG_CGROUPS select CONFIG_KERNFS. v4: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to cgroup_idr with cgroup_mutex"). Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: kbuild test robot fengguang.wu@intel.com> --- kernel/cgroup.c | 1115 ++++++++++++++++++------------------------------------- 1 file changed, 362 insertions(+), 753 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d8efca44de5f..cda614da40cf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -40,9 +40,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -50,7 +48,6 @@ #include #include #include -#include #include #include #include /* TODO: replace with more sophisticated array */ @@ -176,7 +173,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); -static int cgroup_file_release(struct inode *inode, struct file *file); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /** @@ -209,8 +205,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) struct cgroup_subsys_state *seq_css(struct seq_file *seq) { - struct cgroup_open_file *of = seq->private; - return of->cfe->css; + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = seq_cft(seq); + + /* + * This is open and unprotected implementation of cgroup_css(). + * seq_css() is only called from a kernfs file operation which has + * an active reference on the file. 
Because all the subsystem + * files are drained before a css is disassociated with a cgroup, + * the matching css from the cgroup's subsys table is guaranteed to + * be and stay valid until the enclosing operation is complete. + */ + if (cft->ss) + return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); + else + return &cgrp->dummy_css; } EXPORT_SYMBOL_GPL(seq_css); @@ -276,21 +286,6 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_active_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) -static inline struct cgroup *__d_cgrp(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cfent *__d_cfe(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ - return __d_cfe(dentry)->type; -} - /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -692,6 +687,13 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } +static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +{ + struct cgroup *top_cgrp = kf_root->kn->priv; + + return top_cgrp->root; +} + static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) { int id; @@ -730,30 +732,37 @@ static void cgroup_free_root(struct cgroupfs_root *root) static void cgroup_get_root(struct cgroupfs_root *root) { - atomic_inc(&root->sb->s_active); + /* + * The caller must ensure that @root is alive, which can be + * achieved by holding a ref on one of the member cgroups or + * following a registered reference to @root while holding + * cgroup_tree_mutex. 
+ */ + WARN_ON_ONCE(atomic_read(&root->refcnt) <= 0); + atomic_inc(&root->refcnt); } static void cgroup_put_root(struct cgroupfs_root *root) { - deactivate_super(root->sb); -} - -static void cgroup_kill_sb(struct super_block *sb) -{ - struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; int ret; - BUG_ON(!root); + /* + * @root's refcnt reaching zero and its deregistration should be + * atomic w.r.t. cgroup_tree_mutex. This ensures that + * cgroup_get_root() is safe to invoke if @root is registered. + */ + mutex_lock(&cgroup_tree_mutex); + if (!atomic_dec_and_test(&root->refcnt)) { + mutex_unlock(&cgroup_tree_mutex); + return; + } + mutex_lock(&cgroup_mutex); BUG_ON(root->number_of_cgroups != 1); BUG_ON(!list_empty(&cgrp->children)); - mutex_lock(&cgrp->dentry->d_inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - /* Rebind all subsystems back to the default hierarchy */ if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { ret = rebind_subsystems(root, 0, root->subsys_mask); @@ -783,11 +792,8 @@ static void cgroup_kill_sb(struct super_block *sb) mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - - simple_xattrs_free(&cgrp->xattrs); - kill_litter_super(sb); + kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } @@ -878,42 +884,10 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -/* - * A couple of forward declarations required, due to cyclic reference loop: - * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> - * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations - * -> cgroup_mkdir. 
- */ - -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); -static const struct inode_operations cgroup_dir_inode_operations; +static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; -static struct backing_dev_info cgroup_backing_dev_info = { - .name = "cgroup", - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) -{ - struct inode *inode = new_inode(sb); - - if (inode) { - do { - /* ino 0 is reserved for dummy_root */ - inode->i_ino = get_next_ino(); - } while (!inode->i_ino); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; - } - return inode; -} - static struct cgroup_name *cgroup_alloc_name(const char *name_str) { struct cgroup_name *name; @@ -983,8 +957,6 @@ static void cgroup_free_fn(struct work_struct *work) cgroup_pidlist_destroy_all(cgrp); - simple_xattrs_free(&cgrp->xattrs); - kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } @@ -999,81 +971,38 @@ static void cgroup_free_rcu(struct rcu_head *head) static void cgroup_get(struct cgroup *cgrp) { - dget(cgrp->dentry); -} - -static void cgroup_diput(struct dentry *dentry, struct inode *inode) -{ - /* is dentry a directory ? if so, kfree() associated cgroup */ - if (S_ISDIR(inode->i_mode)) { - struct cgroup *cgrp = dentry->d_fsdata; - - BUG_ON(!(cgroup_is_dead(cgrp))); - - /* - * XXX: cgrp->id is only used to look up css's. As cgroup - * and css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. 
- */ - mutex_lock(&cgroup_mutex); - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - mutex_unlock(&cgroup_mutex); - cgrp->id = -1; - - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); - } else { - struct cfent *cfe = __d_cfe(dentry); - struct cgroup *cgrp = dentry->d_parent->d_fsdata; - - WARN_ONCE(!list_empty(&cfe->node) && - cgrp != &cgrp->root->top_cgroup, - "cfe still linked for %s\n", cfe->type->name); - simple_xattrs_free(&cfe->xattrs); - kfree(cfe); - } - iput(inode); + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); + atomic_inc(&cgrp->refcnt); } static void cgroup_put(struct cgroup *cgrp) { - dput(cgrp->dentry); -} + if (!atomic_dec_and_test(&cgrp->refcnt)) + return; + if (WARN_ON_ONCE(!cgroup_is_dead(cgrp))) + return; -static void remove_dir(struct dentry *d) -{ - struct dentry *parent = dget(d->d_parent); + /* + * XXX: cgrp->id is only used to look up css's. As cgroup and + * css's lifetimes will be decoupled, it should be made + * per-subsystem and moved to css->id so that lookups are + * successful until the target css is released. + */ + mutex_lock(&cgroup_mutex); + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + mutex_unlock(&cgroup_mutex); + cgrp->id = -1; - d_delete(d); - simple_rmdir(parent->d_inode, d); - dput(parent); + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { - struct cfent *cfe; + char name[CGROUP_FILE_NAME_MAX]; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_tree_mutex); - - /* - * If we're doing cleanup due to failure of cgroup_create(), - * the corresponding @cfe may not exist. 
- */ - list_for_each_entry(cfe, &cgrp->files, node) { - struct dentry *d = cfe->dentry; - - if (cft && cfe->type != cft) - continue; - - dget(d); - d_delete(d); - simple_unlink(cgrp->dentry->d_inode, d); - list_del_init(&cfe->node); - dput(d); - - break; - } + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** @@ -1096,22 +1025,6 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) } } -/* - * NOTE : the dentry must have been dget()'ed - */ -static void cgroup_d_remove_dir(struct dentry *dentry) -{ - struct dentry *parent; - - parent = dentry->d_parent; - spin_lock(&parent->d_lock); - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - list_del_init(&dentry->d_u.d_child); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); - remove_dir(dentry); -} - static int rebind_subsystems(struct cgroupfs_root *root, unsigned long added_mask, unsigned removed_mask) { @@ -1179,13 +1092,15 @@ static int rebind_subsystems(struct cgroupfs_root *root, * now matches the bound subsystems. 
*/ root->flags |= CGRP_ROOT_SUBSYS_BOUND; + kernfs_activate(cgrp->kn); return 0; } -static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) +static int cgroup_show_options(struct seq_file *seq, + struct kernfs_root *kf_root) { - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; @@ -1219,9 +1134,6 @@ struct cgroup_sb_opts { char *name; /* User explicitly requested empty subsystem */ bool none; - - struct cgroupfs_root *new_root; - }; /* @@ -1380,11 +1292,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return 0; } -static int cgroup_remount(struct super_block *sb, int *flags, char *data) +static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; @@ -1393,7 +1304,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; } - mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -1439,34 +1349,26 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; } -static const struct super_operations cgroup_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, -}; - static void init_cgroup_housekeeping(struct cgroup *cgrp) { + atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); 
INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - simple_xattrs_init(&cgrp->xattrs); } static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; + atomic_set(&root->refcnt, 1); INIT_LIST_HEAD(&root->root_list); root->number_of_cgroups = 1; cgrp->root = root; @@ -1475,32 +1377,12 @@ static void init_cgroup_root(struct cgroupfs_root *root) idr_init(&root->cgroup_idr); } -static int cgroup_test_super(struct super_block *sb, void *data) -{ - struct cgroup_sb_opts *opts = data; - struct cgroupfs_root *root = sb->s_fs_info; - - /* If we asked for a name then it must match */ - if (opts->name && strcmp(opts->name, root->name)) - return 0; - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match - */ - if ((opts->subsys_mask || opts->none) - && (opts->subsys_mask != root->subsys_mask)) - return 0; - - return 1; -} - static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) { struct cgroupfs_root *root; if (!opts->subsys_mask && !opts->none) - return NULL; + return ERR_PTR(-EINVAL); root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) @@ -1527,99 +1409,21 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static int cgroup_set_super(struct super_block *sb, void *data) -{ - int ret; - struct cgroup_sb_opts *opts = data; - - /* If we don't have a new root, we can't set up a new sb */ - if (!opts->new_root) - return -EINVAL; - - BUG_ON(!opts->subsys_mask && !opts->none); - - ret = set_anon_super(sb, NULL); - if (ret) - return ret; - - sb->s_fs_info = opts->new_root; - opts->new_root->sb = sb; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = CGROUP_SUPER_MAGIC; - sb->s_op = &cgroup_ops; - - return 0; -} - -static int cgroup_get_rootdir(struct super_block *sb) -{ - static const struct dentry_operations cgroup_dops = { - .d_iput = 
cgroup_diput, - .d_delete = always_delete_dentry, - }; - - struct inode *inode = - cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); - - if (!inode) - return -ENOMEM; - - inode->i_fop = &simple_dir_operations; - inode->i_op = &cgroup_dir_inode_operations; - /* directories start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - sb->s_root = d_make_root(inode); - if (!sb->s_root) - return -ENOMEM; - /* for everything else we want ->d_op set */ - sb->s_d_op = &cgroup_dops; - return 0; -} - static int cgroup_setup_root(struct cgroupfs_root *root) { LIST_HEAD(tmp_links); - struct super_block *sb = root->sb; struct cgroup *root_cgrp = &root->top_cgroup; - struct cgroupfs_root *existing_root; struct css_set *cset; - struct inode *inode; - const struct cred *cred; int i, ret; lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - BUG_ON(sb->s_root != NULL); - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - - ret = cgroup_get_rootdir(sb); - if (ret) { - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - return ret; - } - inode = sb->s_root->d_inode; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); if (ret < 0) - goto out_unlock; + goto out; root_cgrp->id = ret; - /* check for name clashes with existing mounts */ - ret = -EBUSY; - if (strlen(root->name)) - for_each_active_root(existing_root) - if (!strcmp(existing_root->name, root->name)) - goto out_unlock; - /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding @@ -1628,34 +1432,29 @@ static int cgroup_setup_root(struct cgroupfs_root *root) */ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) - goto out_unlock; + goto out; /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ ret = cgroup_init_root_id(root, 2, 0); if (ret) - goto 
out_unlock; - - sb->s_root->d_fsdata = root_cgrp; - root_cgrp->dentry = sb->s_root; + goto out; - /* - * We're inside get_sb() and will call lookup_one_len() to create - * the root files, which doesn't work if SELinux is in use. The - * following cred dancing somehow works around it. See 2ce9738ba - * ("cgroupfs: use init_cred when populating new cgroupfs mount") - * for more details. - */ - cred = override_creds(&init_cred); + root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + root_cgrp); + if (IS_ERR(root->kf_root)) { + ret = PTR_ERR(root->kf_root); + goto exit_root_id; + } + root_cgrp->kn = root->kf_root->kn; ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); if (ret) - goto rm_base_files; + goto destroy_root; ret = rebind_subsystems(root, root->subsys_mask, 0); if (ret) - goto rm_base_files; - - revert_creds(cred); + goto destroy_root; /* * There must be no failure case after here, since rebinding takes @@ -1677,15 +1476,16 @@ static int cgroup_setup_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); + kernfs_activate(root_cgrp->kn); ret = 0; - goto out_unlock; + goto out; -rm_base_files: - cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); - revert_creds(cred); +destroy_root: + kernfs_destroy_root(root->kf_root); + root->kf_root = NULL; +exit_root_id: cgroup_exit_root_id(root); -out_unlock: - mutex_unlock(&inode->i_mutex); +out: free_cgrp_cset_links(&tmp_links); return ret; } @@ -1694,10 +1494,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { - struct super_block *sb = NULL; - struct cgroupfs_root *root = NULL; + struct cgroupfs_root *root; struct cgroup_sb_opts opts; - struct cgroupfs_root *new_root; + struct dentry *dentry; int ret; mutex_lock(&cgroup_tree_mutex); @@ -1708,41 +1507,32 @@ static struct dentry *cgroup_mount(struct file_system_type 
*fs_type, if (ret) goto out_unlock; - /* - * Allocate a new cgroup root. We may not need it if we're - * reusing an existing hierarchy. - */ - new_root = cgroup_root_from_opts(&opts); - if (IS_ERR(new_root)) { - ret = PTR_ERR(new_root); - goto out_unlock; - } - opts.new_root = new_root; + /* look for a matching existing root */ + for_each_active_root(root) { + bool name_match = false; - /* Locate an existing or new sb for this hierarchy */ - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - cgroup_free_root(opts.new_root); - goto out_unlock; - } + /* + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. + */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } - root = sb->s_fs_info; - BUG_ON(!root); - if (root == opts.new_root) { - ret = cgroup_setup_root(root); - if (ret) - goto out_unlock; - } else { /* - * We re-used an existing hierarchy - the new root (if - * any) is not needed + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. 
*/ - cgroup_free_root(opts.new_root); + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { @@ -1753,23 +1543,45 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } + + cgroup_get_root(root); + goto out_unlock; } - ret = 0; + /* no such thing, create a new one */ + root = cgroup_root_from_opts(&opts); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out_unlock; + } + + ret = cgroup_setup_root(root); + if (ret) + cgroup_free_root(root); + out_unlock: mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - if (ret && !IS_ERR_OR_NULL(sb)) - deactivate_locked_super(sb); - kfree(opts.release_agent); kfree(opts.name); - if (!ret) - return dget(sb->s_root); - else + if (ret) return ERR_PTR(ret); + + dentry = kernfs_mount(fs_type, flags, root->kf_root); + if (IS_ERR(dentry)) + cgroup_put_root(root); + return dentry; +} + +static void cgroup_kill_sb(struct super_block *sb) +{ + struct kernfs_root *kf_root = kernfs_root_from_sb(sb); + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); + + cgroup_put_root(root); + kernfs_kill_sb(sb); } static struct file_system_type cgroup_fs_type = { @@ -2301,29 +2113,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) { - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - size_t max_bytes = max(cft->max_write_len, PAGE_SIZE); - char *buf; + struct cgroup 
*cgrp = of->kn->parent->priv; + struct cftype *cft = of->kn->priv; + struct cgroup_subsys_state *css; int ret; - if (nbytes > max_bytes) - return -E2BIG; - - buf = kmalloc(nbytes + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, userbuf, nbytes)) { - ret = -EFAULT; - goto out_free; - } - - buf[nbytes] = '\0'; + /* + * kernfs guarantees that a file isn't deleted with operations in + * flight, which means that the matching css is and stays alive and + * doesn't need to be pinned. The RCU locking is not necessary + * either. It's just for the convenience of using cgroup_css(). + */ + rcu_read_lock(); + css = cgroup_css(cgrp, cft->ss); + rcu_read_unlock(); if (cft->write_string) { ret = cft->write_string(css, cft, strstrip(buf)); @@ -2342,53 +2148,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, } else { ret = -EINVAL; } -out_free: - kfree(buf); + return ret ?: nbytes; } -/* - * seqfile ops/methods for returning structured data. Currently just - * supports string->u64 maps, but can be extended in future. - */ - static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { - struct cftype *cft = seq_cft(seq); - - if (cft->seq_start) { - return cft->seq_start(seq, ppos); - } else { - /* - * The same behavior and code as single_open(). Returns - * !NULL if pos is at the beginning; otherwise, NULL. - */ - return NULL + !*ppos; - } + return seq_cft(seq)->seq_start(seq, ppos); } static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { - struct cftype *cft = seq_cft(seq); - - if (cft->seq_next) { - return cft->seq_next(seq, v, ppos); - } else { - /* - * The same behavior and code as single_open(), always - * terminate after the initial read. 
- */ - ++*ppos; - return NULL; - } + return seq_cft(seq)->seq_next(seq, v, ppos); } static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - struct cftype *cft = seq_cft(seq); - - if (cft->seq_stop) - cft->seq_stop(seq, v); + seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) @@ -2408,96 +2184,36 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) return 0; } -static struct seq_operations cgroup_seq_operations = { - .start = cgroup_seqfile_start, - .next = cgroup_seqfile_next, - .stop = cgroup_seqfile_stop, - .show = cgroup_seqfile_show, +static struct kernfs_ops cgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = cgroup_file_write, + .seq_show = cgroup_seqfile_show, }; -static int cgroup_file_open(struct inode *inode, struct file *file) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); - struct cgroup_subsys_state *css; - struct cgroup_open_file *of; - int err; - - err = generic_file_open(inode, file); - if (err) - return err; - - /* - * If the file belongs to a subsystem, pin the css. Will be - * unpinned either on open failure or release. This ensures that - * @css stays alive for all file operations. - */ - rcu_read_lock(); - css = cgroup_css(cgrp, cft->ss); - if (cft->ss && !css_tryget(css)) - css = NULL; - rcu_read_unlock(); - - if (!css) - return -ENODEV; - - /* - * @cfe->css is used by read/write/close to determine the - * associated css. @file->private_data would be a better place but - * that's already used by seqfile. Multiple accessors may use it - * simultaneously which is okay as the association never changes. 
- */ - WARN_ON_ONCE(cfe->css && cfe->css != css); - cfe->css = css; - - of = __seq_open_private(file, &cgroup_seq_operations, - sizeof(struct cgroup_open_file)); - if (of) { - of->cfe = cfe; - return 0; - } - - if (css->ss) - css_put(css); - return -ENOMEM; -} - -static int cgroup_file_release(struct inode *inode, struct file *file) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - - if (css->ss) - css_put(css); - return seq_release_private(inode, file); -} +static struct kernfs_ops cgroup_kf_ops = { + .atomic_write_len = PAGE_SIZE, + .write = cgroup_file_write, + .seq_start = cgroup_seqfile_start, + .seq_next = cgroup_seqfile_next, + .seq_stop = cgroup_seqfile_stop, + .seq_show = cgroup_seqfile_show, +}; /* * cgroup_rename - Only allow simple rename of directories in place. */ -static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) { - int ret; + struct cgroup *cgrp = kn->priv; struct cgroup_name *name, *old_name; - struct cgroup *cgrp; - - /* - * It's convinient to use parent dir's i_mutex to protected - * cgrp->name. - */ - lockdep_assert_held(&old_dir->i_mutex); + int ret; - if (!S_ISDIR(old_dentry->d_inode->i_mode)) + if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; - if (new_dentry->d_inode) - return -EEXIST; - if (old_dir != new_dir) + if (kn->parent != new_parent) return -EIO; - cgrp = __d_cgrp(old_dentry); - /* * This isn't a proper migration and its usefulness is very * limited. Disallow if sane_behavior. 
@@ -2505,186 +2221,43 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, if (cgroup_sane_behavior(cgrp)) return -EPERM; - name = cgroup_alloc_name(new_dentry->d_name.name); + name = cgroup_alloc_name(new_name_str); if (!name) return -ENOMEM; - ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); - if (ret) { - kfree(name); - return ret; + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) { + old_name = rcu_dereference_protected(cgrp->name, true); + rcu_assign_pointer(cgrp->name, name); + } else { + old_name = name; } - old_name = rcu_dereference_protected(cgrp->name, true); - rcu_assign_pointer(cgrp->name, name); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); kfree_rcu(old_name, rcu_head); - return 0; -} - -static struct simple_xattrs *__d_xattrs(struct dentry *dentry) -{ - if (S_ISDIR(dentry->d_inode->i_mode)) - return &__d_cgrp(dentry)->xattrs; - else - return &__d_cfe(dentry)->xattrs; -} - -static inline int xattr_enabled(struct dentry *dentry) -{ - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - return root->flags & CGRP_ROOT_XATTR; -} - -static bool is_valid_xattr(const char *name) -{ - if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) - return true; - return false; -} - -static int cgroup_setxattr(struct dentry *dentry, const char *name, - const void *val, size_t size, int flags) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); -} - -static int cgroup_removexattr(struct dentry *dentry, const char *name) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_remove(__d_xattrs(dentry), name); -} - -static ssize_t cgroup_getxattr(struct dentry 
*dentry, const char *name, - void *buf, size_t size) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_get(__d_xattrs(dentry), name, buf, size); -} - -static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - return simple_xattr_list(__d_xattrs(dentry), buf, size); -} - -static const struct file_operations cgroup_file_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = generic_file_llseek, - .open = cgroup_file_open, - .release = cgroup_file_release, -}; - -static const struct inode_operations cgroup_file_inode_operations = { - .setxattr = cgroup_setxattr, - .getxattr = cgroup_getxattr, - .listxattr = cgroup_listxattr, - .removexattr = cgroup_removexattr, -}; - -static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .rename = cgroup_rename, - .setxattr = cgroup_setxattr, - .getxattr = cgroup_getxattr, - .listxattr = cgroup_listxattr, - .removexattr = cgroup_removexattr, -}; - -static int cgroup_create_file(struct dentry *dentry, umode_t mode, - struct super_block *sb) -{ - struct inode *inode; - - if (!dentry) - return -ENOENT; - if (dentry->d_inode) - return -EEXIST; - - inode = cgroup_new_inode(mode, sb); - if (!inode) - return -ENOMEM; - - if (S_ISDIR(mode)) { - inode->i_op = &cgroup_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - - /* start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - inc_nlink(dentry->d_parent->d_inode); - - /* - * Control reaches here with cgroup_mutex held. - * @inode->i_mutex should nest outside cgroup_mutex but we - * want to populate it immediately without releasing - * cgroup_mutex. As @inode isn't visible to anyone else - * yet, trylock will always succeed without affecting - * lockdep checks. 
- */ - WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); - } else if (S_ISREG(mode)) { - inode->i_size = 0; - inode->i_fop = &cgroup_file_operations; - inode->i_op = &cgroup_file_inode_operations; - } - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - return 0; + return ret; } static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { - struct dentry *dir = cgrp->dentry; - struct cgroup *parent = __d_cgrp(dir); - struct dentry *dentry; - struct cfent *cfe; - int error; - umode_t mode; char name[CGROUP_FILE_NAME_MAX]; + struct kernfs_node *kn; + struct lock_class_key *key = NULL; - BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); - - cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); - if (!cfe) - return -ENOMEM; - - cgroup_file_name(cgrp, cft, name); - dentry = lookup_one_len(name, dir, strlen(name)); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out; - } - - cfe->type = (void *)cft; - cfe->dentry = dentry; - dentry->d_fsdata = cfe; - simple_xattrs_init(&cfe->xattrs); - - mode = cgroup_file_mode(cft); - error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); - if (!error) { - list_add_tail(&cfe->node, &parent->files); - cfe = NULL; - } - dput(dentry); -out: - kfree(cfe); - return error; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + key = &cft->lockdep_key; +#endif + kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), + cgroup_file_mode(cft), 0, cft->kf_ops, cft, + NULL, false, key); + if (IS_ERR(kn)) + return PTR_ERR(kn); + return 0; } /** @@ -2704,7 +2277,6 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], struct cftype *cft; int ret; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_tree_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { @@ -2749,9 +2321,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->top_cgroup; - 
struct super_block *sb = ss->root->sb; struct cgroup *prev = NULL; - struct inode *inode; struct cgroup_subsys_state *css; u64 update_before; int ret = 0; @@ -2759,12 +2329,13 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) mutex_unlock(&cgroup_mutex); /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (!cfts || ss->root == &cgroup_dummy_root || - !atomic_inc_not_zero(&sb->s_active)) { + if (!cfts || ss->root == &cgroup_dummy_root) { mutex_unlock(&cgroup_tree_mutex); return 0; } + cgroup_get_root(ss->root); + /* * All cgroups which are created after we drop cgroup_mutex will * have the updated set of files, so we only need to update the @@ -2779,18 +2350,16 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) if (cgroup_is_dead(cgrp)) continue; - inode = cgrp->dentry->d_inode; cgroup_get(cgrp); if (prev) cgroup_put(prev); prev = cgrp; - mutex_unlock(&cgroup_tree_mutex); - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) + if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) { ret = cgroup_addrm_files(cgrp, cfts, is_add); - mutex_unlock(&inode->i_mutex); + if (is_add) + kernfs_activate(cgrp->kn); + } if (ret) break; } @@ -2804,16 +2373,45 @@ static void cgroup_exit_cftypes(struct cftype *cfts) { struct cftype *cft; - for (cft = cfts; cft->name[0] != '\0'; cft++) + for (cft = cfts; cft->name[0] != '\0'; cft++) { + /* free copy for custom atomic_write_len, see init_cftypes() */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) + kfree(cft->kf_ops); + cft->kf_ops = NULL; cft->ss = NULL; + } } -static void cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; - for (cft = cfts; cft->name[0] != '\0'; cft++) + for (cft = cfts; cft->name[0] != '\0'; cft++) { + struct kernfs_ops *kf_ops; + + if 
(cft->seq_start) + kf_ops = &cgroup_kf_ops; + else + kf_ops = &cgroup_kf_single_ops; + + /* + * Ugh... if @cft wants a custom max_write_len, we need to + * make a copy of kf_ops to set its atomic_write_len. + */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { + kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); + if (!kf_ops) { + cgroup_exit_cftypes(cfts); + return -ENOMEM; + } + kf_ops->atomic_write_len = cft->max_write_len; + } + + cft->kf_ops = kf_ops; cft->ss = ss; + } + + return 0; } /** @@ -2839,7 +2437,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (!set) return -ENOMEM; - cgroup_init_cftypes(ss, cfts); + ret = cgroup_init_cftypes(ss, cfts); + if (ret) + return ret; cgroup_cfts_prepare(); set->cfts = cfts; @@ -3706,21 +3306,27 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { - int ret = -EINVAL; + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup *cgrp; struct css_task_iter it; struct task_struct *tsk; + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; + /* - * Validate dentry by checking the superblock operations, - * and make sure it's a directory. + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. 
*/ - if (dentry->d_sb->s_op != &cgroup_ops || - !S_ISDIR(dentry->d_inode->i_mode)) - goto err; - - ret = 0; - cgrp = dentry->d_fsdata; + rcu_read_lock(); + cgrp = rcu_dereference(kn->priv); + if (!cgrp) { + rcu_read_unlock(); + return -ENOENT; + } css_task_iter_start(&cgrp->dummy_css, &it); while ((tsk = css_task_iter_next(&it))) { @@ -3745,8 +3351,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } css_task_iter_end(&it); -err: - return ret; + rcu_read_unlock(); + return 0; } @@ -3764,7 +3370,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; @@ -3819,7 +3425,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup_pidlist *l = of->priv; if (l) @@ -3830,7 +3436,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup_pidlist *l = of->priv; pid_t *p = v; pid_t *end = l->list + l->length; @@ -3880,21 +3486,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, return 0; } -/* - * When dput() is called asynchronously, if umount has been done and - * then deactivate_super() in cgroup_free_fn() kills the superblock, - * there's a small window that vfs will see the root dentry with non-zero - * refcnt and trigger BUG(). - * - * That's why we hold a reference before dput() and drop it right after. 
- */ -static void cgroup_dput(struct cgroup *cgrp) -{ - cgroup_get_root(cgrp->root); - cgroup_put(cgrp); - cgroup_put_root(cgrp->root); -} - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -4029,7 +3620,7 @@ static void css_free_work_fn(struct work_struct *work) css_put(css->parent); css->ss->css_free(css); - cgroup_dput(cgrp); + cgroup_put(cgrp); } static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -4037,10 +3628,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) struct cgroup_subsys_state *css = container_of(rcu_head, struct cgroup_subsys_state, rcu_head); - /* - * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context which we don't have. - */ INIT_WORK(&css->destroy_work, css_free_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } @@ -4122,7 +3709,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) struct cgroup_subsys_state *css; int err; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(cgroup_css(parent, ss)); @@ -4163,30 +3749,28 @@ err_free: return err; } -/* +/** * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup - * @dentry: dentry of the new cgroup - * @mode: mode to set on new inode - * - * Must be called with the mutex on the parent inode held + * @name_str: name of the new cgroup + * @mode: mode to set on new cgroup */ -static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - umode_t mode) +static long cgroup_create(struct cgroup *parent, const char *name_str, + umode_t mode) { struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; int ssid, err; struct cgroup_subsys *ss; - struct super_block *sb = root->sb; + struct kernfs_node *kn; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if 
(!cgrp) return -ENOMEM; - name = cgroup_alloc_name(dentry->d_name.name); + name = cgroup_alloc_name(name_str); if (!name) { err = -ENOMEM; goto err_free_cgrp; @@ -4217,18 +3801,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_unlock; } - /* Grab a reference on the superblock so the hierarchy doesn't - * get deleted on unmount if there are child cgroups. This - * can be done outside cgroup_mutex, since the sb can't - * disappear while someone has an open control file on the - * fs */ - cgroup_get_root(root); - init_cgroup_housekeeping(cgrp); - dentry->d_fsdata = cgrp; - cgrp->dentry = dentry; - cgrp->parent = parent; cgrp->dummy_css.parent = &parent->dummy_css; cgrp->root = parent->root; @@ -4239,15 +3813,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - /* - * Create directory. cgroup_create_file() returns with the new - * directory locked on success so that it can be populated without - * dropping cgroup_mutex. - */ - err = cgroup_create_file(dentry, S_IFDIR | mode, sb); - if (err < 0) + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name->name, mode, cgrp); + if (IS_ERR(kn)) { + err = PTR_ERR(kn); goto err_free_id; - lockdep_assert_held(&dentry->d_inode->i_mutex); + } + cgrp->kn = kn; cgrp->serial_nr = cgroup_serial_nr_next++; @@ -4255,7 +3827,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - /* hold a ref to the parent's dentry */ + /* + * Grab a reference on the root and parent so that they don't get + * deleted while there are child cgroups. 
+ */ + cgroup_get_root(root); cgroup_get(parent); /* @@ -4277,16 +3853,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } + kernfs_activate(kn); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); - /* Release the reference count that we took on the superblock */ - cgroup_put_root(root); err_unlock: mutex_unlock(&cgroup_mutex); err_unlock_tree: @@ -4300,16 +3875,15 @@ err_destroy: cgroup_destroy_locked(cgrp); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&dentry->d_inode->i_mutex); return err; } -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { - struct cgroup *c_parent = dentry->d_parent->d_fsdata; + struct cgroup *parent = parent_kn->priv; - /* the vfs holds inode->i_mutex already */ - return cgroup_create(c_parent, dentry, mode | S_IFDIR); + return cgroup_create(parent, name, mode); } /* @@ -4373,6 +3947,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) */ static void kill_css(struct cgroup_subsys_state *css) { + /* + * This must happen before css is disassociated with its cgroup. + * See seq_css() for details. 
+ */ cgroup_clear_dir(css->cgroup, 1 << css->ss->id); /* @@ -4421,13 +3999,12 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct dentry *d = cgrp->dentry; - struct cgroup_subsys_state *css; struct cgroup *child; + struct cgroup_subsys_state *css; + struct kernfs_node *kn; bool empty; int ssid; - lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4492,15 +4069,24 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (!cgrp->nr_css) cgroup_destroy_css_killed(cgrp); + /* remove @cgrp directory along with the base files */ + mutex_unlock(&cgroup_mutex); + /* - * Clear the base files and remove @cgrp directory. The removal - * puts the base ref but we aren't quite done with @cgrp yet, so - * hold onto it. + * There are two control paths which try to determine cgroup from + * dentry without going through kernfs - cgroupstats_build() and + * css_tryget_from_dir(). Those are supported by RCU protecting + * clearing of cgrp->kn->priv backpointer, which should happen + * after all files under it have been removed. */ - mutex_unlock(&cgroup_mutex); - cgroup_addrm_files(cgrp, cgroup_base_files, false); - dget(d); - cgroup_d_remove_dir(d); + kn = cgrp->kn; + kernfs_get(kn); + + kernfs_remove(cgrp->kn); + + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + kernfs_put(kn); + mutex_lock(&cgroup_mutex); return 0; @@ -4531,19 +4117,46 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) check_for_release(parent); } -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +static int cgroup_rmdir(struct kernfs_node *kn) { - int ret; + struct cgroup *cgrp = kn->priv; + int ret = 0; + + /* + * This is self-destruction but @kn can't be removed while this + * callback is in progress. Let's break active protection. 
Once + * the protection is broken, @cgrp can be destroyed at any point. + * Pin it so that it stays accessible. + */ + cgroup_get(cgrp); + kernfs_break_active_protection(kn); mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - ret = cgroup_destroy_locked(dentry->d_fsdata); + + /* + * @cgrp might already have been destroyed while we're trying to + * grab the mutexes. + */ + if (!cgroup_is_dead(cgrp)) + ret = cgroup_destroy_locked(cgrp); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); + kernfs_unbreak_active_protection(kn); + cgroup_put(cgrp); return ret; } +static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .remount_fs = cgroup_remount, + .show_options = cgroup_show_options, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .rename = cgroup_rename, +}; + static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; @@ -4635,11 +4248,7 @@ int __init cgroup_init(void) unsigned long key; int i, err; - err = bdi_init(&cgroup_backing_dev_info); - if (err) - return err; - - cgroup_init_cftypes(NULL, cgroup_base_files); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); for_each_subsys(ss, i) { if (!ss->early_init) @@ -4669,24 +4278,17 @@ int __init cgroup_init(void) mutex_unlock(&cgroup_mutex); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); - if (!cgroup_kobj) { - err = -ENOMEM; - goto out; - } + if (!cgroup_kobj) + return -ENOMEM; err = register_filesystem(&cgroup_fs_type); if (err < 0) { kobject_put(cgroup_kobj); - goto out; + return err; } proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); - -out: - if (err) - bdi_destroy(&cgroup_backing_dev_info); - - return err; + return 0; } static int __init cgroup_wq_init(void) @@ -5095,18 +4697,25 @@ __setup("cgroup_disable=", cgroup_disable); struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup_subsys_state 
*css = NULL; struct cgroup *cgrp; - struct cgroup_subsys_state *css; /* is @dentry a cgroup dir? */ - if (!dentry->d_inode || - dentry->d_inode->i_op != &cgroup_dir_inode_operations) + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); - cgrp = __d_cgrp(dentry); - css = cgroup_css(cgrp, ss); + /* + * This path doesn't originate from kernfs and @kn could already + * have been or be removed at any point. @kn->priv is RCU + * protected for this access. See destroy_locked() for details. + */ + cgrp = rcu_dereference(kn->priv); + if (cgrp) + css = cgroup_css(cgrp, ss); if (!css || !css_tryget(css)) css = ERR_PTR(-ENOENT); -- cgit v1.2.3 From d651aa1d68a2f0a7ee65697b04c6a92f8c0a12f2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 11 Feb 2014 13:38:54 -0500 Subject: ring-buffer: Fix first commit on sub-buffer having non-zero delta Each sub-buffer (buffer page) has a full 64 bit timestamp. The events on that page use a 27 bit delta against that timestamp in order to save on bits written to the ring buffer. If the time between events is larger than what the 27 bits can hold, a "time extend" event is added to hold the entire 64 bit timestamp again and the events after that hold a delta from that timestamp. As a "time extend" is always paired with an event, it is logical to just allocate the event with the time extend, to make things a bit more efficient. Unfortunately, when the pairing code was written, it removed the "delta = 0" from the first commit on a page, causing the events on the page to be slightly skewed. 
Fixes: 69d1b839f7ee "ring-buffer: Bind time extend and data events together" Cc: stable@vger.kernel.org # 2.6.37+ Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 294b8a271a04..fc4da2d97f9b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2397,6 +2397,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, write &= RB_WRITE_MASK; tail = write - length; + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself. + */ + if (!tail) + delta = 0; + /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, length, tail, -- cgit v1.2.3 From 86bf4b68759141459864ebd36ac3038a9cda895b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:48 -0500 Subject: cgroup: warn if "xattr" is specified with "sane_behavior" Mount option "xattr" is no longer necessary as it's enabled by default on kernfs. Warn if "xattr" is specified with "sane_behavior" so that the option can be removed in the future. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cda614da40cf..a0fab71f200f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1267,6 +1267,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); return -EINVAL; } + + if (opts->flags & CGRP_ROOT_XATTR) + pr_warning("cgroup: sane_behavior: xattr is always available, flag unnecessary\n"); } /* -- cgit v1.2.3 From 80b13586997d8e584caa772bd99e2a3e55ac6abe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:48 -0500 Subject: cgroup: relocate cgroup_rm_cftypes() cftype handling is about to be revamped. Relocate cgroup_rm_cftypes() above cgroup_add_cftypes() in preparation. This is pure relocation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 70 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a0fab71f200f..a2cbd1549995 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2417,6 +2417,41 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) return 0; } +/** + * cgroup_rm_cftypes - remove an array of cftypes from a subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Unregister @cfts. Files described by @cfts are removed from all + * existing cgroups and all future cgroups won't have them either. This + * function can be called anytime whether @cfts' subsys is attached or not. + * + * Returns 0 on successful unregistration, -ENOENT if @cfts is not + * registered. 
+ */ +int cgroup_rm_cftypes(struct cftype *cfts) +{ + struct cftype *found = NULL; + struct cftype_set *set; + + if (!cfts || !cfts[0].ss) + return -ENOENT; + + cgroup_cfts_prepare(); + + list_for_each_entry(set, &cfts[0].ss->cftsets, node) { + if (set->cfts == cfts) { + list_del(&set->node); + kfree(set); + found = cfts; + break; + } + } + + cgroup_cfts_commit(found, false); + cgroup_exit_cftypes(cfts); + return found ? 0 : -ENOENT; +} + /** * cgroup_add_cftypes - add an array of cftypes to a subsystem * @ss: target cgroup subsystem @@ -2454,41 +2489,6 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); -/** - * cgroup_rm_cftypes - remove an array of cftypes from a subsystem - * @cfts: zero-length name terminated array of cftypes - * - * Unregister @cfts. Files described by @cfts are removed from all - * existing cgroups and all future cgroups won't have them either. This - * function can be called anytime whether @cfts' subsys is attached or not. - * - * Returns 0 on successful unregistration, -ENOENT if @cfts is not - * registered. - */ -int cgroup_rm_cftypes(struct cftype *cfts) -{ - struct cftype *found = NULL; - struct cftype_set *set; - - if (!cfts || !cfts[0].ss) - return -ENOENT; - - cgroup_cfts_prepare(); - - list_for_each_entry(set, &cfts[0].ss->cftsets, node) { - if (set->cfts == cfts) { - list_del(&set->node); - kfree(set); - found = cfts; - break; - } - } - - cgroup_cfts_commit(found, false); - cgroup_exit_cftypes(cfts); - return found ? 0 : -ENOENT; -} - /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question -- cgit v1.2.3 From 0adb070426dde2fd0b84e7f4f5cefcd8f0b24410 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:48 -0500 Subject: cgroup: remove cftype_set cftype_set was added primarily to allow registering the same cftype array more than once for different subsystems. 
Nobody uses or needs such thing and it's already broken because each cftype has ->ss pointer which is initialized during registration. Let's add list_head ->node to cftype and use the first cftype entry in the array to link them instead of allocating separate cftype_set. While at it, trigger WARN if cft seems previously initialized during registration. This simplifies cftype handling a bit. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a2cbd1549995..506ebd61d1c2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1016,12 +1016,12 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) int i; for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, set->cfts, false); + list_for_each_entry(cfts, &ss->cfts, node) + cgroup_addrm_files(cgrp, cfts, false); } } @@ -2392,6 +2392,8 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; + WARN_ON(cft->ss || cft->kf_ops); + if (cft->seq_start) kf_ops = &cgroup_kf_ops; else @@ -2430,26 +2432,15 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) */ int cgroup_rm_cftypes(struct cftype *cfts) { - struct cftype *found = NULL; - struct cftype_set *set; - if (!cfts || !cfts[0].ss) return -ENOENT; cgroup_cfts_prepare(); + list_del(&cfts->node); + cgroup_cfts_commit(cfts, false); - list_for_each_entry(set, &cfts[0].ss->cftsets, node) { - if (set->cfts == cfts) { - list_del(&set->node); - kfree(set); - found = cfts; - break; - } - } - - cgroup_cfts_commit(found, false); cgroup_exit_cftypes(cfts); - return found ? 
0 : -ENOENT; + return 0; } /** @@ -2468,20 +2459,14 @@ int cgroup_rm_cftypes(struct cftype *cfts) */ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { - struct cftype_set *set; int ret; - set = kzalloc(sizeof(*set), GFP_KERNEL); - if (!set) - return -ENOMEM; - ret = cgroup_init_cftypes(ss, cfts); if (ret) return ret; cgroup_cfts_prepare(); - set->cfts = cfts; - list_add_tail(&set->node, &ss->cftsets); + list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_cfts_commit(cfts, true); if (ret) cgroup_rm_cftypes(cfts); @@ -3574,13 +3559,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* process cftsets of each subsystem */ for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) { - ret = cgroup_addrm_files(cgrp, set->cfts, true); + list_for_each_entry(cfts, &ss->cfts, node) { + ret = cgroup_addrm_files(cgrp, cfts, true); if (ret < 0) goto err; } @@ -4169,7 +4154,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - INIT_LIST_HEAD(&ss->cftsets); + INIT_LIST_HEAD(&ss->cfts); /* Create the top cgroup state for this subsystem */ ss->root = &cgroup_dummy_root; -- cgit v1.2.3 From 21a2d3430ba8c188af405a5c2eb9c06bdcb6add6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:49 -0500 Subject: cgroup: simplify dynamic cftype addition and removal Dynamic cftype addition and removal using cgroup_add/rm_cftypes() respectively has been quite hairy due to vfs i_mutex. As i_mutex nests outside cgroup_mutex, cgroup_mutex has to be released and regrabbed on each iteration through the hierarchy complicating the process. Now that i_mutex is no longer in play, it can be simplified. * Just holding cgroup_tree_mutex is enough. No need to meddle with cgroup_mutex. * No reason to play the unlock - relock - check serial_nr dancing. 
Everything can be done atomically while holding cgroup_tree_mutex. * cgroup_cfts_prepare() is replaced with direct locking of cgroup_tree_mutex. * cgroup_cfts_commit() no longer fiddles with locking. It just applies the cftypes change to the existing cgroups in the hierarchy. Renamed to cgroup_apply_cftypes(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 87 +++++++++++++++++++++------------------------------------ 1 file changed, 32 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 506ebd61d1c2..f4409715a2f5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2305,46 +2305,19 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], return 0; } -static void cgroup_cfts_prepare(void) - __acquires(&cgroup_mutex) -{ - /* - * Thanks to the entanglement with vfs inode locking, we can't walk - * the existing cgroups under cgroup_mutex and create files. - * Instead, we use css_for_each_descendant_pre() and drop RCU read - * lock before calling cgroup_addrm_files(). 
- */ - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); -} - -static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) - __releases(&cgroup_mutex) +static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->top_cgroup; - struct cgroup *prev = NULL; struct cgroup_subsys_state *css; - u64 update_before; int ret = 0; - mutex_unlock(&cgroup_mutex); + lockdep_assert_held(&cgroup_tree_mutex); - /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (!cfts || ss->root == &cgroup_dummy_root) { - mutex_unlock(&cgroup_tree_mutex); + /* don't bother if @ss isn't attached */ + if (ss->root == &cgroup_dummy_root) return 0; - } - - cgroup_get_root(ss->root); - - /* - * All cgroups which are created after we drop cgroup_mutex will - * have the updated set of files, so we only need to update the - * cgroups created before the current @cgroup_serial_nr_next. - */ - update_before = cgroup_serial_nr_next; /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { @@ -2353,22 +2326,13 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) if (cgroup_is_dead(cgrp)) continue; - cgroup_get(cgrp); - if (prev) - cgroup_put(prev); - prev = cgrp; - - if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) { - ret = cgroup_addrm_files(cgrp, cfts, is_add); - if (is_add) - kernfs_activate(cgrp->kn); - } + ret = cgroup_addrm_files(cgrp, cfts, is_add); if (ret) break; } - mutex_unlock(&cgroup_tree_mutex); - cgroup_put(prev); - cgroup_put_root(ss->root); + + if (is_add && !ret) + kernfs_activate(root->kn); return ret; } @@ -2419,6 +2383,19 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) return 0; } +static int cgroup_rm_cftypes_locked(struct cftype *cfts) +{ + lockdep_assert_held(&cgroup_tree_mutex); + + if (!cfts || !cfts[0].ss) + return -ENOENT; + + 
list_del(&cfts->node); + cgroup_apply_cftypes(cfts, false); + cgroup_exit_cftypes(cfts); + return 0; +} + /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem * @cfts: zero-length name terminated array of cftypes @@ -2432,15 +2409,12 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) */ int cgroup_rm_cftypes(struct cftype *cfts) { - if (!cfts || !cfts[0].ss) - return -ENOENT; - - cgroup_cfts_prepare(); - list_del(&cfts->node); - cgroup_cfts_commit(cfts, false); + int ret; - cgroup_exit_cftypes(cfts); - return 0; + mutex_lock(&cgroup_tree_mutex); + ret = cgroup_rm_cftypes_locked(cfts); + mutex_unlock(&cgroup_tree_mutex); + return ret; } /** @@ -2465,11 +2439,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (ret) return ret; - cgroup_cfts_prepare(); + mutex_lock(&cgroup_tree_mutex); + list_add_tail(&cfts->node, &ss->cfts); - ret = cgroup_cfts_commit(cfts, true); + ret = cgroup_apply_cftypes(cfts, true); if (ret) - cgroup_rm_cftypes(cfts); + cgroup_rm_cftypes_locked(cfts); + + mutex_unlock(&cgroup_tree_mutex); return ret; } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); -- cgit v1.2.3 From 6f30558f37bfbd428e3854c2c34b5c32117c8f7e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:50 -0500 Subject: cgroup: make cgroup hold onto its kernfs_node cgroup currently releases its kernfs_node when it gets removed. While not buggy, this makes cgroup->kn access rules more complicated than necessary and leads to things like get/put protection around kernfs_remove() in cgroup_destroy_locked(). In addition, we want to use kernfs_name/path() and friends but also want to be able to determine a cgroup's name between removal and release. This patch makes cgroup hold onto its kernfs_node until freed so that cgroup->kn is always accessible. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f4409715a2f5..59dfb025f1ac 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -957,6 +957,8 @@ static void cgroup_free_fn(struct work_struct *work) cgroup_pidlist_destroy_all(cgrp); + kernfs_put(cgrp->kn); + kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } @@ -3786,6 +3788,12 @@ static long cgroup_create(struct cgroup *parent, const char *name_str, } cgrp->kn = kn; + /* + * This extra ref will be put in cgroup_free_fn() and guarantees + * that @cgrp->kn is always accessible. + */ + kernfs_get(kn); + cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ @@ -3966,7 +3974,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) { struct cgroup *child; struct cgroup_subsys_state *css; - struct kernfs_node *kn; bool empty; int ssid; @@ -4044,13 +4051,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * clearing of cgrp->kn->priv backpointer, which should happen * after all files under it have been removed. */ - kn = cgrp->kn; - kernfs_get(kn); - - kernfs_remove(cgrp->kn); - + kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); - kernfs_put(kn); mutex_lock(&cgroup_mutex); -- cgit v1.2.3 From e61734c55c24cdf11b07e52a74aec4dc4a7f4bd0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:50 -0500 Subject: cgroup: remove cgroup->name cgroup->name handling became quite complicated over time involving dedicated struct cgroup_name for RCU protection. Now that cgroup is on kernfs, we can drop all of it and simply use kernfs_name/path() and friends. Replace cgroup->name and all related code with kernfs name/path constructs. 
* Reimplement cgroup_name() and cgroup_path() as thin wrappers on top of kernfs counterparts, which involves semantic changes. pr_cont_cgroup_name() and pr_cont_cgroup_path() added. * cgroup->name handling dropped from cgroup_rename(). * All users of cgroup_name/path() updated to the new semantics. Users which were formatting the string just to printk them are converted to use pr_cont_cgroup_name/path() instead, which simplifies things quite a bit. As cgroup_name() no longer requires RCU read lock around it, RCU lockings which were protecting only cgroup_name() are removed. v2: Comment above oom_info_lock updated as suggested by Michal. v3: dummy_top doesn't have a kn associated and pr_cont_cgroup_name/path() ended up calling the matching kernfs functions with NULL kn leading to oops. Test for NULL kn and print "/" if so. This issue was reported by Fengguang Wu. v4: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to cgroup_idr with cgroup_mutex"). Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Fengguang Wu Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- kernel/cgroup.c | 146 ++++++++++++--------------------------------------- kernel/cpuset.c | 27 +++++----- kernel/sched/debug.c | 3 +- 3 files changed, 47 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 59dfb025f1ac..638df032fb94 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -145,8 +145,6 @@ static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); -static struct cgroup_name root_cgroup_name = { .name = "/" }; - /* * Assign a monotonically increasing serial number to cgroups. 
It * guarantees cgroups with bigger numbers are newer than those with smaller @@ -888,17 +886,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; -static struct cgroup_name *cgroup_alloc_name(const char *name_str) -{ - struct cgroup_name *name; - - name = kmalloc(sizeof(*name) + strlen(name_str) + 1, GFP_KERNEL); - if (!name) - return NULL; - strcpy(name->name, name_str); - return name; -} - static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { @@ -958,8 +945,6 @@ static void cgroup_free_fn(struct work_struct *work) cgroup_pidlist_destroy_all(cgrp); kernfs_put(cgrp->kn); - - kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } @@ -1377,7 +1362,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->root_list); root->number_of_cgroups = 1; cgrp->root = root; - RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); } @@ -1597,57 +1581,6 @@ static struct file_system_type cgroup_fs_type = { static struct kobject *cgroup_kobj; -/** - * cgroup_path - generate the path of a cgroup - * @cgrp: the cgroup in question - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Writes path of cgroup into buf. Returns 0 on success, -errno on error. - * - * We can't generate cgroup path using dentry->d_name, as accessing - * dentry->name must be protected by irq-unsafe dentry->d_lock or parent - * inode's i_mutex, while on the other hand cgroup_path() can be called - * with some irq-safe spinlocks held. 
- */ -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) -{ - int ret = -ENAMETOOLONG; - char *start; - - if (!cgrp->parent) { - if (strlcpy(buf, "/", buflen) >= buflen) - return -ENAMETOOLONG; - return 0; - } - - start = buf + buflen - 1; - *start = '\0'; - - rcu_read_lock(); - do { - const char *name = cgroup_name(cgrp); - int len; - - len = strlen(name); - if ((start -= len) < buf) - goto out; - memcpy(start, name, len); - - if (--start < buf) - goto out; - *start = '/'; - - cgrp = cgrp->parent; - } while (cgrp->parent); - ret = 0; - memmove(buf, start, buf + buflen - start); -out: - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_path); - /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -1659,16 +1592,14 @@ EXPORT_SYMBOL_GPL(cgroup_path); * function grabs cgroup_mutex and shouldn't be used inside locks used by * cgroup controller callbacks. * - * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. + * Return value is the same as kernfs_path(). 
*/ -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroupfs_root *root; struct cgroup *cgrp; - int hierarchy_id = 1, ret = 0; - - if (buflen < 2) - return -ENAMETOOLONG; + int hierarchy_id = 1; + char *path = NULL; mutex_lock(&cgroup_mutex); @@ -1676,14 +1607,15 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - ret = cgroup_path(cgrp, buf, buflen); + path = cgroup_path(cgrp, buf, buflen); } else { /* if no hierarchy exists, everyone is in "/" */ - memcpy(buf, "/", 2); + if (strlcpy(buf, "/", buflen) < buflen) + path = buf; } mutex_unlock(&cgroup_mutex); - return ret; + return path; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -2211,7 +2143,6 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name_str) { struct cgroup *cgrp = kn->priv; - struct cgroup_name *name, *old_name; int ret; if (kernfs_type(kn) != KERNFS_DIR) @@ -2226,25 +2157,13 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, if (cgroup_sane_behavior(cgrp)) return -EPERM; - name = cgroup_alloc_name(new_name_str); - if (!name) - return -ENOMEM; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); - if (!ret) { - old_name = rcu_dereference_protected(cgrp->name, true); - rcu_assign_pointer(cgrp->name, name); - } else { - old_name = name; - } mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - - kfree_rcu(old_name, rcu_head); return ret; } @@ -3719,14 +3638,13 @@ err_free: /** * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup - * @name_str: name of the new cgroup + * @name: name of the new cgroup * @mode: mode to set on new cgroup */ -static long cgroup_create(struct cgroup *parent, const char *name_str, +static long 
cgroup_create(struct cgroup *parent, const char *name, umode_t mode) { struct cgroup *cgrp; - struct cgroup_name *name; struct cgroupfs_root *root = parent->root; int ssid, err; struct cgroup_subsys *ss; @@ -3737,13 +3655,6 @@ static long cgroup_create(struct cgroup *parent, const char *name_str, if (!cgrp) return -ENOMEM; - name = cgroup_alloc_name(name_str); - if (!name) { - err = -ENOMEM; - goto err_free_cgrp; - } - rcu_assign_pointer(cgrp->name, name); - mutex_lock(&cgroup_tree_mutex); /* @@ -3781,7 +3692,7 @@ static long cgroup_create(struct cgroup *parent, const char *name_str, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); /* create the directory */ - kn = kernfs_create_dir(parent->kn, name->name, mode, cgrp); + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { err = PTR_ERR(kn); goto err_free_id; @@ -3839,8 +3750,6 @@ err_unlock: mutex_unlock(&cgroup_mutex); err_unlock_tree: mutex_unlock(&cgroup_tree_mutex); - kfree(rcu_dereference_raw(cgrp->name)); -err_free_cgrp: kfree(cgrp); return err; @@ -4304,12 +4213,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) { struct pid *pid; struct task_struct *tsk; - char *buf; + char *buf, *path; int retval; struct cgroupfs_root *root; retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; @@ -4337,10 +4246,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) root->name); seq_putc(m, ':'); cgrp = task_cgroup_from_root(tsk, root); - retval = cgroup_path(cgrp, buf, PAGE_SIZE); - if (retval < 0) + path = cgroup_path(cgrp, buf, PATH_MAX); + if (!path) { + retval = -ENAMETOOLONG; goto out_unlock; - seq_puts(m, buf); + } + seq_puts(m, path); seq_putc(m, '\n'); } @@ -4588,16 +4499,17 @@ static void cgroup_release_agent(struct work_struct *work) while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; - char *pathbuf = NULL, *agentbuf = NULL; + char *pathbuf = NULL, *agentbuf = NULL, *path; struct cgroup *cgrp = 
list_entry(release_list.next, struct cgroup, release_list); list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) goto continue_free; - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) + path = cgroup_path(cgrp, pathbuf, PATH_MAX); + if (!path) goto continue_free; agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); if (!agentbuf) @@ -4605,7 +4517,7 @@ static void cgroup_release_agent(struct work_struct *work) i = 0; argv[i++] = agentbuf; - argv[i++] = pathbuf; + argv[i++] = path; argv[i] = NULL; i = 0; @@ -4755,6 +4667,11 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) { struct cgrp_cset_link *link; struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; read_lock(&css_set_lock); rcu_read_lock(); @@ -4763,14 +4680,17 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) struct cgroup *c = link->cgrp; const char *name = "?"; - if (c != cgroup_dummy_top) - name = cgroup_name(c); + if (c != cgroup_dummy_top) { + cgroup_name(c, name_buf, NAME_MAX + 1); + name = name_buf; + } seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name); } rcu_read_unlock(); read_unlock(&css_set_lock); + kfree(name_buf); return 0; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2d018c795fea..e97a6e88d036 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2088,10 +2088,9 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) parent = parent_cs(parent); if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - rcu_read_lock(); - printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", - cgroup_name(cs->css.cgroup)); - rcu_read_unlock(); + printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); + pr_cont_cgroup_name(cs->css.cgroup); + pr_cont("\n"); } } @@ -2619,19 +2618,17 @@ 
void cpuset_print_task_mems_allowed(struct task_struct *tsk) /* Statically allocated to prevent using excess stack. */ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); - struct cgroup *cgrp = task_cs(tsk)->css.cgroup; - rcu_read_lock(); spin_lock(&cpuset_buffer_lock); nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); - printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", - tsk->comm, cgroup_name(cgrp), cpuset_nodelist); + printk(KERN_INFO "%s cpuset=", tsk->comm); + pr_cont_cgroup_name(cgrp); + pr_cont(" mems_allowed=%s\n", cpuset_nodelist); spin_unlock(&cpuset_buffer_lock); - rcu_read_unlock(); } /* @@ -2681,12 +2678,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) { struct pid *pid; struct task_struct *tsk; - char *buf; + char *buf, *p; struct cgroup_subsys_state *css; int retval; retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; @@ -2696,14 +2693,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!tsk) goto out_free; + retval = -ENAMETOOLONG; rcu_read_lock(); css = task_css(tsk, cpuset_cgrp_id); - retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); + p = cgroup_path(css->cgroup, buf, PATH_MAX); rcu_read_unlock(); - if (retval < 0) + if (!p) goto out_put_task; - seq_puts(m, buf); + seq_puts(m, p); seq_putc(m, '\n'); + retval = 0; out_put_task: put_task_struct(tsk); out_free: diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index dd52e7ffb10e..30eee3b5293d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); - return group_path; + return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); } #endif -- cgit v1.2.3 From 3c9c825b8b50de7dbb015e6bfc04bb2da79364d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo 
Date: Wed, 12 Feb 2014 09:29:50 -0500 Subject: cgroup: rename cgroupfs_root->number_of_cgroups to ->nr_cgrps and make it atomic_t root->number_of_cgroups is currently an integer protected with cgroup_mutex. Except for sanity checks and proc reporting, the only place it's used is to check whether the root has any child during remount; however, this is a bit flawed as the counter is not decremented when the cgroup is unlinked but when it's released, meaning that there could be an extended period where all cgroups are removed but remount is still not allowed because some internal objects are lingering. While not perfect either, it'd be better to use an emptiness test on root->top_cgroup.children. This patch updates cgroup_remount() to test top_cgroup's children instead, which makes number_of_cgroups' only actual usage the statistics printing in proc implemented in proc_cgroupstats_show(). Let's shorten its name and make it an atomic_t so that we don't have to worry about its synchronization. It's purely auxiliary at this point. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 638df032fb94..cffdb6e2ad08 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -758,7 +758,7 @@ static void cgroup_put_root(struct cgroupfs_root *root) } mutex_lock(&cgroup_mutex); - BUG_ON(root->number_of_cgroups != 1); + BUG_ON(atomic_read(&root->nr_cgrps) != 1); BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ @@ -928,9 +928,7 @@ static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - mutex_lock(&cgroup_mutex); - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); + atomic_dec(&cgrp->root->nr_cgrps); /* * We get a ref to the parent, and put the ref when this cgroup is @@ -1320,7 +1318,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (root->number_of_cgroups > 1) { + if (!list_empty(&root->top_cgroup.children)) { ret = -EBUSY; goto out_unlock; } @@ -1360,7 +1358,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) atomic_set(&root->refcnt, 1); INIT_LIST_HEAD(&root->root_list); - root->number_of_cgroups = 1; + atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); @@ -1463,7 +1461,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root) write_unlock(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->children)); - BUG_ON(root->number_of_cgroups != 1); + BUG_ON(atomic_read(&root->nr_cgrps) != 1); kernfs_activate(root_cgrp->kn); ret = 0; @@ -3709,7 +3707,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); - root->number_of_cgroups++; + 
atomic_inc(&root->nr_cgrps); /* * Grab a reference on the root and parent so that they don't get @@ -4281,7 +4279,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->name, ss->root->hierarchy_id, - ss->root->number_of_cgroups, !ss->disabled); + atomic_read(&ss->root->nr_cgrps), !ss->disabled); mutex_unlock(&cgroup_mutex); return 0; -- cgit v1.2.3 From 776f02fa4e1ad70557c0318c70ce928e0642bee0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:50 -0500 Subject: cgroup: remove cgroupfs_root->refcnt Currently, cgroupfs_root and its ->top_cgroup are separately reference counted and the latter's is ignored. There's no reason to do this separately. This patch removes cgroupfs_root->refcnt and destroys cgroupfs_root when the top_cgroup is released. * cgroup_put() updated to ignore cgroup_is_dead() test for top cgroups. cgroup_free_fn() updated to handle root destruction when releasing a top cgroup. * As root destruction is now bounced through cgroup destruction, it is asynchronous. Update cgroup_mount() so that it waits for pending release which is currently implemented using msleep(). Converting this to a proper wait_queue isn't hard but likely unnecessary. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 86 +++++++++++++++++++++++++-------------------------------- 1 file changed, 38 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cffdb6e2ad08..03845c5d082b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -53,6 +53,7 @@ #include /* TODO: replace with more sophisticated array */ #include /* used in cgroup_attach_task */ #include +#include #include @@ -728,37 +729,16 @@ static void cgroup_free_root(struct cgroupfs_root *root) } } -static void cgroup_get_root(struct cgroupfs_root *root) -{ - /* - * The caller must ensure that @root is alive, which can be - * achieved by holding a ref on one of the member cgroups or - * following a registered reference to @root while holding - * cgroup_tree_mutex. - */ - WARN_ON_ONCE(atomic_read(&root->refcnt) <= 0); - atomic_inc(&root->refcnt); -} - -static void cgroup_put_root(struct cgroupfs_root *root) +static void cgroup_destroy_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; int ret; - /* - * @root's refcnt reaching zero and its deregistration should be - * atomic w.r.t. cgroup_tree_mutex. This ensures that - * cgroup_get_root() is safe to invoke if @root is registered. 
- */ mutex_lock(&cgroup_tree_mutex); - if (!atomic_dec_and_test(&root->refcnt)) { - mutex_unlock(&cgroup_tree_mutex); - return; - } mutex_lock(&cgroup_mutex); - BUG_ON(atomic_read(&root->nr_cgrps) != 1); + BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ @@ -929,21 +909,24 @@ static void cgroup_free_fn(struct work_struct *work) struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); atomic_dec(&cgrp->root->nr_cgrps); - - /* - * We get a ref to the parent, and put the ref when this cgroup is - * being freed, so it's guaranteed that the parent won't be - * destroyed before its children. - */ - cgroup_put(cgrp->parent); - - /* put the root reference that we took when we created the cgroup */ - cgroup_put_root(cgrp->root); - cgroup_pidlist_destroy_all(cgrp); - kernfs_put(cgrp->kn); - kfree(cgrp); + if (cgrp->parent) { + /* + * We get a ref to the parent, and put the ref when this + * cgroup is being freed, so it's guaranteed that the + * parent won't be destroyed before its children. + */ + cgroup_put(cgrp->parent); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is top cgroup's refcnt reaching zero, which + * indicates that the root should be released. 
+ */ + cgroup_destroy_root(cgrp->root); + } } static void cgroup_free_rcu(struct rcu_head *head) @@ -965,7 +948,7 @@ static void cgroup_put(struct cgroup *cgrp) { if (!atomic_dec_and_test(&cgrp->refcnt)) return; - if (WARN_ON_ONCE(!cgroup_is_dead(cgrp))) + if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) return; /* @@ -1356,7 +1339,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; - atomic_set(&root->refcnt, 1); INIT_LIST_HEAD(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; @@ -1485,7 +1467,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroup_sb_opts opts; struct dentry *dentry; int ret; - +retry: mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -1531,7 +1513,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, } } - cgroup_get_root(root); + /* + * A root's lifetime is governed by its top cgroup. Zero + * ref indicate that the root is being destroyed. Wait for + * destruction to complete so that the subsystems are free. + * We can use wait_queue for the wait but this path is + * super cold. Let's just sleep for a bit and retry. 
+ */ + if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) { + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + msleep(10); + goto retry; + } + + ret = 0; goto out_unlock; } @@ -1558,7 +1554,7 @@ out_unlock: dentry = kernfs_mount(fs_type, flags, root->kf_root); if (IS_ERR(dentry)) - cgroup_put_root(root); + cgroup_put(&root->top_cgroup); return dentry; } @@ -1567,7 +1563,7 @@ static void cgroup_kill_sb(struct super_block *sb) struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); - cgroup_put_root(root); + cgroup_put(&root->top_cgroup); kernfs_kill_sb(sb); } @@ -3708,12 +3704,6 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); atomic_inc(&root->nr_cgrps); - - /* - * Grab a reference on the root and parent so that they don't get - * deleted while there are child cgroups. - */ - cgroup_get_root(root); cgroup_get(parent); /* -- cgit v1.2.3 From 1a11533fbd71792e8c5d36f6763fbce8df0d231d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 19:06:19 -0500 Subject: Revert "cgroup: use an ordered workqueue for cgroup destruction" This reverts commit ab3f5faa6255a0eb4f832675507d9e295ca7e9ba. Explanation from Hugh: It's because more thorough testing, by others here, found that it wasn't always solving the problem: so I asked Tejun privately to hold off from sending it in, until we'd worked out why not. Most of our testing being on a v3.11-based kernel, it was perfectly possible that the problem was merely our own e.g. missing Tejun's 8a2b75384444 ("workqueue: fix ordered workqueues in NUMA setups"). But that turned out not to be enough to fix it either. Then Filipe pointed out how percpu_ref_kill_and_confirm() uses call_rcu_sched() before we ever get to put the offline on to the workqueue: by the time we get to the workqueue, the ordering has already been lost. 
So, thanks for the Acks, but I'm afraid that this ordered workqueue solution is just not good enough: we should simply forget that patch and provide a different answer." Signed-off-by: Tejun Heo Cc: Hugh Dickins --- kernel/cgroup.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 52719ce55dd3..68d87103b493 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4844,16 +4844,12 @@ static int __init cgroup_wq_init(void) /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. - * - * XXX: Must be ordered to make sure parent is offlined after - * children. The ordering requirement is for memcg where a - * parent's offline may wait for a child's leading to deadlock. In - * the long term, this should be fixed from memcg side. + * Use 1 for @max_active. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0); + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); /* -- cgit v1.2.3 From d3ba07c3aa9ae3e03329b0a7f1a067c0647aa2af Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:38 -0500 Subject: cgroup: disallow xattr, release_agent and name if sane_behavior Disallow more mount options if sane_behavior. Note that xattr used to generate warning. While at it, simplify option check in cgroup_mount() and update sane_behavior comment in cgroup.h. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 03845c5d082b..079c478a4735 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1226,18 +1226,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (opts->flags & CGRP_ROOT_NOPREFIX) { - pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); + if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || + opts->cpuset_clone_children || opts->release_agent || + opts->name) { + pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } - - if (opts->cpuset_clone_children) { - pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); - return -EINVAL; - } - - if (opts->flags & CGRP_ROOT_XATTR) - pr_warning("cgroup: sane_behavior: xattr is always available, flag unnecessary\n"); } /* -- cgit v1.2.3 From 35585573055f37837eb752ee22eb5523682ca742 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:38 -0500 Subject: cgroup: drop CGRP_ROOT_SUBSYS_BOUND Before kernfs conversion, due to the way super_block lookup works, cgroup roots were created and made visible before being fully initialized. This in turn required a special flag to mark that the root hasn't been fully initialized so that the destruction path can tell fully bound ones from half initialized. That flag is CGRP_ROOT_SUBSYS_BOUND and is no longer necessary after the kernfs conversion as the lookup and creation of new root are atomic w.r.t. cgroup_mutex. This patch removes the flag and passes the requested subsystem mask to cgroup_setup_root() so that it can set the respective mask bits as subsystems are bound. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 079c478a4735..878cd1810ad1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -733,7 +733,6 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; - int ret; mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -742,11 +741,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { - ret = rebind_subsystems(root, 0, root->subsys_mask); - /* Shouldn't be able to fail ... */ - BUG_ON(ret); - } + WARN_ON(rebind_subsystems(root, 0, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's @@ -1055,13 +1050,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, } } - /* - * Mark @root has finished binding subsystems. @root->subsys_mask - * now matches the bound subsystems. - */ - root->flags |= CGRP_ROOT_SUBSYS_BOUND; kernfs_activate(cgrp->kn); - return 0; } @@ -1353,15 +1342,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) init_cgroup_root(root); - /* - * We need to set @root->subsys_mask now so that @root can be - * matched by cgroup_test_super() before it finishes - * initialization; otherwise, competing mounts with the same - * options may try to bind the same subsystems instead of waiting - * for the first one leading to unexpected mount errors. - * SUBSYS_BOUND will be set once actual binding is complete. 
- */ - root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); @@ -1372,7 +1352,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static int cgroup_setup_root(struct cgroupfs_root *root) +static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->top_cgroup; @@ -1415,7 +1395,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root) if (ret) goto destroy_root; - ret = rebind_subsystems(root, root->subsys_mask, 0); + ret = rebind_subsystems(root, ss_mask, 0); if (ret) goto destroy_root; @@ -1532,7 +1512,7 @@ retry: goto out_unlock; } - ret = cgroup_setup_root(root); + ret = cgroup_setup_root(root, opts.subsys_mask); if (ret) cgroup_free_root(root); -- cgit v1.2.3 From 56fde9e01de45bcfabbb444d33e8bdd8388d2da0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:38 -0500 Subject: cgroup: enable task_cg_lists on the first cgroup mount Tasks are not linked on their css_sets until cgroup task iteration is actually used. This is to avoid incurring overhead on the fork and exit paths for systems which have cgroup compiled in but don't use it. This lazy binding also affects the task migration path. It has to be careful so that it doesn't link tasks to css_sets when task_cg_lists linking is not enabled yet. Unfortunately, this conditional linking in the migration path interferes with planned migration updates. This patch moves the lazy binding a bit earlier, to the first cgroup mount. It's a clear indication that cgroup is being used on the system and task_cg_lists linking is highly likely to be enabled soon anyway through "tasks" and "cgroup.procs" files. This allows cgroup_task_migrate() to always link @tsk->cg_list. Note that it may still race with cgroup_post_fork() but who wins that race is inconsequential. 
While at it, make use_task_css_set_links a bool, add sanity checks in cgroup_enable_task_cg_lists() and css_task_iter_start(), and update the former so that it's guaranteed and assumes to run only once. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 878cd1810ad1..506f6da67ad1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -173,6 +173,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); +static void cgroup_enable_task_cg_lists(void); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -375,7 +376,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) * fork()/exit() overhead for people who have cgroups compiled into their * kernel but not actually in use. */ -static int use_task_css_set_links __read_mostly; +static bool use_task_css_set_links __read_mostly; static void __put_css_set(struct css_set *cset, int taskexit) { @@ -1441,6 +1442,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroup_sb_opts opts; struct dentry *dentry; int ret; + + /* + * The first time anyone tries to mount a cgroup, enable the list + * linking each css_set to its tasks and fix up all existing tasks. 
+ */ + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); retry: mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -1692,10 +1700,8 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); - /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &new_cset->tasks); + list_move(&tsk->cg_list, &new_cset->tasks); write_unlock(&css_set_lock); /* @@ -2362,13 +2368,19 @@ int cgroup_task_count(const struct cgroup *cgrp) * To reduce the fork() overhead for systems that are not actually using * their cgroups capability, we don't maintain the lists running through * each css_set to its tasks until we see the list actually used - in other - * words after the first call to css_task_iter_start(). + * words after the first mount. */ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; + write_lock(&css_set_lock); - use_task_css_set_links = 1; + + if (use_task_css_set_links) + goto out_unlock; + + use_task_css_set_links = true; + /* * We need tasklist_lock because RCU is not safe against * while_each_thread(). Besides, a forking task that has passed @@ -2379,16 +2391,22 @@ static void cgroup_enable_task_cg_lists(void) read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); + + WARN_ON_ONCE(!list_empty(&p->cg_list) || + task_css_set(p) != &init_css_set); + /* * We should check if the process is exiting, otherwise * it will race with cgroup_exit() in that the list * entry won't be deleted though the process has exited. 
*/ - if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) + if (!(p->flags & PF_EXITING)) list_add(&p->cg_list, &task_css_set(p)->tasks); + task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); +out_unlock: write_unlock(&css_set_lock); } @@ -2621,13 +2639,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it) __acquires(css_set_lock) { - /* - * The first time anyone tries to iterate across a css, we need to - * enable the list linking each css_set to its tasks, and fix up - * all existing tasks. - */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); + /* no one should try to iterate before mounting cgroups */ + WARN_ON_ONCE(!use_task_css_set_links); read_lock(&css_set_lock); -- cgit v1.2.3 From afeb0f9fd425239aa477c842480f240bfb6325b3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:39 -0500 Subject: cgroup: relocate cgroup_enable_task_cg_lists() Move it above so that prototype isn't necessary. Let's also move the definition of use_task_css_set_links next to it. This is purely cosmetic. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 103 ++++++++++++++++++++++++++------------------------------ 1 file changed, 48 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 506f6da67ad1..2469699408bd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -173,7 +173,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); -static void cgroup_enable_task_cg_lists(void); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -370,14 +369,6 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -/* - * We don't maintain the lists running through each css_set to its task - * until after the first call to css_task_iter_start(). This reduces the - * fork()/exit() overhead for people who have cgroups compiled into their - * kernel but not actually in use. - */ -static bool use_task_css_set_links __read_mostly; - static void __put_css_set(struct css_set *cset, int taskexit) { struct cgrp_cset_link *link, *tmp_link; @@ -1307,6 +1298,54 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) return ret; } +/* + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first mount. + */ +static bool use_task_css_set_links __read_mostly; + +static void cgroup_enable_task_cg_lists(void) +{ + struct task_struct *p, *g; + + write_lock(&css_set_lock); + + if (use_task_css_set_links) + goto out_unlock; + + use_task_css_set_links = true; + + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). 
Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. + */ + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + + WARN_ON_ONCE(!list_empty(&p->cg_list) || + task_css_set(p) != &init_css_set); + + /* + * We should check if the process is exiting, otherwise + * it will race with cgroup_exit() in that the list + * entry won't be deleted though the process has exited. + */ + if (!(p->flags & PF_EXITING)) + list_add(&p->cg_list, &task_css_set(p)->tasks); + + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +out_unlock: + write_unlock(&css_set_lock); +} + static void init_cgroup_housekeeping(struct cgroup *cgrp) { atomic_set(&cgrp->refcnt, 1); @@ -2364,52 +2403,6 @@ int cgroup_task_count(const struct cgroup *cgrp) return count; } -/* - * To reduce the fork() overhead for systems that are not actually using - * their cgroups capability, we don't maintain the lists running through - * each css_set to its tasks until we see the list actually used - in other - * words after the first mount. - */ -static void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - - write_lock(&css_set_lock); - - if (use_task_css_set_links) - goto out_unlock; - - use_task_css_set_links = true; - - /* - * We need tasklist_lock because RCU is not safe against - * while_each_thread(). Besides, a forking task that has passed - * cgroup_post_fork() without seeing use_task_css_set_links = 1 - * is not guaranteed to have its child immediately visible in the - * tasklist if we walk through it with RCU. 
- */ - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - - WARN_ON_ONCE(!list_empty(&p->cg_list) || - task_css_set(p) != &init_css_set); - - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - */ - if (!(p->flags & PF_EXITING)) - list_add(&p->cg_list, &task_css_set(p)->tasks); - - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); -out_unlock: - write_unlock(&css_set_lock); -} - /** * css_next_child - find the next child of a given css * @pos_css: the current position (%NULL to initiate traversal) -- cgit v1.2.3 From 07bc356ed2950048d33d667e933e1b913c6e6b6d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:39 -0500 Subject: cgroup: implement cgroup_has_tasks() and unexport cgroup_task_count() cgroup_task_count() read-locks css_set_lock and walks all tasks to count them and then returns the result. The only thing all the users want is determining whether the cgroup is empty or not. This patch implements cgroup_has_tasks() which tests whether cgroup->cset_links is empty, replaces all cgroup_task_count() usages and unexports it. Note that the test isn't synchronized. This is the same as before. The test has always been racy. This will help planned css_set locking update. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- kernel/cgroup.c | 2 +- kernel/cpuset.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2469699408bd..ec7746e5ded1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2391,7 +2391,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); * * Return the number of tasks in the cgroup. 
*/ -int cgroup_task_count(const struct cgroup *cgrp) +static int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e97a6e88d036..ae190b0a196a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -467,7 +467,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { + if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; -- cgit v1.2.3 From e406d1cfff6ab189c8676072d211809c94fecaf0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:39 -0500 Subject: cgroup: reimplement cgroup_transfer_tasks() without using css_scan_tasks() Reimplement cgroup_transfer_tasks() so that it repeatedly fetches the first task in the cgroup and then transfers it. This achieves the same result without using css_scan_tasks() which is scheduled to be removed. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ec7746e5ded1..893b7b502e18 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2850,15 +2850,6 @@ int css_scan_tasks(struct cgroup_subsys_state *css, return 0; } -static void cgroup_transfer_one_task(struct task_struct *task, void *data) -{ - struct cgroup *new_cgroup = data; - - mutex_lock(&cgroup_mutex); - cgroup_attach_task(new_cgroup, task, false); - mutex_unlock(&cgroup_mutex); -} - /** * cgroup_trasnsfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved @@ -2866,8 +2857,26 @@ static void cgroup_transfer_one_task(struct task_struct *task, void *data) */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, - to, NULL); + struct css_task_iter it; + struct task_struct *task; + int ret = 0; + + do { + css_task_iter_start(&from->dummy_css, &it); + task = css_task_iter_next(&it); + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + mutex_lock(&cgroup_mutex); + ret = cgroup_attach_task(to, task, false); + mutex_unlock(&cgroup_mutex); + put_task_struct(task); + } + } while (task && !ret); + + return ret; } /* -- cgit v1.2.3 From 96d365e0b86ee7ec6366c99669687e54c9f145e3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:40 -0500 Subject: cgroup: make css_set_lock a rwsem and rename it to css_set_rwsem Currently there are two ways to walk tasks of a cgroup - css_task_iter_start/next/end() and css_scan_tasks(). The latter builds on the former but allows blocking while iterating. 
Unfortunately, the way css_scan_tasks() is implemented is rather nasty, it uses a priority heap of pointers to extract some number of tasks in task creation order and loops over them invoking the callback and repeats that until it reaches the end. It requires either preallocated heap or may fail under memory pressure, while unlikely to be problematic, the complexity is O(N^2), and in general just nasty. We're gonna convert all css_scan_tasks() users to css_task_iter_start/next/end() and remove css_scan_tasks(). As css_scan_tasks() users may block, let's convert css_set_lock to a rwsem so that tasks can block while css_task_iter_*() is in progress. While this does increase the chance of possible deadlock scenarios, given the current usage, the probability is relatively low, and even if that happens, the right thing to do is updating the iteration in the similar way to css iterators so that it can handle blocking. Most conversions are trivial; however, task_cgroup_from_root() now expects to be called with css_set_rwsem locked instead of locking itself. This is because the function is called with RCU read lock held and rwsem locking should nest outside RCU read lock. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 104 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 57 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 893b7b502e18..89428b9d9933 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -341,11 +342,10 @@ static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; /* - * css_set_lock protects the list of css_set objects, and the chain of - * tasks off each css_set. Nests outside task->alloc_lock due to - * css_task_iter_start(). + * css_set_rwsem protects the list of css_set objects, and the chain of + * tasks off each css_set. 
*/ -static DEFINE_RWLOCK(css_set_lock); +static DECLARE_RWSEM(css_set_rwsem); static int css_set_count; /* @@ -380,9 +380,9 @@ static void __put_css_set(struct css_set *cset, int taskexit) */ if (atomic_add_unless(&cset->refcount, -1, 1)) return; - write_lock(&css_set_lock); + down_write(&css_set_rwsem); if (!atomic_dec_and_test(&cset->refcount)) { - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); return; } @@ -396,7 +396,7 @@ static void __put_css_set(struct css_set *cset, int taskexit) list_del(&link->cset_link); list_del(&link->cgrp_link); - /* @cgrp can't go away while we're holding css_set_lock */ + /* @cgrp can't go away while we're holding css_set_rwsem */ if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); @@ -406,7 +406,7 @@ static void __put_css_set(struct css_set *cset, int taskexit) kfree(link); } - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); kfree_rcu(cset, rcu_head); } @@ -627,11 +627,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, /* First see if we already have a cgroup group that matches * the desired set */ - read_lock(&css_set_lock); + down_read(&css_set_rwsem); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); if (cset) return cset; @@ -655,7 +655,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - write_lock(&css_set_lock); + down_write(&css_set_rwsem); /* Add reference counts and links from the new css_set. 
*/ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -673,7 +673,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); return cset; } @@ -739,14 +739,14 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) * Release all the links from cset_links to this hierarchy's * root cgroup */ - write_lock(&css_set_lock); + down_write(&css_set_rwsem); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); kfree(link); } - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); if (!list_empty(&root->root_list)) { list_del(&root->root_list); @@ -764,7 +764,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) /* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex held. + * called with cgroup_mutex and css_set_rwsem held. 
*/ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroupfs_root *root) @@ -772,8 +772,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct css_set *cset; struct cgroup *res = NULL; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - read_lock(&css_set_lock); + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + /* * No need to lock the task - since we hold cgroup_mutex the * task can't change groups, so the only thing that can happen @@ -794,7 +795,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, } } } - read_unlock(&css_set_lock); + BUG_ON(!res); return res; } @@ -1310,7 +1311,7 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - write_lock(&css_set_lock); + down_write(&css_set_rwsem); if (use_task_css_set_links) goto out_unlock; @@ -1343,7 +1344,7 @@ static void cgroup_enable_task_cg_lists(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); out_unlock: - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); } static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -1408,7 +1409,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) root_cgrp->id = ret; /* - * We're accessing css_set_count without locking css_set_lock here, + * We're accessing css_set_count without locking css_set_rwsem here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. The worst that can happen is that we * have some link structures left over @@ -1451,10 +1452,10 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) * Link the top cgroup in this hierarchy into all the css_set * objects. 
*/ - write_lock(&css_set_lock); + down_write(&css_set_rwsem); hash_for_each(css_set_table, i, cset, hlist) link_css_set(&tmp_links, cset, root_cgrp); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); @@ -1617,6 +1618,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) char *path = NULL; mutex_lock(&cgroup_mutex); + down_read(&css_set_rwsem); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); @@ -1629,6 +1631,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) path = buf; } + up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); return path; } @@ -1739,9 +1742,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); - write_lock(&css_set_lock); + down_write(&css_set_rwsem); list_move(&tsk->cg_list, &new_cset->tasks); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); /* * We just gained a reference on old_cset by taking it from the @@ -1799,6 +1802,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ + down_read(&css_set_rwsem); rcu_read_lock(); do { struct task_and_cgroup ent; @@ -1826,6 +1830,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, break; } while_each_thread(leader, tsk); rcu_read_unlock(); + up_read(&css_set_rwsem); /* remember the number of threads in the array for later. 
*/ group_size = i; tset.tc_array = group; @@ -2003,7 +2008,11 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) mutex_lock(&cgroup_mutex); for_each_active_root(root) { - struct cgroup *from_cgrp = task_cgroup_from_root(from, root); + struct cgroup *from_cgrp; + + down_read(&css_set_rwsem); + from_cgrp = task_cgroup_from_root(from, root); + up_read(&css_set_rwsem); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) @@ -2396,10 +2405,10 @@ static int cgroup_task_count(const struct cgroup *cgrp) int count = 0; struct cgrp_cset_link *link; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += atomic_read(&link->cset->refcount); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); return count; } @@ -2630,12 +2639,12 @@ static void css_advance_task_iter(struct css_task_iter *it) */ void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it) - __acquires(css_set_lock) + __acquires(css_set_rwsem) { /* no one should try to iterate before mounting cgroups */ WARN_ON_ONCE(!use_task_css_set_links); - read_lock(&css_set_lock); + down_read(&css_set_rwsem); it->origin_css = css; it->cset_link = &css->cgroup->cset_links; @@ -2683,9 +2692,9 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) - __releases(css_set_lock) + __releases(css_set_rwsem) { - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); } static inline int started_after_time(struct task_struct *t1, @@ -2735,7 +2744,7 @@ static inline int started_after(void *p1, void *p2) * * @test may be NULL, meaning always true (select all tasks), which * effectively duplicates css_task_iter_{start,next,end}() but does not - * lock css_set_lock for the call to @process. + * lock css_set_rwsem for the call to @process. 
* * It is guaranteed that @process will act on every task that is a member * of @css for the duration of this call. This function may or may not @@ -3867,12 +3876,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); /* - * css_set_lock synchronizes access to ->cset_links and prevents + * css_set_rwsem synchronizes access to ->cset_links and prevents * @cgrp from being removed while __put_css_set() is in progress. */ - read_lock(&css_set_lock); + down_read(&css_set_rwsem); empty = list_empty(&cgrp->cset_links); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); if (!empty) return -EBUSY; @@ -4208,6 +4217,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) retval = 0; mutex_lock(&cgroup_mutex); + down_read(&css_set_rwsem); for_each_active_root(root) { struct cgroup_subsys *ss; @@ -4233,6 +4243,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) } out_unlock: + up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); put_task_struct(tsk); out_free: @@ -4328,12 +4339,12 @@ void cgroup_post_fork(struct task_struct *child) * lock on fork. */ if (use_task_css_set_links) { - write_lock(&css_set_lock); + down_write(&css_set_rwsem); task_lock(child); if (list_empty(&child->cg_list)) list_add(&child->cg_list, &task_css_set(child)->tasks); task_unlock(child); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); } /* @@ -4390,15 +4401,14 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) int i; /* - * Unlink from the css_set task list if necessary. - * Optimistically check cg_list before taking - * css_set_lock + * Unlink from the css_set task list if necessary. Optimistically + * check cg_list before taking css_set_rwsem. */ if (!list_empty(&tsk->cg_list)) { - write_lock(&css_set_lock); + down_write(&css_set_rwsem); if (!list_empty(&tsk->cg_list)) list_del_init(&tsk->cg_list); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); } /* Reassign the task to the init_css_set. 
*/ @@ -4650,7 +4660,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) if (!name_buf) return -ENOMEM; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); rcu_read_lock(); cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { @@ -4666,7 +4676,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) c->root->hierarchy_id, name); } rcu_read_unlock(); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); kfree(name_buf); return 0; } @@ -4677,7 +4687,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; @@ -4693,7 +4703,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) } } } - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); return 0; } -- cgit v1.2.3 From d66393e54e0a9dc743e440eb36c58bd1158a560e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:40 -0500 Subject: cpuset: use css_task_iter_start/next/end() instead of css_scan_tasks() Now that css_task_iter_start/next/end() support blocking while iterating, there's no reason to use css_scan_tasks() which is more cumbersome to use and scheduled to be removed. Convert all css_scan_tasks() usages in cpuset to css_task_iter_start/next/end(). This simplifies the code by removing heap allocation and callbacks. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 186 ++++++++++++++++++-------------------------------------- 1 file changed, 58 insertions(+), 128 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ae190b0a196a..65ae0bdf4af8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -828,56 +828,37 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) return cs; } -/** - * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's - * @tsk: task to test - * @data: cpuset to @tsk belongs to - * - * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed - * mask needs to be changed. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cpuset_mutex at this point. - */ -static void cpuset_change_cpumask(struct task_struct *tsk, void *data) -{ - struct cpuset *cs = data; - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - - set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); -} - /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() - * - * Called with cpuset_mutex held - * - * The css_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. * - * No return value. It's guaranteed that css_scan_tasks() always returns 0 - * if @heap != NULL. + * Iterate through each task of @cs updating its cpus_allowed to the + * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. 
*/ -static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_cpumask(struct cpuset *cs) { - css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); + struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) + set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); + css_task_iter_end(&it); } /* * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. * @root_cs: the root cpuset of the hierarchy * @update_root: update root cpuset or not? - * @heap: the heap used by css_scan_tasks() * * This will update cpumasks of tasks in @root_cs and all other empty cpusets * which take on cpumask of @root_cs. * * Called with cpuset_mutex held */ -static void update_tasks_cpumask_hier(struct cpuset *root_cs, - bool update_root, struct ptr_heap *heap) +static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -898,7 +879,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, continue; rcu_read_unlock(); - update_tasks_cpumask(cp, heap); + update_tasks_cpumask(cp); rcu_read_lock(); css_put(&cp->css); @@ -914,7 +895,6 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { - struct ptr_heap heap; int retval; int is_load_balanced; @@ -947,19 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval) - return retval; - is_load_balanced = is_sched_load_balance(trialcs); mutex_lock(&callback_mutex); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); - update_tasks_cpumask_hier(cs, true, &heap); - - heap_free(&heap); + 
update_tasks_cpumask_hier(cs, true); if (is_load_balanced) rebuild_sched_domains_locked(); @@ -1052,53 +1026,22 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_unlock(tsk); } -struct cpuset_change_nodemask_arg { - struct cpuset *cs; - nodemask_t *newmems; -}; - -/* - * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy - * of it to cpuset's new mems_allowed, and migrate pages to new nodes if - * memory_migrate flag is set. Called with cpuset_mutex held. - */ -static void cpuset_change_nodemask(struct task_struct *p, void *data) -{ - struct cpuset_change_nodemask_arg *arg = data; - struct cpuset *cs = arg->cs; - struct mm_struct *mm; - int migrate; - - cpuset_change_task_nodemask(p, arg->newmems); - - mm = get_task_mm(p); - if (!mm) - return; - - migrate = is_memory_migrate(cs); - - mpol_rebind_mm(mm, &cs->mems_allowed); - if (migrate) - cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); - mmput(mm); -} - static void *cpuset_being_rebound; /** * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held. No return value. It's guaranteed that - * css_scan_tasks() always returns 0 if @heap != NULL. + * Iterate through each task of @cs updating its mems_allowed to the + * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. 
*/ -static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct cpuset *mems_cs = effective_nodemask_cpuset(cs); - struct cpuset_change_nodemask_arg arg = { .cs = cs, - .newmems = &newmems }; + struct css_task_iter it; + struct task_struct *task; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ @@ -1114,7 +1057,25 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) { + struct mm_struct *mm; + bool migrate; + + cpuset_change_task_nodemask(task, &newmems); + + mm = get_task_mm(task); + if (!mm) + continue; + + migrate = is_memory_migrate(cs); + + mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) + cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); + mmput(mm); + } + css_task_iter_end(&it); /* * All the tasks' nodemasks have been updated, update @@ -1130,15 +1091,13 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. * @cs: the root cpuset of the hierarchy * @update_root: update the root cpuset or not? - * @heap: the heap used by css_scan_tasks() * * This will update nodemasks of tasks in @root_cs and all other empty cpusets * which take on nodemask of @root_cs. 
* * Called with cpuset_mutex held */ -static void update_tasks_nodemask_hier(struct cpuset *root_cs, - bool update_root, struct ptr_heap *heap) +static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -1159,7 +1118,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, continue; rcu_read_unlock(); - update_tasks_nodemask(cp, heap); + update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); @@ -1184,7 +1143,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; - struct ptr_heap heap; /* * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; @@ -1223,17 +1181,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval < 0) - goto done; - mutex_lock(&callback_mutex); cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask_hier(cs, true, &heap); - - heap_free(&heap); + update_tasks_nodemask_hier(cs, true); done: return retval; } @@ -1260,39 +1212,23 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) return 0; } -/** - * cpuset_change_flag - make a task's spread flags the same as its cpuset's - * @tsk: task to be updated - * @data: cpuset to @tsk belongs to - * - * Called by css_scan_tasks() for each task in a cgroup. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cpuset_mutex at this point. - */ -static void cpuset_change_flag(struct task_struct *tsk, void *data) -{ - struct cpuset *cs = data; - - cpuset_update_task_spread_flag(cs, tsk); -} - /** * update_tasks_flags - update the spread flags of tasks in the cpuset. 
* @cs: the cpuset in which each task's spread flags needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() - * - * Called with cpuset_mutex held * - * The css_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - * - * No return value. It's guaranteed that css_scan_tasks() always returns 0 - * if @heap != NULL. + * Iterate through each task of @cs updating its spread flags. As this + * function is called with cpuset_mutex held, cpuset membership stays + * stable. */ -static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_flags(struct cpuset *cs) { - css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) + cpuset_update_task_spread_flag(cs, task); + css_task_iter_end(&it); } /* @@ -1310,7 +1246,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, struct cpuset *trialcs; int balance_flag_changed; int spread_flag_changed; - struct ptr_heap heap; int err; trialcs = alloc_trial_cpuset(cs); @@ -1326,10 +1261,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, if (err < 0) goto out; - err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (err < 0) - goto out; - balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(trialcs)); @@ -1344,8 +1275,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, rebuild_sched_domains_locked(); if (spread_flag_changed) - update_tasks_flags(cs, &heap); - heap_free(&heap); + update_tasks_flags(cs); out: free_trial_cpuset(trialcs); return err; @@ -2138,7 +2068,7 @@ retry: */ if ((sane && cpumask_empty(cs->cpus_allowed)) || (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) - update_tasks_cpumask(cs, NULL); + update_tasks_cpumask(cs); mutex_lock(&callback_mutex); 
nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); @@ -2152,7 +2082,7 @@ retry: */ if ((sane && nodes_empty(cs->mems_allowed)) || (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) - update_tasks_nodemask(cs, NULL); + update_tasks_nodemask(cs); is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); @@ -2214,7 +2144,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) mutex_lock(&callback_mutex); top_cpuset.mems_allowed = new_mems; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, NULL); + update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); -- cgit v1.2.3 From 889ed9ceaa97bb02bf5d7349e24639f7fc5f4fa0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:40 -0500 Subject: cgroup: remove css_scan_tasks() css_scan_tasks() doesn't have any user left. Remove it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 162 -------------------------------------------------------- 1 file changed, 162 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 89428b9d9933..05c0c23549f9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2697,168 +2697,6 @@ void css_task_iter_end(struct css_task_iter *it) up_read(&css_set_rwsem); } -static inline int started_after_time(struct task_struct *t1, - struct timespec *time, - struct task_struct *t2) -{ - int start_diff = timespec_compare(&t1->start_time, time); - if (start_diff > 0) { - return 1; - } else if (start_diff < 0) { - return 0; - } else { - /* - * Arbitrarily, if two processes started at the same - * time, we'll say that the lower pointer value - * started first. Note that t2 may have exited by now - * so this may not be a valid pointer any longer, but - * that's fine - it still serves to distinguish - * between two tasks started (effectively) simultaneously. - */ - return t1 > t2; - } -} - -/* - * This function is a callback from heap_insert() and is used to order - * the heap. 
- * In this case we order the heap in descending task start time. - */ -static inline int started_after(void *p1, void *p2) -{ - struct task_struct *t1 = p1; - struct task_struct *t2 = p2; - return started_after_time(t1, &t2->start_time, t2); -} - -/** - * css_scan_tasks - iterate though all the tasks in a css - * @css: the css to iterate tasks of - * @test: optional test callback - * @process: process callback - * @data: data passed to @test and @process - * @heap: optional pre-allocated heap used for task iteration - * - * Iterate through all the tasks in @css, calling @test for each, and if it - * returns %true, call @process for it also. - * - * @test may be NULL, meaning always true (select all tasks), which - * effectively duplicates css_task_iter_{start,next,end}() but does not - * lock css_set_rwsem for the call to @process. - * - * It is guaranteed that @process will act on every task that is a member - * of @css for the duration of this call. This function may or may not - * call @process for tasks that exit or move to a different css during the - * call, or are forked or move into the css during the call. - * - * Note that @test may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should be - * cheap. - * - * If @heap is non-NULL, a heap has been pre-allocated and will be used for - * heap operations (and its "gt" member will be overwritten), else a - * temporary heap will be used (allocation of which may cause this function - * to fail). 
- */ -int css_scan_tasks(struct cgroup_subsys_state *css, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap) -{ - int retval, i; - struct css_task_iter it; - struct task_struct *p, *dropped; - /* Never dereference latest_task, since it's not refcounted */ - struct task_struct *latest_task = NULL; - struct ptr_heap tmp_heap; - struct timespec latest_time = { 0, 0 }; - - if (heap) { - /* The caller supplied our heap and pre-allocated its memory */ - heap->gt = &started_after; - } else { - /* We need to allocate our own heap memory */ - heap = &tmp_heap; - retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); - if (retval) - /* cannot allocate the heap */ - return retval; - } - - again: - /* - * Scan tasks in the css, using the @test callback to determine - * which are of interest, and invoking @process callback on the - * ones which need an update. Since we don't want to hold any - * locks during the task updates, gather tasks to be processed in a - * heap structure. The heap is sorted by descending task start - * time. If the statically-sized heap fills up, we overflow tasks - * that started later, and in future iterations only consider tasks - * that started after the latest task in the previous pass. This - * guarantees forward progress and that we don't miss any tasks. 
- */ - heap->size = 0; - css_task_iter_start(css, &it); - while ((p = css_task_iter_next(&it))) { - /* - * Only affect tasks that qualify per the caller's callback, - * if he provided one - */ - if (test && !test(p, data)) - continue; - /* - * Only process tasks that started after the last task - * we processed - */ - if (!started_after_time(p, &latest_time, latest_task)) - continue; - dropped = heap_insert(heap, p); - if (dropped == NULL) { - /* - * The new task was inserted; the heap wasn't - * previously full - */ - get_task_struct(p); - } else if (dropped != p) { - /* - * The new task was inserted, and pushed out a - * different task - */ - get_task_struct(p); - put_task_struct(dropped); - } - /* - * Else the new task was newer than anything already in - * the heap and wasn't inserted - */ - } - css_task_iter_end(&it); - - if (heap->size) { - for (i = 0; i < heap->size; i++) { - struct task_struct *q = heap->ptrs[i]; - if (i == 0) { - latest_time = q->start_time; - latest_task = q; - } - /* Process the task per the caller's callback */ - process(q, data); - put_task_struct(q); - } - /* - * If we had to process any tasks at all, scan again - * in case some of them were in the middle of forking - * children that didn't get processed. - * Not the most efficient way to do it, but it avoids - * having to take callback_mutex in the fork path - */ - goto again; - } - if (heap == &tmp_heap) - heap_free(&tmp_heap); - return 0; -} - /** * cgroup_trasnsfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved -- cgit v1.2.3 From 89c5509b0d71d1609761bf72d33333ab206dac9f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:40 -0500 Subject: cgroup: separate out put_css_set_locked() and remove put_css_set_taskexit() put_css_set() is performed in two steps - it first tries to put without grabbing css_set_rwsem if such put wouldn't make the count zero. If that fails, it puts after write-locking css_set_rwsem. 
This patch separates out the second phase into put_css_set_locked() which should be called with css_set_rwsem locked. Also, put_css_set_taskexit() is dropped and put_css_set() is made to take @taskexit. There are only a handful of users of these functions. No point in providing different variants. put_css_set_locked() will be used by later changes. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 05c0c23549f9..17b10b8efbcf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -369,22 +369,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -static void __put_css_set(struct css_set *cset, int taskexit) +static void put_css_set_locked(struct css_set *cset, bool taskexit) { struct cgrp_cset_link *link, *tmp_link; - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cset->refcount, -1, 1)) - return; - down_write(&css_set_rwsem); - if (!atomic_dec_and_test(&cset->refcount)) { - up_write(&css_set_rwsem); + lockdep_assert_held(&css_set_rwsem); + + if (!atomic_dec_and_test(&cset->refcount)) return; - } /* This css_set is dead. unlink it and release cgroup refcounts */ hash_del(&cset->hlist); @@ -406,10 +398,24 @@ static void __put_css_set(struct css_set *cset, int taskexit) kfree(link); } - up_write(&css_set_rwsem); kfree_rcu(cset, rcu_head); } +static void put_css_set(struct css_set *cset, bool taskexit) +{ + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it.
Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + down_write(&css_set_rwsem); + put_css_set_locked(cset, taskexit); + up_write(&css_set_rwsem); +} + /* * refcounted get/put for css_set objects */ @@ -418,16 +424,6 @@ static inline void get_css_set(struct css_set *cset) atomic_inc(&cset->refcount); } -static inline void put_css_set(struct css_set *cset) -{ - __put_css_set(cset, 0); -} - -static inline void put_css_set_taskexit(struct css_set *cset) -{ - __put_css_set(cset, 1); -} - /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested @@ -1752,7 +1748,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, * we're safe to drop it here; it will be freed under RCU. */ set_bit(CGRP_RELEASABLE, &old_cgrp->flags); - put_css_set(old_cset); + put_css_set(old_cset, false); } /** @@ -1898,7 +1894,7 @@ out_put_css_set_refs: tc = flex_array_get(group, i); if (!tc->cset) break; - put_css_set(tc->cset); + put_css_set(tc->cset, false); } } out_cancel_attach: @@ -3715,7 +3711,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * css_set_rwsem synchronizes access to ->cset_links and prevents - * @cgrp from being removed while __put_css_set() is in progress. + * @cgrp from being removed while put_css_set() is in progress. 
*/ down_read(&css_set_rwsem); empty = list_empty(&cgrp->cset_links); @@ -4267,7 +4263,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } task_unlock(tsk); - put_css_set_taskexit(cset); + put_css_set(cset, true); } static void check_for_release(struct cgroup *cgrp) -- cgit v1.2.3 From cb0f1fe9ba47c202a98a9d41ad5c12c0ac7732e9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:41 -0500 Subject: cgroup: move css_set_rwsem locking outside of cgroup_task_migrate() Instead of repeatedly locking and unlocking css_set_rwsem inside cgroup_task_migrate(), update cgroup_attach_task() to grab it outside of the loop and update cgroup_task_migrate() to use put_css_set_locked(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 17b10b8efbcf..704c590a81d7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1715,10 +1715,13 @@ int cgroup_taskset_size(struct cgroup_taskset *tset) EXPORT_SYMBOL_GPL(cgroup_taskset_size); -/* +/** * cgroup_task_migrate - move a task from one cgroup to another. + * @old_cgrp; the cgroup @tsk is being migrated from + * @tsk: the task being migrated + * @new_cset: the new css_set @tsk is being attached to * - * Must be called with cgroup_mutex and threadgroup locked. + * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. 
*/ static void cgroup_task_migrate(struct cgroup *old_cgrp, struct task_struct *tsk, @@ -1726,6 +1729,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, { struct css_set *old_cset; + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + /* * We are synchronized through threadgroup_lock() against PF_EXITING * setting such that we can't race against cgroup_exit() changing the @@ -1738,9 +1744,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); - down_write(&css_set_rwsem); list_move(&tsk->cg_list, &new_cset->tasks); - up_write(&css_set_rwsem); /* * We just gained a reference on old_cset by taking it from the @@ -1748,7 +1752,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, * we're safe to drop it here; it will be freed under RCU. */ set_bit(CGRP_RELEASABLE, &old_cgrp->flags); - put_css_set(old_cset, false); + put_css_set_locked(old_cset, false); } /** @@ -1871,10 +1875,12 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * proceed to move all tasks to the new cgroup. There are no * failure cases after here, so this is the commit point. */ + down_write(&css_set_rwsem); for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); } + up_write(&css_set_rwsem); /* nothing is sensitive to fork() after this point. */ /* -- cgit v1.2.3 From 924f0d9a2078f49ff331bb43196ec5afadc16b8f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:41 -0500 Subject: cgroup: drop @skip_css from cgroup_taskset_for_each() If !NULL, @skip_css makes cgroup_taskset_for_each() skip the matching css. The intention of the interface is to make it easy to skip css's (cgroup_subsys_states) which already match the migration target; however, this is entirely unnecessary as migration taskset doesn't include tasks which are already in the target cgroup. 
Drop @skip_css from cgroup_taskset_for_each(). Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann --- kernel/cgroup_freezer.c | 2 +- kernel/cpuset.c | 4 ++-- kernel/events/core.c | 2 +- kernel/sched/core.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 98ea26a99076..7201a637c405 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -187,7 +187,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_css, tset) { + cgroup_taskset_for_each(task, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 65ae0bdf4af8..bf20e4ac2f75 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1398,7 +1398,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1467,7 +1467,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. 
TODO: have a better way to handle failure here diff --git a/kernel/events/core.c b/kernel/events/core.c index a3c3ab50271a..6dd714955b04 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8021,7 +8021,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d4cfc5561830..ba386a06ab11 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7600,7 +7600,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -7618,7 +7618,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) sched_move_task(task); } -- cgit v1.2.3 From 57fce0a68e3aa71d223d9023aae66c7393970c34 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:41 -0500 Subject: cpuset: don't use cgroup_taskset_cur_css() cgroup_taskset_cur_css() will be removed during the planned restructuring of the migration path. The only use of cgroup_taskset_cur_css() is finding out the old cgroup_subsys_state of the leader in cpuset_attach(). This usage can easily be removed by remembering the old value from cpuset_can_attach().
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index bf20e4ac2f75..d8bec21d7a11 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1379,6 +1379,8 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +static struct cpuset *cpuset_attach_old_cs; + /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) @@ -1387,6 +1389,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, struct task_struct *task; int ret; + /* used later by cpuset_attach() */ + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + mutex_lock(&cpuset_mutex); /* @@ -1450,10 +1455,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); - struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, - cpuset_cgrp_id); struct cpuset *cs = css_cs(css); - struct cpuset *oldcs = css_cs(oldcss); + struct cpuset *oldcs = cpuset_attach_old_cs; struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct cpuset *mems_cs = effective_nodemask_cpuset(cs); -- cgit v1.2.3 From bc668c7519ff8b4681af80e92f463bec7bf7cf9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:43 -0500 Subject: cgroup: remove cgroup_taskset_cur_css() and cgroup_taskset_size() The two functions don't have any users left. Remove them along with cgroup_taskset->cur_cgrp. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 704c590a81d7..a9d9bbb12310 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1647,7 +1647,6 @@ struct cgroup_taskset { struct flex_array *tc_array; int tc_array_len; int idx; - struct cgroup *cur_cgrp; }; /** @@ -1662,7 +1661,6 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) tset->idx = 0; return cgroup_taskset_next(tset); } else { - tset->cur_cgrp = tset->single.cgrp; return tset->single.task; } } @@ -1683,38 +1681,10 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) return NULL; tc = flex_array_get(tset->tc_array, tset->idx++); - tset->cur_cgrp = tc->cgrp; return tc->task; } EXPORT_SYMBOL_GPL(cgroup_taskset_next); -/** - * cgroup_taskset_cur_css - return the matching css for the current task - * @tset: taskset of interest - * @subsys_id: the ID of the target subsystem - * - * Return the css for the current (last returned) task of @tset for - * subsystem specified by @subsys_id. This function must be preceded by - * either cgroup_taskset_first() or cgroup_taskset_next(). - */ -struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, - int subsys_id) -{ - return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); -} -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); - -/** - * cgroup_taskset_size - return the number of tasks in taskset - * @tset: taskset of interest - */ -int cgroup_taskset_size(struct cgroup_taskset *tset) -{ - return tset->tc_array ? tset->tc_array_len : 1; -} -EXPORT_SYMBOL_GPL(cgroup_taskset_size); - - /** * cgroup_task_migrate - move a task from one cgroup to another. 
* @old_cgrp; the cgroup @tsk is being migrated from -- cgit v1.2.3 From 9db8de3722d184b8a431afd6bef803d6867ac889 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:43 -0500 Subject: cgroup: cosmetic updates to cgroup_attach_task() cgroup_attach_task() is planned to go through restructuring. Let's tidy it up a bit in preparation. * Update cgroup_attach_task() to receive the target task argument in @leader instead of @tsk. * Rename @tsk to @task. * Rename @retval to @ret. This is purely cosmetic. v2: get_nr_threads() was using uninitialized @task instead of @leader. Fixed. Reported by Dan Carpenter. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Dan Carpenter --- kernel/cgroup.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a9d9bbb12310..9a890a2e58fc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1728,20 +1728,20 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, /** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @cgrp: the cgroup to attach to - * @tsk: the task or the leader of the threadgroup to be attached + * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and the group_rwsem of the leader. Will take * task_lock of @tsk or each thread in the threadgroup individually in turn. 
*/ -static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, +static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, bool threadgroup) { - int retval, i, group_size; + int ret, i, group_size; struct cgroupfs_root *root = cgrp->root; struct cgroup_subsys_state *css, *failed_css = NULL; /* threadgroup list cursor and array */ - struct task_struct *leader = tsk; + struct task_struct *task; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; @@ -1754,7 +1754,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * threads exit, this will just be an over-estimate. */ if (threadgroup) - group_size = get_nr_threads(tsk); + group_size = get_nr_threads(leader); else group_size = 1; /* flex_array supports very large thread-groups better than kmalloc. */ @@ -1762,8 +1762,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, if (!group) return -ENOMEM; /* pre-allocate to guarantee space while iterating in rcu read-side. */ - retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); - if (retval) + ret = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); + if (ret) goto out_free_group_list; i = 0; @@ -1774,17 +1774,18 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, */ down_read(&css_set_rwsem); rcu_read_lock(); + task = leader; do { struct task_and_cgroup ent; - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) + /* @task either already exited or can't exit until the end */ + if (task->flags & PF_EXITING) goto next; /* as per above, nr_threads may decrease, but not increase. 
*/ BUG_ON(i >= group_size); - ent.task = tsk; - ent.cgrp = task_cgroup_from_root(tsk, root); + ent.task = task; + ent.cgrp = task_cgroup_from_root(task, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) goto next; @@ -1792,13 +1793,13 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. */ - retval = flex_array_put(group, i, &ent, GFP_ATOMIC); - BUG_ON(retval != 0); + ret = flex_array_put(group, i, &ent, GFP_ATOMIC); + BUG_ON(ret != 0); i++; next: if (!threadgroup) break; - } while_each_thread(leader, tsk); + } while_each_thread(leader, task); rcu_read_unlock(); up_read(&css_set_rwsem); /* remember the number of threads in the array for later. */ @@ -1807,7 +1808,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, tset.tc_array_len = group_size; /* methods shouldn't be called if no task is actually migrating */ - retval = 0; + ret = 0; if (!group_size) goto out_free_group_list; @@ -1816,8 +1817,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, */ for_each_css(css, i, cgrp) { if (css->ss->can_attach) { - retval = css->ss->can_attach(css, &tset); - if (retval) { + ret = css->ss->can_attach(css, &tset); + if (ret) { failed_css = css; goto out_cancel_attach; } @@ -1835,7 +1836,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, old_cset = task_css_set(tc->task); tc->cset = find_css_set(old_cset, cgrp); if (!tc->cset) { - retval = -ENOMEM; + ret = -ENOMEM; goto out_put_css_set_refs; } } @@ -1863,9 +1864,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 5: success! 
and cleanup */ - retval = 0; + ret = 0; out_put_css_set_refs: - if (retval) { + if (ret) { for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); if (!tc->cset) @@ -1874,7 +1875,7 @@ out_put_css_set_refs: } } out_cancel_attach: - if (retval) { + if (ret) { for_each_css(css, i, cgrp) { if (css == failed_css) break; @@ -1884,7 +1885,7 @@ out_cancel_attach: } out_free_group_list: flex_array_free(group); - return retval; + return ret; } /* -- cgit v1.2.3 From 8541fecc04a91842f023cbfe2c376d4de3b5047e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:43 -0500 Subject: cgroup: unexport functions With module support gone, a lot of functions no longer need to be exported. Unexport them. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9a890a2e58fc..750d0e1e7e56 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -242,7 +242,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) } return false; } -EXPORT_SYMBOL_GPL(cgroup_is_descendant); static int cgroup_is_releasable(const struct cgroup *cgrp) { @@ -1664,7 +1663,6 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) return tset->single.task; } } -EXPORT_SYMBOL_GPL(cgroup_taskset_first); /** * cgroup_taskset_next - iterate to the next task in taskset @@ -1683,7 +1681,6 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) tc = flex_array_get(tset->tc_array, tset->idx++); return tc->task; } -EXPORT_SYMBOL_GPL(cgroup_taskset_next); /** * cgroup_task_migrate - move a task from one cgroup to another. @@ -2365,7 +2362,6 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) mutex_unlock(&cgroup_tree_mutex); return ret; } -EXPORT_SYMBOL_GPL(cgroup_add_cftypes); /** * cgroup_task_count - count the number of tasks in a cgroup. 
@@ -2439,7 +2435,6 @@ css_next_child(struct cgroup_subsys_state *pos_css, return cgroup_css(next, parent_css->ss); } -EXPORT_SYMBOL_GPL(css_next_child); /** * css_next_descendant_pre - find the next descendant for pre-order walk @@ -2482,7 +2477,6 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, return NULL; } -EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css @@ -2514,7 +2508,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) return last; } -EXPORT_SYMBOL_GPL(css_rightmost_descendant); static struct cgroup_subsys_state * css_leftmost_descendant(struct cgroup_subsys_state *pos) @@ -2568,7 +2561,6 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, /* no sibling left, visit parent */ return css_parent(pos); } -EXPORT_SYMBOL_GPL(css_next_descendant_post); /** * css_advance_task_iter - advance a task itererator to the next css_set -- cgit v1.2.3 From dd5fd9b91a77b4c9c28b7ef9c181b1a875820d0a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 11 Feb 2014 14:35:40 +0100 Subject: tick: Clear broadcast pending bit when switching to oneshot AMD systems which use the C1E workaround in the amd_e400_idle routine trigger the WARN_ON_ONCE in the broadcast code when onlining a CPU. The reason is that the idle routine of those AMD systems switches the cpu into forced broadcast mode early on before the newly brought up CPU can switch over to high resolution / NOHZ mode. The timer related CPU1 bringup looks like this: clockevent_register_device(local_apic); tick_setup(local_apic); ... idle() tick_broadcast_on_off(FORCE); tick_broadcast_oneshot_control(ENTER) cpumask_set(cpu, broadcast_oneshot_mask); halt(); Now the broadcast interrupt on CPU0 sets CPU1 in the broadcast_pending_mask and wakes CPU1. 
So CPU1 continues: local_apic_timer_interrupt() tick_handle_periodic(); softirq() tick_init_highres(); cpumask_clr(cpu, broadcast_oneshot_mask); tick_broadcast_oneshot_control(ENTER) WARN_ON(cpumask_test(cpu, broadcast_pending_mask); So while we remove CPU1 from the broadcast_oneshot_mask when we switch over to highres mode, we do not clear the pending bit, which then triggers the warning when we go back to idle. The reason why this is only visible on C1E affected AMD systems is that the other machines enter the deep sleep states via acpi_idle/intel_idle and exit the broadcast mode before executing the remote triggered local_apic_timer_interrupt. So the pending bit is already cleared when the switch over to highres mode is clearing the oneshot mask. The solution is simple: Clear the pending bit together with the mask bit when we switch over to highres mode. Stanislaw came up independently with the same patch by enforcing the C1E workaround and debugging the fallout. I picked mine, because mine has a changelog :) Reported-by: poma Debugged-by: Stanislaw Gruszka Signed-off-by: Thomas Gleixner Cc: Olaf Hering Cc: Dave Jones Cc: Justin M. 
Forbes Cc: Josh Boyer Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1402111434180.21991@ionos.tec.linutronix.de Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 43780ab5e279..98977a57ac72 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -756,6 +756,7 @@ out: static void tick_broadcast_clear_oneshot(int cpu) { cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, -- cgit v1.2.3 From 430af8ad9dad82d775d688155e1db1da385d3e7a Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Thu, 13 Feb 2014 16:42:43 -0500 Subject: cgroup: fix coccinelle warnings kernel/cgroup.c:2256:1-3: WARNING: PTR_RET can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: coccinelle/api/ptr_ret.cocci Signed-off-by: Fengguang Wu Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 750d0e1e7e56..15dcae74b510 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2171,9 +2171,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), 0, cft->kf_ops, cft, NULL, false, key); - if (IS_ERR(kn)) - return PTR_ERR(kn); - return 0; + return PTR_ERR_OR_ZERO(kn); } /** -- cgit v1.2.3 From 40747ffa5aa8d5b99ca46c696234b9194b59e0ac Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:51:59 +0100 Subject: asmlinkage: Make jiffies visible Jiffies is referenced by the linker script, so it has to be visible. Handled both the generic and the x86 version. 
Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-3-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index accfd241b9e5..d78de047599b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -52,7 +52,7 @@ #define CREATE_TRACE_POINTS #include -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; +__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); -- cgit v1.2.3 From 63f9a7fde715352e0769302527670542a664b981 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:01 +0100 Subject: asmlinkage: Make lockdep_sys_exit asmlinkage lockdep_sys_exit can be called from assembler code, so make it asmlinkage. Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-5-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index eb8a54783fa0..c8b6753c5bb1 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4191,7 +4191,7 @@ void debug_show_held_locks(struct task_struct *task) } EXPORT_SYMBOL_GPL(debug_show_held_locks); -void lockdep_sys_exit(void) +asmlinkage void lockdep_sys_exit(void) { struct task_struct *curr = current; -- cgit v1.2.3 From b35f8305339f1ba3070fe606c6ef0d86ef093dee Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:02 +0100 Subject: asmlinkage: Make trace_hardirq visible Can be called from assembler code. Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-6-git-send-email-ak@linux.intel.com Signed-off-by: H. 
Peter Anvin --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c8b6753c5bb1..aa3bf153b718 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2557,7 +2557,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) debug_atomic_inc(hardirqs_on_events); } -void trace_hardirqs_on_caller(unsigned long ip) +__visible void trace_hardirqs_on_caller(unsigned long ip) { time_hardirqs_on(CALLER_ADDR0, ip); @@ -2610,7 +2610,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off_caller(unsigned long ip) +__visible void trace_hardirqs_off_caller(unsigned long ip) { struct task_struct *curr = current; -- cgit v1.2.3 From 22d9fd3411c693ccae5f5c2280fb1f9bb106ad4f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:03 +0100 Subject: asmlinkage, mutex: Mark __visible Various kernel/mutex.c functions can be called from inline assembler, so they should be all global and __visible. Cc: Ingo Molnar Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-7-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/locking/mutex.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4dd6e4c219de..adbc0d0f314b 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -67,8 +67,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. 
*/ -static __used noinline void __sched -__mutex_lock_slowpath(atomic_t *lock_count); +__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); /** * mutex_lock - acquire the mutex @@ -225,7 +224,8 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) } #endif -static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); +__visible __used noinline +void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /** * mutex_unlock - release the mutex @@ -746,7 +746,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) /* * Release the lock, slowpath: */ -static __used noinline void +__visible void __mutex_unlock_slowpath(atomic_t *lock_count) { __mutex_unlock_common_slowpath(lock_count, 1); @@ -803,7 +803,7 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); -static __used noinline void __sched +__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); -- cgit v1.2.3 From 00b7103078596a243c16239004e0dc9416910f13 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:04 +0100 Subject: asmlinkage: Make main_extable_sort_needed visible main_extable_sort_needed is used by the build system and needs to be a normal ELF symbol. Make it visible so that LTO does not remove or mangle it. Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-8-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 763faf037ec1..d8a6446adbcb 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -36,7 +36,7 @@ extern struct exception_table_entry __start___ex_table[]; extern struct exception_table_entry __stop___ex_table[]; /* Cleared by build time tools if the table is already sorted. 
*/ -u32 __initdata main_extable_sort_needed = 1; +u32 __initdata __visible main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) -- cgit v1.2.3 From 3ebae4f3a2e746ae17f25c741e249294e7d6d7c2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:05 +0100 Subject: asmlinkage: Mark rwsem functions that can be called from assembler asmlinkage Mark the rwsem functions that can be called from assembler asmlinkage. Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-9-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/locking/rwsem-xadd.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 19c5fa95e0b4..1d66e08e897d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -143,6 +143,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) /* * wait for the read lock to be granted */ +__visible struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; @@ -190,6 +191,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* * wait until we successfully acquire the write lock */ +__visible struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; @@ -252,6 +254,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ +__visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; @@ -272,6 +275,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) * - caller incremented waiting part of count and discovered it still negative * - just wake up any readers at the front of 
the queue */ +__visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; -- cgit v1.2.3 From a7330c997d0f74d909a7d3553b1d550d8be2b61a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:06 +0100 Subject: asmlinkage Make __stack_chk_failed and memcmp visible In LTO symbols implicitly referenced by the compiler need to be visible. Earlier these symbols were visible implicitly from being exported, but we disabled implicit visibility for EXPORTs when modules are disabled to improve code size. So now these symbols have to be marked visible explicitly. Do this for __stack_chk_fail (with stack protector) and memcmp. Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-10-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6d6300375090..3eb0ffb25960 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -459,7 +459,7 @@ EXPORT_SYMBOL(warn_slowpath_null); * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ -void __stack_chk_fail(void) +__visible void __stack_chk_fail(void) { panic("stack-protector: Kernel stack is corrupted in: %p\n", __builtin_return_address(0)); -- cgit v1.2.3 From 285c00adf651c9b1d6c73d5eee482d2a617a64c1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:08 +0100 Subject: asmlinkage: Make trace_hardirqs_on/off_caller visible These functions are called from assembler, and thus need to be __visible. Cc: Steven Rostedt Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-12-git-send-email-ak@linux.intel.com Signed-off-by: H. 
Peter Anvin --- kernel/trace/trace_irqsoff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2aefbee93a6d..887ef88b0bc7 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -498,14 +498,14 @@ void trace_hardirqs_off(void) } EXPORT_SYMBOL(trace_hardirqs_off); -void trace_hardirqs_on_caller(unsigned long caller_addr) +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); -void trace_hardirqs_off_caller(unsigned long caller_addr) +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); -- cgit v1.2.3 From 80375980f1608f43b47abc2671456b23ec68c434 Mon Sep 17 00:00:00 2001 From: Joe Mario Date: Sat, 8 Feb 2014 09:01:09 +0100 Subject: lto: Handle LTO common symbols in module loader Here is the workaround I made for having the kernel not reject modules built with -flto. The clean solution would be to get the compiler to not emit the symbol. Or if it has to emit the symbol, then emit it as initialized data but put it into a comdat/linkonce section. Minor tweaks by AK over Joe's patch. Cc: Rusty Russell Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391846481-31491-5-git-send-email-ak@linux.intel.com Signed-off-by: H. 
Peter Anvin --- kernel/module.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d24fcf29cb64..b99e80119eef 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1948,6 +1948,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) switch (sym[i].st_shndx) { case SHN_COMMON: + /* Ignore common symbols */ + if (!strncmp(name, "__gnu_lto", 9)) + break; + /* We compiled with -fno-common. These are not supposed to happen. */ pr_debug("Common symbol: %s\n", name); -- cgit v1.2.3 From 58edae3aac9f2ccd1afb12ea08127e840a0a706c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 09:01:10 +0100 Subject: lto: Disable LTO for sys_ni The assembler alias code in cond_syscall does not work when compiled for LTO. Just disable LTO for that file. Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391846481-31491-6-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index bc010ee272b6..31c26c61aaec 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -18,6 +18,9 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif +# cond_syscall is currently not LTO compatible +CFLAGS_sys_ni.o = $(DISABLE_LTO) + obj-y += sched/ obj-y += locking/ obj-y += power/ -- cgit v1.2.3 From bad34660344f37db8b55ce8bc139bddc7d83af1b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 14 Feb 2014 16:54:28 +0800 Subject: cgroup: fix locking in cgroupstats_build() css_set_lock has been converted to css_set_rwsem, and rwsem can't nest inside rcu_read_lock. 
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15dcae74b510..5606c0f08d95 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2995,6 +2995,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) kernfs_type(kn) != KERNFS_DIR) return -EINVAL; + mutex_lock(&cgroup_mutex); + /* * We aren't being called from kernfs and there's no guarantee on * @kn->priv's validity. For this and css_tryget_from_dir(), @@ -3002,10 +3004,12 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) */ rcu_read_lock(); cgrp = rcu_dereference(kn->priv); - if (!cgrp) { + if (!cgrp || cgroup_is_dead(cgrp)) { rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); return -ENOENT; } + rcu_read_unlock(); css_task_iter_start(&cgrp->dummy_css, &it); while ((tsk = css_task_iter_next(&it))) { @@ -3030,7 +3034,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } css_task_iter_end(&it); - rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); return 0; } -- cgit v1.2.3 From 6534fd6c15858fe4ce4ae568106225e68d5afa81 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 14 Feb 2014 16:55:04 +0800 Subject: cgroup: fix memory leak in cgroup_mount() We should free the memory allocated in parse_cgroupfs_options() before calling this function again. 
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5606c0f08d95..3fe01102607b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1540,6 +1540,8 @@ retry: if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) { mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); + kfree(opts.release_agent); + kfree(opts.name); msleep(10); goto retry; } -- cgit v1.2.3 From 8ba14654282ed6bb386d0a2f1ab329bfb293403f Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Feb 2014 17:09:54 +0100 Subject: timer: Spare IPI when deferrable timer is queued on idle remote targets When a timer is enqueued or modified on a remote target, the latter is expected to see and handle this timer on its next tick. However if the target is idle and CONFIG_NO_HZ_IDLE=y, the CPU may be sleeping tickless and the timer may be ignored. wake_up_nohz_cpu() takes care of that by setting TIF_NEED_RESCHED and sending an IPI to idle targets so that the tick is reevaluated on the idle loop through the tick_nohz_idle_*() APIs. Now this is all performed regardless of the power properties of the timer. If the timer is deferrable, idle targets don't need to be woken up. Only the next busy tick needs to care about it, and no IPI kick is needed for that to happen. So let's spare the IPI on idle targets when the timer is deferrable. Meanwhile we keep the current behaviour on full dynticks targets. We can spare IPIs on idle full dynticks targets as well but some tricky races against idle_cpu() must be dealt with all along to make sure that the timer is well handled after idle exit. We can deal with that later since NO_HZ_FULL already has more important powersaving issues. Reported-by: Thomas Gleixner Signed-off-by: Viresh Kumar Cc: Ingo Molnar Cc: Paul Gortmaker Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/CAKohpomMZ0TAN2e6N76_g4ZRzxd5vZ1XfuZfxrP7GMxfTNiLVw@mail.gmail.com Signed-off-by: Frederic Weisbecker --- kernel/timer.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index accfd241b9e5..b75e7893be14 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -939,8 +939,15 @@ void add_timer_on(struct timer_list *timer, int cpu) * with the timer by holding the timer base lock. This also * makes sure that a CPU on the way to stop its tick can not * evaluate the timer wheel. + * + * Spare the IPI for deferrable timers on idle targets though. + * The next busy ticks will take care of it. Except full dynticks + * require special care against races with idle_cpu(), lets deal + * with that later. */ - wake_up_nohz_cpu(cpu); + if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) + wake_up_nohz_cpu(cpu); + spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); -- cgit v1.2.3 From f96a34e27df19335155394a235ea3a096bc52a71 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 6 Feb 2014 13:36:21 -0500 Subject: nohz: ensure users are aware boot CPU is not NO_HZ_FULL This bit of information is in the Kconfig help text: "Note the boot CPU will still be kept outside the range to handle the timekeeping duty." However neither the variable NO_HZ_FULL_ALL, nor the prompt conveys this important detail, so let's add it to the prompt to make it more explicitly obvious to the average user. Acked-by: Paul E. McKenney Signed-off-by: Paul Gortmaker Cc: Ingo Molnar Cc: Paul Gortmaker Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1391711781-7466-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Frederic Weisbecker --- kernel/time/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 3ce6e8c5f3fc..f448513a45ed 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -124,7 +124,7 @@ config NO_HZ_FULL endchoice config NO_HZ_FULL_ALL - bool "Full dynticks system on all CPUs by default" + bool "Full dynticks system on all CPUs by default (except CPU 0)" depends on NO_HZ_FULL help If the user doesn't pass the nohz_full boot option to -- cgit v1.2.3 From e4178d809fdaee32a56833fff1f5056c99e90a1a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 17 Feb 2014 12:24:45 -0800 Subject: printk: fix syslog() overflowing user buffer This is not a buffer overflow in the traditional sense: we don't overflow any *kernel* buffers, but we do mis-count the amount of data we copy back to user space for the SYSLOG_ACTION_READ_ALL case. In particular, if the user buffer is too small to hold everything, and *if* there is a continuation line at just the right place, we can end up giving the user more data than he asked for. The reason is that we first count up the number of bytes all the log records contains, then we walk the records again until we've skipped the records at the beginning that won't fit, and then we walk the rest of the records and copy them to the user space buffer. And in between that "skip the initial records that won't fit" and the "copy the records that *will* fit to user space", we reset the 'prev' variable that contained the record information for the last record not copied. That meant that when we started copying to user space, we now had a different character count than what we had originally calculated in the first record walk-through. 
The fix is to simply not clear the 'prev' flags value (in both cases where we had the same logic: syslog_print_all and kmsg_dump_get_buffer: the latter is used for pstore-like dumping) Reported-and-tested-by: Debabrata Banerjee Acked-by: Kay Sievers Cc: Greg Kroah-Hartman Cc: Jeff Mahoney Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b1d255f04135..4dae9cbe9259 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1076,7 +1076,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) next_seq = log_next_seq; len = 0; - prev = 0; while (len >= 0 && seq < next_seq) { struct printk_log *msg = log_from_idx(idx); int textlen; @@ -2788,7 +2787,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, next_idx = idx; l = 0; - prev = 0; while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); -- cgit v1.2.3 From 3660c2813fb6d0ba48ee44bcbf9feddf7218c11d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Dec 2013 09:24:02 -0800 Subject: rcu: Add ACCESS_ONCE() to ->n_force_qs_lh accesses The ->n_force_qs_lh field is accessed without the benefit of any synchronization, so this commit adds the needed ACCESS_ONCE() wrappers. Yes, increments to ->n_force_qs_lh can be lost, but contention should be low and the field is strictly statistical in nature, so this is not a problem. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_trace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3d116cd072d..e64157798624 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2304,7 +2304,7 @@ static void force_quiescent_state(struct rcu_state *rsp) if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); if (ret) { - rsp->n_force_qs_lh++; + ACCESS_ONCE(rsp->n_force_qs_lh)++; return; } rnp_old = rnp; @@ -2316,7 +2316,7 @@ static void force_quiescent_state(struct rcu_state *rsp) smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - rsp->n_force_qs_lh++; + ACCESS_ONCE(rsp->n_force_qs_lh)++; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 4def475336d4..d1f1e64a6d72 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -273,7 +273,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); + ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); -- cgit v1.2.3 From 87de1cfdc55b16b794e245b07322340725149d62 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Dec 2013 10:02:52 -0800 Subject: rcu: Stop tracking FSF's postal address All of the RCU source files have the usual GPL header, which contains a long-obsolete postal address for FSF. To avoid the need to track the FSF office's movements, this commit substitutes the URL where GPL may be found. 
Reported-by: Greg KH Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcu.h | 4 ++-- kernel/rcu/srcu.c | 4 ++-- kernel/rcu/tiny.c | 4 ++-- kernel/rcu/tiny_plugin.h | 4 ++-- kernel/rcu/torture.c | 4 ++-- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree.h | 4 ++-- kernel/rcu/tree_plugin.h | 4 ++-- kernel/rcu/tree_trace.c | 4 ++-- kernel/rcu/update.c | 4 ++-- 10 files changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 79c3877e9c5b..1bd787fddcb2 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2011 * diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 3318d8284384..5db7e9272d37 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 1254f312d024..53b95bbf4abb 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 280d06cae352..431528520562 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -14,8 +14,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (c) 2010 Linaro * diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c index 732f8ae3086a..ab7dd192a50b 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/torture.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (C) IBM Corporation, 2005, 2006 * diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e64157798624..321feef0f5c0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8c19873f1ac9..75dc3c39a02a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -13,8 +13,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6e2ef4b2b920..f9b9cdd36c8d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -14,8 +14,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright Red Hat, 2009 * Copyright IBM Corporation, 2009 diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index d1f1e64a6d72..5cdc62e1beeb 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
* * Copyright IBM Corporation, 2008 * diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c54609faf233..fd0d5b5b8e7c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2001 * -- cgit v1.2.3 From cb1e78cfa267453bb19e7edafd214c03834b664c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Dec 2013 18:42:03 -0800 Subject: rcu: Remove ACCESS_ONCE() from jiffies Because jiffies is one of a very few variables marked "volatile", there is no need to use ACCESS_ONCE() when accessing it. This commit therefore removes the redundant ACCESS_ONCE() wrappers. Reported by: Eric Dumazet Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/torture.c | 4 ++-- kernel/rcu/tree.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c index ab7dd192a50b..022c5312b725 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/torture.c @@ -1352,7 +1352,7 @@ rcu_torture_shutdown(void *arg) unsigned long jiffies_snap; VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); + jiffies_snap = jiffies; while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && !kthread_should_stop()) { delta = shutdown_time - jiffies_snap; @@ -1361,7 +1361,7 @@ rcu_torture_shutdown(void *arg) "rcu_torture_shutdown task: %lu jiffies remaining\n", torture_type, delta); schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); + jiffies_snap = jiffies; } if (kthread_should_stop()) { VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 321feef0f5c0..73c3cd2b87ac 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -837,7 +837,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * to the next. Only do this for the primary flavor of RCU. */ if (rdp->rsp == rcu_state && - ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) { + ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { rdp->rsp->jiffies_resched += 5; resched_cpu(rdp->cpu); } @@ -847,7 +847,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, static void record_gp_stall_check_time(struct rcu_state *rsp) { - unsigned long j = ACCESS_ONCE(jiffies); + unsigned long j = jiffies; unsigned long j1; rsp->gp_start = j; @@ -1005,7 +1005,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) return; - j = ACCESS_ONCE(jiffies); + j = jiffies; /* * Lots of memory barriers to reject false positives. 
-- cgit v1.2.3 From 52e2bb958ac4f9b3c4bdd78606d279852fd72922 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sun, 9 Feb 2014 14:35:11 +0100 Subject: rcu: Disambiguate CONFIG_RCU_NOCB_CPUs This commit fixes a grammar issue in the rcu_nohz_full_cpu() comment header, so that it is clear that the plural is CPUs not Kconfig options. Signed-off-by: Paul Bolle Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f9b9cdd36c8d..fffe4178a23d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2893,7 +2893,7 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) * CPU unless the grace period has extended for too long. * * This code relies on the fact that all NO_HZ_FULL CPUs are also - * CONFIG_RCU_NOCB_CPUs. + * CONFIG_RCU_NOCB_CPU CPUs. */ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) { -- cgit v1.2.3 From ae1670339c95c3ff96ab10582506cf827c5fecc8 Mon Sep 17 00:00:00 2001 From: Shaibal Dutta Date: Fri, 31 Jan 2014 11:53:06 -0800 Subject: rcu: Move SRCU grace period work to power efficient workqueue For better use of CPU idle time, allow the scheduler to select the CPU on which the SRCU grace period work would be scheduled. This improves idle residency time and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Dipankar Sarma Signed-off-by: Shaibal Dutta [zoran.markovic@linaro.org: Rebased to latest kernel version. Added commit message. Fixed code alignment.] Signed-off-by: Zoran Markovic Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/srcu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 5db7e9272d37..2359779e1daa 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -398,7 +398,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; - schedule_delayed_work(&sp->work, 0); + queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); } @@ -674,7 +674,8 @@ static void srcu_reschedule(struct srcu_struct *sp) } if (pending) - schedule_delayed_work(&sp->work, SRCU_INTERVAL); + queue_delayed_work(system_power_efficient_wq, + &sp->work, SRCU_INTERVAL); } /* -- cgit v1.2.3 From 2f33b512a5460578f6cf11d7b7867bed53157c7c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Nov 2013 18:25:48 -0800 Subject: rcu: Optimize rcu_is_nocb_cpu() for RCU_NOCB_CPU_ALL If CONFIG_RCU_NOCB_CPU_ALL=y, then rcu_is_nocb_cpu() will always return true, however, the current version nevertheless checks rcu_nocb_mask. This commit therefore creates a static inline implementation of rcu_is_nocb_cpu() that unconditionally returns true when CONFIG_RCU_NOCB_CPU_ALL=y. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6e2ef4b2b920..39a50b918bff 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2101,6 +2101,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_waitqueue_head(&rnp->nocb_gp_wq[1]); } +#ifndef CONFIG_RCU_NOCB_CPU_ALL /* Is the specified CPU a no-CPUs CPU? 
*/ bool rcu_is_nocb_cpu(int cpu) { @@ -2108,6 +2109,7 @@ bool rcu_is_nocb_cpu(int cpu) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Enqueue the specified string of rcu_head structures onto the specified -- cgit v1.2.3 From ffa83fb565fbc397cbafb4b71fd1cce276d4c3b6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Nov 2013 19:27:16 -0800 Subject: rcu: Optimize rcu_needs_cpu() for RCU_NOCB_CPU_ALL If CONFIG_RCU_NOCB_CPU_ALL=y, then rcu_needs_cpu() will always return false, however, the current version nevertheless checks for RCU callbacks. This commit therefore creates a static inline implementation of rcu_needs_cpu() that unconditionally returns false when CONFIG_RCU_NOCB_CPU_ALL=y. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3d116cd072d..c2c8234a0291 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2880,7 +2880,7 @@ static int rcu_pending(int cpu) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) +static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) { bool al = true; bool hc = false; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 39a50b918bff..820b06aefbee 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1586,11 +1586,13 @@ static void rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. 
*/ +#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu, NULL); } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up @@ -1696,6 +1698,7 @@ static bool rcu_try_advance_all_cbs(void) * * The caller must have disabled interrupts. */ +#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(int cpu, unsigned long *dj) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); @@ -1726,6 +1729,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) } return 0; } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Prepare a CPU for idle from an RCU perspective. The first major task -- cgit v1.2.3 From f1f399d1281ea339a08469f7e58193624992f620 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Nov 2013 21:08:07 -0800 Subject: rcu: Optimize RCU_FAST_NO_HZ for RCU_NOCB_CPU_ALL If CONFIG_RCU_NOCB_CPU_ALL=y, then no CPU will ever have RCU callbacks because these callbacks will instead be handled by the rcuo kthreads. However, the current version of RCU_FAST_NO_HZ nevertheless checks for RCU callbacks. This commit therefore creates static inline implementations of rcu_prepare_for_idle() and rcu_cleanup_after_idle() that are no-ops when CONFIG_RCU_NOCB_CPU_ALL=y. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 820b06aefbee..41afc3fbfb6c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1658,7 +1658,7 @@ extern int tick_nohz_active; * only if it has been awhile since the last time we did so. Afterwards, * if there are any callbacks ready for immediate invocation, return true. 
*/ -static bool rcu_try_advance_all_cbs(void) +static bool __maybe_unused rcu_try_advance_all_cbs(void) { bool cbs_ready = false; struct rcu_data *rdp; @@ -1743,6 +1743,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) */ static void rcu_prepare_for_idle(int cpu) { +#ifndef CONFIG_RCU_NOCB_CPU_ALL struct rcu_data *rdp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); struct rcu_node *rnp; @@ -1794,6 +1795,7 @@ static void rcu_prepare_for_idle(int cpu) rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } /* @@ -1803,11 +1805,12 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_cleanup_after_idle(int cpu) { - +#ifndef CONFIG_RCU_NOCB_CPU_ALL if (rcu_is_nocb_cpu(cpu)) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } /* -- cgit v1.2.3 From 45a22f4c11fef4ecd5c61c0a299cd3f23d77be8e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Feb 2014 13:09:50 +0100 Subject: inotify: Fix reporting of cookies for inotify events My rework of handling of notification events (namely commit 7053aee26a35 "fsnotify: do not share events between notification groups") broke sending of cookies with inotify events. We didn't propagate the value passed to fsnotify() properly and passed 4 uninitialized bytes to userspace instead (so it is also an information leak). Sadly I didn't notice this during my testing because inotify cookies aren't used very much and LTP inotify tests ignore them. Fix the problem by passing the cookie value properly. 
Fixes: 7053aee26a3548ebaba046ae2e52396ccf56ac6c Reported-by: Vegard Nossum Signed-off-by: Jan Kara --- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 67ccf0e7cca9..135944a7b28a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -916,7 +916,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { return 0; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 2596fac5dcb4..70b4554d2fbe 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -471,7 +471,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *dname) + const unsigned char *dname, u32 cookie) { struct inode *inode; struct audit_parent *parent; -- cgit v1.2.3 From 5bdfff96c69a4d5ab9c49e60abf9e070ecd2acbb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 15 Feb 2014 22:02:28 +0800 Subject: workqueue: ensure @task is valid across kthread_stop() When a kworker should die, the kworker is notified through WORKER_DIE flag instead of kthread_should_stop(). This, IIRC, is primarily to keep the test synchronized inside worker_pool lock. WORKER_DIE is first set while holding pool->lock, the lock is dropped and kthread_stop() is called. Unfortunately, this means that there's a slight chance that the target kworker may see WORKER_DIE before kthread_stop() finishes and exits and frees the target task before or during kthread_stop(). Fix it by pinning the target task before setting WORKER_DIE and putting it after kthread_stop() is done. tj: Improved patch description and comment.
Moved pinning above WORKER_DIE to better signify what it's protecting. CC: stable@vger.kernel.org Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 82ef9f3b7473..193e977a10ea 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1851,6 +1851,12 @@ static void destroy_worker(struct worker *worker) if (worker->flags & WORKER_IDLE) pool->nr_idle--; + /* + * Once WORKER_DIE is set, the kworker may destroy itself at any + * point. Pin to ensure the task stays until we're done with it. + */ + get_task_struct(worker->task); + list_del_init(&worker->entry); worker->flags |= WORKER_DIE; @@ -1859,6 +1865,7 @@ static void destroy_worker(struct worker *worker) spin_unlock_irq(&pool->lock); kthread_stop(worker->task); + put_task_struct(worker->task); kfree(worker); spin_lock_irq(&pool->lock); -- cgit v1.2.3 From dc5736ed7aaf942caaac0c15af74a018e04ec79d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Feb 2014 10:41:50 +0800 Subject: cgroup: add a validation check to cgroup_add_cftypes() Fengguang reported this bug: BUG: unable to handle kernel NULL pointer dereference at 0000003c IP: [] cgroup_cfts_commit+0x27/0x1c1 ... Call Trace: [] ? kmem_cache_alloc_trace+0x33f/0x3b7 [] cgroup_add_cftypes+0x8f/0xca [] cgroup_init+0x6a/0x26a [] start_kernel+0x4d7/0x57a [] i386_start_kernel+0x92/0x96 This happens in a corner case. If CGROUP_SCHED=y but CFS_BANDWIDTH=n && FAIR_GROUP_SCHED=n && RT_GROUP_SCHED=n, we have: cpu_files[] = { { } /* terminate */ } When we pass cpu_files to cgroup_apply_cftypes(), as cpu_files[0].ss is NULL, we'll access NULL pointer. The bug was introduced by commit de00ffa56ea3132c6013fc8f07133b8a1014cf53 ("cgroup: make cgroup_subsys->base_cftypes use cgroup_add_cftypes()").
Reported-by: Fengguang Wu Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3fe01102607b..771d1b8aaae9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2348,6 +2348,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; + if (!cfts || cfts[0].name[0] == '\0') + return 0; + ret = cgroup_init_cftypes(ss, cfts); if (ret) return ret; -- cgit v1.2.3 From 532de3fc72adc2a6525c4d53c07bf81e1732083d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 13:29:31 -0500 Subject: cgroup: update cgroup_enable_task_cg_lists() to grab siglock Currently, there's nothing preventing cgroup_enable_task_cg_lists() from missing set PF_EXITING and race against cgroup_exit(). Depending on the timing, cgroup_exit() may finish with the task still linked on css_set leading to list corruption. Fix it by grabbing siglock in cgroup_enable_task_cg_lists() so that PF_EXITING is guaranteed to be visible. This whole on-demand cg_list optimization is extremely fragile and has ample possibility to lead to bugs which can cause things like once-a-year oops during boot. I'm wondering whether the better approach would be just adding "cgroup_disable=all" handling which disables the whole cgroup rather than tempting fate with this on-demand craziness. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 68d87103b493..105f273b6f86 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2905,9 +2905,14 @@ static void cgroup_enable_task_cg_lists(void) * We should check if the process is exiting, otherwise * it will race with cgroup_exit() in that the list * entry won't be deleted though the process has exited. 
+ * Do it while holding siglock so that we don't end up + * racing against cgroup_exit(). */ + spin_lock_irq(&p->sighand->siglock); if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) list_add(&p->cg_list, &task_css_set(p)->tasks); + spin_unlock_irq(&p->sighand->siglock); + task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); -- cgit v1.2.3 From e227867f12302633737bd2a48a10a9a72c0630cb Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Tue, 18 Feb 2014 22:54:36 +0900 Subject: treewide: Fix typo in Documentation/DocBook This patch fix spelling typo in Documentation/DocBook. It is because .html and .xml files are generated by make htmldocs, I have to fix a typo within the source files. Signed-off-by: Masanari Iida Acked-by: Randy Dunlap Signed-off-by: Jiri Kosina --- kernel/relay.c | 2 +- kernel/signal.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 5001c9887db1..52d6a6f56261 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -227,7 +227,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) * relay_remove_buf - remove a channel buffer * @kref: target kernel reference that contains the relay buffer * - * Removes the file from the fileystem, which also frees the + * Removes the file from the filesystem, which also frees the * rchan_buf_struct and the channel buffer. Should only be called from * kref_put(). */ diff --git a/kernel/signal.c b/kernel/signal.c index 940b30ee9a30..f4812283c6e9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2382,7 +2382,7 @@ relock: * @regs: user register state * @stepping: nonzero if debugger single-step or block-step in use * - * This function should be called when a signal has succesfully been + * This function should be called when a signal has successfully been * delivered. 
It updates the blocked signals accordingly (@ka->sa.sa_mask * is always blocked, and the signal itself is blocked unless %SA_NODEFER * is set in @ka->sa.sa_flags. Tracing is notified. -- cgit v1.2.3 From 392b21897d6cbff55c5b28910dfdc74e3020de6c Mon Sep 17 00:00:00 2001 From: Brian Campbell Date: Sun, 16 Feb 2014 22:58:12 -0500 Subject: user_namespace.c: Remove duplicated word in comment Signed-off-by: Brian Campbell Signed-off-by: Jiri Kosina --- kernel/user_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 240fb62cf394..dd06439b9c84 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -225,7 +225,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test - * for and handle handle INVALID_UID being returned. INVALID_UID + * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) -- cgit v1.2.3 From 5ae8aabeaec3fe69c4fb21cbe5b17b72b35b5892 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 17 Feb 2014 10:45:36 -0800 Subject: sched_clock: Prevent callers from seeing half-updated data The generic sched_clock registration function was previously done lockless, due to the fact that it was expected to be called only once. However, now there are systems that may register multiple sched_clock sources, for which the lack of locking has caused problems: If two sched_clock sources are registered we may end up in a situation where a call to sched_clock() may be accessing the epoch cycle count for the old counter and the cycle count for the new counter. This can lead to confusing results where sched_clock() values jump and then are reset to 0 (due to the way the registration function forces the epoch_ns to be 0).
Fix this by reorganizing the registration function to hold the seqlock for as short a time as possible while we update the clock_data structure for a new counter. We also put any accumulated time into epoch_ns instead of resetting the time to 0 so that the clock doesn't reset after each successful registration. [jstultz: Added extra context to the commit message] Reported-by: Will Deacon Signed-off-by: Stephen Boyd Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Cc: Peter Zijlstra Cc: Josh Cartwright Link: http://lkml.kernel.org/r/1392662736-7803-2-git-send-email-john.stultz@linaro.org Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner --- kernel/time/sched_clock.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0abb36464281..4d23dc4d8139 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -116,20 +116,42 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) void __init sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { + u64 res, wrap, new_mask, new_epoch, cyc, ns; + u32 new_mult, new_shift; + ktime_t new_wrap_kt; unsigned long r; - u64 res, wrap; char r_unit; if (cd.rate > rate) return; WARN_ON(!irqs_disabled()); - read_sched_clock = read; - sched_clock_mask = CLOCKSOURCE_MASK(bits); - cd.rate = rate; /* calculate the mult/shift to convert counter ticks to ns. 
*/ - clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); + clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); + + new_mask = CLOCKSOURCE_MASK(bits); + + /* calculate how many ns until we wrap */ + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); + new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + + /* update epoch for new counter and update epoch_ns from old counter*/ + new_epoch = read(); + cyc = read_sched_clock(); + ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, + cd.mult, cd.shift); + + raw_write_seqcount_begin(&cd.seq); + read_sched_clock = read; + sched_clock_mask = new_mask; + cd.rate = rate; + cd.wrap_kt = new_wrap_kt; + cd.mult = new_mult; + cd.shift = new_shift; + cd.epoch_cyc = new_epoch; + cd.epoch_ns = ns; + raw_write_seqcount_end(&cd.seq); r = rate; if (r >= 4000000) { @@ -141,22 +163,12 @@ void __init sched_clock_register(u64 (*read)(void), int bits, } else r_unit = ' '; - /* calculate how many ns until we wrap */ - wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); - cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); - /* calculate the ns resolution of this counter */ - res = cyc_to_ns(1ULL, cd.mult, cd.shift); + res = cyc_to_ns(1ULL, new_mult, new_shift); + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", bits, r, r_unit, res, wrap); - update_sched_clock(); - - /* - * Ensure that sched_clock() starts off at 0ns - */ - cd.epoch_ns = 0; - /* Enable IRQ time accounting if we have a fast enough sched_clock */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); -- cgit v1.2.3 From 18258f7239a61d8929b8e0c7b6d46c446459074c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 15 Feb 2014 00:55:18 +0000 Subject: genirq: Provide synchronize_hardirq() synchronize_irq() waits for hard irq and threaded handlers to complete before returning. 
For some special cases we only need to make sure that the hard interrupt part of the irq line is not in progress when we disabled the - possibly shared - interrupt at the device level. A proper use case for this was provided by Russell. The sdhci driver requires some irq triggered functions to be run in thread context. The current implementation of the thread context is a sdio private kthread construct, which has quite some shortcomings. These can be avoided when the thread is directly associated to the device interrupt via the generic threaded irq infrastructure. Though there is a corner case related to run time power management where one side disables the device interrupts at the device level and needs to make sure, that an already running hard interrupt handler has completed before proceeding further. Though that hard interrupt handler might wake the associated thread, which in turn can request the runtime PM to reenable the device. Using synchronize_irq() leads to an immediate deadlock of the irq thread waiting for the PM lock and the synchronize_irq() waiting for the irq thread to complete. Due to the fact that it is sufficient for this case to ensure that no hard irq handler is executing a new function which avoids the check for the thread is required. Add a function, which just monitors the hard irq parts and ignores the threaded handlers. 
Signed-off-by: Thomas Gleixner Tested-by: Russell King Cc: Chris Ball Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140215003823.653236081@linutronix.de --- kernel/irq/manage.c | 70 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c43b17..274ba9238fb7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -32,24 +32,10 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif -/** - * synchronize_irq - wait for pending IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for - * - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. - */ -void synchronize_irq(unsigned int irq) +static void __synchronize_hardirq(struct irq_desc *desc) { - struct irq_desc *desc = irq_to_desc(irq); bool inprogress; - if (!desc) - return; - do { unsigned long flags; @@ -67,12 +53,56 @@ void synchronize_irq(unsigned int irq) /* Oops, that failed? */ } while (inprogress); +} - /* - * We made sure that no hardirq handler is running. Now verify - * that no threaded handlers are active. - */ - wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); +/** + * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for + * + * This function waits for any pending hard IRQ handlers for this + * interrupt to complete before returning. If you use this + * function while holding a resource the IRQ handler may need you + * will deadlock. It does not take associated threaded handlers + * into account. 
+ * + * Do not use this for shutdown scenarios where you must be sure + * that all parts (hardirq and threaded handler) have completed. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_hardirq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) + __synchronize_hardirq(desc); +} +EXPORT_SYMBOL(synchronize_hardirq); + +/** + * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for + * + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + __synchronize_hardirq(desc); + /* + * We made sure that no hardirq handler is + * running. Now verify that no threaded handlers are + * active. + */ + wait_event(desc->wait_for_threads, + !atomic_read(&desc->threads_active)); + } } EXPORT_SYMBOL(synchronize_irq); -- cgit v1.2.3 From a92444c6b2225a9115d661c950cb48a22aeace20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 15 Feb 2014 00:55:19 +0000 Subject: genirq: Provide irq_wake_thread() In course of the sdhci/sdio discussion with Russell about killing the sdio kthread hackery we discovered the need to be able to wake an interrupt thread from software. The rationale for this is, that sdio hardware can lack proper interrupt support for certain features. So the driver needs to poll the status registers, but at the same time it needs to be woken up by an hardware interrupt. To be able to get rid of the home brewn kthread construct of sdio we need a way to wake an irq thread independent of an actual hardware interrupt. Provide an irq_wake_thread() function which wakes up the thread which is associated to a given dev_id. 
This allows sdio to invoke the irq thread from the hardware irq handler via the IRQ_WAKE_THREAD return value and provides a possibility to wake it via a timer for the polling scenarios. That allows to simplify the sdio logic significantly. Signed-off-by: Thomas Gleixner Cc: Russell King Cc: Chris Ball Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140215003823.772565780@linutronix.de --- kernel/irq/handle.c | 4 ++-- kernel/irq/internals.h | 1 + kernel/irq/manage.c | 27 +++++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 131ca176b497..bfec453557b4 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,7 +51,7 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) +void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) { /* * In case the thread crashed and was killed we just pretend that @@ -157,7 +157,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) break; } - irq_wake_thread(desc, action); + __irq_wake_thread(desc, action); /* Fall through to add to randomness */ case IRQ_HANDLED: diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 001fa5bab490..d61ac29e32d0 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -82,6 +82,7 @@ irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); bool irq_wait_for_poll(struct irq_desc *desc); +void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 274ba9238fb7..54eb5c99351b 100644 --- a/kernel/irq/manage.c +++ 
b/kernel/irq/manage.c @@ -911,6 +911,33 @@ static int irq_thread(void *data) return 0; } +/** + * irq_wake_thread - wake the irq thread for the action identified by dev_id + * @irq: Interrupt line + * @dev_id: Device identity for which the thread should be woken + * + */ +void irq_wake_thread(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + unsigned long flags; + + if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) + return; + + raw_spin_lock_irqsave(&desc->lock, flags); + for (action = desc->action; action; action = action->next) { + if (action->dev_id == dev_id) { + if (action->thread) + __irq_wake_thread(desc, action); + break; + } + } + raw_spin_unlock_irqrestore(&desc->lock, flags); +} +EXPORT_SYMBOL_GPL(irq_wake_thread); + static void irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads) -- cgit v1.2.3 From b04c644e670f79417f1728e6be310cfd8e6a921b Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 10 Feb 2014 16:13:57 +0800 Subject: genirq: Update the a comment typo Change the comment "chasnge" to "change". Signed-off-by: Chuansheng Liu Link: http://lkml.kernel.org/r/1392020037-5484-2-git-send-email-chuansheng.liu@intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 54eb5c99351b..ada0c548c36a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -757,7 +757,7 @@ out_unlock: #ifdef CONFIG_SMP /* - * Check whether we need to chasnge the affinity of the interrupt thread. + * Check whether we need to change the affinity of the interrupt thread. 
*/ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) -- cgit v1.2.3 From 8c1a49aedb73fb2f15aaa32ad9e2e1c4289f45cb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 10 Jan 2014 11:13:54 -0500 Subject: tracing: Pass trace_array to set_flag callback As options (flags) may affect instances instead of being global the set_flag() callbacks need to receive the trace_array descriptor of the instance they will be modifying. Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 3 ++- kernel/trace/trace.c | 18 ++++++++++-------- kernel/trace/trace.h | 3 ++- kernel/trace/trace_functions.c | 3 ++- kernel/trace/trace_functions_graph.c | 3 ++- kernel/trace/trace_irqsoff.c | 6 ++++-- kernel/trace/trace_nop.c | 2 +- kernel/trace/trace_sched_wakeup.c | 6 ++++-- 8 files changed, 27 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b418cb0d7242..0d758ca61933 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1427,7 +1427,8 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) return print_one_line(iter, true); } -static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +static int +blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 815c878f409b..d7dfc7efc4bf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -73,7 +73,8 @@ static struct tracer_flags dummy_tracer_flags = { .opts = dummy_tracer_opt }; -static int dummy_set_flag(u32 old_flags, u32 bit, int set) +static int +dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return 0; } @@ -3339,13 +3340,14 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) return 0; } -static int __set_tracer_option(struct tracer *trace, 
+static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { + struct tracer *trace = tr->current_trace; int ret; - ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); + ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); if (ret) return ret; @@ -3357,8 +3359,9 @@ static int __set_tracer_option(struct tracer *trace, } /* Try to assign a tracer specific option */ -static int set_tracer_option(struct tracer *trace, char *cmp, int neg) +static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { + struct tracer *trace = tr->current_trace; struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; int i; @@ -3367,8 +3370,7 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) - return __set_tracer_option(trace, trace->flags, - opts, neg); + return __set_tracer_option(tr, trace->flags, opts, neg); } return -EINVAL; @@ -3440,7 +3442,7 @@ static int trace_set_options(struct trace_array *tr, char *option) /* If no option could be set, test the specific tracer options */ if (!trace_options[i]) - ret = set_tracer_option(tr->current_trace, cmp, neg); + ret = set_tracer_option(tr, cmp, neg); mutex_unlock(&trace_types_lock); @@ -5689,7 +5691,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); - ret = __set_tracer_option(topt->tr->current_trace, topt->flags, + ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 02b592f2d4b7..649a23d421c1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -355,7 +355,8 @@ struct tracer { void (*print_header)(struct seq_file *m); enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you 
handled the flag setting, return 0 */ - int (*set_flag)(u32 old_flags, u32 bit, int set); + int (*set_flag)(struct trace_array *tr, + u32 old_flags, u32 bit, int set); /* Return 0 if OK with change, else return non-zero */ int (*flag_changed)(struct tracer *tracer, u32 mask, int set); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 38fe1483c508..85e517e84f50 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -175,7 +175,8 @@ static void tracing_stop_function_trace(void) unregister_ftrace_function(&trace_ops); } -static int func_set_flag(u32 old_flags, u32 bit, int set) +static int +func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { switch (bit) { case TRACE_FUNC_OPT_STACK: diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0b99120d395c..deff11200261 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1476,7 +1476,8 @@ void graph_trace_close(struct trace_iterator *iter) } } -static int func_graph_set_flag(u32 old_flags, u32 bit, int set) +static int +func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { if (bit == TRACE_GRAPH_PRINT_IRQS) ftrace_graph_skip_irqs = !set; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2aefbee93a6d..fd99b0c183ac 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -160,7 +160,8 @@ static struct ftrace_ops trace_ops __read_mostly = #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { int cpu; @@ -266,7 +267,8 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +static int +irqsoff_set_flag(struct trace_array *tr, 
u32 old_flags, u32 bit, int set) { return -EINVAL; } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 394f94417e2f..f3984098c0d7 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -62,7 +62,7 @@ static void nop_trace_reset(struct trace_array *tr) * If you don't implement it, then the flag setting will be * automatically accepted. */ -static int nop_set_flag(u32 old_flags, u32 bit, int set) +static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* * Note that you don't need to update nop_flags.val yourself. diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 6e32635e5e57..f0bbdc261028 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -209,7 +209,8 @@ static void stop_func_tracer(int graph) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { if (!(bit & TRACE_DISPLAY_GRAPH)) @@ -311,7 +312,8 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return -EINVAL; } -- cgit v1.2.3 From bf6065b5c7014ab30383405718c7a6b96d2cbdb2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 10 Jan 2014 17:51:01 -0500 Subject: tracing: Pass trace_array to flag_changed callback As options (flags) may affect instances instead of being global the flag_changed() callbacks need to receive the trace_array descriptor of the instance they will be modifying. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_irqsoff.c | 4 +++- kernel/trace/trace_sched_wakeup.c | 4 +++- 4 files changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7dfc7efc4bf..ee8da93e91e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3393,7 +3393,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) /* Give the tracer a chance to approve the change */ if (tr->current_trace->flag_changed) - if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) + if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; if (enabled) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 649a23d421c1..36e44732c650 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -358,7 +358,7 @@ struct tracer { int (*set_flag)(struct trace_array *tr, u32 old_flags, u32 bit, int set); /* Return 0 if OK with change, else return non-zero */ - int (*flag_changed)(struct tracer *tracer, + int (*flag_changed)(struct trace_array *tr, u32 mask, int set); struct tracer *next; struct tracer_flags *flags; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index fd99b0c183ac..4bf812f454e6 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -572,8 +572,10 @@ static void irqsoff_function_set(int set) unregister_irqsoff_function(is_graph()); } -static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) +static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) { + struct tracer *tracer = tr->current_trace; + if (mask & TRACE_ITER_FUNCTION) irqsoff_function_set(set); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f0bbdc261028..e14da5e97a69 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -179,8 +179,10 @@ static 
void wakeup_function_set(int set) unregister_wakeup_function(is_graph()); } -static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) +static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) { + struct tracer *tracer = tr->current_trace; + if (mask & TRACE_ITER_FUNCTION) wakeup_function_set(set); -- cgit v1.2.3 From 607e2ea167e56db84387f3ab97e59a862e101cab Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 6 Nov 2013 22:42:48 -0500 Subject: tracing: Set up infrastructure to allow tracers for instances Currently the tracers (function, function_graph, irqsoff, etc) can only be used by the top level tracing directory (not for instances). This sets up the infrastructure to allow instances to be able to run a separate tracer apart from what the top level tracing is doing. As tracers need to adapt for being used by instances, the tracers must flag if they can be used by instances or not. Currently only the 'nop' tracer can be used by all instances. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 72 ++++++++++++++++++++++++++++++++++++++---------- kernel/trace/trace.h | 1 + kernel/trace/trace_nop.c | 3 +- 3 files changed, 60 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ee8da93e91e0..944cd021aabf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -119,7 +119,7 @@ enum ftrace_dump_mode ftrace_dump_on_oops; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; -static int tracing_set_tracer(const char *buf); +static int tracing_set_tracer(struct trace_array *tr, const char *buf); #define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; @@ -1231,7 +1231,7 @@ int register_tracer(struct tracer *type) printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup?
*/ - tracing_set_tracer(type->name); + tracing_set_tracer(&global_trace, type->name); default_bootup_tracer = NULL; /* disable other selftests, since this will break it. */ tracing_selftest_disabled = true; @@ -3122,27 +3122,52 @@ static int tracing_open(struct inode *inode, struct file *file) return ret; } +/* + * Some tracers are not suitable for instance buffers. + * A tracer is always available for the global array (toplevel) + * or if it explicitly states that it is. + */ +static bool +trace_ok_for_array(struct tracer *t, struct trace_array *tr) +{ + return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; +} + +/* Find the next tracer that this trace array may use */ +static struct tracer * +get_tracer_for_array(struct trace_array *tr, struct tracer *t) +{ + while (t && !trace_ok_for_array(t, tr)) + t = t->next; + + return t; +} + static void * t_next(struct seq_file *m, void *v, loff_t *pos) { + struct trace_array *tr = m->private; struct tracer *t = v; (*pos)++; if (t) - t = t->next; + t = get_tracer_for_array(tr, t->next); return t; } static void *t_start(struct seq_file *m, loff_t *pos) { + struct trace_array *tr = m->private; struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); - for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) - ; + + t = get_tracer_for_array(tr, trace_types); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; return t; } @@ -3177,10 +3202,21 @@ static const struct seq_operations show_traces_seq_ops = { static int show_traces_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + if (tracing_disabled) return -ENODEV; - return seq_open(file, &show_traces_seq_ops); + ret = seq_open(file, &show_traces_seq_ops); + if (ret) + return ret; + + m = file->private_data; + m->private = tr; + + return 0; } static ssize_t @@ -3871,10 +3907,9 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer); static void 
destroy_trace_option_files(struct trace_option_dentry *topts); -static int tracing_set_tracer(const char *buf) +static int tracing_set_tracer(struct trace_array *tr, const char *buf) { static struct trace_option_dentry *topts; - struct trace_array *tr = &global_trace; struct tracer *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; @@ -3902,6 +3937,12 @@ static int tracing_set_tracer(const char *buf) if (t == tr->current_trace) goto out; + /* Some tracers are only allowed for the top level buffer */ + if (!trace_ok_for_array(t, tr)) { + ret = -EINVAL; + goto out; + } + trace_branch_disable(); tr->current_trace->enabled = false; @@ -3958,6 +3999,7 @@ static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; int i; size_t ret; @@ -3977,7 +4019,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) buf[i] = 0; - err = tracing_set_tracer(buf); + err = tracing_set_tracer(tr, buf); if (err) return err; @@ -6193,6 +6235,12 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; + trace_create_file("available_tracers", 0444, d_tracer, + tr, &show_traces_fops); + + trace_create_file("current_tracer", 0644, d_tracer, + tr, &set_tracer_fops); + trace_create_file("tracing_cpumask", 0644, d_tracer, tr, &tracing_cpumask_fops); @@ -6245,12 +6293,6 @@ static __init int tracer_init_debugfs(void) init_tracer_debugfs(&global_trace, d_tracer); - trace_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); - - trace_create_file("current_tracer", 0644, d_tracer, - &global_trace, &set_tracer_fops); - #ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, &tracing_max_lat_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 36e44732c650..ea51bb2004d2 100644 --- 
a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -364,6 +364,7 @@ struct tracer { struct tracer_flags *flags; bool print_max; bool enabled; + bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; #endif diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index f3984098c0d7..69a5cc94c01a 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -96,6 +96,7 @@ struct tracer nop_trace __read_mostly = .selftest = trace_selftest_startup_nop, #endif .flags = &nop_flags, - .set_flag = nop_set_flag + .set_flag = nop_set_flag, + .allow_instances = true, }; -- cgit v1.2.3 From f1b21c9a40704dfdf7b8423c7d2969ea31c9857d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 12:33:33 -0500 Subject: tracing: Only let top level have option files Currently, only the top level instance can have tracing options. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 944cd021aabf..da9543cdbe7a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3968,9 +3968,11 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } #endif - destroy_trace_option_files(topts); - - topts = create_trace_option_files(tr, t); + /* Currently, only the top instance has options */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + destroy_trace_option_files(topts); + topts = create_trace_option_files(tr, t); + } #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { -- cgit v1.2.3 From e6435e96ec6f31a05690876a19e63e451f7b37e2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 10 Jan 2014 14:31:31 -0500 Subject: ftrace: Copy ops private to global_ops private If global_ops function is being called directly, instead of the global_ops list function, set the global_ops private to be the same as the ops private that's being 
called directly. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cd7f76d1eb86..98ae4ed965db 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -244,7 +244,11 @@ static void control_ops_free(struct ftrace_ops *ops) static void update_global_ops(void) { - ftrace_func_t func; + ftrace_func_t func = ftrace_global_list_func; + void *private = NULL; + + /* The list has its own recursion protection. */ + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; /* * If there's only one function registered, then call that @@ -254,23 +258,17 @@ static void update_global_ops(void) if (ftrace_global_list == &ftrace_list_end || ftrace_global_list->next == &ftrace_list_end) { func = ftrace_global_list->func; + private = ftrace_global_list->private; /* * As we are calling the function directly. * If it does not have recursion protection, * the function_trace_op needs to be updated * accordingly. */ - if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) - global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; - else + if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)) global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; - } else { - func = ftrace_global_list_func; - /* The list has its own recursion protection. 
*/ - global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; } - /* If we filter on pids, update to use the pid function */ if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); @@ -278,6 +276,7 @@ static void update_global_ops(void) } global_ops.func = func; + global_ops.private = private; } static void ftrace_sync(struct work_struct *work) -- cgit v1.2.3 From 6b450d2533e2c1c71fbe7f1bdce0bb1c9f813030 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 08:43:01 -0500 Subject: tracing: Disable tracers before deletion of instance When an instance is about to be deleted, make sure the tracer is set to nop. If it isn't reset the tracer and set it to the nop tracer, otherwise memory leaks and bad pointers may result. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index da9543cdbe7a..7d5913bb46e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3907,6 +3907,23 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer); static void destroy_trace_option_files(struct trace_option_dentry *topts); +/* + * Used to clear out the tracer before deletion of an instance. + * Must have trace_types_lock held. 
+ */ +static void tracing_set_nop(struct trace_array *tr) +{ + if (tr->current_trace == &nop_trace) + return; + + tr->current_trace->enabled = false; + + if (tr->current_trace->reset) + tr->current_trace->reset(tr); + + tr->current_trace = &nop_trace; +} + static int tracing_set_tracer(struct trace_array *tr, const char *buf) { static struct trace_option_dentry *topts; @@ -6142,6 +6159,7 @@ static int instance_delete(const char *name) list_del(&tr->list); + tracing_set_nop(tr); event_trace_del_tracer(tr); debugfs_remove_recursive(tr->dir); free_percpu(tr->trace_buffer.data); -- cgit v1.2.3 From 50512ab576e1ce29953c9259e1f36ce16f350f20 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 08:52:35 -0500 Subject: tracing: Convert tracer->enabled to counter As tracers will soon be used by instances, the tracer enabled field needs to be converted to a counter instead of a boolean. This counter is protected by the trace_types_lock mutex. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- kernel/trace/trace.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7d5913bb46e8..f9f22c435036 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3916,7 +3916,7 @@ static void tracing_set_nop(struct trace_array *tr) if (tr->current_trace == &nop_trace) return; - tr->current_trace->enabled = false; + tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); @@ -3962,7 +3962,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) trace_branch_disable(); - tr->current_trace->enabled = false; + tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); @@ -4006,7 +4006,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) } tr->current_trace = t; - tr->current_trace->enabled = true; + tr->current_trace->enabled++; trace_branch_enable(tr); 
out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ea51bb2004d2..86915b220bbe 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -362,8 +362,8 @@ struct tracer { u32 mask, int set); struct tracer *next; struct tracer_flags *flags; + int enabled; bool print_max; - bool enabled; bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; -- cgit v1.2.3 From f20a580627f43e73e4e57cb37e3864080ca06088 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 7 Nov 2013 20:08:58 -0500 Subject: ftrace: Allow instances to use function tracing Allow instances (sub-buffers) to enable function tracing. Each instance will have its own function tracing capability. For now, instances will not have function stack tracing, nor will they be able to pick and choose what functions they can trace. Picking and choosing their own functions will come later. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 5 ++ kernel/trace/trace_functions.c | 116 +++++++++++++++++++++++++++-------------- 2 files changed, 81 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 86915b220bbe..35cca055da0f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -210,6 +210,11 @@ struct trace_array { struct list_head events; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; +#ifdef CONFIG_FUNCTION_TRACER + struct ftrace_ops *ops; + /* function tracing enabled */ + int function_enabled; +#endif }; enum { diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 85e517e84f50..3f8dc1ce8b9c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,33 +13,83 @@ #include #include #include +#include #include #include "trace.h" -/* function tracing enabled */ -static int ftrace_function_enabled; +static void tracing_start_function_trace(struct trace_array *tr); +static void
tracing_stop_function_trace(struct trace_array *tr); +static void +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs); +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs); +static struct ftrace_ops trace_ops; +static struct ftrace_ops trace_stack_ops; +static struct tracer_flags func_flags; + +/* Our option */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, +}; + +static int allocate_ftrace_ops(struct trace_array *tr) +{ + struct ftrace_ops *ops; -static struct trace_array *func_trace; + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; -static void tracing_start_function_trace(void); -static void tracing_stop_function_trace(void); + /* Currently only the non stack verision is supported */ + ops->func = function_trace_call; + ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; + + tr->ops = ops; + ops->private = tr; + return 0; +} static int function_trace_init(struct trace_array *tr) { - func_trace = tr; + struct ftrace_ops *ops; + int ret; + + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + /* There's only one global tr */ + if (!trace_ops.private) { + trace_ops.private = tr; + trace_stack_ops.private = tr; + } + + if (func_flags.val & TRACE_FUNC_OPT_STACK) + ops = &trace_stack_ops; + else + ops = &trace_ops; + tr->ops = ops; + } else { + ret = allocate_ftrace_ops(tr); + if (ret) + return ret; + } + tr->trace_buffer.cpu = get_cpu(); put_cpu(); tracing_start_cmdline_record(); - tracing_start_function_trace(); + tracing_start_function_trace(tr); return 0; } static void function_trace_reset(struct trace_array *tr) { - tracing_stop_function_trace(); + tracing_stop_function_trace(tr); tracing_stop_cmdline_record(); + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) + kfree(tr->ops); + tr->ops = NULL; } static void function_trace_start(struct trace_array *tr) @@ -47,25 +97,18 @@ static void function_trace_start(struct trace_array *tr) 
tracing_reset_online_cpus(&tr->trace_buffer); } -/* Our option */ -enum { - TRACE_FUNC_OPT_STACK = 0x1, -}; - -static struct tracer_flags func_flags; - static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct trace_array *tr = func_trace; + struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned long flags; int bit; int cpu; int pc; - if (unlikely(!ftrace_function_enabled)) + if (unlikely(!tr->function_enabled)) return; pc = preempt_count(); @@ -91,14 +134,14 @@ static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct trace_array *tr = func_trace; + struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned long flags; long disabled; int cpu; int pc; - if (unlikely(!ftrace_function_enabled)) + if (unlikely(!tr->function_enabled)) return; /* @@ -128,7 +171,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, local_irq_restore(flags); } - static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, @@ -153,26 +195,17 @@ static struct tracer_flags func_flags = { .opts = func_opts }; -static void tracing_start_function_trace(void) +static void tracing_start_function_trace(struct trace_array *tr) { - ftrace_function_enabled = 0; - - if (func_flags.val & TRACE_FUNC_OPT_STACK) - register_ftrace_function(&trace_stack_ops); - else - register_ftrace_function(&trace_ops); - - ftrace_function_enabled = 1; + tr->function_enabled = 0; + register_ftrace_function(tr->ops); + tr->function_enabled = 1; } -static void tracing_stop_function_trace(void) +static void tracing_stop_function_trace(struct trace_array *tr) { - ftrace_function_enabled = 0; - - if (func_flags.val & TRACE_FUNC_OPT_STACK) - unregister_ftrace_function(&trace_stack_ops); - else - unregister_ftrace_function(&trace_ops); + tr->function_enabled = 0; + 
unregister_ftrace_function(tr->ops); } static int @@ -184,12 +217,14 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) break; + unregister_ftrace_function(tr->ops); + if (set) { - unregister_ftrace_function(&trace_ops); - register_ftrace_function(&trace_stack_ops); + tr->ops = &trace_stack_ops; + register_ftrace_function(tr->ops); } else { - unregister_ftrace_function(&trace_stack_ops); - register_ftrace_function(&trace_ops); + tr->ops = &trace_ops; + register_ftrace_function(tr->ops); } break; @@ -209,6 +244,7 @@ static struct tracer function_trace __tracer_data = .wait_pipe = poll_wait_pipe, .flags = &func_flags, .set_flag = func_set_flag, + .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, #endif -- cgit v1.2.3 From e3b3e2e847080e3cc14bee778c6ced3d59bfd76c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 11 Nov 2013 23:07:14 -0500 Subject: ftrace: Pass in global_ops for use with filtering files In preparation for having the function tracing instances be able to filter on functions, the generic filter functions must first be converted to take in the global_ops as a parameter. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 98ae4ed965db..2b3e23991c8a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2870,7 +2870,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, inode, file); } @@ -2878,7 +2880,9 @@ ftrace_filter_open(struct inode *inode, struct file *file) static int ftrace_notrace_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE, inode, file); } @@ -4118,10 +4122,10 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) d_tracer, NULL, &ftrace_enabled_fops); trace_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); + &global_ops, &ftrace_filter_fops); trace_create_file("set_ftrace_notrace", 0644, d_tracer, - NULL, &ftrace_notrace_fops); + &global_ops, &ftrace_notrace_fops); #ifdef CONFIG_FUNCTION_GRAPH_TRACER trace_create_file("set_graph_function", 0444, d_tracer, -- cgit v1.2.3 From 591dffdade9f07692a7dd3ed16830ec24e901ece Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 10 Jan 2014 16:17:45 -0500 Subject: ftrace: Allow for function tracing instance to filter functions Create "set_ftrace_filter" and "set_ftrace_notrace" files in the instance directories to let users filter the functions to trace for the given instance. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 39 ++++++++++++++++++++++++++++++++++----- kernel/trace/trace.c | 4 ++++ kernel/trace/trace.h | 25 ++++++++++++++++++++++++- kernel/trace/trace_functions.c | 40 ++++++++++++++++++++++++++++++++-------- 4 files changed, 94 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2b3e23991c8a..dcee546f21bc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -436,6 +436,9 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { + if (ops->flags & FTRACE_OPS_FL_DELETED) + return -EINVAL; + if (FTRACE_WARN_ON(ops == &global_ops)) return -EINVAL; @@ -4112,6 +4115,36 @@ static const struct file_operations ftrace_graph_notrace_fops = { }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent) +{ + + trace_create_file("set_ftrace_filter", 0644, parent, + ops, &ftrace_filter_fops); + + trace_create_file("set_ftrace_notrace", 0644, parent, + ops, &ftrace_notrace_fops); +} + +/* + * The name "destroy_filter_files" is really a misnomer. Although + * in the future, it may actualy delete the files, but this is + * really intended to make sure the ops passed in are disabled + * and that when this function returns, the caller is free to + * free the ops. + * + * The "destroy" name is only to match the "create" name that this + * should be paired with. 
+ */ +void ftrace_destroy_filter_files(struct ftrace_ops *ops) +{ + mutex_lock(&ftrace_lock); + if (ops->flags & FTRACE_OPS_FL_ENABLED) + ftrace_shutdown(ops, 0); + ops->flags |= FTRACE_OPS_FL_DELETED; + mutex_unlock(&ftrace_lock); +} + static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { @@ -4121,11 +4154,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("enabled_functions", 0444, d_tracer, NULL, &ftrace_enabled_fops); - trace_create_file("set_ftrace_filter", 0644, d_tracer, - &global_ops, &ftrace_filter_fops); - - trace_create_file("set_ftrace_notrace", 0644, d_tracer, - &global_ops, &ftrace_notrace_fops); + ftrace_create_filter_files(&global_ops, d_tracer); #ifdef CONFIG_FUNCTION_GRAPH_TRACER trace_create_file("set_graph_function", 0444, d_tracer, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f9f22c435036..d95ec2876bbb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6161,6 +6161,7 @@ static int instance_delete(const char *name) tracing_set_nop(tr); event_trace_del_tracer(tr); + ftrace_destroy_function_files(tr); debugfs_remove_recursive(tr->dir); free_percpu(tr->trace_buffer.data); ring_buffer_free(tr->trace_buffer.buffer); @@ -6291,6 +6292,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); + if (ftrace_create_function_files(tr, d_tracer)) + WARN(1, "Could not allocate function filter files"); + #ifdef CONFIG_TRACER_SNAPSHOT trace_create_file("snapshot", 0644, d_tracer, tr, &snapshot_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 35cca055da0f..ffc314b7e92b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -819,13 +819,36 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } extern int ftrace_is_dead(void); +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent); +void 
ftrace_destroy_function_files(struct trace_array *tr); #else static inline int ftrace_trace_task(struct task_struct *task) { return 1; } static inline int ftrace_is_dead(void) { return 0; } -#endif +static inline int +ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + return 0; +} +static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +#endif /* CONFIG_FUNCTION_TRACER */ + +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent); +void ftrace_destroy_filter_files(struct ftrace_ops *ops); +#else +/* + * The ops parameter passed in is usually undefined. + * This must be a macro. + */ +#define ftrace_create_filter_files(ops, parent) do { } while (0) +#define ftrace_destroy_filter_files(ops) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ int ftrace_event_is_function(struct ftrace_event_call *call); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 3f8dc1ce8b9c..5b781d2be383 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -52,10 +52,34 @@ static int allocate_ftrace_ops(struct trace_array *tr) return 0; } + +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + int ret; + + /* The top level array uses the "global_ops". 
*/ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) { + ret = allocate_ftrace_ops(tr); + if (ret) + return ret; + } + + ftrace_create_filter_files(tr->ops, parent); + + return 0; +} + +void ftrace_destroy_function_files(struct trace_array *tr) +{ + ftrace_destroy_filter_files(tr->ops); + kfree(tr->ops); + tr->ops = NULL; +} + static int function_trace_init(struct trace_array *tr) { struct ftrace_ops *ops; - int ret; if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { /* There's only one global tr */ @@ -69,10 +93,13 @@ static int function_trace_init(struct trace_array *tr) else ops = &trace_ops; tr->ops = ops; - } else { - ret = allocate_ftrace_ops(tr); - if (ret) - return ret; + } else if (!tr->ops) { + /* + * Instance trace_arrays get their ops allocated + * at instance creation. Unless it failed + * the allocation. + */ + return -ENOMEM; } tr->trace_buffer.cpu = get_cpu(); @@ -87,9 +114,6 @@ static void function_trace_reset(struct trace_array *tr) { tracing_stop_function_trace(tr); tracing_stop_cmdline_record(); - if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) - kfree(tr->ops); - tr->ops = NULL; } static void function_trace_start(struct trace_array *tr) -- cgit v1.2.3 From a43b97043048eac1686f409af7ad3bb8071b9d83 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 17 Jan 2014 17:08:36 +0900 Subject: tracing/uprobes: Rename uprobe_{trace,perf}_print() functions The uprobe_{trace,perf}_print functions are misnomers since what they do is not printing. There's also a real print function named print_uprobe_event() so they'll only increase confusion IMHO. Rename them with double underscores to follow convention of kprobe. 
Link: http://lkml.kernel.org/r/1389946120-19610-2-git-send-email-namhyung@kernel.org Reviewed-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Srikar Dronamraju Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 79e52d93860b..c5d2612bf233 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -758,7 +758,7 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) mutex_unlock(&ucb->mutex); } -static void uprobe_trace_print(struct trace_uprobe *tu, +static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; @@ -807,14 +807,14 @@ out: static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) { if (!is_ret_probe(tu)) - uprobe_trace_print(tu, 0, regs); + __uprobe_trace_func(tu, 0, regs); return 0; } static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs) { - uprobe_trace_print(tu, func, regs); + __uprobe_trace_func(tu, func, regs); } /* Event entry printers */ @@ -1014,7 +1014,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, return ret; } -static void uprobe_perf_print(struct trace_uprobe *tu, +static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs) { struct ftrace_event_call *call = &tu->tp.call; @@ -1078,14 +1078,14 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - uprobe_perf_print(tu, 0, regs); + __uprobe_perf_func(tu, 0, regs); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs) { - uprobe_perf_print(tu, func, regs); + __uprobe_perf_func(tu, func, regs); } #endif /* 
CONFIG_PERF_EVENTS */ -- cgit v1.2.3 From dd9fa555d7bbfcc7dbc63eb744806e9f6cb62e9f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 17 Jan 2014 17:08:37 +0900 Subject: tracing/uprobes: Move argument fetching to uprobe_dispatcher() A single uprobe event might serve different users like ftrace and perf. And this is especially important for upcoming multi buffer support. But in this case it'll fetch (same) data from userspace multiple times. So move it to the beginning of the dispatcher function and reuse it for each users. Link: http://lkml.kernel.org/r/1389946120-19610-3-git-send-email-namhyung@kernel.org Reviewed-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Srikar Dronamraju Cc: zhangwei(Jovi) Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 93 +++++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c5d2612bf233..d83155e0da78 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -759,30 +759,25 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) } static void __uprobe_trace_func(struct trace_uprobe *tu, - unsigned long func, struct pt_regs *regs) + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - struct uprobe_cpu_buffer *ucb; void *data; - int size, dsize, esize; + int size, esize; struct ftrace_event_call *call = &tu->tp.call; - dsize = __get_data_size(&tu->tp, regs); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE)) + if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) return; - ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); - + esize = 
SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, 0, 0); if (!event) - goto out; + return; entry = ring_buffer_event_data(event); if (is_ret_probe(tu)) { @@ -798,23 +793,22 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, 0); - -out: - uprobe_buffer_put(ucb); } /* uprobe handler */ -static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { if (!is_ret_probe(tu)) - __uprobe_trace_func(tu, 0, regs); + __uprobe_trace_func(tu, 0, regs, ucb, dsize); return 0; } static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, - struct pt_regs *regs) + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { - __uprobe_trace_func(tu, func, regs); + __uprobe_trace_func(tu, func, regs, ucb, dsize); } /* Event entry printers */ @@ -1015,30 +1009,23 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, } static void __uprobe_perf_func(struct trace_uprobe *tu, - unsigned long func, struct pt_regs *regs) + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { struct ftrace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; struct hlist_head *head; - struct uprobe_cpu_buffer *ucb; void *data; - int size, dsize, esize; + int size, esize; int rctx; - dsize = __get_data_size(&tu->tp, regs); esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - if (WARN_ON_ONCE(!uprobe_cpu_buffer)) - return; - size = esize + tu->tp.size + dsize; size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; - ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, 
dsize); - preempt_disable(); head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) @@ -1068,24 +1055,25 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); out: preempt_enable(); - uprobe_buffer_put(ucb); } /* uprobe profile handler */ -static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - __uprobe_perf_func(tu, 0, regs); + __uprobe_perf_func(tu, 0, regs, ucb, dsize); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, - struct pt_regs *regs) + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { - __uprobe_perf_func(tu, func, regs); + __uprobe_perf_func(tu, func, regs, ucb, dsize); } #endif /* CONFIG_PERF_EVENTS */ @@ -1127,8 +1115,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; int ret = 0; + tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; @@ -1137,13 +1128,29 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) current->utask->vaddr = (unsigned long) &udd; +#ifdef CONFIG_PERF_EVENTS + if ((tu->tp.flags & TP_FLAG_TRACE) == 0 && + !uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; +#endif + + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + if (tu->tp.flags & TP_FLAG_TRACE) - ret |= uprobe_trace_func(tu, regs); + ret |= uprobe_trace_func(tu, regs, ucb, dsize); #ifdef 
CONFIG_PERF_EVENTS if (tu->tp.flags & TP_FLAG_PROFILE) - ret |= uprobe_perf_func(tu, regs); + ret |= uprobe_perf_func(tu, regs, ucb, dsize); #endif + uprobe_buffer_put(ucb); return ret; } @@ -1152,6 +1159,8 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; tu = container_of(con, struct trace_uprobe, consumer); @@ -1160,13 +1169,23 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, current->utask->vaddr = (unsigned long) &udd; + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + if (tu->tp.flags & TP_FLAG_TRACE) - uretprobe_trace_func(tu, func, regs); + uretprobe_trace_func(tu, func, regs, ucb, dsize); #ifdef CONFIG_PERF_EVENTS if (tu->tp.flags & TP_FLAG_PROFILE) - uretprobe_perf_func(tu, func, regs); + uretprobe_perf_func(tu, func, regs, ucb, dsize); #endif + uprobe_buffer_put(ucb); return 0; } -- cgit v1.2.3 From 70ed91c6ec7f8bf20369634017d887d48ac979d2 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Fri, 17 Jan 2014 17:08:38 +0900 Subject: tracing/uprobes: Support ftrace_event_file base multibuffer Support multi-buffer on uprobe-based dynamic events by using ftrace_event_file. This patch is based kprobe-based dynamic events multibuffer support work initially, commited by Masami(commit 41a7dd420c), but revised as below: Oleg changed the kprobe-based multibuffer design from array-pointers of ftrace_event_file into simple list, so this patch also change to the list design. rcu_read_lock/unlock added into uprobe_trace_func/uretprobe_trace_func, to synchronize with ftrace_event_file list add and delete. 
Even though we allow multi-uprobes instances now, but TP_FLAG_PROFILE/TP_FLAG_TRACE are still mutually exclusive in probe_event_enable currently, this means we cannot allow one user is using uprobe-tracer, and another user is using perf-probe on same uprobe concurrently. (Perhaps this will be fix in future, kprobe don't have this limitation now) Link: http://lkml.kernel.org/r/1389946120-19610-4-git-send-email-namhyung@kernel.org Reviewed-by: Masami Hiramatsu Reviewed-by: Oleg Nesterov Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Srikar Dronamraju Signed-off-by: zhangwei(Jovi) Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 17 ------- kernel/trace/trace_probe.h | 17 +++++++ kernel/trace/trace_uprobe.c | 105 +++++++++++++++++++++++++++++++++++--------- 3 files changed, 101 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index bdbae450c13e..d021d21dd150 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,11 +35,6 @@ struct trace_kprobe { struct trace_probe tp; }; -struct event_file_link { - struct ftrace_event_file *file; - struct list_head list; -}; - #define SIZEOF_TRACE_KPROBE(n) \ (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) @@ -387,18 +382,6 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) return ret; } -static struct event_file_link * -find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) -{ - struct event_file_link *link; - - list_for_each_entry(link, &tp->files, list) - if (link->file == file) - return link; - - return NULL; -} - /* * Disable trace_probe * if the file is NULL, disable "perf" handler, or disable "trace" handler. 
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index b73574a5f429..fb1ab5dfbd42 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -288,6 +288,11 @@ struct trace_probe { struct probe_arg args[]; }; +struct event_file_link { + struct ftrace_event_file *file; + struct list_head list; +}; + static inline bool trace_probe_is_enabled(struct trace_probe *tp) { return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); @@ -316,6 +321,18 @@ static inline int is_good_name(const char *name) return 1; } +static inline struct event_file_link * +find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) +{ + struct event_file_link *link; + + list_for_each_entry(link, &tp->files, list) + if (link->file == file) + return link; + + return NULL; +} + extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, struct probe_arg *parg, bool is_return, bool is_kprobe); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d83155e0da78..349c6df9e332 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -260,6 +260,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) goto error; INIT_LIST_HEAD(&tu->list); + INIT_LIST_HEAD(&tu->tp.files); tu->consumer.handler = uprobe_dispatcher; if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; @@ -760,7 +761,8 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer *ucb, int dsize, + struct ftrace_event_file *ftrace_file) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; @@ -769,13 +771,15 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, int size, esize; struct ftrace_event_call *call = &tu->tp.call; + WARN_ON(call != ftrace_file->event_call); + if (WARN_ON_ONCE(tu->tp.size + dsize > 
PAGE_SIZE)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; - event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, 0, 0); + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, size, 0, 0); if (!event) return; @@ -799,8 +803,16 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, struct uprobe_cpu_buffer *ucb, int dsize) { - if (!is_ret_probe(tu)) - __uprobe_trace_func(tu, 0, regs, ucb, dsize); + struct event_file_link *link; + + if (is_ret_probe(tu)) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(link, &tu->tp.files, list) + __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + rcu_read_unlock(); + return 0; } @@ -808,7 +820,12 @@ static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, struct uprobe_cpu_buffer *ucb, int dsize) { - __uprobe_trace_func(tu, func, regs, ucb, dsize); + struct event_file_link *link; + + rcu_read_lock(); + list_for_each_entry_rcu(link, &tu->tp.files, list) + __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + rcu_read_unlock(); } /* Event entry printers */ @@ -855,12 +872,31 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm); static int -probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) +probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, + filter_func_t filter) { - int ret = 0; + bool enabled = trace_probe_is_enabled(&tu->tp); + struct event_file_link *link = NULL; + int ret; + + if (file) { + if (tu->tp.flags & TP_FLAG_PROFILE) + return -EINTR; - if (trace_probe_is_enabled(&tu->tp)) - return -EINTR; + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + + link->file = file; + list_add_tail_rcu(&link->list, &tu->tp.files); + + tu->tp.flags |= TP_FLAG_TRACE; + } else { + if 
(tu->tp.flags & TP_FLAG_TRACE) + return -EINTR; + + tu->tp.flags |= TP_FLAG_PROFILE; + } ret = uprobe_buffer_enable(); if (ret < 0) @@ -868,24 +904,49 @@ probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - tu->tp.flags |= flag; + if (enabled) + return 0; + tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); - if (ret) - tu->tp.flags &= ~flag; + if (ret) { + if (file) { + list_del(&link->list); + kfree(link); + tu->tp.flags &= ~TP_FLAG_TRACE; + } else + tu->tp.flags &= ~TP_FLAG_PROFILE; + } return ret; } -static void probe_event_disable(struct trace_uprobe *tu, int flag) +static void +probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file) { if (!trace_probe_is_enabled(&tu->tp)) return; + if (file) { + struct event_file_link *link; + + link = find_event_file_link(&tu->tp, file); + if (!link) + return; + + list_del_rcu(&link->list); + /* synchronize with u{,ret}probe_trace_func */ + synchronize_sched(); + kfree(link); + + if (!list_empty(&tu->tp.files)) + return; + } + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->tp.flags &= ~flag; + tu->tp.flags &= file ? 
~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; uprobe_buffer_disable(); } @@ -1077,25 +1138,27 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, } #endif /* CONFIG_PERF_EVENTS */ -static -int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) +static int +trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, + void *data) { struct trace_uprobe *tu = event->data; + struct ftrace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, TP_FLAG_TRACE, NULL); + return probe_event_enable(tu, file, NULL); case TRACE_REG_UNREGISTER: - probe_event_disable(tu, TP_FLAG_TRACE); + probe_event_disable(tu, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); + return probe_event_enable(tu, NULL, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: - probe_event_disable(tu, TP_FLAG_PROFILE); + probe_event_disable(tu, NULL); return 0; case TRACE_REG_PERF_OPEN: -- cgit v1.2.3 From ca3b162021a421b38a9cd7b66555b9b01568dc9d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 17 Jan 2014 17:08:39 +0900 Subject: tracing/uprobes: Support event triggering Add support for event triggering to uprobes. This is same as kprobes support added by Tom (plus cleanup by Steven). 
Link: http://lkml.kernel.org/r/1389946120-19610-5-git-send-email-namhyung@kernel.org Reviewed-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Srikar Dronamraju Cc: zhangwei(Jovi) Cc: Tom Zanussi Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 349c6df9e332..01fcb0db75cb 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -776,6 +776,9 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) return; + if (ftrace_trigger_soft_disabled(ftrace_file)) + return; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, @@ -795,8 +798,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, memcpy(data, ucb->buf, tu->tp.size + dsize); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0); } /* uprobe handler */ -- cgit v1.2.3 From 43fe98913c9f67e3b523615ee3316f9520a623e0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 17 Jan 2014 17:08:40 +0900 Subject: tracing/uprobes: Support mix of ftrace and perf It seems there's no reason to prevent mixed use of ftrace and perf for a single uprobe event. At least the kprobes already support it. 
Link: http://lkml.kernel.org/r/1389946120-19610-6-git-send-email-namhyung@kernel.org Reviewed-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Srikar Dronamraju Cc: zhangwei(Jovi) Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 01fcb0db75cb..e4473367e7a4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -882,9 +882,6 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, int ret; if (file) { - if (tu->tp.flags & TP_FLAG_PROFILE) - return -EINTR; - link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) return -ENOMEM; @@ -893,12 +890,8 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, list_add_tail_rcu(&link->list, &tu->tp.files); tu->tp.flags |= TP_FLAG_TRACE; - } else { - if (tu->tp.flags & TP_FLAG_TRACE) - return -EINTR; - + } else tu->tp.flags |= TP_FLAG_PROFILE; - } ret = uprobe_buffer_enable(); if (ret < 0) -- cgit v1.2.3 From e1e232ca6b8faa210e5509f17d55519b4392524f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Feb 2014 23:38:46 -0500 Subject: tracing: Add trace_clock= kernel parameter Being able to change the trace clock at boot can be advantageous if you need a better source of when things happen across CPUs. The default trace clock is the fastest, but it uses local clocks which may not be synced across CPUs and it does not let you know when events took place with respect to events on other CPUs. The global trace clock can help in this case, and if you do not care about timings, the counter "clock" is the best, as that is just a simple atomic counter that is incremented for every event. Usage is to add "trace_clock=counter" on the kernel command line. 
You can replace counter with "global" or any of the clocks listed in /sys/kernel/debug/tracing/trace_clock Suggested-by: Thomas Gleixner Tested-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Appreciated-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 61 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d95ec2876bbb..c90f55d80f86 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -181,6 +181,17 @@ static int __init set_trace_boot_options(char *str) } __setup("trace_options=", set_trace_boot_options); +static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_clock __initdata; + +static int __init set_trace_boot_clock(char *str) +{ + strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); + trace_boot_clock = trace_boot_clock_buf; + return 0; +} +__setup("trace_clock=", set_trace_boot_clock); + unsigned long long ns2usecs(cycle_t nsec) { @@ -4746,25 +4757,10 @@ static int tracing_clock_show(struct seq_file *m, void *v) return 0; } -static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +static int tracing_set_clock(struct trace_array *tr, const char *clockstr) { - struct seq_file *m = filp->private_data; - struct trace_array *tr = m->private; - char buf[64]; - const char *clockstr; int i; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - clockstr = strstrip(buf); - for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { if (strcmp(trace_clocks[i].name, clockstr) == 0) break; @@ -4792,6 +4788,32 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, mutex_unlock(&trace_types_lock); + return 0; +} + +static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct 
seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + char buf[64]; + const char *clockstr; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + clockstr = strstrip(buf); + + ret = tracing_set_clock(tr, clockstr); + if (ret) + return ret; + *fpos += cnt; return cnt; @@ -6574,6 +6596,13 @@ __init static int tracer_alloc_buffers(void) trace_init_cmdlines(); + if (trace_boot_clock) { + ret = tracing_set_clock(&global_trace, trace_boot_clock); + if (ret < 0) + pr_warning("Trace clock %s not defined, going back to default\n", + trace_boot_clock); + } + /* * register_tracer() might reference current_trace, so it * needs to be set before we register anything. This is -- cgit v1.2.3 From 1fcc155351f183e5044180eeb372a8ff47710855 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 19 Feb 2014 15:12:18 -0500 Subject: ftrace: Have static function trace clear ENABLED flag on unregister The ENABLED flag needs to be cleared when a ftrace_ops is unregistered otherwise it wont be able to be registered again. This is only for static tracing and does not affect DYNAMIC_FTRACE at all. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dcee546f21bc..5313c1100d30 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4463,7 +4463,13 @@ static inline void ftrace_startup_enable(int command) { } (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ ___ret; \ }) -# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) +# define ftrace_shutdown(ops, command) \ + ({ \ + int ___ret = __unregister_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) -- cgit v1.2.3 From b080e047a61f7050246ff3081f87832997170d29 Mon Sep 17 00:00:00 2001 From: Brian Campbell Date: Sun, 16 Feb 2014 22:58:12 -0500 Subject: user_namespace.c: Remove duplicated word in comment Signed-off-by: Brian Campbell Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 240fb62cf394..dd06439b9c84 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -225,7 +225,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test - * for and handle handle INVALID_UID being returned. INVALID_UID + * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). 
*/ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) -- cgit v1.2.3 From a53efe5ff88d0283bae8a2c2fa066d0fff31dc91 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 26 Oct 2012 17:17:44 +0200 Subject: sched/mm: call finish_arch_post_lock_switch in idle_task_exit and use_mm The finish_arch_post_lock_switch is called at the end of the task switch after all locks have been released. In concept it is paired with the switch_mm function, but the current code only does the call in finish_task_switch. Add the call to idle_task_exit and use_mm. One use case for the additional calls is s390 which will use finish_arch_post_lock_switch to wait for the completion of TLB flush operations. Signed-off-by: Martin Schwidefsky --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..4b0739c9558e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4692,8 +4692,10 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); - if (mm != &init_mm) + if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + finish_arch_post_lock_switch(); + } mmdrop(mm); } -- cgit v1.2.3 From 3d5f35bdfdef5fd627afe9b4bf9c4f32d17f4593 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 20 Feb 2014 09:19:39 +0100 Subject: sched/deadline: Fix bad accounting of nr_running Rostedt writes: My test suite was locking up hard when enabling mmiotracer. This was due to the mmiotracer placing all but one CPU offline. I found this out when I was able to reproduce the bug with just my stress-cpu-hotplug test. This bug baffled me because it would not always trigger, and would only trigger on the first run after boot up. The stress-cpu-hotplug test would crash hard the first run, or never crash at all. But a new reboot may cause it to crash on the first run again. I spent all week bisecting this, as I couldn't find a consistent reproducer. 
I finally narrowed it down to the sched deadline patches, and even more peculiar, to the commit that added the sched deadline boot up self test to the latency tracer. Then it dawned on me what the bug was. All it took was to run a task under sched deadline to screw up the CPU hot plugging. This explained why it would lock up only on the first run of the stress-cpu-hotplug test. The bug happened when the boot up self test of the schedule latency tracer would test a deadline task. The deadline task would corrupt something that would cause CPU hotplug to fail. If it didn't corrupt it, the stress test would always work (there are no other sched deadline tasks that would run to cause problems). If it did corrupt on boot up, the first test would lock up hard. I proved this theory by running my deadline test program on another box, and then running the stress-cpu-hotplug test, and it would now consistently lock up. I could run stress-cpu-hotplug over and over with no problem, but once I ran the deadline test, the next run of the stress-cpu-hotplug would lock hard. After adding lots of tracing to the code, I found the cause. The function tracer showed that migrate_tasks() was stuck in an infinite loop, where rq->nr_running never equaled 1 to break out of it. When I added a trace_printk() to see what that number was, it was 335 and never decrementing! 
Looking at the deadline code I found: static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { dequeue_dl_entity(&p->dl); dequeue_pushable_dl_task(rq, p); } static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); dec_nr_running(rq); } And this: if (dl_runtime_exceeded(rq, dl_se)) { __dequeue_task_dl(rq, curr, 0); if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) dl_se->dl_throttled = 1; else enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) resched_task(curr); } Notice how we call __dequeue_task_dl() and in the else case we call enqueue_task_dl()? Also notice that dequeue_task_dl() has underscores where enqueue_task_dl() does not. The enqueue_task_dl() calls inc_nr_running(rq), but __dequeue_task_dl() does not. This is where we get nr_running out of sync. [snip] Another point where nr_running can get out of sync is when the dl_timer fires: dl_se->dl_throttled = 0; if (p->on_rq) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (task_has_dl_policy(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_task(rq->curr); This patch does two things: - correctly accounts for throttled tasks (that are now considered !running); - fixes the bug, updating nr_running from {inc,dec}_dl_tasks(), since we otherwise risk updating it twice in some situations (e.g., a task is dequeued while it has exceeded its budget). 
Cc: mingo@redhat.com Cc: torvalds@linux-foundation.org Cc: akpm@linux-foundation.org Reported-by: Steven Rostedt Reviewed-by: Steven Rostedt Tested-by: Steven Rostedt Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392884379-13744-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Thomas Gleixner --- kernel/sched/deadline.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0dd5e0971a07..b819577c21de 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -717,6 +717,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; + inc_nr_running(rq_of_dl_rq(dl_rq)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -730,6 +731,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; + dec_nr_running(rq_of_dl_rq(dl_rq)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -836,8 +838,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); - - inc_nr_running(rq); } static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -850,8 +850,6 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); - - dec_nr_running(rq); } /* -- cgit v1.2.3 From 4df1638cfaf9b2b7ad993979a41965acab9cd156 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 19 Feb 2014 13:53:35 -0500 Subject: sched/deadline: Fix overflow to handle period==0 and deadline!=0 While debugging the crash with the bad nr_running accounting, I hit another bug where, after running my sched deadline test, I was getting failures to take a CPU offline. 
It was giving me a -EBUSY error. Adding a bunch of trace_printk()s around, I found that the cpu notifier that called sched_cpu_inactive() was returning a failure. The overflow value was coming up negative? Talking this over with Juri, the problem is that the total_bw update was supposed to be made by dl_overflow() which, during my tests, seemed to not be called. Adding more trace_printk()s, it wasn't that it wasn't called, but it exited out right away with the check of new_bw being equal to p->dl.dl_bw. The new_bw calculates the ratio between period and runtime. The bug is that if you set a deadline, you do not need to set a period if you plan on the period being equal to the deadline. That is, if period is zero and deadline is not, then the system call should set the period to be equal to the deadline. This is done elsewhere in the code. The fix is easy: check if period is set, and if it is not, then use the deadline. Cc: Juri Lelli Cc: Ingo Molnar Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140219135335.7e74abd4@gandalf.local.home Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..24914488da41 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1952,7 +1952,7 @@ static int dl_overflow(struct task_struct *p, int policy, { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - u64 period = attr->sched_period; + u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; int cpus, err = -1; -- cgit v1.2.3 From e9e7cb38c21c80c82af4b16608bb4c8c5ec6a28e Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 11 Feb 2014 09:24:26 +0100 Subject: sched/core: Fix sched_rt_global_validate Don't compare sysctl_sched_rt_runtime against sysctl_sched_rt_period if the former is equal to RUNTIME_INF, otherwise disabling -rt bandwidth management (with CONFIG_RT_GROUP_SCHED=n) fails. Cc: Ingo Molnar Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392107067-19907-2-git-send-email-juri.lelli@gmail.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 24914488da41..98d33c105252 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7475,7 +7475,8 @@ static int sched_rt_global_validate(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; - if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) + if ((sysctl_sched_rt_runtime != RUNTIME_INF) && + (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) return -EINVAL; return 0; -- cgit v1.2.3 From 495163420ab5398c84af96ca3eae2c6aa4a140da Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 11 Feb 2014 09:24:27 +0100 Subject: sched/core: Make dl_b->lock IRQ safe Fix this lockdep warning: [ 44.804600] ========================================================= [ 44.805746] [ INFO: possible irq lock inversion dependency detected ] [ 44.805746] 3.14.0-rc2-test+ #14 Not tainted [ 44.805746] --------------------------------------------------------- [ 44.805746] bash/3674 just changed the state of lock: [ 44.805746] (&dl_b->lock){+.....}, at: [] sched_rt_handler+0x132/0x248 [ 44.805746] but this lock was taken by another, HARDIRQ-safe lock in the past: [ 44.805746] (&rq->lock){-.-.-.} and interrupts could create inverse lock ordering between them. 
[ 44.805746] [ 44.805746] other info that might help us debug this: [ 44.805746] Possible interrupt unsafe locking scenario: [ 44.805746] [ 44.805746] CPU0 CPU1 [ 44.805746] ---- ---- [ 44.805746] lock(&dl_b->lock); [ 44.805746] local_irq_disable(); [ 44.805746] lock(&rq->lock); [ 44.805746] lock(&dl_b->lock); [ 44.805746] [ 44.805746] lock(&rq->lock); by making dl_b->lock acquiring always IRQ safe. Cc: Ingo Molnar Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392107067-19907-3-git-send-email-juri.lelli@gmail.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 98d33c105252..33d030a133d2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7422,6 +7422,7 @@ static int sched_dl_global_constraints(void) u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); int cpu, ret = 0; + unsigned long flags; /* * Here we want to check the bandwidth not being set to some @@ -7435,10 +7436,10 @@ static int sched_dl_global_constraints(void) for_each_possible_cpu(cpu) { struct dl_bw *dl_b = dl_bw_of(cpu); - raw_spin_lock(&dl_b->lock); + raw_spin_lock_irqsave(&dl_b->lock, flags); if (new_bw < dl_b->total_bw) ret = -EBUSY; - raw_spin_unlock(&dl_b->lock); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); if (ret) break; @@ -7451,6 +7452,7 @@ static void sched_dl_do_global(void) { u64 new_bw = -1; int cpu; + unsigned long flags; def_dl_bandwidth.dl_period = global_rt_period(); def_dl_bandwidth.dl_runtime = global_rt_runtime(); @@ -7464,9 +7466,9 @@ static void sched_dl_do_global(void) for_each_possible_cpu(cpu) { struct dl_bw *dl_b = dl_bw_of(cpu); - raw_spin_lock(&dl_b->lock); + raw_spin_lock_irqsave(&dl_b->lock, flags); dl_b->bw = new_bw; - raw_spin_unlock(&dl_b->lock); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); } } -- cgit v1.2.3 From 
3cf1962cdbf6b3a9e3ef21116d215bbab350ea37 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 18 Feb 2014 17:12:44 -0500 Subject: sched,numa: add cond_resched to task_numa_work Normally task_numa_work scans over a fairly small amount of memory, but it is possible to run into a large unpopulated part of virtual memory, with no pages mapped. In that case, task_numa_work can run for a while, and it may make sense to reschedule as required. Cc: akpm@linux-foundation.org Cc: Andrea Arcangeli Signed-off-by: Rik van Riel Reported-by: Xing Gang Tested-by: Chegu Vinod Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392761566-24834-2-git-send-email-riel@redhat.com Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 966cc2bfcb77..78157099b167 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1757,6 +1757,8 @@ void task_numa_work(struct callback_head *work) start = end; if (pages <= 0) goto out; + + cond_resched(); } while (end != vma->vm_end); } -- cgit v1.2.3 From 4efbc454ba68def5ef285b26ebfcfdb605b52755 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 16 Feb 2014 22:24:17 +0100 Subject: sched: Fix information leak in sys_sched_getattr() We're copying the on-stack structure to userspace, but forgot to give the right number of bytes to copy. This allows the calling process to obtain up to PAGE_SIZE bytes from the stack (and possibly adjacent kernel memory). This fix copies only as much as we actually have on the stack (attr->size defaults to the size of the struct) and leaves the rest of the userspace-provided buffer untouched. Found using kmemcheck + trinity. 
Fixes: d50dde5a10f30 ("sched: Add new scheduler syscalls to support an extended scheduling parameters ABI") Cc: Dario Faggioli Cc: Juri Lelli Cc: Ingo Molnar Signed-off-by: Vegard Nossum Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392585857-10725-1-git-send-email-vegard.nossum@oracle.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 33d030a133d2..a6e7470166c7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3786,7 +3786,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, attr->size = usize; } - ret = copy_to_user(uattr, attr, usize); + ret = copy_to_user(uattr, attr, attr->size); if (ret) return -EFAULT; -- cgit v1.2.3 From 6d35ab48090b10c5ea5604ed5d6e91f302dc6060 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Feb 2014 17:19:29 +0100 Subject: sched: Add 'flags' argument to sched_{set,get}attr() syscalls Because of a recent syscall design debate, it is deemed appropriate for each syscall to have a flags argument for future extension, without immediately requiring new syscalls. Cc: juri.lelli@gmail.com Cc: Ingo Molnar Suggested-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140214161929.GL27965@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a6e7470166c7..6edbef296ece 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3661,13 +3661,14 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) * @pid: the pid in question. * @uattr: structure containing the extended parameters. 
*/ -SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) { struct sched_attr attr; struct task_struct *p; int retval; - if (!uattr || pid < 0) + if (!uattr || pid < 0 || flags) return -EINVAL; if (sched_copy_attr(uattr, &attr)) @@ -3804,8 +3805,8 @@ err_size: * @uattr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp. */ -SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size) +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size, unsigned int, flags) { struct sched_attr attr = { .size = sizeof(struct sched_attr), @@ -3814,7 +3815,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, int retval; if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0) + size < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; rcu_read_lock(); -- cgit v1.2.3 From 82b95800b256205cff2eeab5bbd03430d2d0f20d Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 17 Feb 2014 09:12:33 -0500 Subject: sched/deadline: Test for CPU's presence explicitly A hot-removed CPU may have ID that is numerically larger than the number of existing CPUs in the system (e.g. we can unplug CPU 4 from a system that has CPUs 0, 1 and 4). Thus the WARN_ONs should check whether the CPU in question is currently present, not whether its ID value is less than num_present_cpus(). 
Cc: Ingo Molnar Cc: Juri Lelli Cc: Steven Rostedt Reported-by: Konrad Rzeszutek Wilk Signed-off-by: Boris Ostrovsky Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392646353-1874-1-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Thomas Gleixner --- kernel/sched/cpudeadline.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 045fc74e3f09..5b8838b56d1c 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx) static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); + WARN_ON(!cpu_present(idx) || idx == IDX_INVALID); if (dl_time_before(new_dl, cp->elements[idx].dl)) { cp->elements[idx].dl = new_dl; @@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } out: - WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); + WARN_ON(!cpu_present(best_cpu) && best_cpu != -1); return best_cpu; } @@ -137,7 +137,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) int old_idx, new_cpu; unsigned long flags; - WARN_ON(cpu > num_present_cpus()); + WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); old_idx = cp->cpu_to_idx[cpu]; -- cgit v1.2.3 From 995b9ea440862def83e8fcb1b498e68f93d4af59 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 18 Feb 2014 02:24:13 +0400 Subject: sched/deadline: Remove useless dl_nr_total In deadline class we do not have group scheduling like in RT. dl_nr_total is the same as dl_nr_running. So, one of them should be removed. 
Cc: Ingo Molnar Cc: Juri Lelli Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/368631392675853@web20h.yandex.ru Signed-off-by: Thomas Gleixner --- kernel/sched/deadline.c | 4 +--- kernel/sched/sched.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b819577c21de..15cbc17fbf84 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -121,7 +121,7 @@ static inline void dl_clear_overload(struct rq *rq) static void update_dl_migration(struct dl_rq *dl_rq) { - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { + if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { if (!dl_rq->overloaded) { dl_set_overload(rq_of_dl_rq(dl_rq)); dl_rq->overloaded = 1; @@ -137,7 +137,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) struct task_struct *p = dl_task_of(dl_se); dl_rq = &rq_of_dl_rq(dl_rq)->dl; - dl_rq->dl_nr_total++; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; @@ -149,7 +148,6 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) struct task_struct *p = dl_task_of(dl_se); dl_rq = &rq_of_dl_rq(dl_rq)->dl; - dl_rq->dl_nr_total--; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2119fd20f8b..f964add50f38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -462,7 +462,6 @@ struct dl_rq { } earliest_dl; unsigned long dl_nr_migratory; - unsigned long dl_nr_total; int overloaded; /* -- cgit v1.2.3 From eb7a59b2c888c2518ba2c9d0020343ca71aa9dee Mon Sep 17 00:00:00 2001 From: Michael wang Date: Thu, 20 Feb 2014 11:14:53 +0800 Subject: sched/fair: Reset se-depth when task switched to FAIR Sasha reported: [ 522.645288] BUG: unable to handle kernel NULL pointer dereference at ... [ 522.646271] IP: [] check_preempt_wakeup+0x11f/0x210 ... 
[ 522.650021] Call Trace: [ 522.650021] [ 522.650021] [] check_preempt_curr+0x3d/0xb0 [ 522.650021] [] ttwu_do_wakeup+0x18/0x130 ... which was caused by the se-depth changed during the time when task is not FAIR, and we will use the wrong depth value after it switched back to FAIR. This patch reset the depth at the time when task switched to FAIR, make sure that we always have the correct value when task is FAIR. Cc: Ingo Molnar Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Michael Wang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5305732D.70001@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 235cfa7ad8fc..280da893cd0f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7317,7 +7317,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - if (!p->se.on_rq) + struct sched_entity *se = &p->se; +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Since the real-depth could have been changed (only FAIR + * class maintain depth value), reset depth properly. + */ + se->depth = se->parent ? se->parent->depth + 1 : 0; +#endif + if (!se->on_rq) return; /* -- cgit v1.2.3 From 6e83125c6b151afa139c8852c099d6d92954fe3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 Feb 2014 16:11:48 +0100 Subject: sched/fair: Remove idle_balance() declaration in sched.h Remove idle_balance() from the public life; also reduce some #ifdef clutter by folding the pick_next_task_fair() idle path into idle_balance(). 
Cc: mingo@kernel.org Reported-by: Daniel Lezcano Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140211151148.GP27965@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++------------------ kernel/sched/sched.h | 7 ------- 2 files changed, 29 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 280da893cd0f..40c758bbdd57 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2374,13 +2374,13 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) se->avg.load_avg_contrib >>= NICE_0_SHIFT; } } -#else +#else /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {} -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_task_entity_contrib(struct sched_entity *se) { @@ -2571,6 +2571,8 @@ void idle_exit_fair(struct rq *this_rq) update_rq_runnable_avg(this_rq, 0); } +static int idle_balance(struct rq *this_rq); + #else /* CONFIG_SMP */ static inline void update_entity_load_avg(struct sched_entity *se, @@ -2584,6 +2586,12 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, int sleep) {} static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) {} + +static inline int idle_balance(struct rq *rq) +{ + return 0; +} + #endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -4677,7 +4685,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) struct sched_entity *se; struct task_struct *p; -again: __maybe_unused +again: #ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) goto idle; @@ -4775,18 +4783,8 @@ simple: return p; idle: -#ifdef CONFIG_SMP - 
idle_enter_fair(rq); - /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. - */ - rq->idle_stamp = rq_clock(rq); - if (idle_balance(rq)) { /* drops rq->lock */ - rq->idle_stamp = 0; + if (idle_balance(rq)) /* drops rq->lock */ goto again; - } -#endif return NULL; } @@ -6634,7 +6632,7 @@ out: * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -int idle_balance(struct rq *this_rq) +static int idle_balance(struct rq *this_rq) { struct sched_domain *sd; int pulled_task = 0; @@ -6642,8 +6640,15 @@ int idle_balance(struct rq *this_rq) u64 curr_cost = 0; int this_cpu = this_rq->cpu; + idle_enter_fair(this_rq); + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ + this_rq->idle_stamp = rq_clock(this_rq); + if (this_rq->avg_idle < sysctl_sched_migration_cost) - return 0; + goto out; /* * Drop the rq->lock, but keep IRQ/preempt disabled. @@ -6692,8 +6697,10 @@ int idle_balance(struct rq *this_rq) * While browsing the domains, we released the rq lock. 
* A task could have be enqueued in the meantime */ - if (this_rq->nr_running && !pulled_task) - return 1; + if (this_rq->nr_running && !pulled_task) { + pulled_task = 1; + goto out; + } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* @@ -6706,6 +6713,10 @@ int idle_balance(struct rq *this_rq) if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; +out: + if (pulled_task) + this_rq->idle_stamp = 0; + return pulled_task; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1bf34c257d3b..92018f9821e8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1163,17 +1163,10 @@ extern const struct sched_class idle_sched_class; extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern int idle_balance(struct rq *this_rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -#else /* CONFIG_SMP */ - -static inline void idle_balance(int cpu, struct rq *rq) -{ -} - #endif extern void sysrq_sched_debug_show(void); -- cgit v1.2.3 From 3f1d2a318171bf61850d4e5a72031271e5aada76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Feb 2014 10:49:30 +0100 Subject: sched: Fix hotplug task migration Dan Carpenter reported: > kernel/sched/rt.c:1347 pick_next_task_rt() warn: variable dereferenced before check 'prev' (see line 1338) > kernel/sched/deadline.c:1011 pick_next_task_dl() warn: variable dereferenced before check 'prev' (see line 1005) Kirill also spotted that migrate_tasks() will have an instant NULL deref because pick_next_task() will immediately deref prev. Instead of fixing all the corner cases because migrate_tasks() can pass in a NULL prev task in the unlikely case of hot-un-plug, provide a fake task such that we can remove all the NULL checks from the far more common paths. 
A further problem; not previously spotted; is that because we pushed pre_schedule() and idle_balance() into pick_next_task() we now need to avoid those getting called and pulling more tasks on our dying CPU. We avoid pull_{dl,rt}_task() by setting fake_task.prio to MAX_PRIO+1. We also note that since we call pick_next_task() exactly the amount of times we have runnable tasks present, we should never land in idle_balance(). Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()") Cc: Juri Lelli Cc: Ingo Molnar Cc: Steven Rostedt Reported-by: Kirill Tkhai Reported-by: Dan Carpenter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140212094930.GB3545@laptop.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 18 +++++++++++++++++- kernel/sched/deadline.c | 3 +-- kernel/sched/fair.c | 5 ++--- kernel/sched/idle_task.c | 3 +-- kernel/sched/rt.c | 3 +-- kernel/sched/sched.h | 5 +++++ kernel/sched/stop_task.c | 3 +-- 7 files changed, 28 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fb9764fbc537..49db434a35d0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4681,6 +4681,22 @@ static void calc_load_migrate(struct rq *rq) atomic_long_add(delta, &calc_load_tasks); } +static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +{ +} + +static const struct sched_class fake_sched_class = { + .put_prev_task = put_prev_task_fake, +}; + +static struct task_struct fake_task = { + /* + * Avoid pull_{rt,dl}_task() + */ + .prio = MAX_PRIO + 1, + .sched_class = &fake_sched_class, +}; + /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). 
@@ -4721,7 +4737,7 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; - next = pick_next_task(rq, NULL); + next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ed31ef66ab9d..bfeb84ecc32b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1008,8 +1008,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) if (unlikely(!dl_rq->dl_nr_running)) return NULL; - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40c758bbdd57..e884e45982af 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4690,7 +4690,7 @@ again: if (!cfs_rq->nr_running) goto idle; - if (!prev || prev->sched_class != &fair_sched_class) + if (prev->sched_class != &fair_sched_class) goto simple; /* @@ -4766,8 +4766,7 @@ simple: if (!cfs_rq->nr_running) goto idle; - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); do { se = pick_next_entity(cfs_rq, NULL); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index f7d03af79a5b..53ff9e7c76d2 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -26,8 +26,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct * pick_next_task_idle(struct rq *rq, struct task_struct *prev) { - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); schedstat_inc(rq, sched_goidle); #ifdef CONFIG_SMP diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 72f9ec759972..65c2d6881ac3 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1344,8 +1344,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (rt_rq_throttled(rt_rq)) return NULL; - if (prev) - 
prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); p = _pick_next_task_rt(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 92018f9821e8..d276147ba5e4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1147,6 +1147,11 @@ struct sched_class { #endif }; +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + prev->sched_class->put_prev_task(rq, prev); +} + #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index a4147c9d2017..d6ce65dde541 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -31,8 +31,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) if (!stop || !stop->on_rq) return NULL; - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); stop->se.exec_start = rq_clock_task(rq); -- cgit v1.2.3 From dc87734106bb6e97c92d8bd81f261fb71976ec2c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Feb 2014 15:47:29 +0100 Subject: sched: Remove some #ifdeffery Remove a few gratuitous #ifdefs in pick_next_task*(). 
Cc: Ingo Molnar Cc: Steven Rostedt Cc: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-nnzddp5c4fijyzzxxrwlxghf@git.kernel.org Signed-off-by: Thomas Gleixner --- kernel/sched/deadline.c | 31 +++++++++++++++++++++++++------ kernel/sched/idle_task.c | 4 ---- kernel/sched/rt.c | 41 ++++++++++++++++++++++++++++++----------- kernel/sched/sched.h | 5 +++++ 4 files changed, 60 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index bfeb84ecc32b..3185b775dbf7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -214,6 +214,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq) static int push_dl_task(struct rq *rq); +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return dl_task(prev); +} + +static inline void set_post_schedule(struct rq *rq) +{ + rq->post_schedule = has_pushable_dl_tasks(rq); +} + #else static inline @@ -236,6 +246,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { } +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_dl_task(struct rq *rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); @@ -1000,10 +1023,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) dl_rq = &rq->dl; -#ifdef CONFIG_SMP - if (dl_task(prev)) + if (need_pull_dl_task(rq, prev)) pull_dl_task(rq); -#endif if (unlikely(!dl_rq->dl_nr_running)) return NULL; @@ -1024,9 +1045,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) start_hrtick_dl(rq, p); #endif -#ifdef CONFIG_SMP - rq->post_schedule = has_pushable_dl_tasks(rq); -#endif /* CONFIG_SMP */ + set_post_schedule(rq); return p; } diff --git a/kernel/sched/idle_task.c 
b/kernel/sched/idle_task.c index 53ff9e7c76d2..1f3725882838 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -29,9 +29,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev) put_prev_task(rq, prev); schedstat_inc(rq, sched_goidle); -#ifdef CONFIG_SMP idle_enter_fair(rq); -#endif return rq->idle; } @@ -50,10 +48,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { -#ifdef CONFIG_SMP idle_exit_fair(rq); rq_last_tick_reset(rq); -#endif } static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 65c2d6881ac3..3e488ca6050d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -231,6 +231,12 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) static int pull_rt_task(struct rq *this_rq); +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + return rq->rt.highest_prio.curr > prev->prio; +} + static inline int rt_overloaded(struct rq *rq) { return atomic_read(&rq->rd->rto_count); @@ -317,6 +323,15 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } +static inline void set_post_schedule(struct rq *rq) +{ + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); +} + static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) { plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); @@ -361,6 +376,19 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { } +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_rt_task(struct rq *this_rq) +{ + return 0; +} + +static inline void 
set_post_schedule(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ static inline int on_rt_rq(struct sched_rt_entity *rt_se) @@ -1332,11 +1360,8 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; -#ifdef CONFIG_SMP - /* Try to pull RT tasks here if we lower this rq's prio */ - if (rq->rt.highest_prio.curr > prev->prio) + if (need_pull_rt_task(rq, prev)) pull_rt_task(rq); -#endif if (!rt_rq->rt_nr_running) return NULL; @@ -1352,13 +1377,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (p) dequeue_pushable_task(rq, p); -#ifdef CONFIG_SMP - /* - * We detect this state here so that we can avoid taking the RQ - * lock again later if there is no need to push - */ - rq->post_schedule = has_pushable_tasks(rq); -#endif + set_post_schedule(rq); return p; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d276147ba5e4..caf4abda45e3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1172,6 +1172,11 @@ extern void trigger_load_balance(struct rq *rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); +#else + +static inline void idle_enter_fair(struct rq *rq) { } +static inline void idle_exit_fair(struct rq *rq) { } + #endif extern void sysrq_sched_debug_show(void); -- cgit v1.2.3 From cd578abb24aa67ce468c427d3356c08ea32cf768 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 Feb 2014 16:01:16 +0100 Subject: perf/x86: Warn to early_printk() in case irq_work is too slow On Mon, Feb 10, 2014 at 08:45:16AM -0800, Dave Hansen wrote: > The reason I coded this up was that NMIs were firing off so fast that > nothing else was getting a chance to run. With this patch, at least the > printk() would come out and I'd have some idea what was going on. It will start spewing to early_printk() (which is a lot nicer to use from NMI context too) when it fails to queue the IRQ-work because its already enqueued. 
It does have the false-positive for when two CPUs trigger the warn concurrently, but that should be rare and some extra clutter on the early printk shouldn't be a problem. Cc: hpa@zytor.com Cc: tglx@linutronix.de Cc: dzickus@redhat.com Cc: Dave Hansen Cc: mingo@kernel.org Fixes: 6a02ad66b2c4 ("perf/x86: Push the duration-logging printk() to IRQ context") Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140211150116.GO27965@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 9 +++++++-- kernel/irq_work.c | 6 ++++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2067cbb378eb..45e5543e2a1e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -243,7 +243,7 @@ static void perf_duration_warn(struct irq_work *w) printk_ratelimited(KERN_WARNING "perf interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns, + avg_local_sample_len, allowed_ns >> 1, sysctl_perf_event_sample_rate); } @@ -283,7 +283,12 @@ void perf_sample_event_took(u64 sample_len_ns) update_perf_cpu_limits(); - irq_work_queue(&perf_duration_work); + if (!irq_work_queue(&perf_duration_work)) { + early_printk("perf interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, allowed_ns >> 1, + sysctl_perf_event_sample_rate); + } } static atomic64_t perf_event_id; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 55fcce6065cf..a82170e2fa78 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void) * * Can be re-enqueued while the callback is still in progress. 
*/ -void irq_work_queue(struct irq_work *work) +bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ if (!irq_work_claim(work)) - return; + return false; /* Queue the entry and raise the IPI if needed. */ preempt_disable(); @@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work) } preempt_enable(); + + return true; } EXPORT_SYMBOL_GPL(irq_work_queue); -- cgit v1.2.3 From 77177856e3bf39d435b3ae4bfd164ca3c8cd4577 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:37 +0100 Subject: sched: Init idle->on_rq in init_idle() We stumbled in RT over a SMP bringup issue on ARM where the idle->on_rq == 0 was causing try_to_wakeup() on the other cpu to run into nada land. After adding that idle->on_rq = 1; I was able to find the root cause of the lockup: the idle task on the newly woken up cpu was fiddling with a sleeping spinlock, which is a nono. I kept the init of idle->on_rq to keep the state consistent and to avoid another long lasting debug session. As a side note, the whole debug mess could have been avoided if might_sleep() would have yelled when called from the idle task. 
That's fixed with patch 2/6 - and that one actually has a changelog :) Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-2-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 49db434a35d0..06da865043ec 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4443,6 +4443,7 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; + idle->on_rq = 1; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif -- cgit v1.2.3 From db273be2a7d42f92b3471e0f717982928214a650 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:38 +0100 Subject: sched: Check for idle task in might_sleep() Idle is not allowed to call sleeping functions ever! Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-3-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 06da865043ec..a01fe6cfdb9b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6927,7 +6927,8 @@ void __might_sleep(const char *file, int line, int preempt_offset) static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + !is_idle_task(current)) || system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -- cgit v1.2.3 From 8f47b1871b8aac98f1a9d93bc3467fb97b65199a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:39 +0100 Subject: sched: Add better debug output for might_sleep() might_sleep() can tell us where interrupts have been disabled, but we have no idea what disabled preemption. Add some debug infrastructure. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-4-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a01fe6cfdb9b..c94e851dc981 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2501,8 +2501,13 @@ void __kprobes preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + if (preempt_count() == val) { + unsigned long ip = get_parent_ip(CALLER_ADDR1); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } } EXPORT_SYMBOL(preempt_count_add); @@ -2545,6 +2550,13 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -6946,6 +6958,13 @@ void 
__might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT + if (!preempt_count_equals(preempt_offset)) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); } EXPORT_SYMBOL(__might_sleep); -- cgit v1.2.3 From d6b1e9119787fd2e31dcf0f0ce90b71197604206 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:40 +0100 Subject: sched: Adjust p->sched_reset_on_fork when nothing else changes If the policy and priority remain unchanged a possible modification of p->sched_reset_on_fork gets lost in the early exit path. Signed-off-by: Thomas Gleixner [ Rebase ontop of v3.14-rc1. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-5-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c94e851dc981..771eb8762df4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3362,7 +3362,8 @@ recheck: } /* - * If not changing anything there's no need to proceed further: + * If not changing anything there's no need to proceed further, + * but store a possible modification of reset_on_fork. 
*/ if (unlikely(policy == p->policy)) { if (fair_policy(policy) && attr->sched_nice != task_nice(p)) @@ -3372,6 +3373,7 @@ recheck: if (dl_policy(policy)) goto change; + p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &flags); return 0; } -- cgit v1.2.3 From 81a44c5441d7f7d2c3dc9105f4d65ad0d5818617 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:41 +0100 Subject: sched: Queue RT tasks to head when prio drops The following scenario does not work correctly: Runqueue of CPUx contains two runnable and pinned tasks: T1: SCHED_FIFO, prio 80 T2: SCHED_FIFO, prio 80 T1 is on the cpu and executes the following syscalls (classic priority ceiling scenario): sys_sched_setscheduler(pid(T1), SCHED_FIFO, .prio = 90); ... sys_sched_setscheduler(pid(T1), SCHED_FIFO, .prio = 80); ... Now T1 gets preempted by T3 (SCHED_FIFO, prio 95). After T3 goes back to sleep the scheduler picks T2. Surprise! The same happens w/o actual preemption when T1 is forced into the scheduler due to a sporadic NEED_RESCHED event. The scheduler invokes pick_next_task() which returns T2. So T1 gets preempted and scheduled out. This happens because sched_setscheduler() dequeues T1 from the prio 90 list and then enqueues it on the tail of the prio 80 list behind T2. This violates the POSIX spec and surprises user space which relies on the guarantee that SCHED_FIFO tasks are not scheduled out unless they give the CPU up voluntarily or are preempted by a higher priority task. In the latter case the preempted task must get back on the CPU after the preempting task schedules out again. We fixed a similar issue already in commit 60db48c (sched: Queue a deboosted task to the head of the RT prio queue). The same treatment is necessary for sched_setscheduler(). So enqueue to head of the prio bucket list if the priority of the task is lowered. 
It might be possible that existing user space relies on the current behaviour, but it can be considered highly unlikely due to the corner case nature of the application scenario. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-6-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 771eb8762df4..9c2fcbf9a266 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3442,8 +3442,13 @@ change: if (running) p->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, p, 0); + if (on_rq) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + } check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); -- cgit v1.2.3 From c365c292d05908c6ea6f32708f331e21033fe71d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:42 +0100 Subject: sched: Consider pi boosting in setscheduler() If a PI boosted task policy/priority is modified by a setscheduler() call we unconditionally dequeue and requeue the task if it is on the runqueue even if the new priority is lower than the current effective boosted priority. This can result in undesired reordering of the priority bucket list. If the new priority is less or equal than the current effective we just store the new parameters in the task struct and leave the scheduler class and the runqueue untouched. This is handled when the task deboosts itself. Only if the new priority is higher than the effective boosted priority we apply the change immediately. Signed-off-by: Thomas Gleixner [ Rebase ontop of v3.14-rc1. 
] Signed-off-by: Sebastian Andrzej Siewior Cc: Dario Faggioli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-7-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 12 ++++++++++++ kernel/sched/core.c | 41 ++++++++++++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2e960a2bab81..aa4dff04b594 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -212,6 +212,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task) return task_top_pi_waiter(task)->task; } +/* + * Called by sched_setscheduler() to check whether the priority change + * is overruled by a possible priority boosting. + */ +int rt_mutex_check_prio(struct task_struct *task, int newprio) +{ + if (!task_has_pi_waiters(task)) + return 0; + + return task_top_pi_waiter(task)->task->prio <= newprio; +} + /* * Adjust the priority of a task, after its pi_waiters got modified. * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9c2fcbf9a266..003263b3b05c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2902,7 +2902,8 @@ EXPORT_SYMBOL(sleep_on_timeout); * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. */ void rt_mutex_setprio(struct task_struct *p, int prio) { @@ -3171,9 +3172,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_new = 1; } -/* Actually do priority change: must hold pi & rq lock. 
*/ -static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr) +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) { int policy = attr->sched_policy; @@ -3193,9 +3193,14 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, * getparam()/getattr() don't report silly values for !rt tasks. */ p->rt_priority = attr->sched_priority; + set_load_weight(p); +} - p->normal_prio = normal_prio(p); - p->prio = rt_mutex_getprio(p); +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, + const struct sched_attr *attr) +{ + __setscheduler_params(p, attr); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -3203,8 +3208,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; - - set_load_weight(p); } static void @@ -3257,6 +3260,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) { + int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; int policy = attr->sched_policy; unsigned long flags; @@ -3427,6 +3431,24 @@ change: return -EBUSY; } + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + /* + * Special case for priority boosted tasks. + * + * If the new priority is lower or equal (user space view) + * than the current (boosted) priority, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. 
+ */ + if (rt_mutex_check_prio(p, newprio)) { + __setscheduler_params(p, attr); + task_rq_unlock(rq, p, &flags); + return 0; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -3434,9 +3456,6 @@ change: if (running) p->sched_class->put_prev_task(rq, p); - p->sched_reset_on_fork = reset_on_fork; - - oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, attr); -- cgit v1.2.3 From d82fd25356b902703152c1800845661835541878 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 8 Feb 2014 14:17:26 +0800 Subject: sched/rt: Remove 'leaf_rt_rq_list' from 'struct rq' This is a leftover from commit e23ee74777f389369431d77390c4b09332ce026a ("sched/rt: Simplify pull_rt_task() logic and remove .leaf_rt_rq_list"). Signed-off-by: Li Zefan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/52F5CBF6.4060901@huawei.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - kernel/sched/sched.h | 4 ---- 2 files changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 003263b3b05c..cc4965e969b1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6876,7 +6876,6 @@ void __init sched_init(void) rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index caf4abda45e3..d608125b36ef 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -543,10 +543,6 @@ struct rq { struct list_head leaf_cfs_rq_list; #endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_RT_GROUP_SCHED - struct list_head leaf_rt_rq_list; -#endif - /* * This is part of a global counter where only the total sum * over all CPUs matters. 
A task can increase this counter on -- cgit v1.2.3 From 11c785b79ef2a669e4bf7be5cf2c3904b8fed015 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 8 Feb 2014 14:17:45 +0800 Subject: sched/rt: Make init_sched_rt_class() __init It's a bootstrap function. Signed-off-by: Li Zefan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/52F5CC09.1080502@huawei.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3e488ca6050d..4d4b386598aa 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1849,7 +1849,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } -void init_sched_rt_class(void) +void __init init_sched_rt_class(void) { unsigned int i; -- cgit v1.2.3 From d277d868dab6537a85f4757e39648b1d6afc60d5 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Feb 2014 15:34:49 +0800 Subject: rcu: Use MAX_NICE to replace hardcoding of 19 Reviewed-by: Josh Triplett Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Cc: "Paul E.
McKenney" Link: http://lkml.kernel.org/r/5b3bf232f41b33ab703a1595e94671b303e2d1fc.1392103744.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/rcu/torture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c index 732f8ae3086a..219761db1a46 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/torture.c @@ -805,7 +805,7 @@ rcu_torture_writer(void *arg) static DEFINE_RCU_RANDOM(rand); VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); do { schedule_timeout_uninterruptible(1); @@ -871,7 +871,7 @@ rcu_torture_fakewriter(void *arg) DEFINE_RCU_RANDOM(rand); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); do { schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); @@ -987,7 +987,7 @@ rcu_torture_reader(void *arg) unsigned long long ts; VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) setup_timer_on_stack(&t, rcu_torture_timer, 0); @@ -1584,7 +1584,7 @@ static int rcu_torture_barrier_cbs(void *arg) init_rcu_head_on_stack(&rcu); VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); do { wait_event(barrier_cbs_wq[myid], (newphase = -- cgit v1.2.3 From 75e45d512f257beedae0d8a67d053cde5537bd4c Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Feb 2014 15:34:50 +0800 Subject: sched: Replace hardcoding of -20 and 19 with MIN_NICE and MAX_NICE Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/bd80780f19b4f9b4a765acc353c8dbc130274dd6.1392103744.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- 
kernel/sched/auto_group.c | 2 +- kernel/sched/core.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 4a073539c58e..e73efba98301 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) struct autogroup *ag; int err; - if (nice < -20 || nice > 19) + if (nice < MIN_NICE || nice > MAX_NICE) return -EINVAL; err = security_task_setnice(current, nice); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cc4965e969b1..a8a73b8897bf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2993,7 +2993,7 @@ void set_user_nice(struct task_struct *p, long nice) unsigned long flags; struct rq *rq; - if (task_nice(p) == nice || nice < -20 || nice > 19) + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) return; /* * We have to be careful, if called from sys_setpriority(), @@ -3072,10 +3072,10 @@ SYSCALL_DEFINE1(nice, int, increment) increment = 40; nice = task_nice(current) + increment; - if (nice < -20) - nice = -20; - if (nice > 19) - nice = 19; + if (nice < MIN_NICE) + nice = MIN_NICE; + if (nice > MAX_NICE) + nice = MAX_NICE; if (increment < 0 && !can_nice(current, nice)) return -EPERM; @@ -3623,7 +3623,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, * XXX: do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? 
*/ - attr->sched_nice = clamp(attr->sched_nice, -20, 19); + attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); out: return ret; -- cgit v1.2.3 From c4a4d2f43177f6165132c6d36a4d46963018f726 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Feb 2014 15:34:51 +0800 Subject: sys: Replace hardcoding of -20 and 19 with MIN_NICE and MAX_NICE Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Oleg Nesterov Cc: Robin Holt Cc: Al Viro Cc: Kees Cook Cc: "Eric W. Biederman" Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/0261f094b836f1acbcdf52e7166487c0c77323c8.1392103744.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sys.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index c0a58be780a4..adaeab6f7a87 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -174,10 +174,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) /* normalize: avoid signed division (rounding problems) */ error = -ESRCH; - if (niceval < -20) - niceval = -20; - if (niceval > 19) - niceval = 19; + if (niceval < MIN_NICE) + niceval = MIN_NICE; + if (niceval > MAX_NICE) + niceval = MAX_NICE; rcu_read_lock(); read_lock(&tasklist_lock); -- cgit v1.2.3 From 144818422bfa272531250473b888394c21e6fe19 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Feb 2014 15:34:52 +0800 Subject: workqueue: Replace hardcoding of -20 and 19 with MIN_NICE and MAX_NICE Signed-off-by: Dongsheng Yang Acked-by: Tejun Heo Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/6d85138180c00ce86975addab6e34b24b84f00a5.1392103744.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 193e977a10ea..3fa5b8f3aae3 100644 --- 
a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3225,7 +3225,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, return -ENOMEM; if (sscanf(buf, "%d", &attrs->nice) == 1 && - attrs->nice >= -20 && attrs->nice <= 19) + attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) ret = apply_workqueue_attrs(wq, attrs); else ret = -EINVAL; -- cgit v1.2.3 From de91b9cb97fe68cb6ef0cfe9bee09d015c152af8 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 18 Feb 2014 14:14:24 +0000 Subject: sched: Fix select_task_rq_fair() description comments Brings select_task_rq_fair() description comments up-to-date. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392732864-10927-1-git-send-email-morten.rasmussen@arm.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e884e45982af..7982faf7223b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4397,13 +4397,14 @@ done: } /* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * - * Balance, ie. select the least loaded group. + * Balances load by selecting the idlest cpu in the idlest group, or under + * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. * - * Returns the target CPU number, or the same CPU if no balancing is needed. + * Returns the target cpu number. * * preempt must be disabled. 
*/ -- cgit v1.2.3 From d987fc7f3228bf94cb6b21313ebab1d64ee637ad Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 5 Dec 2011 10:01:47 +0100 Subject: sched, nohz: Exclude isolated cores from load balancing The user explicitly disabled load balancing, else this core would not be disconnected. Don't add these to nohz.idle_cpus_mask. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Lei Wen Link: http://lkml.kernel.org/n/tip-vmme4f49psirp966pklm5l9j@git.kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7982faf7223b..a3a41c61a2c9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6788,6 +6788,11 @@ out_unlock: return 0; } +static inline int on_null_domain(struct rq *rq) +{ + return unlikely(!rcu_dereference_sched(rq->sd)); +} + #ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details @@ -6842,8 +6847,13 @@ static void nohz_balancer_kick(void) static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); + /* + * Completely isolated CPUs are never set in idle_cpus_mask, so we must test. + */ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -6897,6 +6907,12 @@ void nohz_balance_enter_idle(int cpu) if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; + /* + * If we're a completely isolated CPU, we don't play. 
+ */ + if (on_null_domain(cpu_rq(cpu))) + return; + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); @@ -7159,11 +7175,6 @@ static void run_rebalance_domains(struct softirq_action *h) nohz_idle_balance(this_rq, idle); } -static inline int on_null_domain(struct rq *rq) -{ - return !rcu_dereference_sched(rq->sd); -} - /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -- cgit v1.2.3 From 806274c018e9858320a27b785df761f45c33a56c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Feb 2014 13:13:05 -0800 Subject: rcutorture: Fix checkpatch complaint This commit does a code-style cleanup so that the first curly brace of an initializer does not appear at the beginning of a line. Signed-off-by: Paul E. McKenney --- kernel/rcu/torture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c index 732f8ae3086a..dad67238d086 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/torture.c @@ -170,10 +170,10 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = - { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = - { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_count) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_batch) = { 0 }; static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; -- cgit v1.2.3 From 51b1130eb5823ddb90a9ad07d243031d8cb7ecf2 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 27 Jan 2014 11:49:39 -0800 Subject: rcutorture: Abstract rcu_torture_random() Because rcu_torture_random() will be used by the locking equivalent to rcutorture, pull it out into its own module. This new module cannot be separately configured, instead, use the Kconfig "select" statement from the Kconfig options of tests depending on it. Suggested-by: Rusty Russell Signed-off-by: Paul E. McKenney --- kernel/Makefile | 1 + kernel/rcu/Makefile | 2 +- kernel/rcu/rcutorture.c | 2125 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/torture.c | 2148 ----------------------------------------------- kernel/torture.c | 71 ++ 5 files changed, 2198 insertions(+), 2149 deletions(-) create mode 100644 kernel/rcu/rcutorture.c delete mode 100644 kernel/rcu/torture.c create mode 100644 kernel/torture.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index bc010ee272b6..5c0e7666811d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -93,6 +93,7 @@ obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o +obj-$(CONFIG_TORTURE_TEST) += torture.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 01e9ec37a3e3..807ccfbf69b3 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,5 +1,5 @@ obj-y += update.o srcu.o -obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c new file mode 100644 index 000000000000..94b1cd8b214c --- /dev/null +++ b/kernel/rcu/rcutorture.c @@ -0,0 +1,2125 @@ +/* + * Read-Copy Update module-based torture test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU 
General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005, 2006 + * + * Authors: Paul E. McKenney + * Josh Triplett + * + * See also: Documentation/RCU/torture.txt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); + +MODULE_ALIAS("rcutorture"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcutorture." 
+ +static int fqs_duration; +module_param(fqs_duration, int, 0444); +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); +static int fqs_holdoff; +module_param(fqs_holdoff, int, 0444); +MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); +static int fqs_stutter = 3; +module_param(fqs_stutter, int, 0444); +MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +static bool gp_exp; +module_param(gp_exp, bool, 0444); +MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); +static bool gp_normal; +module_param(gp_normal, bool, 0444); +MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); +static int irqreader = 1; +module_param(irqreader, int, 0444); +MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +static int n_barrier_cbs; +module_param(n_barrier_cbs, int, 0444); +MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); +static int nfakewriters = 4; +module_param(nfakewriters, int, 0444); +MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); +static int nreaders = -1; +module_param(nreaders, int, 0444); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +static int object_debug; +module_param(object_debug, int, 0444); +MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); +static int onoff_holdoff; +module_param(onoff_holdoff, int, 0444); +MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); +static int onoff_interval; +module_param(onoff_interval, int, 0444); +MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +static int shuffle_interval = 3; +module_param(shuffle_interval, int, 0444); +MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); +static int shutdown_secs; +module_param(shutdown_secs, int, 0444); +MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); +static int stall_cpu; 
+module_param(stall_cpu, int, 0444); +MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); +static int stall_cpu_holdoff = 10; +module_param(stall_cpu_holdoff, int, 0444); +MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); +static int stat_interval = 60; +module_param(stat_interval, int, 0644); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +static int stutter = 5; +module_param(stutter, int, 0444); +MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); +static int test_boost = 1; +module_param(test_boost, int, 0444); +MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +static int test_boost_duration = 4; +module_param(test_boost_duration, int, 0444); +MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); +static int test_boost_interval = 7; +module_param(test_boost_interval, int, 0444); +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +static bool test_no_idle_hz = true; +module_param(test_no_idle_hz, bool, 0444); +MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +static char *torture_type = "rcu"; +module_param(torture_type, charp, 0444); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); +static bool verbose; +module_param(verbose, bool, 0444); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); + +#define TORTURE_FLAG "-torture:" +#define PRINTK_STRING(s) \ + do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) +#define VERBOSE_PRINTK_STRING(s) \ + do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) +#define VERBOSE_PRINTK_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! 
" s "\n", torture_type); } while (0) + +static int nrealreaders; +static struct task_struct *writer_task; +static struct task_struct **fakewriter_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *stats_task; +static struct task_struct *shuffler_task; +static struct task_struct *stutter_task; +static struct task_struct *fqs_task; +static struct task_struct *boost_tasks[NR_CPUS]; +static struct task_struct *shutdown_task; +#ifdef CONFIG_HOTPLUG_CPU +static struct task_struct *onoff_task; +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static struct task_struct *stall_task; +static struct task_struct **barrier_cbs_tasks; +static struct task_struct *barrier_task; + +#define RCU_TORTURE_PIPE_LEN 10 + +struct rcu_torture { + struct rcu_head rtort_rcu; + int rtort_pipe_count; + struct list_head rtort_free; + int rtort_mbtest; +}; + +static LIST_HEAD(rcu_torture_freelist); +static struct rcu_torture __rcu *rcu_torture_current; +static unsigned long rcu_torture_current_version; +static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; +static DEFINE_SPINLOCK(rcu_torture_lock); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_count) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_batch) = { 0 }; +static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +static atomic_t n_rcu_torture_alloc; +static atomic_t n_rcu_torture_alloc_fail; +static atomic_t n_rcu_torture_free; +static atomic_t n_rcu_torture_mberror; +static atomic_t n_rcu_torture_error; +static long n_rcu_torture_barrier_error; +static long n_rcu_torture_boost_ktrerror; +static long n_rcu_torture_boost_rterror; +static long n_rcu_torture_boost_failure; +static long n_rcu_torture_boosts; +static long n_rcu_torture_timers; +static long n_offline_attempts; +static long n_offline_successes; +static unsigned long sum_offline; +static int min_offline = -1; +static int max_offline; +static long n_online_attempts; +static long 
n_online_successes; +static unsigned long sum_online; +static int min_online = -1; +static int max_online; +static long n_barrier_attempts; +static long n_barrier_successes; +static struct list_head rcu_torture_removed; +static cpumask_var_t shuffle_tmp_mask; + +static int stutter_pause_test; + +#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) +#define RCUTORTURE_RUNNABLE_INIT 1 +#else +#define RCUTORTURE_RUNNABLE_INIT 0 +#endif +int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; +module_param(rcutorture_runnable, int, 0444); +MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); + +#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) +#define rcu_can_boost() 1 +#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#define rcu_can_boost() 0 +#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ + +#ifdef CONFIG_RCU_TRACE +static u64 notrace rcu_trace_clock_local(void) +{ + u64 ts = trace_clock_local(); + unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + return ts; +} +#else /* #ifdef CONFIG_RCU_TRACE */ +static u64 notrace rcu_trace_clock_local(void) +{ + return 0ULL; +} +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +static unsigned long shutdown_time; /* jiffies to system shutdown. */ +static unsigned long boost_starttime; /* jiffies of next boost test start. */ +DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ + /* and boost task create/destroy. */ +static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ +static bool barrier_phase; /* Test phase. */ +static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ +static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ +static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); + +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ + +#define FULLSTOP_DONTSTOP 0 /* Normal operation. 
*/ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ +static int fullstop = FULLSTOP_RMMOD; +/* + * Protect fullstop transitions and spawning of kthreads. + */ +static DEFINE_MUTEX(fullstop_mutex); + +/* Forward reference. */ +static void rcu_torture_cleanup(void); + +/* + * Detect and respond to a system shutdown. + */ +static int +rcutorture_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + mutex_lock(&fullstop_mutex); + if (fullstop == FULLSTOP_DONTSTOP) + fullstop = FULLSTOP_SHUTDOWN; + else + pr_warn(/* but going down anyway, so... */ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + return NOTIFY_DONE; +} + +/* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + */ +static void rcutorture_shutdown_absorb(const char *title) +{ + if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + pr_notice( + "rcutorture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} + +/* + * Allocate an element from the rcu_tortures pool. + */ +static struct rcu_torture * +rcu_torture_alloc(void) +{ + struct list_head *p; + + spin_lock_bh(&rcu_torture_lock); + if (list_empty(&rcu_torture_freelist)) { + atomic_inc(&n_rcu_torture_alloc_fail); + spin_unlock_bh(&rcu_torture_lock); + return NULL; + } + atomic_inc(&n_rcu_torture_alloc); + p = rcu_torture_freelist.next; + list_del_init(p); + spin_unlock_bh(&rcu_torture_lock); + return container_of(p, struct rcu_torture, rtort_free); +} + +/* + * Free an element to the rcu_tortures pool. 
+ */ +static void +rcu_torture_free(struct rcu_torture *p) +{ + atomic_inc(&n_rcu_torture_free); + spin_lock_bh(&rcu_torture_lock); + list_add_tail(&p->rtort_free, &rcu_torture_freelist); + spin_unlock_bh(&rcu_torture_lock); +} + +static void +rcu_stutter_wait(const char *title) +{ + while (stutter_pause_test || !rcutorture_runnable) { + if (rcutorture_runnable) + schedule_timeout_interruptible(1); + else + schedule_timeout_interruptible(round_jiffies_relative(HZ)); + rcutorture_shutdown_absorb(title); + } +} + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_torture_ops { + void (*init)(void); + int (*readlock)(void); + void (*read_delay)(struct torture_random_state *rrsp); + void (*readunlock)(int idx); + int (*completed)(void); + void (*deferred_free)(struct rcu_torture *p); + void (*sync)(void); + void (*exp_sync)(void); + void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); + void (*cb_barrier)(void); + void (*fqs)(void); + void (*stats)(char *page); + int irq_capable; + int can_boost; + const char *name; +}; + +static struct rcu_torture_ops *cur_ops; + +/* + * Definitions for rcu torture testing. + */ + +static int rcu_torture_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_read_delay(struct torture_random_state *rrsp) +{ + const unsigned long shortdelay_us = 200; + const unsigned long longdelay_ms = 50; + + /* We want a short delay sometimes to make a reader delay the grace + * period, and we want a long delay occasionally to trigger + * force_quiescent_state. 
*/ + + if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!preempt_count() && + !(torture_random(rrsp) % (nrealreaders * 20000))) + preempt_schedule(); /* No QS if preempt_disable() in effect */ +#endif +} + +static void rcu_torture_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static int rcu_torture_completed(void) +{ + return rcu_batches_completed(); +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop != FULLSTOP_DONTSTOP) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces. */ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + rcu_torture_free(rp); + } else { + cur_ops->deferred_free(rp); + } +} + +static int rcu_no_completed(void) +{ + return 0; +} + +static void rcu_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu(&p->rtort_rcu, rcu_torture_cb); +} + +static void rcu_sync_torture_init(void) +{ + INIT_LIST_HEAD(&rcu_torture_removed); +} + +static struct rcu_torture_ops rcu_ops = { + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferred_free = rcu_torture_deferred_free, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .call = call_rcu, + .cb_barrier = rcu_barrier, + .fqs = rcu_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .can_boost = rcu_can_boost(), + .name = "rcu" +}; + +/* + * Definitions for rcu_bh torture testing. 
+ */ + +static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) +{ + rcu_read_unlock_bh(); +} + +static int rcu_bh_torture_completed(void) +{ + return rcu_batches_completed_bh(); +} + +static void rcu_bh_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops rcu_bh_ops = { + .init = rcu_sync_torture_init, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_bh_torture_deferred_free, + .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, + .call = call_rcu_bh, + .cb_barrier = rcu_barrier_bh, + .fqs = rcu_bh_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh" +}; + +/* + * Definitions for srcu torture testing. + */ + +DEFINE_STATIC_SRCU(srcu_ctl); + +static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) +{ + return srcu_read_lock(&srcu_ctl); +} + +static void srcu_read_delay(struct torture_random_state *rrsp) +{ + long delay; + const long uspertick = 1000000 / HZ; + const long longdelay = 10; + + /* We want there to be long-running readers, but not all the time. 
*/ + + delay = torture_random(rrsp) % + (nrealreaders * 2 * longdelay * uspertick); + if (!delay) + schedule_timeout_interruptible(longdelay); + else + rcu_read_delay(rrsp); +} + +static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) +{ + srcu_read_unlock(&srcu_ctl, idx); +} + +static int srcu_torture_completed(void) +{ + return srcu_batches_completed(&srcu_ctl); +} + +static void srcu_torture_deferred_free(struct rcu_torture *rp) +{ + call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); +} + +static void srcu_torture_synchronize(void) +{ + synchronize_srcu(&srcu_ctl); +} + +static void srcu_torture_call(struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + call_srcu(&srcu_ctl, head, func); +} + +static void srcu_torture_barrier(void) +{ + srcu_barrier(&srcu_ctl); +} + +static void srcu_torture_stats(char *page) +{ + int cpu; + int idx = srcu_ctl.completed & 0x1; + + page += sprintf(page, "%s%s per-CPU(idx=%d):", + torture_type, TORTURE_FLAG, idx); + for_each_possible_cpu(cpu) { + page += sprintf(page, " %d(%lu,%lu)", cpu, + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); + } + sprintf(page, "\n"); +} + +static void srcu_torture_synchronize_expedited(void) +{ + synchronize_srcu_expedited(&srcu_ctl); +} + +static struct rcu_torture_ops srcu_ops = { + .init = rcu_sync_torture_init, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = srcu_torture_deferred_free, + .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, + .stats = srcu_torture_stats, + .name = "srcu" +}; + +/* + * Definitions for sched torture testing. 
+ */ + +static int sched_torture_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_torture_read_unlock(int idx) +{ + preempt_enable(); +} + +static void rcu_sched_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops sched_ops = { + .init = rcu_sync_torture_init, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_sched_torture_deferred_free, + .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .call = call_rcu_sched, + .cb_barrier = rcu_barrier_sched, + .fqs = rcu_sched_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "sched" +}; + +/* + * RCU torture priority-boost testing. Runs one real-time thread per + * CPU for moderate bursts, repeatedly registering RCU callbacks and + * spinning waiting for them to be invoked. If a given callback takes + * too long to be invoked, we assume that priority inversion has occurred. + */ + +struct rcu_boost_inflight { + struct rcu_head rcu; + int inflight; +}; + +static void rcu_torture_boost_cb(struct rcu_head *head) +{ + struct rcu_boost_inflight *rbip = + container_of(head, struct rcu_boost_inflight, rcu); + + smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ + rbip->inflight = 0; +} + +static int rcu_torture_boost(void *arg) +{ + unsigned long call_rcu_time; + unsigned long endtime; + unsigned long oldstarttime; + struct rcu_boost_inflight rbi = { .inflight = 0 }; + struct sched_param sp; + + VERBOSE_PRINTK_STRING("rcu_torture_boost started"); + + /* Set real-time priority. 
*/ + sp.sched_priority = 1; + if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { + VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); + n_rcu_torture_boost_rterror++; + } + + init_rcu_head_on_stack(&rbi.rcu); + /* Each pass through the following loop does one boost-test cycle. */ + do { + /* Wait for the next test interval. */ + oldstarttime = boost_starttime; + while (ULONG_CMP_LT(jiffies, oldstarttime)) { + schedule_timeout_interruptible(oldstarttime - jiffies); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* Do one boost-test interval. */ + endtime = oldstarttime + test_boost_duration * HZ; + call_rcu_time = jiffies; + while (ULONG_CMP_LT(jiffies, endtime)) { + /* If we don't have a callback in flight, post one. */ + if (!rbi.inflight) { + smp_mb(); /* RCU core before ->inflight = 1. */ + rbi.inflight = 1; + call_rcu(&rbi.rcu, rcu_torture_boost_cb); + if (jiffies - call_rcu_time > + test_boost_duration * HZ - HZ / 2) { + VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); + n_rcu_torture_boost_failure++; + } + call_rcu_time = jiffies; + } + cond_resched(); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* + * Set the start time of the next test interval. + * Yes, this is vulnerable to long delays, but such + * delays simply cause a false negative for the next + * interval. Besides, we are running at RT priority, + * so delays should be relatively rare. + */ + while (oldstarttime == boost_starttime && + !kthread_should_stop()) { + if (mutex_trylock(&boost_mutex)) { + boost_starttime = jiffies + + test_boost_interval * HZ; + n_rcu_torture_boosts++; + mutex_unlock(&boost_mutex); + break; + } + schedule_timeout_uninterruptible(1); + } + + /* Go do the stutter. 
*/ +checkwait: rcu_stutter_wait("rcu_torture_boost"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + + /* Clean up and exit. */ + VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + rcutorture_shutdown_absorb("rcu_torture_boost"); + while (!kthread_should_stop() || rbi.inflight) + schedule_timeout_uninterruptible(1); + smp_mb(); /* order accesses to ->inflight before stack-frame death. */ + destroy_rcu_head_on_stack(&rbi.rcu); + return 0; +} + +/* + * RCU torture force-quiescent-state kthread. Repeatedly induces + * bursts of calls to force_quiescent_state(), increasing the probability + * of occurrence of some important types of race conditions. + */ +static int +rcu_torture_fqs(void *arg) +{ + unsigned long fqs_resume_time; + int fqs_burst_remaining; + + VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + do { + fqs_resume_time = jiffies + fqs_stutter * HZ; + while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + !kthread_should_stop()) { + schedule_timeout_interruptible(1); + } + fqs_burst_remaining = fqs_duration; + while (fqs_burst_remaining > 0 && + !kthread_should_stop()) { + cur_ops->fqs(); + udelay(fqs_holdoff); + fqs_burst_remaining -= fqs_holdoff; + } + rcu_stutter_wait("rcu_torture_fqs"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fqs"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). 
+ */ +static int +rcu_torture_writer(void *arg) +{ + bool exp; + int i; + struct rcu_torture *rp; + struct rcu_torture *rp1; + struct rcu_torture *old_rp; + static DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1); + rp = rcu_torture_alloc(); + if (rp == NULL) + continue; + rp->rtort_pipe_count = 0; + udelay(torture_random(&rand) & 0x3ff); + old_rp = rcu_dereference_check(rcu_torture_current, + current == writer_task); + rp->rtort_mbtest = 1; + rcu_assign_pointer(rcu_torture_current, rp); + smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ + if (old_rp) { + i = old_rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + old_rp->rtort_pipe_count++; + if (gp_normal == gp_exp) + exp = !!(torture_random(&rand) & 0x80); + else + exp = gp_exp; + if (!exp) { + cur_ops->deferred_free(old_rp); + } else { + cur_ops->exp_sync(); + list_add(&old_rp->rtort_free, + &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, + &rcu_torture_removed, + rtort_free) { + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= + RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } + } + } + rcutorture_record_progress(++rcu_torture_current_version); + rcu_stutter_wait("rcu_torture_writer"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); + rcutorture_shutdown_absorb("rcu_torture_writer"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture fake writer kthread. Repeatedly calls sync, with a random + * delay between calls. 
+ */
+static int
+rcu_torture_fakewriter(void *arg)
+{
+	int use_normal;
+	DEFINE_TORTURE_RANDOM(rand);
+
+	VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
+	set_user_nice(current, 19);
+
+	do {
+		schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
+		udelay(torture_random(&rand) & 0x3ff);
+		if (cur_ops->cb_barrier != NULL &&
+		    torture_random(&rand) % (nfakewriters * 8) == 0) {
+			/* Occasionally exercise the callback barrier instead. */
+			cur_ops->cb_barrier();
+		} else {
+			/*
+			 * Choose the grace-period flavor: honor gp_normal
+			 * when the two settings differ, otherwise pick at
+			 * random.
+			 */
+			if (gp_normal != gp_exp)
+				use_normal = !!gp_normal;
+			else
+				use_normal = !!(torture_random(&rand) & 0x80);
+
+			if (use_normal)
+				cur_ops->sync();
+			else
+				cur_ops->exp_sync();
+		}
+		rcu_stutter_wait("rcu_torture_fakewriter");
+	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+
+	VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
+	rcutorture_shutdown_absorb("rcu_torture_fakewriter");
+	/* Park here until kthread_stop() is called on this task. */
+	while (!kthread_should_stop())
+		schedule_timeout_uninterruptible(1);
+	return 0;
+}
+
+/* Dump the ftrace buffer, but only on the first call that gets here. */
+void rcutorture_trace_dump(void)
+{
+	static atomic_t beenhere = ATOMIC_INIT(0);
+
+	/* Plain read first; the xchg then lets exactly one caller through. */
+	if (!atomic_read(&beenhere) && atomic_xchg(&beenhere, 1) == 0)
+		ftrace_dump(DUMP_ALL);
+}
+
+/*
+ * RCU torture reader from timer handler. Dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array. The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static void rcu_torture_timer(unsigned long unused)
+{
+	static DEFINE_TORTURE_RANDOM(rand);
+	static DEFINE_SPINLOCK(rand_lock);
+	struct rcu_torture *p;
+	unsigned long long ts;
+	int idx;
+	int gp_start;
+	int gp_end;
+	int gp_delta;
+	int pipe_count;
+
+	idx = cur_ops->readlock();
+	gp_start = cur_ops->completed();
+	ts = rcu_trace_clock_local();
+	p = rcu_dereference_check(rcu_torture_current,
+				  rcu_read_lock_bh_held() ||
+				  rcu_read_lock_sched_held() ||
+				  srcu_read_lock_held(&srcu_ctl));
+	if (!p)
+		goto unlock; /* rcu_torture_writer is not yet underway */
+
+	if (p->rtort_mbtest == 0)
+		atomic_inc(&n_rcu_torture_mberror);
+
+	/* rand and the timer count are static here, hence the lock. */
+	spin_lock(&rand_lock);
+	cur_ops->read_delay(&rand);
+	n_rcu_torture_timers++;
+	spin_unlock(&rand_lock);
+
+	preempt_disable();
+	pipe_count = p->rtort_pipe_count;
+	if (pipe_count > RCU_TORTURE_PIPE_LEN)
+		pipe_count = RCU_TORTURE_PIPE_LEN; /* Should not happen, but... */
+	gp_end = cur_ops->completed();
+	if (pipe_count > 1) {
+		do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
+					  gp_start, gp_end);
+		rcutorture_trace_dump();
+	}
+	__this_cpu_inc(rcu_torture_count[pipe_count]);
+
+	gp_delta = gp_end - gp_start;
+	if (gp_delta > RCU_TORTURE_PIPE_LEN)
+		gp_delta = RCU_TORTURE_PIPE_LEN; /* Should not happen, but... */
+	__this_cpu_inc(rcu_torture_batch[gp_delta]);
+	preempt_enable();
+
+unlock:
+	cur_ops->readunlock(idx);
+}
+
+/*
+ * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array. The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static int
+rcu_torture_reader(void *arg)
+{
+	int completed;
+	int completed_end;
+	int idx;
+	DEFINE_TORTURE_RANDOM(rand);
+	struct rcu_torture *p;
+	int pipe_count;
+	struct timer_list t;
+	unsigned long long ts;
+
+	VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
+	set_user_nice(current, 19);
+	/* The on-stack timer runs rcu_torture_timer() as an extra reader. */
+	if (irqreader && cur_ops->irq_capable)
+		setup_timer_on_stack(&t, rcu_torture_timer, 0);
+
+	do {
+		/* Keep the timer-based reader armed for the next jiffy. */
+		if (irqreader && cur_ops->irq_capable) {
+			if (!timer_pending(&t))
+				mod_timer(&t, jiffies + 1);
+		}
+		idx = cur_ops->readlock();
+		completed = cur_ops->completed();
+		ts = rcu_trace_clock_local();
+		p = rcu_dereference_check(rcu_torture_current,
+					  rcu_read_lock_bh_held() ||
+					  rcu_read_lock_sched_held() ||
+					  srcu_read_lock_held(&srcu_ctl));
+		if (p == NULL) {
+			/* Wait for rcu_torture_writer to get underway */
+			cur_ops->readunlock(idx);
+			schedule_timeout_interruptible(HZ);
+			continue;
+		}
+		if (p->rtort_mbtest == 0)
+			atomic_inc(&n_rcu_torture_mberror);
+		cur_ops->read_delay(&rand);
+		preempt_disable();
+		pipe_count = p->rtort_pipe_count;
+		if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+			/* Should not happen, but... */
+			pipe_count = RCU_TORTURE_PIPE_LEN;
+		}
+		completed_end = cur_ops->completed();
+		if (pipe_count > 1) {
+			do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
+						  ts, completed, completed_end);
+			rcutorture_trace_dump();
+		}
+		__this_cpu_inc(rcu_torture_count[pipe_count]);
+		/* From here on, completed holds the ->completed() delta. */
+		completed = completed_end - completed;
+		if (completed > RCU_TORTURE_PIPE_LEN) {
+			/* Should not happen, but... */
+			completed = RCU_TORTURE_PIPE_LEN;
+		}
+		__this_cpu_inc(rcu_torture_batch[completed]);
+		preempt_enable();
+		cur_ops->readunlock(idx);
+		schedule();
+		rcu_stutter_wait("rcu_torture_reader");
+	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
+	rcutorture_shutdown_absorb("rcu_torture_reader");
+	if (irqreader && cur_ops->irq_capable) {
+		del_timer_sync(&t);
+		/* Pairs with setup_timer_on_stack() above. */
+		destroy_timer_on_stack(&t);
+	}
+	/* Park here until kthread_stop() is called on this task. */
+	while (!kthread_should_stop())
+		schedule_timeout_uninterruptible(1);
+	return 0;
+}
+
+/*
+ * Create an RCU-torture statistics message in the specified buffer.
+ * The text is written with unbounded sprintf() calls, so the caller must
+ * supply a buffer large enough for the whole message.
+ */
+static void
+rcu_torture_printk(char *page)
+{
+	int cpu;
+	int i;
+	long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+	long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+
+	/* Fold the per-CPU reader counters into one summary per bucket. */
+	for_each_possible_cpu(cpu) {
+		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+			pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
+			batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
+		}
+	}
+	/* Leave i at the highest scanned bucket with a nonzero count. */
+	for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+		if (pipesummary[i] != 0)
+			break;
+	}
+	page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
+	page += sprintf(page,
+		       "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
+		       rcu_torture_current,
+		       rcu_torture_current_version,
+		       list_empty(&rcu_torture_freelist),
+		       atomic_read(&n_rcu_torture_alloc),
+		       atomic_read(&n_rcu_torture_alloc_fail),
+		       atomic_read(&n_rcu_torture_free));
+	page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ",
+		       atomic_read(&n_rcu_torture_mberror),
+		       n_rcu_torture_boost_ktrerror,
+		       n_rcu_torture_boost_rterror);
+	page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ",
+		       n_rcu_torture_boost_failure,
+		       n_rcu_torture_boosts,
+		       n_rcu_torture_timers);
+	page += sprintf(page,
+		       "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
+		       n_online_successes, n_online_attempts,
+		       n_offline_successes, n_offline_attempts,
+		       min_online, max_online,
+		       min_offline, max_offline,
+		       sum_online, sum_offline, HZ);
+	page += sprintf(page, "barrier: %ld/%ld:%ld",
+		       n_barrier_successes,
+		       n_barrier_attempts,
+		       n_rcu_torture_barrier_error);
+	page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+	/* Any error counter, or readers seen beyond bucket 1, flags failure. */
+	if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+	    n_rcu_torture_barrier_error != 0 ||
+	    n_rcu_torture_boost_ktrerror != 0 ||
+	    n_rcu_torture_boost_rterror != 0 ||
+	    n_rcu_torture_boost_failure != 0 ||
+	    i > 1) {
+		page += sprintf(page, "!!! ");
+		atomic_inc(&n_rcu_torture_error);
+		WARN_ON_ONCE(1);
+	}
+	page += sprintf(page, "Reader Pipe: ");
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+		page += sprintf(page, " %ld", pipesummary[i]);
+	page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+	page += sprintf(page, "Reader Batch: ");
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+		page += sprintf(page, " %ld", batchsummary[i]);
+	page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+	page += sprintf(page, "Free-Block Circulation: ");
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+		page += sprintf(page, " %d",
+			       atomic_read(&rcu_torture_wcount[i]));
+	}
+	page += sprintf(page, "\n");
+	/* Let the flavor under test append its own statistics, if any. */
+	if (cur_ops->stats)
+		cur_ops->stats(page);
+}
+
+/*
+ * Print torture statistics. Caller must ensure that there is only
+ * one call to this function at a given time!!! This is normally
+ * accomplished by relying on the module system to only have one copy
+ * of the module loaded, and then by giving the rcu_torture_stats
+ * kthread full control (or the init/cleanup functions when rcu_torture_stats
+ * thread is not running).
+ */
+static void
+rcu_torture_stats_print(void)
+{
+	/* Buffer size scales with nr_cpu_ids on top of a fixed 8192 bytes. */
+	int size = nr_cpu_ids * 200 + 8192;
+	char *buf;
+
+	buf = kmalloc(size, GFP_KERNEL);
+	if (!buf) {
+		/* Terminate the line so it is not merged with later output. */
+		pr_err("rcu-torture: Out of memory, need: %d\n", size);
+		return;
+	}
+	rcu_torture_printk(buf);
+	pr_alert("%s", buf);
+	kfree(buf);
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ *
+ * No need to worry about fullstop here, since this one doesn't reference
+ * volatile state or register callbacks.
+ */
+static int
+rcu_torture_stats(void *arg)
+{
+	VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
+	for (;;) {
+		schedule_timeout_interruptible(stat_interval * HZ);
+		rcu_torture_stats_print();
+		rcutorture_shutdown_absorb("rcu_torture_stats");
+		if (kthread_should_stop())
+			break;
+	}
+	VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
+	return 0;
+}
+
+static int rcu_idle_cpu;	/* Force all torture tasks off this CPU */
+
+/* Apply the shuffle mask to @t; tasks that were never created are skipped. */
+static void rcu_torture_shuffle_one(struct task_struct *t)
+{
+	if (t)
+		set_cpus_allowed_ptr(t, shuffle_tmp_mask);
+}
+
+/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
+ * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
+ */
+static void rcu_torture_shuffle_tasks(void)
+{
+	int n;
+
+	cpumask_setall(shuffle_tmp_mask);
+	get_online_cpus();
+
+	/* No point in shuffling if there is only one online CPU (ex: UP) */
+	if (num_online_cpus() == 1)
+		goto out;
+
+	if (rcu_idle_cpu != -1)
+		cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
+
+	set_cpus_allowed_ptr(current, shuffle_tmp_mask);
+
+	if (reader_tasks) {
+		for (n = 0; n < nrealreaders; n++)
+			rcu_torture_shuffle_one(reader_tasks[n]);
+	}
+	if (fakewriter_tasks) {
+		for (n = 0; n < nfakewriters; n++)
+			rcu_torture_shuffle_one(fakewriter_tasks[n]);
+	}
+	rcu_torture_shuffle_one(writer_task);
+	rcu_torture_shuffle_one(stats_task);
+	rcu_torture_shuffle_one(stutter_task);
+	rcu_torture_shuffle_one(fqs_task);
+	rcu_torture_shuffle_one(shutdown_task);
+#ifdef CONFIG_HOTPLUG_CPU
+	rcu_torture_shuffle_one(onoff_task);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+	rcu_torture_shuffle_one(stall_task);
+	if (barrier_cbs_tasks) {
+		for (n = 0; n < n_barrier_cbs; n++)
+			rcu_torture_shuffle_one(barrier_cbs_tasks[n]);
+	}
+	rcu_torture_shuffle_one(barrier_task);
+
+	/* Step to the next lower CPU; from -1 start at the highest count. */
+	if (rcu_idle_cpu != -1)
+		rcu_idle_cpu--;
+	else
+		rcu_idle_cpu = num_online_cpus() - 1;
+
+out:
+	put_online_cpus();
+}
+
+/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
+ * system to become idle at a time and cut off its timer ticks. This is meant
+ * to test the support for such tickless idle CPU in RCU.
+ */
+static int
+rcu_torture_shuffle(void *arg)
+{
+	VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
+	for (;;) {
+		schedule_timeout_interruptible(shuffle_interval * HZ);
+		rcu_torture_shuffle_tasks();
+		rcutorture_shutdown_absorb("rcu_torture_shuffle");
+		if (kthread_should_stop())
+			break;
+	}
+	VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
+	return 0;
+}
+
+/* Cause the rcutorture test to "stutter", starting and stopping all
+ * threads periodically.
+ */
+static int
+rcu_torture_stutter(void *arg)
+{
+	VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
+	do {
+		/* Run phase, then pause phase, each stutter seconds long. */
+		schedule_timeout_interruptible(stutter * HZ);
+		stutter_pause_test = 1;
+		if (!kthread_should_stop())
+			schedule_timeout_interruptible(stutter * HZ);
+		stutter_pause_test = 0;
+		rcutorture_shutdown_absorb("rcu_torture_stutter");
+	} while (!kthread_should_stop());
+	VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
+	return 0;
+}
+
+/* Print the module parameters in effect, prefixed by @tag. */
+static inline void
+rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
+{
+	pr_alert("%s" TORTURE_FLAG
+		 "--- %s: nreaders=%d nfakewriters=%d "
+		 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
+		 "shuffle_interval=%d stutter=%d irqreader=%d "
+		 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
+		 "test_boost=%d/%d test_boost_interval=%d "
+		 "test_boost_duration=%d shutdown_secs=%d "
+		 "stall_cpu=%d stall_cpu_holdoff=%d "
+		 "n_barrier_cbs=%d "
+		 "onoff_interval=%d onoff_holdoff=%d\n",
+		 torture_type, tag, nrealreaders, nfakewriters,
+		 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
+		 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
+		 test_boost, cur_ops->can_boost,
+		 test_boost_interval, test_boost_duration, shutdown_secs,
+		 stall_cpu, stall_cpu_holdoff,
+		 n_barrier_cbs,
+		 onoff_interval, onoff_holdoff);
+}
+
+static struct notifier_block rcutorture_shutdown_nb = {
+	.notifier_call = rcutorture_shutdown_notify,
+};
+
+/* Stop the boost kthread recorded for @cpu, if there is one. */
+static void rcutorture_booster_cleanup(int cpu)
+{
+	struct task_struct *t;
+
+	if (boost_tasks[cpu] == NULL)
+		return;
+	mutex_lock(&boost_mutex);
+	VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
+	t = boost_tasks[cpu];
+	boost_tasks[cpu] = NULL;
+	mutex_unlock(&boost_mutex);
+
+	/*
+	 * This must be outside of the mutex, otherwise deadlock!
+	 *
+	 * Do not store to boost_tasks[cpu] again here: the slot was
+	 * already cleared under boost_mutex, and an unlocked store could
+	 * overwrite a task pointer installed in the meantime by
+	 * rcutorture_booster_init().
+	 */
+	kthread_stop(t);
+}
+
+/*
+ * Create, bind and start the boost kthread for @cpu unless one is already
+ * recorded.  Returns 0 on success or the kthread-creation error code.
+ */
+static int rcutorture_booster_init(int cpu)
+{
+	int retval;
+
+	if (boost_tasks[cpu] != NULL)
+		return 0;  /* Already created, nothing more to do. */
+
+	/* Don't allow time recalculation while creating a new task. */
+	mutex_lock(&boost_mutex);
+	VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
+	boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
+						  cpu_to_node(cpu),
+						  "rcu_torture_boost");
+	if (IS_ERR(boost_tasks[cpu])) {
+		retval = PTR_ERR(boost_tasks[cpu]);
+		VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
+		n_rcu_torture_boost_ktrerror++;
+		boost_tasks[cpu] = NULL;
+		mutex_unlock(&boost_mutex);
+		return retval;
+	}
+	kthread_bind(boost_tasks[cpu], cpu);
+	wake_up_process(boost_tasks[cpu]);
+	mutex_unlock(&boost_mutex);
+	return 0;
+}
+
+/*
+ * Cause the rcutorture test to shutdown the system after the test has
+ * run for the time specified by the shutdown_secs module parameter.
+ */
+static int
+rcu_torture_shutdown(void *arg)
+{
+	long delta;
+	unsigned long jiffies_snap;
+
+	VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
+	jiffies_snap = ACCESS_ONCE(jiffies);
+	/* Sleep until shutdown_time, re-checking after every wakeup. */
+	while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
+	       !kthread_should_stop()) {
+		delta = shutdown_time - jiffies_snap;
+		if (verbose)
+			pr_alert("%s" TORTURE_FLAG
+				 "rcu_torture_shutdown task: %lu jiffies remaining\n",
+				 torture_type, delta);
+		schedule_timeout_interruptible(delta);
+		jiffies_snap = ACCESS_ONCE(jiffies);
+	}
+	if (kthread_should_stop()) {
+		VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
+		return 0;
+	}
+
+	/* OK, shut down the system. */
+
+	VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
+	shutdown_task = NULL;	/* Avoid self-kill deadlock. */
+	rcu_torture_cleanup();	/* Get the success/failure message. */
+	kernel_power_off();	/* Shut down the system. */
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Execute random CPU-hotplug operations at the interval specified
+ * by the onoff_interval.
+ */
+static int
+rcu_torture_onoff(void *arg)
+{
+	int cpu;
+	unsigned long delta;
+	int maxcpu = -1;
+	DEFINE_TORTURE_RANDOM(rand);
+	int ret;
+	unsigned long starttime;
+
+	VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
+	/* The last CPU visited by the iterator bounds the random choice. */
+	for_each_online_cpu(cpu)
+		maxcpu = cpu;
+	WARN_ON(maxcpu < 0);
+	if (onoff_holdoff > 0) {
+		VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff");
+		schedule_timeout_interruptible(onoff_holdoff * HZ);
+		VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff");
+	}
+	while (!kthread_should_stop()) {
+		cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
+		if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
+			/* Online and hotpluggable: try to take it down. */
+			if (verbose)
+				pr_alert("%s" TORTURE_FLAG
+					 "rcu_torture_onoff task: offlining %d\n",
+					 torture_type, cpu);
+			starttime = jiffies;
+			n_offline_attempts++;
+			ret = cpu_down(cpu);
+			if (ret) {
+				if (verbose)
+					pr_alert("%s" TORTURE_FLAG
+						 "rcu_torture_onoff task: offline %d failed: errno %d\n",
+						 torture_type, cpu, ret);
+			} else {
+				if (verbose)
+					pr_alert("%s" TORTURE_FLAG
+						 "rcu_torture_onoff task: offlined %d\n",
+						 torture_type, cpu);
+				n_offline_successes++;
+				delta = jiffies - starttime;
+				sum_offline += delta;
+				/* A negative minimum means no sample yet. */
+				if (min_offline < 0) {
+					min_offline = delta;
+					max_offline = delta;
+				}
+				if (min_offline > delta)
+					min_offline = delta;
+				if (max_offline < delta)
+					max_offline = delta;
+			}
+		} else if (cpu_is_hotpluggable(cpu)) {
+			/* Offline and hotpluggable: try to bring it up. */
+			if (verbose)
+				pr_alert("%s" TORTURE_FLAG
+					 "rcu_torture_onoff task: onlining %d\n",
+					 torture_type, cpu);
+			starttime = jiffies;
+			n_online_attempts++;
+			ret = cpu_up(cpu);
+			if (ret) {
+				if (verbose)
+					pr_alert("%s" TORTURE_FLAG
+						 "rcu_torture_onoff task: online %d failed: errno %d\n",
+						 torture_type, cpu, ret);
+			} else {
+				if (verbose)
+					pr_alert("%s" TORTURE_FLAG
+						 "rcu_torture_onoff task: onlined %d\n",
+						 torture_type, cpu);
+				n_online_successes++;
+				delta = jiffies - starttime;
+				sum_online += delta;
+				/* A negative minimum means no sample yet. */
+				if (min_online < 0) {
+					min_online = delta;
+					max_online = delta;
+				}
+				if (min_online > delta)
+					min_online = delta;
+				if (max_online < delta)
+					max_online = delta;
+			}
+		}
+		schedule_timeout_interruptible(onoff_interval * HZ);
+	}
+	VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
+	return 0;
+}
+
+/* Spawn the CPU-hotplug kthread if onoff_interval is positive. */
+static int
+rcu_torture_onoff_init(void)
+{
+	int ret;
+
+	if (onoff_interval <= 0)
+		return 0;
+	onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
+	if (IS_ERR(onoff_task)) {
+		ret = PTR_ERR(onoff_task);
+		onoff_task = NULL;
+		return ret;
+	}
+	return 0;
+}
+
+/* Stop the CPU-hotplug kthread, if one was spawned. */
+static void rcu_torture_onoff_cleanup(void)
+{
+	if (onoff_task == NULL)
+		return;
+	VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
+	kthread_stop(onoff_task);
+	onoff_task = NULL;
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/* Without CONFIG_HOTPLUG_CPU the init and cleanup calls are no-ops. */
+static int
+rcu_torture_onoff_init(void)
+{
+	return 0;
+}
+
+static void rcu_torture_onoff_cleanup(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
+ * induces a CPU stall for the time specified by stall_cpu.
+ */
+static int rcu_torture_stall(void *args)
+{
+	unsigned long stop_at;
+
+	VERBOSE_PRINTK_STRING("rcu_torture_stall task started");
+	if (stall_cpu_holdoff > 0) {
+		VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff");
+		schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
+		VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff");
+	}
+	if (!kthread_should_stop()) {
+		stop_at = get_seconds() + stall_cpu;
+		/* RCU CPU stall is expected behavior in following code. */
+		pr_alert("rcu_torture_stall start.\n");
+		rcu_read_lock();
+		preempt_disable();
+		while (ULONG_CMP_LT(get_seconds(), stop_at))
+			continue;  /* Induce RCU CPU stall warning. */
+		preempt_enable();
+		rcu_read_unlock();
+		pr_alert("rcu_torture_stall end.\n");
+	}
+	rcutorture_shutdown_absorb("rcu_torture_stall");
+	while (!kthread_should_stop())
+		schedule_timeout_interruptible(10 * HZ);
+	return 0;
+}
+
+/* Spawn CPU-stall kthread, if stall_cpu specified. */
+static int __init rcu_torture_stall_init(void)
+{
+	int ret;
+
+	if (stall_cpu <= 0)
+		return 0;
+	stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall");
+	if (IS_ERR(stall_task)) {
+		ret = PTR_ERR(stall_task);
+		stall_task = NULL;
+		return ret;
+	}
+	return 0;
+}
+
+/* Clean up after the CPU-stall kthread, if one was spawned. */
+static void rcu_torture_stall_cleanup(void)
+{
+	if (stall_task == NULL)
+		return;
+	VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
+	kthread_stop(stall_task);
+	stall_task = NULL;
+}
+
+/* Callback function for RCU barrier testing. */
+void rcu_torture_barrier_cbf(struct rcu_head *rcu)
+{
+	atomic_inc(&barrier_cbs_invoked);
+}
+
+/* kthread function to register callbacks used to test RCU barriers. */
+static int rcu_torture_barrier_cbs(void *arg)
+{
+	long myid = (long)arg;
+	bool lastphase = 0;
+	bool newphase;
+	struct rcu_head rcu;
+
+	init_rcu_head_on_stack(&rcu);
+	VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
+	set_user_nice(current, 19);
+	do {
+		/* Sleep until barrier_phase flips or the test is ending. */
+		wait_event(barrier_cbs_wq[myid],
+			   (newphase =
+			    ACCESS_ONCE(barrier_phase)) != lastphase ||
+			   kthread_should_stop() ||
+			   fullstop != FULLSTOP_DONTSTOP);
+		lastphase = newphase;
+		smp_mb(); /* ensure barrier_phase load before ->call(). */
+		if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+			break;
+		cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+		/* The last kthread to post its callback wakes the driver. */
+		if (atomic_dec_and_test(&barrier_cbs_count))
+			wake_up(&barrier_wq);
+	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+	VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
+	rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
+	while (!kthread_should_stop())
+		schedule_timeout_interruptible(1);
+	/* The rcu_head lives on this stack: wait for callbacks first. */
+	cur_ops->cb_barrier();
+	destroy_rcu_head_on_stack(&rcu);
+	return 0;
+}
+
+/* kthread function to drive and coordinate RCU barrier testing. */
+static int rcu_torture_barrier(void *arg)
+{
+	int i;
+
+	VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
+	do {
+		atomic_set(&barrier_cbs_invoked, 0);
+		atomic_set(&barrier_cbs_count, n_barrier_cbs);
+		smp_mb(); /* Ensure barrier_phase after prior assignments. */
+		barrier_phase = !barrier_phase;
+		for (i = 0; i < n_barrier_cbs; i++)
+			wake_up(&barrier_cbs_wq[i]);
+		/* Wait until every callback kthread has posted its callback. */
+		wait_event(barrier_wq,
+			   atomic_read(&barrier_cbs_count) == 0 ||
+			   kthread_should_stop() ||
+			   fullstop != FULLSTOP_DONTSTOP);
+		if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+			break;
+		n_barrier_attempts++;
+		cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
+		/* After the barrier, all posted callbacks must have run. */
+		if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
+			n_rcu_torture_barrier_error++;
+			WARN_ON_ONCE(1);
+		}
+		n_barrier_successes++;
+		schedule_timeout_interruptible(HZ / 10);
+	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+	VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
+	rcutorture_shutdown_absorb("rcu_torture_barrier");
+	while (!kthread_should_stop())
+		schedule_timeout_interruptible(1);
+	return 0;
+}
+
+/* Initialize RCU barrier testing.
*/
+/*
+ * Returns 0 when barrier testing is disabled (n_barrier_cbs == 0), when the
+ * current flavor lacks ->call or ->cb_barrier, or when every kthread was
+ * started.  Returns -ENOMEM if an array allocation fails, or the
+ * kthread_run() error if any kthread cannot be created.
+ */
+static int rcu_torture_barrier_init(void)
+{
+	int i;
+	int ret;
+
+	if (n_barrier_cbs == 0)
+		return 0;
+	if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
+		pr_alert("%s" TORTURE_FLAG
+			 " Call or barrier ops missing for %s,\n",
+			 torture_type, cur_ops->name);
+		pr_alert("%s" TORTURE_FLAG
+			 " RCU barrier testing omitted from run.\n",
+			 torture_type);
+		return 0;
+	}
+	atomic_set(&barrier_cbs_count, 0);
+	atomic_set(&barrier_cbs_invoked, 0);
+	barrier_cbs_tasks =
+		kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
+			GFP_KERNEL);
+	barrier_cbs_wq =
+		kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
+			GFP_KERNEL);
+	if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
+		return -ENOMEM;
+	/* One callback-posting kthread per slot, each with its own queue. */
+	for (i = 0; i < n_barrier_cbs; i++) {
+		init_waitqueue_head(&barrier_cbs_wq[i]);
+		barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
+						   (void *)(long)i,
+						   "rcu_torture_barrier_cbs");
+		if (IS_ERR(barrier_cbs_tasks[i])) {
+			ret = PTR_ERR(barrier_cbs_tasks[i]);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
+			barrier_cbs_tasks[i] = NULL;
+			return ret;
+		}
+	}
+	barrier_task = kthread_run(rcu_torture_barrier, NULL,
+				   "rcu_torture_barrier");
+	if (IS_ERR(barrier_task)) {
+		ret = PTR_ERR(barrier_task);
+		VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
+		barrier_task = NULL;
+		/* Report the failure instead of silently returning 0. */
+		return ret;
+	}
+	return 0;
+}
+
+/* Clean up after RCU barrier testing.
*/ +static void rcu_torture_barrier_cleanup(void) +{ + int i; + + if (barrier_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); + kthread_stop(barrier_task); + barrier_task = NULL; + } + if (barrier_cbs_tasks != NULL) { + for (i = 0; i < n_barrier_cbs; i++) { + if (barrier_cbs_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); + kthread_stop(barrier_cbs_tasks[i]); + barrier_cbs_tasks[i] = NULL; + } + } + kfree(barrier_cbs_tasks); + barrier_cbs_tasks = NULL; + } + if (barrier_cbs_wq != NULL) { + kfree(barrier_cbs_wq); + barrier_cbs_wq = NULL; + } +} + +static int rcutorture_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + (void)rcutorture_booster_init(cpu); + break; + case CPU_DOWN_PREPARE: + rcutorture_booster_cleanup(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block rcutorture_cpu_nb = { + .notifier_call = rcutorture_cpu_notify, +}; + +static void +rcu_torture_cleanup(void) +{ + int i; + + mutex_lock(&fullstop_mutex); + rcutorture_record_test_transition(); + if (fullstop == FULLSTOP_SHUTDOWN) { + pr_warn(/* but going down anyway, so... 
*/ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + schedule_timeout_uninterruptible(10); + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); + return; + } + fullstop = FULLSTOP_RMMOD; + mutex_unlock(&fullstop_mutex); + unregister_reboot_notifier(&rcutorture_shutdown_nb); + rcu_torture_barrier_cleanup(); + rcu_torture_stall_cleanup(); + if (stutter_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); + kthread_stop(stutter_task); + } + stutter_task = NULL; + if (shuffler_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + kthread_stop(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + } + shuffler_task = NULL; + + if (writer_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + kthread_stop(writer_task); + } + writer_task = NULL; + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) { + if (reader_tasks[i]) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_reader task"); + kthread_stop(reader_tasks[i]); + } + reader_tasks[i] = NULL; + } + kfree(reader_tasks); + reader_tasks = NULL; + } + rcu_torture_current = NULL; + + if (fakewriter_tasks) { + for (i = 0; i < nfakewriters; i++) { + if (fakewriter_tasks[i]) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_fakewriter task"); + kthread_stop(fakewriter_tasks[i]); + } + fakewriter_tasks[i] = NULL; + } + kfree(fakewriter_tasks); + fakewriter_tasks = NULL; + } + + if (stats_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + kthread_stop(stats_task); + } + stats_task = NULL; + + if (fqs_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); + kthread_stop(fqs_task); + } + fqs_task = NULL; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + unregister_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) + rcutorture_booster_cleanup(i); + } + if (shutdown_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); + 
kthread_stop(shutdown_task); + } + shutdown_task = NULL; + rcu_torture_onoff_cleanup(); + + /* Wait for all RCU callbacks to fire. */ + + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); + + rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) + rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); + else if (n_online_successes != n_online_attempts || + n_offline_successes != n_offline_attempts) + rcu_torture_print_module_parms(cur_ops, + "End of test: RCU_HOTPLUG"); + else + rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); +} + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static void rcu_torture_leak_cb(struct rcu_head *rhp) +{ +} + +static void rcu_torture_err_cb(struct rcu_head *rhp) +{ + /* + * This -might- happen due to race conditions, but is unlikely. + * The scenario that leads to this happening is that the + * first of the pair of duplicate callbacks is queued, + * someone else starts a grace period that includes that + * callback, then the second of the pair must wait for the + * next grace period. Unlikely, but can happen. If it + * does happen, the debug-objects subsystem won't have splatted. + */ + pr_alert("rcutorture: duplicated callback was invoked.\n"); +} +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +/* + * Verify that double-free causes debug-objects to complain, but only + * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test + * cannot be carried out. + */ +static void rcu_test_debug_objects(void) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + struct rcu_head rh1; + struct rcu_head rh2; + + init_rcu_head_on_stack(&rh1); + init_rcu_head_on_stack(&rh2); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); + + /* Try to queue the rh2 pair of callbacks for the same grace period. */ + preempt_disable(); /* Prevent preemption from interrupting test. 
*/ + rcu_read_lock(); /* Make it impossible to finish a grace period. */ + call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + local_irq_disable(); /* Make it harder to start a new grace period. */ + call_rcu(&rh2, rcu_torture_leak_cb); + call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + local_irq_enable(); + rcu_read_unlock(); + preempt_enable(); + + /* Wait for them all to get done so we can safely return. */ + rcu_barrier(); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); + destroy_rcu_head_on_stack(&rh1); + destroy_rcu_head_on_stack(&rh2); +#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); +#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +} + +static int __init +rcu_torture_init(void) +{ + int i; + int cpu; + int firsterr = 0; + int retval; + static struct rcu_torture_ops *torture_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + }; + + mutex_lock(&fullstop_mutex); + + /* Process args and tell the world that the torturer is on the job. */ + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(torture_ops)) { + pr_alert("rcu-torture: invalid torture type: \"%s\"\n", + torture_type); + pr_alert("rcu-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + pr_alert(" %s", torture_ops[i]->name); + pr_alert("\n"); + mutex_unlock(&fullstop_mutex); + return -EINVAL; + } + if (cur_ops->fqs == NULL && fqs_duration != 0) { + pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); + fqs_duration = 0; + } + if (cur_ops->init) + cur_ops->init(); /* no "goto unwind" prior to this point!!! 
*/ + + if (nreaders >= 0) + nrealreaders = nreaders; + else + nrealreaders = 2 * num_online_cpus(); + rcu_torture_print_module_parms(cur_ops, "Start of test"); + fullstop = FULLSTOP_DONTSTOP; + + /* Set up the freelist. */ + + INIT_LIST_HEAD(&rcu_torture_freelist); + for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { + rcu_tortures[i].rtort_mbtest = 0; + list_add_tail(&rcu_tortures[i].rtort_free, + &rcu_torture_freelist); + } + + /* Initialize the statistics so that each run gets its own numbers. */ + + rcu_torture_current = NULL; + rcu_torture_current_version = 0; + atomic_set(&n_rcu_torture_alloc, 0); + atomic_set(&n_rcu_torture_alloc_fail, 0); + atomic_set(&n_rcu_torture_free, 0); + atomic_set(&n_rcu_torture_mberror, 0); + atomic_set(&n_rcu_torture_error, 0); + n_rcu_torture_barrier_error = 0; + n_rcu_torture_boost_ktrerror = 0; + n_rcu_torture_boost_rterror = 0; + n_rcu_torture_boost_failure = 0; + n_rcu_torture_boosts = 0; + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + atomic_set(&rcu_torture_wcount[i], 0); + for_each_possible_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + per_cpu(rcu_torture_count, cpu)[i] = 0; + per_cpu(rcu_torture_batch, cpu)[i] = 0; + } + } + + /* Start up the kthreads. 
*/ + + VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); + writer_task = kthread_create(rcu_torture_writer, NULL, + "rcu_torture_writer"); + if (IS_ERR(writer_task)) { + firsterr = PTR_ERR(writer_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); + writer_task = NULL; + goto unwind; + } + wake_up_process(writer_task); + fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), + GFP_KERNEL); + if (fakewriter_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nfakewriters; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); + fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, + "rcu_torture_fakewriter"); + if (IS_ERR(fakewriter_tasks[i])) { + firsterr = PTR_ERR(fakewriter_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); + fakewriter_tasks[i] = NULL; + goto unwind; + } + } + reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); + reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, + "rcu_torture_reader"); + if (IS_ERR(reader_tasks[i])) { + firsterr = PTR_ERR(reader_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); + reader_tasks[i] = NULL; + goto unwind; + } + } + if (stat_interval > 0) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); + stats_task = kthread_run(rcu_torture_stats, NULL, + "rcu_torture_stats"); + if (IS_ERR(stats_task)) { + firsterr = PTR_ERR(stats_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); + stats_task = NULL; + goto unwind; + } + } + if (test_no_idle_hz) { + rcu_idle_cpu = num_online_cpus() - 1; + + if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { + firsterr = -ENOMEM; + VERBOSE_PRINTK_ERRSTRING("Failed 
to alloc mask"); + goto unwind; + } + + /* Create the shuffler thread */ + shuffler_task = kthread_run(rcu_torture_shuffle, NULL, + "rcu_torture_shuffle"); + if (IS_ERR(shuffler_task)) { + free_cpumask_var(shuffle_tmp_mask); + firsterr = PTR_ERR(shuffler_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); + shuffler_task = NULL; + goto unwind; + } + } + if (stutter < 0) + stutter = 0; + if (stutter) { + /* Create the stutter thread */ + stutter_task = kthread_run(rcu_torture_stutter, NULL, + "rcu_torture_stutter"); + if (IS_ERR(stutter_task)) { + firsterr = PTR_ERR(stutter_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); + stutter_task = NULL; + goto unwind; + } + } + if (fqs_duration < 0) + fqs_duration = 0; + if (fqs_duration) { + /* Create the stutter thread */ + fqs_task = kthread_run(rcu_torture_fqs, NULL, + "rcu_torture_fqs"); + if (IS_ERR(fqs_task)) { + firsterr = PTR_ERR(fqs_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); + fqs_task = NULL; + goto unwind; + } + } + if (test_boost_interval < 1) + test_boost_interval = 1; + if (test_boost_duration < 2) + test_boost_duration = 2; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + + boost_starttime = jiffies + test_boost_interval * HZ; + register_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) { + if (cpu_is_offline(i)) + continue; /* Heuristic: CPU can go offline. 
*/ + retval = rcutorture_booster_init(i); + if (retval < 0) { + firsterr = retval; + goto unwind; + } + } + } + if (shutdown_secs > 0) { + shutdown_time = jiffies + shutdown_secs * HZ; + shutdown_task = kthread_create(rcu_torture_shutdown, NULL, + "rcu_torture_shutdown"); + if (IS_ERR(shutdown_task)) { + firsterr = PTR_ERR(shutdown_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); + shutdown_task = NULL; + goto unwind; + } + wake_up_process(shutdown_task); + } + i = rcu_torture_onoff_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } + register_reboot_notifier(&rcutorture_shutdown_nb); + i = rcu_torture_stall_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } + retval = rcu_torture_barrier_init(); + if (retval != 0) { + firsterr = retval; + goto unwind; + } + if (object_debug) + rcu_test_debug_objects(); + rcutorture_record_test_transition(); + mutex_unlock(&fullstop_mutex); + return 0; + +unwind: + mutex_unlock(&fullstop_mutex); + rcu_torture_cleanup(); + return firsterr; +} + +module_init(rcu_torture_init); +module_exit(rcu_torture_cleanup); diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c deleted file mode 100644 index dad67238d086..000000000000 --- a/kernel/rcu/torture.c +++ /dev/null @@ -1,2148 +0,0 @@ -/* - * Read-Copy Update module-based torture test facility - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2005, 2006 - * - * Authors: Paul E. McKenney - * Josh Triplett - * - * See also: Documentation/RCU/torture.txt - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); - -MODULE_ALIAS("rcutorture"); -#ifdef MODULE_PARAM_PREFIX -#undef MODULE_PARAM_PREFIX -#endif -#define MODULE_PARAM_PREFIX "rcutorture." - -static int fqs_duration; -module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); -static int fqs_holdoff; -module_param(fqs_holdoff, int, 0444); -MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); -static int fqs_stutter = 3; -module_param(fqs_stutter, int, 0444); -MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); -static bool gp_exp; -module_param(gp_exp, bool, 0444); -MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); -static bool gp_normal; -module_param(gp_normal, bool, 0444); -MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); -static int irqreader = 1; -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); -static int n_barrier_cbs; -module_param(n_barrier_cbs, int, 0444); -MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); -static int nfakewriters = 4; -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -static int nreaders = -1; 
-module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -static int object_debug; -module_param(object_debug, int, 0444); -MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); -static int onoff_holdoff; -module_param(onoff_holdoff, int, 0444); -MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); -static int onoff_interval; -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); -static int shuffle_interval = 3; -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -static int shutdown_secs; -module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); -static int stall_cpu; -module_param(stall_cpu, int, 0444); -MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); -static int stall_cpu_holdoff = 10; -module_param(stall_cpu_holdoff, int, 0444); -MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); -static int stat_interval = 60; -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -static int stutter = 5; -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -static int test_boost = 1; -module_param(test_boost, int, 0444); -MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -static int test_boost_duration = 4; -module_param(test_boost_duration, int, 0444); -MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); -static int test_boost_interval = 7; -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); -static bool test_no_idle_hz = true; -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, 
"Test support for tickless idle CPUs"); -static char *torture_type = "rcu"; -module_param(torture_type, charp, 0444); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -static bool verbose; -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); - -#define TORTURE_FLAG "-torture:" -#define PRINTK_STRING(s) \ - do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) - -static int nrealreaders; -static struct task_struct *writer_task; -static struct task_struct **fakewriter_tasks; -static struct task_struct **reader_tasks; -static struct task_struct *stats_task; -static struct task_struct *shuffler_task; -static struct task_struct *stutter_task; -static struct task_struct *fqs_task; -static struct task_struct *boost_tasks[NR_CPUS]; -static struct task_struct *shutdown_task; -#ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *onoff_task; -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static struct task_struct *stall_task; -static struct task_struct **barrier_cbs_tasks; -static struct task_struct *barrier_task; - -#define RCU_TORTURE_PIPE_LEN 10 - -struct rcu_torture { - struct rcu_head rtort_rcu; - int rtort_pipe_count; - struct list_head rtort_free; - int rtort_mbtest; -}; - -static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture __rcu *rcu_torture_current; -static unsigned long rcu_torture_current_version; -static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; -static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_batch) = { 0 }; -static atomic_t 
rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; -static atomic_t n_rcu_torture_alloc; -static atomic_t n_rcu_torture_alloc_fail; -static atomic_t n_rcu_torture_free; -static atomic_t n_rcu_torture_mberror; -static atomic_t n_rcu_torture_error; -static long n_rcu_torture_barrier_error; -static long n_rcu_torture_boost_ktrerror; -static long n_rcu_torture_boost_rterror; -static long n_rcu_torture_boost_failure; -static long n_rcu_torture_boosts; -static long n_rcu_torture_timers; -static long n_offline_attempts; -static long n_offline_successes; -static unsigned long sum_offline; -static int min_offline = -1; -static int max_offline; -static long n_online_attempts; -static long n_online_successes; -static unsigned long sum_online; -static int min_online = -1; -static int max_online; -static long n_barrier_attempts; -static long n_barrier_successes; -static struct list_head rcu_torture_removed; -static cpumask_var_t shuffle_tmp_mask; - -static int stutter_pause_test; - -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -module_param(rcutorture_runnable, int, 0444); -MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); - -#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) -#define rcu_can_boost() 1 -#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ -#define rcu_can_boost() 0 -#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ - -#ifdef CONFIG_RCU_TRACE -static u64 notrace rcu_trace_clock_local(void) -{ - u64 ts = trace_clock_local(); - unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); - return ts; -} -#else /* #ifdef CONFIG_RCU_TRACE */ -static u64 notrace rcu_trace_clock_local(void) -{ - return 0ULL; -} -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - -static unsigned long shutdown_time; /* jiffies to system shutdown. 
*/ -static unsigned long boost_starttime; /* jiffies of next boost test start. */ -DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ - /* and boost task create/destroy. */ -static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ -static bool barrier_phase; /* Test phase. */ -static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ -static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ -static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); - -/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ - -#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ -#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ -#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ -static int fullstop = FULLSTOP_RMMOD; -/* - * Protect fullstop transitions and spawning of kthreads. - */ -static DEFINE_MUTEX(fullstop_mutex); - -/* Forward reference. */ -static void rcu_torture_cleanup(void); - -/* - * Detect and respond to a system shutdown. - */ -static int -rcutorture_shutdown_notify(struct notifier_block *unused1, - unsigned long unused2, void *unused3) -{ - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) - fullstop = FULLSTOP_SHUTDOWN; - else - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - return NOTIFY_DONE; -} - -/* - * Absorb kthreads into a kernel function that won't return, so that - * they won't ever access module text or data again. - */ -static void rcutorture_shutdown_absorb(const char *title) -{ - if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - pr_notice( - "rcutorture thread %s parking due to system shutdown\n", - title); - schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); - } -} - -/* - * Allocate an element from the rcu_tortures pool. 
- */ -static struct rcu_torture * -rcu_torture_alloc(void) -{ - struct list_head *p; - - spin_lock_bh(&rcu_torture_lock); - if (list_empty(&rcu_torture_freelist)) { - atomic_inc(&n_rcu_torture_alloc_fail); - spin_unlock_bh(&rcu_torture_lock); - return NULL; - } - atomic_inc(&n_rcu_torture_alloc); - p = rcu_torture_freelist.next; - list_del_init(p); - spin_unlock_bh(&rcu_torture_lock); - return container_of(p, struct rcu_torture, rtort_free); -} - -/* - * Free an element to the rcu_tortures pool. - */ -static void -rcu_torture_free(struct rcu_torture *p) -{ - atomic_inc(&n_rcu_torture_free); - spin_lock_bh(&rcu_torture_lock); - list_add_tail(&p->rtort_free, &rcu_torture_freelist); - spin_unlock_bh(&rcu_torture_lock); -} - -struct rcu_random_state { - unsigned long rrs_state; - long rrs_count; -}; - -#define RCU_RANDOM_MULT 39916801 /* prime */ -#define RCU_RANDOM_ADD 479001701 /* prime */ -#define RCU_RANDOM_REFRESH 10000 - -#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } - -/* - * Crude but fast random-number generator. Uses a linear congruential - * generator, with occasional help from cpu_clock(). - */ -static unsigned long -rcu_random(struct rcu_random_state *rrsp) -{ - if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += (unsigned long)local_clock(); - rrsp->rrs_count = RCU_RANDOM_REFRESH; - } - rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; - return swahw32(rrsp->rrs_state); -} - -static void -rcu_stutter_wait(const char *title) -{ - while (stutter_pause_test || !rcutorture_runnable) { - if (rcutorture_runnable) - schedule_timeout_interruptible(1); - else - schedule_timeout_interruptible(round_jiffies_relative(HZ)); - rcutorture_shutdown_absorb(title); - } -} - -/* - * Operations vector for selecting different types of tests. 
- */ - -struct rcu_torture_ops { - void (*init)(void); - int (*readlock)(void); - void (*read_delay)(struct rcu_random_state *rrsp); - void (*readunlock)(int idx); - int (*completed)(void); - void (*deferred_free)(struct rcu_torture *p); - void (*sync)(void); - void (*exp_sync)(void); - void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); - void (*cb_barrier)(void); - void (*fqs)(void); - void (*stats)(char *page); - int irq_capable; - int can_boost; - const char *name; -}; - -static struct rcu_torture_ops *cur_ops; - -/* - * Definitions for rcu torture testing. - */ - -static int rcu_torture_read_lock(void) __acquires(RCU) -{ - rcu_read_lock(); - return 0; -} - -static void rcu_read_delay(struct rcu_random_state *rrsp) -{ - const unsigned long shortdelay_us = 200; - const unsigned long longdelay_ms = 50; - - /* We want a short delay sometimes to make a reader delay the grace - * period, and we want a long delay occasionally to trigger - * force_quiescent_state. */ - - if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) - mdelay(longdelay_ms); - if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) - udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT - if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) - preempt_schedule(); /* No QS if preempt_disable() in effect */ -#endif -} - -static void rcu_torture_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); -} - -static int rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - -static void -rcu_torture_cb(struct rcu_head *p) -{ - int i; - struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - - if (fullstop != FULLSTOP_DONTSTOP) { - /* Test is ending, just drop callbacks on the floor. */ - /* The next initialization will pick up the pieces. 
*/ - return; - } - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - rcu_torture_free(rp); - } else { - cur_ops->deferred_free(rp); - } -} - -static int rcu_no_completed(void) -{ - return 0; -} - -static void rcu_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu(&p->rtort_rcu, rcu_torture_cb); -} - -static void rcu_sync_torture_init(void) -{ - INIT_LIST_HEAD(&rcu_torture_removed); -} - -static struct rcu_torture_ops rcu_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferred_free = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .exp_sync = synchronize_rcu_expedited, - .call = call_rcu, - .cb_barrier = rcu_barrier, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu" -}; - -/* - * Definitions for rcu_bh torture testing. - */ - -static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) -{ - rcu_read_lock_bh(); - return 0; -} - -static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) -{ - rcu_read_unlock_bh(); -} - -static int rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - -static void rcu_bh_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops rcu_bh_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_bh_torture_deferred_free, - .sync = synchronize_rcu_bh, - .exp_sync = synchronize_rcu_bh_expedited, - .call = call_rcu_bh, - .cb_barrier = rcu_barrier_bh, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh" -}; - -/* - * Definitions for srcu torture testing. - */ - -DEFINE_STATIC_SRCU(srcu_ctl); - -static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) -{ - return srcu_read_lock(&srcu_ctl); -} - -static void srcu_read_delay(struct rcu_random_state *rrsp) -{ - long delay; - const long uspertick = 1000000 / HZ; - const long longdelay = 10; - - /* We want there to be long-running readers, but not all the time. */ - - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay) - schedule_timeout_interruptible(longdelay); - else - rcu_read_delay(rrsp); -} - -static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock(&srcu_ctl, idx); -} - -static int srcu_torture_completed(void) -{ - return srcu_batches_completed(&srcu_ctl); -} - -static void srcu_torture_deferred_free(struct rcu_torture *rp) -{ - call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); -} - -static void srcu_torture_synchronize(void) -{ - synchronize_srcu(&srcu_ctl); -} - -static void srcu_torture_call(struct rcu_head *head, - void (*func)(struct rcu_head *head)) -{ - call_srcu(&srcu_ctl, head, func); -} - -static void srcu_torture_barrier(void) -{ - srcu_barrier(&srcu_ctl); -} - -static void srcu_torture_stats(char *page) -{ - int cpu; - int idx = srcu_ctl.completed & 0x1; - - page += sprintf(page, "%s%s per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - page += sprintf(page, " %d(%lu,%lu)", cpu, - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); - } - sprintf(page, "\n"); -} - -static void 
srcu_torture_synchronize_expedited(void) -{ - synchronize_srcu_expedited(&srcu_ctl); -} - -static struct rcu_torture_ops srcu_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferred_free = srcu_torture_deferred_free, - .sync = srcu_torture_synchronize, - .exp_sync = srcu_torture_synchronize_expedited, - .call = srcu_torture_call, - .cb_barrier = srcu_torture_barrier, - .stats = srcu_torture_stats, - .name = "srcu" -}; - -/* - * Definitions for sched torture testing. - */ - -static int sched_torture_read_lock(void) -{ - preempt_disable(); - return 0; -} - -static void sched_torture_read_unlock(int idx) -{ - preempt_enable(); -} - -static void rcu_sched_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops sched_ops = { - .init = rcu_sync_torture_init, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sched_torture_deferred_free, - .sync = synchronize_sched, - .exp_sync = synchronize_sched_expedited, - .call = call_rcu_sched, - .cb_barrier = rcu_barrier_sched, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "sched" -}; - -/* - * RCU torture priority-boost testing. Runs one real-time thread per - * CPU for moderate bursts, repeatedly registering RCU callbacks and - * spinning waiting for them to be invoked. If a given callback takes - * too long to be invoked, we assume that priority inversion has occurred. 
- */ - -struct rcu_boost_inflight { - struct rcu_head rcu; - int inflight; -}; - -static void rcu_torture_boost_cb(struct rcu_head *head) -{ - struct rcu_boost_inflight *rbip = - container_of(head, struct rcu_boost_inflight, rcu); - - smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ - rbip->inflight = 0; -} - -static int rcu_torture_boost(void *arg) -{ - unsigned long call_rcu_time; - unsigned long endtime; - unsigned long oldstarttime; - struct rcu_boost_inflight rbi = { .inflight = 0 }; - struct sched_param sp; - - VERBOSE_PRINTK_STRING("rcu_torture_boost started"); - - /* Set real-time priority. */ - sp.sched_priority = 1; - if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); - n_rcu_torture_boost_rterror++; - } - - init_rcu_head_on_stack(&rbi.rcu); - /* Each pass through the following loop does one boost-test cycle. */ - do { - /* Wait for the next test interval. */ - oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { - schedule_timeout_interruptible(oldstarttime - jiffies); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) - goto checkwait; - } - - /* Do one boost-test interval. */ - endtime = oldstarttime + test_boost_duration * HZ; - call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { - /* If we don't have a callback in flight, post one. */ - if (!rbi.inflight) { - smp_mb(); /* RCU core before ->inflight = 1. */ - rbi.inflight = 1; - call_rcu(&rbi.rcu, rcu_torture_boost_cb); - if (jiffies - call_rcu_time > - test_boost_duration * HZ - HZ / 2) { - VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); - n_rcu_torture_boost_failure++; - } - call_rcu_time = jiffies; - } - cond_resched(); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) - goto checkwait; - } - - /* - * Set the start time of the next test interval. 
- * Yes, this is vulnerable to long delays, but such - * delays simply cause a false negative for the next - * interval. Besides, we are running at RT priority, - * so delays should be relatively rare. - */ - while (oldstarttime == boost_starttime && - !kthread_should_stop()) { - if (mutex_trylock(&boost_mutex)) { - boost_starttime = jiffies + - test_boost_interval * HZ; - n_rcu_torture_boosts++; - mutex_unlock(&boost_mutex); - break; - } - schedule_timeout_uninterruptible(1); - } - - /* Go do the stutter. */ -checkwait: rcu_stutter_wait("rcu_torture_boost"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - - /* Clean up and exit. */ - VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); - rcutorture_shutdown_absorb("rcu_torture_boost"); - while (!kthread_should_stop() || rbi.inflight) - schedule_timeout_uninterruptible(1); - smp_mb(); /* order accesses to ->inflight before stack-frame death. */ - destroy_rcu_head_on_stack(&rbi.rcu); - return 0; -} - -/* - * RCU torture force-quiescent-state kthread. Repeatedly induces - * bursts of calls to force_quiescent_state(), increasing the probability - * of occurrence of some important types of race conditions. 
- */ -static int -rcu_torture_fqs(void *arg) -{ - unsigned long fqs_resume_time; - int fqs_burst_remaining; - - VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); - do { - fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && - !kthread_should_stop()) { - schedule_timeout_interruptible(1); - } - fqs_burst_remaining = fqs_duration; - while (fqs_burst_remaining > 0 && - !kthread_should_stop()) { - cur_ops->fqs(); - udelay(fqs_holdoff); - fqs_burst_remaining -= fqs_holdoff; - } - rcu_stutter_wait("rcu_torture_fqs"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fqs"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture writer kthread. Repeatedly substitutes a new structure - * for that pointed to by rcu_torture_current, freeing the old structure - * after a series of grace periods (the "pipeline"). 
- */ -static int -rcu_torture_writer(void *arg) -{ - bool exp; - int i; - struct rcu_torture *rp; - struct rcu_torture *rp1; - struct rcu_torture *old_rp; - static DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1); - rp = rcu_torture_alloc(); - if (rp == NULL) - continue; - rp->rtort_pipe_count = 0; - udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_dereference_check(rcu_torture_current, - current == writer_task); - rp->rtort_mbtest = 1; - rcu_assign_pointer(rcu_torture_current, rp); - smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ - if (old_rp) { - i = old_rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; - if (gp_normal == gp_exp) - exp = !!(rcu_random(&rand) & 0x80); - else - exp = gp_exp; - if (!exp) { - cur_ops->deferred_free(old_rp); - } else { - cur_ops->exp_sync(); - list_add(&old_rp->rtort_free, - &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, - &rcu_torture_removed, - rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= - RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } - } - } - rcutorture_record_progress(++rcu_torture_current_version); - rcu_stutter_wait("rcu_torture_writer"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - rcutorture_shutdown_absorb("rcu_torture_writer"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture fake writer kthread. Repeatedly calls sync, with a random - * delay between calls. 
- */ -static int -rcu_torture_fakewriter(void *arg) -{ - DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); - udelay(rcu_random(&rand) & 0x3ff); - if (cur_ops->cb_barrier != NULL && - rcu_random(&rand) % (nfakewriters * 8) == 0) { - cur_ops->cb_barrier(); - } else if (gp_normal == gp_exp) { - if (rcu_random(&rand) & 0x80) - cur_ops->sync(); - else - cur_ops->exp_sync(); - } else if (gp_normal) { - cur_ops->sync(); - } else { - cur_ops->exp_sync(); - } - rcu_stutter_wait("rcu_torture_fakewriter"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fakewriter"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - ftrace_dump(DUMP_ALL); -} - -/* - * RCU torture reader from timer handler. Dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. 
- */ -static void rcu_torture_timer(unsigned long unused) -{ - int idx; - int completed; - int completed_end; - static DEFINE_RCU_RANDOM(rand); - static DEFINE_SPINLOCK(rand_lock); - struct rcu_torture *p; - int pipe_count; - unsigned long long ts; - - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); - if (p == NULL) { - /* Leave because rcu_torture_writer is not yet underway */ - cur_ops->readunlock(idx); - return; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - spin_lock(&rand_lock); - cur_ops->read_delay(&rand); - n_rcu_torture_timers++; - spin_unlock(&rand_lock); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed_end = cur_ops->completed(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - completed, completed_end); - rcutorture_trace_dump(); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); -} - -/* - * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. 
- */ -static int -rcu_torture_reader(void *arg) -{ - int completed; - int completed_end; - int idx; - DEFINE_RCU_RANDOM(rand); - struct rcu_torture *p; - int pipe_count; - struct timer_list t; - unsigned long long ts; - - VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); - set_user_nice(current, 19); - if (irqreader && cur_ops->irq_capable) - setup_timer_on_stack(&t, rcu_torture_timer, 0); - - do { - if (irqreader && cur_ops->irq_capable) { - if (!timer_pending(&t)) - mod_timer(&t, jiffies + 1); - } - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); - if (p == NULL) { - /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); - schedule_timeout_interruptible(HZ); - continue; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(&rand); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed_end = cur_ops->completed(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, completed, completed_end); - rcutorture_trace_dump(); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... 
*/ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); - schedule(); - rcu_stutter_wait("rcu_torture_reader"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); - rcutorture_shutdown_absorb("rcu_torture_reader"); - if (irqreader && cur_ops->irq_capable) - del_timer_sync(&t); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * Create an RCU-torture statistics message in the specified buffer. - */ -static void -rcu_torture_printk(char *page) -{ - int cpu; - int i; - long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; - } - } - for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { - if (pipesummary[i] != 0) - break; - } - page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current_version, - list_empty(&rcu_torture_freelist), - atomic_read(&n_rcu_torture_alloc), - atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free)); - page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", - atomic_read(&n_rcu_torture_mberror), - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); - page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", - n_rcu_torture_boost_failure, - n_rcu_torture_boosts, - n_rcu_torture_timers); - page += sprintf(page, - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); - page += 
sprintf(page, "barrier: %ld/%ld:%ld", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - if (atomic_read(&n_rcu_torture_mberror) != 0 || - n_rcu_torture_barrier_error != 0 || - n_rcu_torture_boost_ktrerror != 0 || - n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_failure != 0 || - i > 1) { - page += sprintf(page, "!!! "); - atomic_inc(&n_rcu_torture_error); - WARN_ON_ONCE(1); - } - page += sprintf(page, "Reader Pipe: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", pipesummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Reader Batch: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", batchsummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Free-Block Circulation: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - page += sprintf(page, " %d", - atomic_read(&rcu_torture_wcount[i])); - } - page += sprintf(page, "\n"); - if (cur_ops->stats) - cur_ops->stats(page); -} - -/* - * Print torture statistics. Caller must ensure that there is only - * one call to this function at a given time!!! This is normally - * accomplished by relying on the module system to only have one copy - * of the module loaded, and then by giving the rcu_torture_stats - * kthread full control (or the init/cleanup functions when rcu_torture_stats - * thread is not running). - */ -static void -rcu_torture_stats_print(void) -{ - int size = nr_cpu_ids * 200 + 8192; - char *buf; - - buf = kmalloc(size, GFP_KERNEL); - if (!buf) { - pr_err("rcu-torture: Out of memory, need: %d", size); - return; - } - rcu_torture_printk(buf); - pr_alert("%s", buf); - kfree(buf); -} - -/* - * Periodically prints torture statistics, if periodic statistics printing - * was specified via the stat_interval module parameter. 
- * - * No need to worry about fullstop here, since this one doesn't reference - * volatile state or register callbacks. - */ -static int -rcu_torture_stats(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); - do { - schedule_timeout_interruptible(stat_interval * HZ); - rcu_torture_stats_print(); - rcutorture_shutdown_absorb("rcu_torture_stats"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); - return 0; -} - -static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ - -/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case - * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. - */ -static void rcu_torture_shuffle_tasks(void) -{ - int i; - - cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); - - /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } - - if (rcu_idle_cpu != -1) - cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); - - set_cpus_allowed_ptr(current, shuffle_tmp_mask); - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - if (reader_tasks[i]) - set_cpus_allowed_ptr(reader_tasks[i], - shuffle_tmp_mask); - } - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) - if (fakewriter_tasks[i]) - set_cpus_allowed_ptr(fakewriter_tasks[i], - shuffle_tmp_mask); - } - if (writer_task) - set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) - set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); - if (stutter_task) - set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); - if (fqs_task) - set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); - if (shutdown_task) - set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); -#ifdef CONFIG_HOTPLUG_CPU - if (onoff_task) - set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - if (stall_task) - set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); - if 
(barrier_cbs_tasks) - for (i = 0; i < n_barrier_cbs; i++) - if (barrier_cbs_tasks[i]) - set_cpus_allowed_ptr(barrier_cbs_tasks[i], - shuffle_tmp_mask); - if (barrier_task) - set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); - - if (rcu_idle_cpu == -1) - rcu_idle_cpu = num_online_cpus() - 1; - else - rcu_idle_cpu--; - - put_online_cpus(); -} - -/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the - * system to become idle at a time and cut off its timer ticks. This is meant - * to test the support for such tickless idle CPU in RCU. - */ -static int -rcu_torture_shuffle(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); - do { - schedule_timeout_interruptible(shuffle_interval * HZ); - rcu_torture_shuffle_tasks(); - rcutorture_shutdown_absorb("rcu_torture_shuffle"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); - return 0; -} - -/* Cause the rcutorture test to "stutter", starting and stopping all - * threads periodically. 
- */ -static int -rcu_torture_stutter(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); - do { - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 1; - if (!kthread_should_stop()) - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 0; - rcutorture_shutdown_absorb("rcu_torture_stutter"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); - return 0; -} - -static inline void -rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) -{ - pr_alert("%s" TORTURE_FLAG - "--- %s: nreaders=%d nfakewriters=%d " - "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d " - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " - "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " - "stall_cpu=%d stall_cpu_holdoff=%d " - "n_barrier_cbs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", - torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, - test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, - stall_cpu, stall_cpu_holdoff, - n_barrier_cbs, - onoff_interval, onoff_holdoff); -} - -static struct notifier_block rcutorture_shutdown_nb = { - .notifier_call = rcutorture_shutdown_notify, -}; - -static void rcutorture_booster_cleanup(int cpu) -{ - struct task_struct *t; - - if (boost_tasks[cpu] == NULL) - return; - mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); - t = boost_tasks[cpu]; - boost_tasks[cpu] = NULL; - mutex_unlock(&boost_mutex); - - /* This must be outside of the mutex, otherwise deadlock! 
*/ - kthread_stop(t); - boost_tasks[cpu] = NULL; -} - -static int rcutorture_booster_init(int cpu) -{ - int retval; - - if (boost_tasks[cpu] != NULL) - return 0; /* Already created, nothing more to do. */ - - /* Don't allow time recalculation while creating a new task. */ - mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, - cpu_to_node(cpu), - "rcu_torture_boost"); - if (IS_ERR(boost_tasks[cpu])) { - retval = PTR_ERR(boost_tasks[cpu]); - VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); - n_rcu_torture_boost_ktrerror++; - boost_tasks[cpu] = NULL; - mutex_unlock(&boost_mutex); - return retval; - } - kthread_bind(boost_tasks[cpu], cpu); - wake_up_process(boost_tasks[cpu]); - mutex_unlock(&boost_mutex); - return 0; -} - -/* - * Cause the rcutorture test to shutdown the system after the test has - * run for the time specified by the shutdown_secs module parameter. - */ -static int -rcu_torture_shutdown(void *arg) -{ - long delta; - unsigned long jiffies_snap; - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && - !kthread_should_stop()) { - delta = shutdown_time - jiffies_snap; - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); - } - if (kthread_should_stop()) { - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); - return 0; - } - - /* OK, shut down the system. */ - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); - shutdown_task = NULL; /* Avoid self-kill deadlock. */ - rcu_torture_cleanup(); /* Get the success/failure message. */ - kernel_power_off(); /* Shut down the system. 
*/ - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Execute random CPU-hotplug operations at the interval specified - * by the onoff_interval. - */ -static int -rcu_torture_onoff(void *arg) -{ - int cpu; - unsigned long delta; - int maxcpu = -1; - DEFINE_RCU_RANDOM(rand); - int ret; - unsigned long starttime; - - VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); - for_each_online_cpu(cpu) - maxcpu = cpu; - WARN_ON(maxcpu < 0); - if (onoff_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); - schedule_timeout_interruptible(onoff_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); - } - while (!kthread_should_stop()) { - cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = 
jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } - schedule_timeout_interruptible(onoff_interval * HZ); - } - VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); - return 0; -} - -static int -rcu_torture_onoff_init(void) -{ - int ret; - - if (onoff_interval <= 0) - return 0; - onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); - if (IS_ERR(onoff_task)) { - ret = PTR_ERR(onoff_task); - onoff_task = NULL; - return ret; - } - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ - if (onoff_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); - kthread_stop(onoff_task); - onoff_task = NULL; -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static int -rcu_torture_onoff_init(void) -{ - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - -/* - * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then - * induces a CPU stall for the time specified by stall_cpu. - */ -static int rcu_torture_stall(void *args) -{ - unsigned long stop_at; - - VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); - if (stall_cpu_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); - schedule_timeout_interruptible(stall_cpu_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); - } - if (!kthread_should_stop()) { - stop_at = get_seconds() + stall_cpu; - /* RCU CPU stall is expected behavior in following code. */ - pr_alert("rcu_torture_stall start.\n"); - rcu_read_lock(); - preempt_disable(); - while (ULONG_CMP_LT(get_seconds(), stop_at)) - continue; /* Induce RCU CPU stall warning. 
*/ - preempt_enable(); - rcu_read_unlock(); - pr_alert("rcu_torture_stall end.\n"); - } - rcutorture_shutdown_absorb("rcu_torture_stall"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(10 * HZ); - return 0; -} - -/* Spawn CPU-stall kthread, if stall_cpu specified. */ -static int __init rcu_torture_stall_init(void) -{ - int ret; - - if (stall_cpu <= 0) - return 0; - stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); - if (IS_ERR(stall_task)) { - ret = PTR_ERR(stall_task); - stall_task = NULL; - return ret; - } - return 0; -} - -/* Clean up after the CPU-stall kthread, if one was spawned. */ -static void rcu_torture_stall_cleanup(void) -{ - if (stall_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); - kthread_stop(stall_task); - stall_task = NULL; -} - -/* Callback function for RCU barrier testing. */ -void rcu_torture_barrier_cbf(struct rcu_head *rcu) -{ - atomic_inc(&barrier_cbs_invoked); -} - -/* kthread function to register callbacks used to test RCU barriers. */ -static int rcu_torture_barrier_cbs(void *arg) -{ - long myid = (long)arg; - bool lastphase = 0; - bool newphase; - struct rcu_head rcu; - - init_rcu_head_on_stack(&rcu); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); - set_user_nice(current, 19); - do { - wait_event(barrier_cbs_wq[myid], - (newphase = - ACCESS_ONCE(barrier_phase)) != lastphase || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - lastphase = newphase; - smp_mb(); /* ensure barrier_phase load before ->call(). 
*/ - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) - break; - cur_ops->call(&rcu, rcu_torture_barrier_cbf); - if (atomic_dec_and_test(&barrier_cbs_count)) - wake_up(&barrier_wq); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); - cur_ops->cb_barrier(); - destroy_rcu_head_on_stack(&rcu); - return 0; -} - -/* kthread function to drive and coordinate RCU barrier testing. */ -static int rcu_torture_barrier(void *arg) -{ - int i; - - VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); - do { - atomic_set(&barrier_cbs_invoked, 0); - atomic_set(&barrier_cbs_count, n_barrier_cbs); - smp_mb(); /* Ensure barrier_phase after prior assignments. */ - barrier_phase = !barrier_phase; - for (i = 0; i < n_barrier_cbs; i++) - wake_up(&barrier_cbs_wq[i]); - wait_event(barrier_wq, - atomic_read(&barrier_cbs_count) == 0 || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) - break; - n_barrier_attempts++; - cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ - if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { - n_rcu_torture_barrier_error++; - WARN_ON_ONCE(1); - } - n_barrier_successes++; - schedule_timeout_interruptible(HZ / 10); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); - return 0; -} - -/* Initialize RCU barrier testing. 
*/ -static int rcu_torture_barrier_init(void) -{ - int i; - int ret; - - if (n_barrier_cbs == 0) - return 0; - if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { - pr_alert("%s" TORTURE_FLAG - " Call or barrier ops missing for %s,\n", - torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " RCU barrier testing omitted from run.\n", - torture_type); - return 0; - } - atomic_set(&barrier_cbs_count, 0); - atomic_set(&barrier_cbs_invoked, 0); - barrier_cbs_tasks = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), - GFP_KERNEL); - barrier_cbs_wq = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), - GFP_KERNEL); - if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) - return -ENOMEM; - for (i = 0; i < n_barrier_cbs; i++) { - init_waitqueue_head(&barrier_cbs_wq[i]); - barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, - (void *)(long)i, - "rcu_torture_barrier_cbs"); - if (IS_ERR(barrier_cbs_tasks[i])) { - ret = PTR_ERR(barrier_cbs_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); - barrier_cbs_tasks[i] = NULL; - return ret; - } - } - barrier_task = kthread_run(rcu_torture_barrier, NULL, - "rcu_torture_barrier"); - if (IS_ERR(barrier_task)) { - ret = PTR_ERR(barrier_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); - barrier_task = NULL; - } - return 0; -} - -/* Clean up after RCU barrier testing. 
*/ -static void rcu_torture_barrier_cleanup(void) -{ - int i; - - if (barrier_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); - kthread_stop(barrier_task); - barrier_task = NULL; - } - if (barrier_cbs_tasks != NULL) { - for (i = 0; i < n_barrier_cbs; i++) { - if (barrier_cbs_tasks[i] != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); - kthread_stop(barrier_cbs_tasks[i]); - barrier_cbs_tasks[i] = NULL; - } - } - kfree(barrier_cbs_tasks); - barrier_cbs_tasks = NULL; - } - if (barrier_cbs_wq != NULL) { - kfree(barrier_cbs_wq); - barrier_cbs_wq = NULL; - } -} - -static int rcutorture_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - (void)rcutorture_booster_init(cpu); - break; - case CPU_DOWN_PREPARE: - rcutorture_booster_cleanup(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block rcutorture_cpu_nb = { - .notifier_call = rcutorture_cpu_notify, -}; - -static void -rcu_torture_cleanup(void) -{ - int i; - - mutex_lock(&fullstop_mutex); - rcutorture_record_test_transition(); - if (fullstop == FULLSTOP_SHUTDOWN) { - pr_warn(/* but going down anyway, so... 
*/ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - schedule_timeout_uninterruptible(10); - if (cur_ops->cb_barrier != NULL) - cur_ops->cb_barrier(); - return; - } - fullstop = FULLSTOP_RMMOD; - mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&rcutorture_shutdown_nb); - rcu_torture_barrier_cleanup(); - rcu_torture_stall_cleanup(); - if (stutter_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); - kthread_stop(stutter_task); - } - stutter_task = NULL; - if (shuffler_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); - kthread_stop(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - } - shuffler_task = NULL; - - if (writer_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); - kthread_stop(writer_task); - } - writer_task = NULL; - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_reader task"); - kthread_stop(reader_tasks[i]); - } - reader_tasks[i] = NULL; - } - kfree(reader_tasks); - reader_tasks = NULL; - } - rcu_torture_current = NULL; - - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_fakewriter task"); - kthread_stop(fakewriter_tasks[i]); - } - fakewriter_tasks[i] = NULL; - } - kfree(fakewriter_tasks); - fakewriter_tasks = NULL; - } - - if (stats_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); - kthread_stop(stats_task); - } - stats_task = NULL; - - if (fqs_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); - kthread_stop(fqs_task); - } - fqs_task = NULL; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - unregister_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) - rcutorture_booster_cleanup(i); - } - if (shutdown_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); - 
kthread_stop(shutdown_task); - } - shutdown_task = NULL; - rcu_torture_onoff_cleanup(); - - /* Wait for all RCU callbacks to fire. */ - - if (cur_ops->cb_barrier != NULL) - cur_ops->cb_barrier(); - - rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - - if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) - rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); - else if (n_online_successes != n_online_attempts || - n_offline_successes != n_offline_attempts) - rcu_torture_print_module_parms(cur_ops, - "End of test: RCU_HOTPLUG"); - else - rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); -} - -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD -static void rcu_torture_leak_cb(struct rcu_head *rhp) -{ -} - -static void rcu_torture_err_cb(struct rcu_head *rhp) -{ - /* - * This -might- happen due to race conditions, but is unlikely. - * The scenario that leads to this happening is that the - * first of the pair of duplicate callbacks is queued, - * someone else starts a grace period that includes that - * callback, then the second of the pair must wait for the - * next grace period. Unlikely, but can happen. If it - * does happen, the debug-objects subsystem won't have splatted. - */ - pr_alert("rcutorture: duplicated callback was invoked.\n"); -} -#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - -/* - * Verify that double-free causes debug-objects to complain, but only - * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test - * cannot be carried out. - */ -static void rcu_test_debug_objects(void) -{ -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - struct rcu_head rh1; - struct rcu_head rh2; - - init_rcu_head_on_stack(&rh1); - init_rcu_head_on_stack(&rh2); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); - - /* Try to queue the rh2 pair of callbacks for the same grace period. */ - preempt_disable(); /* Prevent preemption from interrupting test. 
*/ - rcu_read_lock(); /* Make it impossible to finish a grace period. */ - call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ - local_irq_disable(); /* Make it harder to start a new grace period. */ - call_rcu(&rh2, rcu_torture_leak_cb); - call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ - local_irq_enable(); - rcu_read_unlock(); - preempt_enable(); - - /* Wait for them all to get done so we can safely return. */ - rcu_barrier(); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); - destroy_rcu_head_on_stack(&rh1); - destroy_rcu_head_on_stack(&rh2); -#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); -#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -} - -static int __init -rcu_torture_init(void) -{ - int i; - int cpu; - int firsterr = 0; - int retval; - static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, - }; - - mutex_lock(&fullstop_mutex); - - /* Process args and tell the world that the torturer is on the job. */ - for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { - cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) - break; - } - if (i == ARRAY_SIZE(torture_ops)) { - pr_alert("rcu-torture: invalid torture type: \"%s\"\n", - torture_type); - pr_alert("rcu-torture types:"); - for (i = 0; i < ARRAY_SIZE(torture_ops); i++) - pr_alert(" %s", torture_ops[i]->name); - pr_alert("\n"); - mutex_unlock(&fullstop_mutex); - return -EINVAL; - } - if (cur_ops->fqs == NULL && fqs_duration != 0) { - pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); - fqs_duration = 0; - } - if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! 
*/ - - if (nreaders >= 0) - nrealreaders = nreaders; - else - nrealreaders = 2 * num_online_cpus(); - rcu_torture_print_module_parms(cur_ops, "Start of test"); - fullstop = FULLSTOP_DONTSTOP; - - /* Set up the freelist. */ - - INIT_LIST_HEAD(&rcu_torture_freelist); - for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { - rcu_tortures[i].rtort_mbtest = 0; - list_add_tail(&rcu_tortures[i].rtort_free, - &rcu_torture_freelist); - } - - /* Initialize the statistics so that each run gets its own numbers. */ - - rcu_torture_current = NULL; - rcu_torture_current_version = 0; - atomic_set(&n_rcu_torture_alloc, 0); - atomic_set(&n_rcu_torture_alloc_fail, 0); - atomic_set(&n_rcu_torture_free, 0); - atomic_set(&n_rcu_torture_mberror, 0); - atomic_set(&n_rcu_torture_error, 0); - n_rcu_torture_barrier_error = 0; - n_rcu_torture_boost_ktrerror = 0; - n_rcu_torture_boost_rterror = 0; - n_rcu_torture_boost_failure = 0; - n_rcu_torture_boosts = 0; - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - atomic_set(&rcu_torture_wcount[i], 0); - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - per_cpu(rcu_torture_count, cpu)[i] = 0; - per_cpu(rcu_torture_batch, cpu)[i] = 0; - } - } - - /* Start up the kthreads. 
*/ - - VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_create(rcu_torture_writer, NULL, - "rcu_torture_writer"); - if (IS_ERR(writer_task)) { - firsterr = PTR_ERR(writer_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); - writer_task = NULL; - goto unwind; - } - wake_up_process(writer_task); - fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); - if (fakewriter_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nfakewriters; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); - if (IS_ERR(fakewriter_tasks[i])) { - firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); - fakewriter_tasks[i] = NULL; - goto unwind; - } - } - reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), - GFP_KERNEL); - if (reader_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealreaders; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); - reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, - "rcu_torture_reader"); - if (IS_ERR(reader_tasks[i])) { - firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); - reader_tasks[i] = NULL; - goto unwind; - } - } - if (stat_interval > 0) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); - stats_task = kthread_run(rcu_torture_stats, NULL, - "rcu_torture_stats"); - if (IS_ERR(stats_task)) { - firsterr = PTR_ERR(stats_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); - stats_task = NULL; - goto unwind; - } - } - if (test_no_idle_hz) { - rcu_idle_cpu = num_online_cpus() - 1; - - if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { - firsterr = -ENOMEM; - VERBOSE_PRINTK_ERRSTRING("Failed 
to alloc mask"); - goto unwind; - } - - /* Create the shuffler thread */ - shuffler_task = kthread_run(rcu_torture_shuffle, NULL, - "rcu_torture_shuffle"); - if (IS_ERR(shuffler_task)) { - free_cpumask_var(shuffle_tmp_mask); - firsterr = PTR_ERR(shuffler_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - goto unwind; - } - } - if (stutter < 0) - stutter = 0; - if (stutter) { - /* Create the stutter thread */ - stutter_task = kthread_run(rcu_torture_stutter, NULL, - "rcu_torture_stutter"); - if (IS_ERR(stutter_task)) { - firsterr = PTR_ERR(stutter_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; - goto unwind; - } - } - if (fqs_duration < 0) - fqs_duration = 0; - if (fqs_duration) { - /* Create the stutter thread */ - fqs_task = kthread_run(rcu_torture_fqs, NULL, - "rcu_torture_fqs"); - if (IS_ERR(fqs_task)) { - firsterr = PTR_ERR(fqs_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); - fqs_task = NULL; - goto unwind; - } - } - if (test_boost_interval < 1) - test_boost_interval = 1; - if (test_boost_duration < 2) - test_boost_duration = 2; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - - boost_starttime = jiffies + test_boost_interval * HZ; - register_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) { - if (cpu_is_offline(i)) - continue; /* Heuristic: CPU can go offline. 
*/ - retval = rcutorture_booster_init(i); - if (retval < 0) { - firsterr = retval; - goto unwind; - } - } - } - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_create(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); - if (IS_ERR(shutdown_task)) { - firsterr = PTR_ERR(shutdown_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - goto unwind; - } - wake_up_process(shutdown_task); - } - i = rcu_torture_onoff_init(); - if (i != 0) { - firsterr = i; - goto unwind; - } - register_reboot_notifier(&rcutorture_shutdown_nb); - i = rcu_torture_stall_init(); - if (i != 0) { - firsterr = i; - goto unwind; - } - retval = rcu_torture_barrier_init(); - if (retval != 0) { - firsterr = retval; - goto unwind; - } - if (object_debug) - rcu_test_debug_objects(); - rcutorture_record_test_transition(); - mutex_unlock(&fullstop_mutex); - return 0; - -unwind: - mutex_unlock(&fullstop_mutex); - rcu_torture_cleanup(); - return firsterr; -} - -module_init(rcu_torture_init); -module_exit(rcu_torture_cleanup); diff --git a/kernel/torture.c b/kernel/torture.c new file mode 100644 index 000000000000..c82c70f7828e --- /dev/null +++ b/kernel/torture.c @@ -0,0 +1,71 @@ +/* + * Common functions for in-kernel torture tests. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + * Copyright (C) IBM Corporation, 2014 + * + * Author: Paul E. McKenney + * Based on kernel/rcu/torture.c. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +#define TORTURE_RANDOM_MULT 39916801 /* prime */ +#define TORTURE_RANDOM_ADD 479001701 /* prime */ +#define TORTURE_RANDOM_REFRESH 10000 + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from cpu_clock(). + */ +unsigned long +torture_random(struct torture_random_state *trsp) +{ + if (--trsp->trs_count < 0) { + trsp->trs_state += (unsigned long)local_clock(); + trsp->trs_count = TORTURE_RANDOM_REFRESH; + } + trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT + + TORTURE_RANDOM_ADD; + return swahw32(trsp->trs_state); +} +EXPORT_SYMBOL_GPL(torture_random); -- cgit v1.2.3 From 9e2502254132261e0ea8010692fd447b1cedf627 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Jan 2014 16:27:00 -0800 Subject: rcutorture: Abstract torture_param() Create a torture_param() macro and apply it to rcutorture in order to save a few lines of code. This same macro may be applied to other torture frameworks. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 103 ++++++++++++++++-------------------------------- 1 file changed, 34 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 94b1cd8b214c..930791e0698d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -59,78 +59,43 @@ MODULE_ALIAS("rcutorture"); #endif #define MODULE_PARAM_PREFIX "rcutorture." 
-static int fqs_duration; -module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); -static int fqs_holdoff; -module_param(fqs_holdoff, int, 0444); -MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); -static int fqs_stutter = 3; -module_param(fqs_stutter, int, 0444); -MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); -static bool gp_exp; -module_param(gp_exp, bool, 0444); -MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); -static bool gp_normal; -module_param(gp_normal, bool, 0444); -MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); -static int irqreader = 1; -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); -static int n_barrier_cbs; -module_param(n_barrier_cbs, int, 0444); -MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); -static int nfakewriters = 4; -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -static int nreaders = -1; -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -static int object_debug; -module_param(object_debug, int, 0444); -MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); -static int onoff_holdoff; -module_param(onoff_holdoff, int, 0444); -MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); -static int onoff_interval; -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); -static int shuffle_interval = 3; -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -static int shutdown_secs; -module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); -static int stall_cpu; 
-module_param(stall_cpu, int, 0444); -MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); -static int stall_cpu_holdoff = 10; -module_param(stall_cpu_holdoff, int, 0444); -MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); -static int stat_interval = 60; -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -static int stutter = 5; -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -static int test_boost = 1; -module_param(test_boost, int, 0444); -MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -static int test_boost_duration = 4; -module_param(test_boost_duration, int, 0444); -MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); -static int test_boost_interval = 7; -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); -static bool test_no_idle_hz = true; -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +torture_param(int, fqs_duration, 0, + "Duration of fqs bursts (us), 0 to disable"); +torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); +torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); +torture_param(bool, gp_normal, false, + "Use normal (non-expedited) GP wait primitives"); +torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); +torture_param(int, n_barrier_cbs, 0, + "# of callbacks/kthreads for barrier testing"); +torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, object_debug, 0, + "Enable debug-object double call_rcu() testing"); +torture_param(int, 
onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, + "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); +torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); +torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); +torture_param(int, stall_cpu_holdoff, 10, + "Time to wait before starting stall (s)."); +torture_param(int, stat_interval, 60, + "Number of seconds between stats printk()s"); +torture_param(int, stutter, 5, "Number of seconds to run/halt test"); +torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +torture_param(int, test_boost_duration, 4, + "Duration of each boost test, seconds."); +torture_param(int, test_boost_interval, 7, + "Interval between boost tests, seconds."); +torture_param(bool, test_no_idle_hz, true, + "Test support for tickless idle CPUs"); +torture_param(bool, verbose, false, "Enable verbose debugging printk()s"); + static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -static bool verbose; -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); #define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ -- cgit v1.2.3 From 5ccf60f23d33afd53568cff4f3f421f2ca624401 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jan 2014 07:25:25 -0800 Subject: rcutorture: Rename PRINTK to TOROUT Since it doesn't do printk()s anymore anyway, this commit renames these macros from PRINTK to TOROUT (short for torture output). Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 134 ++++++++++++++++++++++++------------------------ 1 file changed, 67 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 930791e0698d..34a75b1170e8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -98,11 +98,11 @@ module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); #define TORTURE_FLAG "-torture:" -#define PRINTK_STRING(s) \ +#define TOROUT_STRING(s) \ do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_STRING(s) \ +#define VERBOSE_TOROUT_STRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_ERRSTRING(s) \ +#define VERBOSE_TOROUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) static int nrealreaders; @@ -619,12 +619,12 @@ static int rcu_torture_boost(void *arg) struct rcu_boost_inflight rbi = { .inflight = 0 }; struct sched_param sp; - VERBOSE_PRINTK_STRING("rcu_torture_boost started"); + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); /* Set real-time priority. 
*/ sp.sched_priority = 1; if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); + VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!"); n_rcu_torture_boost_rterror++; } @@ -652,7 +652,7 @@ static int rcu_torture_boost(void *arg) call_rcu(&rbi.rcu, rcu_torture_boost_cb); if (jiffies - call_rcu_time > test_boost_duration * HZ - HZ / 2) { - VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); + VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); n_rcu_torture_boost_failure++; } call_rcu_time = jiffies; @@ -688,7 +688,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); /* Clean up and exit. */ - VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_boost task stopping"); rcutorture_shutdown_absorb("rcu_torture_boost"); while (!kthread_should_stop() || rbi.inflight) schedule_timeout_uninterruptible(1); @@ -708,7 +708,7 @@ rcu_torture_fqs(void *arg) unsigned long fqs_resume_time; int fqs_burst_remaining; - VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; while (ULONG_CMP_LT(jiffies, fqs_resume_time) && @@ -724,7 +724,7 @@ rcu_torture_fqs(void *arg) } rcu_stutter_wait("rcu_torture_fqs"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); rcutorture_shutdown_absorb("rcu_torture_fqs"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); @@ -746,7 +746,7 @@ rcu_torture_writer(void *arg) struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); - VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); + VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); set_user_nice(current, 19); do { @@ -796,7 +796,7 @@ 
rcu_torture_writer(void *arg) rcutorture_record_progress(++rcu_torture_current_version); rcu_stutter_wait("rcu_torture_writer"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); rcutorture_shutdown_absorb("rcu_torture_writer"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); @@ -812,7 +812,7 @@ rcu_torture_fakewriter(void *arg) { DEFINE_TORTURE_RANDOM(rand); - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); + VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); set_user_nice(current, 19); do { @@ -834,7 +834,7 @@ rcu_torture_fakewriter(void *arg) rcu_stutter_wait("rcu_torture_fakewriter"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); rcutorture_shutdown_absorb("rcu_torture_fakewriter"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); @@ -928,7 +928,7 @@ rcu_torture_reader(void *arg) struct timer_list t; unsigned long long ts; - VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); + VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); if (irqreader && cur_ops->irq_capable) setup_timer_on_stack(&t, rcu_torture_timer, 0); @@ -978,7 +978,7 @@ rcu_torture_reader(void *arg) schedule(); rcu_stutter_wait("rcu_torture_reader"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); rcutorture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irq_capable) del_timer_sync(&t); @@ -1099,13 +1099,13 @@ rcu_torture_stats_print(void) static int rcu_torture_stats(void *arg) { - VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + 
VERBOSE_TOROUT_STRING("rcu_torture_stats task started"); do { schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); rcutorture_shutdown_absorb("rcu_torture_stats"); } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_stats task stopping"); return 0; } @@ -1183,13 +1183,13 @@ static void rcu_torture_shuffle_tasks(void) static int rcu_torture_shuffle(void *arg) { - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); + VERBOSE_TOROUT_STRING("rcu_torture_shuffle task started"); do { schedule_timeout_interruptible(shuffle_interval * HZ); rcu_torture_shuffle_tasks(); rcutorture_shutdown_absorb("rcu_torture_shuffle"); } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_shuffle task stopping"); return 0; } @@ -1199,7 +1199,7 @@ rcu_torture_shuffle(void *arg) static int rcu_torture_stutter(void *arg) { - VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); + VERBOSE_TOROUT_STRING("rcu_torture_stutter task started"); do { schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 1; @@ -1208,7 +1208,7 @@ rcu_torture_stutter(void *arg) stutter_pause_test = 0; rcutorture_shutdown_absorb("rcu_torture_stutter"); } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_stutter task stopping"); return 0; } @@ -1246,7 +1246,7 @@ static void rcutorture_booster_cleanup(int cpu) if (boost_tasks[cpu] == NULL) return; mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_boost task"); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; mutex_unlock(&boost_mutex); @@ -1265,13 +1265,13 @@ static int rcutorture_booster_init(int cpu) /* Don't allow time recalculation while creating a new task. 
*/ mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); + VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, cpu_to_node(cpu), "rcu_torture_boost"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); - VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); + VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed"); n_rcu_torture_boost_ktrerror++; boost_tasks[cpu] = NULL; mutex_unlock(&boost_mutex); @@ -1293,7 +1293,7 @@ rcu_torture_shutdown(void *arg) long delta; unsigned long jiffies_snap; - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); + VERBOSE_TOROUT_STRING("rcu_torture_shutdown task started"); jiffies_snap = ACCESS_ONCE(jiffies); while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && !kthread_should_stop()) { @@ -1306,13 +1306,13 @@ rcu_torture_shutdown(void *arg) jiffies_snap = ACCESS_ONCE(jiffies); } if (kthread_should_stop()) { - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_shutdown task stopping"); return 0; } /* OK, shut down the system. */ - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); + VERBOSE_TOROUT_STRING("rcu_torture_shutdown task shutting down system"); shutdown_task = NULL; /* Avoid self-kill deadlock. */ rcu_torture_cleanup(); /* Get the success/failure message. */ kernel_power_off(); /* Shut down the system. 
*/ @@ -1335,14 +1335,14 @@ rcu_torture_onoff(void *arg) int ret; unsigned long starttime; - VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); + VERBOSE_TOROUT_STRING("rcu_torture_onoff task started"); for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); if (onoff_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); + VERBOSE_TOROUT_STRING("rcu_torture_onoff begin holdoff"); schedule_timeout_interruptible(onoff_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); + VERBOSE_TOROUT_STRING("rcu_torture_onoff end holdoff"); } while (!kthread_should_stop()) { cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); @@ -1409,7 +1409,7 @@ rcu_torture_onoff(void *arg) } schedule_timeout_interruptible(onoff_interval * HZ); } - VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_onoff task stopping"); return 0; } @@ -1433,7 +1433,7 @@ static void rcu_torture_onoff_cleanup(void) { if (onoff_task == NULL) return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_onoff task"); kthread_stop(onoff_task); onoff_task = NULL; } @@ -1460,11 +1460,11 @@ static int rcu_torture_stall(void *args) { unsigned long stop_at; - VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); + VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); if (stall_cpu_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); + VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); + VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } if (!kthread_should_stop()) { stop_at = get_seconds() + stall_cpu; @@ -1505,7 +1505,7 @@ static void rcu_torture_stall_cleanup(void) { if (stall_task == NULL) return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_stall_task."); 
kthread_stop(stall_task); stall_task = NULL; } @@ -1525,7 +1525,7 @@ static int rcu_torture_barrier_cbs(void *arg) struct rcu_head rcu; init_rcu_head_on_stack(&rcu); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); + VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started"); set_user_nice(current, 19); do { wait_event(barrier_cbs_wq[myid], @@ -1541,7 +1541,7 @@ static int rcu_torture_barrier_cbs(void *arg) if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task stopping"); rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); @@ -1555,7 +1555,7 @@ static int rcu_torture_barrier(void *arg) { int i; - VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); + VERBOSE_TOROUT_STRING("rcu_torture_barrier task starting"); do { atomic_set(&barrier_cbs_invoked, 0); atomic_set(&barrier_cbs_count, n_barrier_cbs); @@ -1578,7 +1578,7 @@ static int rcu_torture_barrier(void *arg) n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_barrier task stopping"); rcutorture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); @@ -1619,7 +1619,7 @@ static int rcu_torture_barrier_init(void) "rcu_torture_barrier_cbs"); if (IS_ERR(barrier_cbs_tasks[i])) { ret = PTR_ERR(barrier_cbs_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); + VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); barrier_cbs_tasks[i] = NULL; return ret; } @@ -1628,7 +1628,7 @@ static int rcu_torture_barrier_init(void) "rcu_torture_barrier"); if 
(IS_ERR(barrier_task)) { ret = PTR_ERR(barrier_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); + VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier"); barrier_task = NULL; } return 0; @@ -1640,14 +1640,14 @@ static void rcu_torture_barrier_cleanup(void) int i; if (barrier_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_barrier task"); kthread_stop(barrier_task); barrier_task = NULL; } if (barrier_cbs_tasks != NULL) { for (i = 0; i < n_barrier_cbs; i++) { if (barrier_cbs_tasks[i] != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_barrier_cbs task"); kthread_stop(barrier_cbs_tasks[i]); barrier_cbs_tasks[i] = NULL; } @@ -1706,19 +1706,19 @@ rcu_torture_cleanup(void) rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); if (stutter_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_stutter task"); kthread_stop(stutter_task); } stutter_task = NULL; if (shuffler_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_shuffle task"); kthread_stop(shuffler_task); free_cpumask_var(shuffle_tmp_mask); } shuffler_task = NULL; if (writer_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_writer task"); kthread_stop(writer_task); } writer_task = NULL; @@ -1726,7 +1726,7 @@ rcu_torture_cleanup(void) if (reader_tasks) { for (i = 0; i < nrealreaders; i++) { if (reader_tasks[i]) { - VERBOSE_PRINTK_STRING( + VERBOSE_TOROUT_STRING( "Stopping rcu_torture_reader task"); kthread_stop(reader_tasks[i]); } @@ -1740,7 +1740,7 @@ rcu_torture_cleanup(void) if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { if (fakewriter_tasks[i]) { - VERBOSE_PRINTK_STRING( + VERBOSE_TOROUT_STRING( "Stopping 
rcu_torture_fakewriter task"); kthread_stop(fakewriter_tasks[i]); } @@ -1751,13 +1751,13 @@ rcu_torture_cleanup(void) } if (stats_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_stats task"); kthread_stop(stats_task); } stats_task = NULL; if (fqs_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_fqs task"); kthread_stop(fqs_task); } fqs_task = NULL; @@ -1768,7 +1768,7 @@ rcu_torture_cleanup(void) rcutorture_booster_cleanup(i); } if (shutdown_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_shutdown task"); kthread_stop(shutdown_task); } shutdown_task = NULL; @@ -1924,12 +1924,12 @@ rcu_torture_init(void) /* Start up the kthreads. */ - VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); + VERBOSE_TOROUT_STRING("Creating rcu_torture_writer task"); writer_task = kthread_create(rcu_torture_writer, NULL, "rcu_torture_writer"); if (IS_ERR(writer_task)) { firsterr = PTR_ERR(writer_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); + VERBOSE_TOROUT_ERRSTRING("Failed to create writer"); writer_task = NULL; goto unwind; } @@ -1937,17 +1937,17 @@ rcu_torture_init(void) fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); + VERBOSE_TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nfakewriters; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); + VERBOSE_TOROUT_STRING("Creating rcu_torture_fakewriter task"); fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, "rcu_torture_fakewriter"); if (IS_ERR(fakewriter_tasks[i])) { firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); + VERBOSE_TOROUT_ERRSTRING("Failed to create fakewriter"); 
fakewriter_tasks[i] = NULL; goto unwind; } @@ -1955,28 +1955,28 @@ rcu_torture_init(void) reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); + VERBOSE_TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nrealreaders; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); + VERBOSE_TOROUT_STRING("Creating rcu_torture_reader task"); reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, "rcu_torture_reader"); if (IS_ERR(reader_tasks[i])) { firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); + VERBOSE_TOROUT_ERRSTRING("Failed to create reader"); reader_tasks[i] = NULL; goto unwind; } } if (stat_interval > 0) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); + VERBOSE_TOROUT_STRING("Creating rcu_torture_stats task"); stats_task = kthread_run(rcu_torture_stats, NULL, "rcu_torture_stats"); if (IS_ERR(stats_task)) { firsterr = PTR_ERR(stats_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); + VERBOSE_TOROUT_ERRSTRING("Failed to create stats"); stats_task = NULL; goto unwind; } @@ -1986,7 +1986,7 @@ rcu_torture_init(void) if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { firsterr = -ENOMEM; - VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); + VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask"); goto unwind; } @@ -1996,7 +1996,7 @@ rcu_torture_init(void) if (IS_ERR(shuffler_task)) { free_cpumask_var(shuffle_tmp_mask); firsterr = PTR_ERR(shuffler_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); + VERBOSE_TOROUT_ERRSTRING("Failed to create shuffler"); shuffler_task = NULL; goto unwind; } @@ -2009,7 +2009,7 @@ rcu_torture_init(void) "rcu_torture_stutter"); if (IS_ERR(stutter_task)) { firsterr = PTR_ERR(stutter_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); + VERBOSE_TOROUT_ERRSTRING("Failed to create stutter"); stutter_task = NULL; goto 
unwind; } @@ -2022,7 +2022,7 @@ rcu_torture_init(void) "rcu_torture_fqs"); if (IS_ERR(fqs_task)) { firsterr = PTR_ERR(fqs_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); + VERBOSE_TOROUT_ERRSTRING("Failed to create fqs"); fqs_task = NULL; goto unwind; } @@ -2052,7 +2052,7 @@ rcu_torture_init(void) "rcu_torture_shutdown"); if (IS_ERR(shutdown_task)) { firsterr = PTR_ERR(shutdown_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); + VERBOSE_TOROUT_ERRSTRING("Failed to create shutdown"); shutdown_task = NULL; goto unwind; } -- cgit v1.2.3 From c2884de38e01134ae040d55aa5644049d1bb850f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jan 2014 07:30:50 -0800 Subject: rcutorture: Abstract TOROUT_STRING() and friends These diagnostic macros are not confined to torturing RCU, so this commit makes them available to other torture tests. Also removed the do-while from TOROUT_STRING() in response to checkpatch complaints. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 34a75b1170e8..86fd8c11257b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -97,14 +97,6 @@ static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -#define TORTURE_FLAG "-torture:" -#define TOROUT_STRING(s) \ - do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_TOROUT_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_TOROUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! 
" s "\n", torture_type); } while (0) - static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; -- cgit v1.2.3 From f67a33561e6e5463b548219df98130da95f2e4a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jan 2014 07:40:27 -0800 Subject: rcutorture: Abstract torture_shutdown_absorb() Because handling races between rmmod and normal shutdown is not specific to rcutorture, this commit renames rcutorture_shutdown_absorb() to torture_shutdown_absorb() and pulls it out into the kernel/torture.c module. This implies pulling the fullstop mechanism into kernel/torture.c as well. The exporting of fullstop and fullstop_mutex is ugly and must die. And it does in fact die in later commits that introduce higher-level APIs that encapsulate both of these variables. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 57 ++++++++++++++++--------------------------------- kernel/torture.c | 20 +++++++++++++++++ 2 files changed, 38 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 86fd8c11257b..a868758a6f9c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -91,11 +91,15 @@ torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); -torture_param(bool, verbose, false, "Enable verbose debugging printk()s"); -static char *torture_type = "rcu"; +char *torture_type = "rcu"; +EXPORT_SYMBOL_GPL(torture_type); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); +bool verbose; +EXPORT_SYMBOL_GPL(verbose); +module_param(verbose, bool, 0444); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); static int nrealreaders; static struct task_struct *writer_task; @@ -200,17 +204,6 @@ static atomic_t 
barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); -/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ - -#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ -#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ -#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ -static int fullstop = FULLSTOP_RMMOD; -/* - * Protect fullstop transitions and spawning of kthreads. - */ -static DEFINE_MUTEX(fullstop_mutex); - /* Forward reference. */ static void rcu_torture_cleanup(void); @@ -231,20 +224,6 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, return NOTIFY_DONE; } -/* - * Absorb kthreads into a kernel function that won't return, so that - * they won't ever access module text or data again. - */ -static void rcutorture_shutdown_absorb(const char *title) -{ - if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - pr_notice( - "rcutorture thread %s parking due to system shutdown\n", - title); - schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); - } -} - /* * Allocate an element from the rcu_tortures pool. */ @@ -286,7 +265,7 @@ rcu_stutter_wait(const char *title) schedule_timeout_interruptible(1); else schedule_timeout_interruptible(round_jiffies_relative(HZ)); - rcutorture_shutdown_absorb(title); + torture_shutdown_absorb(title); } } @@ -681,7 +660,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); /* Clean up and exit. */ VERBOSE_TOROUT_STRING("rcu_torture_boost task stopping"); - rcutorture_shutdown_absorb("rcu_torture_boost"); + torture_shutdown_absorb("rcu_torture_boost"); while (!kthread_should_stop() || rbi.inflight) schedule_timeout_uninterruptible(1); smp_mb(); /* order accesses to ->inflight before stack-frame death. 
*/ @@ -717,7 +696,7 @@ rcu_torture_fqs(void *arg) rcu_stutter_wait("rcu_torture_fqs"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fqs"); + torture_shutdown_absorb("rcu_torture_fqs"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; @@ -789,7 +768,7 @@ rcu_torture_writer(void *arg) rcu_stutter_wait("rcu_torture_writer"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); - rcutorture_shutdown_absorb("rcu_torture_writer"); + torture_shutdown_absorb("rcu_torture_writer"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; @@ -827,7 +806,7 @@ rcu_torture_fakewriter(void *arg) } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fakewriter"); + torture_shutdown_absorb("rcu_torture_fakewriter"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; @@ -971,7 +950,7 @@ rcu_torture_reader(void *arg) rcu_stutter_wait("rcu_torture_reader"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); - rcutorture_shutdown_absorb("rcu_torture_reader"); + torture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irq_capable) del_timer_sync(&t); while (!kthread_should_stop()) @@ -1095,7 +1074,7 @@ rcu_torture_stats(void *arg) do { schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); - rcutorture_shutdown_absorb("rcu_torture_stats"); + torture_shutdown_absorb("rcu_torture_stats"); } while (!kthread_should_stop()); VERBOSE_TOROUT_STRING("rcu_torture_stats task stopping"); return 0; @@ -1179,7 +1158,7 @@ rcu_torture_shuffle(void *arg) do { schedule_timeout_interruptible(shuffle_interval * 
HZ); rcu_torture_shuffle_tasks(); - rcutorture_shutdown_absorb("rcu_torture_shuffle"); + torture_shutdown_absorb("rcu_torture_shuffle"); } while (!kthread_should_stop()); VERBOSE_TOROUT_STRING("rcu_torture_shuffle task stopping"); return 0; @@ -1198,7 +1177,7 @@ rcu_torture_stutter(void *arg) if (!kthread_should_stop()) schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 0; - rcutorture_shutdown_absorb("rcu_torture_stutter"); + torture_shutdown_absorb("rcu_torture_stutter"); } while (!kthread_should_stop()); VERBOSE_TOROUT_STRING("rcu_torture_stutter task stopping"); return 0; @@ -1470,7 +1449,7 @@ static int rcu_torture_stall(void *args) rcu_read_unlock(); pr_alert("rcu_torture_stall end.\n"); } - rcutorture_shutdown_absorb("rcu_torture_stall"); + torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); return 0; @@ -1534,7 +1513,7 @@ static int rcu_torture_barrier_cbs(void *arg) wake_up(&barrier_wq); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + torture_shutdown_absorb("rcu_torture_barrier_cbs"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); cur_ops->cb_barrier(); @@ -1571,7 +1550,7 @@ static int rcu_torture_barrier(void *arg) schedule_timeout_interruptible(HZ / 10); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier"); + torture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); return 0; diff --git a/kernel/torture.c b/kernel/torture.c index c82c70f7828e..f05042036ae8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -49,6 +49,11 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. 
McKenney "); +int fullstop = FULLSTOP_RMMOD; +EXPORT_SYMBOL_GPL(fullstop); +DEFINE_MUTEX(fullstop_mutex); +EXPORT_SYMBOL_GPL(fullstop_mutex); + #define TORTURE_RANDOM_MULT 39916801 /* prime */ #define TORTURE_RANDOM_ADD 479001701 /* prime */ #define TORTURE_RANDOM_REFRESH 10000 @@ -69,3 +74,18 @@ torture_random(struct torture_random_state *trsp) return swahw32(trsp->trs_state); } EXPORT_SYMBOL_GPL(torture_random); + +/* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + */ +void torture_shutdown_absorb(const char *title) +{ + while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + pr_notice( + "torture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} +EXPORT_SYMBOL_GPL(torture_shutdown_absorb); -- cgit v1.2.3 From 3808dc9fab05913060626d7f0edd0f195cb9dcab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Jan 2014 15:29:21 -0800 Subject: rcutorture: Abstract torture_shuffle() The torture_shuffle() function forces each CPU in turn to go idle periodically in order to check for problems interacting with per-CPU variables and with dyntick-idle mode. Because this sort of debugging is not specific to RCU, this commit abstracts that functionality. This in turn requires abstracting some additional infrastructure. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 124 +++++---------------------------------- kernel/torture.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a868758a6f9c..0380696f1844 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -106,7 +106,6 @@ static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; -static struct task_struct *shuffler_task; static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; @@ -161,7 +160,6 @@ static int max_online; static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; -static cpumask_var_t shuffle_tmp_mask; static int stutter_pause_test; @@ -1080,90 +1078,6 @@ rcu_torture_stats(void *arg) return 0; } -static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ - -/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case - * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. 
- */ -static void rcu_torture_shuffle_tasks(void) -{ - int i; - - cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); - - /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } - - if (rcu_idle_cpu != -1) - cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); - - set_cpus_allowed_ptr(current, shuffle_tmp_mask); - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - if (reader_tasks[i]) - set_cpus_allowed_ptr(reader_tasks[i], - shuffle_tmp_mask); - } - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) - if (fakewriter_tasks[i]) - set_cpus_allowed_ptr(fakewriter_tasks[i], - shuffle_tmp_mask); - } - if (writer_task) - set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) - set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); - if (stutter_task) - set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); - if (fqs_task) - set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); - if (shutdown_task) - set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); -#ifdef CONFIG_HOTPLUG_CPU - if (onoff_task) - set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - if (stall_task) - set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); - if (barrier_cbs_tasks) - for (i = 0; i < n_barrier_cbs; i++) - if (barrier_cbs_tasks[i]) - set_cpus_allowed_ptr(barrier_cbs_tasks[i], - shuffle_tmp_mask); - if (barrier_task) - set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); - - if (rcu_idle_cpu == -1) - rcu_idle_cpu = num_online_cpus() - 1; - else - rcu_idle_cpu--; - - put_online_cpus(); -} - -/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the - * system to become idle at a time and cut off its timer ticks. This is meant - * to test the support for such tickless idle CPU in RCU. 
- */ -static int -rcu_torture_shuffle(void *arg) -{ - VERBOSE_TOROUT_STRING("rcu_torture_shuffle task started"); - do { - schedule_timeout_interruptible(shuffle_interval * HZ); - rcu_torture_shuffle_tasks(); - torture_shutdown_absorb("rcu_torture_shuffle"); - } while (!kthread_should_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_shuffle task stopping"); - return 0; -} - /* Cause the rcutorture test to "stutter", starting and stopping all * threads periodically. */ @@ -1397,6 +1311,7 @@ rcu_torture_onoff_init(void) onoff_task = NULL; return ret; } + torture_shuffle_task_register(onoff_task); return 0; } @@ -1468,6 +1383,7 @@ static int __init rcu_torture_stall_init(void) stall_task = NULL; return ret; } + torture_shuffle_task_register(stall_task); return 0; } @@ -1594,6 +1510,7 @@ static int rcu_torture_barrier_init(void) barrier_cbs_tasks[i] = NULL; return ret; } + torture_shuffle_task_register(barrier_cbs_tasks[i]); } barrier_task = kthread_run(rcu_torture_barrier, NULL, "rcu_torture_barrier"); @@ -1602,6 +1519,7 @@ static int rcu_torture_barrier_init(void) VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier"); barrier_task = NULL; } + torture_shuffle_task_register(barrier_task); return 0; } @@ -1674,6 +1592,8 @@ rcu_torture_cleanup(void) fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_shutdown_nb); + + torture_shuffle_cleanup(); /* Must be first task cleaned up. 
*/ rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); if (stutter_task) { @@ -1681,12 +1601,6 @@ rcu_torture_cleanup(void) kthread_stop(stutter_task); } stutter_task = NULL; - if (shuffler_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_shuffle task"); - kthread_stop(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - } - shuffler_task = NULL; if (writer_task) { VERBOSE_TOROUT_STRING("Stopping rcu_torture_writer task"); @@ -1904,6 +1818,7 @@ rcu_torture_init(void) writer_task = NULL; goto unwind; } + torture_shuffle_task_register(writer_task); wake_up_process(writer_task); fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); @@ -1922,6 +1837,7 @@ rcu_torture_init(void) fakewriter_tasks[i] = NULL; goto unwind; } + torture_shuffle_task_register(fakewriter_tasks[i]); } reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); @@ -1940,6 +1856,7 @@ rcu_torture_init(void) reader_tasks[i] = NULL; goto unwind; } + torture_shuffle_task_register(reader_tasks[i]); } if (stat_interval > 0) { VERBOSE_TOROUT_STRING("Creating rcu_torture_stats task"); @@ -1951,26 +1868,12 @@ rcu_torture_init(void) stats_task = NULL; goto unwind; } + torture_shuffle_task_register(stats_task); } if (test_no_idle_hz) { - rcu_idle_cpu = num_online_cpus() - 1; - - if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { - firsterr = -ENOMEM; - VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask"); + firsterr = torture_shuffle_init(shuffle_interval * HZ); + if (firsterr) goto unwind; - } - - /* Create the shuffler thread */ - shuffler_task = kthread_run(rcu_torture_shuffle, NULL, - "rcu_torture_shuffle"); - if (IS_ERR(shuffler_task)) { - free_cpumask_var(shuffle_tmp_mask); - firsterr = PTR_ERR(shuffler_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - goto unwind; - } } if (stutter < 0) stutter = 0; @@ -1984,6 +1887,7 @@ rcu_torture_init(void) stutter_task = NULL; goto unwind; } + 
torture_shuffle_task_register(stutter_task); } if (fqs_duration < 0) fqs_duration = 0; @@ -1997,6 +1901,7 @@ rcu_torture_init(void) fqs_task = NULL; goto unwind; } + torture_shuffle_task_register(fqs_task); } if (test_boost_interval < 1) test_boost_interval = 1; @@ -2027,6 +1932,7 @@ rcu_torture_init(void) shutdown_task = NULL; goto unwind; } + torture_shuffle_task_register(shutdown_task); wake_up_process(shutdown_task); } i = rcu_torture_onoff_init(); diff --git a/kernel/torture.c b/kernel/torture.c index f05042036ae8..26058f20ee83 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -75,6 +75,157 @@ torture_random(struct torture_random_state *trsp) } EXPORT_SYMBOL_GPL(torture_random); +/* + * Variables for shuffling. The idea is to ensure that each CPU stays + * idle for an extended period to test interactions with dyntick idle, + * as well as interactions with any per-CPU variables. + */ +struct shuffle_task { + struct list_head st_l; + struct task_struct *st_t; +}; + +static long shuffle_interval; /* In jiffies. */ +static struct task_struct *shuffler_task; +static cpumask_var_t shuffle_tmp_mask; +static int shuffle_idle_cpu; /* Force all torture tasks off this CPU */ +static struct list_head shuffle_task_list = LIST_HEAD_INIT(shuffle_task_list); +static DEFINE_MUTEX(shuffle_task_mutex); + +/* + * Register a task to be shuffled. If there is no memory, just splat + * and don't bother registering. + */ +void torture_shuffle_task_register(struct task_struct *tp) +{ + struct shuffle_task *stp; + + if (WARN_ON_ONCE(tp == NULL)) + return; + stp = kmalloc(sizeof(*stp), GFP_KERNEL); + if (WARN_ON_ONCE(stp == NULL)) + return; + stp->st_t = tp; + mutex_lock(&shuffle_task_mutex); + list_add(&stp->st_l, &shuffle_task_list); + mutex_unlock(&shuffle_task_mutex); +} +EXPORT_SYMBOL_GPL(torture_shuffle_task_register); + +/* + * Unregister all tasks, for example, at the end of the torture run.
+ */ +static void torture_shuffle_task_unregister_all(void) +{ + struct shuffle_task *stp; + struct shuffle_task *p; + + mutex_lock(&shuffle_task_mutex); + list_for_each_entry_safe(stp, p, &shuffle_task_list, st_l) { + list_del(&stp->st_l); + kfree(stp); + } + mutex_unlock(&shuffle_task_mutex); +} + +/* Shuffle tasks such that we allow shuffle_idle_cpu to become idle. + * A special case is when shuffle_idle_cpu = -1, in which case we allow + * the tasks to run on all CPUs. + */ +static void torture_shuffle_tasks(void) +{ + struct shuffle_task *stp; + + cpumask_setall(shuffle_tmp_mask); + get_online_cpus(); + + /* No point in shuffling if there is only one online CPU (ex: UP) */ + if (num_online_cpus() == 1) { + put_online_cpus(); + return; + } + + /* Advance to the next CPU. Upon overflow, don't idle any CPUs. */ + shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); + if (shuffle_idle_cpu >= nr_cpu_ids) + shuffle_idle_cpu = -1; + if (shuffle_idle_cpu != -1) { + cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); + if (cpumask_empty(shuffle_tmp_mask)) { + put_online_cpus(); + return; + } + } + + mutex_lock(&shuffle_task_mutex); + list_for_each_entry(stp, &shuffle_task_list, st_l) + set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); + mutex_unlock(&shuffle_task_mutex); + + put_online_cpus(); +} + +/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the + * system to become idle at a time and cut off its timer ticks. This is meant + * to test the support for such tickless idle CPU in RCU. + */ +static int torture_shuffle(void *arg) +{ + VERBOSE_TOROUT_STRING("torture_shuffle task started"); + do { + schedule_timeout_interruptible(shuffle_interval); + torture_shuffle_tasks(); + torture_shutdown_absorb("torture_shuffle"); + } while (!torture_must_stop()); + VERBOSE_TOROUT_STRING("torture_shuffle task stopping"); + return 0; +} + +/* + * Start the shuffler, with shuffint in jiffies. 
+ */ +int torture_shuffle_init(long shuffint) +{ + int ret; + + shuffle_interval = shuffint; + + shuffle_idle_cpu = -1; + + if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { + VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask"); + return -ENOMEM; + } + + /* Create the shuffler thread */ + shuffler_task = kthread_run(torture_shuffle, NULL, "torture_shuffle"); + if (IS_ERR(shuffler_task)) { + ret = PTR_ERR(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + VERBOSE_TOROUT_ERRSTRING("Failed to create shuffler"); + shuffler_task = NULL; + return ret; + } + torture_shuffle_task_register(shuffler_task); + return 0; +} +EXPORT_SYMBOL_GPL(torture_shuffle_init); + +/* + * Stop the shuffling. + */ +void torture_shuffle_cleanup(void) +{ + torture_shuffle_task_unregister_all(); + if (shuffler_task) { + VERBOSE_TOROUT_STRING("Stopping torture_shuffle task"); + kthread_stop(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + } + shuffler_task = NULL; +} +EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); + /* * Absorb kthreads into a kernel function that won't return, so that * they won't ever access module text or data again. -- cgit v1.2.3 From 2e9e8081d2e7a4efb582a240aa7fee991bbbabb0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Jan 2014 15:58:22 -0800 Subject: rcutorture: Abstract torture_onoff() Because online/offline torturing is not specific to RCU, this commit abstracts it into the kernel/torture.c module to allow other torture tests to use it. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 162 ++--------------------------------------- kernel/torture.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+), 158 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0380696f1844..0e8b52b71d76 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -110,9 +110,6 @@ static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *shutdown_task; -#ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *onoff_task; -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ static struct task_struct *stall_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; @@ -147,16 +144,6 @@ static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static long n_rcu_torture_timers; -static long n_offline_attempts; -static long n_offline_successes; -static unsigned long sum_offline; -static int min_offline = -1; -static int max_offline; -static long n_online_attempts; -static long n_online_successes; -static unsigned long sum_online; -static int min_online = -1; -static int max_online; static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; @@ -994,13 +981,7 @@ rcu_torture_printk(char *page) n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); - page += sprintf(page, - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); + page = torture_onoff_stats(page); page += sprintf(page, "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, @@ -1204,140 +1185,6 @@ rcu_torture_shutdown(void *arg) return 
0; } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Execute random CPU-hotplug operations at the interval specified - * by the onoff_interval. - */ -static int -rcu_torture_onoff(void *arg) -{ - int cpu; - unsigned long delta; - int maxcpu = -1; - DEFINE_TORTURE_RANDOM(rand); - int ret; - unsigned long starttime; - - VERBOSE_TOROUT_STRING("rcu_torture_onoff task started"); - for_each_online_cpu(cpu) - maxcpu = cpu; - WARN_ON(maxcpu < 0); - if (onoff_holdoff > 0) { - VERBOSE_TOROUT_STRING("rcu_torture_onoff begin holdoff"); - schedule_timeout_interruptible(onoff_holdoff * HZ); - VERBOSE_TOROUT_STRING("rcu_torture_onoff end holdoff"); - } - while (!kthread_should_stop()) { - cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - 
starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } - schedule_timeout_interruptible(onoff_interval * HZ); - } - VERBOSE_TOROUT_STRING("rcu_torture_onoff task stopping"); - return 0; -} - -static int -rcu_torture_onoff_init(void) -{ - int ret; - - if (onoff_interval <= 0) - return 0; - onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); - if (IS_ERR(onoff_task)) { - ret = PTR_ERR(onoff_task); - onoff_task = NULL; - return ret; - } - torture_shuffle_task_register(onoff_task); - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ - if (onoff_task == NULL) - return; - VERBOSE_TOROUT_STRING("Stopping rcu_torture_onoff task"); - kthread_stop(onoff_task); - onoff_task = NULL; -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static int -rcu_torture_onoff_init(void) -{ - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. @@ -1657,7 +1504,7 @@ rcu_torture_cleanup(void) kthread_stop(shutdown_task); } shutdown_task = NULL; - rcu_torture_onoff_cleanup(); + torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. 
*/ @@ -1668,8 +1515,7 @@ rcu_torture_cleanup(void) if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); - else if (n_online_successes != n_online_attempts || - n_offline_successes != n_offline_attempts) + else if (torture_onoff_failures()) rcu_torture_print_module_parms(cur_ops, "End of test: RCU_HOTPLUG"); else @@ -1935,7 +1781,7 @@ rcu_torture_init(void) torture_shuffle_task_register(shutdown_task); wake_up_process(shutdown_task); } - i = rcu_torture_onoff_init(); + i = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); if (i != 0) { firsterr = i; goto unwind; diff --git a/kernel/torture.c b/kernel/torture.c index 26058f20ee83..a7ec8a7d561e 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -54,6 +54,192 @@ EXPORT_SYMBOL_GPL(fullstop); DEFINE_MUTEX(fullstop_mutex); EXPORT_SYMBOL_GPL(fullstop_mutex); +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Variables for online-offline handling. Only present if CPU hotplug + * is enabled, otherwise does nothing. + */ + +static struct task_struct *onoff_task; +static long onoff_holdoff; +static long onoff_interval; +static long n_offline_attempts; +static long n_offline_successes; +static unsigned long sum_offline; +static int min_offline = -1; +static int max_offline; +static long n_online_attempts; +static long n_online_successes; +static unsigned long sum_online; +static int min_online = -1; +static int max_online; + +/* + * Execute random CPU-hotplug operations at the interval specified + * by the onoff_interval. 
+ */ +static int +torture_onoff(void *arg) +{ + int cpu; + unsigned long delta; + int maxcpu = -1; + DEFINE_TORTURE_RANDOM(rand); + int ret; + unsigned long starttime; + + VERBOSE_TOROUT_STRING("torture_onoff task started"); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + if (onoff_holdoff > 0) { + VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); + schedule_timeout_interruptible(onoff_holdoff); + VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); + } + while (!torture_must_stop()) { + cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); + if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_offline_attempts++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlined %d\n", + torture_type, cpu); + n_offline_successes++; + delta = jiffies - starttime; + sum_offline += delta; + if (min_offline < 0) { + min_offline = delta; + max_offline = delta; + } + if (min_offline > delta) + min_offline = delta; + if (max_offline < delta) + max_offline = delta; + } + } else if (cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_online_attempts++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlined %d\n", + torture_type, cpu); + n_online_successes++; + delta = jiffies - starttime; + sum_online += delta; + if (min_online < 0) { + min_online = delta; + max_online = delta; + } + if (min_online > delta) + min_online = delta; + if (max_online < delta) 
+ max_online = delta; + } + } + schedule_timeout_interruptible(onoff_interval); + } + VERBOSE_TOROUT_STRING("torture_onoff task stopping"); + return 0; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Initiate online-offline handling. + */ +int torture_onoff_init(long ooholdoff, long oointerval) +{ +#ifdef CONFIG_HOTPLUG_CPU + int ret; + + onoff_holdoff = ooholdoff; + onoff_interval = oointerval; + if (onoff_interval <= 0) + return 0; + onoff_task = kthread_run(torture_onoff, NULL, "torture_onoff"); + if (IS_ERR(onoff_task)) { + ret = PTR_ERR(onoff_task); + onoff_task = NULL; + return ret; + } + torture_shuffle_task_register(onoff_task); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + return 0; +} +EXPORT_SYMBOL_GPL(torture_onoff_init); + +/* + * Clean up after online/offline testing. + */ +void torture_onoff_cleanup(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + if (onoff_task == NULL) + return; + VERBOSE_TOROUT_STRING("Stopping torture_onoff task"); + kthread_stop(onoff_task); + onoff_task = NULL; +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +} +EXPORT_SYMBOL_GPL(torture_onoff_cleanup); + +/* + * Print online/offline testing statistics. + */ +char *torture_onoff_stats(char *page) +{ +#ifdef CONFIG_HOTPLUG_CPU + page += sprintf(page, + "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + return page; +} +EXPORT_SYMBOL_GPL(torture_onoff_stats); + +/* + * Were all the online/offline operations successful? 
+ */ +bool torture_onoff_failures(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + return n_online_successes != n_online_attempts || + n_offline_successes != n_offline_attempts; +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + return false; +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ +} +EXPORT_SYMBOL_GPL(torture_onoff_failures); + #define TORTURE_RANDOM_MULT 39916801 /* prime */ #define TORTURE_RANDOM_ADD 479001701 /* prime */ #define TORTURE_RANDOM_REFRESH 10000 -- cgit v1.2.3 From b5daa8f3b3b2b0133ad40e13d4f722070119ce36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 13:38:09 -0800 Subject: rcutorture: Abstract torture-test initialization This commit creates torture_init_begin() and torture_init_end() functions to abstract locking and allow the torture_type and verbose variables in kernel/torture.o to become static. With a bit more abstraction, fullstop_mutex will also become static. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 19 ++++++++----------- kernel/torture.c | 27 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0e8b52b71d76..93aca2f9261e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -91,15 +91,12 @@ torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); +torture_param(bool, verbose, true, + "Enable verbose debugging printk()s"); -char *torture_type = "rcu"; -EXPORT_SYMBOL_GPL(torture_type); +static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -bool verbose; -EXPORT_SYMBOL_GPL(verbose); -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); static int nrealreaders; static struct task_struct 
*writer_task; @@ -1425,8 +1422,8 @@ rcu_torture_cleanup(void) { int i; - mutex_lock(&fullstop_mutex); rcutorture_record_test_transition(); + mutex_lock(&fullstop_mutex); if (fullstop == FULLSTOP_SHUTDOWN) { pr_warn(/* but going down anyway, so... */ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); @@ -1589,7 +1586,7 @@ rcu_torture_init(void) &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, }; - mutex_lock(&fullstop_mutex); + torture_init_begin(torture_type, verbose); /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { @@ -1604,7 +1601,7 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - mutex_unlock(&fullstop_mutex); + torture_init_end(); return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { @@ -1800,11 +1797,11 @@ rcu_torture_init(void) if (object_debug) rcu_test_debug_objects(); rcutorture_record_test_transition(); - mutex_unlock(&fullstop_mutex); + torture_init_end(); return 0; unwind: - mutex_unlock(&fullstop_mutex); + torture_init_end(); rcu_torture_cleanup(); return firsterr; } diff --git a/kernel/torture.c b/kernel/torture.c index a7ec8a7d561e..828d0b1a49b8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -49,6 +49,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); +static char *torture_type; +static bool verbose; + int fullstop = FULLSTOP_RMMOD; EXPORT_SYMBOL_GPL(fullstop); DEFINE_MUTEX(fullstop_mutex); @@ -426,3 +429,27 @@ void torture_shutdown_absorb(const char *title) } } EXPORT_SYMBOL_GPL(torture_shutdown_absorb); + +/* + * Initialize torture module. Please note that this is -not- invoked via + * the usual module_init() mechanism, but rather by an explicit call from + * the client torture module. This call must be paired with a later + * torture_init_end(). 
+ */ +void __init torture_init_begin(char *ttype, bool v) +{ + mutex_lock(&fullstop_mutex); + torture_type = ttype; + verbose = v; + +} +EXPORT_SYMBOL_GPL(torture_init_begin); + +/* + * Tell the torture module that initialization is complete. + */ +void __init torture_init_end(void) +{ + mutex_unlock(&fullstop_mutex); +} +EXPORT_SYMBOL_GPL(torture_init_end); -- cgit v1.2.3 From cc47ae0830264f07442070b36fe0d0a4d4e3c313 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 14:21:11 -0800 Subject: rcutorture: Abstract torture-test cleanup This commit creates a torture_cleanup() that handles the generic cleanup actions local to kernel/torture.c. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 11 +---------- kernel/torture.c | 30 ++++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 93aca2f9261e..68a689fc6ffa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1423,21 +1423,13 @@ rcu_torture_cleanup(void) int i; rcutorture_record_test_transition(); - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_SHUTDOWN) { - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - schedule_timeout_uninterruptible(10); + if (torture_cleanup()) { if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); return; } - fullstop = FULLSTOP_RMMOD; - mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_shutdown_nb); - torture_shuffle_cleanup(); /* Must be first task cleaned up. */ rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); if (stutter_task) { @@ -1501,7 +1493,6 @@ rcu_torture_cleanup(void) kthread_stop(shutdown_task); } shutdown_task = NULL; - torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. 
*/ diff --git a/kernel/torture.c b/kernel/torture.c index 828d0b1a49b8..41ae5cc3c4c3 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -199,7 +199,7 @@ EXPORT_SYMBOL_GPL(torture_onoff_init); /* * Clean up after online/offline testing. */ -void torture_onoff_cleanup(void) +static void torture_onoff_cleanup(void) { #ifdef CONFIG_HOTPLUG_CPU if (onoff_task == NULL) @@ -403,7 +403,7 @@ EXPORT_SYMBOL_GPL(torture_shuffle_init); /* * Stop the shuffling. */ -void torture_shuffle_cleanup(void) +static void torture_shuffle_cleanup(void) { torture_shuffle_task_unregister_all(); if (shuffler_task) { @@ -453,3 +453,29 @@ void __init torture_init_end(void) mutex_unlock(&fullstop_mutex); } EXPORT_SYMBOL_GPL(torture_init_end); + +/* + * Clean up torture module. Please note that this is -not- invoked via + * the usual module_exit() mechanism, but rather by an explicit call from + * the client torture module. Returns true if a race with system shutdown + * is detected. + * + * This must be called before the caller starts shutting down its own + * kthreads. + */ +bool torture_cleanup(void) +{ + mutex_lock(&fullstop_mutex); + if (fullstop == FULLSTOP_SHUTDOWN) { + pr_warn("Concurrent rmmod and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + schedule_timeout_uninterruptible(10); + return true; + } + fullstop = FULLSTOP_RMMOD; + mutex_unlock(&fullstop_mutex); + torture_shuffle_cleanup(); + torture_onoff_cleanup(); + return false; +} +EXPORT_SYMBOL_GPL(torture_cleanup); -- cgit v1.2.3 From 4622b487ecf0094401ac10e504606e5cbdea5a6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 15:37:19 -0800 Subject: rcutorture: Abstract torture_shutdown_notify() Because handling the race between rmmod and system shutdown is not specific to RCU, this commit abstracts torture_shutdown_notify(), placing this code into kernel/torture.c. This change also allows fullstop_mutex to be private to kernel/torture.c. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 23 ----------------------- kernel/torture.c | 29 ++++++++++++++++++++++++----- 2 files changed, 24 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 68a689fc6ffa..2560e9313887 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -189,23 +189,6 @@ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); /* Forward reference. */ static void rcu_torture_cleanup(void); -/* - * Detect and respond to a system shutdown. - */ -static int -rcutorture_shutdown_notify(struct notifier_block *unused1, - unsigned long unused2, void *unused3) -{ - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) - fullstop = FULLSTOP_SHUTDOWN; - else - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - return NOTIFY_DONE; -} - /* * Allocate an element from the rcu_tortures pool. 
*/ @@ -1098,10 +1081,6 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) onoff_interval, onoff_holdoff); } -static struct notifier_block rcutorture_shutdown_nb = { - .notifier_call = rcutorture_shutdown_notify, -}; - static void rcutorture_booster_cleanup(int cpu) { struct task_struct *t; @@ -1428,7 +1407,6 @@ rcu_torture_cleanup(void) cur_ops->cb_barrier(); return; } - unregister_reboot_notifier(&rcutorture_shutdown_nb); rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); @@ -1774,7 +1752,6 @@ rcu_torture_init(void) firsterr = i; goto unwind; } - register_reboot_notifier(&rcutorture_shutdown_nb); i = rcu_torture_stall_init(); if (i != 0) { firsterr = i; diff --git a/kernel/torture.c b/kernel/torture.c index 41ae5cc3c4c3..b02fa2785bbb 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -54,8 +54,7 @@ static bool verbose; int fullstop = FULLSTOP_RMMOD; EXPORT_SYMBOL_GPL(fullstop); -DEFINE_MUTEX(fullstop_mutex); -EXPORT_SYMBOL_GPL(fullstop_mutex); +static DEFINE_MUTEX(fullstop_mutex); #ifdef CONFIG_HOTPLUG_CPU @@ -422,14 +421,32 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); void torture_shutdown_absorb(const char *title) { while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - pr_notice( - "torture thread %s parking due to system shutdown\n", - title); + pr_notice("torture thread %s parking due to system shutdown\n", + title); schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); } } EXPORT_SYMBOL_GPL(torture_shutdown_absorb); +/* + * Detect and respond to a system shutdown. 
+ */ +static int torture_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + mutex_lock(&fullstop_mutex); + if (fullstop == FULLSTOP_DONTSTOP) + fullstop = FULLSTOP_SHUTDOWN; + else + pr_warn("Concurrent rmmod and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + return NOTIFY_DONE; +} + +static struct notifier_block torture_shutdown_nb = { + .notifier_call = torture_shutdown_notify, +}; + /* * Initialize torture module. Please note that this is -not- invoked via * the usual module_init() mechanism, but rather by an explicit call from @@ -451,6 +468,7 @@ EXPORT_SYMBOL_GPL(torture_init_begin); void __init torture_init_end(void) { mutex_unlock(&fullstop_mutex); + register_reboot_notifier(&torture_shutdown_nb); } EXPORT_SYMBOL_GPL(torture_init_end); @@ -474,6 +492,7 @@ bool torture_cleanup(void) } fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); + unregister_reboot_notifier(&torture_shutdown_nb); torture_shuffle_cleanup(); torture_onoff_cleanup(); return false; -- cgit v1.2.3 From 36970bb91d89618d3495babf44b934e9c9db6bbc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 15:49:29 -0800 Subject: rcutorture: Privatize fullstop This commit introduces the torture_must_stop() function in order to keep use of the fullstop variable local to kernel/torture.c. There is also a torture_must_stop_irq() counterpart for use from RCU callbacks, timeout handlers, and the like. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 38 +++++++++++++++----------------------- kernel/torture.c | 27 +++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2560e9313887..9357c88cc8cc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -304,7 +304,7 @@ rcu_torture_cb(struct rcu_head *p) int i; struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - if (fullstop != FULLSTOP_DONTSTOP) { + if (torture_must_stop_irq()) { /* Test is ending, just drop callbacks on the floor. */ /* The next initialization will pick up the pieces. */ return; @@ -572,8 +572,7 @@ static int rcu_torture_boost(void *arg) while (ULONG_CMP_LT(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) + if (torture_must_stop()) goto checkwait; } @@ -595,8 +594,7 @@ static int rcu_torture_boost(void *arg) } cond_resched(); rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) + if (torture_must_stop()) goto checkwait; } @@ -621,7 +619,7 @@ static int rcu_torture_boost(void *arg) /* Go do the stutter. */ checkwait: rcu_stutter_wait("rcu_torture_boost"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); /* Clean up and exit. 
*/ VERBOSE_TOROUT_STRING("rcu_torture_boost task stopping"); @@ -659,7 +657,7 @@ rcu_torture_fqs(void *arg) fqs_burst_remaining -= fqs_holdoff; } rcu_stutter_wait("rcu_torture_fqs"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); torture_shutdown_absorb("rcu_torture_fqs"); while (!kthread_should_stop()) @@ -731,7 +729,7 @@ rcu_torture_writer(void *arg) } rcutorture_record_progress(++rcu_torture_current_version); rcu_stutter_wait("rcu_torture_writer"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); torture_shutdown_absorb("rcu_torture_writer"); while (!kthread_should_stop()) @@ -768,7 +766,7 @@ rcu_torture_fakewriter(void *arg) cur_ops->exp_sync(); } rcu_stutter_wait("rcu_torture_fakewriter"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); torture_shutdown_absorb("rcu_torture_fakewriter"); @@ -913,7 +911,7 @@ rcu_torture_reader(void *arg) cur_ops->readunlock(idx); schedule(); rcu_stutter_wait("rcu_torture_reader"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); torture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irq_capable) @@ -1022,9 +1020,6 @@ rcu_torture_stats_print(void) /* * Periodically prints torture statistics, if periodic statistics printing * was specified via the stat_interval module parameter. - * - * No need to worry about fullstop here, since this one doesn't reference - * volatile state or register callbacks. 
*/ static int rcu_torture_stats(void *arg) @@ -1034,7 +1029,7 @@ rcu_torture_stats(void *arg) schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); torture_shutdown_absorb("rcu_torture_stats"); - } while (!kthread_should_stop()); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_stats task stopping"); return 0; } @@ -1241,16 +1236,15 @@ static int rcu_torture_barrier_cbs(void *arg) wait_event(barrier_cbs_wq[myid], (newphase = ACCESS_ONCE(barrier_phase)) != lastphase || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); + torture_must_stop()); lastphase = newphase; smp_mb(); /* ensure barrier_phase load before ->call(). */ - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + if (torture_must_stop()) break; cur_ops->call(&rcu, rcu_torture_barrier_cbf); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task stopping"); torture_shutdown_absorb("rcu_torture_barrier_cbs"); while (!kthread_should_stop()) @@ -1275,9 +1269,8 @@ static int rcu_torture_barrier(void *arg) wake_up(&barrier_cbs_wq[i]); wait_event(barrier_wq, atomic_read(&barrier_cbs_count) == 0 || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + torture_must_stop()); + if (torture_must_stop()) break; n_barrier_attempts++; cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). 
*/ @@ -1287,7 +1280,7 @@ static int rcu_torture_barrier(void *arg) } n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_barrier task stopping"); torture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) @@ -1585,7 +1578,6 @@ rcu_torture_init(void) else nrealreaders = 2 * num_online_cpus(); rcu_torture_print_module_parms(cur_ops, "Start of test"); - fullstop = FULLSTOP_DONTSTOP; /* Set up the freelist. */ diff --git a/kernel/torture.c b/kernel/torture.c index b02fa2785bbb..ed360cf948da 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -52,8 +52,11 @@ MODULE_AUTHOR("Paul E. McKenney "); static char *torture_type; static bool verbose; -int fullstop = FULLSTOP_RMMOD; -EXPORT_SYMBOL_GPL(fullstop); +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with torture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */ +static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); #ifdef CONFIG_HOTPLUG_CPU @@ -458,6 +461,7 @@ void __init torture_init_begin(char *ttype, bool v) mutex_lock(&fullstop_mutex); torture_type = ttype; verbose = v; + fullstop = FULLSTOP_DONTSTOP; } EXPORT_SYMBOL_GPL(torture_init_begin); @@ -498,3 +502,22 @@ bool torture_cleanup(void) return false; } EXPORT_SYMBOL_GPL(torture_cleanup); + +/* + * Is it time for the current torture test to stop? + */ +bool torture_must_stop(void) +{ + return torture_must_stop_irq() || kthread_should_stop(); +} +EXPORT_SYMBOL_GPL(torture_must_stop); + +/* + * Is it time for the current torture test to stop? This is the irq-safe + * version, hence no check for kthread_should_stop(). 
+ */ +bool torture_must_stop_irq(void) +{ + return fullstop != FULLSTOP_DONTSTOP; +} +EXPORT_SYMBOL_GPL(torture_must_stop_irq); -- cgit v1.2.3 From fac480efcba6a9f0aea91947f151fd569538b0af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 17:06:30 -0800 Subject: rcutorture: Add diagnostic for unscheduled system shutdown Currently, rcutorture can terminate via rmmod, via self-shutdown, via something else shutting the system down, or of course the usual catastrophic termination. The first two get flagged, so this commit adds a message for the third. For the fourth, your warranty is void as always. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index ed360cf948da..d51de3029a5c 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -438,10 +438,12 @@ static int torture_shutdown_notify(struct notifier_block *unused1, unsigned long unused2, void *unused3) { mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) + if (fullstop == FULLSTOP_DONTSTOP) { + VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected"); fullstop = FULLSTOP_SHUTDOWN; - else + } else { pr_warn("Concurrent rmmod and shutdown illegal!\n"); + } mutex_unlock(&fullstop_mutex); return NOTIFY_DONE; } -- cgit v1.2.3 From 628edaa5062282b6e3d76c886fd2cbccae5cb87b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 11:57:43 -0800 Subject: rcutorture: Abstract stutter_wait() Because stuttering the test load (stopping and restarting it) is useful for non-RCU testing, this commit moves the load-stuttering functionality to kernel/torture.c. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 69 +++++++-------------------------------- kernel/torture.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 98 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9357c88cc8cc..4329ad14f8dc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -103,7 +103,6 @@ static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; -static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *shutdown_task; @@ -145,8 +144,6 @@ static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; -static int stutter_pause_test; - #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 #else @@ -222,18 +219,6 @@ rcu_torture_free(struct rcu_torture *p) spin_unlock_bh(&rcu_torture_lock); } -static void -rcu_stutter_wait(const char *title) -{ - while (stutter_pause_test || !rcutorture_runnable) { - if (rcutorture_runnable) - schedule_timeout_interruptible(1); - else - schedule_timeout_interruptible(round_jiffies_relative(HZ)); - torture_shutdown_absorb(title); - } -} - /* * Operations vector for selecting different types of tests. 
*/ @@ -571,7 +556,7 @@ static int rcu_torture_boost(void *arg) oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); - rcu_stutter_wait("rcu_torture_boost"); + stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; } @@ -593,7 +578,7 @@ static int rcu_torture_boost(void *arg) call_rcu_time = jiffies; } cond_resched(); - rcu_stutter_wait("rcu_torture_boost"); + stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; } @@ -618,7 +603,7 @@ static int rcu_torture_boost(void *arg) } /* Go do the stutter. */ -checkwait: rcu_stutter_wait("rcu_torture_boost"); +checkwait: stutter_wait("rcu_torture_boost"); } while (!torture_must_stop()); /* Clean up and exit. */ @@ -656,7 +641,7 @@ rcu_torture_fqs(void *arg) udelay(fqs_holdoff); fqs_burst_remaining -= fqs_holdoff; } - rcu_stutter_wait("rcu_torture_fqs"); + stutter_wait("rcu_torture_fqs"); } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); torture_shutdown_absorb("rcu_torture_fqs"); @@ -728,7 +713,7 @@ rcu_torture_writer(void *arg) } } rcutorture_record_progress(++rcu_torture_current_version); - rcu_stutter_wait("rcu_torture_writer"); + stutter_wait("rcu_torture_writer"); } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); torture_shutdown_absorb("rcu_torture_writer"); @@ -765,7 +750,7 @@ rcu_torture_fakewriter(void *arg) } else { cur_ops->exp_sync(); } - rcu_stutter_wait("rcu_torture_fakewriter"); + stutter_wait("rcu_torture_fakewriter"); } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); @@ -910,7 +895,7 @@ rcu_torture_reader(void *arg) preempt_enable(); cur_ops->readunlock(idx); schedule(); - rcu_stutter_wait("rcu_torture_reader"); + stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); 
torture_shutdown_absorb("rcu_torture_reader"); @@ -1034,25 +1019,6 @@ rcu_torture_stats(void *arg) return 0; } -/* Cause the rcutorture test to "stutter", starting and stopping all - * threads periodically. - */ -static int -rcu_torture_stutter(void *arg) -{ - VERBOSE_TOROUT_STRING("rcu_torture_stutter task started"); - do { - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 1; - if (!kthread_should_stop()) - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 0; - torture_shutdown_absorb("rcu_torture_stutter"); - } while (!kthread_should_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_stutter task stopping"); - return 0; -} - static inline void rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) { @@ -1403,11 +1369,7 @@ rcu_torture_cleanup(void) rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); - if (stutter_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_stutter task"); - kthread_stop(stutter_task); - } - stutter_task = NULL; + torture_stutter_cleanup(); if (writer_task) { VERBOSE_TOROUT_STRING("Stopping rcu_torture_writer task"); @@ -1548,7 +1510,7 @@ rcu_torture_init(void) &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, }; - torture_init_begin(torture_type, verbose); + torture_init_begin(torture_type, verbose, &rcutorture_runnable); /* Process args and tell the world that the torturer is on the job. 
*/ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { @@ -1682,21 +1644,14 @@ rcu_torture_init(void) if (stutter < 0) stutter = 0; if (stutter) { - /* Create the stutter thread */ - stutter_task = kthread_run(rcu_torture_stutter, NULL, - "rcu_torture_stutter"); - if (IS_ERR(stutter_task)) { - firsterr = PTR_ERR(stutter_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; + firsterr = torture_stutter_init(stutter * HZ); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(stutter_task); } if (fqs_duration < 0) fqs_duration = 0; if (fqs_duration) { - /* Create the stutter thread */ + /* Create the fqs thread */ fqs_task = kthread_run(rcu_torture_fqs, NULL, "rcu_torture_fqs"); if (IS_ERR(fqs_task)) { diff --git a/kernel/torture.c b/kernel/torture.c index d51de3029a5c..b30c2ee78580 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -58,6 +58,7 @@ static bool verbose; #define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */ static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); +static int *torture_runnable; #ifdef CONFIG_HOTPLUG_CPU @@ -452,17 +453,101 @@ static struct notifier_block torture_shutdown_nb = { .notifier_call = torture_shutdown_notify, }; +/* + * Variables for stuttering, which means to periodically pause and + * restart testing in order to catch bugs that appear when load is + * suddenly applied to or removed from the system. + */ +static struct task_struct *stutter_task; +static int stutter_pause_test; +static int stutter; + +/* + * Block until the stutter interval ends. This must be called periodically + * by all running kthreads that need to be subject to stuttering. 
+ */ +void stutter_wait(const char *title) +{ + while (ACCESS_ONCE(stutter_pause_test) || + (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { + if (stutter_pause_test) + schedule_timeout_interruptible(1); + else + schedule_timeout_interruptible(round_jiffies_relative(HZ)); + torture_shutdown_absorb(title); + } +} +EXPORT_SYMBOL_GPL(stutter_wait); + +/* + * Cause the torture test to "stutter", starting and stopping all + * threads periodically. + */ +static int torture_stutter(void *arg) +{ + VERBOSE_TOROUT_STRING("torture_stutter task started"); + do { + if (!torture_must_stop()) { + schedule_timeout_interruptible(stutter); + ACCESS_ONCE(stutter_pause_test) = 1; + } + if (!torture_must_stop()) + schedule_timeout_interruptible(stutter); + ACCESS_ONCE(stutter_pause_test) = 0; + torture_shutdown_absorb("torture_stutter"); + } while (!torture_must_stop()); + VERBOSE_TOROUT_STRING("torture_stutter task stopping"); + return 0; +} + +/* + * Initialize and kick off the torture_stutter kthread. + */ +int torture_stutter_init(int s) +{ + int ret; + + stutter = s; + stutter_task = kthread_run(torture_stutter, NULL, "torture_stutter"); + if (IS_ERR(stutter_task)) { + ret = PTR_ERR(stutter_task); + VERBOSE_TOROUT_ERRSTRING("Failed to create stutter"); + stutter_task = NULL; + return ret; + } + torture_shuffle_task_register(stutter_task); + return 0; +} +EXPORT_SYMBOL_GPL(torture_stutter_init); + +/* + * Cleanup after the torture_stutter kthread. + */ +void torture_stutter_cleanup(void) +{ + if (!stutter_task) + return; + VERBOSE_TOROUT_STRING("Stopping torture_stutter task"); + kthread_stop(stutter_task); + stutter_task = NULL; +} +EXPORT_SYMBOL_GPL(torture_stutter_cleanup); + /* * Initialize torture module. Please note that this is -not- invoked via * the usual module_init() mechanism, but rather by an explicit call from * the client torture module. This call must be paired with a later * torture_init_end(). 
+ * + * The runnable parameter points to a flag that controls whether or not + * the test is currently runnable. If there is no such flag, pass in NULL. */ -void __init torture_init_begin(char *ttype, bool v) +void __init torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); torture_type = ttype; verbose = v; + torture_runnable = runnable; fullstop = FULLSTOP_DONTSTOP; } -- cgit v1.2.3 From 57a2fe90fcdaa812ac1aa6c91ba0e591c30f461a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 12:58:39 -0800 Subject: rcutorture: Apply ACCESS_ONCE() to racy fullstop accesses Because the fullstop variable can be accessed while it is being updated, this commit avoids any resulting compiler mischief through use of ACCESS_ONCE() for non-initialization accesses to this shared variable. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index b30c2ee78580..1bafd02d1eed 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -439,9 +439,9 @@ static int torture_shutdown_notify(struct notifier_block *unused1, unsigned long unused2, void *unused3) { mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) { + if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) { VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected"); - fullstop = FULLSTOP_SHUTDOWN; + ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN; } else { pr_warn("Concurrent rmmod and shutdown illegal!\n"); } @@ -575,13 +575,13 @@ EXPORT_SYMBOL_GPL(torture_init_end); bool torture_cleanup(void) { mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_SHUTDOWN) { + if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { pr_warn("Concurrent rmmod and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); schedule_timeout_uninterruptible(10); return true; } - fullstop = FULLSTOP_RMMOD; + ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD; 
mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&torture_shutdown_nb); torture_shuffle_cleanup(); @@ -605,6 +605,6 @@ EXPORT_SYMBOL_GPL(torture_must_stop); */ bool torture_must_stop_irq(void) { - return fullstop != FULLSTOP_DONTSTOP; + return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP; } EXPORT_SYMBOL_GPL(torture_must_stop_irq); -- cgit v1.2.3 From e991dbc0770b01b7dc7d6d7660442e83ebd11828 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 14:52:13 -0800 Subject: rcutorture: Abstract torture_shutdown() Because auto-shutdown of torture testing is not specific to RCU, this commit moves the auto-shutdown function to kernel/torture.c. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 63 +++---------------------------------- kernel/torture.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4329ad14f8dc..897b0f91f899 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -105,7 +105,6 @@ static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; -static struct task_struct *shutdown_task; static struct task_struct *stall_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; @@ -173,7 +172,6 @@ static u64 notrace rcu_trace_clock_local(void) } #endif /* #else #ifdef CONFIG_RCU_TRACE */ -static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ @@ -183,9 +181,6 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. 
*/ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); -/* Forward reference. */ -static void rcu_torture_cleanup(void); - /* * Allocate an element from the rcu_tortures pool. */ @@ -1086,42 +1081,6 @@ static int rcutorture_booster_init(int cpu) return 0; } -/* - * Cause the rcutorture test to shutdown the system after the test has - * run for the time specified by the shutdown_secs module parameter. - */ -static int -rcu_torture_shutdown(void *arg) -{ - long delta; - unsigned long jiffies_snap; - - VERBOSE_TOROUT_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && - !kthread_should_stop()) { - delta = shutdown_time - jiffies_snap; - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); - } - if (kthread_should_stop()) { - VERBOSE_TOROUT_STRING("rcu_torture_shutdown task stopping"); - return 0; - } - - /* OK, shut down the system. */ - - VERBOSE_TOROUT_STRING("rcu_torture_shutdown task shutting down system"); - shutdown_task = NULL; /* Avoid self-kill deadlock. */ - rcu_torture_cleanup(); /* Get the success/failure message. */ - kernel_power_off(); /* Shut down the system. */ - return 0; -} - /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. @@ -1421,11 +1380,7 @@ rcu_torture_cleanup(void) for_each_possible_cpu(i) rcutorture_booster_cleanup(i); } - if (shutdown_task != NULL) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_shutdown task"); - kthread_stop(shutdown_task); - } - shutdown_task = NULL; + torture_shutdown_cleanup(); /* Wait for all RCU callbacks to fire. 
*/ @@ -1681,18 +1636,10 @@ rcu_torture_init(void) } } } - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_create(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); - if (IS_ERR(shutdown_task)) { - firsterr = PTR_ERR(shutdown_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - goto unwind; - } - torture_shuffle_task_register(shutdown_task); - wake_up_process(shutdown_task); + i = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); + if (i != 0) { + firsterr = i; + goto unwind; } i = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); if (i != 0) { diff --git a/kernel/torture.c b/kernel/torture.c index 1bafd02d1eed..df2c700e96e4 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -418,6 +418,15 @@ static void torture_shuffle_cleanup(void) } EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); +/* + * Variables for auto-shutdown. This allows "lights out" torture runs + * to be fully scripted. + */ +static int shutdown_secs; /* desired test duration in seconds. */ +static struct task_struct *shutdown_task; +static unsigned long shutdown_time; /* jiffies to system shutdown. */ +static void (*torture_shutdown_hook)(void); + /* * Absorb kthreads into a kernel function that won't return, so that * they won't ever access module text or data again. @@ -432,6 +441,81 @@ void torture_shutdown_absorb(const char *title) } EXPORT_SYMBOL_GPL(torture_shutdown_absorb); +/* + * Cause the torture test to shutdown the system after the test has + * run for the time specified by the shutdown_secs parameter. 
+ */ +static int torture_shutdown(void *arg) +{ + long delta; + unsigned long jiffies_snap; + + VERBOSE_TOROUT_STRING("torture_shutdown task started"); + jiffies_snap = jiffies; + while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + !torture_must_stop()) { + delta = shutdown_time - jiffies_snap; + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_shutdown task: %lu jiffies remaining\n", + torture_type, delta); + schedule_timeout_interruptible(delta); + jiffies_snap = jiffies; + } + if (torture_must_stop()) { + VERBOSE_TOROUT_STRING("torture_shutdown task stopping"); + return 0; + } + + /* OK, shut down the system. */ + + VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system"); + shutdown_task = NULL; /* Avoid self-kill deadlock. */ + torture_shutdown_hook();/* Shut down the enclosing torture test. */ + kernel_power_off(); /* Shut down the system. */ + return 0; +} + +/* + * Start up the shutdown task. + */ +int torture_shutdown_init(int ssecs, void (*cleanup)(void)) +{ + int ret; + + shutdown_secs = ssecs; + torture_shutdown_hook = cleanup; + if (shutdown_secs > 0) { + shutdown_time = jiffies + shutdown_secs * HZ; + shutdown_task = kthread_create(torture_shutdown, NULL, + "torture_shutdown"); + if (IS_ERR(shutdown_task)) { + ret = PTR_ERR(shutdown_task); + VERBOSE_TOROUT_ERRSTRING("Failed to create shutdown"); + shutdown_task = NULL; + return ret; + } + torture_shuffle_task_register(shutdown_task); + wake_up_process(shutdown_task); + } + return 0; +} +EXPORT_SYMBOL_GPL(torture_shutdown_init); + +/* + * Shut down the shutdown task. Say what??? Heh! This can happen if + * the torture module gets an rmmod before the shutdown time arrives. ;-) + */ +void torture_shutdown_cleanup(void) +{ + if (shutdown_task != NULL) { + VERBOSE_TOROUT_STRING("Stopping torture_shutdown task"); + kthread_stop(shutdown_task); + } + shutdown_task = NULL; +} +EXPORT_SYMBOL_GPL(torture_shutdown_cleanup); + /* * Detect and respond to a system shutdown. 
*/ -- cgit v1.2.3 From 01025ebc99e39ac962c32e063cad9a3012ee8b0a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 15:15:02 -0800 Subject: rcutorture: Clean up rcu_torture_init() error checking This commit applies some simple cleanups to rcu_torture_init() error checking. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 897b0f91f899..746c4278ea5e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1460,7 +1460,6 @@ rcu_torture_init(void) int i; int cpu; int firsterr = 0; - int retval; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, }; @@ -1629,33 +1628,23 @@ rcu_torture_init(void) for_each_possible_cpu(i) { if (cpu_is_offline(i)) continue; /* Heuristic: CPU can go offline. */ - retval = rcutorture_booster_init(i); - if (retval < 0) { - firsterr = retval; + firsterr = rcutorture_booster_init(i); + if (firsterr) goto unwind; - } } } - i = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); - if (i != 0) { - firsterr = i; + firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); + if (firsterr) goto unwind; - } - i = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); - if (i != 0) { - firsterr = i; + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); + if (firsterr) goto unwind; - } - i = rcu_torture_stall_init(); - if (i != 0) { - firsterr = i; + firsterr = rcu_torture_stall_init(); + if (firsterr) goto unwind; - } - retval = rcu_torture_barrier_init(); - if (retval != 0) { - firsterr = retval; + firsterr = rcu_torture_barrier_init(); + if (firsterr) goto unwind; - } if (object_debug) rcu_test_debug_objects(); rcutorture_record_test_transition(); -- cgit v1.2.3 From 14562d1cf12b434da2c69b5603a4149ac43f3b48 Mon Sep 17 
00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 15:39:52 -0800 Subject: rcutorture: Announce task creation A few "stealth-start rcutorture kthreads" have accumulated over the years, so this commit adds console-log announcements (but only if the torture tests are running verbose). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 4 ++++ kernel/torture.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 746c4278ea5e..6e9ba51b23b9 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1120,6 +1120,7 @@ static int __init rcu_torture_stall_init(void) if (stall_cpu <= 0) return 0; + VERBOSE_TOROUT_STRING("Creating rcu_torture_stall task"); stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); if (IS_ERR(stall_task)) { ret = PTR_ERR(stall_task); @@ -1242,6 +1243,7 @@ static int rcu_torture_barrier_init(void) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { init_waitqueue_head(&barrier_cbs_wq[i]); + VERBOSE_TOROUT_STRING("Creating rcu_torture_barrier_cbs task"); barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, (void *)(long)i, "rcu_torture_barrier_cbs"); @@ -1253,6 +1255,7 @@ static int rcu_torture_barrier_init(void) } torture_shuffle_task_register(barrier_cbs_tasks[i]); } + VERBOSE_TOROUT_STRING("Creating rcu_torture_barrier task"); barrier_task = kthread_run(rcu_torture_barrier, NULL, "rcu_torture_barrier"); if (IS_ERR(barrier_task)) { @@ -1606,6 +1609,7 @@ rcu_torture_init(void) fqs_duration = 0; if (fqs_duration) { /* Create the fqs thread */ + VERBOSE_TOROUT_STRING("Creating rcu_torture_fqs task"); fqs_task = kthread_run(rcu_torture_fqs, NULL, "rcu_torture_fqs"); if (IS_ERR(fqs_task)) { diff --git a/kernel/torture.c b/kernel/torture.c index df2c700e96e4..5e2838f902f9 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -187,6 +187,7 @@ int torture_onoff_init(long ooholdoff, long 
oointerval) onoff_interval = oointerval; if (onoff_interval <= 0) return 0; + VERBOSE_TOROUT_STRING("Creating torture_onoff task"); onoff_task = kthread_run(torture_onoff, NULL, "torture_onoff"); if (IS_ERR(onoff_task)) { ret = PTR_ERR(onoff_task); @@ -390,6 +391,7 @@ int torture_shuffle_init(long shuffint) } /* Create the shuffler thread */ + VERBOSE_TOROUT_STRING("Creating torture_shuffle task"); shuffler_task = kthread_run(torture_shuffle, NULL, "torture_shuffle"); if (IS_ERR(shuffler_task)) { ret = PTR_ERR(shuffler_task); @@ -486,6 +488,7 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) shutdown_secs = ssecs; torture_shutdown_hook = cleanup; if (shutdown_secs > 0) { + VERBOSE_TOROUT_STRING("Creating torture_shutdown task"); shutdown_time = jiffies + shutdown_secs * HZ; shutdown_task = kthread_create(torture_shutdown, NULL, "torture_shutdown"); @@ -592,6 +595,7 @@ int torture_stutter_init(int s) int ret; stutter = s; + VERBOSE_TOROUT_STRING("Creating torture_stutter task"); stutter_task = kthread_run(torture_stutter, NULL, "torture_stutter"); if (IS_ERR(stutter_task)) { ret = PTR_ERR(stutter_task); -- cgit v1.2.3 From 7fafaac5b9ce22cc57777865390520476ad2262d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 17:37:28 -0800 Subject: rcutorture: Fix rcutorture shutdown races Not all of the rcutorture kthreads waited for kthread_should_stop() before returning from their top-level functions, and none of them used torture_shutdown_absorb() properly. These problems can result in segfaults and hangs at shutdown time, and some recent changes perturbed timing sufficiently to make them much more probable. This commit therefore creates a torture_kthread_stopping() function that does the proper kthread shutdown dance in one centralized location. 
Accommodate this grouping by making VERBOSE_TOROUT_STRING() capable of taking a non-const string as its argument, which allows the new torture_kthread_stopping() to pass its "title" argument directly to the updated version of VERBOSE_TOROUT_STRING(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 39 +++++++++++---------------------------- kernel/torture.c | 26 ++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6e9ba51b23b9..bcaafd6cf633 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -602,12 +602,13 @@ checkwait: stutter_wait("rcu_torture_boost"); } while (!torture_must_stop()); /* Clean up and exit. */ - VERBOSE_TOROUT_STRING("rcu_torture_boost task stopping"); - torture_shutdown_absorb("rcu_torture_boost"); - while (!kthread_should_stop() || rbi.inflight) + while (!kthread_should_stop() || rbi.inflight) { + torture_shutdown_absorb("rcu_torture_boost"); schedule_timeout_uninterruptible(1); + } smp_mb(); /* order accesses to ->inflight before stack-frame death. 
*/ destroy_rcu_head_on_stack(&rbi.rcu); + torture_kthread_stopping("rcu_torture_boost"); return 0; } @@ -638,10 +639,7 @@ rcu_torture_fqs(void *arg) } stutter_wait("rcu_torture_fqs"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); - torture_shutdown_absorb("rcu_torture_fqs"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_fqs"); return 0; } @@ -710,10 +708,7 @@ rcu_torture_writer(void *arg) rcutorture_record_progress(++rcu_torture_current_version); stutter_wait("rcu_torture_writer"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); - torture_shutdown_absorb("rcu_torture_writer"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_writer"); return 0; } @@ -748,10 +743,7 @@ rcu_torture_fakewriter(void *arg) stutter_wait("rcu_torture_fakewriter"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); - torture_shutdown_absorb("rcu_torture_fakewriter"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_fakewriter"); return 0; } @@ -892,12 +884,9 @@ rcu_torture_reader(void *arg) schedule(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); - torture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irq_capable) del_timer_sync(&t); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_reader"); return 0; } @@ -1010,7 +999,7 @@ rcu_torture_stats(void *arg) rcu_torture_stats_print(); torture_shutdown_absorb("rcu_torture_stats"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_stats task stopping"); + torture_kthread_stopping("rcu_torture_stats"); return 0; } @@ -1171,12 +1160,9 @@ static int 
rcu_torture_barrier_cbs(void *arg) if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task stopping"); - torture_shutdown_absorb("rcu_torture_barrier_cbs"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); cur_ops->cb_barrier(); destroy_rcu_head_on_stack(&rcu); + torture_kthread_stopping("rcu_torture_barrier_cbs"); return 0; } @@ -1207,10 +1193,7 @@ static int rcu_torture_barrier(void *arg) n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_barrier task stopping"); - torture_shutdown_absorb("rcu_torture_barrier"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); + torture_kthread_stopping("rcu_torture_barrier"); return 0; } diff --git a/kernel/torture.c b/kernel/torture.c index 5e2838f902f9..330576660cf4 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -169,7 +169,7 @@ torture_onoff(void *arg) } schedule_timeout_interruptible(onoff_interval); } - VERBOSE_TOROUT_STRING("torture_onoff task stopping"); + torture_kthread_stopping("torture_onoff"); return 0; } @@ -370,7 +370,7 @@ static int torture_shuffle(void *arg) torture_shuffle_tasks(); torture_shutdown_absorb("torture_shuffle"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("torture_shuffle task stopping"); + torture_kthread_stopping("torture_shuffle"); return 0; } @@ -465,7 +465,7 @@ static int torture_shutdown(void *arg) jiffies_snap = jiffies; } if (torture_must_stop()) { - VERBOSE_TOROUT_STRING("torture_shutdown task stopping"); + torture_kthread_stopping("torture_shutdown"); return 0; } @@ -583,7 +583,7 @@ static int torture_stutter(void *arg) ACCESS_ONCE(stutter_pause_test) = 0; torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("torture_stutter task stopping"); + torture_kthread_stopping("torture_stutter"); return 0; } @@ 
-696,3 +696,21 @@ bool torture_must_stop_irq(void) return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP; } EXPORT_SYMBOL_GPL(torture_must_stop_irq); + +/* + * Each kthread must wait for kthread_should_stop() before returning from + * its top-level function, otherwise segfaults ensue. This function + * prints a "stopping" message and waits for kthread_should_stop(), and + * should be called from all torture kthreads immediately prior to + * returning. + */ +void torture_kthread_stopping(char *title) +{ + if (verbose) + VERBOSE_TOROUT_STRING(title); + while (!kthread_should_stop()) { + torture_shutdown_absorb(title); + schedule_timeout_uninterruptible(1); + } +} +EXPORT_SYMBOL_GPL(torture_kthread_stopping); -- cgit v1.2.3 From bc8f83e2c0d585b201dfbb52e98f6f8741d324ea Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Feb 2014 10:02:41 -0800 Subject: rcutorture: Fix missing-return bug in rcu_torture_barrier_init() This commit adds a missing error return to the code path that creates the rcu_torture_barrier() kthread. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bcaafd6cf633..25e9b16fe7f0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1245,6 +1245,7 @@ static int rcu_torture_barrier_init(void) ret = PTR_ERR(barrier_task); VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier"); barrier_task = NULL; + return ret; } torture_shuffle_task_register(barrier_task); return 0; -- cgit v1.2.3 From 47cf29b9e721967aac95ebda9e50408219755852 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Feb 2014 11:52:27 -0800 Subject: rcutorture: Abstract torture_create_kthread() Creation of kthreads is not RCU-specific, so this commit abstracts out torture_create_kthread(), saving a few tens of lines of code in the process. 
This change requires modifying VERBOSE_TOROUT_ERRSTRING() to take a non-const string, so that _torture_create_kthread() can avoid an open-coded substitute. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 98 ++++++++++--------------------------------------- kernel/torture.c | 80 +++++++++++++++++----------------------- 2 files changed, 53 insertions(+), 125 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 25e9b16fe7f0..a6f6c8418d87 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1105,19 +1105,9 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - int ret; - if (stall_cpu <= 0) return 0; - VERBOSE_TOROUT_STRING("Creating rcu_torture_stall task"); - stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); - if (IS_ERR(stall_task)) { - ret = PTR_ERR(stall_task); - stall_task = NULL; - return ret; - } - torture_shuffle_task_register(stall_task); - return 0; + return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } /* Clean up after the CPU-stall kthread, if one was spawned. 
*/ @@ -1226,29 +1216,13 @@ static int rcu_torture_barrier_init(void) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { init_waitqueue_head(&barrier_cbs_wq[i]); - VERBOSE_TOROUT_STRING("Creating rcu_torture_barrier_cbs task"); - barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, - (void *)(long)i, - "rcu_torture_barrier_cbs"); - if (IS_ERR(barrier_cbs_tasks[i])) { - ret = PTR_ERR(barrier_cbs_tasks[i]); - VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); - barrier_cbs_tasks[i] = NULL; + ret = torture_create_kthread(rcu_torture_barrier_cbs, + (void *)(long)i, + barrier_cbs_tasks[i]); + if (ret) return ret; - } - torture_shuffle_task_register(barrier_cbs_tasks[i]); } - VERBOSE_TOROUT_STRING("Creating rcu_torture_barrier task"); - barrier_task = kthread_run(rcu_torture_barrier, NULL, - "rcu_torture_barrier"); - if (IS_ERR(barrier_task)) { - ret = PTR_ERR(barrier_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier"); - barrier_task = NULL; - return ret; - } - torture_shuffle_task_register(barrier_task); - return 0; + return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task); } /* Clean up after RCU barrier testing. */ @@ -1516,17 +1490,10 @@ rcu_torture_init(void) /* Start up the kthreads. 
*/ - VERBOSE_TOROUT_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_create(rcu_torture_writer, NULL, - "rcu_torture_writer"); - if (IS_ERR(writer_task)) { - firsterr = PTR_ERR(writer_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create writer"); - writer_task = NULL; + firsterr = torture_create_kthread(rcu_torture_writer, NULL, + writer_task); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(writer_task); - wake_up_process(writer_task); fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -1535,16 +1502,10 @@ rcu_torture_init(void) goto unwind; } for (i = 0; i < nfakewriters; i++) { - VERBOSE_TOROUT_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); - if (IS_ERR(fakewriter_tasks[i])) { - firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_TOROUT_ERRSTRING("Failed to create fakewriter"); - fakewriter_tasks[i] = NULL; + firsterr = torture_create_kthread(rcu_torture_fakewriter, + NULL, fakewriter_tasks[i]); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(fakewriter_tasks[i]); } reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); @@ -1554,28 +1515,16 @@ rcu_torture_init(void) goto unwind; } for (i = 0; i < nrealreaders; i++) { - VERBOSE_TOROUT_STRING("Creating rcu_torture_reader task"); - reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, - "rcu_torture_reader"); - if (IS_ERR(reader_tasks[i])) { - firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_TOROUT_ERRSTRING("Failed to create reader"); - reader_tasks[i] = NULL; + firsterr = torture_create_kthread(rcu_torture_reader, NULL, + reader_tasks[i]); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(reader_tasks[i]); } if (stat_interval > 0) { - VERBOSE_TOROUT_STRING("Creating rcu_torture_stats task"); - stats_task = kthread_run(rcu_torture_stats, NULL, - "rcu_torture_stats"); 
- if (IS_ERR(stats_task)) { - firsterr = PTR_ERR(stats_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create stats"); - stats_task = NULL; + firsterr = torture_create_kthread(rcu_torture_stats, NULL, + stats_task); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(stats_task); } if (test_no_idle_hz) { firsterr = torture_shuffle_init(shuffle_interval * HZ); @@ -1593,16 +1542,9 @@ rcu_torture_init(void) fqs_duration = 0; if (fqs_duration) { /* Create the fqs thread */ - VERBOSE_TOROUT_STRING("Creating rcu_torture_fqs task"); - fqs_task = kthread_run(rcu_torture_fqs, NULL, - "rcu_torture_fqs"); - if (IS_ERR(fqs_task)) { - firsterr = PTR_ERR(fqs_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create fqs"); - fqs_task = NULL; + torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(fqs_task); } if (test_boost_interval < 1) test_boost_interval = 1; diff --git a/kernel/torture.c b/kernel/torture.c index 330576660cf4..439451821a7f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -180,23 +180,16 @@ torture_onoff(void *arg) */ int torture_onoff_init(long ooholdoff, long oointerval) { -#ifdef CONFIG_HOTPLUG_CPU - int ret; + int ret = 0; +#ifdef CONFIG_HOTPLUG_CPU onoff_holdoff = ooholdoff; onoff_interval = oointerval; if (onoff_interval <= 0) return 0; - VERBOSE_TOROUT_STRING("Creating torture_onoff task"); - onoff_task = kthread_run(torture_onoff, NULL, "torture_onoff"); - if (IS_ERR(onoff_task)) { - ret = PTR_ERR(onoff_task); - onoff_task = NULL; - return ret; - } - torture_shuffle_task_register(onoff_task); + ret = torture_create_kthread(torture_onoff, NULL, onoff_task); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ - return 0; + return ret; } EXPORT_SYMBOL_GPL(torture_onoff_init); @@ -379,8 +372,6 @@ static int torture_shuffle(void *arg) */ int torture_shuffle_init(long shuffint) { - int ret; - shuffle_interval = shuffint; shuffle_idle_cpu = -1; @@ -391,17 +382,7 @@ int torture_shuffle_init(long 
shuffint) } /* Create the shuffler thread */ - VERBOSE_TOROUT_STRING("Creating torture_shuffle task"); - shuffler_task = kthread_run(torture_shuffle, NULL, "torture_shuffle"); - if (IS_ERR(shuffler_task)) { - ret = PTR_ERR(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - VERBOSE_TOROUT_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - return ret; - } - torture_shuffle_task_register(shuffler_task); - return 0; + return torture_create_kthread(torture_shuffle, NULL, shuffler_task); } EXPORT_SYMBOL_GPL(torture_shuffle_init); @@ -483,25 +464,16 @@ static int torture_shutdown(void *arg) */ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) { - int ret; + int ret = 0; shutdown_secs = ssecs; torture_shutdown_hook = cleanup; if (shutdown_secs > 0) { - VERBOSE_TOROUT_STRING("Creating torture_shutdown task"); shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_create(torture_shutdown, NULL, - "torture_shutdown"); - if (IS_ERR(shutdown_task)) { - ret = PTR_ERR(shutdown_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - return ret; - } - torture_shuffle_task_register(shutdown_task); - wake_up_process(shutdown_task); + ret = torture_create_kthread(torture_shutdown, NULL, + shutdown_task); } - return 0; + return ret; } EXPORT_SYMBOL_GPL(torture_shutdown_init); @@ -595,16 +567,8 @@ int torture_stutter_init(int s) int ret; stutter = s; - VERBOSE_TOROUT_STRING("Creating torture_stutter task"); - stutter_task = kthread_run(torture_stutter, NULL, "torture_stutter"); - if (IS_ERR(stutter_task)) { - ret = PTR_ERR(stutter_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; - return ret; - } - torture_shuffle_task_register(stutter_task); - return 0; + ret = torture_create_kthread(torture_stutter, NULL, stutter_task); + return ret; } EXPORT_SYMBOL_GPL(torture_stutter_init); @@ -714,3 +678,25 @@ void torture_kthread_stopping(char *title) } } 
EXPORT_SYMBOL_GPL(torture_kthread_stopping); + +/* + * Create a generic torture kthread that is immediately runnable. If you + * need the kthread to be stopped so that you can do something to it before + * it starts, you will need to open-code your own. + */ +int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, + char *f, struct task_struct **tp) +{ + int ret = 0; + + VERBOSE_TOROUT_STRING(m); + *tp = kthread_run(fn, arg, s); + if (IS_ERR(*tp)) { + ret = PTR_ERR(*tp); + VERBOSE_TOROUT_ERRSTRING(f); + *tp = NULL; + } + torture_shuffle_task_register(*tp); + return ret; +} +EXPORT_SYMBOL_GPL(_torture_create_kthread); -- cgit v1.2.3 From 9c029b86098decd4660eec511b8d2d42da3e7dd9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Feb 2014 11:47:08 -0800 Subject: rcutorture: Abstract torture_stop_kthread() Stopping of kthreads is not RCU-specific, so this commit abstracts out torture_stop_kthread(), saving a few lines of code in the process. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 72 ++++++++++--------------------------------------- kernel/torture.c | 13 +++++++++ 2 files changed, 27 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a6f6c8418d87..37bd4beea198 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1033,14 +1033,12 @@ static void rcutorture_booster_cleanup(int cpu) if (boost_tasks[cpu] == NULL) return; mutex_lock(&boost_mutex); - VERBOSE_TOROUT_STRING("Stopping rcu_torture_boost task"); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; mutex_unlock(&boost_mutex); /* This must be outside of the mutex, otherwise deadlock! 
*/ - kthread_stop(t); - boost_tasks[cpu] = NULL; + torture_stop_kthread(rcu_torture_boost, t); } static int rcutorture_booster_init(int cpu) @@ -1110,16 +1108,6 @@ static int __init rcu_torture_stall_init(void) return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } -/* Clean up after the CPU-stall kthread, if one was spawned. */ -static void rcu_torture_stall_cleanup(void) -{ - if (stall_task == NULL) - return; - VERBOSE_TOROUT_STRING("Stopping rcu_torture_stall_task."); - kthread_stop(stall_task); - stall_task = NULL; -} - /* Callback function for RCU barrier testing. */ void rcu_torture_barrier_cbf(struct rcu_head *rcu) { @@ -1230,19 +1218,11 @@ static void rcu_torture_barrier_cleanup(void) { int i; - if (barrier_task != NULL) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_barrier task"); - kthread_stop(barrier_task); - barrier_task = NULL; - } + torture_stop_kthread(rcu_torture_barrier, barrier_task); if (barrier_cbs_tasks != NULL) { - for (i = 0; i < n_barrier_cbs; i++) { - if (barrier_cbs_tasks[i] != NULL) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_barrier_cbs task"); - kthread_stop(barrier_cbs_tasks[i]); - barrier_cbs_tasks[i] = NULL; - } - } + for (i = 0; i < n_barrier_cbs; i++) + torture_stop_kthread(rcu_torture_barrier_cbs, + barrier_cbs_tasks[i]); kfree(barrier_cbs_tasks); barrier_cbs_tasks = NULL; } @@ -1288,53 +1268,29 @@ rcu_torture_cleanup(void) } rcu_torture_barrier_cleanup(); - rcu_torture_stall_cleanup(); + torture_stop_kthread(rcu_torture_stall, stall_task); torture_stutter_cleanup(); - - if (writer_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_writer task"); - kthread_stop(writer_task); - } - writer_task = NULL; + torture_stop_kthread(rcu_torture_writer, writer_task); if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i]) { - VERBOSE_TOROUT_STRING( - "Stopping rcu_torture_reader task"); - kthread_stop(reader_tasks[i]); - } - reader_tasks[i] = NULL; - } + for (i = 0; i < nrealreaders; i++) + 
torture_stop_kthread(rcu_torture_reader, + reader_tasks[i]); kfree(reader_tasks); - reader_tasks = NULL; } rcu_torture_current = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i]) { - VERBOSE_TOROUT_STRING( - "Stopping rcu_torture_fakewriter task"); - kthread_stop(fakewriter_tasks[i]); - } - fakewriter_tasks[i] = NULL; + torture_stop_kthread(rcu_torture_fakewriter, + fakewriter_tasks[i]); } kfree(fakewriter_tasks); fakewriter_tasks = NULL; } - if (stats_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_stats task"); - kthread_stop(stats_task); - } - stats_task = NULL; - - if (fqs_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_fqs task"); - kthread_stop(fqs_task); - } - fqs_task = NULL; + torture_stop_kthread(rcu_torture_stats, stats_task); + torture_stop_kthread(rcu_torture_fqs, fqs_task); if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) { unregister_cpu_notifier(&rcutorture_cpu_nb); diff --git a/kernel/torture.c b/kernel/torture.c index 439451821a7f..871f63611f7f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -700,3 +700,16 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, return ret; } EXPORT_SYMBOL_GPL(_torture_create_kthread); + +/* + * Stop a generic kthread, emitting a message. + */ +void _torture_stop_kthread(char *m, struct task_struct **tp) +{ + if (*tp == NULL) + return; + VERBOSE_TOROUT_STRING(m); + kthread_stop(*tp); + *tp = NULL; +} +EXPORT_SYMBOL_GPL(_torture_stop_kthread); -- cgit v1.2.3 From bfefc73aa1d1bad317bccef8a15da39263d3d962 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Feb 2014 12:35:27 -0800 Subject: rcutorture: Stop generic kthreads in torture_cleanup() The specific torture modules (like rcutorture) need to call torture_cleanup() in any case, so this commit makes torture_cleanup() deal with torture_shutdown_cleanup() and torture_stutter_cleanup() so that the specific modules don't have to deal with these details. 
Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 7 ------- kernel/torture.c | 37 +++++++++++++++++++------------------ 2 files changed, 19 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 37bd4beea198..40792e76a116 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -53,11 +53,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); -MODULE_ALIAS("rcutorture"); -#ifdef MODULE_PARAM_PREFIX -#undef MODULE_PARAM_PREFIX -#endif -#define MODULE_PARAM_PREFIX "rcutorture." torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); @@ -1269,7 +1264,6 @@ rcu_torture_cleanup(void) rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_stall, stall_task); - torture_stutter_cleanup(); torture_stop_kthread(rcu_torture_writer, writer_task); if (reader_tasks) { @@ -1297,7 +1291,6 @@ rcu_torture_cleanup(void) for_each_possible_cpu(i) rcutorture_booster_cleanup(i); } - torture_shutdown_cleanup(); /* Wait for all RCU callbacks to fire. */ diff --git a/kernel/torture.c b/kernel/torture.c index 871f63611f7f..b26c7b42becd 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -477,20 +477,6 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) } EXPORT_SYMBOL_GPL(torture_shutdown_init); -/* - * Shut down the shutdown task. Say what??? Heh! This can happen if - * the torture module gets an rmmod before the shutdown time arrives. ;-) - */ -void torture_shutdown_cleanup(void) -{ - if (shutdown_task != NULL) { - VERBOSE_TOROUT_STRING("Stopping torture_shutdown task"); - kthread_stop(shutdown_task); - } - shutdown_task = NULL; -} -EXPORT_SYMBOL_GPL(torture_shutdown_cleanup); - /* * Detect and respond to a system shutdown. */ @@ -512,6 +498,20 @@ static struct notifier_block torture_shutdown_nb = { .notifier_call = torture_shutdown_notify, }; +/* + * Shut down the shutdown task. Say what??? Heh! 
This can happen if + * the torture module gets an rmmod before the shutdown time arrives. ;-) + */ +static void torture_shutdown_cleanup(void) +{ + unregister_reboot_notifier(&torture_shutdown_nb); + if (shutdown_task != NULL) { + VERBOSE_TOROUT_STRING("Stopping torture_shutdown task"); + kthread_stop(shutdown_task); + } + shutdown_task = NULL; +} + /* * Variables for stuttering, which means to periodically pause and * restart testing in order to catch bugs that appear when load is @@ -575,7 +575,7 @@ EXPORT_SYMBOL_GPL(torture_stutter_init); /* * Cleanup after the torture_stutter kthread. */ -void torture_stutter_cleanup(void) +static void torture_stutter_cleanup(void) { if (!stutter_task) return; @@ -583,7 +583,6 @@ void torture_stutter_cleanup(void) kthread_stop(stutter_task); stutter_task = NULL; } -EXPORT_SYMBOL_GPL(torture_stutter_cleanup); /* * Initialize torture module. Please note that this is -not- invoked via @@ -619,7 +618,8 @@ EXPORT_SYMBOL_GPL(torture_init_end); * Clean up torture module. Please note that this is -not- invoked via * the usual module_exit() mechanism, but rather by an explicit call from * the client torture module. Returns true if a race with system shutdown - * is detected. + * is detected, otherwise, all kthreads started by functions in this file + * will be shut down. * * This must be called before the caller starts shutting down its own * kthreads. @@ -635,8 +635,9 @@ bool torture_cleanup(void) } ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&torture_shutdown_nb); + torture_shutdown_cleanup(); torture_shuffle_cleanup(); + torture_stutter_cleanup(); torture_onoff_cleanup(); return false; } -- cgit v1.2.3 From 0af3fe1efa534a43385fe2694c42ffec7a310e46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Feb 2014 15:51:41 -0800 Subject: locktorture: Add a lock-torture kernel module This commit adds the locking counterpart to rcutorture. Signed-off-by: Paul E. 
McKenney [ paulmck: Make n_lock_torture_errors and torture_spinlock static as suggested by Fengguang Wu. ] Reviewed-by: Josh Triplett --- kernel/locking/Makefile | 1 + kernel/locking/locktorture.c | 421 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 422 insertions(+) create mode 100644 kernel/locking/locktorture.c (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index baab8e5e7f66..a28ea6d9e6e8 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o +obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c new file mode 100644 index 000000000000..d69d20d9c9db --- /dev/null +++ b/kernel/locking/locktorture.c @@ -0,0 +1,421 @@ +/* + * Module-based torture test facility for locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2014 + * + * Author: Paul E. McKenney + * Based on kernel/rcu/torture.c. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +torture_param(int, nwriters_stress, -1, + "Number of write-locking stress-test threads"); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, + "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, + "Number of jiffies between shuffles, 0=disable"); +torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); +torture_param(int, stat_interval, 60, + "Number of seconds between stats printk()s"); +torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); +torture_param(bool, verbose, true, + "Enable verbose debugging printk()s"); + +static char *torture_type = "spin_lock"; +module_param(torture_type, charp, 0444); +MODULE_PARM_DESC(torture_type, + "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); + +static atomic_t n_lock_torture_errors; + +static struct task_struct *stats_task; +static struct task_struct **writer_tasks; + +static int nrealwriters_stress; +static bool lock_is_write_held; + +struct lock_writer_stress_stats { + long n_write_lock_fail; + long n_write_lock_acquired; +}; +static struct lock_writer_stress_stats *lwsa; + +#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) +#define LOCKTORTURE_RUNNABLE_INIT 1 +#else +#define LOCKTORTURE_RUNNABLE_INIT 0 +#endif +int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; +module_param(locktorture_runnable, int, 0444); +MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); + +/* Forward reference. */ +static void lock_torture_cleanup(void); + +/* + * Operations vector for selecting different types of tests. 
+ */ +struct lock_torture_ops { + void (*init)(void); + int (*writelock)(void); + void (*write_delay)(struct torture_random_state *trsp); + void (*writeunlock)(void); + unsigned long flags; + const char *name; +}; + +static struct lock_torture_ops *cur_ops; + +/* + * Definitions for lock torture testing. + */ + +static DEFINE_SPINLOCK(torture_spinlock); + +static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) +{ + spin_lock(&torture_spinlock); + return 0; +} + +static void torture_spin_lock_write_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_us = 100; + + /* We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_us))) + mdelay(longdelay_us); + if (!(torture_random(trsp) % + (nrealwriters_stress * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. 
*/ +#endif +} + +static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) +{ + spin_unlock(&torture_spinlock); +} + +static struct lock_torture_ops spin_lock_ops = { + .writelock = torture_spin_lock_write_lock, + .write_delay = torture_spin_lock_write_delay, + .writeunlock = torture_spin_lock_write_unlock, + .name = "spin_lock" +}; + +static int torture_spin_lock_write_lock_irq(void) +__acquires(torture_spinlock_irq) +{ + unsigned long flags; + + spin_lock_irqsave(&torture_spinlock, flags); + cur_ops->flags = flags; + return 0; +} + +static void torture_lock_spin_write_unlock_irq(void) +__releases(torture_spinlock) +{ + spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); +} + +static struct lock_torture_ops spin_lock_irq_ops = { + .writelock = torture_spin_lock_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .writeunlock = torture_lock_spin_write_unlock_irq, + .name = "spin_lock_irq" +}; + +/* + * Lock torture writer kthread. Repeatedly acquires and releases + * the lock, checking for duplicate acquisitions. + */ +static int lock_torture_writer(void *arg) +{ + struct lock_writer_stress_stats *lwsp = arg; + static DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_TOROUT_STRING("lock_torture_writer task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1); + cur_ops->writelock(); + if (WARN_ON_ONCE(lock_is_write_held)) + lwsp->n_write_lock_fail++; + lock_is_write_held = 1; + lwsp->n_write_lock_acquired++; + cur_ops->write_delay(&rand); + lock_is_write_held = 0; + cur_ops->writeunlock(); + stutter_wait("lock_torture_writer"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_writer"); + return 0; +} + +/* + * Create an lock-torture-statistics message in the specified buffer. 
+ */ +static void lock_torture_printk(char *page) +{ + bool fail = 0; + int i; + long max = 0; + long min = lwsa[0].n_write_lock_acquired; + long long sum = 0; + + for (i = 0; i < nrealwriters_stress; i++) { + if (lwsa[i].n_write_lock_fail) + fail = true; + sum += lwsa[i].n_write_lock_acquired; + if (max < lwsa[i].n_write_lock_fail) + max = lwsa[i].n_write_lock_fail; + if (min > lwsa[i].n_write_lock_fail) + min = lwsa[i].n_write_lock_fail; + } + page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, + "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", + sum, max, min, max / 2 > min ? "???" : "", + fail, fail ? "!!!" : ""); + if (fail) + atomic_inc(&n_lock_torture_errors); +} + +/* + * Print torture statistics. Caller must ensure that there is only one + * call to this function at a given time!!! This is normally accomplished + * by relying on the module system to only have one copy of the module + * loaded, and then by giving the lock_torture_stats kthread full control + * (or the init/cleanup functions when lock_torture_stats thread is not + * running). + */ +static void lock_torture_stats_print(void) +{ + int size = nrealwriters_stress * 200 + 8192; + char *buf; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("lock_torture_stats_print: Out of memory, need: %d", + size); + return; + } + lock_torture_printk(buf); + pr_alert("%s", buf); + kfree(buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. 
+ */ +static int lock_torture_stats(void *arg) +{ + VERBOSE_TOROUT_STRING("lock_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + lock_torture_stats_print(); + torture_shutdown_absorb("lock_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_stats"); + return 0; +} + +static inline void +lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, + const char *tag) +{ + pr_alert("%s" TORTURE_FLAG + "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, nrealwriters_stress, stat_interval, verbose, + shuffle_interval, stutter, shutdown_secs, + onoff_interval, onoff_holdoff); +} + +static void lock_torture_cleanup(void) +{ + int i; + + if (torture_cleanup()) + return; + + if (writer_tasks) { + for (i = 0; i < nrealwriters_stress; i++) + torture_stop_kthread(lock_torture_writer, + writer_tasks[i]); + kfree(writer_tasks); + writer_tasks = NULL; + } + + torture_stop_kthread(lock_torture_stats, stats_task); + lock_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (atomic_read(&n_lock_torture_errors)) + lock_torture_print_module_parms(cur_ops, + "End of test: FAILURE"); + else if (torture_onoff_failures()) + lock_torture_print_module_parms(cur_ops, + "End of test: LOCK_HOTPLUG"); + else + lock_torture_print_module_parms(cur_ops, + "End of test: SUCCESS"); +} + +static int __init lock_torture_init(void) +{ + int i; + int firsterr = 0; + static struct lock_torture_ops *torture_ops[] = { + &spin_lock_ops, &spin_lock_irq_ops, + }; + + torture_init_begin(torture_type, verbose, &locktorture_runnable); + + /* Process args and tell the world that the torturer is on the job. 
*/ + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(torture_ops)) { + pr_alert("lock-torture: invalid torture type: \"%s\"\n", + torture_type); + pr_alert("lock-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + pr_alert(" %s", torture_ops[i]->name); + pr_alert("\n"); + torture_init_end(); + return -EINVAL; + } + if (cur_ops->init) + cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + + if (nwriters_stress >= 0) + nrealwriters_stress = nwriters_stress; + else + nrealwriters_stress = 2 * num_online_cpus(); + lock_torture_print_module_parms(cur_ops, "Start of test"); + + /* Initialize the statistics so that each run gets its own numbers. */ + + lock_is_write_held = 0; + lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); + if (lwsa == NULL) { + VERBOSE_TOROUT_STRING("lwsa: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters_stress; i++) { + lwsa[i].n_write_lock_fail = 0; + lwsa[i].n_write_lock_acquired = 0; + } + + /* Start up the kthreads. 
*/ + + if (onoff_interval > 0) { + firsterr = torture_onoff_init(onoff_holdoff * HZ, + onoff_interval * HZ); + if (firsterr) + goto unwind; + } + if (shuffle_interval > 0) { + firsterr = torture_shuffle_init(shuffle_interval); + if (firsterr) + goto unwind; + } + if (shutdown_secs > 0) { + firsterr = torture_shutdown_init(shutdown_secs, + lock_torture_cleanup); + if (firsterr) + goto unwind; + } + if (stutter > 0) { + firsterr = torture_stutter_init(stutter); + if (firsterr) + goto unwind; + } + + writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), + GFP_KERNEL); + if (writer_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters_stress; i++) { + firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], + writer_tasks[i]); + if (firsterr) + goto unwind; + } + if (stat_interval > 0) { + firsterr = torture_create_kthread(lock_torture_stats, NULL, + stats_task); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + lock_torture_cleanup(); + return firsterr; +} + +module_init(lock_torture_init); +module_exit(lock_torture_cleanup); -- cgit v1.2.3 From ff20e251c409da81f2b850c81964908fb4c6fe66 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Feb 2014 08:45:56 -0800 Subject: rcutorture: Add an rcu_busted to test the test This commit adds a deliberately buggy RCU implementation into rcutorture to allow easy checking that rcutorture correctly flags buggy RCU implementations. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 40792e76a116..da6c38d909f1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -371,6 +371,48 @@ static struct rcu_torture_ops rcu_bh_ops = { .name = "rcu_bh" }; +/* + * Don't even think about trying any of these in real life!!! + * The names includes "busted", and they really means it! + * The only purpose of these functions is to provide a buggy RCU + * implementation to make sure that rcutorture correctly emits + * buggy-RCU error messages. + */ +static void rcu_busted_torture_deferred_free(struct rcu_torture *p) +{ + /* This is a deliberate bug for testing purposes only! */ + rcu_torture_cb(&p->rtort_rcu); +} + +static void synchronize_rcu_busted(void) +{ + /* This is a deliberate bug for testing purposes only! */ +} + +static void +call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + /* This is a deliberate bug for testing purposes only! */ + func(head); +} + +static struct rcu_torture_ops rcu_busted_ops = { + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_busted_torture_deferred_free, + .sync = synchronize_rcu_busted, + .exp_sync = synchronize_rcu_busted, + .call = call_rcu_busted, + .cb_barrier = NULL, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_busted" +}; + /* * Definitions for srcu torture testing. 
*/ @@ -1371,7 +1413,7 @@ rcu_torture_init(void) int cpu; int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, }; torture_init_begin(torture_type, verbose, &rcutorture_runnable); -- cgit v1.2.3 From f881825a73543e9664c7fe7166e06f5f4d569834 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Feb 2014 14:42:51 -0800 Subject: rcutorture: Gracefully handle NULL cleanup hooks Although most torture tests will have some cleanup hook, it is possible that one might not. This commit therefore enables graceful handling of a NULL cleanup hook during torture-test shutdown. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index b26c7b42becd..acc9afc2f26e 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -454,7 +454,10 @@ static int torture_shutdown(void *arg) VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system"); shutdown_task = NULL; /* Avoid self-kill deadlock. */ - torture_shutdown_hook();/* Shut down the enclosing torture test. */ + if (torture_shutdown_hook) + torture_shutdown_hook(); + else + VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); kernel_power_off(); /* Shut down the system. */ return 0; } -- cgit v1.2.3 From e086481baf9d0436bdd6e9b739bfa4a83fb89ef5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Feb 2014 08:05:07 -0800 Subject: rcutorture: Add a lock_busted to test the test This commit adds a maximally broken locking primitive in which lock acquisition and release are both no-ops. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/locking/locktorture.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index d69d20d9c9db..f26b1a18e34e 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -112,6 +112,37 @@ static struct lock_torture_ops *cur_ops; * Definitions for lock torture testing. */ +static int torture_lock_busted_write_lock(void) +{ + return 0; /* BUGGY, do not use in real life!!! */ +} + +static void torture_lock_busted_write_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_us = 100; + + /* We want a long delay occasionally to force massive contention. */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_us))) + mdelay(longdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_lock_busted_write_unlock(void) +{ + /* BUGGY, do not use in real life!!! 
*/ +} + +static struct lock_torture_ops lock_busted_ops = { + .writelock = torture_lock_busted_write_lock, + .write_delay = torture_lock_busted_write_delay, + .writeunlock = torture_lock_busted_write_unlock, + .name = "lock_busted" +}; + static DEFINE_SPINLOCK(torture_spinlock); static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) @@ -320,7 +351,7 @@ static int __init lock_torture_init(void) int i; int firsterr = 0; static struct lock_torture_ops *torture_ops[] = { - &spin_lock_ops, &spin_lock_irq_ops, + &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, }; torture_init_begin(torture_type, verbose, &locktorture_runnable); -- cgit v1.2.3 From f5645d3575fe9a2c468889146e152f11c199826e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 21 Feb 2014 14:19:30 -0800 Subject: capability: Use current logging styles Prefix logging output with "capability: " via pr_fmt. Convert printks to pr_. Use pr__once instead of guard flags. Coalesce formats. Signed-off-by: Joe Perches Acked-by: Serge E. Hallyn Signed-off-by: James Morris --- kernel/capability.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 4e66bf9275b0..d6a6c91863ff 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -7,6 +7,8 @@ * 30 May 2002: Cleanup, Robert M. 
Love */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -42,15 +44,10 @@ __setup("no_file_caps", file_caps_disable); static void warn_legacy_capability_use(void) { - static int warned; - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses 32-bit capabilities" - " (legacy support in use)\n", - get_task_comm(name, current)); - warned = 1; - } + char name[sizeof(current->comm)]; + + pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n", + get_task_comm(name, current)); } /* @@ -71,16 +68,10 @@ static void warn_legacy_capability_use(void) static void warn_deprecated_v2(void) { - static int warned; + char name[sizeof(current->comm)]; - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses deprecated v2" - " capabilities in a way that may be insecure.\n", - get_task_comm(name, current)); - warned = 1; - } + pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", + get_task_comm(name, current)); } /* @@ -380,7 +371,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap) bool ns_capable(struct user_namespace *ns, int cap) { if (unlikely(!cap_valid(cap))) { - printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); + pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } -- cgit v1.2.3 From 5fd77595ec62141fa71e575bdbf410e0192f87d0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 24 Feb 2014 16:39:55 +0100 Subject: smp: Iterate functions through llist_for_each_entry_safe() The IPI function llist iteration is open coded. Lets simplify this with using an llist iterator. Also we want to keep the iteration safe against possible csd.llist->next value reuse from the IPI handler. At least the block subsystem used to do such things so lets stay careful and use llist_for_each_entry_safe(). 
Signed-off-by: Jan Kara Cc: Andrew Morton Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Jens Axboe Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- kernel/smp.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index ffee35bef179..e3852de042a6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -151,7 +151,8 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) */ void generic_smp_call_function_single_interrupt(void) { - struct llist_node *entry, *next; + struct llist_node *entry; + struct call_single_data *csd, *csd_next; /* * Shouldn't receive this interrupt on a cpu that is not yet online. @@ -161,16 +162,9 @@ void generic_smp_call_function_single_interrupt(void) entry = llist_del_all(&__get_cpu_var(call_single_queue)); entry = llist_reverse_order(entry); - while (entry) { - struct call_single_data *csd; - - next = entry->next; - - csd = llist_entry(entry, struct call_single_data, llist); + llist_for_each_entry_safe(csd, csd_next, entry, llist) { csd->func(csd->info); csd_unlock(csd); - - entry = next; } } -- cgit v1.2.3 From 08eed44c7249d381a099bc55577e55c6bb533160 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 24 Feb 2014 16:39:57 +0100 Subject: smp: Teach __smp_call_function_single() to check for offline cpus Align __smp_call_function_single() with smp_call_function_single() so that it also checks whether requested cpu is still online. 
Signed-off-by: Jan Kara Cc: Andrew Morton Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Jens Axboe Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- kernel/smp.c | 11 +++++++---- kernel/up.c | 5 +++-- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index e3852de042a6..5ff14e3739ca 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -276,18 +276,18 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); /** * __smp_call_function_single(): Run a function on a specific CPU * @cpu: The CPU to run on. - * @data: Pre-allocated and setup data structure + * @csd: Pre-allocated and setup data structure * @wait: If true, wait until function has completed on specified CPU. * * Like smp_call_function_single(), but allow caller to pass in a * pre-allocated data structure. Useful for embedding @data inside * other structures, for instance. */ -void __smp_call_function_single(int cpu, struct call_single_data *csd, - int wait) +int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) { unsigned int this_cpu; unsigned long flags; + int err = 0; this_cpu = get_cpu(); /* @@ -303,11 +303,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd, local_irq_save(flags); csd->func(csd->info); local_irq_restore(flags); - } else { + } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { csd_lock(csd); generic_exec_single(cpu, csd, wait); + } else { + err = -ENXIO; /* CPU not online */ } put_cpu(); + return err; } EXPORT_SYMBOL_GPL(__smp_call_function_single); diff --git a/kernel/up.c b/kernel/up.c index 509403e3fbc6..cdf03d16840e 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,14 +22,15 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -void __smp_call_function_single(int cpu, struct call_single_data *csd, - int wait) +int __smp_call_function_single(int cpu, struct call_single_data *csd, + int wait) { unsigned 
long flags; local_irq_save(flags); csd->func(csd->info); local_irq_restore(flags); + return 0; } EXPORT_SYMBOL(__smp_call_function_single); -- cgit v1.2.3 From 8b28499a71d3431c9128abc743e2d2bfbdae3ed4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 24 Feb 2014 16:39:58 +0100 Subject: smp: Consolidate the various smp_call_function_single() declensions __smp_call_function_single() and smp_call_function_single() share some code that can be factorized: execute inline when the target is local, check if the target is online, lock the csd, call generic_exec_single(). Lets move the common parts to generic_exec_single(). Reviewed-by: Jan Kara Cc: Andrew Morton Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- kernel/smp.c | 80 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 39 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 5ff14e3739ca..64bb0d48e96f 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -117,13 +117,43 @@ static void csd_unlock(struct call_single_data *csd) csd->flags &= ~CSD_FLAG_LOCK; } +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); + /* * Insert a previously allocated call_single_data element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. 
*/ -static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) +static int generic_exec_single(int cpu, struct call_single_data *csd, + smp_call_func_t func, void *info, int wait) { + struct call_single_data csd_stack = { .flags = 0 }; + unsigned long flags; + + + if (cpu == smp_processor_id()) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + return 0; + } + + + if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; + + + if (!csd) { + csd = &csd_stack; + if (!wait) + csd = &__get_cpu_var(csd_data); + } + + csd_lock(csd); + + csd->func = func; + csd->info = info; + if (wait) csd->flags |= CSD_FLAG_WAIT; @@ -143,6 +173,8 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) if (wait) csd_lock_wait(csd); + + return 0; } /* @@ -168,8 +200,6 @@ void generic_smp_call_function_single_interrupt(void) } } -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); - /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. 
@@ -181,12 +211,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { - struct call_single_data d = { - .flags = 0, - }; - unsigned long flags; int this_cpu; - int err = 0; + int err; /* * prevent preemption and reschedule on another processor, @@ -203,26 +229,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); - if (cpu == this_cpu) { - local_irq_save(flags); - func(info); - local_irq_restore(flags); - } else { - if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *csd = &d; - - if (!wait) - csd = &__get_cpu_var(csd_data); - - csd_lock(csd); - - csd->func = func; - csd->info = info; - generic_exec_single(cpu, csd, wait); - } else { - err = -ENXIO; /* CPU not online */ - } - } + err = generic_exec_single(cpu, NULL, func, info, wait); put_cpu(); @@ -285,9 +292,8 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); */ int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) { - unsigned int this_cpu; - unsigned long flags; int err = 0; + int this_cpu; this_cpu = get_cpu(); /* @@ -296,20 +302,12 @@ int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) * send smp call function interrupt to this cpu and as such deadlocks * can't happen. 
*/ - WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() + WARN_ON_ONCE(cpu_online(this_cpu) && wait && irqs_disabled() && !oops_in_progress); - if (cpu == this_cpu) { - local_irq_save(flags); - csd->func(csd->info); - local_irq_restore(flags); - } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - csd_lock(csd); - generic_exec_single(cpu, csd, wait); - } else { - err = -ENXIO; /* CPU not online */ - } + err = generic_exec_single(cpu, csd, csd->func, csd->info, wait); put_cpu(); + return err; } EXPORT_SYMBOL_GPL(__smp_call_function_single); -- cgit v1.2.3 From d7877c03f1b62de06f9c00417952f39f56c1ab00 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 24 Feb 2014 16:39:59 +0100 Subject: smp: Move __smp_call_function_single() below its safe version Move this function closer to __smp_call_function_single(). These functions have very similar behavior and should be displayed in the same block for clarity. Reviewed-by: Jan Kara Cc: Andrew Morton Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- kernel/smp.c | 64 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 64bb0d48e96f..fa04ab938e52 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -237,6 +237,38 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, } EXPORT_SYMBOL(smp_call_function_single); +/** + * __smp_call_function_single(): Run a function on a specific CPU + * @cpu: The CPU to run on. + * @csd: Pre-allocated and setup data structure + * @wait: If true, wait until function has completed on specified CPU. + * + * Like smp_call_function_single(), but allow caller to pass in a + * pre-allocated data structure. Useful for embedding @data inside + * other structures, for instance. 
+ */ +int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) +{ + int err = 0; + int this_cpu; + + this_cpu = get_cpu(); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && wait && irqs_disabled() + && !oops_in_progress); + + err = generic_exec_single(cpu, csd, csd->func, csd->info, wait); + put_cpu(); + + return err; +} +EXPORT_SYMBOL_GPL(__smp_call_function_single); + /* * smp_call_function_any - Run a function on any of the given cpus * @mask: The mask of cpus it can run on. @@ -280,38 +312,6 @@ call: } EXPORT_SYMBOL_GPL(smp_call_function_any); -/** - * __smp_call_function_single(): Run a function on a specific CPU - * @cpu: The CPU to run on. - * @csd: Pre-allocated and setup data structure - * @wait: If true, wait until function has completed on specified CPU. - * - * Like smp_call_function_single(), but allow caller to pass in a - * pre-allocated data structure. Useful for embedding @data inside - * other structures, for instance. - */ -int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) -{ - int err = 0; - int this_cpu; - - this_cpu = get_cpu(); - /* - * Can deadlock when called with interrupts disabled. - * We allow cpu's that are not yet online though, as no one else can - * send smp call function interrupt to this cpu and as such deadlocks - * can't happen. - */ - WARN_ON_ONCE(cpu_online(this_cpu) && wait && irqs_disabled() - && !oops_in_progress); - - err = generic_exec_single(cpu, csd, csd->func, csd->info, wait); - put_cpu(); - - return err; -} -EXPORT_SYMBOL_GPL(__smp_call_function_single); - /** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). 
-- cgit v1.2.3 From e0a23b0628b10d25f2c178be6fcfc17c1ab49fda Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 24 Feb 2014 16:40:00 +0100 Subject: watchdog: Simplify a little the IPI call In order to remotely restart the watchdog hrtimer, update_timers() allocates a csd on the stack and pass it to __smp_call_function_single(). There is no partcular need, however, for a specific csd here. Lets simplify that a little by calling smp_call_function_single() which can already take care of the csd allocation by itself. Acked-by: Don Zickus Reviewed-by: Michal Hocko Cc: Andrew Morton Cc: Christoph Hellwig Cc: Don Zickus Cc: Ingo Molnar Cc: Jan Kara Cc: Jens Axboe Cc: Michal Hocko Cc: Srivatsa S. Bhat Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- kernel/watchdog.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4431610f049a..01c6f979486f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -505,7 +505,6 @@ static void restart_watchdog_hrtimer(void *info) static void update_timers(int cpu) { - struct call_single_data data = {.func = restart_watchdog_hrtimer}; /* * Make sure that perf event counter will adopt to a new * sampling period. Updating the sampling period directly would @@ -515,7 +514,7 @@ static void update_timers(int cpu) * might be late already so we have to restart the timer as well. */ watchdog_nmi_disable(cpu); - __smp_call_function_single(cpu, &data, 1); + smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); watchdog_nmi_enable(cpu); } -- cgit v1.2.3 From fce8ad1568c57e7f334018dec4fa1744c926c135 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 24 Feb 2014 16:40:01 +0100 Subject: smp: Remove wait argument from __smp_call_function_single() The main point of calling __smp_call_function_single() is to send an IPI in a pure asynchronous way. 
By embedding a csd in an object, a caller can send the IPI without waiting for a previous one to complete as is required by smp_call_function_single() for example. As such, sending this kind of IPI can be safe even when irqs are disabled. This flexibility comes at the expense of the caller who then needs to synchronize the csd lifecycle by himself and make sure that IPIs on a single csd are serialized. This is how __smp_call_function_single() works when wait = 0 and this use case is relevant. Now there doesn't seem to be any use case with wait = 1 that can't be covered by smp_call_function_single() instead, which is safer. Let's look at the two possible scenarios: 1) The user calls __smp_call_function_single(wait = 1) on a csd embedded in an object. It looks like a nice and convenient pattern at first sight because we can then retrieve the object from the IPI handler easily. But actually it is a waste of memory space in the object since the csd can be allocated from the stack by smp_call_function_single(wait = 1) and the object can be passed as the IPI argument. Besides that, embedding the csd in an object is more error prone because the caller must take care of the serialization of the IPIs for this csd. 2) The user calls __smp_call_function_single(wait = 1) on a csd that is allocated on the stack. It's ok but smp_call_function_single() can do it as well and it already takes care of the allocation on the stack. Again it's simpler and less error prone. Therefore, using the underscore-prefixed API version with wait = 1 is a bad pattern and a sign that the caller can do something safer and simpler. There was a single user of that which has just been converted. So let's remove this option to discourage further users.
Cc: Andrew Morton Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- kernel/sched/core.c | 2 +- kernel/smp.c | 19 ++++--------------- kernel/up.c | 3 +-- 3 files changed, 6 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..eba3d84765f3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -432,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { __hrtick_restart(rq); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); rq->hrtick_csd_pending = 1; } } diff --git a/kernel/smp.c b/kernel/smp.c index fa04ab938e52..b76763189752 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -241,29 +241,18 @@ EXPORT_SYMBOL(smp_call_function_single); * __smp_call_function_single(): Run a function on a specific CPU * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure - * @wait: If true, wait until function has completed on specified CPU. * * Like smp_call_function_single(), but allow caller to pass in a * pre-allocated data structure. Useful for embedding @data inside * other structures, for instance. */ -int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) +int __smp_call_function_single(int cpu, struct call_single_data *csd) { int err = 0; - int this_cpu; - this_cpu = get_cpu(); - /* - * Can deadlock when called with interrupts disabled. - * We allow cpu's that are not yet online though, as no one else can - * send smp call function interrupt to this cpu and as such deadlocks - * can't happen. 
- */ - WARN_ON_ONCE(cpu_online(this_cpu) && wait && irqs_disabled() - && !oops_in_progress); - - err = generic_exec_single(cpu, csd, csd->func, csd->info, wait); - put_cpu(); + preempt_disable(); + err = generic_exec_single(cpu, csd, csd->func, csd->info, 0); + preempt_enable(); return err; } diff --git a/kernel/up.c b/kernel/up.c index cdf03d16840e..4e199d4cef8e 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,8 +22,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -int __smp_call_function_single(int cpu, struct call_single_data *csd, - int wait) +int __smp_call_function_single(int cpu, struct call_single_data *csd) { unsigned long flags; -- cgit v1.2.3 From c46fff2a3b29794b35d717b5680a27f31a6a6bc0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 24 Feb 2014 16:40:02 +0100 Subject: smp: Rename __smp_call_function_single() to smp_call_function_single_async() The name __smp_call_function_single() doesn't tell much about the properties of this function, especially when compared to smp_call_function_single(). The comments above the implementation are also misleading. The main point of this function is actually not to be able to embed the csd in an object. This is actually a requirement that result from the purpose of this function which is to raise an IPI asynchronously. As such it can be called with interrupts disabled. And this feature comes at the cost of the caller who then needs to serialize the IPIs on this csd. Lets rename the function and enhance the comments so that they reflect these properties. 
Suggested-by: Christoph Hellwig Cc: Andrew Morton Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- kernel/sched/core.c | 2 +- kernel/smp.c | 19 +++++++++++++------ kernel/up.c | 4 ++-- 3 files changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index eba3d84765f3..0cca04a53de0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -432,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { __hrtick_restart(rq); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); rq->hrtick_csd_pending = 1; } } diff --git a/kernel/smp.c b/kernel/smp.c index b76763189752..06d574e42c72 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -238,15 +238,22 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, EXPORT_SYMBOL(smp_call_function_single); /** - * __smp_call_function_single(): Run a function on a specific CPU + * smp_call_function_single_async(): Run an asynchronous function on a + * specific CPU. * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure * - * Like smp_call_function_single(), but allow caller to pass in a - * pre-allocated data structure. Useful for embedding @data inside - * other structures, for instance. + * Like smp_call_function_single(), but the call is asynchonous and + * can thus be done from contexts with disabled interrupts. + * + * The caller passes his own pre-allocated data structure + * (ie: embedded in an object) and is responsible for synchronizing it + * such that the IPIs performed on the @csd are strictly serialized. + * + * NOTE: Be careful, there is unfortunately no current debugging facility to + * validate the correctness of this serialization. 
*/ -int __smp_call_function_single(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, struct call_single_data *csd) { int err = 0; @@ -256,7 +263,7 @@ int __smp_call_function_single(int cpu, struct call_single_data *csd) return err; } -EXPORT_SYMBOL_GPL(__smp_call_function_single); +EXPORT_SYMBOL_GPL(smp_call_function_single_async); /* * smp_call_function_any - Run a function on any of the given cpus diff --git a/kernel/up.c b/kernel/up.c index 4e199d4cef8e..1760bf3d1463 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,7 +22,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -int __smp_call_function_single(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, struct call_single_data *csd) { unsigned long flags; @@ -31,7 +31,7 @@ int __smp_call_function_single(int cpu, struct call_single_data *csd) local_irq_restore(flags); return 0; } -EXPORT_SYMBOL(__smp_call_function_single); +EXPORT_SYMBOL(smp_call_function_single_async); int on_each_cpu(smp_call_func_t func, void *info, int wait) { -- cgit v1.2.3 From c75611282cf1bf717c1866e7a7eb4d0743815187 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:01 -0500 Subject: cgroup: add css_set->mg_tasks Currently, while migrating tasks from one cgroup to another, cgroup_attach_task() builds a flex array of all target tasks; unfortunately, this has a couple issues. * Flex array has size limit. On 64bit, struct task_and_cgroup is 24bytes making the flex element limit around 87k. It is a high number but not impossible to hit. This means that the current cgroup implementation can't migrate a process with more than 87k threads. * Process migration involves memory allocation whose size is dependent on the number of threads the process has. 
This means that cgroup core can't guarantee success or failure of multi-process migrations as memory allocation failure can happen in the middle. This is in part because cgroup can't grab threadgroup locks of multiple processes at the same time, so when there are multiple processes to migrate, it is impossible to tell how many tasks are to be migrated beforehand. Note that this already affects cgroup_transfer_tasks(). cgroup currently cannot guarantee atomic success or failure of the operation. It may fail in the middle and after such failure cgroup doesn't have enough information to roll back properly. It just aborts with some tasks migrated and others not. To resolve the situation, we're going to use task->cg_list during migration too. Instead of building a separate array, target tasks will be linked into a dedicated migration list_head on the owning css_set. Tasks on the migration list are treated the same as tasks on the usual tasks list; however, being on a separate list allows cgroup migration code path to keep track of the target tasks by simply keeping the list of css_sets with tasks being migrated, making unpredictable dynamic allocation unnecessary. In preparation for such a migration path update, this patch introduces css_set->mg_tasks list and updates css_set task iterations so that they walk both css_set->tasks and ->mg_tasks. Note that ->mg_tasks isn't used yet.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 56 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8ab800c7bac0..b80c611ff836 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -644,6 +644,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, atomic_set(&cset->refcount, 1); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); + INIT_LIST_HEAD(&cset->mg_tasks); INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in @@ -2590,9 +2591,14 @@ static void css_advance_task_iter(struct css_task_iter *it) } link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; - } while (list_empty(&cset->tasks)); + } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); + it->cset_link = l; - it->task = cset->tasks.next; + + if (!list_empty(&cset->tasks)) + it->task = cset->tasks.next; + else + it->task = cset->mg_tasks.next; } /** @@ -2636,24 +2642,29 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; - struct cgrp_cset_link *link; + struct cgrp_cset_link *link = list_entry(it->cset_link, + struct cgrp_cset_link, cset_link); /* If the iterator cg is NULL, we have no tasks */ if (!it->cset_link) return NULL; res = list_entry(l, struct task_struct, cg_list); - /* Advance iterator to find next entry */ + + /* + * Advance iterator to find next entry. cset->tasks is consumed + * first and then ->mg_tasks. After ->mg_tasks, we move onto the + * next cset. + */ l = l->next; - link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); - if (l == &link->cset->tasks) { - /* - * We reached the end of this task list - move on to the - * next cgrp_cset_link. 
- */ + + if (l == &link->cset->tasks) + l = link->cset->mg_tasks.next; + + if (l == &link->cset->mg_tasks) css_advance_task_iter(it); - } else { + else it->task = l; - } + return res; } @@ -4502,16 +4513,23 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct css_set *cset = link->cset; struct task_struct *task; int count = 0; + seq_printf(seq, "css_set %p\n", cset); + list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) { - seq_puts(seq, " ...\n"); - break; - } else { - seq_printf(seq, " task %d\n", - task_pid_vnr(task)); - } + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); } + continue; + overflow: + seq_puts(seq, " ...\n"); } up_read(&css_set_rwsem); return 0; -- cgit v1.2.3 From b3dc094e93905ae9c1bc0815402ad8e5b203d068 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:01 -0500 Subject: cgroup: use css_set->mg_tasks to track target tasks during migration Currently, while migrating tasks from one cgroup to another, cgroup_attach_task() builds a flex array of all target tasks; unfortunately, this has a couple issues. * Flex array has size limit. On 64bit, struct task_and_cgroup is 24bytes making the flex element limit around 87k. It is a high number but not impossible to hit. This means that the current cgroup implementation can't migrate a process with more than 87k threads. * Process migration involves memory allocation whose size is dependent on the number of threads the process has. This means that cgroup core can't guarantee success or failure of multi-process migrations as memory allocation failure can happen in the middle. 
This is in part because cgroup can't grab threadgroup locks of multiple processes at the same time, so when there are multiple processes to migrate, it is impossible to tell how many tasks are to be migrated beforehand. Note that this already affects cgroup_transfer_tasks(). cgroup currently cannot guarantee atomic success or failure of the operation. It may fail in the middle and after such failure cgroup doesn't have enough information to roll back properly. It just aborts with some tasks migrated and others not. To resolve the situation, this patch updates the migration path to use task->cg_list to track target tasks. The previous patch already added css_set->mg_tasks and updated iterations in non-migration paths to include them during task migration. This patch updates migration path to actually make use of it. Instead of being put onto a flex_array, each target task is moved from its css_set->tasks list to css_set->mg_tasks and the migration path keeps track of all the source css_sets and the associated cgroups. Once all source css_sets are determined, the destination css_set for each is determined, linked to the matching source css_set and put on a separate list. To iterate the target tasks, migration path just needs to iterate through either the source or target css_sets, depending on whether migration has been committed or not, and the tasks on their ->mg_tasks lists. cgroup_taskset is updated to contain the list_heads for source and target css_sets and the iteration cursor. cgroup_taskset_*() are accordingly updated to walk through css_sets and their ->mg_tasks. This resolves the above listed issues with moderate additional complexity.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 223 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 115 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b80c611ff836..5def4a800425 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -52,7 +52,6 @@ #include #include #include /* TODO: replace with more sophisticated array */ -#include /* used in cgroup_attach_task */ #include #include @@ -645,6 +644,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->mg_node); INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in @@ -1639,20 +1639,26 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); -/* - * Control Group taskset - */ -struct task_and_cgroup { - struct task_struct *task; - struct cgroup *cgrp; - struct css_set *cset; -}; - +/* used to track tasks and other necessary states during migration */ struct cgroup_taskset { - struct task_and_cgroup single; - struct flex_array *tc_array; - int tc_array_len; - int idx; + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. 
+ */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; }; /** @@ -1663,12 +1669,10 @@ struct cgroup_taskset { */ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) { - if (tset->tc_array) { - tset->idx = 0; - return cgroup_taskset_next(tset); - } else { - return tset->single.task; - } + tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); + tset->cur_task = NULL; + + return cgroup_taskset_next(tset); } /** @@ -1680,13 +1684,27 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) */ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) { - struct task_and_cgroup *tc; + struct css_set *cset = tset->cur_cset; + struct task_struct *task = tset->cur_task; - if (!tset->tc_array || tset->idx >= tset->tc_array_len) - return NULL; + while (&cset->mg_node != tset->csets) { + if (!task) + task = list_first_entry(&cset->mg_tasks, + struct task_struct, cg_list); + else + task = list_next_entry(task, cg_list); - tc = flex_array_get(tset->tc_array, tset->idx++); - return tc->task; + if (&task->cg_list != &cset->mg_tasks) { + tset->cur_cset = cset; + tset->cur_task = task; + return task; + } + + cset = list_next_entry(cset, mg_node); + task = NULL; + } + + return NULL; } /** @@ -1714,11 +1732,13 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); + get_css_set(new_cset); + task_lock(tsk); rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); - list_move(&tsk->cg_list, &new_cset->tasks); + list_move(&tsk->cg_list, &new_cset->mg_tasks); /* * We just gained a reference on old_cset by taking it from the @@ -1741,80 +1761,58 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, bool threadgroup) { - int ret, i, group_size; - struct cgroupfs_root *root = cgrp->root; + struct cgroup_taskset tset = { + .src_csets 
= LIST_HEAD_INIT(tset.src_csets), + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), + .csets = &tset.src_csets, + }; struct cgroup_subsys_state *css, *failed_css = NULL; - /* threadgroup list cursor and array */ - struct task_struct *task; - struct task_and_cgroup *tc; - struct flex_array *group; - struct cgroup_taskset tset = { }; - - /* - * step 0: in order to do expensive, possibly blocking operations for - * every thread, we cannot iterate the thread group list, since it needs - * rcu or tasklist locked. instead, build an array of all threads in the - * group - group_rwsem prevents new threads from appearing, and if - * threads exit, this will just be an over-estimate. - */ - if (threadgroup) - group_size = get_nr_threads(leader); - else - group_size = 1; - /* flex_array supports very large thread-groups better than kmalloc. */ - group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); - if (!group) - return -ENOMEM; - /* pre-allocate to guarantee space while iterating in rcu read-side. */ - ret = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); - if (ret) - goto out_free_group_list; + struct css_set *cset, *tmp_cset; + struct task_struct *task, *tmp_task; + int i, ret; - i = 0; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ - down_read(&css_set_rwsem); + down_write(&css_set_rwsem); rcu_read_lock(); task = leader; do { - struct task_and_cgroup ent; + struct cgroup *src_cgrp; /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) goto next; - /* as per above, nr_threads may decrease, but not increase. 
*/ - BUG_ON(i >= group_size); - ent.task = task; - ent.cgrp = task_cgroup_from_root(task, root); + cset = task_css_set(task); + src_cgrp = task_cgroup_from_root(task, cgrp->root); + /* nothing to do if this task is already in the cgroup */ - if (ent.cgrp == cgrp) + if (src_cgrp == cgrp) goto next; - /* - * saying GFP_ATOMIC has no effect here because we did prealloc - * earlier, but it's good form to communicate our expectations. - */ - ret = flex_array_put(group, i, &ent, GFP_ATOMIC); - BUG_ON(ret != 0); - i++; + + if (!cset->mg_src_cgrp) { + WARN_ON(!list_empty(&cset->mg_tasks)); + WARN_ON(!list_empty(&cset->mg_node)); + + cset->mg_src_cgrp = src_cgrp; + list_add(&cset->mg_node, &tset.src_csets); + get_css_set(cset); + } + + list_move(&task->cg_list, &cset->mg_tasks); next: if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); - up_read(&css_set_rwsem); - /* remember the number of threads in the array for later. */ - group_size = i; - tset.tc_array = group; - tset.tc_array_len = group_size; + up_write(&css_set_rwsem); /* methods shouldn't be called if no task is actually migrating */ - ret = 0; - if (!group_size) - goto out_free_group_list; + if (list_empty(&tset.src_csets)) + return 0; /* * step 1: check that we can legitimately attach to the cgroup. @@ -1833,16 +1831,21 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, * step 2: make sure css_sets exist for all threads to be migrated. * we use find_css_set, which allocates a new one if necessary. 
*/ - for (i = 0; i < group_size; i++) { - struct css_set *old_cset; + list_for_each_entry(cset, &tset.src_csets, mg_node) { + struct css_set *dst_cset; - tc = flex_array_get(group, i); - old_cset = task_css_set(tc->task); - tc->cset = find_css_set(old_cset, cgrp); - if (!tc->cset) { + dst_cset = find_css_set(cset, cgrp); + if (!dst_cset) { ret = -ENOMEM; - goto out_put_css_set_refs; + goto out_release_tset; } + + if (list_empty(&dst_cset->mg_node)) + list_add(&dst_cset->mg_node, &tset.dst_csets); + else + put_css_set(dst_cset, false); + + cset->mg_dst_cset = dst_cset; } /* @@ -1851,12 +1854,17 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, * failure cases after here, so this is the commit point. */ down_write(&css_set_rwsem); - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); + list_for_each_entry(cset, &tset.src_csets, mg_node) { + list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) + cgroup_task_migrate(cset->mg_src_cgrp, task, + cset->mg_dst_cset); } up_write(&css_set_rwsem); - /* nothing is sensitive to fork() after this point. */ + + /* migration is committed, all target tasks are now on dst_csets */ + tset.csets = &tset.dst_csets; + + /* nothing is sensitive to fork() after this point */ /* * step 4: do subsystem attach callbacks. @@ -1865,30 +1873,27 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, if (css->ss->attach) css->ss->attach(css, &tset); - /* - * step 5: success! 
and cleanup - */ ret = 0; -out_put_css_set_refs: - if (ret) { - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - if (!tc->cset) - break; - put_css_set(tc->cset, false); - } - } + goto out_release_tset; + out_cancel_attach: - if (ret) { - for_each_css(css, i, cgrp) { - if (css == failed_css) - break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, &tset); - } + for_each_css(css, i, cgrp) { + if (css == failed_css) + break; + if (css->ss->cancel_attach) + css->ss->cancel_attach(css, &tset); } -out_free_group_list: - flex_array_free(group); +out_release_tset: + down_write(&css_set_rwsem); + list_splice_init(&tset.dst_csets, &tset.src_csets); + list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { + list_splice_init(&cset->mg_tasks, &cset->tasks); + cset->mg_dst_cset = NULL; + cset->mg_src_cgrp = NULL; + list_del_init(&cset->mg_node); + put_css_set_locked(cset, false); + } + up_write(&css_set_rwsem); return ret; } @@ -3895,6 +3900,8 @@ int __init cgroup_init_early(void) atomic_set(&init_css_set.refcount, 1); INIT_LIST_HEAD(&init_css_set.cgrp_links); INIT_LIST_HEAD(&init_css_set.tasks); + INIT_LIST_HEAD(&init_css_set.mg_tasks); + INIT_LIST_HEAD(&init_css_set.mg_node); INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; init_cgroup_root(&cgroup_dummy_root); -- cgit v1.2.3 From ceb6a081f6f52d17ec9e46e271cc26a1eb8a7573 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:02 -0500 Subject: cgroup: separate out cset_group_from_root() from task_cgroup_from_root() This will be used by the planned migration path update. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5def4a800425..23e3a8c74bd4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -758,25 +758,15 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) cgroup_free_root(root); } -/* - * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_rwsem held. - */ -static struct cgroup *task_cgroup_from_root(struct task_struct *task, +/* look up cgroup associated with given css_set on the specified hierarchy */ +static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroupfs_root *root) { - struct css_set *cset; struct cgroup *res = NULL; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_rwsem); - /* - * No need to lock the task - since we hold cgroup_mutex the - * task can't change groups, so the only thing that can happen - * is that it exits and its css is set back to init_css_set. - */ - cset = task_css_set(task); if (cset == &init_css_set) { res = &root->top_cgroup; } else { @@ -796,6 +786,21 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, return res; } +/* + * Return the cgroup for "task" from the given hierarchy. Must be + * called with cgroup_mutex and css_set_rwsem held. + */ +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroupfs_root *root) +{ + /* + * No need to lock the task - since we hold cgroup_mutex the + * task can't change groups, so the only thing that can happen + * is that it exits and its css is set back to init_css_set. + */ + return cset_cgroup_from_root(task_css_set(task), root); +} + /* * There is one global cgroup mutex. We also require taking * task_lock() when dereferencing a task's cgroup subsys pointers. 
-- cgit v1.2.3 From 1958d2d53dadbb1c9aaf0b37741f13a60098b243 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:03 -0500 Subject: cgroup: split process / task migration into four steps Currently, process / task migration is a single operation which may fail depending on memory pressure or the involved controllers' ->can_attach() callbacks. One problem with this approach is migration of multiple targets. It's impossible to tell whether a given target will be successfully migrated beforehand and cgroup core can't keep track of enough states to roll back after intermediate failure. This is already an issue with cgroup_transfer_tasks(). Also, we're going to need multiple target migration for unified hierarchy. This patch splits migration into four stages - cgroup_migrate_add_src(), cgroup_migrate_prepare_dst(), cgroup_migrate() and cgroup_migrate_finish(), where cgroup_migrate_prepare_dst() performs all the operations which may fail due to allocation failure without actually migrating the target. The four separate stages mean that, disregarding ->can_attach() failures, the success or failure of multi target migration can be determined before performing any actual migration. If preparations of all targets succeed, the whole thing will succeed. If not, the whole operation can fail without any side-effect. Since the previous patch to use css_set->mg_tasks to keep track of migration targets, the only thing which may need memory allocation during migration is the target css_sets. cgroup_migrate_prepare() pins all source and target css_sets and links them up. Note that this can be performed without holding threadgroup_lock even if the target is a process. As long as cgroup_mutex is held, no new css_set can be put into play.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 240 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 181 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 23e3a8c74bd4..a93f6f1ebc69 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -644,6 +644,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->mg_preload_node); INIT_LIST_HEAD(&cset->mg_node); INIT_HLIST_NODE(&cset->hlist); @@ -1755,16 +1756,137 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, } /** - * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup - * @cgrp: the cgroup to attach to - * @leader: the task or the leader of the threadgroup to be attached - * @threadgroup: attach the whole threadgroup? + * cgroup_migrate_finish - cleanup after attach + * @preloaded_csets: list of preloaded css_sets * - * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of @tsk or each thread in the threadgroup individually in turn. + * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See + * those functions for details. 
*/ -static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, - bool threadgroup) +static void cgroup_migrate_finish(struct list_head *preloaded_csets) +{ + struct css_set *cset, *tmp_cset; + + lockdep_assert_held(&cgroup_mutex); + + down_write(&css_set_rwsem); + list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { + cset->mg_src_cgrp = NULL; + cset->mg_dst_cset = NULL; + list_del_init(&cset->mg_preload_node); + put_css_set_locked(cset, false); + } + up_write(&css_set_rwsem); +} + +/** + * cgroup_migrate_add_src - add a migration source css_set + * @src_cset: the source css_set to add + * @dst_cgrp: the destination cgroup + * @preloaded_csets: list of preloaded css_sets + * + * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin + * @src_cset and add it to @preloaded_csets, which should later be cleaned + * up by cgroup_migrate_finish(). + * + * This function may be called without holding threadgroup_lock even if the + * target is a process. Threads may be created and destroyed but as long + * as cgroup_mutex is not dropped, no new css_set can be put into play and + * the preloaded css_sets are guaranteed to cover all migrations. 
+ */ +static void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) +{ + struct cgroup *src_cgrp; + + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + + /* nothing to do if this cset already belongs to the cgroup */ + if (src_cgrp == dst_cgrp) + return; + + if (!list_empty(&src_cset->mg_preload_node)) + return; + + WARN_ON(src_cset->mg_src_cgrp); + WARN_ON(!list_empty(&src_cset->mg_tasks)); + WARN_ON(!list_empty(&src_cset->mg_node)); + + src_cset->mg_src_cgrp = src_cgrp; + get_css_set(src_cset); + list_add(&src_cset->mg_preload_node, preloaded_csets); +} + +/** + * cgroup_migrate_prepare_dst - prepare destination css_sets for migration + * @dst_cgrp: the destination cgroup + * @preloaded_csets: list of preloaded source css_sets + * + * Tasks are about to be moved to @dst_cgrp and all the source css_sets + * have been preloaded to @preloaded_csets. This function looks up and + * pins all destination css_sets, links each to its source, and put them on + * @preloaded_csets. + * + * This function must be called after cgroup_migrate_add_src() has been + * called on each migration source css_set. After migration is performed + * using cgroup_migrate(), cgroup_migrate_finish() must be called on + * @preloaded_csets. 
+ */ +static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) +{ + LIST_HEAD(csets); + struct css_set *src_cset; + + lockdep_assert_held(&cgroup_mutex); + + /* look up the dst cset for each src cset and link it to src */ + list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { + struct css_set *dst_cset; + + dst_cset = find_css_set(src_cset, dst_cgrp); + if (!dst_cset) + goto err; + + WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); + src_cset->mg_dst_cset = dst_cset; + + if (list_empty(&dst_cset->mg_preload_node)) + list_add(&dst_cset->mg_preload_node, &csets); + else + put_css_set(dst_cset, false); + } + + list_splice(&csets, preloaded_csets); + return 0; +err: + cgroup_migrate_finish(&csets); + return -ENOMEM; +} + +/** + * cgroup_migrate - migrate a process or task to a cgroup + * @cgrp: the destination cgroup + * @leader: the leader of the process or the task to migrate + * @threadgroup: whether @leader points to the whole process or a single task + * + * Migrate a process or task denoted by @leader to @cgrp. If migrating a + * process, the caller must be holding threadgroup_lock of @leader. The + * caller is also responsible for invoking cgroup_migrate_add_src() and + * cgroup_migrate_prepare_dst() on the targets before invoking this + * function and following up with cgroup_migrate_finish(). + * + * As long as a controller's ->can_attach() doesn't fail, this function is + * guaranteed to succeed. This means that, excluding ->can_attach() + * failure, when migrating multiple targets, the success or failure can be + * decided for all targets by invoking cgroup_migrate_prepare_dst() before + * actually starting migrating. 
+ */ +static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, + bool threadgroup) { struct cgroup_taskset tset = { .src_csets = LIST_HEAD_INIT(tset.src_csets), @@ -1785,29 +1907,17 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, rcu_read_lock(); task = leader; do { - struct cgroup *src_cgrp; - /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) goto next; cset = task_css_set(task); - src_cgrp = task_cgroup_from_root(task, cgrp->root); - - /* nothing to do if this task is already in the cgroup */ - if (src_cgrp == cgrp) + if (!cset->mg_src_cgrp) goto next; - if (!cset->mg_src_cgrp) { - WARN_ON(!list_empty(&cset->mg_tasks)); - WARN_ON(!list_empty(&cset->mg_node)); - - cset->mg_src_cgrp = src_cgrp; - list_add(&cset->mg_node, &tset.src_csets); - get_css_set(cset); - } - list_move(&task->cg_list, &cset->mg_tasks); + list_move(&cset->mg_node, &tset.src_csets); + list_move(&cset->mg_dst_cset->mg_node, &tset.dst_csets); next: if (!threadgroup) break; @@ -1819,9 +1929,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, if (list_empty(&tset.src_csets)) return 0; - /* - * step 1: check that we can legitimately attach to the cgroup. - */ + /* check that we can legitimately attach to the cgroup */ for_each_css(css, i, cgrp) { if (css->ss->can_attach) { ret = css->ss->can_attach(css, &tset); @@ -1833,30 +1941,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, } /* - * step 2: make sure css_sets exist for all threads to be migrated. - * we use find_css_set, which allocates a new one if necessary. 
- */ - list_for_each_entry(cset, &tset.src_csets, mg_node) { - struct css_set *dst_cset; - - dst_cset = find_css_set(cset, cgrp); - if (!dst_cset) { - ret = -ENOMEM; - goto out_release_tset; - } - - if (list_empty(&dst_cset->mg_node)) - list_add(&dst_cset->mg_node, &tset.dst_csets); - else - put_css_set(dst_cset, false); - - cset->mg_dst_cset = dst_cset; - } - - /* - * step 3: now that we're guaranteed success wrt the css_sets, - * proceed to move all tasks to the new cgroup. There are no - * failure cases after here, so this is the commit point. + * Now that we're guaranteed success, proceed to move all tasks to + * the new cgroup. There are no failure cases after here, so this + * is the commit point. */ down_write(&css_set_rwsem); list_for_each_entry(cset, &tset.src_csets, mg_node) { @@ -1866,14 +1953,13 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, } up_write(&css_set_rwsem); - /* migration is committed, all target tasks are now on dst_csets */ - tset.csets = &tset.dst_csets; - - /* nothing is sensitive to fork() after this point */ - /* - * step 4: do subsystem attach callbacks. + * Migration is committed, all target tasks are now on dst_csets. + * Nothing is sensitive to fork() after this point. Notify + * controllers that migration is complete. 
*/ + tset.csets = &tset.dst_csets; + for_each_css(css, i, cgrp) if (css->ss->attach) css->ss->attach(css, &tset); @@ -1893,15 +1979,50 @@ out_release_tset: list_splice_init(&tset.dst_csets, &tset.src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { list_splice_init(&cset->mg_tasks, &cset->tasks); - cset->mg_dst_cset = NULL; - cset->mg_src_cgrp = NULL; list_del_init(&cset->mg_node); - put_css_set_locked(cset, false); } up_write(&css_set_rwsem); return ret; } +/** + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup + * @dst_cgrp: the cgroup to attach to + * @leader: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup? + * + * Call holding cgroup_mutex and the group_rwsem of the leader. Will take + * task_lock of @tsk or each thread in the threadgroup individually in turn. + */ +static int cgroup_attach_task(struct cgroup *dst_cgrp, + struct task_struct *leader, bool threadgroup) +{ + LIST_HEAD(preloaded_csets); + struct task_struct *task; + int ret; + + /* look up all src csets */ + down_read(&css_set_rwsem); + rcu_read_lock(); + task = leader; + do { + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, + &preloaded_csets); + if (!threadgroup) + break; + } while_each_thread(leader, task); + rcu_read_unlock(); + up_read(&css_set_rwsem); + + /* prepare dst csets and commit */ + ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); + if (!ret) + ret = cgroup_migrate(dst_cgrp, leader, threadgroup); + + cgroup_migrate_finish(&preloaded_csets); + return ret; +} + /* * Find the task_struct of the task to attach by vpid and pass it along to the * function to attach either it or all tasks in its threadgroup. 
Will lock @@ -3906,6 +4027,7 @@ int __init cgroup_init_early(void) INIT_LIST_HEAD(&init_css_set.cgrp_links); INIT_LIST_HEAD(&init_css_set.tasks); INIT_LIST_HEAD(&init_css_set.mg_tasks); + INIT_LIST_HEAD(&init_css_set.mg_preload_node); INIT_LIST_HEAD(&init_css_set.mg_node); INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; -- cgit v1.2.3 From eaf797abc53b0ab3f0a02d4ef873a565fcce6daa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:03 -0500 Subject: cgroup: update how a newly forked task gets associated with css_set When a new process is forked, cgroup_fork() associates it with the css_set of its parent but doesn't link it into it. After the new process is linked to tasklist, cgroup_post_fork() does the linking. This is problematic for cgroup_transfer_tasks() as there's no way to tell whether there are tasks which are pointing to a css_set but not linked yet. It is impossible to implement an operation which transfer all tasks of a cgroup to another and the current cgroup_transfer_tasks() can easily be tricked into leaving a newly forked process behind if it gets called between cgroup_fork() and cgroup_post_fork(). Let's make association with a css_set and linking atomic by moving it to cgroup_post_fork(). cgroup_fork() sets child->cgroups to init_css_set as a placeholder and cgroup_post_fork() is updated to perform both the association with the parent's cgroup and linking there. This means that a newly created task will point to init_css_set without holding a ref to it much like what it does on the exit path. Empty cg_list is used to indicate that the task isn't holding a ref to the associated css_set. This fixes an actual bug with cgroup_transfer_tasks(); however, I'm not marking it for -stable. The whole thing is broken in multiple other ways which require invasive updates to fix and I don't think it's worthwhile to bother with backporting this particular one. Fortunately, the only user is cpuset and these bugs don't crash the machine. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 86 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a93f6f1ebc69..fa0567f4eedd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1342,8 +1342,12 @@ static void cgroup_enable_task_cg_lists(void) * racing against cgroup_exit(). */ spin_lock_irq(&p->sighand->siglock); - if (!(p->flags & PF_EXITING)) - list_add(&p->cg_list, &task_css_set(p)->tasks); + if (!(p->flags & PF_EXITING)) { + struct css_set *cset = task_css_set(p); + + list_add(&p->cg_list, &cset->tasks); + get_css_set(cset); + } spin_unlock_irq(&p->sighand->siglock); task_unlock(p); @@ -1911,6 +1915,10 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, if (task->flags & PF_EXITING) goto next; + /* leave @task alone if post_fork() hasn't linked it yet */ + if (list_empty(&task->cg_list)) + goto next; + cset = task_css_set(task); if (!cset->mg_src_cgrp) goto next; @@ -2815,6 +2823,12 @@ void css_task_iter_end(struct css_task_iter *it) * cgroup_trasnsfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved * @from: cgroup in which the tasks currently reside + * + * Locking rules between cgroup_post_fork() and the migration path + * guarantee that, if a task is forking while being migrated, the new child + * is guaranteed to be either visible in the source cgroup after the + * parent's migration is complete or put into the target cgroup. No task + * can slip out of migration through forking. */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { @@ -4243,27 +4257,16 @@ static const struct file_operations proc_cgroupstats_operations = { }; /** - * cgroup_fork - attach newly forked task to its parents cgroup. 
+ * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. * - * Description: A task inherits its parent's cgroup at fork(). - * - * A pointer to the shared css_set was automatically copied in - * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU or cgroup_mutex, so - * might no longer be a valid cgroup pointer. cgroup_attach_task() might - * have already changed current->cgroups, allowing the previously - * referenced cgroup group to be removed and freed. - * - * At the point that cgroup_fork() is called, 'current' is the parent - * task, and the passed argument 'child' points to the child task. + * A task is associated with the init_css_set until cgroup_post_fork() + * attaches it to the parent's css_set. Empty cg_list indicates that + * @child isn't holding reference to its css_set. */ void cgroup_fork(struct task_struct *child) { - task_lock(current); - get_css_set(task_css_set(current)); - child->cgroups = current->cgroups; - task_unlock(current); + RCU_INIT_POINTER(child->cgroups, &init_css_set); INIT_LIST_HEAD(&child->cg_list); } @@ -4283,21 +4286,38 @@ void cgroup_post_fork(struct task_struct *child) int i; /* - * use_task_css_set_links is set to 1 before we walk the tasklist - * under the tasklist_lock and we read it here after we added the child - * to the tasklist under the tasklist_lock as well. If the child wasn't - * yet in the tasklist when we walked through it from - * cgroup_enable_task_cg_lists(), then use_task_css_set_links value - * should be visible now due to the paired locking and barriers implied - * by LOCK/UNLOCK: it is written before the tasklist_lock unlock - * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock - * lock on fork. + * This may race against cgroup_enable_task_cg_links(). 
As that + * function sets use_task_css_set_links before grabbing + * tasklist_lock and we just went through tasklist_lock to add + * @child, it's guaranteed that either we see the set + * use_task_css_set_links or cgroup_enable_task_cg_lists() sees + * @child during its iteration. + * + * If we won the race, @child is associated with %current's + * css_set. Grabbing css_set_rwsem guarantees both that the + * association is stable, and, on completion of the parent's + * migration, @child is visible in the source of migration or + * already in the destination cgroup. This guarantee is necessary + * when implementing operations which need to migrate all tasks of + * a cgroup to another. + * + * Note that if we lose to cgroup_enable_task_cg_links(), @child + * will remain in init_css_set. This is safe because all tasks are + * in the init_css_set before cg_links is enabled and there's no + * operation which transfers all tasks out of init_css_set. */ if (use_task_css_set_links) { + struct css_set *cset; + down_write(&css_set_rwsem); + cset = task_css_set_check(current, + lockdep_is_held(&css_set_rwsem)); task_lock(child); - if (list_empty(&child->cg_list)) - list_add(&child->cg_list, &task_css_set(child)->tasks); + if (list_empty(&child->cg_list)) { + rcu_assign_pointer(child->cgroups, cset); + list_add(&child->cg_list, &cset->tasks); + get_css_set(cset); + } task_unlock(child); up_write(&css_set_rwsem); } @@ -4353,6 +4373,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { struct cgroup_subsys *ss; struct css_set *cset; + bool put_cset = false; int i; /* @@ -4361,8 +4382,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) */ if (!list_empty(&tsk->cg_list)) { down_write(&css_set_rwsem); - if (!list_empty(&tsk->cg_list)) + if (!list_empty(&tsk->cg_list)) { list_del_init(&tsk->cg_list); + put_cset = true; + } up_write(&css_set_rwsem); } @@ -4384,7 +4407,8 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } task_unlock(tsk); - 
put_css_set(cset, true); + if (put_cset) + put_css_set(cset, true); } static void check_for_release(struct cgroup *cgrp) -- cgit v1.2.3 From 0e1d768f1b1873272ec4e8dc1482bb5281855017 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:03 -0500 Subject: cgroup: drop task_lock() protection around task->cgroups For optimization, task_lock() is additionally used to protect task->cgroups. The optimization is pretty dubious as either css_set_rwsem is grabbed anyway or PF_EXITING already protects task->cgroups. It adds only overhead and confusion at this point. Let's drop task_[un]lock() and update comments accordingly. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 97 ++++++++++++++------------------------------------------- 1 file changed, 24 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fa0567f4eedd..f783af900208 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -80,12 +80,21 @@ static DEFINE_MUTEX(cgroup_tree_mutex); /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. + * + * css_set_rwsem protects task->cgroups pointer, the list of css_set + * objects, and the chain of tasks off each css_set. + * + * These locks are exported if CONFIG_PROVE_RCU so that accessors in + * cgroup.h can use them for lockdep annotations. */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ +DECLARE_RWSEM(css_set_rwsem); +EXPORT_SYMBOL_GPL(cgroup_mutex); +EXPORT_SYMBOL_GPL(css_set_rwsem); #else static DEFINE_MUTEX(cgroup_mutex); +static DECLARE_RWSEM(css_set_rwsem); #endif /* @@ -338,12 +347,6 @@ struct cgrp_cset_link { static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; - -/* - * css_set_rwsem protects the list of css_set objects, and the chain of - * tasks off each css_set. 
- */ -static DECLARE_RWSEM(css_set_rwsem); static int css_set_count; /* @@ -803,10 +806,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, } /* - * There is one global cgroup mutex. We also require taking - * task_lock() when dereferencing a task's cgroup subsys pointers. - * See "The task_lock() exception", at the end of this comment. - * * A task must hold cgroup_mutex to modify cgroups. * * Any task can increment and decrement the count field without lock. @@ -836,18 +835,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * always has either children cgroups and/or using tasks. So we don't * need a special hack to ensure that top_cgroup cannot be deleted. * - * The task_lock() exception - * - * The need for this exception arises from the action of - * cgroup_attach_task(), which overwrites one task's cgroup pointer with - * another. It does so using cgroup_mutex, however there are - * several performance critical places that need to reference - * task->cgroup without the expense of grabbing a system global - * mutex. Therefore except as noted below, when dereferencing or, as - * in cgroup_attach_task(), modifying a task's cgroup pointer we use - * task_lock(), which acts on a spinlock (task->alloc_lock) already in - * the task_struct routinely used for such matters. - * * P.S. One more locking exception. 
RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() */ @@ -1329,8 +1316,6 @@ static void cgroup_enable_task_cg_lists(void) */ read_lock(&tasklist_lock); do_each_thread(g, p) { - task_lock(p); - WARN_ON_ONCE(!list_empty(&p->cg_list) || task_css_set(p) != &init_css_set); @@ -1349,8 +1334,6 @@ static void cgroup_enable_task_cg_lists(void) get_css_set(cset); } spin_unlock_irq(&p->sighand->siglock); - - task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); out_unlock: @@ -1743,11 +1726,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, old_cset = task_css_set(tsk); get_css_set(new_cset); - - task_lock(tsk); rcu_assign_pointer(tsk->cgroups, new_cset); - task_unlock(tsk); - list_move(&tsk->cg_list, &new_cset->mg_tasks); /* @@ -1999,8 +1978,7 @@ out_release_tset: * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * - * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of @tsk or each thread in the threadgroup individually in turn. + * Call holding cgroup_mutex and threadgroup_lock of @leader. */ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) @@ -2034,7 +2012,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, /* * Find the task_struct of the task to attach by vpid and pass it along to the * function to attach either it or all tasks in its threadgroup. Will lock - * cgroup_mutex and threadgroup; may take task_lock of task. + * cgroup_mutex and threadgroup. */ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { @@ -4155,12 +4133,6 @@ core_initcall(cgroup_wq_init); * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc//cgroup. 
- * - No need to task_lock(tsk) on this tsk->cgroup reference, as it - * doesn't really matter if tsk->cgroup changes after we read it, - * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it - * anyway. No need to check that tsk->cgroup != NULL, thanks to - * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks - * cgroup to top_cgroup. */ /* TODO: Use a proper seq_file iterator */ @@ -4310,15 +4282,12 @@ void cgroup_post_fork(struct task_struct *child) struct css_set *cset; down_write(&css_set_rwsem); - cset = task_css_set_check(current, - lockdep_is_held(&css_set_rwsem)); - task_lock(child); + cset = task_css_set(current); if (list_empty(&child->cg_list)) { rcu_assign_pointer(child->cgroups, cset); list_add(&child->cg_list, &cset->tasks); get_css_set(cset); } - task_unlock(child); up_write(&css_set_rwsem); } @@ -4347,27 +4316,13 @@ void cgroup_post_fork(struct task_struct *child) * use notify_on_release cgroups where very high task exit scaling * is required on large systems. * - * the_top_cgroup_hack: - * - * Set the exiting tasks cgroup to the root cgroup (top_cgroup). - * - * We call cgroup_exit() while the task is still competent to - * handle notify_on_release(), then leave the task attached to the - * root cgroup in each hierarchy for the remainder of its exit. - * - * To do this properly, we would increment the reference count on - * top_cgroup, and near the very end of the kernel/exit.c do_exit() - * code we would add a second cgroup function call, to drop that - * reference. This would just create an unnecessary hot spot on - * the top_cgroup reference count, to no avail. - * - * Normally, holding a reference to a cgroup without bumping its - * count is unsafe. The cgroup could go away, or someone could - * attach us to a different cgroup, decrementing the count on - * the first cgroup that we never incremented. 
But in this case, - * top_cgroup isn't going away, and either task has PF_EXITING set, - * which wards off any cgroup_attach_task() attempts, or task is a failed - * fork, never visible to cgroup_attach_task. + * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We + * call cgroup_exit() while the task is still competent to handle + * notify_on_release(), then leave the task attached to the root cgroup in + * each hierarchy for the remainder of its exit. No need to bother with + * init_css_set refcnting. init_css_set never goes away and we can't race + * with migration path - either PF_EXITING is visible to migration path or + * @tsk never got on the tasklist. */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { @@ -4377,20 +4332,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) int i; /* - * Unlink from the css_set task list if necessary. Optimistically - * check cg_list before taking css_set_rwsem. + * Unlink @tsk from its css_set. As migration path can't race + * with us, we can check cg_list without grabbing css_set_rwsem. */ if (!list_empty(&tsk->cg_list)) { down_write(&css_set_rwsem); - if (!list_empty(&tsk->cg_list)) { - list_del_init(&tsk->cg_list); - put_cset = true; - } + list_del_init(&tsk->cg_list); up_write(&css_set_rwsem); + put_cset = true; } /* Reassign the task to the init_css_set. */ - task_lock(tsk); cset = task_css_set(tsk); RCU_INIT_POINTER(tsk->cgroups, &init_css_set); @@ -4405,7 +4357,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } } } - task_unlock(tsk); if (put_cset) put_css_set(cset, true); -- cgit v1.2.3 From 952aaa125428fae883670a2c2e40ea8044ca1eaa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:03 -0500 Subject: cgroup: update cgroup_transfer_tasks() to either succeed or fail cgroup_transfer_tasks() can currently fail in the middle due to memory allocation failure. 
When that happens, the function just aborts and returns error code and there's no way to tell how many actually got migrated at the point of failure or to revert the partial migration. Update it to use cgroup_migrate{_add_src|prepare_dst|migrate|finish}() so that the function either succeeds or fails as a whole as long as ->can_attach() doesn't fail. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f783af900208..306ad0ed19ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2810,10 +2810,28 @@ void css_task_iter_end(struct css_task_iter *it) */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { + LIST_HEAD(preloaded_csets); + struct cgrp_cset_link *link; struct css_task_iter it; struct task_struct *task; - int ret = 0; + int ret; + + mutex_lock(&cgroup_mutex); + + /* all tasks in @from are being moved, all csets are source */ + down_read(&css_set_rwsem); + list_for_each_entry(link, &from->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, to, &preloaded_csets); + up_read(&css_set_rwsem); + ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); + if (ret) + goto out_err; + + /* + * Migrate tasks one-by-one until @from is empty. This fails iff + * ->can_attach() fails. 
+ */ do { css_task_iter_start(&from->dummy_css, &it); task = css_task_iter_next(&it); @@ -2822,13 +2840,13 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - mutex_lock(&cgroup_mutex); - ret = cgroup_attach_task(to, task, false); - mutex_unlock(&cgroup_mutex); + ret = cgroup_migrate(to, task, false); put_task_struct(task); } } while (task && !ret); - +out_err: + cgroup_migrate_finish(&preloaded_csets); + mutex_unlock(&cgroup_mutex); return ret; } -- cgit v1.2.3 From a60bed296ac67b9e2765646dec8e36e3b4d7c395 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 16:07:59 -0500 Subject: cgroup_freezer: document freezer_fork() subtleties cgroup_subsys->fork() callback is special in that it's called outside the usual cgroup locking and may race with on-going migration. freezer_fork() currently doesn't consider such race condition; however, it is still correct thanks to the fact that freeze_task() may be called spuriously. This is quite subtle. Let's explain what's going on and add test to detect racing and losing to task migration and skip freeze_task() in such cases for documentation. This doesn't make any behavior difference meaningful to userland. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: "Rafael J. Wysocki" --- kernel/cgroup_freezer.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 7201a637c405..2ea98b216bff 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -214,6 +214,16 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, } } +/** + * freezer_fork - cgroup post fork callback + * @task: a task which has just been forked + * + * @task has just been created and should conform to the current state of + * the cgroup_freezer it belongs to. This function may race against + * freezer_attach(). 
Losing to freezer_attach() means that we don't have + * to do anything as freezer_attach() will put @task into the appropriate + * state. + */ static void freezer_fork(struct task_struct *task) { struct freezer *freezer; @@ -222,14 +232,26 @@ static void freezer_fork(struct task_struct *task) freezer = task_freezer(task); /* - * The root cgroup is non-freezable, so we can skip the - * following check. + * The root cgroup is non-freezable, so we can skip locking the + * freezer. This is safe regardless of race with task migration. + * If we didn't race or won, skipping is obviously the right thing + * to do. If we lost and root is the new cgroup, noop is still the + * right thing to do. */ if (!parent_freezer(freezer)) goto out; + /* + * Grab @freezer->lock and freeze @task after verifying @task still + * belongs to @freezer and it's freezing. The former is for the + * case where we have raced against task migration and lost and + * @task is already in a different cgroup which may not be frozen. + * This isn't strictly necessary as freeze_task() is allowed to be + * called spuriously but let's do it anyway for, if nothing else, + * documentation. + */ spin_lock_irq(&freezer->lock); - if (freezer->state & CGROUP_FREEZING) + if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING)) freeze_task(task); spin_unlock_irq(&freezer->lock); out: -- cgit v1.2.3 From fff421580f512fc044cc7421fdff31a7a6997350 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Jan 2014 20:20:43 -0800 Subject: timers: Track total number of timers in list Currently, the tvec_base structure's ->active_timers field tracks only the non-deferrable timers, which means that even if ->active_timers is zero, there might well be deferrable timers in the list. This commit therefore adds an ->all_timers field to track all the timers, whether deferrable or not. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index accfd241b9e5..fdc43834f3af 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -81,6 +81,7 @@ struct tvec_base { unsigned long timer_jiffies; unsigned long next_timer; unsigned long active_timers; + unsigned long all_timers; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -392,6 +393,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) base->next_timer = timer->expires; base->active_timers++; } + base->all_timers++; } #ifdef CONFIG_TIMER_STATS @@ -671,6 +673,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) detach_timer(timer, true); if (!tbase_get_deferrable(timer->base)) base->active_timers--; + base->all_timers--; } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -685,6 +688,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, if (timer->expires == base->next_timer) base->next_timer = base->timer_jiffies; } + base->all_timers--; return 1; } @@ -1559,6 +1563,7 @@ static int init_timers_cpu(int cpu) base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; base->active_timers = 0; + base->all_timers = 0; return 0; } -- cgit v1.2.3 From d550e81dc0ddc04f1b417c179c214103a28e0ee8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2013 05:57:10 -0800 Subject: timers: Reduce __run_timers() latency for empty list The __run_timers() function currently steps through the list one jiffy at a time in order to update the timer wheel. However, if the timer wheel is empty, no adjustment is needed other than updating ->timer_jiffies. 
In this case, which is likely to be common for NO_HZ_FULL kernels, the kernel currently incurs a large latency for no good reason. This commit therefore short-circuits this case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index fdc43834f3af..c8bc7091d8f3 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -338,6 +338,20 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); +/* + * If the list is empty, catch up ->timer_jiffies to the current time. + * The caller must hold the tvec_base lock. Returns true if the list + * was empty and therefore ->timer_jiffies was updated. + */ +static bool catchup_timer_jiffies(struct tvec_base *base) +{ + if (!base->all_timers) { + base->timer_jiffies = jiffies; + return true; + } + return false; +} + static void __internal_add_timer(struct tvec_base *base, struct timer_list *timer) { @@ -1150,6 +1164,10 @@ static inline void __run_timers(struct tvec_base *base) struct timer_list *timer; spin_lock_irq(&base->lock); + if (catchup_timer_jiffies(base)) { + spin_unlock_irq(&base->lock); + return; + } while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list; struct list_head *head = &work_list; -- cgit v1.2.3 From 16d937f880312e3f47157d4d6d6ebf7e61523378 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2013 10:32:01 -0800 Subject: timers: Reduce future __run_timers() latency for newly emptied list The __run_timers() function currently steps through the list one jiffy at a time in order to update the timer wheel. However, if the timer wheel is empty, no adjustment is needed other than updating ->timer_jiffies. 
Therefore, if we just emptied the timer wheel, for example, by deleting the last timer, we should mark the timer wheel as being up to date. This marking will reduce (and perhaps eliminate) the jiffy-stepping that a future __run_timers() call will need to do in response to some future timer posting or migration. This commit therefore catches ->timer_jiffies for this case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index c8bc7091d8f3..dfac34f7186f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -688,6 +688,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) if (!tbase_get_deferrable(timer->base)) base->active_timers--; base->all_timers--; + (void)catchup_timer_jiffies(base); } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -703,6 +704,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, base->next_timer = base->timer_jiffies; } base->all_timers--; + (void)catchup_timer_jiffies(base); return 1; } -- cgit v1.2.3 From 18d8cb64c9c074cbe2bd677ab10fff8283abdb62 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2013 10:41:50 -0800 Subject: timers: Reduce future __run_timers() latency for first add to empty list The __run_timers() function currently steps through the list one jiffy at a time in order to update the timer wheel. However, if the timer wheel is empty, no adjustment is needed other than updating ->timer_jiffies. Therefore, just before we add a timer to an empty timer wheel, we should mark the timer wheel as being up to date. This marking will reduce (and perhaps eliminate) the jiffy-stepping that a future __run_timers() call will need to do in response to some future timer posting or migration. 
This commit therefore updates ->timer_jiffies for this case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index dfac34f7186f..0c638cf3d9d2 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -398,6 +398,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { + (void)catchup_timer_jiffies(base); __internal_add_timer(base, timer); /* * Update base->active_timers and base->next_timer -- cgit v1.2.3 From aea369b959bef10d235cd0714789cd8b0fe170b8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 15 Jan 2014 16:19:27 -0800 Subject: timers: Make internal_add_timer() update ->next_timer if ->active_timers == 0 The internal_add_timer() function updates base->next_timer only if timer->expires < base->next_timer. This is correct, but it also makes sense to do the same if we add the first non-deferrable timer. Signed-off-by: Oleg Nesterov Reviewed-by: Steven Rostedt Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Tested-by: Mike Galbraith --- kernel/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 0c638cf3d9d2..c0d8898fed98 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -404,9 +404,9 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) * Update base->active_timers and base->next_timer */ if (!tbase_get_deferrable(timer->base)) { - if (time_before(timer->expires, base->next_timer)) + if (!base->active_timers++ || + time_before(timer->expires, base->next_timer)) base->next_timer = timer->expires; - base->active_timers++; } base->all_timers++; } -- cgit v1.2.3 From d498d4b47fb3050f2f7840cc49251f87f04d1ca9 Mon Sep 17 00:00:00 2001 From: Vijaya Kumar K Date: Tue, 28 Jan 2014 16:50:20 +0530 Subject: KGDB: make kgdb_breakpoint() as noinline The function kgdb_breakpoint() sets up a breakpoint at compile time by calling arch_kgdb_breakpoint(). Though this call is surrounded by a wmb() barrier, the compiler can still re-order the breakpoint, because this scheduling barrier is not a code motion barrier in gcc. 
Making kgdb_breakpoint() noinline solves this problem of code reordering around the breakpoint instruction and also avoids the problem of it being called as an inline function from other places. More details about the discussion on this can be found here: http://comments.gmane.org/gmane.linux.ports.arm.kernel/269732 Signed-off-by: Vijaya Kumar K Acked-by: Will Deacon Acked-by: Jason Wessel Signed-off-by: Catalin Marinas --- kernel/debug/debug_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 334b3980ffc1..99982a70ddad 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1035,7 +1035,7 @@ int dbg_io_get_char(void) * otherwise as a quick means to stop program execution and "break" into * the debugger. */ -void kgdb_breakpoint(void) +noinline void kgdb_breakpoint(void) { atomic_inc(&kgdb_setting_breakpoint); wmb(); /* Sync point before breakpoint */ -- cgit v1.2.3 From 8857563b819b140aa8c9be920cfe44d5d3f808b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Feb 2014 13:02:22 -0800 Subject: notifier: Substitute rcu_access_pointer() for rcu_dereference_raw() (Trivial patch.) If the code is looking at the RCU-protected pointer itself, but not dereferencing it, the rcu_dereference() functions can be downgraded to rcu_access_pointer(). This commit makes this downgrade in __blocking_notifier_call_chain() which simply compares the RCU-protected pointer against NULL with no dereferencing. Signed-off-by: Paul E. 
McKenney Cc: Andrew Morton Cc: Linus Torvalds Reviewed-by: Josh Triplett --- kernel/notifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 2d5cc4ccff7f..db4c8b08a50c 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ - if (rcu_dereference_raw(nh->head)) { + if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); -- cgit v1.2.3 From 7a754743185a4b05818e10058fa2fbe4e6969085 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 11 Feb 2014 16:10:12 -0500 Subject: rcu: Fix sparse warning for rcu_expedited from kernel/ksysfs.c This commit fixes the following warning: kernel/ksysfs.c:143:5: warning: symbol 'rcu_expedited' was not declared. Should it be static? Signed-off-by: Paul Gortmaker [ paulmck: Moved the declaration to include/linux/rcupdate.h to avoid including the RCU-internal rcu.h file outside of RCU. ] Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/ksysfs.c | 2 ++ kernel/rcu/rcu.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index d945a949760f..e660964086e2 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -19,6 +19,8 @@ #include #include +#include /* rcu_expedited */ + #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 1bd787fddcb2..af2e60a8425d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -116,8 +116,6 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) } } -extern int rcu_expedited; - #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_suppress; -- cgit v1.2.3 From 5cb5c6e18f822b19bd41a2c0f9930c82b3ec0bc9 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 19 Feb 2014 14:33:27 -0500 Subject: rcu: Ensure kernel/rcu/rcu.h can be sourced/used stand-alone The kbuild test bot uncovered an implicit dependence on the trace header being present before rcu.h in ia64 allmodconfig that looks like this: In file included from kernel/ksysfs.c:22:0: kernel/rcu/rcu.h: In function '__rcu_reclaim': kernel/rcu/rcu.h:107:3: error: implicit declaration of function 'trace_rcu_invoke_kfree_callback' [-Werror=implicit-function-declaration] kernel/rcu/rcu.h:112:3: error: implicit declaration of function 'trace_rcu_invoke_callback' [-Werror=implicit-function-declaration] cc1: some warnings being treated as errors Looking at other rcu.h users, we can find that they all were sourcing the trace header in advance of rcu.h itself, as seen in the context of this diff. There were also some inconsistencies as to whether it was or wasn't sourced based on the parent tracing Kconfig. 
Rather than "fix" it at each use site, and have inconsistent use based on whether "#ifdef CONFIG_RCU_TRACE" was used or not, lets just source the trace header just once, in the actual consumer of it, which is rcu.h itself. We include it unconditionally, as build testing shows us that is a hard requirement for some files. Reported-by: kbuild test robot Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 1 + kernel/rcu/srcu.c | 2 -- kernel/rcu/tiny.c | 4 ---- kernel/rcu/tree.c | 2 -- kernel/rcu/update.c | 1 - 5 files changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index af2e60a8425d..bfda2726ca45 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -23,6 +23,7 @@ #ifndef __LINUX_RCU_H #define __LINUX_RCU_H +#include #ifdef CONFIG_RCU_TRACE #define RCU_TRACE(stmt) stmt #else /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 2359779e1daa..c639556f3fa0 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -36,8 +36,6 @@ #include #include -#include - #include "rcu.h" /* diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 53b95bbf4abb..d9efcc13008c 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -37,10 +37,6 @@ #include #include -#ifdef CONFIG_RCU_TRACE -#include -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - #include "rcu.h" /* Forward declarations for tiny_plugin.h. 
*/ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 73c3cd2b87ac..c7ed5db2dd79 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -58,8 +58,6 @@ #include #include "tree.h" -#include - #include "rcu.h" MODULE_ALIAS("rcutree"); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index fd0d5b5b8e7c..4c0a9b0af469 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -49,7 +49,6 @@ #include #define CREATE_TRACE_POINTS -#include #include "rcu.h" -- cgit v1.2.3 From 5edb93b89f6cc3089ee283656555e7a9ad36a8a0 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 4 Feb 2014 19:32:28 -0800 Subject: resource: Add resource_contains() We have two identical copies of resource_contains() already, and more places that could use it. This moves it to ioport.h where it can be shared. resource_contains(struct resource *r1, struct resource *r2) returns true iff r1 and r2 are the same type (most callers already checked this separately) and the r1 address range completely contains r2. In addition, the new resource_contains() checks that both r1 and r2 have addresses assigned to them. If a resource is IORESOURCE_UNSET, it doesn't have a valid address and can't contain or be contained by another resource. Some callers already check this or for res->start. No functional change. 
Signed-off-by: Bjorn Helgaas --- kernel/resource.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 3f285dce9347..a8344dda7049 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -432,11 +432,6 @@ static void resource_clip(struct resource *res, resource_size_t min, res->end = max; } -static bool resource_contains(struct resource *res1, struct resource *res2) -{ - return res1->start <= res2->start && res1->end >= res2->end; -} - /* * Find empty slot in the resource tree with the given range and * alignment constraints @@ -471,10 +466,11 @@ static int __find_resource(struct resource *root, struct resource *old, arch_remove_reservations(&tmp); /* Check for overflow after ALIGN() */ - avail = *new; avail.start = ALIGN(tmp.start, constraint->align); avail.end = tmp.end; + avail.flags = new->flags & ~IORESOURCE_UNSET; if (avail.start >= tmp.start) { + alloc.flags = avail.flags; alloc.start = constraint->alignf(constraint->alignf_data, &avail, size, constraint->align); alloc.end = alloc.start + size - 1; -- cgit v1.2.3 From c685689fd24d310343ac33942e9a54a974ae9c43 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 24 Feb 2014 11:29:50 +0800 Subject: genirq: Remove racy waitqueue_active check We hit one rare case below: T1 calling disable_irq(), but hanging at synchronize_irq() always; The corresponding irq thread is in sleeping state; And all CPUs are in idle state; After analysis, we found one possible scenario which causes T1 to wait there forever: CPU0 CPU1 synchronize_irq() wait_event() spin_lock() atomic_dec_and_test(&threads_active) insert the __wait into queue spin_unlock() if(waitqueue_active) atomic_read(&threads_active) wake_up() Here, after the __wait has been inserted into the queue on CPU0 and before CPU1 tests whether the queue is empty, there is no barrier, so the insertion may not be immediately visible to CPU1 even though CPU0 has updated the queue list. 
The same applies to the atomic_read() of threads_active on CPU0. So we would need an smp_mb() before the waitqueue_active() check, but removing the waitqueue_active() check solves it as well and it makes things simple and clear. Signed-off-by: Chuansheng Liu Cc: Xiaoming Wang Link: http://lkml.kernel.org/r/1393212590-32543-1-git-send-email-chuansheng.liu@intel.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c43b17..d3bf660cb57f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, static void wake_threads_waitq(struct irq_desc *desc) { - if (atomic_dec_and_test(&desc->threads_active) && - waitqueue_active(&desc->wait_for_threads)) + if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } -- cgit v1.2.3 From 791c9e0292671a3bfa95286bb5c08129d8605618 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Tue, 18 Feb 2014 17:56:51 -0600 Subject: sched: Fix double normalization of vruntime dequeue_entity() is called when p->on_rq and sets se->on_rq = 0 which appears to guarantee that the !se->on_rq condition is met. If the task has done set_current_state(TASK_INTERRUPTIBLE) without schedule() the second condition will be met and vruntime will be incorrectly adjusted twice. In certain cases this can result in the task's vruntime never increasing past the vruntime of other tasks on the CFS' run queue, starving them of CPU time. This patch changes switched_from_fair() to use !p->on_rq instead of !se->on_rq. I'm able to cause a task with a priority of 120 to starve all other tasks with the same priority on an ARM platform running 3.2.51-rt72 PREEMPT RT by writing one character at a time to a serial tty (16550 UART) in a tight loop. 
I'm also able to verify making this change corrects the problem on that platform and kernel version. Signed-off-by: George McCollister Signed-off-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1392767811-28916-1-git-send-email-george.mccollister@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 78157099b167..9b4c4f320130 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7001,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. -- cgit v1.2.3 From 3908ac13b550c93f97d8db136ff572be5495bc06 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 25 Feb 2014 19:52:23 +0400 Subject: sched/deadline: Cleanup RT leftovers from {inc/dec}_dl_migration In deadline class we do not have group scheduling. So, let's remove unnecessary X = X; equations. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Juri Lelli Link: http://lkml.kernel.org/r/1393343543.4089.5.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 15cbc17fbf84..aecf93030e0b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -135,7 +135,6 @@ static void update_dl_migration(struct dl_rq *dl_rq) static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; @@ -146,7 +145,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; -- cgit v1.2.3 From eec751ed41a0ae7e92a43c33a458d7bd1b941631 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 24 Feb 2014 11:47:12 +0100 Subject: sched/deadline: Switch CPU's presence test order Commit 82b9580 ("sched/deadline: Test for CPU's presence explicitly") changed how we check if a CPU returned by cpudeadline machinery is valid. But, we don't want to call cpu_present() if best_cpu is equal to -1. So, switch the order of tests inside WARN_ON(). 
Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: boris.ostrovsky@oracle.com Cc: konrad.wilk@oracle.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1393238832-9100-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b8838b56d1c..5b9bb42b2d47 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx) static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - WARN_ON(!cpu_present(idx) || idx == IDX_INVALID); + WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); if (dl_time_before(new_dl, cp->elements[idx].dl)) { cp->elements[idx].dl = new_dl; @@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } out: - WARN_ON(!cpu_present(best_cpu) && best_cpu != -1); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); return best_cpu; } -- cgit v1.2.3 From faa5993736d9b44b508cab4f1f3a77d66641c6f4 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 21 Feb 2014 11:37:15 +0100 Subject: sched/deadline: Prevent rt_time growth to infinity Kirill Tkhai noted: Since deadline tasks share rt bandwidth, we must care about bandwidth timer set. Otherwise rt_time may grow up to infinity in update_curr_dl(), if there are no other available RT tasks on top level bandwidth. RT task were in fact throttled right after they got enqueued, and never executed again (rt_time never again went below rt_runtime). Peter then proposed to accrue DL execution on rt_time only when rt timer is active, and proposed a patch (this patch is a slight modification of that) to implement that behavior. While this solves Kirill problem, it has a drawback. 
Indeed, Kirill noted again: It looks we may get into a situation, when all CPU time is shared between RT and DL tasks: rt_runtime = n rt_period = 2n | RT working, DL sleeping | DL working, RT sleeping | ----------------------------------------------------------- | (1) duration = n | (2) duration = n | (repeat) |--------------------------|------------------------------| | (rt_bw timer is running) | (rt_bw timer is not running) | No time for fair tasks at all. While this can happen during the first period, if rq is always backlogged, RT tasks won't have the opportunity to execute anymore: rt_time reached rt_runtime during (1), suppose after (2) RT is enqueued back, it gets throttled since rt timer didn't fire, replenishment is from now on eaten up by DL tasks that accrue their execution on rt_time (while rt timer is active - we have an RT task waiting for replenishment). FAIR tasks are not touched after this first period. Ok, this is not ideal, and the situation is even worse! What above (the nice case), practically never happens in reality, where your rt timer is not aligned to tasks periods, tasks are in general not periodic, etc.. Long story short, you always risk to overload your system. This patch is based on Peter's idea, but exploits an additional fact: if you don't have RT tasks enqueued, it makes little sense to continue incrementing rt_time once you reached the upper limit (DL tasks have their own mechanism for throttling). This cures both problems: - no matter how many DL instances in the past, you'll have an rt_time slightly above rt_runtime when an RT task is enqueued, and from that point on (after the first replenishment), the task will normally execute; - you can still eat up all bandwidth during the first period, but not anymore after that, remember that DL execution will increment rt_time till the upper limit is reached. The situation is still not perfect! 
But, we have a simple solution for now, that limits how much you can jeopardize your system, as we keep working towards the right answer: RT groups scheduled using deadline servers. Reported-by: Kirill Tkhai Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140225151515.617714e2f2cd6c558531ba61@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 ++++++-- kernel/sched/rt.c | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aecf93030e0b..6e79b3faa4cd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -562,6 +562,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) return 1; } +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). @@ -625,11 +627,13 @@ static void update_curr_dl(struct rq *rq) struct rt_rq *rt_rq = &rq->rt; raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; /* * We'll let actual RT tasks worry about the overflow here, we - * have our own CBS to keep us inline -- see above. + * have our own CBS to keep us inline; only account when RT + * bandwidth is relevant. 
*/ + if (sched_rt_bandwidth_account(rt_rq)) + rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a2740b775b45..1999021042c7 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + return (hrtimer_active(&rt_b->rt_period_timer) || + rt_rq->rt_time < rt_b->rt_runtime); +} + #ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. -- cgit v1.2.3 From e3703f8cdfcf39c25c4338c3ad8e68891cca3731 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2014 12:06:12 +0100 Subject: perf: Fix hotplug splat Drew Richardson reported that he could make the kernel go *boom* when hotplugging while having perf events active. It turned out that when you have a group event, the code in __perf_event_exit_context() fails to remove the group siblings from the context. We then proceed with destroying and freeing the event, and when you re-plug the CPU and try and add another event to that CPU, things go *boom* because you've still got dead entries there. 
Reported-by: Drew Richardson Signed-off-by: Peter Zijlstra Cc: Will Deacon Cc: Link: http://lkml.kernel.org/n/tip-k6v5wundvusvcseqj1si0oz0@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 56003c6edfd3..fa0b2d4ad83c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7856,14 +7856,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) static void __perf_event_exit_context(void *__info) { struct perf_event_context *ctx = __info; - struct perf_event *event, *tmp; + struct perf_event *event; perf_pmu_rotate_stop(ctx->pmu); - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_remove_from_context(event); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) __perf_remove_from_context(event); + rcu_read_unlock(); } static void perf_event_exit_cpu_context(int cpu) @@ -7887,11 +7887,11 @@ static void perf_event_exit_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + perf_event_exit_cpu_context(cpu); + mutex_lock(&swhash->hlist_mutex); swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); - - perf_event_exit_cpu_context(cpu); } #else static inline void perf_event_exit_cpu(int cpu) { } -- cgit v1.2.3 From f5f9739d7a0ccbdcf913a0b3604b134129d14f7e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 26 Feb 2014 11:19:33 +0000 Subject: sched: Put rq's sched_avg under CONFIG_FAIR_GROUP_SCHED The struct sched_avg of struct rq is only used in case group scheduling is enabled inside __update_tg_runnable_avg() to update per-cpu representation of a task group. I.e. that there is no need to maintain the runnable avg of a rq in the !CONFIG_FAIR_GROUP_SCHED case. 
This patch guards struct sched_avg of struct rq and update_rq_runnable_avg() with CONFIG_FAIR_GROUP_SCHED. There is an extra empty definition for update_rq_runnable_avg() necessary for the !CONFIG_FAIR_GROUP_SCHED && CONFIG_SMP case. The function print_cfs_group_stats() which prints out struct sched_avg of struct rq is already guarded with CONFIG_FAIR_GROUP_SCHED. Reviewed-by: Ben Segall Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/530DCDC5.1060406@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 +++++++------ kernel/sched/sched.h | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a3a41c61a2c9..be4f7d9eaf03 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2374,12 +2374,19 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) se->avg.load_avg_contrib >>= NICE_0_SHIFT; } } + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); +} #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_task_entity_contrib(struct sched_entity *se) @@ -2478,12 +2485,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); } -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); - __update_tg_runnable_avg(&rq->avg, 
&rq->cfs); -} - /* Add the load generated by se into cfs_rq's child load-average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d608125b36ef..046084ebb1fb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -541,6 +541,8 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + + struct sched_avg avg; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -630,8 +632,6 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif - - struct sched_avg avg; }; static inline int cpu_of(struct rq *rq) -- cgit v1.2.3 From 06d50c65b1043b166d102accc081093f79d8f7e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2014 18:22:07 +0100 Subject: sched/idle: Remove stale old file Commit cf37b6b48428d ("sched/idle: Move cpu/idle.c to sched/idle.c") said to simply move a file; somehow it got mangled and created an old version of the file and forgot to remove the old file. Fix this fail; add the lost change and remove the now identical old file. 
Signed-off-by: Peter Zijlstra Cc: rjw@rjwysocki.net Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/20140224172207.GC9987@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/cpu/idle.c | 147 ---------------------------------------------------- kernel/sched/idle.c | 17 +++--- 2 files changed, 10 insertions(+), 154 deletions(-) delete mode 100644 kernel/cpu/idle.c (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c deleted file mode 100644 index b7976a127178..000000000000 --- a/kernel/cpu/idle.c +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Generic entry point for the idle threads - */ -#include -#include -#include -#include -#include -#include - -#include - -#include - -static int __read_mostly cpu_idle_force_poll; - -void cpu_idle_poll_ctrl(bool enable) -{ - if (enable) { - cpu_idle_force_poll++; - } else { - cpu_idle_force_poll--; - WARN_ON_ONCE(cpu_idle_force_poll < 0); - } -} - -#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP -static int __init cpu_idle_poll_setup(char *__unused) -{ - cpu_idle_force_poll = 1; - return 1; -} -__setup("nohlt", cpu_idle_poll_setup); - -static int __init cpu_idle_nopoll_setup(char *__unused) -{ - cpu_idle_force_poll = 0; - return 1; -} -__setup("hlt", cpu_idle_nopoll_setup); -#endif - -static inline int cpu_idle_poll(void) -{ - rcu_idle_enter(); - trace_cpu_idle_rcuidle(0, smp_processor_id()); - local_irq_enable(); - while (!tif_need_resched()) - cpu_relax(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); - rcu_idle_exit(); - return 1; -} - -/* Weak implementations for optional arch specific functions */ -void __weak arch_cpu_idle_prepare(void) { } -void __weak arch_cpu_idle_enter(void) { } -void __weak arch_cpu_idle_exit(void) { } -void __weak arch_cpu_idle_dead(void) { } -void __weak arch_cpu_idle(void) -{ - cpu_idle_force_poll = 1; - local_irq_enable(); -} - -/* - * Generic idle loop implementation - */ -static void 
cpu_idle_loop(void) -{ - while (1) { - tick_nohz_idle_enter(); - - while (!need_resched()) { - check_pgt_cache(); - rmb(); - - if (cpu_is_offline(smp_processor_id())) - arch_cpu_idle_dead(); - - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) { - cpu_idle_poll(); - } else { - if (!current_clr_polling_and_test()) { - stop_critical_timings(); - rcu_idle_enter(); - if (cpuidle_idle_call()) - arch_cpu_idle(); - if (WARN_ON_ONCE(irqs_disabled())) - local_irq_enable(); - rcu_idle_exit(); - start_critical_timings(); - } else { - local_irq_enable(); - } - __current_set_polling(); - } - arch_cpu_idle_exit(); - } - - /* - * Since we fell out of the loop above, we know - * TIF_NEED_RESCHED must be set, propagate it into - * PREEMPT_NEED_RESCHED. - * - * This is required because for polling idle loops we will - * not have had an IPI to fold the state for us. - */ - preempt_set_need_resched(); - tick_nohz_idle_exit(); - schedule_preempt_disabled(); - } -} - -void cpu_startup_entry(enum cpuhp_state state) -{ - /* - * This #ifdef needs to die, but it's too late in the cycle to - * make this generic (arm and sh have never invoked the canary - * init for the non boot cpus!). Will be fixed in 3.11 - */ -#ifdef CONFIG_X86 - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. The boot CPU already has it initialized but no harm - * in doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). 
- */ - boot_init_stack_canary(); -#endif - __current_set_polling(); - arch_cpu_idle_prepare(); - cpu_idle_loop(); -} diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 14ca43430aee..b7976a127178 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -108,14 +108,17 @@ static void cpu_idle_loop(void) __current_set_polling(); } arch_cpu_idle_exit(); - /* - * We need to test and propagate the TIF_NEED_RESCHED - * bit here because we might not have send the - * reschedule IPI to idle tasks. - */ - if (tif_need_resched()) - set_preempt_need_resched(); } + + /* + * Since we fell out of the loop above, we know + * TIF_NEED_RESCHED must be set, propagate it into + * PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will + * not have had an IPI to fold the state for us. + */ + preempt_set_need_resched(); tick_nohz_idle_exit(); schedule_preempt_disabled(); } -- cgit v1.2.3 From 37e117c07b89194aae7062bc63bde1104c03db02 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Feb 2014 12:25:08 +0100 Subject: sched: Guarantee task priority in pick_next_task() Michael spotted that the idle_balance() push down created a task priority problem. Previously, when we called idle_balance() before pick_next_task() it wasn't a problem when -- because of the rq->lock droppage -- an rt/dl task slipped in. Similarly for pre_schedule(), rt pre-schedule could have a dl task slip in. But by pulling it into the pick_next_task() loop, we'll not try a higher task priority again. Cure this by creating a re-start condition in pick_next_task(); and triggering this from pick_next_task_{rt,fair}(). It also fixes a live-lock where we get stuck in pick_next_task_fair() due to idle_balance() seeing !0 nr_running but there not actually being any fair tasks about. 
Reported-by: Michael Wang Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()") Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: Juri Lelli Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140224121218.GR15586@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ++++++++---- kernel/sched/fair.c | 13 ++++++++++++- kernel/sched/rt.c | 10 +++++++++- kernel/sched/sched.h | 5 +++++ 4 files changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a8a73b8897bf..cde573d3f12e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2586,24 +2586,28 @@ static inline void schedule_debug(struct task_struct *prev) static inline struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev) { - const struct sched_class *class; + const struct sched_class *class = &fair_sched_class; struct task_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(prev->sched_class == &fair_sched_class && + if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq, prev); - if (likely(p)) + if (likely(p && p != RETRY_TASK)) return p; } +again: for_each_class(class) { p = class->pick_next_task(rq, prev); - if (p) + if (p) { + if (unlikely(p == RETRY_TASK)) + goto again; return p; + } } BUG(); /* the idle class will always have a runnable task */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index be4f7d9eaf03..16042b58a32f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4686,6 +4686,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; struct task_struct *p; + int new_tasks; again: #ifdef CONFIG_FAIR_GROUP_SCHED @@ -4784,7 +4785,17 @@ simple: return p; idle: - if (idle_balance(rq)) /* drops rq->lock */ + /* + * 
Because idle_balance() releases (and re-acquires) rq->lock, it is + * possible for any higher priority task to appear. In that case we + * must re-start the pick_next_entity() loop. + */ + new_tasks = idle_balance(rq); + + if (rq->nr_running != rq->cfs.h_nr_running) + return RETRY_TASK; + + if (new_tasks) goto again; return NULL; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4d4b386598aa..398b3f990823 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1360,8 +1360,16 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; - if (need_pull_rt_task(rq, prev)) + if (need_pull_rt_task(rq, prev)) { pull_rt_task(rq); + /* + * pull_rt_task() can drop (and re-acquire) rq->lock; this + * means a dl task can slip in, in which case we need to + * re-start task selection. + */ + if (unlikely(rq->dl.dl_nr_running)) + return RETRY_TASK; + } if (!rt_rq->rt_nr_running) return NULL; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 046084ebb1fb..1929deb3f29d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1091,6 +1091,8 @@ static const u32 prio_to_wmult[40] = { #define DEQUEUE_SLEEP 1 +#define RETRY_TASK ((void *)-1UL) + struct sched_class { const struct sched_class *next; @@ -1105,6 +1107,9 @@ struct sched_class { * It is the responsibility of the pick_next_task() method that will * return the next task to call put_prev_task() on the @prev task or * something equivalent. + * + * May return RETRY_TASK when it finds a higher prio class has runnable + * tasks. */ struct task_struct * (*pick_next_task) (struct rq *rq, struct task_struct *prev); -- cgit v1.2.3 From 2b3942e4bb20ef8ef26515bd958c2df83d9a6210 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 24 Feb 2014 22:12:01 +0800 Subject: trace: Replace hardcoding of 19 with MAX_NICE Use MAX_NICE instead of the value 19 for ring_buffer_benchmark. 
Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1393251121-25534-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer_benchmark.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a5457d577b98..0434ff1b808e 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -40,8 +40,8 @@ static int write_iteration = 50; module_param(write_iteration, uint, 0644); MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); -static int producer_nice = 19; -static int consumer_nice = 19; +static int producer_nice = MAX_NICE; +static int consumer_nice = MAX_NICE; static int producer_fifo = -1; static int consumer_fifo = -1; @@ -308,7 +308,7 @@ static void ring_buffer_producer(void) /* Let the user know that the test is running at low priority */ if (producer_fifo < 0 && consumer_fifo < 0 && - producer_nice == 19 && consumer_nice == 19) + producer_nice == MAX_NICE && consumer_nice == MAX_NICE) trace_printk("WARNING!!! This test is running at lowest priority.\n"); trace_printk("Time: %lld (usecs)\n", time); -- cgit v1.2.3 From 9e3170411ed171a126f4dca1672012a33efe59e5 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2014 17:44:18 +0000 Subject: perf: Fix prototype of find_pmu_context() For some reason find_pmu_context() is defined as returning void * rather than a __percpu struct perf_cpu_context *. As all the requisite types are defined in advance there's no reason to keep it that way. This patch modifies the prototype of find_pmu_context() to return a __percpu struct perf_cpu_context *.
Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra Reviewed-by: Dave Martin Acked-by: Will Deacon Link: http://lkml.kernel.org/r/1392054264-23570-2-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index fa990061aa6c..425159882a6f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6313,7 +6313,7 @@ static int perf_event_idx_default(struct perf_event *event) * Ensures all contexts with the same task_ctx_nr have the same * pmu_cpu_context too. */ -static void *find_pmu_context(int ctxn) +static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) { struct pmu *pmu; -- cgit v1.2.3 From fdded676c3ef680bf1abc415d307d7e69a6768d1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2014 17:44:19 +0000 Subject: perf: Remove redundant PMU assignment Currently perf_branch_stack_sched_in iterates over the set of pmus, checks that each pmu has a flush_branch_stack callback, then overwrites the pmu before calling the callback. This is either redundant or broken. In systems with a single hw pmu, pmu == cpuctx->ctx.pmu, and thus the assignment is redundant. In systems with multiple hw pmus (i.e. multiple pmus with task_ctx_nr == perf_hw_context) the pmus share the same perf_cpu_context. Thus the assignment can cause one of the pmus to flush its branch stack repeatedly rather than causing each of the pmus to flush their branch stacks. Worse still, if only some pmus have the callback the assignment can result in a branch to NULL. This patch removes the redundant assignment. 
Signed-off-by: Mark Rutland Acked-by: Will Deacon Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1392054264-23570-3-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 425159882a6f..823a53d72d6a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2582,8 +2582,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev, if (cpuctx->ctx.nr_branch_stack > 0 && pmu->flush_branch_stack) { - pmu = cpuctx->ctx.pmu; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); -- cgit v1.2.3 From 4a2345937c17722bd2979f662ae909846b4a052a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2014 12:43:31 +0100 Subject: perf: Optimize group_sched_in() Use the ctx pmu instead of the event pmu. When a group leader is a software event but the group contains hardware events, the entire group is on the hardware PMU. Using the hardware PMU for the transaction makes most sense since that's the most expensive one to program (and software PMUs generally don't have TXN support anyway).
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-sctoo9t2f3nn2c9g568928q3@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 823a53d72d6a..661951ab8ae7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1733,7 +1733,7 @@ group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - struct pmu *pmu = group_event->pmu; + struct pmu *pmu = ctx->pmu; u64 now = ctx->time; bool simulate = false; -- cgit v1.2.3 From 64be38ab03e9b238a1299857fef8b3707c0ed045 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:10:12 +0530 Subject: genirq: Include missing header file in irqdomain.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include the appropriate header file include/linux/of_irq.h in kernel/irq/irqdomain.c because it contains the prototype of a function defined in kernel/irq/irqdomain.c.
This eliminates the following warning in kernel/irq/irqdomain.c: kernel/irq/irqdomain.c:468:14: warning: no previous prototype for ‘irq_create_of_mapping’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Cc: Benjamin Herrenschmidt Link: http://lkml.kernel.org/r/eb89aebea7ff1a46122918ac389ebecf8248be9a.1393493276.git.rashika.kheria@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf68bb36fe58..f14033700c25 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 4729583006772b9530404bc1bb7c3aa4a10ffd4d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Feb 2014 18:19:03 +0800 Subject: cpuset: fix a locking issue in cpuset_migrate_mm() I can trigger a lockdep warning: # mount -t cgroup -o cpuset xxx /cgroup # mkdir /cgroup/cpuset # mkdir /cgroup/tmp # echo 0 > /cgroup/tmp/cpuset.cpus # echo 0 > /cgroup/tmp/cpuset.mems # echo 1 > /cgroup/tmp/cpuset.memory_migrate # echo $$ > /cgroup/tmp/tasks # echo 1 > /cgroup/tmp/cpuset.mems =============================== [ INFO: suspicious RCU usage. ] 3.14.0-rc1-0.1-default+ #32 Not tainted ------------------------------- include/linux/cgroup.h:682 suspicious rcu_dereference_check() usage! ... [] dump_stack+0x72/0x86 [] lockdep_rcu_suspicious+0x101/0x140 [] cpuset_migrate_mm+0xb1/0xe0 ... We used to hold cgroup_mutex when calling cpuset_migrate_mm(), but now we hold cpuset_mutex, which causes task_css() to complain. This is not a false-positive but a real issue. Holding cpuset_mutex won't prevent a task from migrating to another cpuset, and it won't prevent the original task->cgroup from being destroyed during this change.
Fixes: 5d21cc2db040 (cpuset: replace cgroup_mutex locking with cpuset internal locking) Cc: # 3.9+ Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4410ac6a55f1..dba9e4aef69a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -974,12 +974,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding cpuset_mutex, so current's cpuset won't change - * during this call, as manage_mutex holds off any cpuset_attach() - * calls. Therefore we don't need to take task_lock around the - * call to guarantee_online_mems(), as we know no one is changing - * our task's cpuset. - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -996,8 +990,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + rcu_read_lock(); mems_cs = effective_nodemask_cpuset(task_cs(tsk)); guarantee_online_mems(mems_cs, &tsk->mems_allowed); + rcu_read_unlock(); } /* -- cgit v1.2.3 From 99afb0fd5f05aac467ffa85c36778fec4396209b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Feb 2014 18:19:36 +0800 Subject: cpuset: fix a race condition in __cpuset_node_allowed_softwall() It's not safe to access task's cpuset after releasing task_lock(). Holding callback_mutex won't help.
Cc: Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index dba9e4aef69a..e6b1b66afe52 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2482,9 +2482,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) task_lock(current); cs = nearest_hardwall_ancestor(task_cs(current)); + allowed = node_isset(node, cs->mems_allowed); task_unlock(current); - allowed = node_isset(node, cs->mems_allowed); mutex_unlock(&callback_mutex); return allowed; } -- cgit v1.2.3 From 864f32a52b53341c54a25953afb5b66ed79a7f76 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:50:19 +0530 Subject: kernel: Mark function as static in kernel/seccomp.c Mark function as static in kernel/seccomp.c because it is not used outside this file. This eliminates the following warning in kernel/seccomp.c: kernel/seccomp.c:296:6: warning: no previous prototype for ‘seccomp_attach_user_filter’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Acked-by: Kees Cook Acked-by: Will Drewry Signed-off-by: James Morris --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b7a10048a32c..0e004a70f63a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -293,7 +293,7 @@ fail: * * Returns 0 on success and non-zero otherwise. */ -long seccomp_attach_user_filter(char __user *user_filter) +static long seccomp_attach_user_filter(char __user *user_filter) { struct sock_fprog fprog; long ret = -EFAULT; -- cgit v1.2.3 From 48095d991d85687569ac025b18a6c7ae1632c9f7 Mon Sep 17 00:00:00 2001 From: "Eric W.
Biederman" Date: Mon, 3 Feb 2014 17:25:33 -0800 Subject: audit: Use struct net not pid_t to remember the network namespace to reply in In struct audit_netlink_list and audit_reply add a reference to the network namespace of the caller and remove the userspace pid of the caller. This cleanly remembers the caller's network namespace, and removes a huge class of races and nasty failure modes that can occur when attempting to relook up the caller's network namespace from a pid_t (including the caller's network namespace changing, pid wraparound, and the pid simply not being present). Signed-off-by: "Eric W. Biederman" --- kernel/audit.c | 10 ++++++---- kernel/audit.h | 2 +- kernel/auditfilter.c | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 34c5a2310fbf..1e5756f16f6f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -182,7 +182,7 @@ struct audit_buffer { struct audit_reply { __u32 portid; - pid_t pid; + struct net *net; struct sk_buff *skb; }; @@ -500,7 +500,7 @@ int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; - struct net *net = get_net_ns_by_pid(dest->pid); + struct net *net = dest->net; struct audit_net *aunet = net_generic(net, audit_net_id); /* wait for parent to finish and send an ACK */ @@ -510,6 +510,7 @@ int audit_send_list(void *_dest) while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(aunet->nlsk, skb, dest->portid, 0); + put_net(net); kfree(dest); return 0; @@ -543,7 +544,7 @@ out_kfree_skb: static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct net *net = get_net_ns_by_pid(reply->pid); + struct net *net = reply->net; struct audit_net *aunet = net_generic(net, audit_net_id); mutex_lock(&audit_cmd_mutex); @@ -552,6 +553,7 @@ static int audit_send_reply_thread(void *arg) /* Ignore failure.
It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0); + put_net(net); kfree(reply); return 0; } @@ -583,8 +585,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done, if (!skb) goto out; + reply->net = get_net(current->nsproxy->net_ns); reply->portid = portid; - reply->pid = task_pid_vnr(current); reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); diff --git a/kernel/audit.h b/kernel/audit.h index 57cc64d67718..8df132214606 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -247,7 +247,7 @@ extern void audit_panic(const char *message); struct audit_netlink_list { __u32 portid; - pid_t pid; + struct net *net; struct sk_buff_head q; }; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 14a78cca384e..a5e3d73d73e4 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "audit.h" /* @@ -1083,8 +1084,8 @@ int audit_list_rules_send(__u32 portid, int seq) dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; + dest->net = get_net(current->nsproxy->net_ns); dest->portid = portid; - dest->pid = task_pid_vnr(current); skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); -- cgit v1.2.3 From 04b73469750050290cb0a773e7ecf2358d65f6d5 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:13:53 +0530 Subject: PM / sleep: Move prototype declaration to header file kernel/power/power.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype declaration of function to header file kernel/power/power.h because it is used by more than one file. 
This eliminates the following warning in kernel/power/snapshot.c: kernel/power/snapshot.c:1588:16: warning: no previous prototype for ‘swsusp_save’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 7d4b7ffb3c1d..1ca753106557 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -49,6 +49,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) +asmlinkage int swsusp_save(void); + /* kernel/power/hibernate.c */ extern bool freezer_test_done; -- cgit v1.2.3 From 6c5be2916565e8c710e9f0f7b43cf65a3ba39dd9 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:15:54 +0530 Subject: PM / wakeup: Include appropriate header file in kernel/power/wakelock.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include appropriate header file kernel/power/power.h in kernel/power/wakelock.c because it has prototype declaration of function defined in kernel/power/wakelock.c. This eliminates the following warning in kernel/power/wakelock.c: kernel/power/wakelock.c:34:9: warning: no previous prototype for ‘pm_show_wakelocks’ [-Wmissing-prototypes] kernel/power/wakelock.c:184:5: warning: no previous prototype for ‘pm_wake_lock’ [-Wmissing-prototypes] kernel/power/wakelock.c:232:5: warning: no previous prototype for ‘pm_wake_unlock’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: Rafael J. 
Wysocki --- kernel/power/wakelock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 8f50de394d22..019069c84ff6 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -18,6 +18,8 @@ #include #include +#include "power.h" + static DEFINE_MUTEX(wakelocks_lock); struct wakelock { -- cgit v1.2.3 From 421a5fa1a6cfc037a21220b638d4def6da7cbabe Mon Sep 17 00:00:00 2001 From: Sebastian Capella Date: Fri, 14 Feb 2014 14:52:56 -0800 Subject: PM / hibernate: use name_to_dev_t to parse resume Use the name_to_dev_t call to parse the device name echo'd to /sys/power/resume. This imitates the method used in hibernate.c in software_resume, and allows the resume partition to be specified using other equivalent device formats as well. By allowing /sys/power/resume to accept the same syntax as the resume=device parameter, we can parse the resume=device in the init script and use the resume device directly from the kernel command line. Signed-off-by: Sebastian Capella Acked-by: Pavel Machek Signed-off-by: Rafael J.
Wysocki --- kernel/power/hibernate.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 37170d4dd9a6..f4f2073711d3 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -973,16 +973,20 @@ static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { - unsigned int maj, min; dev_t res; - int ret = -EINVAL; + int len = n; + char *name; - if (sscanf(buf, "%u:%u", &maj, &min) != 2) - goto out; + if (len && buf[len-1] == '\n') + len--; + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; - res = MKDEV(maj,min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto out; + res = name_to_dev_t(name); + kfree(name); + if (!res) + return -EINVAL; lock_system_sleep(); swsusp_resume_device = res; @@ -990,9 +994,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, printk(KERN_INFO "PM: Starting manual resume from disk\n"); noresume = 0; software_resume(); - ret = n; - out: - return ret; + return n; } power_attr(resume); -- cgit v1.2.3 From 6f285b19d09f72e801525f5eea1bdad22e559bf0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Feb 2014 19:44:55 -0800 Subject: audit: Send replies in the proper network namespace. In perverse cases of file descriptor passing the current network namespace of a process and the network namespace of a socket used by that process may differ. Therefore use the network namespace of the appropriate socket to ensure replies always go to the appropriate socket. Signed-off-by: "Eric W.
Biederman" --- kernel/audit.c | 21 ++++++++++----------- kernel/auditfilter.c | 7 +++++-- 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 1e5756f16f6f..32086bff5564 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -570,9 +570,11 @@ static int audit_send_reply_thread(void *arg) * Allocates an skb, builds the netlink message, and sends it to the port id. * No failure notifications. */ -static void audit_send_reply(__u32 portid, int seq, int type, int done, +static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct sk_buff *skb; struct task_struct *tsk; struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), @@ -585,7 +587,7 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done, if (!skb) goto out; - reply->net = get_net(current->nsproxy->net_ns); + reply->net = get_net(net); reply->portid = portid; reply->skb = skb; @@ -675,8 +677,7 @@ static int audit_get_feature(struct sk_buff *skb) seq = nlmsg_hdr(skb)->nlmsg_seq; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &af, sizeof(af)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); return 0; } @@ -796,8 +797,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.backlog = skb_queue_len(&audit_skb_queue); s.version = AUDIT_VERSION_LATEST; s.backlog_wait_time = audit_backlog_wait_time; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { @@ -907,7 +907,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq, data, nlmsg_len(nlh)); break; case AUDIT_LIST_RULES: - err = audit_list_rules_send(NETLINK_CB(skb).portid, seq); + err = 
audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); @@ -972,8 +972,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, - 0, 0, sig_data, sizeof(*sig_data) + len); + audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, + sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; case AUDIT_TTY_GET: { @@ -985,8 +985,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.log_passwd = tsk->signal->audit_tty_log_passwd; spin_unlock(&tsk->sighand->siglock); - audit_send_reply(NETLINK_CB(skb).portid, seq, - AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a5e3d73d73e4..e8d1c7c515d7 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "audit.h" /* @@ -1069,8 +1070,10 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, * @portid: target portid for netlink audit messages * @seq: netlink audit message sequence (serial) number */ -int audit_list_rules_send(__u32 portid, int seq) +int audit_list_rules_send(struct sk_buff *request_skb, int seq) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct task_struct *tsk; struct audit_netlink_list *dest; int err = 0; @@ -1084,7 +1087,7 @@ int audit_list_rules_send(__u32 portid, int seq) dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; - dest->net = get_net(current->nsproxy->net_ns); + dest->net = get_net(net); dest->portid = portid; skb_queue_head_init(&dest->q); -- cgit v1.2.3 From 64e8d20bd39b81994d9cd60e7b42f8ec8652f5af Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 
17:25:54 +0530 Subject: kernel: Include appropriate header file in time/timekeeping_debug.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include appropriate header file kernel/time/timekeeping_internal.h in kernel/time/timekeeping_debug.c because it has prototype declaration of function defined in kernel/time/timekeeping_debug.c. This eliminates the following warning in kernel/time/timekeeping_debug.c: kernel/time/timekeeping_debug.c:68:6: warning: no previous prototype for ‘tk_debug_account_sleep_time’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: John Stultz --- kernel/time/timekeeping_debug.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 802433a4f5eb..4d54f97558df 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -21,6 +21,8 @@ #include #include +#include "timekeeping_internal.h" + static unsigned int sleep_time_bin[32] = {0}; static int tk_debug_show_sleep_time(struct seq_file *s, void *data) -- cgit v1.2.3 From 03b8c7b623c80af264c4c8d6111e5c6289933666 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 2 Mar 2014 13:09:47 +0100 Subject: futex: Allow architectures to skip futex_atomic_cmpxchg_inatomic() test If an architecture has futex_atomic_cmpxchg_inatomic() implemented and there is no runtime check necessary, allow to skip the test within futex_init(). This allows to get rid of some code which would always give the same result, and also allows the compiler to optimize a couple of if statements away. 
Signed-off-by: Heiko Carstens Cc: Finn Thain Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20140302120947.GA3641@osiris Signed-off-by: Thomas Gleixner --- kernel/futex.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 44a1261cb9ff..5d17e3a83f8c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -157,7 +157,9 @@ * enqueue. */ +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG int __read_mostly futex_cmpxchg_enabled; +#endif /* * Futex flags used to encode options to functions and preserve them across @@ -2843,9 +2845,28 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -static int __init futex_init(void) +static void __init futex_detect_cmpxchg(void) { +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG u32 curval; + + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non-functional ones will return + * -ENOSYS. + */ + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) + futex_cmpxchg_enabled = 1; +#endif +} + +static int __init futex_init(void) +{ unsigned int futex_shift; unsigned long i; @@ -2861,18 +2882,8 @@ static int __init futex_init(void) &futex_shift, NULL, futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; - /* - * This will fail and we want it. Some arch implementations do - * runtime detection of the futex_atomic_cmpxchg_inatomic() - * functionality. We want to know that before we call in any - * of the complex code paths. Also we want to prevent - * registration of robust lists in that case. 
NULL is - * guaranteed to fault and we get -EFAULT on functional - * implementation, the non-functional ones will return - * -ENOSYS. - */ - if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) - futex_cmpxchg_enabled = 1; + + futex_detect_cmpxchg(); for (i = 0; i < futex_hashsize; i++) { plist_head_init(&futex_queues[i].chain); -- cgit v1.2.3 From b8dadcb58d542ecbf1d3dae5fefcd3fd8cb26539 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 3 Mar 2014 17:28:36 -0500 Subject: cpuset: use rcu_read_lock() to protect task_cs() We no longer use task_lock() to protect tsk->cgroups. Reported-by: Fengguang Wu Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d8bec21d7a11..8d5324583aa4 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2239,10 +2239,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) struct cpuset *cpus_cs; mutex_lock(&callback_mutex); - task_lock(tsk); + rcu_read_lock(); cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); guarantee_online_cpus(cpus_cs, pmask); - task_unlock(tsk); + rcu_read_unlock(); mutex_unlock(&callback_mutex); } @@ -2295,10 +2295,10 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) nodemask_t mask; mutex_lock(&callback_mutex); - task_lock(tsk); + rcu_read_lock(); mems_cs = effective_nodemask_cpuset(task_cs(tsk)); guarantee_online_mems(mems_cs, &mask); - task_unlock(tsk); + rcu_read_unlock(); mutex_unlock(&callback_mutex); return mask; @@ -2414,9 +2414,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) /* Not hardwall and node outside mems_allowed: scan up cpusets */ mutex_lock(&callback_mutex); - task_lock(current); + rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); - task_unlock(current); + rcu_read_unlock(); allowed = node_isset(node, cs->mems_allowed); mutex_unlock(&callback_mutex); 
@@ -2543,24 +2543,26 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, * @task: pointer to task_struct of some task. * * Description: Prints @task's name, cpuset name, and cached copy of its - * mems_allowed to the kernel log. Must hold task_lock(task) to allow - * dereferencing task_cs(task). + * mems_allowed to the kernel log. */ void cpuset_print_task_mems_allowed(struct task_struct *tsk) { /* Statically allocated to prevent using excess stack. */ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); - struct cgroup *cgrp = task_cs(tsk)->css.cgroup; + struct cgroup *cgrp; spin_lock(&cpuset_buffer_lock); + rcu_read_lock(); + cgrp = task_cs(tsk)->css.cgroup; nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); printk(KERN_INFO "%s cpuset=", tsk->comm); pr_cont_cgroup_name(cgrp); pr_cont(" mems_allowed=%s\n", cpuset_nodelist); + rcu_read_unlock(); spin_unlock(&cpuset_buffer_lock); } @@ -2592,9 +2594,9 @@ int cpuset_memory_pressure_enabled __read_mostly; void __cpuset_memory_pressure_bump(void) { - task_lock(current); + rcu_read_lock(); fmeter_markevent(&task_cs(current)->fmeter); - task_unlock(current); + rcu_read_unlock(); } #ifdef CONFIG_PROC_PID_CPUSET -- cgit v1.2.3 From 45ab2813d40d88fc575e753c38478de242d03f88 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 26 Feb 2014 13:37:38 -0500 Subject: tracing: Do not add event files for modules that fail tracepoints If a module fails to add its tracepoints due to module tainting, do not create the module event infrastructure in the debugfs directory. As the events will not work and worse yet, they will silently fail, making the user wonder why the events they enable do not display anything. Having a warning on module load and the events not visible to the users will make the cause of the problem much clearer. 
Link: http://lkml.kernel.org/r/20140227154923.265882695@goodmis.org Fixes: 6d723736e472 "tracing/events: add support for modules to TRACE_EVENT" Acked-by: Mathieu Desnoyers Cc: stable@vger.kernel.org # 2.6.31+ Cc: Rusty Russell Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 10 ++++++++++ kernel/tracepoint.c | 7 ++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e71ffd4eccb5..f3989ceb5cd5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1777,6 +1777,16 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_event_call **call, **start, **end; + if (!mod->num_trace_events) + return; + + /* Don't add infrastructure for mods without tracepoints */ + if (trace_module_has_bad_taint(mod)) { + pr_err("%s: module has bad taint, not creating trace events\n", + mod->name); + return; + } + start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 29f26540e9c9..031cc5655a51 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES +bool trace_module_has_bad_taint(struct module *mod) +{ + return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); +} + static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod, *iter; @@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod) * module headers (for forced load), to make sure we don't cause a crash. * Staging and out-of-tree GPL modules are fine. 
*/ - if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) + if (trace_module_has_bad_taint(mod)) return 0; mutex_lock(&tracepoints_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); -- cgit v1.2.3 From 7dec935a3aa04412cba2cebe1524ae0d34a30c24 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 26 Feb 2014 10:54:36 -0500 Subject: tracepoint: Do not waste memory on mods with no tracepoints No reason to allocate tp_module structures for modules that have no tracepoints. This just wastes memory. Fixes: b75ef8b44b1c "Tracepoint: Dissociate from module mutex" Acked-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 29f26540e9c9..0d4ef26574ff 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -636,6 +636,9 @@ static int tracepoint_module_coming(struct module *mod) struct tp_module *tp_mod, *iter; int ret = 0; + if (!mod->num_tracepoints) + return 0; + /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. @@ -679,6 +682,9 @@ static int tracepoint_module_going(struct module *mod) { struct tp_module *pos; + if (!mod->num_tracepoints) + return 0; + mutex_lock(&tracepoints_mutex); tracepoint_update_probe_range(mod->tracepoints_ptrs, mod->tracepoints_ptrs + mod->num_tracepoints); -- cgit v1.2.3 From c24a4a369419c360c323865b91198878275c1481 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 28 Feb 2014 14:15:21 +0530 Subject: timer: Check failure of timer_cpu_notify() before calling init_timer_stats() timer_cpu_notify() should return NOTIFY_OK and nothing else. Anything else would trigger a BUG_ON(). Return value of this routine is already checked correctly but is done after issuing a call to init_timer_stats(). 
The right order would be to check the error case first and then call init_timer_stats(). Let's do it. Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: tj@kernel.org Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/c439f5b6bbc2047e1662f4d523350531425bcf9d.1393576981.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index a71bdfdb51e7..31824ef3eb96 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1681,9 +1681,9 @@ void __init init_timers(void) err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); - init_timer_stats(); - BUG_ON(err != NOTIFY_OK); + + init_timer_stats(); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } -- cgit v1.2.3 From 38edbb0b913d73713c23dcc742669f7e78b52aa7 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 28 Feb 2014 14:15:22 +0530 Subject: timer: Make sure TIMER_FLAG_MASK bits are free in allocated base Currently we are using the two lowest bits of base for internal purposes, so they both should be zero in the allocated address. The code was doing the right thing before this patch came in: commit c5f66e99b (timer: Implement TIMER_IRQSAFE) Tejun probably forgot to update this piece of code which checks if the lowest 'n' bits are zero or not and so wasn't updated according to the new flag. Let's use TIMER_FLAG_MASK in the calculations here.
[ tglx: Massaged changelog ] Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: tj@kernel.org Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/9144e10d7e854a0aa8a673332adec356d81a923c.1393576981.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 31824ef3eb96..949d74ea0ce4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1555,9 +1555,8 @@ static int init_timers_cpu(int cpu) if (!base) return -ENOMEM; - /* Make sure that tvec_base is 2 byte aligned */ - if (tbase_get_deferrable(base)) { - WARN_ON(1); + /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ + if (WARN_ON(base != tbase_get_base(base))) { kfree(base); return -ENOMEM; } -- cgit v1.2.3 From 792d0018a5fe31ef8ef9d07a7a02081d4abdf6b7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Feb 2014 21:40:14 +0000 Subject: genirq: Add a kstat helper to increment irq stats There is a common pattern all over the place: kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); This results in a call to core code anyway. So provide a function which does the same thing in core. While at it, replace the butt ugly macro with an inline. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140223212737.422068876@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8ab8e9390297..a7174617616b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -489,6 +489,11 @@ void dynamic_irq_cleanup(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } +void kstat_incr_irq_this_cpu(unsigned int irq) +{ + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); +} + unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); -- cgit v1.2.3 From 8f945a3325bbe0dd651e2f496a53df9b06fc6d07 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Feb 2014 21:40:23 +0000 Subject: genirq: Move kstat_incr_irqs_this_cpu() to core No more users outside the core code. Put it into the poison cabinet. That also gets rid of the linux/irq.h include in kernel_stat.h Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140223212739.124207133@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index d61ac29e32d0..17b671713d5f 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -6,6 +6,7 @@ * of this file for your non core code. 
*/ #include +#include #ifdef CONFIG_SPARSE_IRQ # define IRQ_BITMAP_BITS (NR_IRQS + 8196) @@ -180,3 +181,9 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) { return d->state_use_accessors & mask; } + +static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) +{ + __this_cpu_inc(*desc->kstat_irqs); + __this_cpu_inc(kstat.irqs_sum); +} -- cgit v1.2.3 From af5040da01ef980670b3741b3e10733ee3e33566 Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Tue, 4 Mar 2014 23:13:10 +0900 Subject: blktrace: fix accounting of partially completed requests trace_block_rq_complete does not take into account that request can be partially completed, so we can get the following incorrect output of blkparser: C R 232 + 240 [0] C R 240 + 232 [0] C R 248 + 224 [0] C R 256 + 216 [0] but should be: C R 232 + 8 [0] C R 240 + 8 [0] C R 248 + 8 [0] C R 256 + 8 [0] Also, the whole output summary statistics of completed requests and final throughput will be incorrect. This patch takes into account real completion size of the request and fixes wrong completion accounting. 
Signed-off-by: Roman Pen CC: Steven Rostedt CC: Frederic Weisbecker CC: Ingo Molnar CC: linux-kernel@vger.kernel.org Cc: stable@kernel.org Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b418cb0d7242..4f3a3c03eadb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -702,6 +702,7 @@ void blk_trace_shutdown(struct request_queue *q) * blk_add_trace_rq - Add a trace for a request oriented action * @q: queue the io is for * @rq: the source request + * @nr_bytes: number of completed bytes * @what: the action * * Description: @@ -709,7 +710,7 @@ void blk_trace_shutdown(struct request_queue *q) * **/ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) + unsigned int nr_bytes, u32 what) { struct blk_trace *bt = q->blk_trace; @@ -718,11 +719,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, + __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, rq->cmd_flags, what, rq->errors, 0, NULL); } } @@ -730,33 +731,34 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, static void blk_add_trace_rq_abort(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_ABORT); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_INSERT); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); } static void blk_add_trace_rq_issue(void 
*ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_ISSUE); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); } static void blk_add_trace_rq_complete(void *ignore, struct request_queue *q, - struct request *rq) + struct request *rq, + unsigned int nr_bytes) { - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); + blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); } /** -- cgit v1.2.3 From 62a6fa97684ed4c124564ea92500ecd513d60611 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 3 Mar 2014 16:11:13 +0100 Subject: kernel/compat: convert to COMPAT_SYSCALL_DEFINE Convert all compat system call functions where all parameter types have a size of four or less than four bytes, or are pointer types to COMPAT_SYSCALL_DEFINE. The implicit casts within COMPAT_SYSCALL_DEFINE will perform proper zero and sign extension to 64 bit of all parameters if needed. 
Signed-off-by: Heiko Carstens --- kernel/compat.c | 90 ++++++++++++++++++++++++++++----------------------------- kernel/ptrace.c | 4 +-- 2 files changed, 47 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 0a09e481b70b..2622011a44c9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -110,8 +110,8 @@ static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) return 0; } -asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) { if (tv) { struct timeval ktv; @@ -127,8 +127,8 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, return 0; } -asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) +COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) { struct timespec kts; struct timezone ktz; @@ -236,8 +236,8 @@ static long compat_nanosleep_restart(struct restart_block *restart) return ret; } -asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp) +COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { struct timespec tu, rmt; mm_segment_t oldfs; @@ -328,7 +328,7 @@ static compat_clock_t clock_t_to_compat_clock_t(clock_t x) return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); } -asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) +COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) { if (tbuf) { struct tms tms; @@ -354,7 +354,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) * types that can be passed to put_user()/get_user(). 
*/ -asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) +COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set) { old_sigset_t s; long ret; @@ -424,8 +424,8 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, #endif -asmlinkage long compat_sys_setrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) +COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) { struct rlimit r; @@ -443,8 +443,8 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, #ifdef COMPAT_RLIM_OLD_INFINITY -asmlinkage long compat_sys_old_getrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) +COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) { struct rlimit r; int ret; @@ -470,8 +470,8 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, #endif -asmlinkage long compat_sys_getrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) +COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) { struct rlimit r; int ret; @@ -596,9 +596,9 @@ static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, return compat_get_bitmap(k, user_mask_ptr, len * 8); } -asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, - unsigned int len, - compat_ulong_t __user *user_mask_ptr) +COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid, + unsigned int, len, + compat_ulong_t __user *, user_mask_ptr) { cpumask_var_t new_mask; int retval; @@ -616,8 +616,8 @@ out: return retval; } -asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, - compat_ulong_t __user *user_mask_ptr) +COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, + compat_ulong_t __user *, user_mask_ptr) { int ret; cpumask_var_t mask; @@ -662,9 +662,9 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst, return 0; } -long 
compat_sys_timer_create(clockid_t which_clock, - struct compat_sigevent __user *timer_event_spec, - timer_t __user *created_timer_id) +COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, + struct compat_sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) { struct sigevent __user *event = NULL; @@ -680,9 +680,9 @@ long compat_sys_timer_create(clockid_t which_clock, return sys_timer_create(which_clock, event, created_timer_id); } -long compat_sys_timer_settime(timer_t timer_id, int flags, - struct compat_itimerspec __user *new, - struct compat_itimerspec __user *old) +COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + struct compat_itimerspec __user *, new, + struct compat_itimerspec __user *, old) { long err; mm_segment_t oldfs; @@ -703,8 +703,8 @@ long compat_sys_timer_settime(timer_t timer_id, int flags, return err; } -long compat_sys_timer_gettime(timer_t timer_id, - struct compat_itimerspec __user *setting) +COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct compat_itimerspec __user *, setting) { long err; mm_segment_t oldfs; @@ -720,8 +720,8 @@ long compat_sys_timer_gettime(timer_t timer_id, return err; } -long compat_sys_clock_settime(clockid_t which_clock, - struct compat_timespec __user *tp) +COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, + struct compat_timespec __user *, tp) { long err; mm_segment_t oldfs; @@ -737,8 +737,8 @@ long compat_sys_clock_settime(clockid_t which_clock, return err; } -long compat_sys_clock_gettime(clockid_t which_clock, - struct compat_timespec __user *tp) +COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, + struct compat_timespec __user *, tp) { long err; mm_segment_t oldfs; @@ -754,8 +754,8 @@ long compat_sys_clock_gettime(clockid_t which_clock, return err; } -long compat_sys_clock_adjtime(clockid_t which_clock, - struct compat_timex __user *utp) +COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, + struct compat_timex __user 
*, utp) { struct timex txc; mm_segment_t oldfs; @@ -777,8 +777,8 @@ long compat_sys_clock_adjtime(clockid_t which_clock, return ret; } -long compat_sys_clock_getres(clockid_t which_clock, - struct compat_timespec __user *tp) +COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, + struct compat_timespec __user *, tp) { long err; mm_segment_t oldfs; @@ -818,9 +818,9 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) return err; } -long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, - struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp) +COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, + struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { long err; mm_segment_t oldfs; @@ -1010,7 +1010,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, /* compat_time_t is a 32 bit "long" and needs to get converted. */ -asmlinkage long compat_sys_time(compat_time_t __user * tloc) +COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) { compat_time_t i; struct timeval tv; @@ -1026,7 +1026,7 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc) return i; } -asmlinkage long compat_sys_stime(compat_time_t __user *tptr) +COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) { struct timespec tv; int err; @@ -1046,7 +1046,7 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr) #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ -asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) +COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) { struct timex txc; int err, ret; @@ -1085,10 +1085,10 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } -asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, - compat_ulong_t maxnode, - const compat_ulong_t __user *old_nodes, - const compat_ulong_t __user 
*new_nodes) +COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, + compat_ulong_t, maxnode, + const compat_ulong_t __user *, old_nodes, + const compat_ulong_t __user *, new_nodes) { unsigned long __user *old = NULL; unsigned long __user *new = NULL; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1f4bcb3cc21c..adf98622cb32 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1180,8 +1180,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, return ret; } -asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, - compat_long_t addr, compat_long_t data) +COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid, + compat_long_t, addr, compat_long_t, data) { struct task_struct *child; long ret; -- cgit v1.2.3 From ca2c405ab90591dcb1bc3765467cbdf2b99a0f6a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Mar 2014 17:13:42 +0100 Subject: kexec/compat: convert to COMPAT_SYSCALL_DEFINE with changing parameter types In order to allow the COMPAT_SYSCALL_DEFINE macro generate code that performs proper zero and sign extension convert all 64 bit parameters to their corresponding 32 bit compat counterparts. 
Signed-off-by: Heiko Carstens --- kernel/kexec.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 60bafbed06ab..45601cf41bee 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1039,10 +1039,10 @@ void __weak crash_unmap_reserved_pages(void) {} #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_kexec_load(unsigned long entry, - unsigned long nr_segments, - struct compat_kexec_segment __user *segments, - unsigned long flags) +COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, + compat_ulong_t, nr_segments, + struct compat_kexec_segment __user *, segments, + compat_ulong_t, flags) { struct compat_kexec_segment in; struct kexec_segment out, __user *ksegments; -- cgit v1.2.3 From 2f2728f6de9837abe4b354443a45be578fbbf942 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Mar 2014 17:18:23 +0100 Subject: mm/compat: convert to COMPAT_SYSCALL_DEFINE with changing parameter types In order to allow the COMPAT_SYSCALL_DEFINE macro generate code that performs proper zero and sign extension convert all 64 bit parameters to their corresponding 32 bit compat counterparts. 
Signed-off-by: Heiko Carstens --- kernel/compat.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 2622011a44c9..488ff8c4cf48 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1065,11 +1065,11 @@ COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) } #ifdef CONFIG_NUMA -asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, - compat_uptr_t __user *pages32, - const int __user *nodes, - int __user *status, - int flags) +COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, + compat_uptr_t __user *, pages32, + const int __user *, nodes, + int __user *, status, + int, flags) { const void __user * __user *pages; int i; -- cgit v1.2.3 From 1d6bae966e90134bcfd7807b8f9488d55198de91 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 9 Aug 2012 19:16:14 -0400 Subject: tracing: Move raw output code from macro to standalone function The code for trace events to format the raw recorded event data into human readable format in the 'trace' file is repeated for every event in the system. When you have over 500 events, this can add up quite a bit. By making helper functions in the core kernel to do the work instead, we can shrink the size of the kernel down a bit. With a kernel configured with 502 events, the change in size was: text data bss dec hex filename 12991007 1913568 9785344 24689919 178bcff /tmp/vmlinux.orig 12990946 1913568 9785344 24689858 178bcc2 /tmp/vmlinux.patched Note, this version does not save as much as the version of this patch I had a few years ago. That is because in the mean time, commit f71130de5c7f ("tracing: Add a helper function for event print functions") did a lot of the work my original patch did. But this change helps slightly, and is part of a larger clean up to reduce the size much further. 
Link: http://lkml.kernel.org/r/20120810034707.378538034@goodmis.org Cc: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ed32284fbe32..ca0e79e2abaa 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -439,6 +439,37 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, } EXPORT_SYMBOL(ftrace_raw_output_prep); +static int ftrace_output_raw(struct trace_iterator *iter, char *name, + char *fmt, va_list ap) +{ + struct trace_seq *s = &iter->seq; + int ret; + + ret = trace_seq_printf(s, "%s: ", name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_vprintf(s, fmt, ap); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = ftrace_output_raw(iter, name, fmt, ap); + va_end(ap); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_output_call); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.2.3 From 35bb4399bd0ef16b8a57fccea0047d98b6b0e7fb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 9 Aug 2012 22:26:46 -0400 Subject: tracing: Move event storage for array from macro to standalone function The code that shows array fields for events is defined for all events. This can add up quite a bit when you have over 500 events. By making helper functions in the core kernel to do the work instead, we can shrink the size of the kernel down a bit. With a kernel configured with 502 events, the change in size was: text data bss dec hex filename 12990946 1913568 9785344 24689858 178bcc2 /tmp/vmlinux 12987390 1913504 9785344 24686238 178ae9e /tmp/vmlinux.patched That's a total of 3556 bytes, which comes down to 7 bytes per event. 
Although it's not much, this code is just called at initialization of the events. Link: http://lkml.kernel.org/r/20120810034708.084036335@goodmis.org Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 6 ------ kernel/trace/trace_export.c | 12 ++++-------- kernel/trace/trace_output.c | 21 +++++++++++++++++++++ 3 files changed, 25 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e71ffd4eccb5..22826c73a9da 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,12 +27,6 @@ DEFINE_MUTEX(event_mutex); -DEFINE_MUTEX(event_storage_mutex); -EXPORT_SYMBOL_GPL(event_storage_mutex); - -char event_storage[EVENT_STORAGE_SIZE]; -EXPORT_SYMBOL_GPL(event_storage); - LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_common_fields); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7c3e3e72e2b6..39c746c5ae73 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -96,14 +96,10 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __array(type, item, len) \ do { \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - mutex_lock(&event_storage_mutex); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), filter_type); \ - mutex_unlock(&event_storage_mutex); \ + ret = ftrace_event_define_field(event_call, #type, len, \ + #item, offsetof(typeof(field), item), \ + sizeof(field.item), \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; \ } while (0); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ca0e79e2abaa..ee8d74840b88 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -20,6 +20,10 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int 
next_event_type = __TRACE_LAST_TYPE + 1; +#define EVENT_STORAGE_SIZE 128 +static DEFINE_MUTEX(event_storage_mutex); +static char event_storage[EVENT_STORAGE_SIZE]; + int trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; @@ -470,6 +474,23 @@ int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) } EXPORT_SYMBOL_GPL(ftrace_output_call); +int ftrace_event_define_field(struct ftrace_event_call *call, + char *type, int len, char *item, int offset, + int field_size, int sign, int filter) +{ + int ret; + + mutex_lock(&event_storage_mutex); + snprintf(event_storage, sizeof(event_storage), + "%s[%d]", type, len); + ret = trace_define_field(call, event_storage, item, offset, + field_size, sign, filter); + mutex_unlock(&event_storage_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_event_define_field); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.2.3 From 3fd40d1ee6a317523172ab95b6f7ea41ba8fcee3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 9 Aug 2012 22:42:57 -0400 Subject: tracing: Use helper functions in event assignment to shrink macro size The functions that assign the contents for the ftrace events are defined by the TRACE_EVENT() macros. Each event has its own unique way to assign data to its buffer. When you have over 500 events, that means there's 500 functions assigning data uniquely for each event (not really that many, as DECLARE_EVENT_CLASS() and multiple DEFINE_EVENT()s will only need a single function). By making helper functions in the core kernel to do some of the work instead, we can shrink the size of the kernel down a bit. With a kernel configured with 502 events, the change in size was: text data bss dec hex filename 12987390 1913504 9785344 24686238 178ae9e /tmp/vmlinux 12959102 1913504 9785344 24657950 178401e /tmp/vmlinux.patched That's a total of 28288 bytes, which comes down to 56 bytes per event. 
Link: http://lkml.kernel.org/r/20120810034708.370808175@goodmis.org Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 22826c73a9da..b8f73b333a3c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -188,6 +188,36 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); +void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, + struct ftrace_event_file *ftrace_file, + unsigned long len) +{ + struct ftrace_event_call *event_call = ftrace_file->event_call; + + local_save_flags(fbuffer->flags); + fbuffer->pc = preempt_count(); + fbuffer->ftrace_file = ftrace_file; + + fbuffer->event = + trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file, + event_call->event.type, len, + fbuffer->flags, fbuffer->pc); + if (!fbuffer->event) + return NULL; + + fbuffer->entry = ring_buffer_event_data(fbuffer->event); + return fbuffer->entry; +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); + +void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) +{ + event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, + fbuffer->event, fbuffer->entry, + fbuffer->flags, fbuffer->pc); +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit); + int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type, void *data) { -- cgit v1.2.3 From b196e2b9e262be01737dc2bbf9e3c7c87340fa4d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Feb 2014 15:45:07 -0500 Subject: tracing: Warn if a tracepoint is not set via debugfs Tracepoints were made to allow enabling a tracepoint in a module before that module was loaded. When a tracepoint is enabled and it does not exist, the name is stored and will be enabled when the tracepoint is created. 
The problem with this approach is that when a tracepoint is enabled when it expects to be there, it gives no warning that it does not exist. To add salt to the wound, if a module is added and sets the FORCED flag, which can happen if it isn't signed properly, the tracepoint code will not enable the tracepoints, but they will be created in the debugfs system! When a user goes to enable the tracepoint, the tracepoint code will not see it existing and will think it is to be enabled later AND WILL NOT GIVE A WARNING. The tracing will look like it succeeded but will actually be doing nothing. This will cause lots of confusion and headaches for developers trying to figure out why they are not seeing their tracepoints. Link: http://lkml.kernel.org/r/20140213154507.4040fb06@gandalf.local.home Reported-by: Mathieu Desnoyers Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 0d4ef26574ff..0058f33d05c1 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -62,6 +62,7 @@ struct tracepoint_entry { struct hlist_node hlist; struct tracepoint_func *funcs; int refcount; /* Number of times armed. 0 if disarmed. 
*/ + int enabled; /* Tracepoint enabled */ char name[0]; }; @@ -237,6 +238,7 @@ static struct tracepoint_entry *add_tracepoint(const char *name) memcpy(&e->name[0], name, name_len); e->funcs = NULL; e->refcount = 0; + e->enabled = 0; hlist_add_head(&e->hlist, head); return e; } @@ -316,6 +318,7 @@ static void tracepoint_update_probe_range(struct tracepoint * const *begin, if (mark_entry) { set_tracepoint(&mark_entry, *iter, !!mark_entry->refcount); + mark_entry->enabled = !!mark_entry->refcount; } else { disable_tracepoint(*iter); } @@ -380,6 +383,8 @@ tracepoint_add_probe(const char *name, void *probe, void *data) int tracepoint_probe_register(const char *name, void *probe, void *data) { struct tracepoint_func *old; + struct tracepoint_entry *entry; + int ret = 0; mutex_lock(&tracepoints_mutex); old = tracepoint_add_probe(name, probe, data); @@ -388,9 +393,13 @@ int tracepoint_probe_register(const char *name, void *probe, void *data) return PTR_ERR(old); } tracepoint_update_probes(); /* may update entry */ + entry = get_tracepoint(name); + /* Make sure the entry was enabled */ + if (!entry || !entry->enabled) + ret = -ENODEV; mutex_unlock(&tracepoints_mutex); release_probes(old); - return 0; + return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register); -- cgit v1.2.3 From 1dc43cf0be9a94a6a7273db284152db15c526106 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 24 Feb 2014 19:59:56 +0100 Subject: ftrace: Cleanup of global variables ftrace_new_pgs and ftrace_update_cnt Some of them can be local to functions, so make them local and pass them as parameters where needed: * __start_mcount_loc+__stop_mcount_loc are local to ftrace_init * ftrace_new_pgs -> new_pgs/start_pg * ftrace_update_cnt -> local update_cnt in ftrace_update_code Link: http://lkml.kernel.org/r/1393268401-24379-1-git-send-email-jslaby@suse.cz Signed-off-by: Jiri Slaby Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 25 ++++++++----------------- 1 
file changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5313c1100d30..3f95bbeb8e8d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1174,8 +1174,6 @@ struct ftrace_page { int size; }; -static struct ftrace_page *ftrace_new_pgs; - #define ENTRY_SIZE sizeof(struct dyn_ftrace) #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) @@ -2246,7 +2244,6 @@ static void ftrace_shutdown_sysctl(void) } static cycle_t ftrace_update_time; -static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; static inline int ops_traces_mod(struct ftrace_ops *ops) @@ -2302,11 +2299,12 @@ static int referenced_filters(struct dyn_ftrace *rec) return cnt; } -static int ftrace_update_code(struct module *mod) +static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; + unsigned long update_cnt = 0; unsigned long ref = 0; bool test = false; int i; @@ -2332,9 +2330,8 @@ static int ftrace_update_code(struct module *mod) } start = ftrace_now(raw_smp_processor_id()); - ftrace_update_cnt = 0; - for (pg = ftrace_new_pgs; pg; pg = pg->next) { + for (pg = new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { int cnt = ref; @@ -2355,7 +2352,7 @@ static int ftrace_update_code(struct module *mod) if (!ftrace_code_disable(mod, p)) break; - ftrace_update_cnt++; + update_cnt++; /* * If the tracing is enabled, go ahead and enable the record. 
@@ -2374,11 +2371,9 @@ static int ftrace_update_code(struct module *mod) } } - ftrace_new_pgs = NULL; - stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; - ftrace_update_tot_cnt += ftrace_update_cnt; + ftrace_update_tot_cnt += update_cnt; return 0; } @@ -4270,9 +4265,6 @@ static int ftrace_process_locs(struct module *mod, /* Assign the last page to ftrace_pages */ ftrace_pages = pg; - /* These new locations need to be initialized */ - ftrace_new_pgs = start_pg; - /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt @@ -4283,7 +4275,7 @@ static int ftrace_process_locs(struct module *mod, */ if (!mod) local_irq_save(flags); - ftrace_update_code(mod); + ftrace_update_code(mod, start_pg); if (!mod) local_irq_restore(flags); ret = 0; @@ -4392,11 +4384,10 @@ struct notifier_block ftrace_module_exit_nb = { .priority = INT_MIN, /* Run after anything that can remove kprobes */ }; -extern unsigned long __start_mcount_loc[]; -extern unsigned long __stop_mcount_loc[]; - void __init ftrace_init(void) { + extern unsigned long __start_mcount_loc[]; + extern unsigned long __stop_mcount_loc[]; unsigned long count, addr, flags; int ret; -- cgit v1.2.3 From c867ccd8388d1c1a31bef9c54544b2ef32f0ebca Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 24 Feb 2014 19:59:57 +0100 Subject: ftrace: Inline the code from ftrace_dyn_table_alloc() The function used to do allocations some time ago. This no longer happens and it only checks the count and prints some info. This patch inlines the body to the only caller. 
There are two reasons: * the name of the function was misleading * it's clear what is going on in ftrace_init now Link: http://lkml.kernel.org/r/1393268401-24379-2-git-send-email-jslaby@suse.cz Signed-off-by: Jiri Slaby Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f95bbeb8e8d..76b6ed29d856 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2465,22 +2465,6 @@ ftrace_allocate_pages(unsigned long num_to_init) return NULL; } -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) -{ - int cnt; - - if (!num_to_init) { - pr_info("ftrace: No functions to be traced?\n"); - return -1; - } - - cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt + 1); - - return 0; -} - #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { @@ -4403,10 +4387,13 @@ void __init ftrace_init(void) goto failed; count = __stop_mcount_loc - __start_mcount_loc; - - ret = ftrace_dyn_table_alloc(count); - if (ret) + if (!count) { + pr_info("ftrace: No functions to be traced?\n"); goto failed; + } + + pr_info("ftrace: allocating %ld entries in %ld pages\n", + count, count / ENTRIES_PER_PAGE + 1); last_ftrace_enabled = ftrace_enabled = 1; -- cgit v1.2.3 From af64a7cb09db77344c596a0bf3d57d77257e8bf5 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 24 Feb 2014 19:59:58 +0100 Subject: ftrace: Pass retval through return in ftrace_dyn_arch_init() No architecture uses the "data" parameter in ftrace_dyn_arch_init() in any way, it just sets the value to 0. And this is used as a return value in the caller -- ftrace_init, which just checks the retval against zero. Note there is also "return 0" in every ftrace_dyn_arch_init. 
So it is enough to check the retval and remove all the indirect sets of data on all archs. Link: http://lkml.kernel.org/r/1393268401-24379-3-git-send-email-jslaby@suse.cz Cc: linux-arch@vger.kernel.org Signed-off-by: Jiri Slaby Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 76b6ed29d856..083c6d5fce25 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4379,11 +4379,9 @@ void __init ftrace_init(void) addr = (unsigned long)ftrace_stub; local_irq_save(flags); - ftrace_dyn_arch_init(&addr); + ret = ftrace_dyn_arch_init(&addr); local_irq_restore(flags); - - /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) + if (ret) goto failed; count = __stop_mcount_loc - __start_mcount_loc; -- cgit v1.2.3 From 3a36cb11ca65cd6804972eaf1000378ba4384ea7 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 24 Feb 2014 19:59:59 +0100 Subject: ftrace: Do not pass data to ftrace_dyn_arch_init As the data parameter is not really used by any ftrace_dyn_arch_init, remove that from ftrace_dyn_arch_init. This also removes the addr local variable from ftrace_init which is now unused. Note the documentation was imprecise as it did not suggest to set (*data) to 0. 
Link: http://lkml.kernel.org/r/1393268401-24379-4-git-send-email-jslaby@suse.cz Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: linux-arch@vger.kernel.org Signed-off-by: Jiri Slaby Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 083c6d5fce25..5bd70e8b09b0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4372,14 +4372,11 @@ void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; - unsigned long count, addr, flags; + unsigned long count, flags; int ret; - /* Keep the ftrace pointer to the stub */ - addr = (unsigned long)ftrace_stub; - local_irq_save(flags); - ret = ftrace_dyn_arch_init(&addr); + ret = ftrace_dyn_arch_init(); local_irq_restore(flags); if (ret) goto failed; -- cgit v1.2.3 From cd21067f69240041d36e491ff5597e0217615465 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 24 Feb 2014 17:12:21 +0100 Subject: ftrace: Warn on error when modifying ftrace function We should print some warning and kill ftrace functionality when the ftrace function is not set correctly. Otherwise, ftrace might do crazy things without an explanation. The error value has been ignored so far. Note that an error that happens during updating all the traced calls is handled in ftrace_replace_code(). We print more details about the particular failing address via ftrace_bug() there. 
Link: http://lkml.kernel.org/r/1393258342-29978-3-git-send-email-pmladek@suse.cz Signed-off-by: Petr Mladek Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5bd70e8b09b0..0e48ff4cefa5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1994,6 +1994,7 @@ int __weak ftrace_arch_code_modify_post_process(void) void ftrace_modify_all_code(int command) { int update = command & FTRACE_UPDATE_TRACE_FUNC; + int err = 0; /* * If the ftrace_caller calls a ftrace_ops func directly, @@ -2005,8 +2006,11 @@ void ftrace_modify_all_code(int command) * to make sure the ops are having the right functions * traced. */ - if (update) - ftrace_update_ftrace_func(ftrace_ops_list_func); + if (update) { + err = ftrace_update_ftrace_func(ftrace_ops_list_func); + if (FTRACE_WARN_ON(err)) + return; + } if (command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); @@ -2019,13 +2023,16 @@ void ftrace_modify_all_code(int command) /* If irqs are disabled, we are in stop machine */ if (!irqs_disabled()) smp_call_function(ftrace_sync_ipi, NULL, 1); - ftrace_update_ftrace_func(ftrace_trace_function); + err = ftrace_update_ftrace_func(ftrace_trace_function); + if (FTRACE_WARN_ON(err)) + return; } if (command & FTRACE_START_FUNC_RET) - ftrace_enable_ftrace_graph_caller(); + err = ftrace_enable_ftrace_graph_caller(); else if (command & FTRACE_STOP_FUNC_RET) - ftrace_disable_ftrace_graph_caller(); + err = ftrace_disable_ftrace_graph_caller(); + FTRACE_WARN_ON(err); } static int __ftrace_modify_code(void *data) -- cgit v1.2.3 From f952d10ff40b436a8ef156a74ec327abe303823d Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 27 Jan 2014 17:38:42 -0500 Subject: audit: Use more current logging style again Add pr_fmt to prefix "audit: " to output Convert printk(KERN_ to pr_ Coalesce formats Signed-off-by: Richard Guy Briggs --- 
kernel/auditfilter.c | 12 +++++++----- kernel/auditsc.c | 31 +++++++++++++++---------------- 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 14a78cca384e..3152d1aea164 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -19,6 +19,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -247,7 +249,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { - printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); + pr_err("AUDIT_POSSIBLE is deprecated\n"); goto exit_err; } if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) @@ -477,8 +479,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (err == -EINVAL) { - printk(KERN_WARNING "audit rule for LSM " - "\'%s\' is invalid\n", str); + pr_warn("audit rule for LSM \'%s\' is invalid\n", + str); err = 0; } if (err) { @@ -707,8 +709,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { - printk(KERN_WARNING "audit rule for LSM \'%s\' is " - "invalid\n", df->lsm_str); + pr_warn("audit rule for LSM \'%s\' is invalid\n", + df->lsm_str); ret = 0; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 10176cd5956a..6874c1fd453d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -42,6 +42,8 @@ * and for LSPP certification compliance. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -850,16 +852,15 @@ static inline void audit_free_names(struct audit_context *context) if (context->put_count + context->ino_count != context->name_count) { int i = 0; - printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" - " name_count=%d put_count=%d" - " ino_count=%d [NOT freeing]\n", - __FILE__, __LINE__, + pr_err("%s:%d(:%d): major=%d in_syscall=%d" + " name_count=%d put_count=%d ino_count=%d" + " [NOT freeing]\n", __FILE__, __LINE__, context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); list_for_each_entry(n, &context->names_list, list) { - printk(KERN_ERR "names[%d] = %p = %s\n", i++, - n->name, n->name->name ?: "(null)"); + pr_err("names[%d] = %p = %s\n", i++, n->name, + n->name->name ?: "(null)"); } dump_stack(); return; @@ -1550,7 +1551,7 @@ static inline void handle_one(const struct inode *inode) if (likely(put_tree_ref(context, chunk))) return; if (unlikely(!grow_tree_refs(context))) { - printk(KERN_WARNING "out of memory, audit has lost a tree reference\n"); + pr_warn("out of memory, audit has lost a tree reference\n"); audit_set_auditable(context); audit_put_chunk(chunk); unroll_tree_refs(context, p, count); @@ -1609,8 +1610,7 @@ retry: goto retry; } /* too bad */ - printk(KERN_WARNING - "out of memory, audit has lost a tree reference\n"); + pr_warn("out of memory, audit has lost a tree reference\n"); unroll_tree_refs(context, p, count); audit_set_auditable(context); return; @@ -1682,7 +1682,7 @@ void __audit_getname(struct filename *name) if (!context->in_syscall) { #if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", + pr_err("%s:%d(:%d): ignoring getname(%p)\n", __FILE__, __LINE__, context->serial, name); dump_stack(); #endif @@ -1721,15 +1721,15 @@ void audit_putname(struct filename *name) BUG_ON(!context); if (!context->in_syscall) { #if AUDIT_DEBUG == 2 - printk(KERN_ERR 
"%s:%d(:%d): final_putname(%p)\n", + pr_err("%s:%d(:%d): final_putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { struct audit_names *n; int i = 0; list_for_each_entry(n, &context->names_list, list) - printk(KERN_ERR "name[%d] = %p = %s\n", i++, - n->name, n->name->name ?: "(null)"); + pr_err("name[%d] = %p = %s\n", i++, n->name, + n->name->name ?: "(null)"); } #endif final_putname(name); @@ -1738,9 +1738,8 @@ void audit_putname(struct filename *name) else { ++context->put_count; if (context->put_count > context->name_count) { - printk(KERN_ERR "%s:%d(:%d): major=%d" - " in_syscall=%d putname(%p) name_count=%d" - " put_count=%d\n", + pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)" + " name_count=%d put_count=%d\n", __FILE__, __LINE__, context->serial, context->major, context->in_syscall, name->name, -- cgit v1.2.3 From d211f177b28ec070c25b3d0b960aa55f352f731f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 8 Mar 2014 15:31:54 -0800 Subject: audit: Update kdoc for audit_send_reply and audit_list_rules_send The kbuild test robot reported: > tree: git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-next > head: 6f285b19d09f72e801525f5eea1bdad22e559bf0 > commit: 6f285b19d09f72e801525f5eea1bdad22e559bf0 [2/2] audit: Send replies in the proper network namespace. > reproduce: make htmldocs > > >> Warning(kernel/audit.c:575): No description found for parameter 'request_skb' > >> Warning(kernel/audit.c:575): Excess function parameter 'portid' description in 'audit_send_reply' > >> Warning(kernel/auditfilter.c:1074): No description found for parameter 'request_skb' > >> Warning(kernel/auditfilter.c:1074): Excess function parameter 'portid' description in 'audit_list_rules_s Which was caused by my failure to update the kdoc annotations when I updated the functions. Fix that small oversight now. Signed-off-by: "Eric W. 
Biederman" --- kernel/audit.c | 2 +- kernel/auditfilter.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 32086bff5564..3392d3e0254a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -559,7 +559,7 @@ static int audit_send_reply_thread(void *arg) } /** * audit_send_reply - send an audit reply message via netlink - * @portid: netlink port to which to send reply + * @request_skb: skb of request we are replying to (used to target the reply) * @seq: sequence number * @type: audit message type * @done: done (last) flag diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e8d1c7c515d7..92062fd6cc8c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1067,7 +1067,7 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, /** * audit_list_rules_send - list the audit rules - * @portid: target portid for netlink audit messages + * @request_skb: skb of request we are replying to (used to target the reply) * @seq: netlink audit message sequence (serial) number */ int audit_list_rules_send(struct sk_buff *request_skb, int seq) -- cgit v1.2.3 From e97ca8e5b864f88b028c1759ba8536fa827d6d96 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 10 Mar 2014 15:49:43 -0700 Subject: mm: fix GFP_THISNODE callers and clarify GFP_THISNODE is for callers that implement their own clever fallback to remote nodes. It restricts the allocation to the specified node and does not invoke reclaim, assuming that the caller will take care of it when the fallback fails, e.g. through a subsequent allocation request without GFP_THISNODE set. However, many current GFP_THISNODE users only want the node exclusive aspect of the flag, without actually implementing their own fallback or triggering reclaim if necessary. 
This results in things like page migration failing prematurely even when there is easily reclaimable memory available, unless kswapd happens to be running already or a concurrent allocation attempt triggers the necessary reclaim. Convert all callsites that don't implement their own fallback strategy to __GFP_THISNODE. This restricts the allocation to a single node too, but at the same time allows the allocator to enter the slowpath, wake kswapd, and invoke direct reclaim if necessary, to make the allocation happen when memory is full. Signed-off-by: Johannes Weiner Acked-by: Rik van Riel Cc: Jan Stancek Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 6631e1ef55ab..ebdd9c1a86b4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -549,14 +549,14 @@ static int create_hash_tables(void) struct page *page; page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; -- cgit v1.2.3 From 7500d9363f7e356a5a3f10f1778f2e4f8a7eba94 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Mar 2014 19:31:51 +0530 Subject: PM / suspend: Remove unnecessary !! Double ! or !! are normally required to get 0 or 1 out of an expression. A comparison always returns 0 or 1 and hence there is no need to apply double ! over it again. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- kernel/power/suspend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 62ee437b5c7e..90b3d9366d1a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -39,7 +39,7 @@ static const struct platform_suspend_ops *suspend_ops; static bool need_suspend_ops(suspend_state_t state) { - return !!(state > PM_SUSPEND_FREEZE); + return state > PM_SUSPEND_FREEZE; } static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); -- cgit v1.2.3 From d44753b843e093f9e1f2f14806fbe106fff74898 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 3 Mar 2014 12:09:21 +0100 Subject: sched/deadline: Deny unprivileged users to set/change SCHED_DEADLINE policy Deny the use of SCHED_DEADLINE policy to unprivileged users. Even if root users can set the policy for normal users, we don't want the latter to be able to change their parameters (safest behavior). Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1393844961-18097-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6edbef296ece..f5c6635b806c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3338,6 +3338,15 @@ recheck: return -EPERM; } + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + return -EPERM; + /* * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 
-- cgit v1.2.3 From 177c53d943368fc97644ebc0a250dc8e2d124250 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Feb 2014 13:39:05 +0100 Subject: stop_machine: Fix^2 race between stop_two_cpus() and stop_cpus() We must use smp_call_function_single(.wait=1) for the irq_cpu_stop_queue_work() to ensure the queueing is actually done under stop_cpus_lock. Without this we could have dropped the lock by the time we do the queueing and get the race we tried to fix. Fixes: 7053ea1a34fa ("stop_machine: Fix race between stop_two_cpus() and stop_cpus()") Signed-off-by: Peter Zijlstra Cc: Prarit Bhargava Cc: Rik van Riel Cc: Mel Gorman Cc: Christoph Hellwig Cc: Andrew Morton Link: http://lkml.kernel.org/r/20140228123905.GK3104@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 84571e09c907..01fbae5b97b7 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -293,7 +293,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * */ smp_call_function_single(min(cpu1, cpu2), &irq_cpu_stop_queue_work, - &call_args, 0); + &call_args, 1); lg_local_unlock(&stop_cpus_lock); preempt_enable(); -- cgit v1.2.3 From 96b3d28bf4b00f62fc8386ff5d487d1830793a3d Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Thu, 6 Mar 2014 14:25:28 +0900 Subject: sched/clock: Prevent tracing recursion in sched_clock_cpu() Prevent tracing of preempt_disable/enable() in sched_clock_cpu(). When CONFIG_DEBUG_PREEMPT is enabled, preempt_disable/enable() are traced and this causes trace_clock() users (and probably others) to go into an infinite recursion. Systems with a stable sched_clock() are not affected. This problem is similar to that fixed by upstream commit 95ef1e52922 ("KVM guest: prevent tracing recursion with kvmclock"). 
Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1394083528.4524.3.camel@nexus Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 43c2bcc35761..b30a2924ef14 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -301,14 +301,14 @@ u64 sched_clock_cpu(int cpu) if (unlikely(!sched_clock_running)) return 0ull; - preempt_disable(); + preempt_disable_notrace(); scd = cpu_sdc(cpu); if (cpu != smp_processor_id()) clock = sched_clock_remote(scd); else clock = sched_clock_local(scd); - preempt_enable(); + preempt_enable_notrace(); return clock; } -- cgit v1.2.3 From 30cdd69e2a266505ca8229c944d361ff350a6959 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 3 Mar 2014 08:48:51 +0100 Subject: cpuidle/idle: Move the cpuidle_idle_call function to idle.c The cpuidle_idle_call does nothing more than calling the three individual functions and is no longer used by any arch specific code but only in the cpuidle framework code. We can move this function into the idle task code to ensure better proximity to the scheduler code.
Signed-off-by: Daniel Lezcano Acked-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: rjw@rjwysocki.net Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1393832934-11625-2-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index b7976a127178..d5aaf5eb4531 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -63,6 +63,62 @@ void __weak arch_cpu_idle(void) local_irq_enable(); } +#ifdef CONFIG_CPU_IDLE +/** + * cpuidle_idle_call - the main idle function + * + * NOTE: no locks or semaphores should be used here + * return non-zero on failure + */ +static int cpuidle_idle_call(void) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int next_state, entered_state, ret; + bool broadcast; + + ret = cpuidle_enabled(drv, dev); + if (ret < 0) + return ret; + + /* ask the governor for the next state */ + next_state = cpuidle_select(drv, dev); + + if (need_resched()) { + dev->last_residency = 0; + /* give the governor an opportunity to reflect on the outcome */ + cpuidle_reflect(dev, next_state); + local_irq_enable(); + return 0; + } + + broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + + if (broadcast && + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) + return -EBUSY; + + trace_cpu_idle_rcuidle(next_state, dev->cpu); + + entered_state = cpuidle_enter(drv, dev, next_state); + + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + + if (broadcast) + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + + /* give the governor an opportunity to reflect on the outcome */ + cpuidle_reflect(dev, entered_state); + + return 0; +} +#else +static inline int cpuidle_idle_call(void) +{ + return -ENODEV; +} +#endif + /* * Generic 
idle loop implementation */ -- cgit v1.2.3 From c8cc7d4de7a4f2fb1f8774ec2de5b49c46c42e64 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 3 Mar 2014 08:48:52 +0100 Subject: sched/idle: Reorganize the idle loop Now that we have the main cpuidle function in idle.c, move some code from the idle mainloop to this function for the sake of clarity. That removes if then else indentation difficult to follow when looking at the code. This patch does not change the current behavior. Signed-off-by: Daniel Lezcano Acked-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: tglx@linutronix.de Cc: rjw@rjwysocki.net Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1393832934-11625-3-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index d5aaf5eb4531..dc8a2466418f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -63,7 +63,6 @@ void __weak arch_cpu_idle(void) local_irq_enable(); } -#ifdef CONFIG_CPU_IDLE /** * cpuidle_idle_call - the main idle function * @@ -77,9 +76,14 @@ static int cpuidle_idle_call(void) int next_state, entered_state, ret; bool broadcast; + stop_critical_timings(); + rcu_idle_enter(); + ret = cpuidle_enabled(drv, dev); - if (ret < 0) - return ret; + if (ret < 0) { + arch_cpu_idle(); + goto out; + } /* ask the governor for the next state */ next_state = cpuidle_select(drv, dev); @@ -89,7 +93,7 @@ static int cpuidle_idle_call(void) /* give the governor an opportunity to reflect on the outcome */ cpuidle_reflect(dev, next_state); local_irq_enable(); - return 0; + goto out; } broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); @@ -109,15 +113,15 @@ static int cpuidle_idle_call(void) /* give the governor an opportunity to reflect on the outcome */ cpuidle_reflect(dev, entered_state); +out: + if 
(WARN_ON_ONCE(irqs_disabled())) + local_irq_enable(); + + rcu_idle_exit(); + start_critical_timings(); return 0; } -#else -static inline int cpuidle_idle_call(void) -{ - return -ENODEV; -} -#endif /* * Generic idle loop implementation @@ -150,14 +154,7 @@ static void cpu_idle_loop(void) cpu_idle_poll(); } else { if (!current_clr_polling_and_test()) { - stop_critical_timings(); - rcu_idle_enter(); - if (cpuidle_idle_call()) - arch_cpu_idle(); - if (WARN_ON_ONCE(irqs_disabled())) - local_irq_enable(); - rcu_idle_exit(); - start_critical_timings(); + cpuidle_idle_call(); } else { local_irq_enable(); } -- cgit v1.2.3 From 8ca3c6424f4988fc19ed1067b121fbaf2e884d77 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 3 Mar 2014 08:48:53 +0100 Subject: sched/idle: Move idle conditions in cpuidle_idle main function This patch moves the condition before entering idle into the cpuidle main function located in idle.c. That simplifies the idle mainloop functions and increases the readability of the conditions to enter truly idle. This patch is code reorganization and does not change the behavior of the function.
Signed-off-by: Daniel Lezcano Signed-off-by: Peter Zijlstra Cc: tglx@linutronix.de Cc: rjw@rjwysocki.net Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1393832934-11625-4-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 78 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index dc8a2466418f..cc7a6f3801ff 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -76,44 +76,59 @@ static int cpuidle_idle_call(void) int next_state, entered_state, ret; bool broadcast; + if (current_clr_polling_and_test()) { + local_irq_enable(); + __current_set_polling(); + return 0; + } + stop_critical_timings(); rcu_idle_enter(); ret = cpuidle_enabled(drv, dev); - if (ret < 0) { - arch_cpu_idle(); - goto out; - } - /* ask the governor for the next state */ - next_state = cpuidle_select(drv, dev); + if (!ret) { + /* ask the governor for the next state */ + next_state = cpuidle_select(drv, dev); - if (need_resched()) { - dev->last_residency = 0; - /* give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, next_state); - local_irq_enable(); - goto out; - } + if (current_clr_polling_and_test()) { + dev->last_residency = 0; + entered_state = next_state; + local_irq_enable(); + } else { + broadcast = !!(drv->states[next_state].flags & + CPUIDLE_FLAG_TIMER_STOP); + + if (broadcast) + ret = clockevents_notify( + CLOCK_EVT_NOTIFY_BROADCAST_ENTER, + &dev->cpu); - broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + if (!ret) { + trace_cpu_idle_rcuidle(next_state, dev->cpu); - if (broadcast && - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) - return -EBUSY; + entered_state = cpuidle_enter(drv, dev, + next_state); - trace_cpu_idle_rcuidle(next_state, dev->cpu); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, + 
dev->cpu); - entered_state = cpuidle_enter(drv, dev, next_state); + if (broadcast) + clockevents_notify( + CLOCK_EVT_NOTIFY_BROADCAST_EXIT, + &dev->cpu); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + /* give the governor an opportunity to reflect on the outcome */ + cpuidle_reflect(dev, entered_state); + } + } + } + + if (ret) + arch_cpu_idle(); - if (broadcast) - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + __current_set_polling(); - /* give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, entered_state); -out: if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); @@ -150,16 +165,11 @@ static void cpu_idle_loop(void) * know that the IPI is going to arrive right * away */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + if (cpu_idle_force_poll || tick_check_broadcast_expired()) cpu_idle_poll(); - } else { - if (!current_clr_polling_and_test()) { - cpuidle_idle_call(); - } else { - local_irq_enable(); - } - __current_set_polling(); - } + else + cpuidle_idle_call(); + arch_cpu_idle_exit(); } -- cgit v1.2.3 From a1d028bd6d2b7789d15eddfd07c5bea2aaf36040 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 3 Mar 2014 08:48:54 +0100 Subject: sched/idle: Add more comments to the code The idle main function is a complex and a critical function. Added more comments to the code. 
Signed-off-by: Daniel Lezcano Acked-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: tglx@linutronix.de Cc: rjw@rjwysocki.net Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1393832934-11625-5-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cc7a6f3801ff..8f4390a079c7 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -76,21 +76,49 @@ static int cpuidle_idle_call(void) int next_state, entered_state, ret; bool broadcast; + /* + * Check if the idle task must be rescheduled. If it is the + * case, exit the function after re-enabling the local irq and + * set again the polling flag + */ if (current_clr_polling_and_test()) { local_irq_enable(); __current_set_polling(); return 0; } + /* + * During the idle period, stop measuring the disabled irqs + * critical sections latencies + */ stop_critical_timings(); + + /* + * Tell the RCU framework we are entering an idle section, + * so no more rcu read side critical sections and one more + * step to the grace period + */ rcu_idle_enter(); + /* + * Check if the cpuidle framework is ready, otherwise fallback + * to the default arch specific idle method + */ ret = cpuidle_enabled(drv, dev); if (!ret) { - /* ask the governor for the next state */ + /* + * Ask the governor to choose an idle state it thinks + * it is convenient to go to. 
There is *always* a + * convenient idle state + */ next_state = cpuidle_select(drv, dev); + /* + * The idle task must be scheduled, it is pointless to + * go to idle, just update no idle residency and get + * out of this function + */ if (current_clr_polling_and_test()) { dev->last_residency = 0; entered_state = next_state; @@ -100,6 +128,14 @@ static int cpuidle_idle_call(void) CPUIDLE_FLAG_TIMER_STOP); if (broadcast) + /* + * Tell the time framework to switch + * to a broadcast timer because our + * local timer will be shutdown. If a + * local timer is used from another + * cpu as a broadcast timer, this call + * may fail if it is not available + */ ret = clockevents_notify( CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu); @@ -107,6 +143,14 @@ static int cpuidle_idle_call(void) if (!ret) { trace_cpu_idle_rcuidle(next_state, dev->cpu); + /* + * Enter the idle state previously + * returned by the governor + * decision. This function will block + * until an interrupt occurs and will + * take care of re-enabling the local + * interrupts + */ entered_state = cpuidle_enter(drv, dev, next_state); @@ -118,17 +162,28 @@ static int cpuidle_idle_call(void) CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); - /* give the governor an opportunity to reflect on the outcome */ + /* + * Give the governor an opportunity to reflect on the + * outcome + */ cpuidle_reflect(dev, entered_state); } } } + /* + * We can't use the cpuidle framework, let's use the default + * idle routine + */ if (ret) arch_cpu_idle(); __current_set_polling(); + /* + * It is up to the idle functions to enable back the local + * interrupt + */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); -- cgit v1.2.3 From cfa77bc4af2c75c0781ee76cde2dd104c6c8e2b7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 2 Mar 2014 16:56:38 +0100 Subject: perf: Disallow user-space callchains for function trace events Recent issues with user space callchains processing within page fault handler tracing showed as Peter said 
'there's just too much fail surface'. Related list discussions: http://marc.info/?t=139302086500001&r=1&w=2 http://marc.info/?t=139301437300003&r=1&w=2 Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "H. Peter Anvin" Cc: Vince Weaver Cc: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1393775800-13524-2-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index e854f420e033..d5e01c3f4e69 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -31,9 +31,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, } /* The ftrace function trace is allowed only for root. */ - if (ftrace_event_is_function(tp_event) && - perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return -EPERM; + if (ftrace_event_is_function(tp_event)) { + if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * We don't allow user space callchains for function trace + * event, due to issues with page faults while tracing page + * fault handler and its overall trickiness nature. + */ + if (!p_event->attr.exclude_callchain_user) + return -EINVAL; + } /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) -- cgit v1.2.3 From 63c45f4ba533e9749da16298db53e491c25d805b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 2 Mar 2014 16:56:39 +0100 Subject: perf: Disallow user-space stack dumps for function trace events Recent issues with user space callchains processing within page fault handler tracing showed as Peter said 'there's just too much fail surface'. The user space stack dump is just another source of this issue.
Related list discussions: http://marc.info/?t=139302086500001&r=1&w=2 http://marc.info/?t=139301437300003&r=1&w=2 Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Vince Weaver Cc: Steven Rostedt Cc: Paul Mackerras Cc: H. Peter Anvin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1393775800-13524-3-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index d5e01c3f4e69..c894614de14d 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -42,6 +42,13 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, */ if (!p_event->attr.exclude_callchain_user) return -EINVAL; + + /* + * Same reason to disable user stack dump as for user space + * callchains above. + */ + if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER) + return -EINVAL; } /* No tracing, just counting, so no obvious leak */ -- cgit v1.2.3 From 734ff2a71f9e6aa6fedfa5a9a34818b8586516d5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 4 Mar 2014 19:25:46 +0400 Subject: sched/rt: Fix picking RT and DL tasks from empty queue The problems: 1) We check for rt_nr_running before call of put_prev_task(). If previous task is RT, its rt_rq may become throttled and dequeued after this call. In case of p is from rt->rq this just causes picking a task from throttled queue, but in case of its rt_rq is child we are guaranteed catch BUG_ON. 2) The same with deadline class. The only difference we operate on only dl_rq. This patch fixes all the above problems and it adds a small skip in the DL update like we've already done for RT class: if (unlikely((s64)delta_exec <= 0)) return; This will optimize sequential update_curr_dl() calls a little. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Juri Lelli Link: http://lkml.kernel.org/r/1393946746.3643.3.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 10 ++++++++-- kernel/sched/rt.c | 7 +++++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e4f3ac3b8514..27ef40925525 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -609,8 +609,8 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; + if (unlikely((s64)delta_exec <= 0)) + return; schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1023,6 +1023,12 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) if (need_pull_dl_task(rq, prev)) pull_dl_task(rq); + /* + * When prev is DL, we may throttle it in put_prev_task(). + * So, we update time before we check for dl_nr_running. + */ + if (prev->sched_class == &dl_sched_class) + update_curr_dl(rq); if (unlikely(!dl_rq->dl_nr_running)) return NULL; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index facc824334fb..f3cee0a63b76 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1379,6 +1379,13 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) return RETRY_TASK; } + /* + * We may dequeue prev's rt_rq in put_prev_task(). + * So, we update time before rt_nr_running check. 
+ */ + if (prev->sched_class == &rt_sched_class) + update_curr_rt(rq); + if (!rt_rq->rt_nr_running) return NULL; -- cgit v1.2.3 From e4aa358b6c23f98b2715594f6b1e9a4996a55f04 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 6 Mar 2014 13:31:55 +0400 Subject: sched/fair: Push down check for high priority class task into idle_balance() We close idle_exit_fair() bracket in case of we've pulled something or we've received task of high priority class. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Link: http://lkml.kernel.org/r/1394098315.19290.10.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 ++++++++++----- kernel/sched/idle_task.c | 1 - 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d8482e1c575e..b956e70fc503 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4787,17 +4787,16 @@ simple: return p; idle: + new_tasks = idle_balance(rq); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ - new_tasks = idle_balance(rq); - - if (rq->nr_running != rq->cfs.h_nr_running) + if (new_tasks < 0) return RETRY_TASK; - if (new_tasks) + if (new_tasks > 0) goto again; return NULL; @@ -6728,8 +6727,14 @@ static int idle_balance(struct rq *this_rq) this_rq->max_idle_balance_cost = curr_cost; out: - if (pulled_task) + /* Is there a task of a high priority class? 
*/ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) { + idle_exit_fair(this_rq); this_rq->idle_stamp = 0; + } return pulled_task; } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 1f3725882838..879f2b75266a 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -29,7 +29,6 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev) put_prev_task(rq, prev); schedstat_inc(rq, sched_goidle); - idle_enter_fair(rq); return rq->idle; } -- cgit v1.2.3 From 4c6c4e38c4e9a454889298dcc498174968d14a09 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 6 Mar 2014 13:32:01 +0400 Subject: sched/core: Fix endless loop in pick_next_task() 1) Single cpu machine case. When rq has only RT tasks, but no one of them can be picked because of throttling, we enter in endless loop. pick_next_task_{dl,rt} return NULL. In pick_next_task_fair() we permanently go to retry if (rq->nr_running != rq->cfs.h_nr_running) return RETRY_TASK; (rq->nr_running is not being decremented when rt_rq becomes throttled). No chances to unthrottle any rt_rq or to wake fair here, because of rq is locked permanently and interrupts are disabled. 2) In case of SMP this can cause a hang too. Although we unlock rq in idle_balance(), interrupts are still disabled. The solution is to check for available tasks in DL and RT classes instead of checking for sum. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394098321.19290.11.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +++- kernel/sched/rt.c | 10 ---------- kernel/sched/sched.h | 12 ++++++++++++ 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b956e70fc503..10db4a87ad72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6728,7 +6728,9 @@ static int idle_balance(struct rq *this_rq) out: /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) + if (this_rq->nr_running != this_rq->cfs.h_nr_running && + (this_rq->dl.dl_nr_running || + (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) pulled_task = -1; if (pulled_task) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f3cee0a63b76..d8cdf1618551 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -470,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} - static int rt_se_boosted(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); @@ -545,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { } -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_online_mask; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 378bff76267f..f2de7a175620 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -423,6 +423,18 @@ struct rt_rq { #endif }; +#ifdef CONFIG_RT_GROUP_SCHED +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} +#else +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return 
rt_rq->rt_throttled; +} +#endif + /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ -- cgit v1.2.3 From 35805ff8f4fc535ac85330170d3c56829c87c677 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 6 Mar 2014 19:16:15 +0400 Subject: sched/fair: Fix endless loop in idle_balance() Check for fair tasks number to decide, that we've pulled a task. rq's nr_running may contain throttled RT tasks. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394118975.19290.104.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 10db4a87ad72..f1eedae1e83e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6710,7 +6710,7 @@ static int idle_balance(struct rq *this_rq) * While browsing the domains, we released the rq lock. * A task could have be enqueued in the meantime */ - if (this_rq->nr_running && !pulled_task) { + if (this_rq->cfs.h_nr_running && !pulled_task) { pulled_task = 1; goto out; } -- cgit v1.2.3 From 156654f491dd8d52687a5fbe1637f472a52ce75b Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 28 Feb 2014 07:23:11 +0100 Subject: sched/numa: Move task_numa_free() to __put_task_struct() Bad idea on -rt: [ 908.026136] [] rt_spin_lock_slowlock+0xaa/0x2c0 [ 908.026145] [] task_numa_free+0x31/0x130 [ 908.026151] [] finish_task_switch+0xce/0x100 [ 908.026156] [] thread_return+0x48/0x4ae [ 908.026160] [] schedule+0x25/0xa0 [ 908.026163] [] rt_spin_lock_slowlock+0xd5/0x2c0 [ 908.026170] [] get_signal_to_deliver+0xaf/0x680 [ 908.026175] [] do_signal+0x3d/0x5b0 [ 908.026179] [] do_notify_resume+0x90/0xe0 [ 908.026186] [] int_signal+0x12/0x17 [ 908.026193] [<00007ff2a388b1d0>] 0x7ff2a388b1cf and since upstream does not mind where we do this, be a bit nicer ... 
Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Mel Gorman Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1393568591.6018.27.camel@marge.simpson.net Signed-off-by: Ingo Molnar --- kernel/fork.c | 1 + kernel/sched/core.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a17621c6cd42..332688e5e7b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -237,6 +237,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + task_numa_free(tsk); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd89c27bb56f..9e126a21c5c7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2151,8 +2151,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { - task_numa_free(prev); - if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); -- cgit v1.2.3 From c9122da1e2d29bd6a1475a0d1ce2aa6ac6ea25fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2014 13:32:16 +0100 Subject: locking: Move mcs_spinlock.h into kernel/locking/ The mcs_spinlock code is not meant (or suitable) as a generic locking primitive, therefore take it away from the normal includes and place it in kernel/locking/. This way the locking primitives implemented there can use it as part of their implementation but we do not risk it getting used inappropriately.
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-byirmpamgr7h25m5kyavwpzx@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mcs_spinlock.h | 114 ++++++++++++++++++++++++++++++++++++++++++ kernel/locking/mutex.c | 2 +- 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 kernel/locking/mcs_spinlock.h (limited to 'kernel') diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h new file mode 100644 index 000000000000..f2a5c6360083 --- /dev/null +++ b/kernel/locking/mcs_spinlock.h @@ -0,0 +1,114 @@ +/* + * MCS lock defines + * + * This file contains the main data structure and API definitions of MCS lock. + * + * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock + * with the desirable properties of being fair, and with each cpu trying + * to acquire the lock spinning on a local variable. + * It avoids expensive cache bouncings that common test-and-set spin-lock + * implementations incur. + */ +#ifndef __LINUX_MCS_SPINLOCK_H +#define __LINUX_MCS_SPINLOCK_H + +#include + +struct mcs_spinlock { + struct mcs_spinlock *next; + int locked; /* 1 if lock acquired */ +}; + +#ifndef arch_mcs_spin_lock_contended +/* + * Using smp_load_acquire() provides a memory barrier that ensures + * subsequent operations happen after the lock is acquired. + */ +#define arch_mcs_spin_lock_contended(l) \ +do { \ + while (!(smp_load_acquire(l))) \ + arch_mutex_cpu_relax(); \ +} while (0) +#endif + +#ifndef arch_mcs_spin_unlock_contended +/* + * smp_store_release() provides a memory barrier to ensure all + * operations in the critical section has been completed before + * unlocking. + */ +#define arch_mcs_spin_unlock_contended(l) \ + smp_store_release((l), 1) +#endif + +/* + * Note: the smp_load_acquire/smp_store_release pair is not + * sufficient to form a full memory barrier across + * cpus for many architectures (except x86) for mcs_unlock and mcs_lock. 
+ * For applications that need a full barrier across multiple cpus + * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be + * used after mcs_lock. + */ + +/* + * In order to acquire the lock, the caller should declare a local node and + * pass a reference of the node to this function in addition to the lock. + * If the lock has already been acquired, then this will proceed to spin + * on this node->locked until the previous lock holder sets the node->locked + * in mcs_spin_unlock(). + * + * We don't inline mcs_spin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +static inline +void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* + * Lock acquired, don't need to set node->locked to 1. Threads + * only spin on its own node->locked value for lock acquisition. + * However, since this thread can immediately acquire the lock + * and does not proceed to spin on its own node->locked, this + * value won't be used. If a debug mode is needed to + * audit lock status, then set node->locked value here. + */ + return; + } + ACCESS_ONCE(prev->next) = node; + + /* Wait until the lock holder passes the lock down. */ + arch_mcs_spin_lock_contended(&node->locked); +} + +/* + * Releases the lock. The caller should pass in the corresponding node that + * was used to acquire the lock. + */ +static inline +void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (likely(cmpxchg(lock, node, NULL) == node)) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + + /* Pass lock to next waiter. 
*/ + arch_mcs_spin_unlock_contended(&next->locked); +} + +#endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 45fe1b5293d6..4f408be39a07 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include "mcs_spinlock.h" /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, -- cgit v1.2.3 From 46af29e479cc0c1c63633007993af5292c2c3e75 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Jan 2014 11:13:12 -0800 Subject: locking/mutexes: Return false if task need_resched() in mutex_can_spin_on_owner() The mutex_can_spin_on_owner() function should also return false if the task needs to be rescheduled to avoid entering the MCS queue when it needs to reschedule. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Waiman.Long@hp.com Cc: torvalds@linux-foundation.org Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Cc: chegu_vinod@hp.com Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1390936396-3962-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4f408be39a07..e6d646b18d6c 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -166,6 +166,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) struct task_struct *owner; int retval = 1; + if (need_resched()) + return 0; + rcu_read_lock(); owner = ACCESS_ONCE(lock->owner); if (owner) -- cgit v1.2.3 From 47667fa1502e4d759df87e9cc7fbc0f202483361 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Jan 2014 11:13:13 -0800 Subject: locking/mutexes: Modify the way optimistic spinners are queued The mutex->spin_mlock was introduced in order to ensure that only 1 
thread spins for lock acquisition at a time to reduce cache line contention. When lock->owner is NULL and the lock->count is still not 1, the spinner(s) will continually release and obtain the lock->spin_mlock. This can generate quite a bit of overhead/contention, and also might just delay the spinner from getting the lock. This patch modifies the way optimistic spinners are queued by queuing before entering the optimistic spinning loop as opposed to acquiring before every call to mutex_spin_on_owner(). So in situations where the spinner requires a few extra spins before obtaining the lock, then there will only be 1 spinner trying to get the lock and it will avoid the overhead from unnecessarily unlocking and locking the spin_mlock. Signed-off-by: Jason Low Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Cc: chegu_vinod@hp.com Cc: Waiman.Long@hp.com Cc: paulmck@linux.vnet.ibm.com Cc: torvalds@linux-foundation.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1390936396-3962-3-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index e6d646b18d6c..82dad2ccd40b 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -403,9 +403,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!mutex_can_spin_on_owner(lock)) goto slowpath; + mcs_spin_lock(&lock->mcs_lock, &node); for (;;) { struct task_struct *owner; - struct mcs_spinlock node; if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; @@ -420,19 +420,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * performed the optimistic spinning cannot be done.
*/ if (ACCESS_ONCE(ww->ctx)) - goto slowpath; + break; } /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ - mcs_spin_lock(&lock->mcs_lock, &node); owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) { - mcs_spin_unlock(&lock->mcs_lock, &node); - goto slowpath; - } + if (owner && !mutex_spin_on_owner(lock, owner)) + break; if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { @@ -449,7 +446,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_enable(); return 0; } - mcs_spin_unlock(&lock->mcs_lock, &node); /* * When there's no owner, we might have preempted between the @@ -458,7 +454,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * the owner complete. */ if (!owner && (need_resched() || rt_task(task))) - goto slowpath; + break; /* * The cpu_relax() call is a compiler barrier which forces @@ -468,6 +464,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } + mcs_spin_unlock(&lock->mcs_lock, &node); slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From 1d8fe7dc8078b23e060ec62ccb4cdc1ac3c41bf8 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Jan 2014 11:13:14 -0800 Subject: locking/mutexes: Unlock the mutex without the wait_lock When running workloads that have high contention in mutexes on an 8 socket machine, mutex spinners would often spin for a long time with no lock owner. The main reason why this is occurring is in __mutex_unlock_common_slowpath(), if __mutex_slowpath_needs_to_unlock(), then the owner needs to acquire the mutex->wait_lock before releasing the mutex (setting lock->count to 1). When the wait_lock is contended, this delays the mutex from being released. We should be able to release the mutex without holding the wait_lock.
Signed-off-by: Jason Low Cc: chegu_vinod@hp.com Cc: paulmck@linux.vnet.ibm.com Cc: Waiman.Long@hp.com Cc: torvalds@linux-foundation.org Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1390936396-3962-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 82dad2ccd40b..dc3d6f2bbe2a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -671,10 +671,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) struct mutex *lock = container_of(lock_count, struct mutex, count); unsigned long flags; - spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); - debug_mutex_unlock(lock); - /* * some architectures leave the lock unlocked in the fastpath failure * case, others need to leave it locked. In the later case we have to @@ -683,6 +679,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) if (__mutex_slowpath_needs_to_unlock()) atomic_set(&lock->count, 1); + spin_lock_mutex(&lock->wait_lock, flags); + mutex_release(&lock->dep_map, nested, _RET_IP_); + debug_mutex_unlock(lock); + if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = -- cgit v1.2.3 From fb0527bd5ea99bfeb2dd91e3c1433ecf745d6b99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Jan 2014 12:51:42 +0100 Subject: locking/mutexes: Introduce cancelable MCS lock for adaptive spinning Since we want a task waiting for a mutex_lock() to go to sleep and reschedule on need_resched() we must be able to abort the mcs_spin_lock() around the adaptive spin. Therefore implement a cancelable mcs lock. 
Signed-off-by: Peter Zijlstra Cc: chegu_vinod@hp.com Cc: paulmck@linux.vnet.ibm.com Cc: Waiman.Long@hp.com Cc: torvalds@linux-foundation.org Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Cc: Jason Low Link: http://lkml.kernel.org/n/tip-62hcl5wxydmjzd182zhvk89m@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/Makefile | 2 +- kernel/locking/mcs_spinlock.c | 178 ++++++++++++++++++++++++++++++++++++++++++ kernel/locking/mcs_spinlock.h | 15 ++++ kernel/locking/mutex.c | 10 ++- 4 files changed, 200 insertions(+), 5 deletions(-) create mode 100644 kernel/locking/mcs_spinlock.c (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index baab8e5e7f66..2a9ee96ecf00 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o lglock.o +obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c new file mode 100644 index 000000000000..838dc9e00669 --- /dev/null +++ b/kernel/locking/mcs_spinlock.c @@ -0,0 +1,178 @@ + +#include +#include +#include +#include "mcs_spinlock.h" + +#ifdef CONFIG_SMP + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. 
+ */ +static inline struct optimistic_spin_queue * +osq_wait_next(struct optimistic_spin_queue **lock, + struct optimistic_spin_queue *node, + struct optimistic_spin_queue *prev) +{ + struct optimistic_spin_queue *next = NULL; + + for (;;) { + if (*lock == node && cmpxchg(lock, node, prev) == node) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. + */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + arch_mutex_cpu_relax(); + } + + return next; +} + +bool osq_lock(struct optimistic_spin_queue **lock) +{ + struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_queue *prev, *next; + + node->locked = 0; + node->next = NULL; + + node->prev = prev = xchg(lock, node); + if (likely(prev == NULL)) + return true; + + ACCESS_ONCE(prev->next) = node; + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. + */ + + while (!smp_load_acquire(&node->locked)) { + /* + * If we need to reschedule bail... so we can block. 
+ */ + if (need_resched()) + goto unqueue; + + arch_mutex_cpu_relax(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + arch_mutex_cpu_relax(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = ACCESS_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. + */ + + ACCESS_ONCE(next->prev) = prev; + ACCESS_ONCE(prev->next) = next; + + return false; +} + +void osq_unlock(struct optimistic_spin_queue **lock) +{ + struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_queue *next; + + /* + * Fast path for the uncontended case. + */ + if (likely(cmpxchg(lock, node, NULL) == node)) + return; + + /* + * Second most likely case. 
+ */ + next = xchg(&node->next, NULL); + if (next) { + ACCESS_ONCE(next->locked) = 1; + return; + } + + next = osq_wait_next(lock, node, NULL); + if (next) + ACCESS_ONCE(next->locked) = 1; +} + +#endif + diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index f2a5c6360083..a2dbac4aca6b 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -111,4 +111,19 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) arch_mcs_spin_unlock_contended(&next->locked); } +/* + * Cancellable version of the MCS lock above. + * + * Intended for adaptive spinning of sleeping locks: + * mutex_lock()/rwsem_down_{read,write}() etc. + */ + +struct optimistic_spin_queue { + struct optimistic_spin_queue *next, *prev; + int locked; /* 1 if lock acquired */ +}; + +extern bool osq_lock(struct optimistic_spin_queue **lock); +extern void osq_unlock(struct optimistic_spin_queue **lock); + #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index dc3d6f2bbe2a..2670b84067d6 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -53,7 +53,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->mcs_lock = NULL; + lock->osq = NULL; #endif debug_mutex_init(lock, name, key); @@ -403,7 +403,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!mutex_can_spin_on_owner(lock)) goto slowpath; - mcs_spin_lock(&lock->mcs_lock, &node); + if (!osq_lock(&lock->osq)) + goto slowpath; + for (;;) { struct task_struct *owner; @@ -442,7 +444,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } mutex_set_owner(lock); - mcs_spin_unlock(&lock->mcs_lock, &node); + osq_unlock(&lock->osq); preempt_enable(); return 0; } @@ -464,7 +466,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int 
subclass, */ arch_mutex_cpu_relax(); } - mcs_spin_unlock(&lock->mcs_lock, &node); + osq_unlock(&lock->osq); slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From 34c6bc2c919a55e5ad4e698510a2f35ee13ab900 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2014 16:21:09 +0100 Subject: locking/mutexes: Add extra reschedule point Add in an extra reschedule in an attempt to avoid getting rescheduled the moment we've acquired the lock. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-zah5eyn9gu7qlgwh9r6n2anc@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2670b84067d6..02c61a9c8906 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -468,6 +468,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } osq_unlock(&lock->osq); slowpath: + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock the mutex. This avoids getting + * scheduled out right after we obtained the mutex. + */ + if (need_resched()) + schedule_preempt_disabled(); #endif spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From db0fbadcbd0c288525ea9f76488b324642a78c7f Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 10 Mar 2014 21:42:11 +0100 Subject: ftrace: Fix compilation warning about control_ops_free With CONFIG_DYNAMIC_FTRACE=n, I see a warning: kernel/trace/ftrace.c:240:13: warning: 'control_ops_free' defined but not used static void control_ops_free(struct ftrace_ops *ops) ^ Move that function around to an already existing #ifdef CONFIG_DYNAMIC_FTRACE block as the function is used solely from the dynamic function tracing functions.
Link: http://lkml.kernel.org/r/1394484131-5107-1-git-send-email-jslaby@suse.cz Signed-off-by: Jiri Slaby Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0e48ff4cefa5..b4531b228180 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -237,11 +237,6 @@ static int control_ops_alloc(struct ftrace_ops *ops) return 0; } -static void control_ops_free(struct ftrace_ops *ops) -{ - free_percpu(ops->disabled); -} - static void update_global_ops(void) { ftrace_func_t func = ftrace_global_list_func; @@ -2100,6 +2095,11 @@ static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; static int global_start_up; +static void control_ops_free(struct ftrace_ops *ops) +{ + free_percpu(ops->disabled); +} + static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { -- cgit v1.2.3 From 4d4348202b34c130a899b597fec14bbb5d83108d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 11 Mar 2014 11:23:41 +0100 Subject: PM / Hibernate: Spelling s/anonymouns/anonymous/ Spelling fix. Signed-off-by: Geert Uytterhoeven Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d9f61a145802..149e745eaa52 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1268,7 +1268,7 @@ static void free_unnecessary_pages(void) * [number of saveable pages] - [number of pages that can be freed in theory] * * where the second term is the sum of (1) reclaimable slab pages, (2) active - * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages, + * and (3) inactive anonymous pages, (4) active and (5) inactive file pages, * minus mapped file pages. 
*/ static unsigned long minimum_image_size(unsigned long saveable) -- cgit v1.2.3 From af02b5fdb1fb3c5d5f8d71f7f84e4fb243e1ae31 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 11 Mar 2014 12:08:11 +0100 Subject: PM: Add missing "freeze" state Fix descriptions of /sys/power/state in the documentation and in a code comment. Signed-off-by: Geert Uytterhoeven Reviewed-by: Srivatsa S. Bhat Acked-by: Pavel Machek [rjw: Changelog] Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 1d1bf630e6e9..6271bc4073ef 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -282,8 +282,8 @@ struct kobject *power_kobj; * state - control system power state. * * show() returns what states are supported, which is hard-coded to - * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and - * 'disk' (Suspend-to-Disk). + * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), + * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). * * store() accepts one of those strings, translates it into the * proper enumerated value, and initiates a suspend transition. -- cgit v1.2.3 From 4c11628a16506a8a8e030515f601771df07bba97 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 11 Mar 2014 21:32:28 -0400 Subject: tracepoints: API doc update to data argument Describe the @data argument (probe private data). 
Link: http://lkml.kernel.org/r/1394587948-27878-1-git-send-email-mathieu.desnoyers@efficios.com Fixes: 38516ab59fbc "tracing: Let tracepoints have data passed to tracepoint callbacks" CC: Ingo Molnar CC: Frederic Weisbecker CC: Andrew Morton Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 0058f33d05c1..a4f629da3011 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -376,6 +376,7 @@ tracepoint_add_probe(const char *name, void *probe, void *data) * tracepoint_probe_register - Connect a probe to a tracepoint * @name: tracepoint name * @probe: probe handler + * @data: probe private data * * Returns 0 if ok, error value on error. * The probe address must at least be aligned on the architecture pointer size. @@ -424,6 +425,7 @@ tracepoint_remove_probe(const char *name, void *probe, void *data) * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @name: tracepoint name * @probe: probe function pointer + * @data: probe private data * * We do not need to call a synchronize_sched to make sure the probes have * finished running before doing a module unload, because the module unload @@ -464,6 +466,7 @@ static void tracepoint_add_old_probes(void *old) * tracepoint_probe_register_noupdate - register a probe but not connect * @name: tracepoint name * @probe: probe handler + * @data: probe private data * * caller must call tracepoint_probe_update_all() */ @@ -488,6 +491,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect * @name: tracepoint name * @probe: probe function pointer + * @data: probe private data * * caller must call tracepoint_probe_update_all() */ -- cgit v1.2.3 From 3bbc8db341773eb6aa5576eaabca4e95170fbe34 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 10 Mar 2014 21:04:58 -0400 
Subject: tracepoints: API doc update to tracepoint_probe_register() return value Describe the return values of tracepoint_probe_register(), including -ENODEV added by commit: Author: Steven Rostedt tracing: Warn if a tracepoint is not set via debugfs Link: http://lkml.kernel.org/r/1394499898-1537-2-git-send-email-mathieu.desnoyers@efficios.com CC: Ingo Molnar CC: Frederic Weisbecker CC: Andrew Morton Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index a4f629da3011..e2a58a22b0f4 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -378,7 +378,17 @@ tracepoint_add_probe(const char *name, void *probe, void *data) * @probe: probe handler * @data: probe private data * - * Returns 0 if ok, error value on error. + * Returns: + * - 0 if the probe was successfully registered, and tracepoint + * callsites are currently loaded for that probe, + * - -ENODEV if the probe was successfully registered, but no tracepoint + * callsite is currently loaded for that probe, + * - other negative error value on error. + * + * When tracepoint_probe_register() returns either 0 or -ENODEV, + * parameters @name, @probe, and @data may be used by the tracepoint + * infrastructure until the probe is unregistered. + * * The probe address must at least be aligned on the architecture pointer size. 
*/ int tracepoint_probe_register(const char *name, void *probe, void *data) -- cgit v1.2.3 From d88471cb8b17a72b1edf5ab62e1704d78373c066 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 9 Jan 2013 18:09:20 -0500 Subject: ftrace: Constify ftrace_text_reserved Link: http://lkml.kernel.org/r/1357772960-4436-5-git-send-email-sasha.levin@oracle.com Signed-off-by: Sasha Levin Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b4531b228180..1fd4b9479210 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1555,7 +1555,7 @@ unsigned long ftrace_location(unsigned long ip) * the function tracer. It checks the ftrace internal tables to * determine if the address belongs or not. */ -int ftrace_text_reserved(void *start, void *end) +int ftrace_text_reserved(const void *start, const void *end) { unsigned long ret; -- cgit v1.2.3 From 383afd0971538b3d77532a56404b24cfe967b5dd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 11 Mar 2014 19:24:20 -0400 Subject: sched: Fix broken setscheduler() I decided to run my tests on linux-next, and my wakeup_rt tracer was broken. After running a bisect, I found that the problem commit was: linux-next commit c365c292d059 "sched: Consider pi boosting in setscheduler()" And the reason the wake_rt tracer test was failing, was because it had no RT task to trace. I first noticed this when running with sched_switch event and saw that my RT task still had normal SCHED_OTHER priority. 
Looking at the problem commit, I found: - p->normal_prio = normal_prio(p); - p->prio = rt_mutex_getprio(p); With no + p->normal_prio = normal_prio(p); + p->prio = rt_mutex_getprio(p); Reading what the commit is supposed to do, I realize that the p->prio can't be set if the task is boosted with a higher prio, but the p->normal_prio still needs to be set regardless, otherwise, when the task is deboosted, it won't get the new priority. The p->prio has to be set before "check_class_changed()" is called, otherwise the class won't be changed. Also added fix to newprio to include a check for deadline policy that was missing. This change was suggested by Juri Lelli. Signed-off-by: Steven Rostedt Cc: SebastianAndrzej Siewior Cc: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140306120438.638bfe94@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9e126a21c5c7..ae365aaa8181 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3195,6 +3195,7 @@ static void __setscheduler_params(struct task_struct *p, * getparam()/getattr() don't report silly values for !rt tasks. */ p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); set_load_weight(p); } @@ -3204,6 +3205,12 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, { __setscheduler_params(p, attr); + /* + * If we get here, there was no pi waiters boosting the + * task. It is safe to use the normal prio. + */ + p->prio = normal_prio(p); + if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; else if (rt_prio(p->prio)) @@ -3262,7 +3269,8 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) { - int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; + int newprio = dl_policy(attr->sched_policy) ?
MAX_DL_PRIO - 1 : + MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; int policy = attr->sched_policy; unsigned long flags; -- cgit v1.2.3 From a2cd42601b474b957e1a5fe3692bcf7f9363bd51 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Mar 2014 17:26:06 +0100 Subject: sched: Remove double calculation in fix_small_imbalance() The tmp value has been already calculated in: scaled_busy_load_per_task = (busiest->load_per_task * SCHED_POWER_SCALE) / busiest->group_power; Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394555166-22894-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f1eedae1e83e..b301918ed510 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6061,12 +6061,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; - if (busiest->avg_load > tmp) { + if (busiest->avg_load > scaled_busy_load_per_task) { pwr_move += busiest->group_power * min(busiest->load_per_task, - busiest->avg_load - tmp); + busiest->avg_load - scaled_busy_load_per_task); } /* Amount of load we'd add */ -- cgit v1.2.3 From 6037dd1a49f95092824fa8ba75c717ff7805e317 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 12 Mar 2014 14:51:51 +0800 Subject: sched: Clean up the task_hot() function task_hot() doesn't need the 'sched_domain' parameter, so remove it. 
Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394607111-1904-1-git-send-email-alex.shi@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b301918ed510..7e9bd0b1fa9e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5037,7 +5037,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) * Is this task likely cache-hot: */ static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +task_hot(struct task_struct *p, u64 now) { s64 delta; @@ -5198,7 +5198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); -- cgit v1.2.3 From 6f008e72cd111a119b5d8de8c5438d892aae99eb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Mar 2014 13:24:42 +0100 Subject: locking/mutex: Fix debug checks OK, so commit: 1d8fe7dc8078 ("locking/mutexes: Unlock the mutex without the wait_lock") generates this boot warning when CONFIG_DEBUG_MUTEXES=y: WARNING: CPU: 0 PID: 139 at /usr/src/linux-2.6/kernel/locking/mutex-debug.c:82 debug_mutex_unlock+0x155/0x180() DEBUG_LOCKS_WARN_ON(lock->owner != current) And that makes sense, because as soon as we release the lock a new owner can come in... One would think that !__mutex_slowpath_needs_to_unlock() implementations suffer the same, but for DEBUG we fall back to mutex-null.h which has an unconditional 1 for that. The mutex debug code requires the mutex to be unlocked after doing the debug checks, otherwise it can find inconsistent state. 
Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: jason.low2@hp.com Link: http://lkml.kernel.org/r/20140312122442.GB27965@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex-debug.c | 6 ++++++ kernel/locking/mutex.c | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index faf6f5b53e77..e1191c996c59 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -83,6 +83,12 @@ void debug_mutex_unlock(struct mutex *lock) DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); + + /* + * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug + * mutexes so that we can do it here after we've verified state. + */ + atomic_set(&lock->count, 1); } void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 02c61a9c8906..14fe72cc8ce7 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -34,6 +34,13 @@ #ifdef CONFIG_DEBUG_MUTEXES # include "mutex-debug.h" # include +/* + * Must be 0 for the debug case so we do not do the unlock outside of the + * wait_lock region. debug_mutex_unlock() will do the actual unlock in this + * case. + */ +# undef __mutex_slowpath_needs_to_unlock +# define __mutex_slowpath_needs_to_unlock() 0 #else # include "mutex.h" # include -- cgit v1.2.3 From c1bacbae8192dd2a9ebadd22d793b68054f6c6e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 8 Mar 2014 08:59:58 +0100 Subject: genirq: Provide irq_request/release_resources chip callbacks For certain irq types, e.g. gpios, it's necessary to request resources before starting up the irq. This might fail so we cannot use the irq_startup() callback because we might call the irq_set_type() callback before that which does not make sense when the resource is not available. 
Calling irq_startup() before irq_set_type() can lead to spurious interrupts which is not desired either. Signed-off-by: Thomas Gleixner Cc: Jean-Jacques Hiblot Cc: Grant Likely Cc: linux-arm-kernel@lists.infradead.org Reviewed-by: Linus Walleij Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1403080857160.18573@ionos.tec.linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d3bf660cb57f..5d35cbe22896 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -896,6 +896,23 @@ static void irq_setup_forced_threading(struct irqaction *new) } } +static int irq_request_resources(struct irq_desc *desc) +{ + struct irq_data *d = &desc->irq_data; + struct irq_chip *c = d->chip; + + return c->irq_request_resources ? c->irq_request_resources(d) : 0; +} + +static void irq_release_resources(struct irq_desc *desc) +{ + struct irq_data *d = &desc->irq_data; + struct irq_chip *c = d->chip; + + if (c->irq_release_resources) + c->irq_release_resources(d); +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. 
@@ -1091,6 +1108,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { + ret = irq_request_resources(desc); + if (ret) { + pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", + new->name, irq, desc->irq_data.chip->name); + goto out_mask; + } + init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ @@ -1261,8 +1285,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) *action_ptr = action->next; /* If this was the last handler, shut down the IRQ line: */ - if (!desc->action) + if (!desc->action) { irq_shutdown(desc); + irq_release_resources(desc); + } #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ -- cgit v1.2.3 From 27bba4d6bb3779a6678b31f9c9b9c1553c63fa95 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 3 Feb 2014 11:13:13 +1030 Subject: module: use pr_cont When dumping loaded modules, we print them one by one in separate printks. Let's use pr_cont as they are continuation prints. 
Signed-off-by: Jiri Slaby Cc: Rusty Russell Signed-off-by: Rusty Russell --- kernel/module.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d24fcf29cb64..efa1e6031950 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3809,12 +3809,12 @@ void print_modules(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - printk(" %s%s", mod->name, module_flags(mod, buf)); + pr_cont(" %s%s", mod->name, module_flags(mod, buf)); } preempt_enable(); if (last_unloaded_module[0]) - printk(" [last unloaded: %s]", last_unloaded_module); - printk("\n"); + pr_cont(" [last unloaded: %s]", last_unloaded_module); + pr_cont("\n"); } #ifdef CONFIG_MODVERSIONS -- cgit v1.2.3 From 66cc69e34e86a231fbe68d8918c6119e3b7549a3 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 13 Mar 2014 12:11:30 +1030 Subject: Fix: module signature vs tracepoints: add new TAINT_UNSIGNED_MODULE Users have reported being unable to trace non-signed modules loaded within a kernel supporting module signature. This is caused by tracepoint.c:tracepoint_module_coming() refusing to take into account tracepoints sitting within force-loaded modules (TAINT_FORCED_MODULE). The reason for this check, in the first place, is that a force-loaded module may have a struct module incompatible with the layout expected by the kernel, and can thus cause a kernel crash upon forced load of that module on a kernel with CONFIG_TRACEPOINTS=y. Tracepoints, however, specifically accept TAINT_OOT_MODULE and TAINT_CRAP, since those modules do not lead to the "very likely system crash" issue cited above for force-loaded modules. With kernels having CONFIG_MODULE_SIG=y (signed modules), a non-signed module is tainted re-using the TAINT_FORCED_MODULE taint flag. 
Unfortunately, this means that Tracepoints treat that module as a force-loaded module, and thus silently refuse to consider any tracepoint within this module. Since an unsigned module does not fit within the "very likely system crash" category of tainting, add a new TAINT_UNSIGNED_MODULE taint flag to specifically address this taint behavior, and accept those modules within Tracepoints. We use the letter 'X' as a taint flag character for a module being loaded that doesn't know how to sign its name (proposed by Steven Rostedt). Also add the missing 'O' entry to trace event show_module_flags() list for the sake of completeness. Signed-off-by: Mathieu Desnoyers Acked-by: Steven Rostedt NAKed-by: Ingo Molnar CC: Thomas Gleixner CC: David Howells CC: Greg Kroah-Hartman Signed-off-by: Rusty Russell --- kernel/module.c | 4 +++- kernel/panic.c | 2 ++ kernel/tracepoint.c | 5 +++-- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index efa1e6031950..c1acb0c5b637 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1013,6 +1013,8 @@ static size_t module_flags_taint(struct module *mod, char *buf) buf[l++] = 'F'; if (mod->taints & (1 << TAINT_CRAP)) buf[l++] = 'C'; + if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) + buf[l++] = 'X'; /* * TAINT_FORCED_RMMOD: could be added. 
* TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't @@ -3214,7 +3216,7 @@ static int load_module(struct load_info *info, const char __user *uargs, pr_notice_once("%s: module verification failed: signature " "and/or required key missing - tainting " "kernel\n", mod->name); - add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); + add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); } #endif diff --git a/kernel/panic.c b/kernel/panic.c index 6d6300375090..0e25fe10871e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -210,6 +210,7 @@ static const struct tnt tnts[] = { { TAINT_CRAP, 'C', ' ' }, { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, + { TAINT_UNSIGNED_MODULE, 'X', ' ' }, }; /** @@ -228,6 +229,7 @@ static const struct tnt tnts[] = { * 'C' - modules from drivers/staging are loaded. * 'I' - Working around severe firmware bug. * 'O' - Out-of-tree module has been loaded. + * 'X' - Unsigned module has been loaded. * * The string is overwritten by the next call to print_tainted(). */ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 031cc5655a51..3cdbed1fbdc7 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -633,7 +633,8 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES bool trace_module_has_bad_taint(struct module *mod) { - return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); + return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | + (1 << TAINT_UNSIGNED_MODULE)); } static int tracepoint_module_coming(struct module *mod) @@ -644,7 +645,7 @@ static int tracepoint_module_coming(struct module *mod) /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. - * Staging and out-of-tree GPL modules are fine. + * Staging, out-of-tree, and unsigned GPL modules are fine. 
*/ if (trace_module_has_bad_taint(mod)) return 0; -- cgit v1.2.3 From dee08a72deefac251267ed2717717596aa8b6818 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 5 Mar 2014 17:02:22 +0100 Subject: cputime: Fix jiffies based cputime assumption on steal accounting The steal guest time accounting code assumes that cputime_t is based on jiffies. So when CONFIG_NO_HZ_FULL=y, which implies that cputime_t is based on nsecs, steal_account_process_tick() passes the delta in jiffies to account_steal_time() which then accounts it as if it's a value in nsecs. As a result, accounting 1 second of steal time (with HZ=100 that would be 100 jiffies) is spuriously accounted as 100 nsecs. As such /proc/stat may report 0 values of steal time even when two guests have run concurrently for a few seconds on the same host and same CPU. In order to fix this, let's convert the nsecs based steal delta to cputime instead of jiffies by using the right conversion API. Given that the steal time is stored in cputime_t and this type can have a smaller granularity than nsecs, we only account the rounded converted value and leave the remaining nsecs for the next deltas.
Reported-by: Huiqingding Reported-by: Marcelo Tosatti Cc: Ingo Molnar Cc: Marcelo Tosatti Cc: Peter Zijlstra Cc: Thomas Gleixner Acked-by: Rik van Riel Signed-off-by: Frederic Weisbecker --- kernel/sched/cputime.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 99947919e30b..c91b09770ebd 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT if (static_key_false(&paravirt_steal_enabled)) { - u64 steal, st = 0; + u64 steal; + cputime_t steal_ct; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; - st = steal_ticks(steal); - this_rq()->prev_steal_time += st * TICK_NSEC; + /* + * cputime_t may be less precise than nsecs (eg: if it's + * based on jiffies). Lets cast the result to cputime + * granularity and account the rest on the next rounds. + */ + steal_ct = nsecs_to_cputime(steal); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); - account_steal_time(st); - return st; + account_steal_time(steal_ct); + return steal_ct; } #endif return false; -- cgit v1.2.3 From 300a9d887ea221f344962506f724e02101bacc08 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 5 Mar 2014 17:05:57 +0100 Subject: sched: Remove needless round trip nsecs <-> tick conversion of steal time When update_rq_clock_task() accounts the pending steal time for a task, it converts the steal delta from nsecs to tick then from tick to nsecs. There is no apparent good reason for doing that though because both the task clock and the prev steal delta are u64 and store values in nsecs. So let's remove the needless conversion.
Cc: Ingo Molnar Cc: Marcelo Tosatti Cc: Peter Zijlstra Cc: Thomas Gleixner Acked-by: Rik van Riel Signed-off-by: Frederic Weisbecker --- kernel/sched/core.c | 6 ------ kernel/sched/sched.h | 10 ---------- 2 files changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..b14a188af898 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -823,19 +823,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((&paravirt_steal_rq_enabled))) { - u64 st; - steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - st = steal_ticks(steal); - steal = st * TICK_NSEC; - rq->prev_steal_time_rq += steal; - delta -= steal; } #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2119fd20f8b..5ec991010122 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1214,16 +1214,6 @@ extern void update_idle_cpu_load(struct rq *this_rq); extern void init_task_runnable_average(struct task_struct *p); -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif - static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; -- cgit v1.2.3 From 89f8b33ca1ea881d1d84542282cb85d07d02e78d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Mar 2014 09:38:42 -0600 Subject: block: remove old blk_iopoll_enabled variable This was a debugging measure to toggle enabled/disabled when testing. But for real production setups, it's not safe to toggle this setting without either reloading drivers or quiescing IO first. Neither of which the toggle enforces. Additionally, it makes drivers deal with the conditional state. Remove it completely. It's up to the driver whether iopoll is enabled or not.
Signed-off-by: Jens Axboe --- kernel/sysctl.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 49e13e1f8fe6..ef0bf04e8649 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -112,9 +112,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU extern int sysctl_nr_trim_pages; #endif -#ifdef CONFIG_BLOCK -extern int blk_iopoll_enabled; -#endif /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR @@ -1093,15 +1090,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#endif -#ifdef CONFIG_BLOCK - { - .procname = "blk_iopoll", - .data = &blk_iopoll_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif { } }; -- cgit v1.2.3 From 328a4978df833249b099c9875738d7b72042ffe1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2014 19:03:51 +0100 Subject: genirq: Add a new IRQCHIP_EOI_THREADED flag The flag is necessary for interrupt chips which require an ACK/EOI after the handler has run. In case of threaded handlers this needs to happen after the threaded handler has completed before the unmask of the interrupt. The flag is only useful in combination with the handle_fasteoi_irq flow control handler. It can be combined with the flag IRQCHIP_EOI_IF_HANDLED, so the EOI is not issued when the interrupt is disabled or in progress.
Tested-by: Hans de Goede Reviewed-by: Hans de Goede Cc: linux-arm-kernel@lists.infradead.org Cc: linux-sunxi@googlegroups.com Cc: Maxime Ripard Link: http://lkml.kernel.org/r/1394733834-26839-2-git-send-email-hdegoede@redhat.com Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- kernel/irq/internals.h | 1 + kernel/irq/manage.c | 2 +- 3 files changed, 42 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index dc04c166c54d..6397df2d6945 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -281,6 +281,19 @@ void unmask_irq(struct irq_desc *desc) } } +void unmask_threaded_irq(struct irq_desc *desc) +{ + struct irq_chip *chip = desc->irq_data.chip; + + if (chip->flags & IRQCHIP_EOI_THREADED) + chip->irq_eoi(&desc->irq_data); + + if (chip->irq_unmask) { + chip->irq_unmask(&desc->irq_data); + irq_state_clr_masked(desc); + } +} + /* * handle_nested_irq - Handle a nested irq from a irq thread * @irq: the interrupt number @@ -435,6 +448,27 @@ static inline void preflow_handler(struct irq_desc *desc) static inline void preflow_handler(struct irq_desc *desc) { } #endif +static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) +{ + if (!(desc->istate & IRQS_ONESHOT)) { + chip->irq_eoi(&desc->irq_data); + return; + } + /* + * We need to unmask in the following cases: + * - Oneshot irq which did not wake the thread (caused by a + * spurious interrupt or a primary handler handling it + * completely). 
+ */ + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) { + chip->irq_eoi(&desc->irq_data); + unmask_irq(desc); + } else if (!(chip->flags & IRQCHIP_EOI_THREADED)) { + chip->irq_eoi(&desc->irq_data); + } +} + /** * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number @@ -448,6 +482,8 @@ static inline void preflow_handler(struct irq_desc *desc) { } void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { + struct irq_chip *chip = desc->irq_data.chip; + raw_spin_lock(&desc->lock); if (unlikely(irqd_irq_inprogress(&desc->irq_data))) @@ -473,18 +509,14 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) preflow_handler(desc); handle_irq_event(desc); - if (desc->istate & IRQS_ONESHOT) - cond_unmask_irq(desc); + cond_unmask_eoi_irq(desc, chip); -out_eoi: - desc->irq_data.chip->irq_eoi(&desc->irq_data); -out_unlock: raw_spin_unlock(&desc->lock); return; out: - if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) - goto out_eoi; - goto out_unlock; + if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); } /** diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 17b671713d5f..ddf1ffeb79f1 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -74,6 +74,7 @@ extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); +extern void unmask_threaded_irq(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index de1a8ed29b40..2486a4c1a710 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -748,7 +748,7 @@ again: if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && 
irqd_irq_masked(&desc->irq_data)) - unmask_irq(desc); + unmask_threaded_irq(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); -- cgit v1.2.3 From 09294e31b1779dda22f420c195994a0db54c9a92 Mon Sep 17 00:00:00 2001 From: "David A. Long" Date: Fri, 7 Mar 2014 10:32:22 -0500 Subject: uprobes: Kconfig dependency fix Suggested change from Oleg Nesterov. Fixes incomplete dependencies for uprobes feature. Signed-off-by: David A. Long Acked-by: Oleg Nesterov --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 015f85aaca08..8639819f6cef 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -424,6 +424,7 @@ config UPROBE_EVENT bool "Enable uprobes-based dynamic events" depends on ARCH_SUPPORTS_UPROBES depends on MMU + depends on PERF_EVENTS select UPROBES select PROBE_EVENTS select TRACING -- cgit v1.2.3 From 6fe50a28ba6e5fafb4a549dea666dd15297dd8bd Mon Sep 17 00:00:00 2001 From: "David A. Long" Date: Mon, 3 Feb 2014 14:25:49 -0500 Subject: uprobes: allow ignoring of probe hits Allow arches to decide to ignore a probe hit. ARM will use this to only call handlers if the conditions to execute a conditionally executed instruction are satisfied. Signed-off-by: David A. Long Acked-by: Oleg Nesterov --- kernel/events/uprobes.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 307d87c0991a..04709b66369d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1804,6 +1804,11 @@ static bool handle_trampoline(struct pt_regs *regs) return true; } +bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) +{ + return false; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1858,7 +1863,11 @@ static void handle_swbp(struct pt_regs *regs) if (!get_utask()) goto out; + if (arch_uprobe_ignore(&uprobe->arch, regs)) + goto out; + handler_chain(uprobe, regs); + if (can_skip_sstep(uprobe, regs)) goto out; -- cgit v1.2.3 From 3eb59ec64fc7a3f4576da23f811b39331b830ba2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 18 Mar 2014 17:02:36 +0800 Subject: cgroup: fix a failure path in create_css() If online_css() fails, we should remove cgroup files belonging to css->ss. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 105f273b6f86..0c753ddd223b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4112,17 +4112,17 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) err = percpu_ref_init(&css->refcnt, css_release); if (err) - goto err_free; + goto err_free_css; init_css(css, ss, cgrp); err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); if (err) - goto err_free; + goto err_free_percpu_ref; err = online_css(css); if (err) - goto err_free; + goto err_clear_dir; dget(cgrp->dentry); css_get(css->parent); @@ -4138,8 +4138,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) return 0; -err_free: +err_clear_dir: + cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); +err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); +err_free_css: ss->css_free(css); return err; } -- cgit v1.2.3 From d532676cc7329e1088702ccb0015942cc370b954 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Mar 2014 11:19:52 +0100 Subject: softirq: Add linux/irq.h to make it compile again On Sparc and S390 the removal of irq.h from kernel_stat.h causes: kernel/softirq.c:774:9: error: 'NR_IRQS_LEGACY' undeclared Reported-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git 
a/kernel/softirq.c b/kernel/softirq.c index 490fcbb1dc5b..b50990a5bea0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -25,6 +25,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include -- cgit v1.2.3 From bab5c790cc64adb1ede54b4077444375108ac8da Mon Sep 17 00:00:00 2001 From: Chema Gonzalez Date: Thu, 13 Mar 2014 19:50:55 -0700 Subject: genirq: procfs: Make smp_affinity values go+r Includes: - /proc/irq/default_smp_affinity - /proc/irq/*/affinity_hint - /proc/irq/*/smp_affinity - /proc/irq/*/smp_affinity_list Users can distill the same information by reading /proc/interrupts. Signed-off-by: Chema Gonzalez Cc: Eric Dumazet Link: http://lkml.kernel.org/r/1394765455-1217-1-git-send-email-chema@google.com Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 36f6ee181b0c..ac1ba2f11032 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -324,15 +324,15 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) #ifdef CONFIG_SMP /* create /proc/irq//smp_affinity */ - proc_create_data("smp_affinity", 0600, desc->dir, + proc_create_data("smp_affinity", 0644, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); /* create /proc/irq//affinity_hint */ - proc_create_data("affinity_hint", 0400, desc->dir, + proc_create_data("affinity_hint", 0444, desc->dir, &irq_affinity_hint_proc_fops, (void *)(long)irq); /* create /proc/irq//smp_affinity_list */ - proc_create_data("smp_affinity_list", 0600, desc->dir, + proc_create_data("smp_affinity_list", 0644, desc->dir, &irq_affinity_list_proc_fops, (void *)(long)irq); proc_create_data("node", 0444, desc->dir, @@ -372,7 +372,7 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) static void register_default_affinity_proc(void) { #ifdef CONFIG_SMP - proc_create("irq/default_smp_affinity", 0600, NULL, + 
proc_create("irq/default_smp_affinity", 0644, NULL, &default_affinity_proc_fops); #endif } -- cgit v1.2.3 From 5d77381fd8aa631a8fda718c395da1319afb5d2d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:53 -0400 Subject: cgroup: relocate setting of CGRP_DEAD In cgroup_destroy_locked(), move setting of CGRP_DEAD above invocations of kill_css(). This doesn't make any visible behavior difference now but will be used to inhibit manipulating controller enable states of a dying cgroup on the unified hierarchy. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 306ad0ed19ef..b604c7e0cfc6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3867,6 +3867,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (!empty) return -EBUSY; + /* + * Mark @cgrp dead. This prevents further task migration and child + * creation by disabling cgroup_lock_live_group(). Note that + * CGRP_DEAD assertion is depended upon by css_next_child() to + * resume iteration after dropping RCU read lock. See + * css_next_child() for details. + */ + set_bit(CGRP_DEAD, &cgrp->flags); + /* * Initiate massacre of all css's. cgroup_destroy_css_killed() * will be invoked to perform the rest of destruction once the @@ -3878,15 +3887,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) kill_css(css); mutex_lock(&cgroup_mutex); - /* - * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by css_next_child() to - * resume iteration after dropping RCU read lock. See - * css_next_child() for details. 
- */ - set_bit(CGRP_DEAD, &cgrp->flags); - /* CGRP_DEAD is set, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) -- cgit v1.2.3 From 172a2c0685ff3bc0b7a611b308aac0694de34594 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:53 -0400 Subject: cgroup: reorganize cgroup bootstrapping * Fields of init_css_set and css_set_count are now set using initializer instead of programmatically from cgroup_init_early(). * init_cgroup_root() now also takes @opts and performs the optional part of initialization too. The leftover part of cgroup_root_from_opts() is collapsed into its only caller - cgroup_mount(). * Initialization of cgroup_root_count and linking of init_css_set are moved from cgroup_init_early() to cgroup_init(). None of the early_init users depends on init_css_set being linked. * Subsystem initializations are moved after dummy hierarchy init and init_css_set linking. These changes reorganize the bootstrap logic so that the dummy hierarchy can share the usual hierarchy init path and be made more normal. These changes don't make noticeable behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 100 +++++++++++++++++++++++++++----------------------------- 1 file changed, 49 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b604c7e0cfc6..e66b9ee5ecc1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -338,16 +338,24 @@ struct cgrp_cset_link { struct list_head cgrp_link; }; -/* The default css_set - used by init and its children prior to any +/* + * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state * for each subsystem. Also used to anchor the list of css_sets. Not * reference-counted, to improve performance when child cgroups * haven't been created.
*/ +static struct css_set init_css_set = { + .refcount = ATOMIC_INIT(1), + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), + .tasks = LIST_HEAD_INIT(init_css_set.tasks), + .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), + .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), +}; -static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; -static int css_set_count; +static int css_set_count = 1; /* 1 for init_css_set */ /* * hash table for cgroup groups. This improves the performance to find @@ -1352,7 +1360,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dummy_css.cgroup = cgrp; } -static void init_cgroup_root(struct cgroupfs_root *root) +static void init_cgroup_root(struct cgroupfs_root *root, + struct cgroup_sb_opts *opts) { struct cgroup *cgrp = &root->top_cgroup; @@ -1361,20 +1370,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) cgrp->root = root; init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); -} - -static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) -{ - struct cgroupfs_root *root; - - if (!opts->subsys_mask && !opts->none) - return ERR_PTR(-EINVAL); - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) - return ERR_PTR(-ENOMEM); - - init_cgroup_root(root); root->flags = opts->flags; if (opts->release_agent) @@ -1383,7 +1378,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) strcpy(root->name, opts->name); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); - return root; } static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) @@ -1548,13 +1542,24 @@ retry: goto out_unlock; } - /* no such thing, create a new one */ - root = cgroup_root_from_opts(&opts); - if (IS_ERR(root)) { - ret = PTR_ERR(root); + /* + * No such thing, create a new one. 
name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. + */ + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; + } + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; goto out_unlock; } + init_cgroup_root(root, &opts); + ret = cgroup_setup_root(root, opts.subsys_mask); if (ret) cgroup_free_root(root); @@ -4030,26 +4035,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) */ int __init cgroup_init_early(void) { + static struct cgroup_sb_opts __initdata opts = { }; struct cgroup_subsys *ss; int i; - atomic_set(&init_css_set.refcount, 1); - INIT_LIST_HEAD(&init_css_set.cgrp_links); - INIT_LIST_HEAD(&init_css_set.tasks); - INIT_LIST_HEAD(&init_css_set.mg_tasks); - INIT_LIST_HEAD(&init_css_set.mg_preload_node); - INIT_LIST_HEAD(&init_css_set.mg_node); - INIT_HLIST_NODE(&init_css_set.hlist); - css_set_count = 1; - init_cgroup_root(&cgroup_dummy_root); - cgroup_root_count = 1; + init_cgroup_root(&cgroup_dummy_root, &opts); RCU_INIT_POINTER(init_task.cgroups, &init_css_set); - init_cgrp_cset_link.cset = &init_css_set; - init_cgrp_cset_link.cgrp = cgroup_dummy_top; - list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); - list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); - for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", @@ -4077,22 +4069,10 @@ int __init cgroup_init(void) { struct cgroup_subsys *ss; unsigned long key; - int i, err; + int ssid, err; BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - for_each_subsys(ss, i) { - if (!ss->early_init) - cgroup_init_subsys(ss); - - /* - * cftype registration needs kmalloc and can't be done - * during early_init. Register base cftypes separately. 
- */ - if (ss->base_cftypes) - WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); - } - /* allocate id for the dummy hierarchy */ mutex_lock(&cgroup_mutex); @@ -4106,8 +4086,26 @@ int __init cgroup_init(void) 0, 1, GFP_KERNEL); BUG_ON(err < 0); + cgroup_root_count = 1; + init_cgrp_cset_link.cset = &init_css_set; + init_cgrp_cset_link.cgrp = cgroup_dummy_top; + list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); + list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); + mutex_unlock(&cgroup_mutex); + for_each_subsys(ss, ssid) { + if (!ss->early_init) + cgroup_init_subsys(ss); + + /* + * cftype registration needs kmalloc and can't be done + * during early_init. Register base cftypes separately. + */ + if (ss->base_cftypes) + WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); + } + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); if (!cgroup_kobj) return -ENOMEM; -- cgit v1.2.3 From 985ed670144c25058f235276f69d687de1b7c7ba Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:53 -0400 Subject: cgroup: use cgroup_setup_root() to initialize cgroup_dummy_root cgroup_dummy_root is used to host controllers which aren't attached to any other hierarchy. The root is minimally set up during kernfs bootstrap and didn't go through full hierarchy initialization. We're planning to use cgroup_dummy_root for the default unified hierarchy and thus want it to be fully functional. Replace the special initialization, which was collected into cgroup_init() by the previous patch, with an invocation of cgroup_setup_root(). This simplifies the init path and makes cgroup_dummy_root a full hierarchy with its own kernfs_root and all. As this puts the dummy hierarchy on the cgroup_roots list, rename for_each_active_root() to for_each_root() and update its users to skip the dummy root for now. This patch doesn't cause any userland visible behavior changes at this point. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e66b9ee5ecc1..78017f52c69b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -289,8 +289,8 @@ static int notify_on_release(const struct cgroup *cgrp) for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) -/* iterate across the active hierarchies */ -#define for_each_active_root(root) \ +/* iterate across the hierarchies */ +#define for_each_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) /** @@ -354,7 +354,6 @@ static struct css_set init_css_set = { .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), }; -static struct cgrp_cset_link init_cgrp_cset_link; static int css_set_count = 1; /* 1 for init_css_set */ /* @@ -693,14 +692,13 @@ static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root) return top_cgrp->root; } -static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) +static int cgroup_init_root_id(struct cgroupfs_root *root) { int id; lockdep_assert_held(&cgroup_mutex); - id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, - GFP_KERNEL); + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); if (id < 0) return id; @@ -1405,8 +1403,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) if (ret) goto out; - /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ - ret = cgroup_init_root_id(root, 2, 0); + ret = cgroup_init_root_id(root); if (ret) goto out; @@ -1486,9 +1483,12 @@ retry: goto out_unlock; /* look for a matching existing root */ - for_each_active_root(root) { + for_each_root(root) { bool name_match = false; + if (root == &cgroup_dummy_root) + continue; + /* * If we asked for a name then it must match. 
Also, if * name matches but sybsys_mask doesn't, we should fail. @@ -2106,9 +2106,12 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); - for_each_active_root(root) { + for_each_root(root) { struct cgroup *from_cgrp; + if (root == &cgroup_dummy_root) + continue; + down_read(&css_set_rwsem); from_cgrp = task_cgroup_from_root(from, root); up_read(&css_set_rwsem); @@ -4073,26 +4076,17 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - /* allocate id for the dummy hierarchy */ + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ key = css_set_hash(init_css_set.subsys); hash_add(css_set_table, &init_css_set.hlist, key); - BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); - - err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, - 0, 1, GFP_KERNEL); - BUG_ON(err < 0); - - cgroup_root_count = 1; - init_cgrp_cset_link.cset = &init_css_set; - init_cgrp_cset_link.cgrp = cgroup_dummy_top; - list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); - list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); + BUG_ON(cgroup_setup_root(&cgroup_dummy_root, 0)); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); for_each_subsys(ss, ssid) { if (!ss->early_init) @@ -4176,11 +4170,14 @@ int proc_cgroup_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); down_read(&css_set_rwsem); - for_each_active_root(root) { + for_each_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int ssid, count = 0; + if (root == &cgroup_dummy_root) + continue; + seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) -- cgit v1.2.3 From 5df3603229e520442502ff7fc5715c77bbf61912 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:54 -0400 Subject: cgroup: treat cgroup_dummy_root as an equivalent hierarchy during 
rebinding Currently, while rebinding, cgroup_dummy_root serves as the anchor point. In addition to the target root, rebind_subsystems() takes @added_mask and @removed_mask. The subsystems specified in the former are expected to be on the dummy root and then moved to the target root. The ones in the latter are moved from non-dummy root to dummy. Now that the dummy root is a fully functional one and we're planning to use it for the default unified hierarchy, this level of distinction between dummy and non-dummy roots is quite awkward. This patch updates rebind_subsystems() to take the target root and one subsystem mask and move the specified subsystems to the target root which may or may not be the dummy root. IOW, unbinding now becomes moving the subsystems to the dummy root and binding to non-dummy root. This makes the dummy root mostly equivalent to other hierarchies in terms of the mechanism of moving subsystems around; however, we still retain all the semantic restrictions so that this patch doesn't introduce any visible behavior differences. Another noteworthy detail is that rebind_subsystems() guarantees that moving a subsystem to the dummy root never fails so that valid unmounting attempts always succeed. This unifies binding and unbinding of subsystems. The invocation points of ->bind() were inconsistent between the two and are now moved to after the whole rebinding is complete. This doesn't break the current users and generally makes more sense. All rebind_subsystems() users are converted accordingly. Note that cgroup_remount() now makes two calls to rebind_subsystems() to bind and then unbind the requested subsystems. This will allow repurposing of the dummy hierarchy as the default unified hierarchy and shouldn't make any userland visible behavior difference.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 100 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 56 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 78017f52c69b..b632981bd9c7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -175,8 +175,8 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); -static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long added_mask, unsigned removed_mask); +static int rebind_subsystems(struct cgroupfs_root *dst_root, + unsigned long ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], @@ -739,7 +739,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - WARN_ON(rebind_subsystems(root, 0, root->subsys_mask)); + rebind_subsystems(&cgroup_dummy_root, root->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -976,69 +976,77 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) } } -static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long added_mask, unsigned removed_mask) +static int rebind_subsystems(struct cgroupfs_root *dst_root, + unsigned long ss_mask) { - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *dst_top = &dst_root->top_cgroup; struct cgroup_subsys *ss; - int i, ret; + int ssid, ret; lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - /* Check that any added subsystems are currently free */ - for_each_subsys(ss, i) - if ((added_mask & (1 << i)) && ss->root != &cgroup_dummy_root) + for_each_subsys(ss, ssid) { + if (!(ss_mask & (1 << ssid))) + continue; + + /* if @ss is on 
the dummy_root, we can always move it */ + if (ss->root == &cgroup_dummy_root) + continue; + + /* if @ss has non-root cgroups attached to it, can't move */ + if (!list_empty(&ss->root->top_cgroup.children)) return -EBUSY; - ret = cgroup_populate_dir(cgrp, added_mask); - if (ret) - return ret; + /* can't move between two non-dummy roots either */ + if (dst_root != &cgroup_dummy_root) + return -EBUSY; + } + + if (dst_root != &cgroup_dummy_root) { + ret = cgroup_populate_dir(dst_top, ss_mask); + if (ret) + return ret; + } /* * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ mutex_unlock(&cgroup_mutex); - cgroup_clear_dir(cgrp, removed_mask); + for_each_subsys(ss, ssid) + if ((ss_mask & (1 << ssid)) && ss->root != &cgroup_dummy_root) + cgroup_clear_dir(&ss->root->top_cgroup, 1 << ssid); mutex_lock(&cgroup_mutex); - for_each_subsys(ss, i) { - unsigned long bit = 1UL << i; - - if (bit & added_mask) { - /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgroup_css(cgrp, ss)); - BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); - BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); + for_each_subsys(ss, ssid) { + struct cgroupfs_root *src_root; + struct cgroup *src_top; + struct cgroup_subsys_state *css; - rcu_assign_pointer(cgrp->subsys[i], - cgroup_css(cgroup_dummy_top, ss)); - cgroup_css(cgrp, ss)->cgroup = cgrp; + if (!(ss_mask & (1 << ssid))) + continue; - ss->root = root; - if (ss->bind) - ss->bind(cgroup_css(cgrp, ss)); + src_root = ss->root; + src_top = &src_root->top_cgroup; + css = cgroup_css(src_top, ss); - /* refcount was already taken, and we're keeping it */ - root->subsys_mask |= bit; - } else if (bit & removed_mask) { - /* We're removing this subsystem */ - BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); - BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); + WARN_ON(!css || cgroup_css(dst_top, ss)); - if (ss->bind) - ss->bind(cgroup_css(cgroup_dummy_top, ss)); + 
RCU_INIT_POINTER(src_top->subsys[ssid], NULL); + rcu_assign_pointer(dst_top->subsys[ssid], css); + ss->root = dst_root; + css->cgroup = dst_top; - cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; - RCU_INIT_POINTER(cgrp->subsys[i], NULL); + src_root->subsys_mask &= ~(1 << ssid); + dst_root->subsys_mask |= 1 << ssid; - cgroup_subsys[i]->root = &cgroup_dummy_root; - root->subsys_mask &= ~bit; - } + if (ss->bind) + ss->bind(css); } - kernfs_activate(cgrp->kn); + if (dst_root != &cgroup_dummy_root) + kernfs_activate(dst_top->kn); return 0; } @@ -1277,10 +1285,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) goto out_unlock; } - ret = rebind_subsystems(root, added_mask, removed_mask); + ret = rebind_subsystems(root, added_mask); if (ret) goto out_unlock; + rebind_subsystems(&cgroup_dummy_root, removed_mask); + if (opts.release_agent) { spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, opts.release_agent); @@ -1420,7 +1430,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) if (ret) goto destroy_root; - ret = rebind_subsystems(root, ss_mask, 0); + ret = rebind_subsystems(root, ss_mask); if (ret) goto destroy_root; @@ -4026,6 +4036,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); + cgroup_dummy_root.subsys_mask |= 1 << ss->id; + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); } -- cgit v1.2.3 From 944196278d3dea0cece1636de417b56897d9a108 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:54 -0400 Subject: cgroup: move ->subsys_mask from cgroupfs_root to cgroup cgroupfs_root->subsys_mask represents the controllers attached to the hierarchy. This patch moves the field to cgroup. Subsystem initialization and rebinding updates the top cgroup's subsys_mask. 
For !root cgroups, the subsys_mask bits are set from create_css() and cleared from kill_css(), which effectively means that all cgroups will have the same subsys_mask as the top cgroup. While this doesn't make any difference now, this will help implementation of the default unified hierarchy where !root cgroups may have subsets of the top_cgroup's subsys_mask. While at it, __kill_css() is split out of kill_css(). The former doesn't care about the subsys_mask while the latter becomes noop if the controller is already killed and clears the matching bit if not before proceeding to killing the css. This will be used later by the default unified hierarchy implementation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 61 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b632981bd9c7..807f88dbda51 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -526,7 +526,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->subsys_mask & (1UL << i)) { + if (root->top_cgroup.subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. 
So we want * the subsystem state from the new * cgroup */ @@ -739,7 +739,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgroup_dummy_root, root->subsys_mask); + rebind_subsystems(&cgroup_dummy_root, cgrp->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -1038,8 +1038,8 @@ static int rebind_subsystems(struct cgroupfs_root *dst_root, ss->root = dst_root; css->cgroup = dst_top; - src_root->subsys_mask &= ~(1 << ssid); - dst_root->subsys_mask |= 1 << ssid; + src_top->subsys_mask &= ~(1 << ssid); + dst_top->subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -1058,7 +1058,7 @@ static int cgroup_show_options(struct seq_file *seq, int ssid; for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) + if (root->top_cgroup.subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ -1262,12 +1262,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->top_cgroup.subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->top_cgroup.subsys_mask; + removed_mask = root->top_cgroup.subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || @@ -1515,7 +1515,7 @@ retry: * subsystems) then they must match. 
*/ if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { + (opts.subsys_mask != root->top_cgroup.subsys_mask)) { if (!name_match) continue; ret = -EBUSY; @@ -3594,6 +3594,8 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) cgroup_get(cgrp); css_get(css->parent); + cgrp->subsys_mask |= 1 << ss->id; + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", @@ -3700,7 +3702,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->subsys_mask & (1 << ssid)) { + if (root->top_cgroup.subsys_mask & (1 << ssid)) { err = create_css(cgrp, ss); if (err) goto err_destroy; @@ -3788,17 +3790,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) queue_work(cgroup_destroy_wq, &css->destroy_work); } -/** - * kill_css - destroy a css - * @css: css to destroy - * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. - */ -static void kill_css(struct cgroup_subsys_state *css) +static void __kill_css(struct cgroup_subsys_state *css) { + lockdep_assert_held(&cgroup_tree_mutex); + /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. @@ -3824,6 +3819,28 @@ static void kill_css(struct cgroup_subsys_state *css) percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. 
->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) +{ + struct cgroup *cgrp = css->cgroup; + + lockdep_assert_held(&cgroup_tree_mutex); + + /* if already killed, noop */ + if (cgrp->subsys_mask & (1 << css->ss->id)) { + cgrp->subsys_mask &= ~(1 << css->ss->id); + __kill_css(css); + } +} + /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed @@ -4036,7 +4053,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - cgroup_dummy_root.subsys_mask |= 1 << ss->id; + cgroup_dummy_root.top_cgroup.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4192,7 +4209,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) + if (root->top_cgroup.subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", -- cgit v1.2.3 From 3dd06ffa9df99aa88f4e01eaa0c9d3cb362430b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:54 -0400 Subject: cgroup: rename cgroup_dummy_root and related names The dummy root will be repurposed to serve as the default unified hierarchy. Let's rename things in preparation. * s/cgroup_dummy_root/cgrp_dfl_root/ * s/cgroupfs_root/cgroup_root/ as we don't do fs part directly anymore * s/cgroup_root->top_cgroup/cgroup_root->cgrp/ for brevity This is pure rename. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 168 +++++++++++++++++++++++++++----------------------------- 1 file changed, 81 insertions(+), 87 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 807f88dbda51..60ea16058c42 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,14 +138,11 @@ static const char *cgroup_subsys_name[] = { #undef SUBSYS /* - * The dummy hierarchy, reserved for the subsystems that are otherwise + * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -static struct cgroupfs_root cgroup_dummy_root; - -/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ -static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; +static struct cgroup_root cgrp_dfl_root; /* The list of hierarchy roots */ @@ -175,7 +172,7 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); -static int rebind_subsystems(struct cgroupfs_root *dst_root, +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); @@ -514,7 +511,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state *template[]) { - struct cgroupfs_root *root = cgrp->root; + struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; @@ -526,7 +523,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->top_cgroup.subsys_mask & (1UL << i)) { + if (root->cgrp.subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. 
So we want * the subsystem state from the new * cgroup */ @@ -685,14 +682,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } -static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { - struct cgroup *top_cgrp = kf_root->kn->priv; + struct cgroup *root_cgrp = kf_root->kn->priv; - return top_cgrp->root; + return root_cgrp->root; } -static int cgroup_init_root_id(struct cgroupfs_root *root) +static int cgroup_init_root_id(struct cgroup_root *root) { int id; @@ -706,7 +703,7 @@ static int cgroup_init_root_id(struct cgroupfs_root *root) return 0; } -static void cgroup_exit_root_id(struct cgroupfs_root *root) +static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); @@ -716,7 +713,7 @@ static void cgroup_exit_root_id(struct cgroupfs_root *root) } } -static void cgroup_free_root(struct cgroupfs_root *root) +static void cgroup_free_root(struct cgroup_root *root) { if (root) { /* hierarhcy ID shoulid already have been released */ @@ -727,9 +724,9 @@ static void cgroup_free_root(struct cgroupfs_root *root) } } -static void cgroup_destroy_root(struct cgroupfs_root *root) +static void cgroup_destroy_root(struct cgroup_root *root) { - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; mutex_lock(&cgroup_tree_mutex); @@ -739,7 +736,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgroup_dummy_root, cgrp->subsys_mask); + rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -770,7 +767,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup 
*cset_cgroup_from_root(struct css_set *cset, - struct cgroupfs_root *root) + struct cgroup_root *root) { struct cgroup *res = NULL; @@ -778,7 +775,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, lockdep_assert_held(&css_set_rwsem); if (cset == &init_css_set) { - res = &root->top_cgroup; + res = &root->cgrp; } else { struct cgrp_cset_link *link; @@ -801,7 +798,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, * called with cgroup_mutex and css_set_rwsem held. */ static struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroupfs_root *root) + struct cgroup_root *root) { /* * No need to lock the task - since we hold cgroup_mutex the @@ -837,9 +834,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at - * least one task in the system (init, pid == 1), therefore, top_cgroup + * least one task in the system (init, pid == 1), therefore, root cgroup * always has either children cgroups and/or using tasks. So we don't - * need a special hack to ensure that top_cgroup cannot be deleted. + * need a special hack to ensure that root cgroup cannot be deleted. * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() @@ -905,7 +902,7 @@ static void cgroup_free_fn(struct work_struct *work) kfree(cgrp); } else { /* - * This is top cgroup's refcnt reaching zero, which + * This is root cgroup's refcnt reaching zero, which * indicates that the root should be released. 
*/ cgroup_destroy_root(cgrp->root); @@ -976,10 +973,9 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) } } -static int rebind_subsystems(struct cgroupfs_root *dst_root, +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask) { - struct cgroup *dst_top = &dst_root->top_cgroup; struct cgroup_subsys *ss; int ssid, ret; @@ -991,20 +987,20 @@ static int rebind_subsystems(struct cgroupfs_root *dst_root, continue; /* if @ss is on the dummy_root, we can always move it */ - if (ss->root == &cgroup_dummy_root) + if (ss->root == &cgrp_dfl_root) continue; /* if @ss has non-root cgroups attached to it, can't move */ - if (!list_empty(&ss->root->top_cgroup.children)) + if (!list_empty(&ss->root->cgrp.children)) return -EBUSY; /* can't move between two non-dummy roots either */ - if (dst_root != &cgroup_dummy_root) + if (dst_root != &cgrp_dfl_root) return -EBUSY; } - if (dst_root != &cgroup_dummy_root) { - ret = cgroup_populate_dir(dst_top, ss_mask); + if (dst_root != &cgrp_dfl_root) { + ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); if (ret) return ret; } @@ -1015,50 +1011,48 @@ static int rebind_subsystems(struct cgroupfs_root *dst_root, */ mutex_unlock(&cgroup_mutex); for_each_subsys(ss, ssid) - if ((ss_mask & (1 << ssid)) && ss->root != &cgroup_dummy_root) - cgroup_clear_dir(&ss->root->top_cgroup, 1 << ssid); + if ((ss_mask & (1 << ssid)) && ss->root != &cgrp_dfl_root) + cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); mutex_lock(&cgroup_mutex); for_each_subsys(ss, ssid) { - struct cgroupfs_root *src_root; - struct cgroup *src_top; + struct cgroup_root *src_root; struct cgroup_subsys_state *css; if (!(ss_mask & (1 << ssid))) continue; src_root = ss->root; - src_top = &src_root->top_cgroup; - css = cgroup_css(src_top, ss); + css = cgroup_css(&src_root->cgrp, ss); - WARN_ON(!css || cgroup_css(dst_top, ss)); + WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); - RCU_INIT_POINTER(src_top->subsys[ssid], NULL); - 
rcu_assign_pointer(dst_top->subsys[ssid], css); + RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); + rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); ss->root = dst_root; - css->cgroup = dst_top; + css->cgroup = &dst_root->cgrp; - src_top->subsys_mask &= ~(1 << ssid); - dst_top->subsys_mask |= 1 << ssid; + src_root->cgrp.subsys_mask &= ~(1 << ssid); + dst_root->cgrp.subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); } - if (dst_root != &cgroup_dummy_root) - kernfs_activate(dst_top->kn); + if (dst_root != &cgrp_dfl_root) + kernfs_activate(dst_root->cgrp.kn); return 0; } static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { - struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; for_each_subsys(ss, ssid) - if (root->top_cgroup.subsys_mask & (1 << ssid)) + if (root->cgrp.subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ -1072,7 +1066,7 @@ static int cgroup_show_options(struct seq_file *seq, seq_printf(seq, ",release_agent=%s", root->release_agent_path); spin_unlock(&release_agent_path_lock); - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); @@ -1245,7 +1239,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; - struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; @@ -1262,12 +1256,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask 
!= root->top_cgroup.subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->top_cgroup.subsys_mask; - removed_mask = root->top_cgroup.subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; + removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || @@ -1280,7 +1274,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->top_cgroup.children)) { + if (!list_empty(&root->cgrp.children)) { ret = -EBUSY; goto out_unlock; } @@ -1289,7 +1283,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - rebind_subsystems(&cgroup_dummy_root, removed_mask); + rebind_subsystems(&cgrp_dfl_root, removed_mask); if (opts.release_agent) { spin_lock(&release_agent_path_lock); @@ -1368,10 +1362,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dummy_css.cgroup = cgrp; } -static void init_cgroup_root(struct cgroupfs_root *root, +static void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) { - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD(&root->root_list); atomic_set(&root->nr_cgrps, 1); @@ -1385,13 +1379,13 @@ static void init_cgroup_root(struct cgroupfs_root *root, if (opts->name) strcpy(root->name, opts->name); if (opts->cpuset_clone_children) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) +static int 
cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) { LIST_HEAD(tmp_links); - struct cgroup *root_cgrp = &root->top_cgroup; + struct cgroup *root_cgrp = &root->cgrp; struct css_set *cset; int i, ret; @@ -1443,7 +1437,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) cgroup_root_count++; /* - * Link the top cgroup in this hierarchy into all the css_set + * Link the root cgroup in this hierarchy into all the css_set * objects. */ down_write(&css_set_rwsem); @@ -1472,7 +1466,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { - struct cgroupfs_root *root; + struct cgroup_root *root; struct cgroup_sb_opts opts; struct dentry *dentry; int ret; @@ -1496,7 +1490,7 @@ retry: for_each_root(root) { bool name_match = false; - if (root == &cgroup_dummy_root) + if (root == &cgrp_dfl_root) continue; /* @@ -1515,7 +1509,7 @@ retry: * subsystems) then they must match. */ if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->top_cgroup.subsys_mask)) { + (opts.subsys_mask != root->cgrp.subsys_mask)) { if (!name_match) continue; ret = -EBUSY; @@ -1533,13 +1527,13 @@ retry: } /* - * A root's lifetime is governed by its top cgroup. Zero + * A root's lifetime is governed by its root cgroup. Zero * ref indicate that the root is being destroyed. Wait for * destruction to complete so that the subsystems are free. * We can use wait_queue for the wait but this path is * super cold. Let's just sleep for a bit and retry. 
*/ - if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) { + if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); kfree(opts.release_agent); @@ -1586,16 +1580,16 @@ out_unlock: dentry = kernfs_mount(fs_type, flags, root->kf_root); if (IS_ERR(dentry)) - cgroup_put(&root->top_cgroup); + cgroup_put(&root->cgrp); return dentry; } static void cgroup_kill_sb(struct super_block *sb) { struct kernfs_root *kf_root = kernfs_root_from_sb(sb); - struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); - cgroup_put(&root->top_cgroup); + cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } @@ -1622,7 +1616,7 @@ static struct kobject *cgroup_kobj; */ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { - struct cgroupfs_root *root; + struct cgroup_root *root; struct cgroup *cgrp; int hierarchy_id = 1; char *path = NULL; @@ -2112,14 +2106,14 @@ out_unlock_cgroup: */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { - struct cgroupfs_root *root; + struct cgroup_root *root; int retval = 0; mutex_lock(&cgroup_mutex); for_each_root(root) { struct cgroup *from_cgrp; - if (root == &cgroup_dummy_root) + if (root == &cgrp_dfl_root) continue; down_read(&css_set_rwsem); @@ -2151,7 +2145,7 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, static int cgroup_release_agent_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct cgroupfs_root *root = css->cgroup->root; + struct cgroup_root *root = css->cgroup->root; BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); if (!cgroup_lock_live_group(css->cgroup)) @@ -2362,14 +2356,14 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; - struct cgroup *root = &ss->root->top_cgroup; + struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; 
int ret = 0; lockdep_assert_held(&cgroup_tree_mutex); /* don't bother if @ss isn't attached */ - if (ss->root == &cgroup_dummy_root) + if (ss->root == &cgrp_dfl_root) return 0; /* add/rm files for all cgroups created before */ @@ -3623,7 +3617,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, umode_t mode) { struct cgroup *cgrp; - struct cgroupfs_root *root = parent->root; + struct cgroup_root *root = parent->root; int ssid, err; struct cgroup_subsys *ss; struct kernfs_node *kn; @@ -3702,7 +3696,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->top_cgroup.subsys_mask & (1 << ssid)) { + if (root->cgrp.subsys_mask & (1 << ssid)) { err = create_css(cgrp, ss); if (err) goto err_destroy; @@ -4031,17 +4025,17 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) INIT_LIST_HEAD(&ss->cfts); - /* Create the top cgroup state for this subsystem */ - ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); + /* Create the root cgroup state for this subsystem */ + ss->root = &cgrp_dfl_root; + css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_css(css, ss, cgroup_dummy_top); + init_css(css, ss, &cgrp_dfl_root.cgrp); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the - * init_css_set is in the subsystem's top cgroup. */ + * init_css_set is in the subsystem's root cgroup. 
*/ init_css_set.subsys[ss->id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4053,7 +4047,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - cgroup_dummy_root.top_cgroup.subsys_mask |= 1 << ss->id; + cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4071,7 +4065,7 @@ int __init cgroup_init_early(void) struct cgroup_subsys *ss; int i; - init_cgroup_root(&cgroup_dummy_root, &opts); + init_cgroup_root(&cgrp_dfl_root, &opts); RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { @@ -4112,7 +4106,7 @@ int __init cgroup_init(void) key = css_set_hash(init_css_set.subsys); hash_add(css_set_table, &init_css_set.hlist, key); - BUG_ON(cgroup_setup_root(&cgroup_dummy_root, 0)); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4181,7 +4175,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) struct task_struct *tsk; char *buf, *path; int retval; - struct cgroupfs_root *root; + struct cgroup_root *root; retval = -ENOMEM; buf = kmalloc(PATH_MAX, GFP_KERNEL); @@ -4204,12 +4198,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgroup_dummy_root) + if (root == &cgrp_dfl_root) continue; seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) - if (root->top_cgroup.subsys_mask & (1 << ssid)) + if (root->cgrp.subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? 
"," : "", @@ -4639,7 +4633,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) struct cgroup *c = link->cgrp; const char *name = "?"; - if (c != cgroup_dummy_top) { + if (c != &cgrp_dfl_root.cgrp) { cgroup_name(c, name_buf, NAME_MAX + 1); name = name_buf; } -- cgit v1.2.3 From 4d3bb511b5f9980fc3e9ae5939ebc475b231d3fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:54 -0400 Subject: cgroup: drop const from @buffer of cftype->write_string() cftype->write_string() just passes on the writeable buffer from kernfs and there's no reason to add const restriction on the buffer. The only thing const achieves is unnecessarily complicating parsing of the buffer. Drop const from @buffer. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- kernel/cgroup.c | 2 +- kernel/cgroup_freezer.c | 2 +- kernel/cpuset.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 60ea16058c42..f5754910e80b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2143,7 +2143,7 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, } static int cgroup_release_agent_write(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) + struct cftype *cft, char *buffer) { struct cgroup_root *root = css->cgroup->root; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2ea98b216bff..2bc4a2256444 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -442,7 +442,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) } static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) + char *buffer) { bool freeze; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8d5324583aa4..efbf9baf77ec 100644 --- 
a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1610,7 +1610,7 @@ out_unlock: * Common handling for a write to a "cpus" or "mems" file. */ static int cpuset_write_resmask(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buf) + struct cftype *cft, char *buf) { struct cpuset *cs = css_cs(css); struct cpuset *trialcs; -- cgit v1.2.3 From a2dd424750807f83632a2a70293961086fd8f870 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:55 -0400 Subject: cgroup: make cgrp_dfl_root mountable cgrp_dfl_root will be used as the default unified hierarchy. This patch makes cgrp_dfl_root mountable by making the following changes. * cgroup_init_early() now initializes cgrp_dfl_root w/ CGRP_ROOT_SANE_BEHAVIOR. The default hierarchy is always sane. * parse_cgroupfs_options() and cgroup_mount() are updated such that cgrp_dfl_root is mounted if sane_behavior is specified w/o any subsystems. * rebind_subsystems() now populates the root directory of cgrp_dfl_root. Note that the function still guarantees success of rebinding subsystems to cgrp_dfl_root. If populating fails while rebinding to cgrp_dfl_root, it whines but ignores the error. * For backward compatibility, the default hierarchy shows up in /proc/$PID/cgroup only after it's explicitly mounted so that userland which doesn't make use of it doesn't see any change. * "current_css_set_cg_links" file of debug cgroup now treats the default hierarchy the same as other hierarchies. This is visible to userland. Given that it's for debug controller, this should be fine. * While at it, implement cgroup_on_dfl() which tests whether a given cgroup is on the default hierarchy or not. The above changes make cgrp_dfl_root mostly equivalent to other controllers but the actual unified hierarchy behaviors are not implemented yet. Let's plug child cgroup creation in cgrp_dfl_root from cgroup_create() for now. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 94 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f5754910e80b..69b4939e9f6d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -142,7 +142,13 @@ static const char *cgroup_subsys_name[] = { * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -static struct cgroup_root cgrp_dfl_root; +struct cgroup_root cgrp_dfl_root; + +/* + * The default hierarchy always exists but is hidden until mounted for the + * first time. This is for backward compatibility. + */ +static bool cgrp_dfl_root_visible; /* The list of hierarchy roots */ @@ -999,10 +1005,22 @@ static int rebind_subsystems(struct cgroup_root *dst_root, return -EBUSY; } - if (dst_root != &cgrp_dfl_root) { - ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); - if (ret) + ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); + if (ret) { + if (dst_root != &cgrp_dfl_root) return ret; + + /* + * Rebinding back to the default root is not allowed to + * fail. Using both default and non-default roots should + * be rare. Moving subsystems back and forth even more so. + * Just warn about it and continue. 
+ */ + if (cgrp_dfl_root_visible) { + pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", + ret, ss_mask); + pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); + } } /* @@ -1011,7 +1029,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, */ mutex_unlock(&cgroup_mutex); for_each_subsys(ss, ssid) - if ((ss_mask & (1 << ssid)) && ss->root != &cgrp_dfl_root) + if (ss_mask & (1 << ssid)) cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); mutex_lock(&cgroup_mutex); @@ -1039,8 +1057,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->bind(css); } - if (dst_root != &cgrp_dfl_root) - kernfs_activate(dst_root->cgrp.kn); + kernfs_activate(dst_root->cgrp.kn); return 0; } @@ -1190,16 +1207,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options - * were not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (!ss->disabled) - set_bit(i, &opts->subsys_mask); - /* Consistency checks */ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { @@ -1211,6 +1218,23 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } + } else { + /* + * If the 'all' option was specified select all the + * subsystems, otherwise if 'none', 'name=' and a subsystem + * name options were not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (!ss->disabled) + set_bit(i, &opts->subsys_mask); + + /* + * We either have to specify by name or by subsystems. (So + * all empty hierarchies must have a name). 
+ */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; } /* @@ -1226,13 +1250,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->subsys_mask && opts->none) return -EINVAL; - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - return 0; } @@ -1487,6 +1504,14 @@ retry: goto out_unlock; /* look for a matching existing root */ + if (!opts.subsys_mask && !opts.none && !opts.name) { + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + ret = 0; + goto out_unlock; + } + for_each_root(root) { bool name_match = false; @@ -3622,6 +3647,13 @@ static long cgroup_create(struct cgroup *parent, const char *name, struct cgroup_subsys *ss; struct kernfs_node *kn; + /* + * XXX: The default hierarchy isn't fully implemented yet. Block + * !root cgroup creation on it for now. + */ + if (root == &cgrp_dfl_root) + return -EINVAL; + /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) @@ -4061,7 +4093,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) */ int __init cgroup_init_early(void) { - static struct cgroup_sb_opts __initdata opts = { }; + static struct cgroup_sb_opts __initdata opts = + { .flags = CGRP_ROOT_SANE_BEHAVIOR }; struct cgroup_subsys *ss; int i; @@ -4198,7 +4231,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root) + if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) continue; seq_printf(m, "%d:", root->hierarchy_id); @@ -4631,15 +4664,10 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; - const char *name = "?"; - - if (c != &cgrp_dfl_root.cgrp) { - 
cgroup_name(c, name_buf, NAME_MAX + 1); - name = name_buf; - } + cgroup_name(c, name_buf, NAME_MAX + 1); seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name); + c->root->hierarchy_id, name_buf); } rcu_read_unlock(); up_read(&css_set_rwsem); -- cgit v1.2.3 From 8cbbf2c972c4444cad36f61cd571714c39b8cf04 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:55 -0400 Subject: cgroup: implement CFTYPE_ONLY_ON_DFL This cftype flag makes the file only appear on the default hierarchy. This will later be used for cgroup.controllers file. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 69b4939e9f6d..37b6d534b0ca 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2356,6 +2356,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) + continue; if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) -- cgit v1.2.3 From 6404e88e8385638123f4b18b104430480870601a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 7 Mar 2014 09:22:19 -0700 Subject: resources: Set type in __request_region() We don't set the type (I/O, memory, etc.) of resources added by __request_region(), which leads to confusing messages like this: address space collision: [io 0x1000-0x107f] conflicts with ACPI CPU throttle [??? 0x00001010-0x00001015 flags 0x80000000] Set the type of a new resource added by __request_region() (used by request_region() and request_mem_region()) to the type of its parent. 
This makes the resource tree internally consistent and fixes messages like the above, where the ACPI CPU throttle resource really is an I/O port region, but request_region() didn't fill in the type, so %pR didn't know how to print it. Sample dmesg showing the issue at the link below. Link: https://bugzilla.kernel.org/show_bug.cgi?id=71611 Reported-by: Paul Bolle Signed-off-by: Bjorn Helgaas --- kernel/resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index a8344dda7049..673061c06da1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -945,8 +945,8 @@ struct resource * __request_region(struct resource *parent, res->name = name; res->start = start; res->end = start + n - 1; - res->flags = IORESOURCE_BUSY; - res->flags |= flags; + res->flags = resource_type(parent); + res->flags |= IORESOURCE_BUSY | flags; write_lock(&resource_lock); -- cgit v1.2.3 From 1b9aba49eab5e85b0d3de8ba630cda6d68546297 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 17:43:21 -0400 Subject: cgroup: fix cgroup_taskset walking order cgroup_taskset is used to track and iterate target tasks while migrating a task or process and should guarantee that the first task iterated is the task group leader if a process is being migrated. b3dc094e9390 ("cgroup: use css_set->mg_tasks to track target tasks during migration") replaced flex array cgroup_taskset->tc_array with css_set->mg_tasks list to remove process size limit and dynamic allocation during migration; unfortunately, it incorrectly used list operations which don't preserve order breaking the guarantee that cgroup_taskset_first() returns the leader for a process target. Fix it by using order preserving list operations. Note that as multiple src_csets may map to a single dst_cset, the iteration order may change across cgroup_task_migrate(); however, the leader is still guaranteed to be the first entry. 
The switch to list_splice_tail_init() at the end of cgroup_migrate() isn't strictly necessary. Let's still do it for consistency. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 37b6d534b0ca..98a8045e2149 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1761,7 +1761,14 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, get_css_set(new_cset); rcu_assign_pointer(tsk->cgroups, new_cset); - list_move(&tsk->cg_list, &new_cset->mg_tasks); + + /* + * Use move_tail so that cgroup_taskset_first() still returns the + * leader after migration. This works because cgroup_migrate() + * ensures that the dst_cset of the leader is the first on the + * tset's dst_csets list. + */ + list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); /* * We just gained a reference on old_cset by taking it from the @@ -1936,9 +1943,16 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, if (!cset->mg_src_cgrp) goto next; - list_move(&task->cg_list, &cset->mg_tasks); - list_move(&cset->mg_node, &tset.src_csets); - list_move(&cset->mg_dst_cset->mg_node, &tset.dst_csets); + /* + * cgroup_taskset_first() must always return the leader. + * Take care to avoid disturbing the ordering. 
+ */ + list_move_tail(&task->cg_list, &cset->mg_tasks); + if (list_empty(&cset->mg_node)) + list_add_tail(&cset->mg_node, &tset.src_csets); + if (list_empty(&cset->mg_dst_cset->mg_node)) + list_move_tail(&cset->mg_dst_cset->mg_node, + &tset.dst_csets); next: if (!threadgroup) break; @@ -1999,7 +2013,7 @@ out_release_tset: down_write(&css_set_rwsem); list_splice_init(&tset.dst_csets, &tset.src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { - list_splice_init(&cset->mg_tasks, &cset->tasks); + list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } up_write(&css_set_rwsem); -- cgit v1.2.3 From c41eba7de133e43ea2c998ccd484059feab200f6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 18 Mar 2014 13:23:15 +0530 Subject: timer: Use variable head instead of &work_list in __run_timers() We already have a variable 'head' that points to '&work_list', and so we should use that instead wherever possible. Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/0d8645a6efc8360c4196c9797d59343abbfdcc5e.1395129136.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 949d74ea0ce4..8e503fec1fba 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1192,7 +1192,7 @@ static inline void __run_timers(struct tvec_base *base) !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, &work_list); + list_replace_init(base->tv1.vec + index, head); while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; -- cgit v1.2.3 From 6201b4d61fbf194df6371fb3376c5026cb8f5eec Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 18 Mar 2014 16:26:07 +0530 Subject: timer: Remove code redundancy while calling get_nohz_timer_target() There are only two users 
of get_nohz_timer_target(): timer and hrtimer. Both call it under same circumstances, i.e. #ifdef CONFIG_NO_HZ_COMMON if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) return get_nohz_timer_target(); #endif So, it makes more sense to get all this as part of get_nohz_timer_target() instead of duplicating code at two places. For this another parameter is required to be passed to this routine, pinned. Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1e1b53537217d58d48c2d7a222a9c3ac47d5b64c.1395140107.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 15 +-------------- kernel/sched/core.c | 5 ++++- kernel/timer.c | 7 +------ 3 files changed, 6 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 09094361dce5..d55092ceee29 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -168,19 +168,6 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } } - -/* - * Get the preferred target CPU for NOHZ - */ -static int hrtimer_get_target(int this_cpu, int pinned) -{ -#ifdef CONFIG_NO_HZ_COMMON - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) - return get_nohz_timer_target(); -#endif - return this_cpu; -} - /* * With HIGHRES=y we do not migrate the timer when it is expiring * before the next event on the target cpu because we cannot reprogram @@ -214,7 +201,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); - int cpu = hrtimer_get_target(this_cpu, pinned); + int cpu = get_nohz_timer_target(pinned); int basenum = base->index; again: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..c0339e206cc2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -555,12 +555,15 @@ void 
resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(void) +int get_nohz_timer_target(int pinned) { int cpu = smp_processor_id(); int i; struct sched_domain *sd; + if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + return cpu; + rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { diff --git a/kernel/timer.c b/kernel/timer.c index 8e503fec1fba..1d35ddadc045 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -760,12 +760,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - cpu = smp_processor_id(); - -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) - cpu = get_nohz_timer_target(); -#endif + cpu = get_nohz_timer_target(pinned); new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { -- cgit v1.2.3 From a19423b98704aa85e84097be6d1d44a8615c2340 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Tue, 11 Mar 2014 02:04:03 +0530 Subject: CPU hotplug: Add lockdep annotations to get/put_online_cpus() Add lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin()/cpu_hotplug_end(). Cc: Ingo Molnar Reviewed-by: Oleg Nesterov Signed-off-by: Gautham R. Shenoy Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/cpu.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index deff2e693766..33caf5e97701 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "smpboot.h" @@ -57,17 +58,30 @@ static struct { * an ongoing cpu hotplug operation. 
*/ int refcount; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif } cpu_hotplug = { .active_writer = NULL, .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), .refcount = 0, +#ifdef CONFIG_DEBUG_LOCK_ALLOC + .dep_map = {.name = "cpu_hotplug.lock" }, +#endif }; +/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ +#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) +#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) +#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) + void get_online_cpus(void) { might_sleep(); if (cpu_hotplug.active_writer == current) return; + cpuhp_lock_acquire_read(); mutex_lock(&cpu_hotplug.lock); cpu_hotplug.refcount++; mutex_unlock(&cpu_hotplug.lock); @@ -87,6 +101,7 @@ void put_online_cpus(void) if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); + cpuhp_lock_release(); } EXPORT_SYMBOL_GPL(put_online_cpus); @@ -117,6 +132,7 @@ void cpu_hotplug_begin(void) { cpu_hotplug.active_writer = current; + cpuhp_lock_acquire(); for (;;) { mutex_lock(&cpu_hotplug.lock); if (likely(!cpu_hotplug.refcount)) @@ -131,6 +147,7 @@ void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); + cpuhp_lock_release(); } /* -- cgit v1.2.3 From 93ae4f978ca7f26d17df915ac7afc919c1dd0353 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 11 Mar 2014 02:04:14 +0530 Subject: CPU hotplug: Provide lockless versions of callback registration functions The following method of CPU hotplug callback registration is not safe due to the possibility of an ABBA deadlock involving the cpu_add_remove_lock and the cpu_hotplug.lock. 
get_online_cpus(); for_each_online_cpu(cpu) init_cpu(cpu); register_cpu_notifier(&foobar_cpu_notifier); put_online_cpus(); The deadlock is shown below: CPU 0 CPU 1 ----- ----- Acquire cpu_hotplug.lock [via get_online_cpus()] CPU online/offline operation takes cpu_add_remove_lock [via cpu_maps_update_begin()] Try to acquire cpu_add_remove_lock [via register_cpu_notifier()] CPU online/offline operation tries to acquire cpu_hotplug.lock [via cpu_hotplug_begin()] *** DEADLOCK! *** The problem here is that callback registration takes the locks in one order whereas the CPU hotplug operations take the same locks in the opposite order. To avoid this issue and to provide a race-free method to register CPU hotplug callbacks (along with initialization of already online CPUs), introduce new variants of the callback registration APIs that simply register the callbacks without holding the cpu_add_remove_lock during the registration. That way, we can avoid the ABBA scenario. However, we will need to hold the cpu_add_remove_lock throughout the entire critical section, to protect updates to the callback/notifier chain. This can be achieved by writing the callback registration code as follows: cpu_maps_update_begin(); [ or cpu_notifier_register_begin(); see below ] for_each_online_cpu(cpu) init_cpu(cpu); /* This doesn't take the cpu_add_remove_lock */ __register_cpu_notifier(&foobar_cpu_notifier); cpu_maps_update_done(); [ or cpu_notifier_register_done(); see below ] Note that we can't use get_online_cpus() here instead of cpu_maps_update_begin() because the cpu_hotplug.lock is dropped during the invocation of CPU_POST_DEAD notifiers, and hence get_online_cpus() cannot provide the necessary synchronization to protect the callback/notifier chains against concurrent reads and writes. On the other hand, since the cpu_add_remove_lock protects the entire hotplug operation (including CPU_POST_DEAD), we can use cpu_maps_update_begin/done() to guarantee proper synchronization. 
Also, since cpu_maps_update_begin/done() is like a super-set of get/put_online_cpus(), the former naturally protects the critical sections from concurrent hotplug operations. Since the names cpu_maps_update_begin/done() don't make much sense in CPU hotplug callback registration scenarios, we'll introduce new APIs named cpu_notifier_register_begin/done() and map them to cpu_maps_update_begin/done(). In summary, introduce the lockless variants of un/register_cpu_notifier() and also export the cpu_notifier_register_begin/done() APIs for use by modules. This way, we provide a race-free way to register hotplug callbacks as well as perform initialization for the CPUs that are already online. Cc: Thomas Gleixner Cc: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Acked-by: Oleg Nesterov Acked-by: Toshi Kani Reviewed-by: Gautham R. Shenoy Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/cpu.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 33caf5e97701..a9e710eef0e2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -28,18 +28,23 @@ static DEFINE_MUTEX(cpu_add_remove_lock); /* - * The following two API's must be used when attempting - * to serialize the updates to cpu_online_mask, cpu_present_mask. + * The following two APIs (cpu_maps_update_begin/done) must be used when + * attempting to serialize the updates to cpu_online_mask & cpu_present_mask. + * The APIs cpu_notifier_register_begin/done() must be used to protect CPU + * hotplug callback (un)registration performed using __register_cpu_notifier() + * or __unregister_cpu_notifier(). 
*/ void cpu_maps_update_begin(void) { mutex_lock(&cpu_add_remove_lock); } +EXPORT_SYMBOL(cpu_notifier_register_begin); void cpu_maps_update_done(void) { mutex_unlock(&cpu_add_remove_lock); } +EXPORT_SYMBOL(cpu_notifier_register_done); static RAW_NOTIFIER_HEAD(cpu_chain); @@ -183,6 +188,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } +int __ref __register_cpu_notifier(struct notifier_block *nb) +{ + return raw_notifier_chain_register(&cpu_chain, nb); +} + static int __cpu_notify(unsigned long val, void *v, int nr_to_call, int *nr_calls) { @@ -206,6 +216,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) BUG_ON(cpu_notify(val, v)); } EXPORT_SYMBOL(register_cpu_notifier); +EXPORT_SYMBOL(__register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) { @@ -215,6 +226,12 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +void __ref __unregister_cpu_notifier(struct notifier_block *nb) +{ + raw_notifier_chain_unregister(&cpu_chain, nb); +} +EXPORT_SYMBOL(__unregister_cpu_notifier); + /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id -- cgit v1.2.3 From d39ad278a3001c860da4d7c13e51259b1904bec5 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 11 Mar 2014 02:11:56 +0530 Subject: trace, ring-buffer: Fix CPU hotplug callback registration Subsystems that want to register CPU hotplug callbacks, as well as perform initialization for the CPUs that are already online, often do it as shown below: get_online_cpus(); for_each_online_cpu(cpu) init_cpu(cpu); register_cpu_notifier(&foobar_cpu_notifier); put_online_cpus(); This is wrong, since it is prone to ABBA deadlocks involving the cpu_add_remove_lock and the cpu_hotplug.lock (when running concurrently with CPU hotplug operations). 
Instead, the correct and race-free way of performing the callback registration is: cpu_notifier_register_begin(); for_each_online_cpu(cpu) init_cpu(cpu); /* Note the use of the double underscored version of the API */ __register_cpu_notifier(&foobar_cpu_notifier); cpu_notifier_register_done(); Fix the tracing ring-buffer code by using this latter form of callback registration. Cc: Frederic Weisbecker Cc: Ingo Molnar Acked-by: Steven Rostedt Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/trace/ring_buffer.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fc4da2d97f9b..c634868c2921 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1301,7 +1301,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, * In that off case, we need to allocate for all possible cpus. */ #ifdef CONFIG_HOTPLUG_CPU - get_online_cpus(); + cpu_notifier_register_begin(); cpumask_copy(buffer->cpumask, cpu_online_mask); #else cpumask_copy(buffer->cpumask, cpu_possible_mask); @@ -1324,10 +1324,10 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, #ifdef CONFIG_HOTPLUG_CPU buffer->cpu_notify.notifier_call = rb_cpu_notify; buffer->cpu_notify.priority = 0; - register_cpu_notifier(&buffer->cpu_notify); + __register_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_done(); #endif - put_online_cpus(); mutex_init(&buffer->mutex); return buffer; @@ -1341,7 +1341,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif fail_free_buffer: kfree(buffer); @@ -1358,16 +1360,17 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; - get_online_cpus(); - #ifdef CONFIG_HOTPLUG_CPU - 
unregister_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&buffer->cpu_notify); #endif for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); - put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); -- cgit v1.2.3 From c270a817196a9374a2dc730624d1501dced40b4d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 11 Mar 2014 02:12:08 +0530 Subject: profile: Fix CPU hotplug callback registration Subsystems that want to register CPU hotplug callbacks, as well as perform initialization for the CPUs that are already online, often do it as shown below: get_online_cpus(); for_each_online_cpu(cpu) init_cpu(cpu); register_cpu_notifier(&foobar_cpu_notifier); put_online_cpus(); This is wrong, since it is prone to ABBA deadlocks involving the cpu_add_remove_lock and the cpu_hotplug.lock (when running concurrently with CPU hotplug operations). Instead, the correct and race-free way of performing the callback registration is: cpu_notifier_register_begin(); for_each_online_cpu(cpu) init_cpu(cpu); /* Note the use of the double underscored version of the API */ __register_cpu_notifier(&foobar_cpu_notifier); cpu_notifier_register_done(); Fix the profile code by using this latter form of callback registration. Cc: Al Viro Cc: Mauro Carvalho Chehab Cc: Ingo Molnar Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. 
Wysocki --- kernel/profile.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index ebdd9c1a86b4..93b2a3fe0a64 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -591,18 +591,28 @@ out_cleanup: int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ { struct proc_dir_entry *entry; + int err = 0; if (!prof_on) return 0; - if (create_hash_tables()) - return -ENOMEM; + + cpu_notifier_register_begin(); + + if (create_hash_tables()) { + err = -ENOMEM; + goto out; + } + entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) - return 0; + goto out; proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - hotcpu_notifier(profile_cpu_callback, 0); - return 0; + __hotcpu_notifier(profile_cpu_callback, 0); + +out: + cpu_notifier_register_done(); + return err; } module_init(create_proc_profile); #endif /* CONFIG_PROC_FS */ -- cgit v1.2.3 From 3f1c82502c299da08b7b7f08b435212e51166ed9 Mon Sep 17 00:00:00 2001 From: William Roberts Date: Tue, 11 Feb 2014 10:12:01 -0800 Subject: audit: Audit proc/<pid>/cmdline aka proctitle During an audit event, cache and print the value of the process's proctitle value (proc/<pid>/cmdline). This is useful in situations where processes are started via fork'd virtual machines where the comm field is incorrect. Often times, setting the comm field still is insufficient as the comm width is not very wide and most virtual machine "package names" do not fit. Also, during execution, many threads have their comm field set as well. By tying it back to the global cmdline value for the process, audit records will be more complete in systems with these properties. An example of where this is useful and applicable is in the realm of Android. With Android, there is no fork/exec for VM instances. The bare, preloaded Dalvik VM listens for a fork and specialize request.
When this request comes in, the VM forks, and the loads the specific application (specializing). This was done to take advantage of COW and to not require a load of basic packages by the VM on very app spawn. When this spawn occurs, the package name is set via setproctitle() and shows up in procfs. Many of these package names are longer then 16 bytes, the historical width of task->comm. Having the cmdline in the audit records will couple the application back to the record directly. Also, on my Debian development box, some audit records were more useful then what was printed under comm. The cached proctitle is tied to the life-cycle of the audit_context structure and is built on demand. Proctitle is controllable by userspace, and thus should not be trusted. It is meant as an aid to assist in debugging. The proctitle event is emitted during syscall audits, and can be filtered with auditctl. Example: type=AVC msg=audit(1391217013.924:386): avc: denied { getattr } for pid=1971 comm="mkdir" name="/" dev="selinuxfs" ino=1 scontext=system_u:system_r:consolekit_t:s0-s0:c0.c255 tcontext=system_u:object_r:security_t:s0 tclass=filesystem type=SYSCALL msg=audit(1391217013.924:386): arch=c000003e syscall=137 success=yes exit=0 a0=7f019dfc8bd7 a1=7fffa6aed2c0 a2=fffffffffff4bd25 a3=7fffa6aed050 items=0 ppid=1967 pid=1971 auid=4294967295 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=(none) ses=4294967295 comm="mkdir" exe="/bin/mkdir" subj=system_u:system_r:consolekit_t:s0-s0:c0.c255 key=(null) type=UNKNOWN[1327] msg=audit(1391217013.924:386): proctitle=6D6B646972002D70002F7661722F72756E2F636F6E736F6C65 Acked-by: Steve Grubb (wrt record formating) Signed-off-by: William Roberts Signed-off-by: Eric Paris --- kernel/audit.h | 6 +++++ kernel/auditsc.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 57cc64d67718..38c967d28de5 100644 --- 
a/kernel/audit.h +++ b/kernel/audit.h @@ -106,6 +106,11 @@ struct audit_names { bool should_free; }; +struct audit_proctitle { + int len; /* length of the cmdline field. */ + char *value; /* the cmdline field */ +}; + /* The per-task audit context. */ struct audit_context { int dummy; /* must be the first element */ @@ -202,6 +207,7 @@ struct audit_context { } execve; }; int fds[2]; + struct audit_proctitle proctitle; #if AUDIT_DEBUG int put_count; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6874c1fd453d..043d1ef9362f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -70,6 +70,7 @@ #include #include #include +#include #include "audit.h" @@ -81,6 +82,9 @@ /* no execve audit message should be longer than this (userspace limits) */ #define MAX_EXECVE_AUDIT_LEN 7500 +/* max length to print of cmdline/proctitle value during audit */ +#define MAX_PROCTITLE_AUDIT_LEN 128 + /* number of audit rules */ int audit_n_rules; @@ -844,6 +848,13 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, return context; } +static inline void audit_proctitle_free(struct audit_context *context) +{ + kfree(context->proctitle.value); + context->proctitle.value = NULL; + context->proctitle.len = 0; +} + static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; @@ -956,6 +967,7 @@ static inline void audit_free_context(struct audit_context *context) audit_free_aux(context); kfree(context->filterkey); kfree(context->sockaddr); + audit_proctitle_free(context); kfree(context); } @@ -1272,6 +1284,59 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } +static inline int audit_proctitle_rtrim(char *proctitle, int len) +{ + char *end = proctitle + len - 1; + while (end > proctitle && !isprint(*end)) + end--; + + /* catch the case where proctitle is only 1 non-print character */ + len = end - proctitle + 1; + len -= isprint(proctitle[len-1]) == 0; + return len; +} + 
+static void audit_log_proctitle(struct task_struct *tsk, + struct audit_context *context) +{ + int res; + char *buf; + char *msg = "(null)"; + int len = strlen(msg); + struct audit_buffer *ab; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); + if (!ab) + return; /* audit_panic or being filtered */ + + audit_log_format(ab, "proctitle="); + + /* Not cached */ + if (!context->proctitle.value) { + buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL); + if (!buf) + goto out; + /* Historically called this from procfs naming */ + res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN); + if (res == 0) { + kfree(buf); + goto out; + } + res = audit_proctitle_rtrim(buf, res); + if (res == 0) { + kfree(buf); + goto out; + } + context->proctitle.value = buf; + context->proctitle.len = res; + } + msg = context->proctitle.value; + len = context->proctitle.len; +out: + audit_log_n_untrustedstring(ab, msg, len); + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -1389,6 +1454,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_name(context, n, NULL, i++, &call_panic); } + audit_log_proctitle(tsk, context); + /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); if (ab) -- cgit v1.2.3 From 638a0fd2a062568c568661be0a780be8e8836d03 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Feb 2014 10:49:05 -0800 Subject: audit: Use struct net not pid_t to remember the network namespce to reply in While reading through 3.14-rc1 I found a pretty siginficant mishandling of network namespaces in the recent audit changes. In struct audit_netlink_list and audit_reply add a reference to the network namespace of the caller and remove the userspace pid of the caller. 
This cleanly remembers the callers network namespace, and removes a huge class of races and nasty failure modes that can occur when attempting to relook up the callers network namespace from a pid_t (including the caller's network namespace changing, pid wraparound, and the pid simply not being present). Signed-off-by: "Eric W. Biederman" Acked-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 10 ++++++---- kernel/audit.h | 2 +- kernel/auditfilter.c | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 34c5a2310fbf..71fb41f393d7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -182,7 +182,7 @@ struct audit_buffer { struct audit_reply { __u32 portid; - pid_t pid; + struct net *net; struct sk_buff *skb; }; @@ -500,7 +500,7 @@ int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; - struct net *net = get_net_ns_by_pid(dest->pid); + struct net *net = dest->net; struct audit_net *aunet = net_generic(net, audit_net_id); /* wait for parent to finish and send an ACK */ @@ -510,6 +510,7 @@ int audit_send_list(void *_dest) while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(aunet->nlsk, skb, dest->portid, 0); + put_net(net); kfree(dest); return 0; @@ -543,7 +544,7 @@ out_kfree_skb: static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct net *net = get_net_ns_by_pid(reply->pid); + struct net *net = reply->net; struct audit_net *aunet = net_generic(net, audit_net_id); mutex_lock(&audit_cmd_mutex); @@ -552,6 +553,7 @@ static int audit_send_reply_thread(void *arg) /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. 
*/ netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0); + put_net(net); kfree(reply); return 0; } @@ -583,8 +585,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done, if (!skb) goto out; + reply->net = get_net(current->nsproxy->net_ns); reply->portid = portid; - reply->pid = task_pid_vnr(current); reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); diff --git a/kernel/audit.h b/kernel/audit.h index 38c967d28de5..7bb65730c890 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -253,7 +253,7 @@ extern void audit_panic(const char *message); struct audit_netlink_list { __u32 portid; - pid_t pid; + struct net *net; struct sk_buff_head q; }; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 3152d1aea164..a0d470131fd0 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "audit.h" /* @@ -1085,8 +1086,8 @@ int audit_list_rules_send(__u32 portid, int seq) dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; + dest->net = get_net(current->nsproxy->net_ns); dest->portid = portid; - dest->pid = task_pid_vnr(current); skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); -- cgit v1.2.3 From 099dd235113700bbb476e572cd191ddb77b9af46 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Feb 2014 20:36:55 -0800 Subject: audit: Send replies in the proper network namespace. In perverse cases of file descriptor passing the current network namespace of a process and the network namespace of a socket used by that process may differ. Therefore use the network namespace of the appropriate socket to ensure replies always go to the appropriate socket. Signed-off-by: "Eric W.
Biederman" Acked-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 21 ++++++++++----------- kernel/auditfilter.c | 7 +++++-- 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 71fb41f393d7..7b44bd47759c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -570,9 +570,11 @@ static int audit_send_reply_thread(void *arg) * Allocates an skb, builds the netlink message, and sends it to the port id. * No failure notifications. */ -static void audit_send_reply(__u32 portid, int seq, int type, int done, +static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct sk_buff *skb; struct task_struct *tsk; struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), @@ -585,7 +587,7 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done, if (!skb) goto out; - reply->net = get_net(current->nsproxy->net_ns); + reply->net = get_net(net); reply->portid = portid; reply->skb = skb; @@ -675,8 +677,7 @@ static int audit_get_feature(struct sk_buff *skb) seq = nlmsg_hdr(skb)->nlmsg_seq; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &af, sizeof(af)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); return 0; } @@ -796,8 +797,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.backlog = skb_queue_len(&audit_skb_queue); s.version = AUDIT_VERSION_LATEST; s.backlog_wait_time = audit_backlog_wait_time; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { @@ -907,7 +907,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq, data, nlmsg_len(nlh)); break; case AUDIT_LIST_RULES: - err = 
audit_list_rules_send(NETLINK_CB(skb).portid, seq); + err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); @@ -972,8 +972,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, - 0, 0, sig_data, sizeof(*sig_data) + len); + audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, + sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; case AUDIT_TTY_GET: { @@ -985,8 +985,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.log_passwd = tsk->signal->audit_tty_log_passwd; spin_unlock(&tsk->sighand->siglock); - audit_send_reply(NETLINK_CB(skb).portid, seq, - AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a0d470131fd0..549bbb6e6597 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "audit.h" /* @@ -1071,8 +1072,10 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, * @portid: target portid for netlink audit messages * @seq: netlink audit message sequence (serial) number */ -int audit_list_rules_send(__u32 portid, int seq) +int audit_list_rules_send(struct sk_buff *request_skb, int seq) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct task_struct *tsk; struct audit_netlink_list *dest; int err = 0; @@ -1086,7 +1089,7 @@ int audit_list_rules_send(__u32 portid, int seq) dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; - dest->net = get_net(current->nsproxy->net_ns); + dest->net = get_net(net); dest->portid = portid; skb_queue_head_init(&dest->q); -- cgit v1.2.3 From 4a3eb726d1543c4b616b9a0a4d4c53ddd276f5f4 Mon Sep 17 
00:00:00 2001 From: Richard Guy Briggs Date: Tue, 18 Feb 2014 15:29:43 -0500 Subject: audit: rename the misleading audit_get_context() to audit_take_context() "get" usually implies incrementing a refcount into a structure to indicate a reference being held by another part of code. Change this function name to indicate it is in fact being taken from it, returning the value while clearing it in the supplying structure. Signed-off-by: Richard Guy Briggs --- kernel/auditsc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 043d1ef9362f..57bf178ca7d5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -811,7 +811,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) rcu_read_unlock(); } -static inline struct audit_context *audit_get_context(struct task_struct *tsk, +/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */ +static inline struct audit_context *audit_take_context(struct task_struct *tsk, int return_valid, long return_code) { @@ -1474,7 +1475,7 @@ void __audit_free(struct task_struct *tsk) { struct audit_context *context; - context = audit_get_context(tsk, 0, 0); + context = audit_take_context(tsk, 0, 0); if (!context) return; @@ -1568,7 +1569,7 @@ void __audit_syscall_exit(int success, long return_code) else success = AUDITSC_FAILURE; - context = audit_get_context(tsk, success, return_code); + context = audit_take_context(tsk, success, return_code); if (!context) return; -- cgit v1.2.3 From c92cdeb45eea38515e82187f48c2e4f435fb4e25 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 10 Dec 2013 22:10:41 -0500 Subject: audit: convert PPIDs to the inital PID namespace. sys_getppid() returns the parent pid of the current process in its own pid namespace. 
Since audit filters are based in the init pid namespace, a process could avoid a filter or trigger an unintended one by being in an alternate pid namespace or log meaningless information. Switch to task_ppid_nr() for PPIDs to anchor all audit filters in the init_pid_ns. (informed by ebiederman's 6c621b7e) Cc: stable@vger.kernel.org Cc: Eric W. Biederman Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 4 ++-- kernel/auditsc.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7b44bd47759c..e1e1b2137048 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1819,10 +1819,10 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) spin_unlock_irq(&tsk->sighand->siglock); audit_log_format(ab, - " ppid=%ld pid=%d auid=%u uid=%u gid=%u" + " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", - sys_getppid(), + task_ppid_nr(tsk), tsk->pid, from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 57bf178ca7d5..a6cf7ab56e61 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -465,7 +465,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_PPID: if (ctx) { if (!ctx->ppid) - ctx->ppid = sys_getppid(); + ctx->ppid = task_ppid_nr(tsk); result = audit_comparator(ctx->ppid, f->op, f->val); } break; -- cgit v1.2.3 From f1dc4867ff41b7bcca57fa19449d1fe7ad517ac1 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 11 Dec 2013 13:52:26 -0500 Subject: audit: anchor all pid references in the initial pid namespace Store and log all PIDs with reference to the initial PID namespace and use the access functions task_pid_nr() and task_tgid_nr() for task->pid and task->tgid. Cc: "Eric W. 
Biederman" (informed by ebiederman's c776b5d2) Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 5 +++-- kernel/auditfilter.c | 17 ++++++++++++++++- kernel/auditsc.c | 16 +++++++++------- 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e1e1b2137048..5a096f8e28cb 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -649,6 +649,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { int rc = 0; uid_t uid = from_kuid(&init_user_ns, current_uid()); + pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; @@ -658,7 +659,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return rc; - audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid); + audit_log_format(*ab, "pid=%d uid=%u", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); @@ -1823,7 +1824,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(tsk), - tsk->pid, + task_pid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 549bbb6e6597..96c8a704f130 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -433,6 +433,19 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->val = 0; } + if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) { + struct pid *pid; + rcu_read_lock(); + pid = find_vpid(f->val); + if (!pid) { + rcu_read_unlock(); + err = -ESRCH; + goto exit_free; + } + f->val = pid_nr(pid); + rcu_read_unlock(); + } + err = audit_field_valid(entry, f); if (err) goto exit_free; @@ -1242,12 +1255,14 @@ static int audit_filter_user_rules(struct 
audit_krule *rule, int type, for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; + pid_t pid; int result = 0; u32 sid; switch (f->type) { case AUDIT_PID: - result = audit_comparator(task_pid_vnr(current), f->op, f->val); + pid = task_pid_nr(current); + result = audit_comparator(pid, f->op, f->val); break; case AUDIT_UID: result = audit_uid_comparator(current_uid(), f->op, f->uid); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a6cf7ab56e61..6381f25ac3d4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -457,10 +457,12 @@ static int audit_filter_rules(struct task_struct *tsk, struct audit_field *f = &rule->fields[i]; struct audit_names *n; int result = 0; + pid_t pid; switch (f->type) { case AUDIT_PID: - result = audit_comparator(tsk->pid, f->op, f->val); + pid = task_pid_nr(tsk); + result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: if (ctx) { @@ -2051,7 +2053,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, audit_log_format(ab, "pid=%d uid=%u" " old-auid=%u new-auid=%u old-ses=%u new-ses=%u" " res=%d", - current->pid, uid, + task_pid_nr(current), uid, oldloginuid, loginuid, oldsessionid, sessionid, !rc); audit_log_end(ab); @@ -2275,7 +2277,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; - context->target_pid = t->pid; + context->target_pid = task_pid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2300,7 +2302,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = tsk->pid; + audit_sig_pid = task_pid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2314,7 +2316,7 @@ int __audit_signal_info(int sig, struct task_struct *t) /* optimize the common case by 
putting first signal recipient directly * in audit_context */ if (!ctx->target_pid) { - ctx->target_pid = t->tgid; + ctx->target_pid = task_tgid_nr(t); ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); @@ -2335,7 +2337,7 @@ int __audit_signal_info(int sig, struct task_struct *t) } BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); - axp->target_pid[axp->pid_count] = t->tgid; + axp->target_pid[axp->pid_count] = task_tgid_nr(t); axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); @@ -2435,7 +2437,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); audit_log_untrustedstring(ab, current->comm); if (mm) { down_read(&mm->mmap_sem); -- cgit v1.2.3 From 5a3cb3b6c3a07904bb66baf055b2eaf01198b1f9 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 16 Aug 2013 00:04:46 -0400 Subject: audit: allow user processes to log from another PID namespace Still only permit the audit logging daemon and control to operate from the initial PID namespace, but allow processes to log from another PID namespace. Cc: "Eric W. Biederman" (informed by ebiederman's c776b5d2) Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 5a096f8e28cb..72c6e1cd6ef5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -607,9 +607,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; - /* Only support the initial namespaces for now. */ - if ((current_user_ns() != &init_user_ns) || - (task_active_pid_ns(current) != &init_pid_ns)) + /* Only support initial user namespace for now. 
*/ + if ((current_user_ns() != &init_user_ns)) return -EPERM; switch (msg_type) { @@ -629,6 +628,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: + /* Only support auditd and auditctl in initial pid namespace + * for now. */ + if ((task_active_pid_ns(current) != &init_pid_ns)) + return -EPERM; + if (!capable(CAP_AUDIT_CONTROL)) err = -EPERM; break; -- cgit v1.2.3 From aa589a13b5d00d3c643ee4114d8cbc3addb4e99f Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 24 Feb 2014 12:31:11 -0500 Subject: audit: remove superfluous new- prefix in AUDIT_LOGIN messages The new- prefix on ses and auid are un-necessary and break ausearch. Signed-off-by: Richard Guy Briggs --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6381f25ac3d4..61ac3cf53f1d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2051,7 +2051,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, if (!ab) return; audit_log_format(ab, "pid=%d uid=%u" - " old-auid=%u new-auid=%u old-ses=%u new-ses=%u" + " old-auid=%u auid=%u old-ses=%u ses=%u" " res=%d", task_pid_nr(current), uid, oldloginuid, loginuid, oldsessionid, sessionid, -- cgit v1.2.3 From ddfad8affdb73cc8df5890fef16d98d63ff3a6f0 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 19 Jan 2011 19:22:35 -0500 Subject: audit: include subject in login records The login uid change record does not include the selinux context of the task logging in. Add that information. 
(Updated from 2011-01: RHBZ:670328 -- RGB) Reported-by: Steve Grubb Acked-by: James Morris Signed-off-by: Eric Paris Signed-off-by: Aristeu Rozanski Signed-off-by: Richard Guy Briggs --- kernel/auditsc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 61ac3cf53f1d..bd3de52600ff 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2050,12 +2050,10 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; - audit_log_format(ab, "pid=%d uid=%u" - " old-auid=%u auid=%u old-ses=%u ses=%u" - " res=%d", - task_pid_nr(current), uid, - oldloginuid, loginuid, oldsessionid, sessionid, - !rc); + audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_task_context(ab); + audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", + oldloginuid, loginuid, oldsessionid, sessionid, !rc); audit_log_end(ab); } -- cgit v1.2.3 From f12835276c3182f2b998d93dfd60100cf4b60c05 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 5 Mar 2014 16:29:55 -0500 Subject: audit: remove stray newlines from audit_log_lost messages Calling audit_log_lost with a \n in the format string leads to extra newlines in dmesg. That function will eventually call audit_panic which uses pr_err with an explicit \n included. Just make these calls match the others that lack \n. 
Reported-by: Jonathan Kamens Signed-off-by: Josh Boyer Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 72c6e1cd6ef5..c0696dcfed11 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -396,7 +396,7 @@ static void audit_printk_skb(struct sk_buff *skb) if (printk_ratelimit()) pr_notice("type=%d %s\n", nlh->nlmsg_type, data); else - audit_log_lost("printk limit exceeded\n"); + audit_log_lost("printk limit exceeded"); } audit_hold_skb(skb); @@ -412,7 +412,7 @@ static void kauditd_send_skb(struct sk_buff *skb) BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ if (audit_pid) { pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd disappeared\n"); + audit_log_lost("auditd disappeared"); audit_pid = 0; audit_sock = NULL; } -- cgit v1.2.3 From b7550787fe8b5beffb5f56fa11a87712d699d085 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 5 Mar 2014 14:34:36 -0800 Subject: audit: remove stray newline from audit_log_execve_info() audit_panic() call There's an unnecessary use of a \n in audit_panic. Signed-off-by: Richard Guy Briggs --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bd3de52600ff..254ce2063d1d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1173,7 +1173,7 @@ static void audit_log_execve_info(struct audit_context *context, */ buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); if (!buf) { - audit_panic("out of memory for argv string\n"); + audit_panic("out of memory for argv string"); return; } -- cgit v1.2.3 From 5e937a9ae9137899c6641d718bd3820861099a09 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Mar 2014 12:48:43 -0400 Subject: syscall_get_arch: remove useless function arguments Every caller of syscall_get_arch() uses current for the task and no implementors of the function need args. 
So just get rid of both of those things. Admittedly, since these are inline functions we aren't wasting stack space, but it just makes the prototypes better. Signed-off-by: Eric Paris Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mips@linux-mips.org Cc: linux390@de.ibm.com Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-arch@vger.kernel.org --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b7a10048a32c..eda2da3df822 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -95,7 +95,7 @@ u32 seccomp_bpf_load(int off) if (off == BPF_DATA(nr)) return syscall_get_nr(current, regs); if (off == BPF_DATA(arch)) - return syscall_get_arch(current, regs); + return syscall_get_arch(); if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { unsigned long value; int arg = (off - BPF_DATA(args[0])) / sizeof(u64); @@ -351,7 +351,7 @@ static void seccomp_send_sigsys(int syscall, int reason) info.si_code = SYS_SECCOMP; info.si_call_addr = (void __user *)KSTK_EIP(current); info.si_errno = reason; - info.si_arch = syscall_get_arch(current, task_pt_regs(current)); + info.si_arch = syscall_get_arch(); info.si_syscall = syscall; force_sig_info(SIGSYS, &info, current); } -- cgit v1.2.3 From e1b2dc176f2d5be7952c47a4e4e8f3b06a90db1c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Mar 2014 11:10:15 -0400 Subject: cgroup: break kernfs active_ref protection in cgroup directory operations cgroup_tree_mutex should nest above the kernfs active_ref protection; however, cgroup_create() and cgroup_rename() were grabbing cgroup_tree_mutex while under kernfs active_ref protection. This has an actual possibility of leading to deadlocks in case these operations race against cgroup_rmdir() which invokes kernfs_remove() on directory kernfs_node while holding cgroup_tree_mutex.
Neither cgroup_create() nor cgroup_rename() requires active_ref protection. The former already has enough synchronization through cgroup_lock_live_group() and the latter doesn't care, so this can be fixed by updating both functions to break all active_ref protections before grabbing cgroup_tree_mutex. While this patch fixes the immediate issue, it probably needs further work in the long term - kernfs directories should enable lockdep annotations and maybe the better way to handle this is marking directory nodes as not needing active_ref protection rather than breaking it in each operation. Signed-off-by: Tejun Heo Cc: Greg Kroah-Hartman --- kernel/cgroup.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 98a8045e2149..58c67b3060b5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2324,6 +2324,14 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, if (cgroup_sane_behavior(cgrp)) return -EPERM; + /* + * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * active_ref. kernfs_rename() doesn't require active_ref + * protection. Break them before grabbing cgroup_tree_mutex.
+ */ + kernfs_break_active_protection(new_parent); + kernfs_break_active_protection(kn); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -2331,6 +2339,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); + + kernfs_unbreak_active_protection(kn); + kernfs_unbreak_active_protection(new_parent); return ret; } @@ -3778,8 +3789,22 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent = parent_kn->priv; + int ret; - return cgroup_create(parent, name, mode); + /* + * cgroup_create() grabs cgroup_tree_mutex which nests outside + * kernfs active_ref and cgroup_create() already synchronizes + * properly against removal through cgroup_lock_live_group(). + * Break it before calling cgroup_create(). + */ + cgroup_get(parent); + kernfs_break_active_protection(parent_kn); + + ret = cgroup_create(parent, name, mode); + + kernfs_unbreak_active_protection(parent_kn); + cgroup_put(parent); + return ret; } /* -- cgit v1.2.3 From 87291347c49dc40aa339f587b209618201c2e527 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Thu, 13 Feb 2014 19:51:48 -0800 Subject: tracing: Fix array size mismatch in format string In event format strings, the array size is reported in two locations. One in array subscript and then via the "size:" attribute. The values reported there have a mismatch. For e.g., in sched:sched_switch the prev_comm and next_comm character arrays have subscript values as [32] where as the actual field size is 16. 
name: sched_switch ID: 301 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1;signed:0; field:int common_pid; offset:4; size:4; signed:1; field:char prev_comm[32]; offset:8; size:16; signed:1; field:pid_t prev_pid; offset:24; size:4; signed:1; field:int prev_prio; offset:28; size:4; signed:1; field:long prev_state; offset:32; size:8; signed:1; field:char next_comm[32]; offset:40; size:16; signed:1; field:pid_t next_pid; offset:56; size:4; signed:1; field:int next_prio; offset:60; size:4; signed:1; After bisection, the following commit was blamed: 92edca0 tracing: Use direct field, type and system names This commit removes the duplication of strings for field->name and field->type assuming that all the strings passed in __trace_define_field() are immutable. This is not true for arrays, where the type string is created in event_storage variable and field->type for all array fields points to event_storage. Use __stringify() to create a string constant for the type string. Also, get rid of event_storage and event_storage_mutex that are not needed anymore. 
also, an added benefit is that this reduces the overhead of events a bit more: text data bss dec hex filename 8424787 2036472 1302528 11763787 b3804b vmlinux 8420814 2036408 1302528 11759750 b37086 vmlinux.patched Link: http://lkml.kernel.org/r/1392349908-29685-1-git-send-email-vnagarnaik@google.com Cc: Laurent Chavey Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 6 ------ kernel/trace/trace_export.c | 7 ++----- 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f3989ceb5cd5..7b16d40bd64d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,12 +27,6 @@ DEFINE_MUTEX(event_mutex); -DEFINE_MUTEX(event_storage_mutex); -EXPORT_SYMBOL_GPL(event_storage_mutex); - -char event_storage[EVENT_STORAGE_SIZE]; -EXPORT_SYMBOL_GPL(event_storage); - LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_common_fields); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7c3e3e72e2b6..ee0a5098ac43 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __array #define __array(type, item, len) \ do { \ + char *type_str = #type"["__stringify(len)"]"; \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - mutex_lock(&event_storage_mutex); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ + ret = trace_define_field(event_call, type_str, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), filter_type); \ - mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); -- cgit v1.2.3 From 8c90487cdc64847b4fdd812ab3047f426fec4d13 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 26 Feb 2014 10:49:49 -0500 Subject: Rename 
TAINT_UNSAFE_SMP to TAINT_CPU_OUT_OF_SPEC Rename TAINT_UNSAFE_SMP to TAINT_CPU_OUT_OF_SPEC, so we can repurpose the flag to encompass a wider range of pushing the CPU beyond its warranty. Signed-off-by: Dave Jones Link: http://lkml.kernel.org/r/20140226154949.GA770@redhat.com Signed-off-by: H. Peter Anvin --- kernel/module.c | 2 +- kernel/panic.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d24fcf29cb64..ca2c1aded7ee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1015,7 +1015,7 @@ static size_t module_flags_taint(struct module *mod, char *buf) buf[l++] = 'C'; /* * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't * apply to modules. */ return l; diff --git a/kernel/panic.c b/kernel/panic.c index 6d6300375090..2270cfd1d6be 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -199,7 +199,7 @@ struct tnt { static const struct tnt tnts[] = { { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' }, { TAINT_FORCED_RMMOD, 'R', ' ' }, { TAINT_MACHINE_CHECK, 'M', ' ' }, { TAINT_BAD_PAGE, 'B', ' ' }, -- cgit v1.2.3 From 765a3f4fed708ae429ee095914a7897acb3a65bd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Mar 2014 16:37:08 -0700 Subject: rcu: Provide grace-period piggybacking API The following pattern is currently not well supported by RCU: 1. Make data element inaccessible to RCU readers. 2. Do work that probably lasts for more than one grace period. 3. Do something to make sure RCU readers in flight before #1 above have completed. Here are some things that could currently be done: a. Do a synchronize_rcu() unconditionally at either #1 or #3 above. This works, but imposes needless work and latency. b.
Post an RCU callback at #1 above that does a wakeup, then wait for the wakeup at #3. This works well, but likely results in an extra unneeded grace period. Open-coding this is also a bit more semi-tricky code than would be good. This commit therefore adds get_state_synchronize_rcu() and cond_synchronize_rcu() APIs. Call get_state_synchronize_rcu() at #1 above and pass its return value to cond_synchronize_rcu() at #3 above. This results in a call to synchronize_rcu() if no grace period has elapsed between #1 and #3, but requires only a load, comparison, and memory barrier if a full grace period did elapse. Requested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Acked-by: Peter Zijlstra --- kernel/rcu/tree.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 351faba48b91..0c47e300210a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1421,13 +1421,14 @@ static int rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(rsp); - smp_wmb(); /* Record GP times before starting GP. */ - rsp->gpnum++; + /* Record GP times before starting GP, hence smp_store_release(). */ + smp_store_release(&rsp->gpnum, rsp->gpnum + 1); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ mutex_lock(&rsp->onoff_mutex); + smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ /* * Set the quiescent-state-needed bits in all the rcu_node @@ -1555,10 +1556,11 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); - rsp->completed = rsp->gpnum; /* Declare grace period done. 
*/ + /* Declare grace period done. */ + ACCESS_ONCE(rsp->completed) = rsp->gpnum; trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); @@ -2637,6 +2639,58 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +/** + * get_state_synchronize_rcu - Snapshot current RCU state + * + * Returns a cookie that is used by a later call to cond_synchronize_rcu() + * to determine whether or not a full grace period has elapsed in the + * meantime. + */ +unsigned long get_state_synchronize_rcu(void) +{ + /* + * Any prior manipulation of RCU-protected data must happen + * before the load from ->gpnum. + */ + smp_mb(); /* ^^^ */ + + /* + * Make sure this load happens before the purportedly + * time-consuming work between get_state_synchronize_rcu() + * and cond_synchronize_rcu(). + */ + return smp_load_acquire(&rcu_state->gpnum); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); + +/** + * cond_synchronize_rcu - Conditionally wait for an RCU grace period + * + * @oldstate: return value from earlier call to get_state_synchronize_rcu() + * + * If a full RCU grace period has elapsed since the earlier call to + * get_state_synchronize_rcu(), just return. Otherwise, invoke + * synchronize_rcu() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. But + * counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for one additional grace period should be just fine. + */ +void cond_synchronize_rcu(unsigned long oldstate) +{ + unsigned long newstate; + + /* + * Ensure that this load happens before any RCU-destructive + * actions the caller might carry out after we return. 
+ */ + newstate = smp_load_acquire(&rcu_state->completed); + if (ULONG_CMP_GE(oldstate, newstate)) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu); + static int synchronize_sched_expedited_cpu_stop(void *data) { /* -- cgit v1.2.3 From 11d4616bd07f38d496bd489ed8fad1dc4d928823 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 20 Mar 2014 22:11:17 -0700 Subject: futex: revert back to the explicit waiter counting code Srikar Dronamraju reports that commit b0c29f79ecea ("futexes: Avoid taking the hb->lock if there's nothing to wake up") causes java threads getting stuck on futexes when runing specjbb on a power7 numa box. The cause appears to be that the powerpc spinlocks aren't using the same ticket lock model that we use on x86 (and other) architectures, which in turn result in the "spin_is_locked()" test in hb_waiters_pending() occasionally reporting an unlocked spinlock even when there are pending waiters. So this reinstates Davidlohr Bueso's original explicit waiter counting code, which I had convinced Davidlohr to drop in favor of figuring out the pending waiters by just using the existing state of the spinlock and the wait queue. Reported-and-tested-by: Srikar Dronamraju Original-code-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- kernel/futex.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 44a1261cb9ff..08ec814ad9d2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -234,6 +234,7 @@ static const struct futex_q futex_q_init = { * waiting on a futex. 
*/ struct futex_hash_bucket { + atomic_t waiters; spinlock_t lock; struct plist_head chain; } ____cacheline_aligned_in_smp; @@ -253,22 +254,37 @@ static inline void futex_get_mm(union futex_key *key) smp_mb__after_atomic_inc(); } -static inline bool hb_waiters_pending(struct futex_hash_bucket *hb) +/* + * Reflects a new waiter being added to the waitqueue. + */ +static inline void hb_waiters_inc(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP + atomic_inc(&hb->waiters); /* - * Tasks trying to enter the critical region are most likely - * potential waiters that will be added to the plist. Ensure - * that wakers won't miss to-be-slept tasks in the window between - * the wait call and the actual plist_add. + * Full barrier (A), see the ordering comment above. */ - if (spin_is_locked(&hb->lock)) - return true; - smp_rmb(); /* Make sure we check the lock state first */ + smp_mb__after_atomic_inc(); +#endif +} + +/* + * Reflects a waiter being removed from the waitqueue by wakeup + * paths. 
+ */ +static inline void hb_waiters_dec(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + atomic_dec(&hb->waiters); +#endif +} - return !plist_head_empty(&hb->chain); +static inline int hb_waiters_pending(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + return atomic_read(&hb->waiters); #else - return true; + return 1; #endif } @@ -954,6 +970,7 @@ static void __unqueue_futex(struct futex_q *q) hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); } /* @@ -1257,7 +1274,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, */ if (likely(&hb1->chain != &hb2->chain)) { plist_del(&q->list, &hb1->chain); + hb_waiters_dec(hb1); plist_add(&q->list, &hb2->chain); + hb_waiters_inc(hb2); q->lock_ptr = &hb2->lock; } get_futex_key_refs(key2); @@ -1600,6 +1619,17 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) struct futex_hash_bucket *hb; hb = hash_futex(&q->key); + + /* + * Increment the counter before taking the lock so that + * a potential waker won't miss a to-be-slept task that is + * waiting for the spinlock. This is safe as all queue_lock() + * users end up calling queue_me(). Similarly, for housekeeping, + * decrement the counter at queue_unlock() when some error has + * occurred and we don't end up adding the task to the list. + */ + hb_waiters_inc(hb); + q->lock_ptr = &hb->lock; spin_lock(&hb->lock); /* implies MB (A) */ @@ -1611,6 +1641,7 @@ queue_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { spin_unlock(&hb->lock); + hb_waiters_dec(hb); } /** @@ -2342,6 +2373,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * Unqueue the futex_q and determine which it was. 
*/ plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; @@ -2875,6 +2907,7 @@ static int __init futex_init(void) futex_cmpxchg_enabled = 1; for (i = 0; i < futex_hashsize; i++) { + atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } -- cgit v1.2.3 From bc4c426ee2431d1f717004d3bbaacbd819b544fd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 21 Mar 2014 08:23:38 -0400 Subject: Revert "tracing: Move event storage for array from macro to standalone function" I originally wrote commit 35bb4399bd0e to shrink the size of the overhead of tracepoints by several kilobytes. Later, I received a patch from Vaibhav Nagarnaik that fixed a bug in the same code that this commit touches. Not only did it fix a bug, it also removed code and shrunk the size of the overhead of trace events even more than this commit did. Since this commit is scheduled for 3.15 and Vaibhav's patch is already in mainline, I need to revert this patch in order to keep it from conflicting with Vaibhav's patch. Not to mention, Vaibhav's patch makes this patch obsolete. 
Link: http://lkml.kernel.org/r/20140320225637.0226041b@gandalf.local.home Cc: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 6 ++++++ kernel/trace/trace_export.c | 12 ++++++++---- kernel/trace/trace_output.c | 21 --------------------- 3 files changed, 14 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b8f73b333a3c..2f7b8e31e3a4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,6 +27,12 @@ DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_storage_mutex); +EXPORT_SYMBOL_GPL(event_storage_mutex); + +char event_storage[EVENT_STORAGE_SIZE]; +EXPORT_SYMBOL_GPL(event_storage); + LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_common_fields); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 39c746c5ae73..7c3e3e72e2b6 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -96,10 +96,14 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __array(type, item, len) \ do { \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = ftrace_event_define_field(event_call, #type, len, \ - #item, offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), filter_type); \ + mutex_lock(&event_storage_mutex); \ + snprintf(event_storage, sizeof(event_storage), \ + "%s[%d]", #type, len); \ + ret = trace_define_field(event_call, event_storage, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), \ + is_signed_type(type), filter_type); \ + mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ee8d74840b88..ca0e79e2abaa 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -20,10 +20,6 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; -#define 
EVENT_STORAGE_SIZE 128 -static DEFINE_MUTEX(event_storage_mutex); -static char event_storage[EVENT_STORAGE_SIZE]; - int trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; @@ -474,23 +470,6 @@ int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) } EXPORT_SYMBOL_GPL(ftrace_output_call); -int ftrace_event_define_field(struct ftrace_event_call *call, - char *type, int len, char *item, int offset, - int field_size, int sign, int filter) -{ - int ret; - - mutex_lock(&event_storage_mutex); - snprintf(event_storage, sizeof(event_storage), - "%s[%d]", type, len); - ret = trace_define_field(call, event_storage, item, offset, - field_size, sign, filter); - mutex_unlock(&event_storage_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(ftrace_event_define_field); - #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.2.3 From 0dea6d52638b2693b18cd2ed8938b236e0789ddb Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 21 Mar 2014 01:19:01 -0400 Subject: tracepoint: Remove unused API functions After the following commit: commit b75ef8b44b1cb95f5a26484b0e2fe37a63b12b44 Author: Mathieu Desnoyers Date: Wed Aug 10 15:18:39 2011 -0400 Tracepoint: Dissociate from module mutex The following functions became unnecessary: - tracepoint_probe_register_noupdate, - tracepoint_probe_unregister_noupdate, - tracepoint_probe_update_all. In fact, none of the in-kernel tracers, nor LTTng, nor SystemTAP use them. Remove those. Moreover, the functions: - tracepoint_iter_start, - tracepoint_iter_next, - tracepoint_iter_stop, - tracepoint_iter_reset. are unused by in-kernel tracers, LTTng and SystemTAP. Remove those too. 
Link: http://lkml.kernel.org/r/1395379142-2118-2-git-send-email-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 222 ++-------------------------------------------------- 1 file changed, 5 insertions(+), 217 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index e2a58a22b0f4..65d9f9459a75 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -67,10 +67,7 @@ struct tracepoint_entry { }; struct tp_probes { - union { - struct rcu_head rcu; - struct list_head list; - } u; + struct rcu_head rcu; struct tracepoint_func probes[0]; }; @@ -83,7 +80,7 @@ static inline void *allocate_probes(int count) static void rcu_free_old_probes(struct rcu_head *head) { - kfree(container_of(head, struct tp_probes, u.rcu)); + kfree(container_of(head, struct tp_probes, rcu)); } static inline void release_probes(struct tracepoint_func *old) @@ -91,7 +88,7 @@ static inline void release_probes(struct tracepoint_func *old) if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); - call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); + call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes); } } @@ -459,204 +456,11 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data) } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); -static LIST_HEAD(old_probes); -static int need_update; - -static void tracepoint_add_old_probes(void *old) -{ - need_update = 1; - if (old) { - struct tp_probes *tp_probes = container_of(old, - struct tp_probes, probes[0]); - list_add(&tp_probes->u.list, &old_probes); - } -} - -/** - * tracepoint_probe_register_noupdate - register a probe but not connect - * @name: tracepoint name - * @probe: probe handler - * @data: probe private data - * - * caller must call tracepoint_probe_update_all() - */ -int tracepoint_probe_register_noupdate(const char *name, void *probe, - void *data) -{ - struct tracepoint_func 
*old; - - mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_add_old_probes(old); - mutex_unlock(&tracepoints_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); - -/** - * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect - * @name: tracepoint name - * @probe: probe function pointer - * @data: probe private data - * - * caller must call tracepoint_probe_update_all() - */ -int tracepoint_probe_unregister_noupdate(const char *name, void *probe, - void *data) -{ - struct tracepoint_func *old; - - mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_add_old_probes(old); - mutex_unlock(&tracepoints_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate); - -/** - * tracepoint_probe_update_all - update tracepoints - */ -void tracepoint_probe_update_all(void) -{ - LIST_HEAD(release_probes); - struct tp_probes *pos, *next; - - mutex_lock(&tracepoints_mutex); - if (!need_update) { - mutex_unlock(&tracepoints_mutex); - return; - } - if (!list_empty(&old_probes)) - list_replace_init(&old_probes, &release_probes); - need_update = 0; - tracepoint_update_probes(); - mutex_unlock(&tracepoints_mutex); - list_for_each_entry_safe(pos, next, &release_probes, u.list) { - list_del(&pos->u.list); - call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); - } -} -EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); - -/** - * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. - * @tracepoint: current tracepoints (in), next tracepoint (out) - * @begin: beginning of the range - * @end: end of the range - * - * Returns whether a next tracepoint has been found (1) or not (0). 
- * Will return the first tracepoint in the range if the input tracepoint is - * NULL. - */ -static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, - struct tracepoint * const *begin, struct tracepoint * const *end) -{ - if (!*tracepoint && begin != end) { - *tracepoint = begin; - return 1; - } - if (*tracepoint >= begin && *tracepoint < end) - return 1; - return 0; -} - -#ifdef CONFIG_MODULES -static void tracepoint_get_iter(struct tracepoint_iter *iter) -{ - int found = 0; - struct tp_module *iter_mod; - - /* Core kernel tracepoints */ - if (!iter->module) { - found = tracepoint_get_iter_range(&iter->tracepoint, - __start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - if (found) - goto end; - } - /* Tracepoints in modules */ - mutex_lock(&tracepoints_mutex); - list_for_each_entry(iter_mod, &tracepoint_module_list, list) { - /* - * Sorted module list - */ - if (iter_mod < iter->module) - continue; - else if (iter_mod > iter->module) - iter->tracepoint = NULL; - found = tracepoint_get_iter_range(&iter->tracepoint, - iter_mod->tracepoints_ptrs, - iter_mod->tracepoints_ptrs - + iter_mod->num_tracepoints); - if (found) { - iter->module = iter_mod; - break; - } - } - mutex_unlock(&tracepoints_mutex); -end: - if (!found) - tracepoint_iter_reset(iter); -} -#else /* CONFIG_MODULES */ -static void tracepoint_get_iter(struct tracepoint_iter *iter) -{ - int found = 0; - - /* Core kernel tracepoints */ - found = tracepoint_get_iter_range(&iter->tracepoint, - __start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - if (!found) - tracepoint_iter_reset(iter); -} -#endif /* CONFIG_MODULES */ - -void tracepoint_iter_start(struct tracepoint_iter *iter) -{ - tracepoint_get_iter(iter); -} -EXPORT_SYMBOL_GPL(tracepoint_iter_start); - -void tracepoint_iter_next(struct tracepoint_iter *iter) -{ - iter->tracepoint++; - /* - * iter->tracepoint may be invalid because we blindly incremented it. 
- * Make sure it is valid by marshalling on the tracepoints, getting the - * tracepoints from following modules if necessary. - */ - tracepoint_get_iter(iter); -} -EXPORT_SYMBOL_GPL(tracepoint_iter_next); - -void tracepoint_iter_stop(struct tracepoint_iter *iter) -{ -} -EXPORT_SYMBOL_GPL(tracepoint_iter_stop); - -void tracepoint_iter_reset(struct tracepoint_iter *iter) -{ -#ifdef CONFIG_MODULES - iter->module = NULL; -#endif /* CONFIG_MODULES */ - iter->tracepoint = NULL; -} -EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES static int tracepoint_module_coming(struct module *mod) { - struct tp_module *tp_mod, *iter; + struct tp_module *tp_mod; int ret = 0; if (!mod->num_tracepoints) @@ -677,23 +481,7 @@ static int tracepoint_module_coming(struct module *mod) } tp_mod->num_tracepoints = mod->num_tracepoints; tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; - - /* - * tracepoint_module_list is kept sorted by struct module pointer - * address for iteration on tracepoints from a seq_file that can release - * the mutex between calls. - */ - list_for_each_entry_reverse(iter, &tracepoint_module_list, list) { - BUG_ON(iter == tp_mod); /* Should never be in the list twice */ - if (iter < tp_mod) { - /* We belong to the location right after iter. */ - list_add(&tp_mod->list, &iter->list); - goto module_added; - } - } - /* We belong to the beginning of the list */ - list_add(&tp_mod->list, &tracepoint_module_list); -module_added: + list_add_tail(&tp_mod->list, &tracepoint_module_list); tracepoint_update_probe_range(mod->tracepoints_ptrs, mod->tracepoints_ptrs + mod->num_tracepoints); end: -- cgit v1.2.3 From d6ee6d2325faeec3fb0122a4840678a2ba62b04b Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Sat, 22 Mar 2014 12:20:31 +0400 Subject: genirq: Export symbol no_action() This will allow to use the dummy IRQ handler no_action() from drivers compiled as module. 
Drivers which use ARM FIQ interrupts can use this to request the interrupt via the normal request_irq() mechanism w/o having to copy the dummy handler to their own code. Signed-off-by: Alexander Shiyan Link: http://lkml.kernel.org/r/1395476431-16070-1-git-send-email-shc_work@mail.ru Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index bfec453557b4..635480270858 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -41,6 +41,7 @@ irqreturn_t no_action(int cpl, void *dev_id) { return IRQ_NONE; } +EXPORT_SYMBOL_GPL(no_action); static void warn_no_thread(unsigned int irq, struct irqaction *action) { -- cgit v1.2.3 From 01a971406177c2ca9834be6914a67e20f463a3e6 Mon Sep 17 00:00:00 2001 From: Monam Agarwal Date: Mon, 24 Mar 2014 00:17:18 +0530 Subject: cgroup: Use RCU_INIT_POINTER(x, NULL) in cgroup.c This patch replaces rcu_assign_pointer(x, NULL) with RCU_INIT_POINTER(x, NULL) The rcu_assign_pointer() ensures that the initialization of a structure is carried out before storing a pointer to that structure. And in the case of the NULL pointer, there is no structure to initialize. 
So, rcu_assign_pointer(p, NULL) can be safely converted to RCU_INIT_POINTER(p, NULL) Signed-off-by: Monam Agarwal Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 58c67b3060b5..e378cb2fac5e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3545,7 +3545,7 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - rcu_assign_pointer(css->cgroup->subsys[css->ss->id], NULL); + RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } -- cgit v1.2.3 From 3862807880acc0adaef6749738d210c9f45c3049 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 24 Mar 2014 14:03:57 +0000 Subject: tracing: Add BUG_ON when stack end location is over written It is difficult to detect a stack overrun when it actually occurs. We have observed that this type of corruption is often silent and can go unnoticed. Once the corrupted region is examined, the outcome is undefined and often results in sporadic system crashes. When the stack tracing feature is enabled, let's check for this condition and take appropriate action. Note: init_task doesn't get its stack end location set to STACK_END_MAGIC. 
Link: http://lkml.kernel.org/r/1395669837-30209-1-git-send-email-atomlin@redhat.com Signed-off-by: Aaron Tomlin Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e6be585cf06a..21b320e5d163 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -144,6 +145,8 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + BUG_ON(current != &init_task && + *(end_of_stack(current)) != STACK_END_MAGIC); out: arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); -- cgit v1.2.3 From e231d54c1239ccf31aaee311bed0c4d1937cae2c Mon Sep 17 00:00:00 2001 From: Monam Agarwal Date: Mon, 24 Mar 2014 00:16:19 +0530 Subject: kernel: Use RCU_INIT_POINTER(x, NULL) in audit.c This patch replaces rcu_assign_pointer(x, NULL) with RCU_INIT_POINTER(x, NULL) The rcu_assign_pointer() ensures that the initialization of a structure is carried out before storing a pointer to that structure. And in the case of the NULL pointer, there is no structure to initialize. 
So, rcu_assign_pointer(p, NULL) can be safely converted to RCU_INIT_POINTER(p, NULL) Signed-off-by: Monam Agarwal Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index c0696dcfed11..ad77d1e80895 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1092,7 +1092,7 @@ static void __net_exit audit_net_exit(struct net *net) audit_sock = NULL; } - rcu_assign_pointer(aunet->nlsk, NULL); + RCU_INIT_POINTER(aunet->nlsk, NULL); synchronize_net(); netlink_kernel_release(sock); } -- cgit v1.2.3 From ea2e64f280d2a34a8ed9ae3d783cd770d14b70ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Mar 2014 14:20:44 +0000 Subject: workqueue: Provide destroy_delayed_work_on_stack() If a delayed or deferrable work is on stack we need to tell debug objects that we are destroying the timer and the work. Otherwise we leak the tracking object. Signed-off-by: Thomas Gleixner Cc: Vince Weaver Acked-by: Tejun Heo Link: http://lkml.kernel.org/r/20140323141939.911487677@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 82ef9f3b7473..5b690b5a9e74 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -516,6 +516,13 @@ void destroy_work_on_stack(struct work_struct *work) } EXPORT_SYMBOL_GPL(destroy_work_on_stack); +void destroy_delayed_work_on_stack(struct delayed_work *work) +{ + destroy_timer_on_stack(&work->timer); + debug_object_free(&work->work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); + #else static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } -- cgit v1.2.3 From cacb3c76c2012ade52124e8c6fdc5cb125625772 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 25 Mar 2014 16:09:18 +0530 Subject: tick: Fix 
spelling mistake in tick_handle_periodic() One of the comments in tick_handle_periodic() had 'when' instead of 'which' (My guess :)). Fix it. Also fix spelling mistake in 'Possible'. Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: skarafotis@gmail.com Link: http://lkml.kernel.org/r/2b29ca4230c163e44179941d7c7a16c1474385c2.1395743878.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 20b2fe37d105..0fec63414fb6 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -118,7 +118,7 @@ void tick_handle_periodic(struct clock_event_device *dev) * to be sure we're using a real hardware clocksource. * Otherwise we could get trapped in an infinite * loop, as the tick_periodic() increments jiffies, - * when then will increment time, posibly causing + * which then will increment time, possibly causing * the loop to trigger again and again. */ if (timekeeping_valid_for_hres()) -- cgit v1.2.3 From b97f0291a2504291aef850077f98cab68a5a2f33 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 25 Mar 2014 13:56:23 +0530 Subject: tick: Remove code duplication in tick_handle_periodic() tick_handle_periodic() is calling ktime_add() at two places, first before the infinite loop and then at the end of infinite loop. We can rearrange code a bit to fix code duplication here. 
It looks quite simple and shouldn't break anything, I guess :) Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Link: http://lkml.kernel.org/r/be3481e8f3f71df694a4b43623254fc93ca51b59.1395735873.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 0fec63414fb6..015661279b68 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -98,18 +98,19 @@ static void tick_periodic(int cpu) void tick_handle_periodic(struct clock_event_device *dev) { int cpu = smp_processor_id(); - ktime_t next; + ktime_t next = dev->next_event; tick_periodic(cpu); if (dev->mode != CLOCK_EVT_MODE_ONESHOT) return; - /* - * Setup the next period for devices, which do not have - * periodic mode: - */ - next = ktime_add(dev->next_event, tick_period); for (;;) { + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + next = ktime_add(next, tick_period); + if (!clockevents_program_event(dev, next, false)) return; /* @@ -123,7 +124,6 @@ void tick_handle_periodic(struct clock_event_device *dev) */ if (timekeeping_valid_for_hres()) tick_periodic(cpu); - next = ktime_add(next, tick_period); } } -- cgit v1.2.3 From 2c4a33aba5f9ea3a28f2e40351f078d95f00786b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 25 Mar 2014 23:39:41 -0400 Subject: tracing: Fix traceon trigger condition to actually turn tracing on While working on my tutorial for 2014 Linux Collaboration Summit I found that the traceon trigger did not work when conditions were used. The other triggers worked fine though. Looking into it, it is because of the way the triggers use the ring buffer to store the fields it will use for the condition. 
But if tracing is off, nothing is stored in the buffer, and the tracepoint exits before calling the trigger to test the condition. This is fine for all the triggers that only work when tracing is on, but for traceon trigger that is to work when tracing is off, nothing happens. The fix is simple, just use a temp ring buffer to record the event if tracing is off and the event has a trace event conditional trigger enabled. The rest of the tracepoint code will work just fine, but the tracepoint wont be recorded in the other buffers. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 815c878f409b..24c1f2382557 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1600,15 +1600,31 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); +static struct ring_buffer *temp_buffer; + struct ring_buffer_event * trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, struct ftrace_event_file *ftrace_file, int type, unsigned long len, unsigned long flags, int pc) { + struct ring_buffer_event *entry; + *current_rb = ftrace_file->tr->trace_buffer.buffer; - return trace_buffer_lock_reserve(*current_rb, + entry = trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); + /* + * If tracing is off, but we have triggers enabled + * we still need to look at the event data. Use the temp_buffer + * to store the trace event for the tigger to use. It's recusive + * safe and will not be recorded anywhere. 
+ */ + if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) { + *current_rb = temp_buffer; + entry = trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); + } + return entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); @@ -6494,11 +6510,16 @@ __init static int tracer_alloc_buffers(void) raw_spin_lock_init(&global_trace.start_lock); + /* Used for event triggers */ + temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); + if (!temp_buffer) + goto out_free_cpumask; + /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); - goto out_free_cpumask; + goto out_free_temp_buffer; } if (global_trace.buffer_disabled) @@ -6540,6 +6561,8 @@ __init static int tracer_alloc_buffers(void) return 0; +out_free_temp_buffer: + ring_buffer_free(temp_buffer); out_free_cpumask: free_percpu(global_trace.trace_buffer.data); #ifdef CONFIG_TRACER_MAX_TRACE -- cgit v1.2.3 From cab5e127eef040399902caa8e1510795583fa03a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 27 Mar 2014 16:30:49 -0700 Subject: time: Revert to calling clock_was_set_delayed() while in irq context In commit 47a1b796306356f35 ("tick/timekeeping: Call update_wall_time outside the jiffies lock"), we moved to calling clock_was_set() due to the fact that we were no longer holding the timekeeping or jiffies lock. However, there is still the problem that clock_was_set() triggers an IPI, which cannot be done from the timer's hard irq context, and will generate WARN_ON warnings. Apparently in my earlier testing, I'm guessing I didn't bump the dmesg log level, so I somehow missed the WARN_ONs. Thus we need to revert back to calling clock_was_set_delayed(). 
Signed-off-by: John Stultz Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1395963049-11923-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0aa4ce81bc16..5b40279ecd71 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1435,7 +1435,8 @@ void update_wall_time(void) out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (clock_set) - clock_was_set(); + /* Have to call _delayed version, since in irq context*/ + clock_was_set_delayed(); } /** -- cgit v1.2.3 From e8604cb43690b781f9a7ad4a770f3e10259fe939 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 28 Mar 2014 15:18:27 +0800 Subject: cgroup: fix spurious lockdep warning in cgroup_exit() cgroup_exit() is called in fork and exit path. If it's called in the failure path during fork, PF_EXITING isn't set, and then lockdep will complain. Fix this by removing cgroup_exit() in that failure path. cgroup_fork() does nothing that needs cleanup. Reported-by: Sasha Levin Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +-- kernel/fork.c | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e378cb2fac5e..60fd6f1f6d4e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4431,8 +4431,7 @@ void cgroup_post_fork(struct task_struct *child) * notify_on_release(), then leave the task attached to the root cgroup in * each hierarchy for the remainder of its exit. No need to bother with * init_css_set refcnting. init_css_set never goes away and we can't race - * with migration path - either PF_EXITING is visible to migration path or - * @tsk never got on the tasklist. + * with migration path - PF_EXITING is visible to migration path. 
*/ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { diff --git a/kernel/fork.c b/kernel/fork.c index a17621c6cd42..8852b3463ab7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1271,7 +1271,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup_cgroup; + goto bad_fork_cleanup_threadgroup_lock; } mpol_fix_fork_child_flag(p); #endif @@ -1524,11 +1524,10 @@ bad_fork_cleanup_policy: perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); -bad_fork_cleanup_cgroup: +bad_fork_cleanup_threadgroup_lock: #endif if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); - cgroup_exit(p, 0); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: -- cgit v1.2.3 From 1ec41830e087cda1f62dda4182c2b62811eb0ffc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 28 Mar 2014 15:22:19 +0800 Subject: cgroup: remove useless argument from cgroup_exit() Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 ++--- kernel/exit.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 60fd6f1f6d4e..f7f94322d312 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4416,7 +4416,6 @@ void cgroup_post_fork(struct task_struct *child) /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process - * @run_callback: run exit callbacks? * * Description: Detach cgroup from @tsk and release it. * @@ -4433,7 +4432,7 @@ void cgroup_post_fork(struct task_struct *child) * init_css_set refcnting. init_css_set never goes away and we can't race * with migration path - PF_EXITING is visible to migration path. 
*/ -void cgroup_exit(struct task_struct *tsk, int run_callbacks) +void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; @@ -4455,7 +4454,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) cset = task_css_set(tsk); RCU_INIT_POINTER(tsk->cgroups, &init_css_set); - if (run_callbacks && need_forkexit_callback) { + if (need_forkexit_callback) { /* see cgroup_post_fork() for details */ for_each_subsys(ss, i) { if (ss->exit) { diff --git a/kernel/exit.c b/kernel/exit.c index 1e77fc645317..6480d1c85d7a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -797,7 +797,7 @@ void do_exit(long code) */ perf_event_exit_task(tsk); - cgroup_exit(tsk, 1); + cgroup_exit(tsk); if (group_dead) disassociate_ctty(1); -- cgit v1.2.3 From aa4af831bb4f3168f2f574b2620124699c09c4a3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Sun, 30 Mar 2014 19:07:54 -0400 Subject: AUDIT: Allow login in non-init namespaces It is possible to configure your PAM stack to refuse login if audit messages (about the login) were unable to be sent. This is common in many distros and thus normal configuration of many containers. The PAM modules determine if audit is enabled/disabled in the kernel based on the return value from sending an audit message on the netlink socket. If userspace gets back ECONNREFUSED it believes audit is disabled in the kernel. If it gets any other error, it refuses to let the login proceed. Just about ever since the introduction of namespaces the kernel audit subsystem has returned EPERM if the task sending a message was not in the init user or pid namespace. So many forms of containers have never worked if audit was enabled in the kernel. BUT if the container was not in net_init then the kernel network code would send ECONNREFUSED (instead of the audit code sending EPERM).
Thus by pure accident/dumb luck/bug if an admin configured the PAM stack to reject all logins that didn't talk to audit, but then ran the login utility in the non-init_net namespace, it would work!! Clearly this was a bug, but it is a bug some people expected. With the introduction of network namespace support in 3.14-rc1 the two bugs stopped cancelling each other out. Now, containers in the non-init_net namespace refused to let users log in (just like PAM was configured!) Obviously some people were not happy that what used to let users log in, now didn't! This fix is kinda hacky. We return ECONNREFUSED for all non-init relevant namespaces. That means that not only will the old broken non-init_net setups continue to work, now the broken non-init_pid or non-init_user setups will 'work'. They don't really work, since audit isn't logging things. But it's what most users want. In 3.15 we should have patches to support not only the non-init_net (3.14) namespace but also the non-init_pid and non-init_user namespace. So all will be right in the world. This just opens the doors wide open on 3.14 and hopefully makes users happy, if not the audit system... Reported-by: Andre Tomt Reported-by: Adam Richter Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds --- kernel/audit.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3392d3e0254a..95a20f3f52f1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -608,9 +608,19 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) int err = 0; /* Only support the initial namespaces for now. */ + /* + * We return ECONNREFUSED because it tricks userspace into thinking + * that audit was not configured into the kernel. Lots of users + * configure their PAM stack (because that's what the distro does) + * to reject login if unable to send messages to audit.
If we return + * ECONNREFUSED the PAM stack thinks the kernel does not have audit + * configured in and will let login proceed. If we return EPERM + * userspace will reject all logins. This should be removed when we + * support non init namespaces!! + */ if ((current_user_ns() != &init_user_ns) || (task_active_pid_ns(current) != &init_pid_ns)) - return -EPERM; + return -ECONNREFUSED; switch (msg_type) { case AUDIT_LIST: -- cgit v1.2.3 From 57673c2b0baa900dddae3b9eb3d7748ebf550eb3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 31 Mar 2014 14:39:57 +1030 Subject: Use 'E' instead of 'X' for unsigned module taint flag. Takashi Iwai says: > The letter 'X' has been already used for SUSE kernels for very long > time, to indicate the external supported modules. Can the new flag be > changed to another letter for avoiding conflict...? > (BTW, we also use 'N' for "no support", too.) Note: this code should be cleaned up, so we don't have such maps in three places! Signed-off-by: Rusty Russell --- kernel/module.c | 2 +- kernel/panic.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index c1acb0c5b637..5806e096d110 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1014,7 +1014,7 @@ static size_t module_flags_taint(struct module *mod, char *buf) if (mod->taints & (1 << TAINT_CRAP)) buf[l++] = 'C'; if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) - buf[l++] = 'X'; + buf[l++] = 'E'; /* * TAINT_FORCED_RMMOD: could be added. 
* TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't diff --git a/kernel/panic.c b/kernel/panic.c index 0e25fe10871e..02b6c9f0171b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -210,7 +210,7 @@ static const struct tnt tnts[] = { { TAINT_CRAP, 'C', ' ' }, { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, - { TAINT_UNSIGNED_MODULE, 'X', ' ' }, + { TAINT_UNSIGNED_MODULE, 'E', ' ' }, }; /** @@ -229,7 +229,7 @@ static const struct tnt tnts[] = { * 'C' - modules from drivers/staging are loaded. * 'I' - Working around severe firmware bug. * 'O' - Out-of-tree module has been loaded. - * 'X' - Unsigned module has been loaded. + * 'E' - Unsigned module has been loaded. * * The string is overwritten by the next call to print_tainted(). */ -- cgit v1.2.3 From bd4cf0ed331a275e9bf5a49e6d0fd55dffc551b8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 28 Mar 2014 18:58:25 +0100 Subject: net: filter: rework/optimize internal BPF interpreter's instruction set This patch replaces/reworks the kernel-internal BPF interpreter with an optimized BPF instruction set format that is modelled closer to mimic native instruction sets and is designed to be JITed with one to one mapping. Thus, the new interpreter is noticeably faster than the current implementation of sk_run_filter(); mainly for two reasons: 1. Fall-through jumps: BPF jump instructions are forced to go either 'true' or 'false' branch which causes branch-miss penalty. The new BPF jump instructions have only one branch and fall-through otherwise, which fits the CPU branch predictor logic better. `perf stat` shows drastic difference for branch-misses between the old and new code. 2. Jump-threaded implementation of interpreter vs switch statement: Instead of single table-jump at the top of 'switch' statement, gcc will now generate multiple table-jump instructions, which helps CPU branch predictor logic. 
Note that the verification of filters is still being done through sk_chk_filter() in classical BPF format, so filters from user- or kernel space are verified in the same way as we do now, and same restrictions/constraints hold as well. We reuse current BPF JIT compilers in a way that this upgrade would even be fine as is, but nevertheless allows for a successive upgrade of BPF JIT compilers to the new format. The internal instruction set migration is being done after the probing for JIT compilation, so in case JIT compilers are able to create a native opcode image, we're going to use that, and in all other cases we're doing a follow-up migration of the BPF program's instruction set, so that it can be transparently run in the new interpreter. In short, the *internal* format extends BPF in the following way (more details can be taken from the appended documentation): - Number of registers increase from 2 to 10 - Register width increases from 32-bit to 64-bit - Conditional jt/jf targets replaced with jt/fall-through - Adds signed > and >= insns - 16 4-byte stack slots for register spill-fill replaced with up to 512 bytes of multi-use stack space - Introduction of bpf_call insn and register passing convention for zero overhead calls from/to other kernel functions - Adds arithmetic right shift and endianness conversion insns - Adds atomic_add insn - Old tax/txa insns are replaced with 'mov dst,src' insn Performance of two BPF filters generated by libpcap resp. 
bpf_asm was measured on x86_64, i386 and arm32 (other libpcap programs have similar performance differences): fprog #1 is taken from Documentation/networking/filter.txt: tcpdump -i eth0 port 22 -dd fprog #2 is taken from 'man tcpdump': tcpdump -i eth0 'tcp port 22 and (((ip[2:2] - ((ip[0]&0xf)<<2)) - ((tcp[12]&0xf0)>>2)) != 0)' -dd Raw performance data from BPF micro-benchmark: SK_RUN_FILTER on the same SKB (cache-hit) or 10k SKBs (cache-miss); time in ns per call, smaller is better: --x86_64-- fprog #1 fprog #1 fprog #2 fprog #2 cache-hit cache-miss cache-hit cache-miss old BPF 90 101 192 202 new BPF 31 71 47 97 old BPF jit 12 34 17 44 new BPF jit TBD --i386-- fprog #1 fprog #1 fprog #2 fprog #2 cache-hit cache-miss cache-hit cache-miss old BPF 107 136 227 252 new BPF 40 119 69 172 --arm32-- fprog #1 fprog #1 fprog #2 fprog #2 cache-hit cache-miss cache-hit cache-miss old BPF 202 300 475 540 new BPF 180 270 330 470 old BPF jit 26 182 37 202 new BPF jit TBD Thus, without changing any userland BPF filters, applications on top of AF_PACKET (or other families) such as libpcap/tcpdump, cls_bpf classifier, netfilter's xt_bpf, team driver's load-balancing mode, and many more will have better interpreter filtering performance. While we are replacing the internal BPF interpreter, we also need to convert seccomp BPF in the same step to make use of the new internal structure since it makes use of lower-level API details without being further decoupled through higher-level calls like sk_unattached_filter_{create,destroy}(), for example. Just as for normal socket filtering, also seccomp BPF experiences a time-to-verdict speedup: 05-sim-long_jumps.c of libseccomp was used as micro-benchmark: seccomp_rule_add_exact(ctx,... seccomp_rule_add_exact(ctx,... 
rc = seccomp_load(ctx); for (i = 0; i < 10000000; i++) syscall(199, 100); 'short filter' has 2 rules 'large filter' has 200 rules 'short filter' performance is slightly better on x86_64/i386/arm32 'large filter' is much faster on x86_64 and i386 and shows no difference on arm32 --x86_64-- short filter old BPF: 2.7 sec 39.12% bench libc-2.15.so [.] syscall 8.10% bench [kernel.kallsyms] [k] sk_run_filter 6.31% bench [kernel.kallsyms] [k] system_call 5.59% bench [kernel.kallsyms] [k] trace_hardirqs_on_caller 4.37% bench [kernel.kallsyms] [k] trace_hardirqs_off_caller 3.70% bench [kernel.kallsyms] [k] __secure_computing 3.67% bench [kernel.kallsyms] [k] lock_is_held 3.03% bench [kernel.kallsyms] [k] seccomp_bpf_load new BPF: 2.58 sec 42.05% bench libc-2.15.so [.] syscall 6.91% bench [kernel.kallsyms] [k] system_call 6.25% bench [kernel.kallsyms] [k] trace_hardirqs_on_caller 6.07% bench [kernel.kallsyms] [k] __secure_computing 5.08% bench [kernel.kallsyms] [k] sk_run_filter_int_seccomp --arm32-- short filter old BPF: 4.0 sec 39.92% bench [kernel.kallsyms] [k] vector_swi 16.60% bench [kernel.kallsyms] [k] sk_run_filter 14.66% bench libc-2.17.so [.] syscall 5.42% bench [kernel.kallsyms] [k] seccomp_bpf_load 5.10% bench [kernel.kallsyms] [k] __secure_computing new BPF: 3.7 sec 35.93% bench [kernel.kallsyms] [k] vector_swi 21.89% bench libc-2.17.so [.] syscall 13.45% bench [kernel.kallsyms] [k] sk_run_filter_int_seccomp 6.25% bench [kernel.kallsyms] [k] __secure_computing 3.96% bench [kernel.kallsyms] [k] syscall_trace_exit --x86_64-- large filter old BPF: 8.6 seconds 73.38% bench [kernel.kallsyms] [k] sk_run_filter 10.70% bench libc-2.15.so [.] syscall 5.09% bench [kernel.kallsyms] [k] seccomp_bpf_load 1.97% bench [kernel.kallsyms] [k] system_call new BPF: 5.7 seconds 66.20% bench [kernel.kallsyms] [k] sk_run_filter_int_seccomp 16.75% bench libc-2.15.so [.] 
syscall 3.31% bench [kernel.kallsyms] [k] system_call 2.88% bench [kernel.kallsyms] [k] __secure_computing --i386-- large filter old BPF: 5.4 sec new BPF: 3.8 sec --arm32-- large filter old BPF: 13.5 sec 73.88% bench [kernel.kallsyms] [k] sk_run_filter 10.29% bench [kernel.kallsyms] [k] vector_swi 6.46% bench libc-2.17.so [.] syscall 2.94% bench [kernel.kallsyms] [k] seccomp_bpf_load 1.19% bench [kernel.kallsyms] [k] __secure_computing 0.87% bench [kernel.kallsyms] [k] sys_getuid new BPF: 13.5 sec 76.08% bench [kernel.kallsyms] [k] sk_run_filter_int_seccomp 10.98% bench [kernel.kallsyms] [k] vector_swi 5.87% bench libc-2.17.so [.] syscall 1.77% bench [kernel.kallsyms] [k] __secure_computing 0.93% bench [kernel.kallsyms] [k] sys_getuid BPF filters generated by seccomp are very branchy, so the new internal BPF performance is better than the old one. Performance gains will be even higher when BPF JIT is committed for the new structure, which is planned in future work (as successive JIT migrations). BPF has also been stress-tested with trinity's BPF fuzzer. Joint work with Daniel Borkmann. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Cc: Hagen Paul Pfeifer Cc: Kees Cook Cc: Paul Moore Cc: Ingo Molnar Cc: H. Peter Anvin Cc: linux-kernel@vger.kernel.org Acked-by: Kees Cook Signed-off-by: David S. Miller --- kernel/seccomp.c | 119 +++++++++++++++++++++++++++---------------------------- 1 file changed, 58 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b7a10048a32c..4f18e754c23e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -55,60 +55,33 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; unsigned short len; /* Instruction count */ - struct sock_filter insns[]; + struct sock_filter_int insnsi[]; }; /* Limit any path through the tree to 256KB worth of instructions. 
*/ #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) -/** - * get_u32 - returns a u32 offset into data - * @data: a unsigned 64 bit value - * @index: 0 or 1 to return the first or second 32-bits - * - * This inline exists to hide the length of unsigned long. If a 32-bit - * unsigned long is passed in, it will be extended and the top 32-bits will be - * 0. If it is a 64-bit unsigned long, then whatever data is resident will be - * properly returned. - * +/* * Endianness is explicitly ignored and left for BPF program authors to manage * as per the specific architecture. */ -static inline u32 get_u32(u64 data, int index) +static void populate_seccomp_data(struct seccomp_data *sd) { - return ((u32 *)&data)[index]; -} + struct task_struct *task = current; + struct pt_regs *regs = task_pt_regs(task); -/* Helper for bpf_load below. */ -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name) -/** - * bpf_load: checks and returns a pointer to the requested offset - * @off: offset into struct seccomp_data to load from - * - * Returns the requested 32-bits of data. - * seccomp_check_filter() should assure that @off is 32-bit aligned - * and not out of bounds. Failure to do so is a BUG. - */ -u32 seccomp_bpf_load(int off) -{ - struct pt_regs *regs = task_pt_regs(current); - if (off == BPF_DATA(nr)) - return syscall_get_nr(current, regs); - if (off == BPF_DATA(arch)) - return syscall_get_arch(current, regs); - if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { - unsigned long value; - int arg = (off - BPF_DATA(args[0])) / sizeof(u64); - int index = !!(off % sizeof(u64)); - syscall_get_arguments(current, regs, arg, 1, &value); - return get_u32(value, index); - } - if (off == BPF_DATA(instruction_pointer)) - return get_u32(KSTK_EIP(current), 0); - if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) - return get_u32(KSTK_EIP(current), 1); - /* seccomp_check_filter should make this impossible. 
*/ - BUG(); + sd->nr = syscall_get_nr(task, regs); + sd->arch = syscall_get_arch(task, regs); + + /* Unroll syscall_get_args to help gcc on arm. */ + syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]); + syscall_get_arguments(task, regs, 1, 1, (unsigned long *) &sd->args[1]); + syscall_get_arguments(task, regs, 2, 1, (unsigned long *) &sd->args[2]); + syscall_get_arguments(task, regs, 3, 1, (unsigned long *) &sd->args[3]); + syscall_get_arguments(task, regs, 4, 1, (unsigned long *) &sd->args[4]); + syscall_get_arguments(task, regs, 5, 1, (unsigned long *) &sd->args[5]); + + sd->instruction_pointer = KSTK_EIP(task); } /** @@ -133,17 +106,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) switch (code) { case BPF_S_LD_W_ABS: - ftest->code = BPF_S_ANC_SECCOMP_LD_W; + ftest->code = BPF_LDX | BPF_W | BPF_ABS; /* 32-bit aligned and not out of bounds. */ if (k >= sizeof(struct seccomp_data) || k & 3) return -EINVAL; continue; case BPF_S_LD_W_LEN: - ftest->code = BPF_S_LD_IMM; + ftest->code = BPF_LD | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; case BPF_S_LDX_W_LEN: - ftest->code = BPF_S_LDX_IMM; + ftest->code = BPF_LDX | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; /* Explicitly include allowed calls. */ @@ -185,6 +158,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) case BPF_S_JMP_JGT_X: case BPF_S_JMP_JSET_K: case BPF_S_JMP_JSET_X: + sk_decode_filter(ftest, ftest); continue; default: return -EINVAL; @@ -202,18 +176,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) static u32 seccomp_run_filters(int syscall) { struct seccomp_filter *f; + struct seccomp_data sd; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. 
*/ if (WARN_ON(current->seccomp.filter == NULL)) return SECCOMP_RET_KILL; + populate_seccomp_data(&sd); + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = sk_run_filter(NULL, f->insns); + u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } @@ -231,6 +208,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) struct seccomp_filter *filter; unsigned long fp_size = fprog->len * sizeof(struct sock_filter); unsigned long total_insns = fprog->len; + struct sock_filter *fp; + int new_len; long ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) @@ -252,28 +231,43 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) CAP_SYS_ADMIN) != 0) return -EACCES; - /* Allocate a new seccomp_filter */ - filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, - GFP_KERNEL|__GFP_NOWARN); - if (!filter) + fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); + if (!fp) return -ENOMEM; - atomic_set(&filter->usage, 1); - filter->len = fprog->len; /* Copy the instructions from fprog. 
*/ ret = -EFAULT; - if (copy_from_user(filter->insns, fprog->filter, fp_size)) - goto fail; + if (copy_from_user(fp, fprog->filter, fp_size)) + goto free_prog; /* Check and rewrite the fprog via the skb checker */ - ret = sk_chk_filter(filter->insns, filter->len); + ret = sk_chk_filter(fp, fprog->len); if (ret) - goto fail; + goto free_prog; /* Check and rewrite the fprog for seccomp use */ - ret = seccomp_check_filter(filter->insns, filter->len); + ret = seccomp_check_filter(fp, fprog->len); + if (ret) + goto free_prog; + + /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ + ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); + if (ret) + goto free_prog; + + /* Allocate a new seccomp_filter */ + filter = kzalloc(sizeof(struct seccomp_filter) + + sizeof(struct sock_filter_int) * new_len, + GFP_KERNEL|__GFP_NOWARN); + if (!filter) + goto free_prog; + + ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); if (ret) - goto fail; + goto free_filter; + + atomic_set(&filter->usage, 1); + filter->len = new_len; /* * If there is an existing filter, make it the prev and don't drop its @@ -282,8 +276,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) filter->prev = current->seccomp.filter; current->seccomp.filter = filter; return 0; -fail: + +free_filter: kfree(filter); +free_prog: + kfree(fp); return ret; } -- cgit v1.2.3 From 543bc6a1a987672b79d6ebe8e2ab10471d8f1047 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Sun, 30 Mar 2014 19:07:54 -0400 Subject: AUDIT: Allow login in non-init namespaces It its possible to configure your PAM stack to refuse login if audit messages (about the login) were unable to be sent. This is common in many distros and thus normal configuration of many containers. The PAM modules determine if audit is enabled/disabled in the kernel based on the return value from sending an audit message on the netlink socket. If userspace gets back ECONNREFUSED it believes audit is disabled in the kernel. 
If it gets any other error it refuses to let the login proceed. Just about ever since the introduction of namespaces the kernel audit subsystem has returned EPERM if the task sending a message was not in the init user or pid namespace. So many forms of containers have never worked if audit was enabled in the kernel. BUT if the container was not in net_init then the kernel network code would send ECONNREFUSED (instead of the audit code sending EPERM). Thus by pure accident/dumb luck/bug if an admin configured the PAM stack to reject all logins that didn't talk to audit, but then ran the login utility in the non-init_net namespace, it would work!! Clearly this was a bug, but it is a bug some people expected. With the introduction of network namespace support in 3.14-rc1 the two bugs stopped cancelling each other out. Now, containers in the non-init_net namespace refused to let users log in (just like PAM was configured!) Obviously some people were not happy that what used to let users log in, now didn't! This fix is kinda hacky. We return ECONNREFUSED for all non-init relevant namespaces. That means that not only will the old broken non-init_net setups continue to work, now the broken non-init_pid or non-init_user setups will 'work'. They don't really work, since audit isn't logging things. But it's what most users want. In 3.15 we should have patches to support not only the non-init_net (3.14) namespace but also the non-init_pid and non-init_user namespace. So all will be right in the world. This just opens the doors wide open on 3.14 and hopefully makes users happy, if not the audit system... 
Reported-by: Andre Tomt Reported-by: Adam Richter Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds Conflicts: kernel/audit.c --- kernel/audit.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index ad77d1e80895..873b965fdc58 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -608,8 +608,18 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) int err = 0; /* Only support initial user namespace for now. */ + /* + * We return ECONNREFUSED because it tricks userspace into thinking + * that audit was not configured into the kernel. Lots of users + * configure their PAM stack (because that's what the distro does) + * to reject login if unable to send messages to audit. If we return + * ECONNREFUSED the PAM stack thinks the kernel does not have audit + * configured in and will let login proceed. If we return EPERM + * userspace will reject all logins. This should be removed when we + * support non init namespaces!! + */ if ((current_user_ns() != &init_user_ns)) - return -EPERM; + return -ECONNREFUSED; switch (msg_type) { case AUDIT_LIST: -- cgit v1.2.3 From fbb32750a62df75d1ffea547f3908b21c5496d9f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Feb 2014 21:09:54 -0500 Subject: pipe: kill ->map() and ->unmap() all pipe_buffer_operations have the same instances of those... 
Signed-off-by: Al Viro --- kernel/relay.c | 2 -- kernel/trace/trace.c | 4 ---- 2 files changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 5001c9887db1..98833f664fb6 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1195,8 +1195,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, static const struct pipe_buf_operations relay_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = relay_pipe_buf_release, .steal = generic_pipe_buf_steal, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 24c1f2382557..7511de35257f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4316,8 +4316,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, static const struct pipe_buf_operations tracing_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -5194,8 +5192,6 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, .steal = generic_pipe_buf_steal, -- cgit v1.2.3 From 56c4911aedbecc2bdf7940073e85d52b691e2509 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 2 Apr 2014 15:46:42 -0400 Subject: audit: do not cast audit_rule_data pointers pointlesly For some sort of legacy support audit_rule is a subset of (and first entry in) audit_rule_data. We don't actually need or use audit_rule. We just do a cast from one to the other for no gain what so ever. Stop the crazy casting. 
Signed-off-by: Eric Paris --- kernel/auditfilter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 96c8a704f130..70101e0b184a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -228,7 +228,7 @@ static int audit_match_signal(struct audit_entry *entry) #endif /* Common user-space to kernel rule translation. */ -static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) +static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule) { unsigned listnr; struct audit_entry *entry; @@ -405,7 +405,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, int i; char *str; - entry = audit_to_entry_common((struct audit_rule *)data); + entry = audit_to_entry_common(data); if (IS_ERR(entry)) goto exit_nofree; -- cgit v1.2.3 From d23082257d83e4bc89727d5aedee197e907999d2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 2 Apr 2014 17:45:05 +0200 Subject: pid_namespace: pidns_get() should check task_active_pid_ns() != NULL pidns_get()->get_pid_ns() can hit ns == NULL. This task_struct can't go away, but task_active_pid_ns(task) is NULL if release_task(task) was already called. Alternatively we could change get_pid_ns(ns) to check ns != NULL, but it seems that other callers are fine. Signed-off-by: Oleg Nesterov Cc: Eric W. 
Biederman ebiederm@xmission.com> Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 06c62de9c711..db95d8eb761b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -318,7 +318,9 @@ static void *pidns_get(struct task_struct *task) struct pid_namespace *ns; rcu_read_lock(); - ns = get_pid_ns(task_active_pid_ns(task)); + ns = task_active_pid_ns(task); + if (ns) + get_pid_ns(ns); rcu_read_unlock(); return ns; -- cgit v1.2.3 From 81c98869faa5f3a9457c93efef908ef476326b31 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 3 Apr 2014 14:46:25 -0700 Subject: kthread: ensure locality of task_struct allocations In the presence of memoryless nodes, numa_node_id() will return the current CPU's NUMA node, but that may not be where we expect to allocate from memory from. Instead, we should rely on the fallback code in the memory allocator itself, by using NUMA_NO_NODE. Also, when calling kthread_create_on_node(), use the nearest node with memory to the cpu in question, rather than the node it is running on. 
Signed-off-by: Nishanth Aravamudan Reviewed-by: Christoph Lameter Acked-by: David Rientjes Cc: Anton Blanchard Cc: Tejun Heo Cc: Oleg Nesterov Cc: Jan Kara Cc: Thomas Gleixner Cc: Tetsuo Handa Cc: Wanpeng Li Cc: Joonsoo Kim Cc: Ben Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index b5ae3ee860a9..9a130ec06f7a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -217,7 +217,7 @@ int tsk_fork_get_node(struct task_struct *tsk) if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif - return numa_node_id(); + return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) @@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), { struct task_struct *p; - p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, + p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, cpu); if (IS_ERR(p)) return p; -- cgit v1.2.3 From 62572e29bc530b38921ef6059088b4788a9832a5 Mon Sep 17 00:00:00 2001 From: Ben Zhang Date: Thu, 3 Apr 2014 14:47:18 -0700 Subject: kernel/watchdog.c: touch_nmi_watchdog should only touch local cpu not every one I ran into a scenario where while one cpu was stuck and should have panic'd because of the NMI watchdog, it didn't. The reason was another cpu was spewing stack dumps on to the console. Upon investigation, I noticed that when writing to the console and also when dumping the stack, the watchdog is touched. This causes all the cpus to reset their NMI watchdog flags and the 'stuck' cpu just spins forever. This change causes the semantics of touch_nmi_watchdog to be changed slightly. Previously, I accidentally changed the semantics and we noticed there was a codepath in which touch_nmi_watchdog could be touched from a preemtible area. 
That caused a BUG() to happen when CONFIG_DEBUG_PREEMPT was enabled. I believe it was the acpi code. My attempt here re-introduces the change to have the touch_nmi_watchdog() code only touch the local cpu instead of all of the cpus. But instead of using __get_cpu_var(), I use the __raw_get_cpu_var() version. This avoids the preemption problem. However my reasoning wasn't because I was trying to be lazy. Instead I rationalized it as, well if preemption is enabled then interrupts should be enabled too and the NMI watchdog will have no reason to trigger. So it won't matter if the wrong cpu is touched because the percpu interrupt counters the NMI watchdog uses should still be incrementing. Don said: : I'm ok with this patch, though it does alter the behaviour of how : touch_nmi_watchdog works. For the most part I don't think most callers : need to touch all of the watchdogs (on each cpu). Perhaps a corner case : will pop up (the scheduler?? to mimic touch_all_softlockup_watchdogs() ). : : But this does address an issue where if a system is locked up and one cpu : is spewing out useful debug messages (or error messages), the hard lockup : will fail to go off. We have seen this on RHEL also. Signed-off-by: Don Zickus Signed-off-by: Ben Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 01c6f979486f..e90089fd78e0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -158,14 +158,14 @@ void touch_all_softlockup_watchdogs(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { - if (watchdog_user_enabled) { - unsigned cpu; - - for_each_present_cpu(cpu) { - if (per_cpu(watchdog_nmi_touch, cpu) != true) - per_cpu(watchdog_nmi_touch, cpu) = true; - } - } + /* + * Using __raw here because some code paths have + * preemption enabled. 
If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + __raw_get_cpu_var(watchdog_nmi_touch) = true; touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); -- cgit v1.2.3 From d26914d11751b23ca2e8747725f2cae10c2f2c1b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 3 Apr 2014 14:47:24 -0700 Subject: mm: optimize put_mems_allowed() usage Since put_mems_allowed() is strictly optional, its a seqcount retry, we don't need to evaluate the function if the allocation was in fact successful, saving a smp_rmb some loads and comparisons on some relative fast-paths. Since the naming, get/put_mems_allowed() does suggest a mandatory pairing, rename the interface, as suggested by Mel, to resemble the seqcount interface. This gives us: read_mems_allowed_begin() and read_mems_allowed_retry(), where it is important to note that the return value of the latter call is inverted from its previous incarnation. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e6b1b66afe52..f6fc7475f1a1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1022,7 +1022,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_lock(tsk); /* * Determine if a loop is necessary if another thread is doing - * get_mems_allowed(). If at least one node remains unchanged and + * read_mems_allowed_begin(). If at least one node remains unchanged and * tsk does not have a mempolicy, then an empty nodemask will not be * possible when mems_allowed is larger than a word. 
*/ -- cgit v1.2.3 From 5509a5d27b971a90b940e148ca9ca53312e4fa7a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 3 Apr 2014 14:48:19 -0700 Subject: drop_caches: add some documentation and info message There is plenty of anecdotal evidence and a load of blog posts suggesting that using "drop_caches" periodically keeps your system running in "tip top shape". Perhaps adding some kernel documentation will increase the amount of accurate data on its use. If we are not shrinking caches effectively, then we have real bugs. Using drop_caches will simply mask the bugs and make them harder to find, but certainly does not fix them, nor is it an appropriate "workaround" to limit the size of the caches. On the contrary, there have been bug reports on issues that turned out to be misguided use of cache dropping. Dropping caches is a very drastic and disruptive operation that is good for debugging and running tests, but if it creates bug reports from production use, kernel developers should be aware of its use. Add a bit more documentation about it, a syslog message to track down abusers, and vmstat drop counters to help analyze problem reports. 
[akpm@linux-foundation.org: checkpatch fixes] [hannes@cmpxchg.org: add runtime suppression control] Signed-off-by: Dave Hansen Signed-off-by: Michal Hocko Acked-by: KOSAKI Motohiro Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09d2e2413605..5c14b547882e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -123,7 +123,7 @@ static int __maybe_unused neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; -static int __maybe_unused three = 3; +static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; #ifdef CONFIG_PRINTK @@ -1264,7 +1264,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = drop_caches_sysctl_handler, .extra1 = &one, - .extra2 = &three, + .extra2 = &four, }, #ifdef CONFIG_COMPACTION { -- cgit v1.2.3 From 6af9f7bf3c399e0ab1eee048e13572c6d4e15fe9 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:48:25 -0700 Subject: sys_sysfs: Add CONFIG_SYSFS_SYSCALL sys_sysfs is an obsolete system call no longer supported by libc. - This patch adds a default CONFIG_SYSFS_SYSCALL=y - Option can be turned off in expert mode. 
- cond_syscall added to kernel/sys_ni.c [akpm@linux-foundation.org: tweak Kconfig help text] Signed-off-by: Fabian Frederick Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7078052284fd..74395a95b7e9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -146,6 +146,7 @@ cond_syscall(sys_io_destroy); cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); +cond_syscall(sys_sysfs); cond_syscall(sys_syslog); cond_syscall(sys_process_vm_readv); cond_syscall(sys_process_vm_writev); -- cgit v1.2.3 From 8f6c5ffc8987f4f5b5a3e9d557d94bbf3a9bf216 Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Thu, 3 Apr 2014 14:48:26 -0700 Subject: kernel/groups.c: remove return value of set_groups After commit 6307f8fee295 ("security: remove dead hook task_setgroups"), set_groups will always return zero, so we could just remove return value of set_groups. This patch reduces code size, and simplifies code to use set_groups, because we don't need to check its return value any more. [akpm@linux-foundation.org: remove obsolete claims from set_groups() comment] Signed-off-by: Wang YanQing Cc: "Eric W. Biederman" Cc: Serge Hallyn Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/groups.c b/kernel/groups.c index 90cf1c38c8ea..451698f86cfa 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -157,17 +157,13 @@ int groups_search(const struct group_info *group_info, kgid_t grp) * set_groups - Change a group subscription in a set of credentials * @new: The newly prepared set of credentials to alter * @group_info: The group list to install - * - * Validate a group subscription and, if valid, insert it into a set - * of credentials. 
*/ -int set_groups(struct cred *new, struct group_info *group_info) +void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; - return 0; } EXPORT_SYMBOL(set_groups); @@ -182,18 +178,12 @@ EXPORT_SYMBOL(set_groups); int set_current_groups(struct group_info *group_info) { struct cred *new; - int ret; new = prepare_creds(); if (!new) return -ENOMEM; - ret = set_groups(new, group_info); - if (ret < 0) { - abort_creds(new); - return ret; - } - + set_groups(new, group_info); return commit_creds(new); } -- cgit v1.2.3 From 69369a7003735d0d8ef22097e27a55a8bad9557a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 3 Apr 2014 14:48:27 -0700 Subject: fs, kernel: permit disabling the uselib syscall uselib hasn't been used since libc5; glibc does not use it. Support turning it off. When disabled, also omit the load_elf_library implementation from binfmt_elf.c, which only uselib invokes. bloat-o-meter: add/remove: 0/4 grow/shrink: 0/1 up/down: 0/-785 (-785) function old new delta padzero 39 36 -3 uselib_flags 20 - -20 sys_uselib 168 - -168 SyS_uselib 168 - -168 load_elf_library 426 - -426 The new CONFIG_USELIB defaults to `y'. 
Signed-off-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 74395a95b7e9..bc8d1b74a6b9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -152,6 +152,7 @@ cond_syscall(sys_process_vm_readv); cond_syscall(sys_process_vm_writev); cond_syscall(compat_sys_process_vm_readv); cond_syscall(compat_sys_process_vm_writev); +cond_syscall(sys_uselib); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.3 From c96d6660dc65b0a90aea9834bfd8be1d5656da18 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 3 Apr 2014 14:48:35 -0700 Subject: kernel: audit/fix non-modular users of module_init in core code Code that is obj-y (always built-in) or dependent on a bool Kconfig (built-in or absent) can never be modular. So using module_init as an alias for __initcall can be somewhat misleading. Fix these up now, so that we can relocate module_init from init.h into module.h in the future. If we don't do this, we'd have to add module.h to obviously non-modular code, and that would be a worse thing. The audit targets the following module_init users for change: kernel/user.c obj-y kernel/kexec.c bool KEXEC (one instance per arch) kernel/profile.c bool PROFILING kernel/hung_task.c bool DETECT_HUNG_TASK kernel/sched/stats.c bool SCHEDSTATS kernel/user_namespace.c bool USER_NS Note that direct use of __initcall is discouraged, vs. one of the priority categorized subgroups. As __initcall gets mapped onto device_initcall, our use of subsys_initcall (which makes sense for these files) will thus change this registration from level 6-device to level 4-subsys (i.e. slightly earlier). However no observable impact of that difference has been observed during testing. Also, two instances of missing ";" at EOL are fixed in kexec. 
Signed-off-by: Paul Gortmaker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 3 +-- kernel/kexec.c | 4 ++-- kernel/profile.c | 2 +- kernel/sched/stats.c | 2 +- kernel/user.c | 3 +-- kernel/user_namespace.c | 2 +- 6 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0b9c169d577f..06bb1417b063 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -246,5 +246,4 @@ static int __init hung_task_init(void) return 0; } - -module_init(hung_task_init); +subsys_initcall(hung_task_init); diff --git a/kernel/kexec.c b/kernel/kexec.c index 45601cf41bee..c0d261c7db7b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1235,7 +1235,7 @@ static int __init crash_notes_memory_init(void) } return 0; } -module_init(crash_notes_memory_init) +subsys_initcall(crash_notes_memory_init); /* @@ -1629,7 +1629,7 @@ static int __init crash_save_vmcoreinfo_init(void) return 0; } -module_init(crash_save_vmcoreinfo_init) +subsys_initcall(crash_save_vmcoreinfo_init); /* * Move into place and start executing a preloaded standalone diff --git a/kernel/profile.c b/kernel/profile.c index ebdd9c1a86b4..1b266dbe755a 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -604,5 +604,5 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ hotcpu_notifier(profile_cpu_callback, 0); return 0; } -module_init(create_proc_profile); +subsys_initcall(create_proc_profile); #endif /* CONFIG_PROC_FS */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index da98af347e8b..a476bea17fbc 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void) proc_create("schedstat", 0, NULL, &proc_schedstat_operations); return 0; } -module_init(proc_schedstat_init); +subsys_initcall(proc_schedstat_init); diff --git a/kernel/user.c b/kernel/user.c index 
c006131beb77..294fc6a94168 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -222,5 +222,4 @@ static int __init uid_cache_init(void) return 0; } - -module_init(uid_cache_init); +subsys_initcall(uid_cache_init); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index dd06439b9c84..0d8f6023fd8d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -902,4 +902,4 @@ static __init int user_namespaces_init(void) user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); return 0; } -module_init(user_namespaces_init); +subsys_initcall(user_namespaces_init); -- cgit v1.2.3 From 28ab49ff7f3dcaf4df8d2bd0d4099b8c08285ed7 Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Thu, 3 Apr 2014 14:48:36 -0700 Subject: kernel/resource.c: make reallocate_resource() static sparse says: kernel/resource.c:518:5: warning: symbol 'reallocate_resource' was not declared. Should it be static? Signed-off-by: Daeseok Youn Reviewed-by: Yasuaki Ishimatsu Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 673061c06da1..8957d686e29b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -511,7 +511,7 @@ static int find_resource(struct resource *root, struct resource *new, * @newsize: new size of the resource descriptor * @constraint: the size and alignment constraints to be met. 
*/ -int reallocate_resource(struct resource *root, struct resource *old, +static int reallocate_resource(struct resource *root, struct resource *old, resource_size_t newsize, struct resource_constraint *constraint) { -- cgit v1.2.3 From c64730b26f08cccfbc8fcbf169c304b4bd71dcac Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:39 -0700 Subject: printk: remove obsolete check for log level "c" The kernel log level "c" was removed in commit 61e99ab8e35a ("printk: remove the now unnecessary "C" annotation for KERN_CONT"). It is no longer detected in printk_get_level(). Hence we do not need to check it in vprintk_emit. Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4dae9cbe9259..db7a02e05241 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1560,8 +1560,6 @@ asmlinkage int vprintk_emit(int facility, int level, level = kern_level - '0'; case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; - case 'c': /* KERN_CONT */ - break; } text_len -= end_of_header - text; text = (char *)end_of_header; -- cgit v1.2.3 From e8c42d36ab86cf45f88c3a0e344233b1032fbf3d Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:41 -0700 Subject: printk: add comment about tricky check for text buffer size There is no check for potential "text_len" overflow. It is not needed because only valid level is detected. It took me some time to understand why. 
It would deserve a comment ;-) Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index db7a02e05241..012f3e40671d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1561,6 +1561,11 @@ asmlinkage int vprintk_emit(int facility, int level, case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; } + /* + * No need to check length here because vscnprintf + * put '\0' at the end of the string. Only valid and + * newly printed level is detected. + */ text_len -= end_of_header - text; text = (char *)end_of_header; } -- cgit v1.2.3 From 39b25109b400ea397e64c417d8b965a53e2ee0f0 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:42 -0700 Subject: printk: use also the last bytes in the ring buffer It seems that we have never used the last byte in the ring buffer. In fact, we have never used the last 4 bytes because of padding. First problem is in the check for free space. The exact number of free bytes is enough to store the length of data. Second problem is in the check where the ring buffer is rotated. The left side counts the first unused index. It is unused, so it might be the same as the size of the buffer. Note that the first problem has to be fixed together with the second one. Otherwise, the buffer is rotated even when there is enough space on the end of the buffer. Then the beginning of the buffer is rewritten and valid entries get corrupted. 
Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 012f3e40671d..b3a1790f9e05 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -319,7 +319,7 @@ static void log_store(int facility, int level, else free = log_first_idx - log_next_idx; - if (free > size + sizeof(struct printk_log)) + if (free >= size + sizeof(struct printk_log)) break; /* drop old messages until we have enough contiuous space */ @@ -327,7 +327,7 @@ static void log_store(int facility, int level, log_first_seq++; } - if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) { + if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* * This message + an additional empty header does not fit * at the end of the buffer. Add an empty header with len == 0 -- cgit v1.2.3 From fce6e0338abe910ba6d4db0657ae8adc6aa1a72b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:43 -0700 Subject: printk: do not compute the size of the message twice This is just a tiny optimization. It removes duplicate computation of the message size. 
Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b3a1790f9e05..ff9faf4e3cd5 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -351,7 +351,7 @@ static void log_store(int facility, int level, else msg->ts_nsec = local_clock(); memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len; + msg->len = size; /* insert message */ log_next_idx += msg->len; -- cgit v1.2.3 From 72581487a61f6304a7cc32e189eb65fb1c920a53 Mon Sep 17 00:00:00 2001 From: Jane Li Date: Thu, 3 Apr 2014 14:48:45 -0700 Subject: printk: fix one circular lockdep warning about console_lock Fix a warning about possible circular locking dependency. If do in following sequence: enter suspend -> resume -> plug-out CPUx (echo 0 > cpux/online) lockdep will show warning as following: ====================================================== [ INFO: possible circular locking dependency detected ] 3.10.0 #2 Tainted: G O ------------------------------------------------------- sh/1271 is trying to acquire lock: (console_lock){+.+.+.}, at: console_cpu_notify+0x20/0x2c but task is already holding lock: (cpu_hotplug.lock){+.+.+.}, at: cpu_hotplug_begin+0x2c/0x58 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (cpu_hotplug.lock){+.+.+.}: lock_acquire+0x98/0x12c mutex_lock_nested+0x50/0x3d8 cpu_hotplug_begin+0x2c/0x58 _cpu_up+0x24/0x154 cpu_up+0x64/0x84 smp_init+0x9c/0xd4 kernel_init_freeable+0x78/0x1c8 kernel_init+0x8/0xe4 ret_from_fork+0x14/0x2c -> #1 (cpu_add_remove_lock){+.+.+.}: lock_acquire+0x98/0x12c mutex_lock_nested+0x50/0x3d8 disable_nonboot_cpus+0x8/0xe8 suspend_devices_and_enter+0x214/0x448 pm_suspend+0x1e4/0x284 try_to_suspend+0xa4/0xbc process_one_work+0x1c4/0x4fc worker_thread+0x138/0x37c kthread+0xa4/0xb0 ret_from_fork+0x14/0x2c -> #0 (console_lock){+.+.+.}: __lock_acquire+0x1b38/0x1b80 lock_acquire+0x98/0x12c console_lock+0x54/0x68 console_cpu_notify+0x20/0x2c notifier_call_chain+0x44/0x84 __cpu_notify+0x2c/0x48 cpu_notify_nofail+0x8/0x14 _cpu_down+0xf4/0x258 cpu_down+0x24/0x40 store_online+0x30/0x74 dev_attr_store+0x18/0x24 sysfs_write_file+0x16c/0x19c vfs_write+0xb4/0x190 SyS_write+0x3c/0x70 ret_fast_syscall+0x0/0x48 Chain exists of: console_lock --> cpu_add_remove_lock --> cpu_hotplug.lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(cpu_hotplug.lock); lock(cpu_add_remove_lock); lock(cpu_hotplug.lock); lock(console_lock); *** DEADLOCK *** There are three locks involved in two sequence: a) pm suspend: console_lock (@suspend_console()) cpu_add_remove_lock (@disable_nonboot_cpus()) cpu_hotplug.lock (@_cpu_down()) b) Plug-out CPUx: cpu_add_remove_lock (@(cpu_down()) cpu_hotplug.lock (@_cpu_down()) console_lock (@console_cpu_notify()) => Lockdeps prints warning log. There should be not real deadlock, as flag of console_suspended can protect this. Although console_suspend() releases console_sem, it doesn't tell lockdep about it. 
That results in the lockdep warning about circular locking when doing the following: enter suspend -> resume -> plug-out CPUx (echo 0 > cpux/online) Fix the problem by telling lockdep we actually released the semaphore in console_suspend() and acquired it again in console_resume(). Signed-off-by: Jane Li Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ff9faf4e3cd5..a45b50962295 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1883,6 +1883,7 @@ void suspend_console(void) console_lock(); console_suspended = 1; up(&console_sem); + mutex_release(&console_lock_dep_map, 1, _RET_IP_); } void resume_console(void) @@ -1890,6 +1891,7 @@ void resume_console(void) if (!console_suspend_enabled) return; down(&console_sem); + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); console_suspended = 0; console_unlock(); } -- cgit v1.2.3 From c6b3d5bcd67c75961a1e8b9564d1475c0f194a84 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 4 Apr 2014 17:14:41 +0800 Subject: cgroup: fix top cgroup refcnt leak As mount() and kill_sb() are not a one-to-one match, if we mount the same cgroupfs in several mount points, and then umount all of them, kill_sb() will be called only once. Try: # mount -t cgroup -o cpuacct xxx /cgroup # mount -t cgroup -o cpuacct xxx /cgroup2 # cat /proc/cgroups | grep cpuacct cpuacct 2 1 1 # umount /cgroup # umount /cgroup2 # cat /proc/cgroups | grep cpuacct cpuacct 2 1 1 You'll see cgroupfs will never be freed. 
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fede3d3f28ff..0dfc7324c789 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1487,6 +1487,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroup_sb_opts opts; struct dentry *dentry; int ret; + bool new_sb; /* * The first time anyone tries to mount a cgroup, enable the list @@ -1603,8 +1604,8 @@ out_unlock: if (ret) return ERR_PTR(ret); - dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL); - if (IS_ERR(dentry)) + dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); + if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); return dentry; } -- cgit v1.2.3 From b8780c363d808a726a34793caa900923d32b6b80 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 7 Apr 2014 17:33:06 +0200 Subject: sched: remove sleep_on() and friends This is the final piece in the puzzle, as all patches to remove the last users of \(interruptible_\|\)sleep_on\(_timeout\|\) have made it into the 3.15 merge window. The work was long overdue, and this interface in particular should not have survived the BKL removal that was done a couple of years ago. Citing Jon Corbet from http://lwn.net/2001/0201/kernel.php3": "[...] it was suggested that the janitors look for and fix all code that calls sleep_on() [...] since (1) almost all such code is incorrect, and (2) Linus has agreed that those functions should be removed in the 2.5 development series". We haven't quite made it for 2.5, but maybe we can merge this for 3.15. 
Signed-off-by: Arnd Bergmann Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 46 ---------------------------------------------- 1 file changed, 46 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1d1b87b36778..0ff3f34bc7e3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2845,52 +2845,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, } EXPORT_SYMBOL(default_wake_function); -static long __sched -sleep_on_common(wait_queue_head_t *q, int state, long timeout) -{ - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - __set_current_state(state); - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, &wait); - spin_unlock(&q->lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&q->lock); - __remove_wait_queue(q, &wait); - spin_unlock_irqrestore(&q->lock, flags); - - return timeout; -} - -void __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(interruptible_sleep_on); - -long __sched -interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void __sched sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(sleep_on); - -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(sleep_on_timeout); - #ifdef CONFIG_RT_MUTEXES /* -- cgit v1.2.3 From 49957f8e2a43035a97d05bddefa394492a969c0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Apr 2014 16:44:47 -0400 Subject: cgroup: newly created dirs and files should be owned by the creator While converting cgroup to kernfs, 2bd59d48ebfb ("cgroup: convert to kernfs") accidentally 
dropped the logic which makes newly created cgroup dirs and files owned by the current uid / gid. This broke cases where cgroup subtree management is delegated to !root as the sub manager wouldn't be able to create more than single level of hierarchy or put tasks into child cgroups it created. Among other things, this breaks user session management in systemd and one of the symptoms was 90s hang during shutdown. User session systemd running as the user creates a sub-service to initiate shutdown and tries to put kill(1) into it but fails because cgroup.procs is owned by root. This leads to 90s hang during shutdown. Implement cgroup_kn_set_ugid() which sets a kn's uid and gid to those of the caller and use it from file and dir creation paths. Signed-off-by: Tejun Heo Reported-by: Linus Torvalds --- kernel/cgroup.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0dfc7324c789..9fcdaa705b6c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2346,11 +2346,26 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, return ret; } +/* set uid and gid of cgroup dirs and files to that of the creator */ +static int cgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; + int ret; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; @@ -2358,7 +2373,13 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), 0, 
cft->kf_ops, cft, NULL, false, key); - return PTR_ERR_OR_ZERO(kn); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = cgroup_kn_set_ugid(kn); + if (ret) + kernfs_remove(kn); + return ret; } /** @@ -3753,6 +3774,10 @@ static long cgroup_create(struct cgroup *parent, const char *name, */ idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + err = cgroup_kn_set_ugid(kn); + if (err) + goto err_destroy; + err = cgroup_addrm_files(cgrp, cgroup_base_files, true); if (err) goto err_destroy; -- cgit v1.2.3 From a0715cc22601e8830ace98366c0c2bd8da52af52 Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Mon, 7 Apr 2014 15:37:10 -0700 Subject: mm, thp: add VM_INIT_DEF_MASK and PRCTL_THP_DISABLE Add VM_INIT_DEF_MASK, to allow us to set the default flags for VMs. It also adds a prctl control which allows us to set the THP disable bit in mm->def_flags so that VMs will pick up the setting as they are created. Signed-off-by: Alex Thorlton Suggested-by: Oleg Nesterov Cc: Gerald Schaefer Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Christian Borntraeger Cc: Paolo Bonzini Cc: "Kirill A. Shutemov" Cc: Mel Gorman Acked-by: Rik van Riel Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: Johannes Weiner Cc: David Rientjes Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 ++++++++--- kernel/sys.c | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index abc45890f0a5..e40c0a01d5a6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -530,8 +530,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? 
- (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); @@ -540,8 +538,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm_init_owner(mm, p); clear_tlb_flush_pending(mm); - if (likely(!mm_alloc_pgd(mm))) { + if (current->mm) { + mm->flags = current->mm->flags & MMF_INIT_MASK; + mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; + } else { + mm->flags = default_dump_filter; mm->def_flags = 0; + } + + if (likely(!mm_alloc_pgd(mm))) { mmu_notifier_mm_init(mm); return mm; } diff --git a/kernel/sys.c b/kernel/sys.c index adaeab6f7a87..fba0f29401ea 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1996,6 +1996,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return current->no_new_privs ? 1 : 0; + case PR_GET_THP_DISABLE: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + break; + case PR_SET_THP_DISABLE: + if (arg3 || arg4 || arg5) + return -EINVAL; + down_write(&me->mm->mmap_sem); + if (arg2) + me->mm->def_flags |= VM_NOHUGEPAGE; + else + me->mm->def_flags &= ~VM_NOHUGEPAGE; + up_write(&me->mm->mmap_sem); + break; default: error = -EINVAL; break; -- cgit v1.2.3 From 615d6e8756c87149f2d4c1b93d471bca002bd849 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 7 Apr 2014 15:37:25 -0700 Subject: mm: per-thread vma caching This patch is a continuation of efforts trying to optimize find_vma(), avoiding potentially expensive rbtree walks to locate a vma upon faults. The original approach (https://lkml.org/lkml/2013/11/1/410), where the largest vma was also cached, ended up being too specific and random, thus further comparison with other approaches were needed. There are two things to consider when dealing with this, the cache hit rate and the latency of find_vma(). 
Improving the hit-rate does not necessarily translate in finding the vma any faster, as the overhead of any fancy caching schemes can be too high to consider. We currently cache the last used vma for the whole address space, which provides a nice optimization, reducing the total cycles in find_vma() by up to 250%, for workloads with good locality. On the other hand, this simple scheme is pretty much useless for workloads with poor locality. Analyzing ebizzy runs shows that, no matter how many threads are running, the mmap_cache hit rate is less than 2%, and in many situations below 1%. The proposed approach is to replace this scheme with a small per-thread cache, maximizing hit rates at a very low maintenance cost. Invalidations are performed by simply bumping up a 32-bit sequence number. The only expensive operation is in the rare case of a seq number overflow, where all caches that share the same address space are flushed. Upon a miss, the proposed replacement policy is based on the page number that contains the virtual address in question. Concretely, the following results are seen on an 80 core, 8 socket x86-64 box: 1) System bootup: Most programs are single threaded, so the per-thread scheme does improve ~50% hit rate by just adding a few more slots to the cache. +----------------+----------+------------------+ | caching scheme | hit-rate | cycles (billion) | +----------------+----------+------------------+ | baseline | 50.61% | 19.90 | | patched | 73.45% | 13.58 | +----------------+----------+------------------+ 2) Kernel build: This one is already pretty good with the current approach as we're dealing with good locality. +----------------+----------+------------------+ | caching scheme | hit-rate | cycles (billion) | +----------------+----------+------------------+ | baseline | 75.28% | 11.03 | | patched | 88.09% | 9.31 | +----------------+----------+------------------+ 3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload. 
+----------------+----------+------------------+ | caching scheme | hit-rate | cycles (billion) | +----------------+----------+------------------+ | baseline | 70.66% | 17.14 | | patched | 91.15% | 12.57 | +----------------+----------+------------------+ 4) Ebizzy: There's a fair amount of variation from run to run, but this approach always shows nearly perfect hit rates, while baseline is just about non-existent. The amounts of cycles can fluctuate between anywhere from ~60 to ~116 for the baseline scheme, but this approach reduces it considerably. For instance, with 80 threads: +----------------+----------+------------------+ | caching scheme | hit-rate | cycles (billion) | +----------------+----------+------------------+ | baseline | 1.06% | 91.54 | | patched | 99.97% | 14.18 | +----------------+----------+------------------+ [akpm@linux-foundation.org: fix nommu build, per Davidlohr] [akpm@linux-foundation.org: document vmacache_valid() logic] [akpm@linux-foundation.org: attempt to untangle header files] [akpm@linux-foundation.org: add vmacache_find() BUG_ON] [hughd@google.com: add vmacache_valid_mm() (from Oleg)] [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: adjust and enhance comments] Signed-off-by: Davidlohr Bueso Reviewed-by: Rik van Riel Acked-by: Linus Torvalds Reviewed-by: Michel Lespinasse Cc: Oleg Nesterov Tested-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/debug/debug_core.c | 14 +++++++++++--- kernel/fork.c | 7 ++++++- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 99982a70ddad..2956c8da1605 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm && 
current->mm->mmap_cache) { - flush_cache_range(current->mm->mmap_cache, - addr, addr + BREAK_INSTR_SIZE); + if (current->mm) { + int i; + + for (i = 0; i < VMACACHE_SIZE; i++) { + if (!current->vmacache[i]) + continue; + flush_cache_range(current->vmacache[i], + addr, addr + BREAK_INSTR_SIZE); + } } + /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } diff --git a/kernel/fork.c b/kernel/fork.c index e40c0a01d5a6..bc0e96b78dfd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include #include @@ -364,7 +366,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; - mm->mmap_cache = NULL; + mm->vmacache_seqnum = 0; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (!oldmm) return 0; + /* initialize the new vmacache entries */ + vmacache_flush(tsk); + if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); mm = oldmm; -- cgit v1.2.3 From 514ddb446c5c5a238eca32b7052b7a8accae4e93 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:27 -0700 Subject: fork: collapse copy_flags into copy_process copy_flags() does not use the clone_flags formal and can be collapsed into copy_process() for cleaner code. 
Signed-off-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bc0e96b78dfd..c777964c0662 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1080,15 +1080,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static void copy_flags(unsigned long clone_flags, struct task_struct *p) -{ - unsigned long new_flags = p->flags; - - new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); - new_flags |= PF_FORKNOEXEC; - p->flags = new_flags; -} - SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@ -1238,7 +1229,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - copy_flags(clone_flags, p); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); -- cgit v1.2.3 From f0432d159601f96839f514f286eaa5b75c4112dc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:30 -0700 Subject: mm, mempolicy: remove per-process flag PF_MEMPOLICY is an unnecessary optimization for CONFIG_SLAB users. There's no significant performance degradation to checking current->mempolicy rather than current->flags & PF_MEMPOLICY in the allocation path, especially since this is considered unlikely(). 
Running TCP_RR with netperf-2.4.5 through localhost on 16 cpu machine with 64GB of memory and without a mempolicy: threads before after 16 1249409 1244487 32 1281786 1246783 48 1239175 1239138 64 1244642 1241841 80 1244346 1248918 96 1266436 1254316 112 1307398 1312135 128 1327607 1326502 Per-process flags are a scarce resource so we should free them up whenever possible and make them available. We'll be using it shortly for memcg oom reserves. Signed-off-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c777964c0662..e905e9c6b224 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1276,7 +1276,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->mempolicy = NULL; goto bad_fork_cleanup_threadgroup_lock; } - mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; -- cgit v1.2.3 From 539a13b47e462d28c48f076c63871580f694a366 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:32 -0700 Subject: res_counter: remove interface for locked charging and uncharging The res_counter_{charge,uncharge}_locked() variants are not used in the kernel outside of the resource counter code itself, so remove the interface. 
Signed-off-by: David Rientjes Acked-by: Michal Hocko Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/res_counter.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 4aa8a305aede..51dbac6a3633 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -22,8 +22,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) counter->parent = parent; } -int res_counter_charge_locked(struct res_counter *counter, unsigned long val, - bool force) +static u64 res_counter_uncharge_locked(struct res_counter *counter, + unsigned long val) +{ + if (WARN_ON(counter->usage < val)) + val = counter->usage; + + counter->usage -= val; + return counter->usage; +} + +static int res_counter_charge_locked(struct res_counter *counter, + unsigned long val, bool force) { int ret = 0; @@ -86,15 +96,6 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; - return counter->usage; -} - u64 res_counter_uncharge_until(struct res_counter *counter, struct res_counter *top, unsigned long val) -- cgit v1.2.3 From c39df5fa37b0623589508c95515b4aa1531c524e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:29 -0700 Subject: exit: call disassociate_ctty() before exit_task_namespaces() Commit 8aac62706ada ("move exit_task_namespaces() outside of exit_notify()") breaks pppd and the exiting service crashes the kernel: BUG: unable to handle kernel NULL pointer dereference 
at 0000000000000028 IP: ppp_register_channel+0x13/0x20 [ppp_generic] Call Trace: ppp_asynctty_open+0x12b/0x170 [ppp_async] tty_ldisc_open.isra.2+0x27/0x60 tty_ldisc_hangup+0x1e3/0x220 __tty_hangup+0x2c4/0x440 disassociate_ctty+0x61/0x270 do_exit+0x7f2/0xa50 ppp_register_channel() needs ->net_ns and current->nsproxy == NULL. Move disassociate_ctty() before exit_task_namespaces(), it doesn't make sense to delay it after perf_event_exit_task() or cgroup_exit(). This also allows to use task_work_add() inside the (nontrivial) code paths in disassociate_ctty(). Investigated by Peter Hurley. Signed-off-by: Oleg Nesterov Reported-by: Sree Harsha Totakura Cc: Peter Hurley Cc: Sree Harsha Totakura Cc: "Eric W. Biederman" Cc: Jeff Dike Cc: Ingo Molnar Cc: Andrey Vagin Cc: Al Viro Cc: [v3.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 6480d1c85d7a..11f9e39a7368 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -784,6 +784,8 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + if (group_dead) + disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); check_stack_usage(); @@ -799,13 +801,9 @@ void do_exit(long code) cgroup_exit(tsk); - if (group_dead) - disassociate_ctty(1); - module_put(task_thread_info(tsk)->exec_domain->module); proc_exit_connector(tsk); - /* * FIXME: do that only when needed, using sched_exit tracepoint */ -- cgit v1.2.3 From 4bcb8232cf4eb061b086c10f56b6808adcdb5a93 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:30 -0700 Subject: exit: move check_stack_usage() to the end of do_exit() It is not clear why check_stack_usage() is called so early and thus it never checks the stack usage in, say, exit_notify() or flush_ptrace_hw_breakpoint() or other functions which are only called by do_exit(). 
Move the callsite down to the last preempt_disable/schedule. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 11f9e39a7368..171c9a9d7b00 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -788,7 +788,6 @@ void do_exit(long code) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - check_stack_usage(); exit_thread(); /* @@ -842,6 +841,7 @@ void do_exit(long code) validate_creds_for_do_exit(tsk); + check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); -- cgit v1.2.3 From ef9823939e5acd5d323ff61fbc427ef998dd203e Mon Sep 17 00:00:00 2001 From: Guillaume Morin Date: Mon, 7 Apr 2014 15:38:31 -0700 Subject: kernel/exit.c: call proc_exit_connector() after exit_state is set The process events connector delivers a notification when a process exits. This is really convenient for a process that spawns and wants to monitor its children through an epoll-able() interface. Unfortunately, there is a small window between when the event is delivered and the child becomes wait()-able. This creates a race if the parent wants to make sure that it knows about the exit, e.g.: pid_t pid = fork(); if (pid > 0) { register_interest_for_pid(pid); if (waitpid(pid, NULL, WNOHANG) > 0) { /* We might have raced with exit() */ } return; } /* Child */ execve(...) register_interest_for_pid() would be telling the connector socket reader to pay attention to events related to pid. Though this is not a bug, I think it would make the connector a bit more usable if this race was closed by simply moving the call to proc_exit_connector() from just before exit_notify() to right after. Oleg said: : Even with this patch the code above is still "racy" if the child is : multi-threaded. Plus it should obviously filter-out subthreads. 
And : afaics there is no way to make it reliable, even if you change the code : above so that waitpid() is called only after the last thread exits WNOHANG : still can fail. Signed-off-by: Guillaume Morin Cc: Matt Helsley Cc: Oleg Nesterov Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 171c9a9d7b00..decf648574f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -802,13 +802,13 @@ void do_exit(long code) module_put(task_thread_info(tsk)->exec_domain->module); - proc_exit_connector(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); + proc_exit_connector(tsk); #ifdef CONFIG_NUMA task_lock(tsk); mpol_put(tsk->mempolicy); -- cgit v1.2.3 From dfccbb5e49a621c1b21a62527d61fc4305617aca Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:41 -0700 Subject: wait: fix reparent_leader() vs EXIT_DEAD->EXIT_ZOMBIE race wait_task_zombie() first does EXIT_ZOMBIE->EXIT_DEAD transition and drops tasklist_lock. If this task is not the natural child and it is traced, we change its state back to EXIT_ZOMBIE for ->real_parent. The last transition is racy, this is even documented in 50b8d257486a "ptrace: partially fix the do_wait(WEXITED) vs EXIT_DEAD->EXIT_ZOMBIE race". wait_consider_task() tries to detect this transition and clear ->notask_error but we can't rely on ptrace_reparented(), debugger can exit and do ptrace_unlink() before its sub-thread sets EXIT_ZOMBIE. And there is another problem which was missed before: this transition can also race with reparent_leader() which doesn't reset ->exit_signal if EXIT_DEAD, assuming that this task must be reaped by someone else. So the tracee can be re-parented with ->exit_signal != SIGCHLD, and if /sbin/init doesn't use __WALL it becomes unreapable. 
Change reparent_leader() to update ->exit_signal even if EXIT_DEAD. Note: this is the simple temporary hack for -stable, it doesn't try to solve all problems, it will be reverted by the next changes. Signed-off-by: Oleg Nesterov Reported-by: Jan Kratochvil Reported-by: Michal Schmidt Tested-by: Michal Schmidt Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index decf648574f6..e354cbb13a9b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -560,9 +560,6 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { list_move_tail(&p->sibling, &p->real_parent->children); - - if (p->exit_state == EXIT_DEAD) - return; /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. @@ -570,9 +567,19 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (same_thread_group(p->real_parent, father)) return; - /* We don't want people slaying init. */ + /* + * We don't want people slaying init. + * + * Note: we do this even if it is EXIT_DEAD, wait_task_zombie() + * can change ->exit_state to EXIT_ZOMBIE. If this is the final + * state, do_notify_parent() was already called and ->exit_signal + * doesn't matter. + */ p->exit_signal = SIGCHLD; + if (p->exit_state == EXIT_DEAD) + return; + /* If it has exited notify the new parent about this child's death. 
*/ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { -- cgit v1.2.3 From abd50b39e783e1b6c75c7534c37f1eb2d94a89cd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:42 -0700 Subject: wait: introduce EXIT_TRACE to avoid the racy EXIT_DEAD->EXIT_ZOMBIE transition wait_task_zombie() first does EXIT_ZOMBIE->EXIT_DEAD transition and drops tasklist_lock. If this task is not the natural child and it is traced, we change its state back to EXIT_ZOMBIE for ->real_parent. The last transition is racy, this is even documented in 50b8d257486a "ptrace: partially fix the do_wait(WEXITED) vs EXIT_DEAD->EXIT_ZOMBIE race". wait_consider_task() tries to detect this transition and clear ->notask_error but we can't rely on ptrace_reparented(), debugger can exit and do ptrace_unlink() before its sub-thread sets EXIT_ZOMBIE. And there is another problem which was missed before: this transition can also race with reparent_leader() which doesn't reset ->exit_signal if EXIT_DEAD, assuming that this task must be reaped by someone else. So the tracee can be re-parented with ->exit_signal != SIGCHLD, and if /sbin/init doesn't use __WALL it becomes unreapable. This was fixed by the previous commit, but it was a temporary hack. 1. Add the new exit_state, EXIT_TRACE. It means that the task is the traced zombie, debugger is going to detach and notify its natural parent. This new state is actually EXIT_ZOMBIE | EXIT_DEAD. This way we can avoid the changes in proc/kgdb code, get_task_state() still reports "X (dead)" in this case. Note: with or without this change userspace can see Z -> X -> Z transition. Not really bad, but probably makes sense to fix. 2. Change wait_task_zombie() to use EXIT_TRACE instead of EXIT_DEAD if we need to notify the ->real_parent. 3. Revert the previous hack in reparent_leader(), now that EXIT_DEAD is always the final state we can safely ignore such a task. 4. 
Change wait_consider_task() to check EXIT_TRACE separately and kill the racy and no longer needed ptrace_reparented() case. If ptrace == T an EXIT_TRACE thread should be simply ignored, the owner of this state is going to ptrace_unlink() this task. We can pretend that it was already removed from ->ptraced list. Otherwise we should skip this thread too but clear ->notask_error, we must be the natural parent and debugger is going to untrace and notify us. IOW, this doesn't differ from "EXIT_ZOMBIE && p->ptrace" even if the task was already untraced. Signed-off-by: Oleg Nesterov Reported-by: Jan Kratochvil Reported-by: Michal Schmidt Tested-by: Michal Schmidt Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 50 +++++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e354cbb13a9b..022a0ff17318 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -560,6 +560,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { list_move_tail(&p->sibling, &p->real_parent->children); + + if (p->exit_state == EXIT_DEAD) + return; /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. @@ -567,19 +570,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (same_thread_group(p->real_parent, father)) return; - /* - * We don't want people slaying init. - * - * Note: we do this even if it is EXIT_DEAD, wait_task_zombie() - * can change ->exit_state to EXIT_ZOMBIE. If this is the final - * state, do_notify_parent() was already called and ->exit_signal - * doesn't matter. - */ + /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; - if (p->exit_state == EXIT_DEAD) - return; - /* If it has exited notify the new parent about this child's death. 
*/ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { @@ -1043,17 +1036,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return wait_noreap_copyout(wo, p, pid, uid, why, status); } + traced = ptrace_reparented(p); /* - * Try to move the task's state to DEAD - * only one thread is allowed to do this: + * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = xchg(&p->exit_state, EXIT_DEAD); - if (state != EXIT_ZOMBIE) { - BUG_ON(state != EXIT_DEAD); + state = traced ? EXIT_TRACE : EXIT_DEAD; + if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; - } - - traced = ptrace_reparented(p); /* * It can be ptraced but not reparented, check * thread_group_leader() to filter out sub-threads. @@ -1114,7 +1103,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to EXIT_DEAD. + * thread can reap it because we its state == DEAD/TRACE. */ read_unlock(&tasklist_lock); @@ -1159,14 +1148,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * If this is not a sub-thread, notify the parent. * If parent wants a zombie, don't release it now. 
*/ + state = EXIT_DEAD; if (thread_group_leader(p) && - !do_notify_parent(p, p->exit_signal)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } + !do_notify_parent(p, p->exit_signal)) + state = EXIT_ZOMBIE; + p->exit_state = state; write_unlock_irq(&tasklist_lock); } - if (p != NULL) + if (state == EXIT_DEAD) release_task(p); return retval; @@ -1362,12 +1351,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) { + if (unlikely(p->exit_state == EXIT_DEAD)) + return 0; + + if (unlikely(p->exit_state == EXIT_TRACE)) { /* - * But do not ignore this task until the tracer does - * wait_task_zombie()->do_notify_parent(). + * ptrace == 0 means we are the natural parent. In this case + * we should clear notask_error, debugger will notify us. */ - if (likely(!ptrace) && unlikely(ptrace_reparented(p))) + if (likely(!ptrace)) wo->notask_error = 0; return 0; } -- cgit v1.2.3 From b436069059fede30ca31d4bf439cc86436ff5b1d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:43 -0700 Subject: wait: use EXIT_TRACE only if thread_group_leader(zombie) wait_task_zombie() always uses EXIT_TRACE/ptrace_unlink() if ptrace_reparented(). This is suboptimal and a bit confusing: we do not need do_notify_parent(p) if !thread_group_leader(p) and in this case we also do not need ptrace_unlink(), we can rely on ptrace_release_task(). Change wait_task_zombie() to check thread_group_leader() along with ptrace_reparented() and simplify the final p->exit_state transition. 
Signed-off-by: Oleg Nesterov Tested-by: Michal Schmidt Cc: Jan Kratochvil Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 022a0ff17318..4773ed990907 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1040,7 +1040,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = traced ? EXIT_TRACE : EXIT_DEAD; + state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* @@ -1140,18 +1140,15 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (!retval) retval = pid; - if (traced) { + if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); - /* - * If this is not a sub-thread, notify the parent. - * If parent wants a zombie, don't release it now. - */ - state = EXIT_DEAD; - if (thread_group_leader(p) && - !do_notify_parent(p, p->exit_signal)) - state = EXIT_ZOMBIE; + + /* If parent wants a zombie, don't release it now */ + state = EXIT_ZOMBIE; + if (do_notify_parent(p, p->exit_signal)) + state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } -- cgit v1.2.3 From b3ab03160dfaf8ab78d476b670de319f4c1a5685 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:45 -0700 Subject: wait: completely ignore the EXIT_DEAD tasks Now that EXIT_DEAD is the terminal state it doesn't make sense to call eligible_child() or security_task_wait() if the task is really dead. 
Signed-off-by: Oleg Nesterov Tested-by: Michal Schmidt Cc: Jan Kratochvil Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 4773ed990907..33cf8dba0a61 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1329,7 +1329,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { - int ret = eligible_child(wo, p); + int ret; + + if (unlikely(p->exit_state == EXIT_DEAD)) + return 0; + + ret = eligible_child(wo, p); if (!ret) return ret; @@ -1347,10 +1352,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) - return 0; - if (unlikely(p->exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case -- cgit v1.2.3 From 377d75dafa07ee0da64223c9169f4e17b26c2b9a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:47 -0700 Subject: wait: WSTOPPED|WCONTINUED hangs if a zombie child is traced by real_parent "A zombie is only visible to its ptracer" logic in wait_consider_task() is very wrong. Trivial test-case: #include <unistd.h> #include <sys/ptrace.h> #include <sys/wait.h> #include <assert.h> int main(void) { int child = fork(); if (!child) { assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0); return 0x23; } assert(waitid(P_ALL, child, NULL, WEXITED | WNOWAIT) == 0); assert(waitid(P_ALL, 0, NULL, WSTOPPED) == -1); return 0; } it hangs in waitpid(WSTOPPED) despite the fact it has a single zombie child. This is because wait_consider_task(ptrace => 0) sees p->ptrace and clears ->notask_error assuming that the debugger should detach and notify us. 
Change wait_consider_task(ptrace => 0) to pretend that ptrace == T if the child is traced by us. This really simplifies the logic and allows us to do more fixes, see the next changes. This also hides the unwanted group stop state automatically, we can remove another ptrace_reparented() check. Unfortunately, this adds the following behavioural changes: 1. Before this patch wait(WEXITED | __WNOTHREAD) does not reap a natural child if it is traced by the caller's sub-thread. Hopefully nobody will ever notice this change, and I think that nobody should rely on this behaviour anyway. 2. SIGNAL_STOP_CONTINUED is no longer hidden from debugger if it is real parent. While this change comes as a side effect, I think it is good by itself. The group continued state can not be consumed by another process in this case, it doesn't depend on ptrace, it doesn't make sense to hide it from real parent. Perhaps we should add the thread_group_leader() check before wait_task_continued()? May be, but this shouldn't depend on ptrace_reparented(). Signed-off-by: Oleg Nesterov Cc: Al Viro Cc: Jan Kratochvil Cc: Lennart Poettering Cc: Michal Schmidt Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 33cf8dba0a61..92d38d4da4b1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1362,6 +1362,22 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } + if (likely(!ptrace) && unlikely(p->ptrace)) { + /* + * If it is traced by its real parent's group, just pretend + * the caller is ptrace_do_wait() and reap this child if it + * is zombie. + * + * This also hides group stop state from real parent; otherwise + * a single stop can be reported twice as group and ptrace stop. 
+ * If a ptracer wants to distinguish these two events for its + * own children it should create a separate process which takes + * the role of real parent. + */ + if (!ptrace_reparented(p)) + ptrace = 1; + } + /* slay zombie? */ if (p->exit_state == EXIT_ZOMBIE) { /* @@ -1402,19 +1418,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { - /* - * If @p is ptraced by a task in its real parent's group, - * hide group stop/continued state when looking at @p as - * the real parent; otherwise, a single stop can be - * reported twice as group and ptrace stops. - * - * If a ptracer wants to distinguish the two events for its - * own children, it should create a separate process which - * takes the role of real parent. - */ - if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) - return 0; - /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. -- cgit v1.2.3 From 7c733eb3eac0e3d091aaf37c183d2175eeebfb2b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:49 -0700 Subject: wait: WSTOPPED|WCONTINUED doesn't work if a zombie leader is traced by another process Even if the main thread is dead the process still can stop/continue. However, if the leader is ptraced wait_consider_task(ptrace => false) always skips wait_task_stopped/wait_task_continued, so WSTOPPED or WCONTINUED can never work for the natural parent in this case. Move the "A zombie ptracee is only visible to its ptracer" check into the "if (!delay_group_leader(p))" block. ->notask_error is cleared by the "fall through" code below. This depends on the previous change, wait_task_stopped/continued must be avoided if !delay_group_leader() and the tracer is ->real_parent. Otherwise WSTOPPED|WEXITED could wrongly report "stopped" when the child is already dead (single-threaded or not). 
If it is traced by another task then the "stopped" state is fine until the debugger detaches and reveals a zombie state. Stupid test-case: void *tfunc(void *arg) { sleep(1); // wait for zombie leader raise(SIGSTOP); exit(0x13); return NULL; } int run_child(void) { pthread_t thread; if (!fork()) { int tracee = getppid(); assert(ptrace(PTRACE_ATTACH, tracee, 0,0) == 0); do ptrace(PTRACE_CONT, tracee, 0,0); while (wait(NULL) > 0); return 0; } sleep(1); // wait for PTRACE_ATTACH assert(pthread_create(&thread, NULL, tfunc, NULL) == 0); pthread_exit(NULL); } int main(void) { int child, stat; child = fork(); if (!child) return run_child(); assert(child == waitpid(-1, &stat, WSTOPPED)); assert(stat == 0x137f); kill(child, SIGCONT); assert(child == waitpid(-1, &stat, WCONTINUED)); assert(stat == 0xffff); assert(child == waitpid(-1, &stat, 0)); assert(stat == 0x1300); return 0; } Without this patch it hangs in waitpid(WSTOPPED), wait_task_stopped() is never called. Note: this doesn't fix all problems with a zombie delay_group_leader(), WCONTINUED | WEXITED check is not exactly right. debugger can't assume it will be notified if another thread reaps the whole thread group. Signed-off-by: Oleg Nesterov Cc: Al Viro Cc: Jan Kratochvil Cc: Lennart Poettering Cc: Michal Schmidt Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 92d38d4da4b1..6ed6a1d552b5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1380,20 +1380,16 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, /* slay zombie? */ if (p->exit_state == EXIT_ZOMBIE) { - /* - * A zombie ptracee is only visible to its ptracer. - * Notification and reaping will be cascaded to the real - * parent when the ptracer detaches. 
- */ - if (likely(!ptrace) && unlikely(p->ptrace)) { - /* it will become visible, clear notask_error */ - wo->notask_error = 0; - return 0; - } - /* we don't reap group leaders with subthreads */ - if (!delay_group_leader(p)) - return wait_task_zombie(wo, p); + if (!delay_group_leader(p)) { + /* + * A zombie ptracee is only visible to its ptracer. + * Notification and reaping will be cascaded to the + * real parent when the ptracer detaches. + */ + if (unlikely(ptrace) || likely(!p->ptrace)) + return wait_task_zombie(wo, p); + } /* * Allow access to stopped/continued state via zombie by -- cgit v1.2.3 From 80df28476505ed4e6701c3448c63c9229a50c655 Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Mon, 7 Apr 2014 15:38:57 -0700 Subject: hung_task: check the value of "sysctl_hung_task_timeout_sec" As sysctl_hung_task_timeout_sec is unsigned long, when this value is larger than LONG_MAX/HZ, the function schedule_timeout_interruptible in watchdog will return immediately without sleep and with print : schedule_timeout: wrong timeout value ffffffffffffff83 and then the function watchdog will call schedule_timeout_interruptible again and again. The screen will be filled with "schedule_timeout: wrong timeout value ffffffffffffff83" This patch does some check and correction in sysctl, to let the function schedule_timeout_interruptible always get the valid parameter. 
Signed-off-by: Liu Hua Tested-by: Satoru Takeuchi Cc: [3.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5c14b547882e..74f5b580fe34 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -141,6 +141,11 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; +/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +#ifdef CONFIG_DETECT_HUNG_TASK +static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); +#endif + #ifdef CONFIG_INOTIFY_USER #include #endif @@ -985,6 +990,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, }, { .procname = "hung_task_warnings", -- cgit v1.2.3 From d7c0847fe3682a026ee6d147c5b6b8ab457fffc8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:39:03 -0700 Subject: kernel/panic.c: display reason at end + pr_emerg Currently, booting without initrd specified on 80x25 screen gives a call trace followed by atkbd : Spurious ACK. Original message ("VFS: Unable to mount root fs") is not available. Of course this could happen in other situations... This patch displays panic reason after call trace which could help lot of people even if it's not the very last line on screen. 
Also, convert all panic.c printk(KERN_EMERG to pr_emerg( [akpm@linux-foundation.org: missed a couple of pr_ conversions] Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 79fd820bb5e8..d02fa9fef46a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -100,7 +100,7 @@ void panic(const char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); + pr_emerg("Kernel panic - not syncing: %s\n", buf); #ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing @@ -141,7 +141,7 @@ void panic(const char *fmt, ...) * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked. */ - printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); + pr_emerg("Rebooting in %d seconds..", panic_timeout); for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); @@ -165,7 +165,7 @@ void panic(const char *fmt, ...) extern int stop_a_enabled; /* Make sure the user can actually press Stop-A (L1-A) */ stop_a_enabled = 1; - printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); + pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n"); } #endif #if defined(CONFIG_S390) @@ -176,6 +176,7 @@ void panic(const char *fmt, ...) 
disabled_wait(caller); } #endif + pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); @@ -276,8 +277,7 @@ unsigned long get_taint(void) void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) { if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) - printk(KERN_WARNING - "Disabling lock debugging due to kernel taint\n"); + pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } @@ -382,8 +382,7 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); - printk(KERN_WARNING "---[ end trace %016llx ]---\n", - (unsigned long long)oops_id); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } /* -- cgit v1.2.3 From 52f5684c8e1ec7463192aba8e2916df49807511a Mon Sep 17 00:00:00 2001 From: Gideon Israel Dsouza Date: Mon, 7 Apr 2014 15:39:20 -0700 Subject: kernel: use macros from compiler.h instead of __attribute__((...)) To increase compiler portability there is <linux/compiler.h> which provides convenience macros for various gcc constructs. Eg: __weak for __attribute__((weak)). I've replaced all instances of gcc attributes with the right macro in the kernel subsystem. Signed-off-by: Gideon Israel Dsouza Cc: "Rafael J. 
Wysocki" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- kernel/kallsyms.c | 11 ++++++----- kernel/kexec.c | 5 +++-- kernel/ksysfs.c | 5 +++-- kernel/power/power.h | 3 ++- kernel/power/snapshot.c | 3 ++- kernel/power/suspend.c | 5 +++-- kernel/power/swap.c | 2 +- kernel/sched/clock.c | 3 ++- kernel/sched/core.c | 3 ++- kernel/signal.c | 4 +++- kernel/time/timekeeping.c | 5 +++-- kernel/trace/trace.h | 3 ++- 13 files changed, 34 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e905e9c6b224..54a8d26f612f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -286,7 +287,7 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC]; } -int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, +int __weak arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3127ad52cdb2..cb0cf37dac3a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -36,8 +37,8 @@ * These will be re-linked against their real values * during the second link stage. 
*/ -extern const unsigned long kallsyms_addresses[] __attribute__((weak)); -extern const u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[] __weak; +extern const u8 kallsyms_names[] __weak; /* * Tell the compiler that the count isn't in the small data section if the arch @@ -46,10 +47,10 @@ extern const u8 kallsyms_names[] __attribute__((weak)); extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); -extern const u8 kallsyms_token_table[] __attribute__((weak)); -extern const u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[] __weak; +extern const u16 kallsyms_token_index[] __weak; -extern const unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[] __weak; static inline int is_kernel_inittext(unsigned long addr) { diff --git a/kernel/kexec.c b/kernel/kexec.c index c0d261c7db7b..c8380ad203bc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -1551,10 +1552,10 @@ void vmcoreinfo_append_str(const char *fmt, ...) * provide an empty default implementation here -- architecture * code may override this */ -void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) +void __weak arch_crash_save_vmcoreinfo(void) {} -unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) +unsigned long __weak paddr_vmcoreinfo_note(void) { return __pa((unsigned long)(char *)&vmcoreinfo_note); } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e660964086e2..2495a9b14ac8 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* rcu_expedited */ @@ -162,8 +163,8 @@ KERNEL_ATTR_RW(rcu_expedited); /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
*/ -extern const void __start_notes __attribute__((weak)); -extern const void __stop_notes __attribute__((weak)); +extern const void __start_notes __weak; +extern const void __stop_notes __weak; #define notes_size (&__stop_notes - &__start_notes) static ssize_t notes_read(struct file *filp, struct kobject *kobj, diff --git a/kernel/power/power.h b/kernel/power/power.h index 1ca753106557..15f37ea08719 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -2,6 +2,7 @@ #include #include #include +#include struct swsusp_info { struct new_utsname uts; @@ -11,7 +12,7 @@ struct swsusp_info { unsigned long image_pages; unsigned long pages; unsigned long size; -} __attribute__((aligned(PAGE_SIZE))); +} __aligned(PAGE_SIZE); #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 149e745eaa52..18fb7a2fb14b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -155,7 +156,7 @@ static inline void free_image_page(void *addr, int clear_nosave_free) struct linked_page { struct linked_page *next; char data[LINKED_PAGE_DATA_SIZE]; -} __attribute__((packed)); +} __packed; static inline void free_list_of_pages(struct linked_page *list, int clear_page_nosave) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 90b3d9366d1a..c3ad9cafe930 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "power.h" @@ -156,13 +157,13 @@ static int suspend_prepare(suspend_state_t state) } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) +void __weak arch_suspend_disable_irqs(void) { local_irq_disable(); } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) +void __weak arch_suspend_enable_irqs(void) { local_irq_enable(); } diff --git a/kernel/power/swap.c 
b/kernel/power/swap.c index 7c33ed200410..8c9a4819f798 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -101,7 +101,7 @@ struct swsusp_header { unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; -} __attribute__((packed)); +} __packed; static struct swsusp_header *swsusp_header; diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index b30a2924ef14..3ef6451e972e 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -60,13 +60,14 @@ #include #include #include +#include /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __attribute__((weak)) sched_clock(void) +unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1d1b87b36778..80bd491b718c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -6498,7 +6499,7 @@ static cpumask_var_t fallback_doms; * cpu core maps. It is supposed to return 1 if the topology changed * or 0 if it stayed the same. 
*/ -int __attribute__((weak)) arch_update_cpu_topology(void) +int __weak arch_update_cpu_topology(void) { return 0; } diff --git a/kernel/signal.c b/kernel/signal.c index 5d4b05a229a6..6ea13c09ae56 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -33,6 +33,8 @@ #include #include #include +#include + #define CREATE_TRACE_POINTS #include @@ -3618,7 +3620,7 @@ SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) } #endif -__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +__weak const char *arch_vma_name(struct vm_area_struct *vma) { return NULL; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5b40279ecd71..f7df8ea21707 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "tick-internal.h" #include "ntp_internal.h" @@ -760,7 +761,7 @@ u64 timekeeping_max_deferment(void) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_persistent_clock(struct timespec *ts) +void __weak read_persistent_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; @@ -775,7 +776,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts) * * XXX - Do be sure to remove it once all arches implement it. 
*/ -void __attribute__((weak)) read_boot_clock(struct timespec *ts) +void __weak read_boot_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ffc314b7e92b..2e29d7ba5a52 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ @@ -1279,7 +1280,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ - __attribute__((__aligned__(4))) event_##call; + __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ -- cgit v1.2.3 From 08f141d3dbddacb70aba1541bc5f950e466591e9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 7 Apr 2014 15:39:39 -0700 Subject: modules: use raw_cpu_write for initialization of per cpu refcount. The initialization of a structure is not subject to synchronization. The use of __this_cpu would trigger a false positive with the additional preemption checks for __this_cpu ops. So simply disable the check through the use of raw_cpu ops. 
Trace: __this_cpu_write operation in preemptible [00000000] code: modprobe/286 caller is __this_cpu_preempt_check+0x38/0x60 CPU: 3 PID: 286 Comm: modprobe Tainted: GF 3.12.0-rc4+ #187 Call Trace: dump_stack+0x4e/0x82 check_preemption_disabled+0xec/0x110 __this_cpu_preempt_check+0x38/0x60 load_module+0xcfd/0x2650 SyS_init_module+0xa6/0xd0 tracesys+0xe1/0xe6 Signed-off-by: Christoph Lameter Acked-by: Ingo Molnar Acked-by: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 29f7790eaa14..11869408f79b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -640,7 +640,7 @@ static int module_unload_init(struct module *mod) INIT_LIST_HEAD(&mod->target_list); /* Hold reference count during initialization. */ - __this_cpu_write(mod->refptr->incs, 1); + raw_cpu_write(mod->refptr->incs, 1); return 0; } -- cgit v1.2.3 From 64b47e8fdb40a0d46e8cf458dd3e24f8afa073f6 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:45 -0700 Subject: lglock: map to spinlock when !CONFIG_SMP When the system has only one CPU, lglock is effectively a spinlock; map it directly to spinlock to eliminate the indirection and duplicate code. In addition to removing overhead, this drops 1.6k of code with a defconfig modified to have !CONFIG_SMP, and 1.1k with a minimal config. Signed-off-by: Josh Triplett Cc: Rusty Russell Cc: Michal Marek Cc: Thomas Gleixner Cc: David Howells Cc: "H. 
Peter Anvin" Cc: Nick Piggin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/locking/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 306a76b51e0f..b8bdcd4785b7 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o +obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -- cgit v1.2.3 From de7b2973903c6cc50b31ee5682a69b2219b9919d Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Apr 2014 17:26:21 -0400 Subject: tracepoint: Use struct pointer instead of name hash for reg/unreg tracepoints Register/unregister tracepoint probes with struct tracepoint pointer rather than tracepoint name. This change, which vastly simplifies tracepoint.c, has been proposed by Steven Rostedt. It also removes 8.8kB (mostly of text) to the vmlinux size. From this point on, the tracers need to pass a struct tracepoint pointer to probe register/unregister. A probe can now only be connected to a tracepoint that exists. Moreover, tracers are responsible for unregistering the probe before the module containing its associated tracepoint is unloaded. text data bss dec hex filename 10443444 4282528 10391552 25117524 17f4354 vmlinux.orig 10434930 4282848 10391552 25109330 17f2352 vmlinux Link: http://lkml.kernel.org/r/1396992381-23785-2-git-send-email-mathieu.desnoyers@efficios.com CC: Ingo Molnar CC: Frederic Weisbecker CC: Andrew Morton CC: Frank Ch. 
Eigler CC: Johannes Berg Signed-off-by: Mathieu Desnoyers [ SDR - fixed return val in void func in tracepoint_module_going() ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 55 ++-- kernel/trace/trace_events_trigger.c | 2 +- kernel/trace/trace_kprobe.c | 21 +- kernel/trace/trace_output.c | 2 +- kernel/trace/trace_uprobe.c | 20 +- kernel/tracepoint.c | 511 ++++++++++++++++-------------------- 6 files changed, 280 insertions(+), 331 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 83a4378dc5e0..3ddfd8f62c05 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -223,24 +223,25 @@ int ftrace_event_reg(struct ftrace_event_call *call, { struct ftrace_event_file *file = data; + WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT)); switch (type) { case TRACE_REG_REGISTER: - return tracepoint_probe_register(call->name, + return tracepoint_probe_register(call->tp, call->class->probe, file); case TRACE_REG_UNREGISTER: - tracepoint_probe_unregister(call->name, + tracepoint_probe_unregister(call->tp, call->class->probe, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return tracepoint_probe_register(call->name, + return tracepoint_probe_register(call->tp, call->class->perf_probe, call); case TRACE_REG_PERF_UNREGISTER: - tracepoint_probe_unregister(call->name, + tracepoint_probe_unregister(call->tp, call->class->perf_probe, call); return 0; @@ -352,7 +353,7 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " - "%s\n", call->name); + "%s\n", ftrace_event_name(call)); break; } set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); @@ -481,27 +482,29 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, { struct ftrace_event_file *file; struct ftrace_event_call *call; + const char *name; int ret = -EINVAL; 
list_for_each_entry(file, &tr->events, list) { call = file->event_call; + name = ftrace_event_name(call); - if (!call->name || !call->class || !call->class->reg) + if (!name || !call->class || !call->class->reg) continue; if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) continue; if (match && - strcmp(match, call->name) != 0 && + strcmp(match, name) != 0 && strcmp(match, call->class->system) != 0) continue; if (sub && strcmp(sub, call->class->system) != 0) continue; - if (event && strcmp(event, call->name) != 0) + if (event && strcmp(event, name) != 0) continue; ftrace_event_enable_disable(file, set); @@ -699,7 +702,7 @@ static int t_show(struct seq_file *m, void *v) if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); - seq_printf(m, "%s\n", call->name); + seq_printf(m, "%s\n", ftrace_event_name(call)); return 0; } @@ -792,7 +795,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; - if (!call->name || !call->class || !call->class->reg) + if (!ftrace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system->name) != 0) @@ -907,7 +910,7 @@ static int f_show(struct seq_file *m, void *v) switch ((unsigned long)v) { case FORMAT_HEADER: - seq_printf(m, "name: %s\n", call->name); + seq_printf(m, "name: %s\n", ftrace_event_name(call)); seq_printf(m, "ID: %d\n", call->event.type); seq_printf(m, "format:\n"); return 0; @@ -1527,6 +1530,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) struct trace_array *tr = file->tr; struct list_head *head; struct dentry *d_events; + const char *name; int ret; /* @@ -1540,10 +1544,11 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) } else d_events = parent; - file->dir = debugfs_create_dir(call->name, d_events); + name = ftrace_event_name(call); + file->dir = 
debugfs_create_dir(name, d_events); if (!file->dir) { pr_warning("Could not create debugfs '%s' directory\n", - call->name); + name); return -1; } @@ -1567,7 +1572,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) ret = call->class->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" - " events/%s\n", call->name); + " events/%s\n", name); return -1; } } @@ -1631,15 +1636,17 @@ static void event_remove(struct ftrace_event_call *call) static int event_init(struct ftrace_event_call *call) { int ret = 0; + const char *name; - if (WARN_ON(!call->name)) + name = ftrace_event_name(call); + if (WARN_ON(!name)) return -EINVAL; if (call->class->raw_init) { ret = call->class->raw_init(call); if (ret < 0 && ret != -ENOSYS) pr_warn("Could not initialize trace events/%s\n", - call->name); + name); } return ret; @@ -1885,7 +1892,7 @@ __trace_add_event_dirs(struct trace_array *tr) ret = __trace_add_new_event(call, tr); if (ret < 0) pr_warning("Could not create directory for event %s\n", - call->name); + ftrace_event_name(call)); } } @@ -1894,18 +1901,20 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) { struct ftrace_event_file *file; struct ftrace_event_call *call; + const char *name; list_for_each_entry(file, &tr->events, list) { call = file->event_call; + name = ftrace_event_name(call); - if (!call->name || !call->class || !call->class->reg) + if (!name || !call->class || !call->class->reg) continue; if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) continue; - if (strcmp(event, call->name) == 0 && + if (strcmp(event, name) == 0 && strcmp(system, call->class->system) == 0) return file; } @@ -1973,7 +1982,7 @@ event_enable_print(struct seq_file *m, unsigned long ip, seq_printf(m, "%s:%s:%s", data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR, data->file->event_call->class->system, - data->file->event_call->name); + ftrace_event_name(data->file->event_call)); if (data->count == -1) seq_printf(m, ":unlimited\n"); @@ -2193,7 +2202,7 @@ __trace_early_add_event_dirs(struct trace_array *tr) ret = event_create_dir(tr->event_dir, file); if (ret < 0) pr_warning("Could not create directory for event %s\n", - file->event_call->name); + ftrace_event_name(file->event_call)); } } @@ -2217,7 +2226,7 @@ __trace_early_add_events(struct trace_array *tr) ret = __trace_early_add_new_event(call, tr); if (ret < 0) pr_warning("Could not create early event %s\n", - call->name); + ftrace_event_name(call)); } } @@ -2549,7 +2558,7 @@ static __init void event_trace_self_tests(void) continue; #endif - pr_info("Testing event %s: ", call->name); + pr_info("Testing event %s: ", ftrace_event_name(call)); /* * If an event is already enabled, someone is using diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 8efbb69b04f0..925f537f07d1 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1095,7 +1095,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, seq_printf(m, "%s:%s:%s", enable_data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR, enable_data->file->event_call->class->system, - enable_data->file->event_call->name); + ftrace_event_name(enable_data->file->event_call)); if (data->count == -1) seq_puts(m, ":unlimited"); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d021d21dd150..903ae28962be 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -341,7 +341,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, struct trace_kprobe *tk; list_for_each_entry(tk, &probe_list, list) - if (strcmp(tk->tp.call.name, event) == 0 && + if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 && strcmp(tk->tp.call.class->system, group) == 0) return tk; return NULL; @@ -516,7 +516,8 @@ static int register_trace_kprobe(struct trace_kprobe *tk) mutex_lock(&probe_lock); /* Delete old (same name) event if exist */ - old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system); + old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call), + tk->tp.call.class->system); if (old_tk) { ret = unregister_trace_kprobe(old_tk); if (ret < 0) @@ -564,7 +565,8 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, if (ret) pr_warning("Failed to re-register probe %s on" "%s: %d\n", - tk->tp.call.name, mod->name, ret); + ftrace_event_name(&tk->tp.call), + mod->name, ret); } } mutex_unlock(&probe_lock); @@ -818,7 +820,8 @@ static int probes_seq_show(struct seq_file *m, void *v) int i; seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 
'r' : 'p'); - seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name); + seq_printf(m, ":%s/%s", tk->tp.call.class->system, + ftrace_event_name(&tk->tp.call)); if (!tk->symbol) seq_printf(m, " 0x%p", tk->rp.kp.addr); @@ -876,7 +879,8 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_kprobe *tk = v; - seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit, + seq_printf(m, " %-44s %15lu %15lu\n", + ftrace_event_name(&tk->tp.call), tk->nhit, tk->rp.kp.nmissed); return 0; @@ -1011,7 +1015,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", tp->call.name)) + if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) goto partial; if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) @@ -1047,7 +1051,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", tp->call.name)) + if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) goto partial; if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) @@ -1286,7 +1290,8 @@ static int register_kprobe_event(struct trace_kprobe *tk) call->data = tk; ret = trace_add_event_call(call); if (ret) { - pr_info("Failed to register kprobe event: %s\n", call->name); + pr_info("Failed to register kprobe event: %s\n", + ftrace_event_name(call)); kfree(call->print_fmt); unregister_ftrace_event(&call->event); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ca0e79e2abaa..a436de18aa99 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -431,7 +431,7 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, } trace_seq_init(p); - ret = trace_seq_printf(s, "%s: ", event->name); 
+ ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e4473367e7a4..930e51462dc8 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -294,7 +294,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou struct trace_uprobe *tu; list_for_each_entry(tu, &uprobe_list, list) - if (strcmp(tu->tp.call.name, event) == 0 && + if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 && strcmp(tu->tp.call.class->system, group) == 0) return tu; @@ -324,7 +324,8 @@ static int register_trace_uprobe(struct trace_uprobe *tu) mutex_lock(&uprobe_lock); /* register as an event */ - old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system); + old_tu = find_probe_event(ftrace_event_name(&tu->tp.call), + tu->tp.call.class->system); if (old_tu) { /* delete old event */ ret = unregister_trace_uprobe(old_tu); @@ -599,7 +600,8 @@ static int probes_seq_show(struct seq_file *m, void *v) char c = is_ret_probe(tu) ? 
'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name); + seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, + ftrace_event_name(&tu->tp.call)); seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); for (i = 0; i < tu->tp.nr_args; i++) @@ -649,7 +651,8 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_uprobe *tu = v; - seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit); + seq_printf(m, " %s %-44s %15lu\n", tu->filename, + ftrace_event_name(&tu->tp.call), tu->nhit); return 0; } @@ -844,12 +847,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e tu = container_of(event, struct trace_uprobe, tp.call.event); if (is_ret_probe(tu)) { - if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name, + if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", + ftrace_event_name(&tu->tp.call), entry->vaddr[1], entry->vaddr[0])) goto partial; data = DATAOF_TRACE_ENTRY(entry, true); } else { - if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name, + if (!trace_seq_printf(s, "%s: (0x%lx)", + ftrace_event_name(&tu->tp.call), entry->vaddr[0])) goto partial; data = DATAOF_TRACE_ENTRY(entry, false); @@ -1275,7 +1280,8 @@ static int register_uprobe_event(struct trace_uprobe *tu) ret = trace_add_event_call(call); if (ret) { - pr_info("Failed to register uprobe event: %s\n", call->name); + pr_info("Failed to register uprobe event: %s\n", + ftrace_event_name(call)); kfree(call->print_fmt); unregister_ftrace_event(&call->event); } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 50f8329c2042..01b3bd84daa1 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 Mathieu Desnoyers + * Copyright (C) 2008-2014 Mathieu Desnoyers * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -33,39 +33,27 @@ extern 
struct tracepoint * const __stop___tracepoints_ptrs[]; /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; +#ifdef CONFIG_MODULES /* - * Tracepoints mutex protects the builtin and module tracepoints and the hash - * table, as well as the local module list. + * Tracepoint module list mutex protects the local module list. */ -static DEFINE_MUTEX(tracepoints_mutex); +static DEFINE_MUTEX(tracepoint_module_list_mutex); -#ifdef CONFIG_MODULES -/* Local list of struct module */ +/* Local list of struct tp_module */ static LIST_HEAD(tracepoint_module_list); #endif /* CONFIG_MODULES */ /* - * Tracepoint hash table, containing the active tracepoints. - * Protected by tracepoints_mutex. + * tracepoints_mutex protects the builtin and module tracepoints. + * tracepoints_mutex nests inside tracepoint_module_list_mutex. */ -#define TRACEPOINT_HASH_BITS 6 -#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) -static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; +static DEFINE_MUTEX(tracepoints_mutex); /* * Note about RCU : * It is used to delay the free of multiple probes array until a quiescent * state is reached. - * Tracepoint entries modifications are protected by the tracepoints_mutex. */ -struct tracepoint_entry { - struct hlist_node hlist; - struct tracepoint_func *funcs; - int refcount; /* Number of times armed. 0 if disarmed. 
*/ - int enabled; /* Tracepoint enabled */ - char name[0]; -}; - struct tp_probes { struct rcu_head rcu; struct tracepoint_func probes[0]; @@ -92,34 +80,33 @@ static inline void release_probes(struct tracepoint_func *old) } } -static void debug_print_probes(struct tracepoint_entry *entry) +static void debug_print_probes(struct tracepoint_func *funcs) { int i; - if (!tracepoint_debug || !entry->funcs) + if (!tracepoint_debug || !funcs) return; - for (i = 0; entry->funcs[i].func; i++) - printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func); + for (i = 0; funcs[i].func; i++) + printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func); } -static struct tracepoint_func * -tracepoint_entry_add_probe(struct tracepoint_entry *entry, - void *probe, void *data) +static struct tracepoint_func *func_add(struct tracepoint_func **funcs, + struct tracepoint_func *tp_func) { int nr_probes = 0; struct tracepoint_func *old, *new; - if (WARN_ON(!probe)) + if (WARN_ON(!tp_func->func)) return ERR_PTR(-EINVAL); - debug_print_probes(entry); - old = entry->funcs; + debug_print_probes(*funcs); + old = *funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ for (nr_probes = 0; old[nr_probes].func; nr_probes++) - if (old[nr_probes].func == probe && - old[nr_probes].data == data) + if (old[nr_probes].func == tp_func->func && + old[nr_probes].data == tp_func->data) return ERR_PTR(-EEXIST); } /* + 2 : one for new probe, one for NULL func */ @@ -128,33 +115,30 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, return ERR_PTR(-ENOMEM); if (old) memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); - new[nr_probes].func = probe; - new[nr_probes].data = data; + new[nr_probes] = *tp_func; new[nr_probes + 1].func = NULL; - entry->refcount = nr_probes + 1; - entry->funcs = new; - debug_print_probes(entry); + *funcs = new; + debug_print_probes(*funcs); return old; } -static void * -tracepoint_entry_remove_probe(struct tracepoint_entry *entry, - void *probe, void *data) +static 
void *func_remove(struct tracepoint_func **funcs, + struct tracepoint_func *tp_func) { int nr_probes = 0, nr_del = 0, i; struct tracepoint_func *old, *new; - old = entry->funcs; + old = *funcs; if (!old) return ERR_PTR(-ENOENT); - debug_print_probes(entry); + debug_print_probes(*funcs); /* (N -> M), (N > 1, M >= 0) probes */ - if (probe) { + if (tp_func->func) { for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if (old[nr_probes].func == probe && - old[nr_probes].data == data) + if (old[nr_probes].func == tp_func->func && + old[nr_probes].data == tp_func->data) nr_del++; } } @@ -165,9 +149,8 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, */ if (nr_probes - nr_del == 0) { /* N -> 0, (N > 1) */ - entry->funcs = NULL; - entry->refcount = 0; - debug_print_probes(entry); + *funcs = NULL; + debug_print_probes(*funcs); return old; } else { int j = 0; @@ -177,91 +160,34 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, if (new == NULL) return ERR_PTR(-ENOMEM); for (i = 0; old[i].func; i++) - if (old[i].func != probe || old[i].data != data) + if (old[i].func != tp_func->func + || old[i].data != tp_func->data) new[j++] = old[i]; new[nr_probes - nr_del].func = NULL; - entry->refcount = nr_probes - nr_del; - entry->funcs = new; + *funcs = new; } - debug_print_probes(entry); + debug_print_probes(*funcs); return old; } /* - * Get tracepoint if the tracepoint is present in the tracepoint hash table. - * Must be called with tracepoints_mutex held. - * Returns NULL if not present. + * Add the probe function to a tracepoint. 
*/ -static struct tracepoint_entry *get_tracepoint(const char *name) +static int tracepoint_add_func(struct tracepoint *tp, + struct tracepoint_func *func) { - struct hlist_head *head; - struct tracepoint_entry *e; - u32 hash = jhash(name, strlen(name), 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, head, hlist) { - if (!strcmp(name, e->name)) - return e; - } - return NULL; -} + struct tracepoint_func *old, *tp_funcs; -/* - * Add the tracepoint to the tracepoint hash table. Must be called with - * tracepoints_mutex held. - */ -static struct tracepoint_entry *add_tracepoint(const char *name) -{ - struct hlist_head *head; - struct tracepoint_entry *e; - size_t name_len = strlen(name) + 1; - u32 hash = jhash(name, name_len-1, 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, head, hlist) { - if (!strcmp(name, e->name)) { - printk(KERN_NOTICE - "tracepoint %s busy\n", name); - return ERR_PTR(-EEXIST); /* Already there */ - } - } - /* - * Using kmalloc here to allocate a variable length element. Could - * cause some memory fragmentation if overused. - */ - e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - memcpy(&e->name[0], name, name_len); - e->funcs = NULL; - e->refcount = 0; - e->enabled = 0; - hlist_add_head(&e->hlist, head); - return e; -} + if (tp->regfunc && !static_key_enabled(&tp->key)) + tp->regfunc(); -/* - * Remove the tracepoint from the tracepoint hash table. Must be called with - * mutex_lock held. - */ -static inline void remove_tracepoint(struct tracepoint_entry *e) -{ - hlist_del(&e->hlist); - kfree(e); -} - -/* - * Sets the probe callback corresponding to one tracepoint. 
- */ -static void set_tracepoint(struct tracepoint_entry **entry, - struct tracepoint *elem, int active) -{ - WARN_ON(strcmp((*entry)->name, elem->name) != 0); - - if (elem->regfunc && !static_key_enabled(&elem->key) && active) - elem->regfunc(); - else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) - elem->unregfunc(); + tp_funcs = tp->funcs; + old = func_add(&tp_funcs, func); + if (IS_ERR(old)) { + WARN_ON_ONCE(1); + return PTR_ERR(old); + } + release_probes(old); /* * rcu_assign_pointer has a smp_wmb() which makes sure that the new @@ -270,199 +196,163 @@ static void set_tracepoint(struct tracepoint_entry **entry, * include/linux/tracepoints.h. A matching smp_read_barrier_depends() * is used. */ - rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (active && !static_key_enabled(&elem->key)) - static_key_slow_inc(&elem->key); - else if (!active && static_key_enabled(&elem->key)) - static_key_slow_dec(&elem->key); + rcu_assign_pointer(tp->funcs, tp_funcs); + if (!static_key_enabled(&tp->key)) + static_key_slow_inc(&tp->key); + return 0; } /* - * Disable a tracepoint and its probe callback. + * Remove a probe function from a tracepoint. * Note: only waiting an RCU period after setting elem->call to the empty * function insures that the original callback is not used anymore. This insured * by preempt_disable around the call site. */ -static void disable_tracepoint(struct tracepoint *elem) +static int tracepoint_remove_func(struct tracepoint *tp, + struct tracepoint_func *func) { - if (elem->unregfunc && static_key_enabled(&elem->key)) - elem->unregfunc(); - - if (static_key_enabled(&elem->key)) - static_key_slow_dec(&elem->key); - rcu_assign_pointer(elem->funcs, NULL); -} + struct tracepoint_func *old, *tp_funcs; -/** - * tracepoint_update_probe_range - Update a probe range - * @begin: beginning of the range - * @end: end of the range - * - * Updates the probe callback corresponding to a range of tracepoints. 
- * Called with tracepoints_mutex held. - */ -static void tracepoint_update_probe_range(struct tracepoint * const *begin, - struct tracepoint * const *end) -{ - struct tracepoint * const *iter; - struct tracepoint_entry *mark_entry; - - if (!begin) - return; - - for (iter = begin; iter < end; iter++) { - mark_entry = get_tracepoint((*iter)->name); - if (mark_entry) { - set_tracepoint(&mark_entry, *iter, - !!mark_entry->refcount); - mark_entry->enabled = !!mark_entry->refcount; - } else { - disable_tracepoint(*iter); - } + tp_funcs = tp->funcs; + old = func_remove(&tp_funcs, func); + if (IS_ERR(old)) { + WARN_ON_ONCE(1); + return PTR_ERR(old); } -} - -#ifdef CONFIG_MODULES -void module_update_tracepoints(void) -{ - struct tp_module *tp_mod; - - list_for_each_entry(tp_mod, &tracepoint_module_list, list) - tracepoint_update_probe_range(tp_mod->tracepoints_ptrs, - tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints); -} -#else /* CONFIG_MODULES */ -void module_update_tracepoints(void) -{ -} -#endif /* CONFIG_MODULES */ + release_probes(old); + if (!tp_funcs) { + /* Removed last function */ + if (tp->unregfunc && static_key_enabled(&tp->key)) + tp->unregfunc(); -/* - * Update probes, removing the faulty probes. - * Called with tracepoints_mutex held. - */ -static void tracepoint_update_probes(void) -{ - /* Core kernel tracepoints */ - tracepoint_update_probe_range(__start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - /* tracepoints in modules. 
*/ - module_update_tracepoints(); -} - -static struct tracepoint_func * -tracepoint_add_probe(const char *name, void *probe, void *data) -{ - struct tracepoint_entry *entry; - struct tracepoint_func *old; - - entry = get_tracepoint(name); - if (!entry) { - entry = add_tracepoint(name); - if (IS_ERR(entry)) - return (struct tracepoint_func *)entry; + if (static_key_enabled(&tp->key)) + static_key_slow_dec(&tp->key); } - old = tracepoint_entry_add_probe(entry, probe, data); - if (IS_ERR(old) && !entry->refcount) - remove_tracepoint(entry); - return old; + rcu_assign_pointer(tp->funcs, tp_funcs); + return 0; } /** * tracepoint_probe_register - Connect a probe to a tracepoint - * @name: tracepoint name + * @tp: tracepoint * @probe: probe handler - * @data: probe private data - * - * Returns: - * - 0 if the probe was successfully registered, and tracepoint - * callsites are currently loaded for that probe, - * - -ENODEV if the probe was successfully registered, but no tracepoint - * callsite is currently loaded for that probe, - * - other negative error value on error. - * - * When tracepoint_probe_register() returns either 0 or -ENODEV, - * parameters @name, @probe, and @data may be used by the tracepoint - * infrastructure until the probe is unregistered. * - * The probe address must at least be aligned on the architecture pointer size. + * Returns 0 if ok, error value on error. + * Note: if @tp is within a module, the caller is responsible for + * unregistering the probe before the module is gone. This can be + * performed either with a tracepoint module going notifier, or from + * within module exit functions. 
*/ -int tracepoint_probe_register(const char *name, void *probe, void *data) +int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) { - struct tracepoint_func *old; - struct tracepoint_entry *entry; - int ret = 0; + struct tracepoint_func tp_func; + int ret; mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_update_probes(); /* may update entry */ - entry = get_tracepoint(name); - /* Make sure the entry was enabled */ - if (!entry || !entry->enabled) - ret = -ENODEV; + tp_func.func = probe; + tp_func.data = data; + ret = tracepoint_add_func(tp, &tp_func); mutex_unlock(&tracepoints_mutex); - release_probes(old); return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register); -static struct tracepoint_func * -tracepoint_remove_probe(const char *name, void *probe, void *data) -{ - struct tracepoint_entry *entry; - struct tracepoint_func *old; - - entry = get_tracepoint(name); - if (!entry) - return ERR_PTR(-ENOENT); - old = tracepoint_entry_remove_probe(entry, probe, data); - if (IS_ERR(old)) - return old; - if (!entry->refcount) - remove_tracepoint(entry); - return old; -} - /** * tracepoint_probe_unregister - Disconnect a probe from a tracepoint - * @name: tracepoint name + * @tp: tracepoint * @probe: probe function pointer - * @data: probe private data * - * We do not need to call a synchronize_sched to make sure the probes have - * finished running before doing a module unload, because the module unload - * itself uses stop_machine(), which insures that every preempt disabled section - * have finished. + * Returns 0 if ok, error value on error. 
*/ -int tracepoint_probe_unregister(const char *name, void *probe, void *data) +int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) { - struct tracepoint_func *old; + struct tracepoint_func tp_func; + int ret; mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_update_probes(); /* may update entry */ + tp_func.func = probe; + tp_func.data = data; + ret = tracepoint_remove_func(tp, &tp_func); mutex_unlock(&tracepoints_mutex); - release_probes(old); - return 0; + return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); - #ifdef CONFIG_MODULES bool trace_module_has_bad_taint(struct module *mod) { return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); } +static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); + +/** + * register_tracepoint_notifier - register tracepoint coming/going notifier + * @nb: notifier block + * + * Notifiers registered with this function are called on module + * coming/going with the tracepoint_module_list_mutex held. + * The notifier block callback should expect a "struct tp_module" data + * pointer. + */ +int register_tracepoint_module_notifier(struct notifier_block *nb) +{ + struct tp_module *tp_mod; + int ret; + + mutex_lock(&tracepoint_module_list_mutex); + ret = blocking_notifier_chain_register(&tracepoint_notify_list, nb); + if (ret) + goto end; + list_for_each_entry(tp_mod, &tracepoint_module_list, list) + (void) nb->notifier_call(nb, MODULE_STATE_COMING, tp_mod); +end: + mutex_unlock(&tracepoint_module_list_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier); + +/** + * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier + * @nb: notifier block + * + * The notifier block callback should expect a "struct tp_module" data + * pointer. 
+ */ +int unregister_tracepoint_module_notifier(struct notifier_block *nb) +{ + struct tp_module *tp_mod; + int ret; + + mutex_lock(&tracepoint_module_list_mutex); + ret = blocking_notifier_chain_unregister(&tracepoint_notify_list, nb); + if (ret) + goto end; + list_for_each_entry(tp_mod, &tracepoint_module_list, list) + (void) nb->notifier_call(nb, MODULE_STATE_GOING, tp_mod); +end: + mutex_unlock(&tracepoint_module_list_mutex); + return ret; + +} +EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier); + +/* + * Ensure the tracer unregistered the module's probes before the module + * teardown is performed. Prevents leaks of probe and data pointers. + */ +static void tp_module_going_check_quiescent(struct tracepoint * const *begin, + struct tracepoint * const *end) +{ + struct tracepoint * const *iter; + + if (!begin) + return; + for (iter = begin; iter < end; iter++) + WARN_ON_ONCE((*iter)->funcs); +} + static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod; @@ -478,7 +368,7 @@ static int tracepoint_module_coming(struct module *mod) */ if (trace_module_has_bad_taint(mod)) return 0; - mutex_lock(&tracepoints_mutex); + mutex_lock(&tracepoint_module_list_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); if (!tp_mod) { ret = -ENOMEM; @@ -487,27 +377,33 @@ static int tracepoint_module_coming(struct module *mod) tp_mod->num_tracepoints = mod->num_tracepoints; tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; list_add_tail(&tp_mod->list, &tracepoint_module_list); - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); + blocking_notifier_call_chain(&tracepoint_notify_list, + MODULE_STATE_COMING, tp_mod); end: - mutex_unlock(&tracepoints_mutex); + mutex_unlock(&tracepoint_module_list_mutex); return ret; } -static int tracepoint_module_going(struct module *mod) +static void tracepoint_module_going(struct module *mod) { - struct tp_module *pos; + struct tp_module *tp_mod; if 
(!mod->num_tracepoints) - return 0; + return; - mutex_lock(&tracepoints_mutex); - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); - list_for_each_entry(pos, &tracepoint_module_list, list) { - if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) { - list_del(&pos->list); - kfree(pos); + mutex_lock(&tracepoint_module_list_mutex); + list_for_each_entry(tp_mod, &tracepoint_module_list, list) { + if (tp_mod->tracepoints_ptrs == mod->tracepoints_ptrs) { + blocking_notifier_call_chain(&tracepoint_notify_list, + MODULE_STATE_GOING, tp_mod); + list_del(&tp_mod->list); + kfree(tp_mod); + /* + * Called the going notifier before checking for + * quiescence. + */ + tp_module_going_check_quiescent(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); break; } } @@ -517,12 +413,11 @@ static int tracepoint_module_going(struct module *mod) * flag on "going", in case a module taints the kernel only after being * loaded. */ - mutex_unlock(&tracepoints_mutex); - return 0; + mutex_unlock(&tracepoint_module_list_mutex); } -int tracepoint_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int tracepoint_module_notify(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; int ret = 0; @@ -534,24 +429,58 @@ int tracepoint_module_notify(struct notifier_block *self, case MODULE_STATE_LIVE: break; case MODULE_STATE_GOING: - ret = tracepoint_module_going(mod); + tracepoint_module_going(mod); + break; + case MODULE_STATE_UNFORMED: break; } return ret; } -struct notifier_block tracepoint_module_nb = { +static struct notifier_block tracepoint_module_nb = { .notifier_call = tracepoint_module_notify, .priority = 0, }; -static int init_tracepoints(void) +static __init int init_tracepoints(void) { - return register_module_notifier(&tracepoint_module_nb); + int ret; + + ret = register_module_notifier(&tracepoint_module_nb); + if (ret) { + pr_warning("Failed to 
register tracepoint module enter notifier\n"); + } + return ret; } __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ +static void for_each_tracepoint_range(struct tracepoint * const *begin, + struct tracepoint * const *end, + void (*fct)(struct tracepoint *tp, void *priv), + void *priv) +{ + struct tracepoint * const *iter; + + if (!begin) + return; + for (iter = begin; iter < end; iter++) + fct(*iter, priv); +} + +/** + * for_each_kernel_tracepoint - iteration on all kernel tracepoints + * @fct: callback + * @priv: private data + */ +void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv), + void *priv) +{ + for_each_tracepoint_range(__start___tracepoints_ptrs, + __stop___tracepoints_ptrs, fct, priv); +} +EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint); + #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ -- cgit v1.2.3 From eb7d035c59431bb12e1aa6e69ddd3940352faddb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 8 Apr 2014 20:09:40 -0400 Subject: tracepoint: Simplify tracepoint module search Instead of copying the num_tracepoints and tracepoints_ptrs from the module structure to the tp_mod structure, which only uses it to find the module associated to tracepoints of modules that are coming and going, simply copy the pointer to the module struct to the tracepoint tp_module structure. Also removed un-needed brackets around an if statement. 
Link: http://lkml.kernel.org/r/20140408201705.4dad2c4a@gandalf.local.home Acked-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 01b3bd84daa1..162be198a247 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -374,8 +374,7 @@ static int tracepoint_module_coming(struct module *mod) ret = -ENOMEM; goto end; } - tp_mod->num_tracepoints = mod->num_tracepoints; - tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; + tp_mod->mod = mod; list_add_tail(&tp_mod->list, &tracepoint_module_list); blocking_notifier_call_chain(&tracepoint_notify_list, MODULE_STATE_COMING, tp_mod); @@ -393,7 +392,7 @@ static void tracepoint_module_going(struct module *mod) mutex_lock(&tracepoint_module_list_mutex); list_for_each_entry(tp_mod, &tracepoint_module_list, list) { - if (tp_mod->tracepoints_ptrs == mod->tracepoints_ptrs) { + if (tp_mod->mod == mod) { blocking_notifier_call_chain(&tracepoint_notify_list, MODULE_STATE_GOING, tp_mod); list_del(&tp_mod->list); @@ -447,9 +446,9 @@ static __init int init_tracepoints(void) int ret; ret = register_module_notifier(&tracepoint_module_nb); - if (ret) { + if (ret) pr_warning("Failed to register tracepoint module enter notifier\n"); - } + return ret; } __initcall(init_tracepoints); -- cgit v1.2.3 From b725dfea24b89de672c055b34b22398283a3f4bc Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 9 Apr 2014 09:24:43 -0400 Subject: tracepoint: Fix sparse warnings in tracepoint.c Fix the following sparse warnings: CHECK kernel/tracepoint.c kernel/tracepoint.c:184:18: warning: incorrect type in assignment (different address spaces) kernel/tracepoint.c:184:18: expected struct tracepoint_func *tp_funcs kernel/tracepoint.c:184:18: got struct tracepoint_func [noderef] *funcs kernel/tracepoint.c:216:18: warning: incorrect type in assignment (different address spaces) 
kernel/tracepoint.c:216:18: expected struct tracepoint_func *tp_funcs kernel/tracepoint.c:216:18: got struct tracepoint_func [noderef] *funcs kernel/tracepoint.c:392:24: error: return expression in void function CC kernel/tracepoint.o kernel/tracepoint.c: In function tracepoint_module_going: kernel/tracepoint.c:491:6: warning: symbol 'syscall_regfunc' was not declared. Should it be static? kernel/tracepoint.c:508:6: warning: symbol 'syscall_unregfunc' was not declared. Should it be static? Link: http://lkml.kernel.org/r/1397049883-28692-1-git-send-email-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 162be198a247..ca2cfe21bb8e 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -181,7 +181,8 @@ static int tracepoint_add_func(struct tracepoint *tp, if (tp->regfunc && !static_key_enabled(&tp->key)) tp->regfunc(); - tp_funcs = tp->funcs; + tp_funcs = rcu_dereference_protected(tp->funcs, + lockdep_is_held(&tracepoints_mutex)); old = func_add(&tp_funcs, func); if (IS_ERR(old)) { WARN_ON_ONCE(1); @@ -213,7 +214,8 @@ static int tracepoint_remove_func(struct tracepoint *tp, { struct tracepoint_func *old, *tp_funcs; - tp_funcs = tp->funcs; + tp_funcs = rcu_dereference_protected(tp->funcs, + lockdep_is_held(&tracepoints_mutex)); old = func_remove(&tp_funcs, func); if (IS_ERR(old)) { WARN_ON_ONCE(1); -- cgit v1.2.3 From 69cd9eba38867a493a043bb13eb9b33cad5f1a9a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 8 Apr 2014 15:30:07 -0700 Subject: futex: avoid race between requeue and wake Jan Stancek reported: "pthread_cond_broadcast/4-1.c testcase from openposix testsuite (LTP) occasionally fails, because some threads fail to wake up. Testcase creates 5 threads, which are all waiting on same condition. 
Main thread then calls pthread_cond_broadcast() without holding mutex, which calls: futex(uaddr1, FUTEX_CMP_REQUEUE_PRIVATE, 1, 2147483647, uaddr2, ..) This immediately wakes up single thread A, which unlocks mutex and tries to wake up another thread: futex(uaddr2, FUTEX_WAKE_PRIVATE, 1) If thread A manages to call futex_wake() before any waiters are requeued for uaddr2, no other thread is woken up" The ordering constraints for the hash bucket waiter counting are that the waiter counts have to be incremented _before_ getting the spinlock (because the spinlock acts as part of the memory barrier), but the "requeue" operation didn't honor those rules, and nobody had even thought about that case. This fairly simple patch just increments the waiter count for the target hash bucket (hb2) when requeuing a futex before taking the locks. It then decrements them again after releasing the lock - the code that actually moves the futex(es) between hash buckets will do the additional required waiter count housekeeping. Reported-and-tested-by: Jan Stancek Acked-by: Davidlohr Bueso Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org # 3.14 Signed-off-by: Linus Torvalds --- kernel/futex.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 67dacaf93e56..6801b3751a95 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1452,6 +1452,7 @@ retry: hb2 = hash_futex(&key2); retry_private: + hb_waiters_inc(hb2); double_lock_hb(hb1, hb2); if (likely(cmpval != NULL)) { @@ -1461,6 +1462,7 @@ retry_private: if (unlikely(ret)) { double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); ret = get_user(curval, uaddr1); if (ret) @@ -1510,6 +1512,7 @@ retry_private: break; case -EFAULT: double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); put_futex_key(&key2); put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); @@ -1519,6 +1522,7 @@ retry_private: case -EAGAIN: /* The owner was exiting, try again.
*/ double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); put_futex_key(&key2); put_futex_key(&key1); cond_resched(); @@ -1594,6 +1598,7 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); /* * drop_futex_key_refs() must be called outside the spinlocks. During -- cgit v1.2.3 From abb43f6998eb6466ea392d3757e673bbdb6ae171 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 9 Apr 2014 17:06:08 -0400 Subject: tracing: Fix anonymous unions in struct ftrace_event_call gcc <= 4.5.x has significant limitations with respect to initialization of anonymous unions within structures. They need to be surrounded by brackets, _and_ they need to be initialized in the same order in which they appear in the structure declaration. Link: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10676 Link: http://lkml.kernel.org/r/1397077568-3156-1-git-send-email-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/trace/trace_export.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index ee0a5098ac43..d4ddde28a81a 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -173,9 +173,11 @@ struct ftrace_event_class __refdata event_class_ftrace_##call = { \ }; \ \ struct ftrace_event_call __used event_##call = { \ - .name = #call, \ - .event.type = etype, \ .class = &event_class_ftrace_##call, \ + { \ + .name = #call, \ + }, \ + .event.type = etype, \ .print_fmt = print, \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ }; \ -- cgit v1.2.3 From 17a280ea8111c66791c18c0353b7986aafcb24fe Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 10 Apr 2014 22:43:37 -0400 Subject: tracing: Add missing function triggers dump and cpudump to README The debugfs tracing README file lists all the function triggers except for dump and cpudump. These should be added too. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9be67c5e5b0f..e3e665685ee5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3611,6 +3611,8 @@ static const char readme_msg[] = #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" #endif + "\t\t dump\n" + "\t\t cpudump\n" "\t example: echo do_fault:traceoff > set_ftrace_filter\n" "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" "\t The first one will disable tracing every time do_fault is hit\n" -- cgit v1.2.3 From a786c06d9f2719203c00b3d97b21f9a96980d0b5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 11 Apr 2014 12:01:03 -0400 Subject: missing bits of "splice: fix racy pipe->buffers uses" that commit has fixed only the parts of that mess in fs/splice.c itself; there had been more in several other ->splice_read() instances... Signed-off-by: Al Viro --- kernel/relay.c | 2 +- kernel/trace/trace.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 98833f664fb6..7d38607649a3 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1251,7 +1251,7 @@ static ssize_t subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); + nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max); for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7511de35257f..27924caaa124 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4410,7 +4410,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. 
*/ - for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { spd.pages[i] = alloc_page(GFP_KERNEL); if (!spd.pages[i]) break; @@ -5267,7 +5267,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); - for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; -- cgit v1.2.3 From d7e8af1afeffb03ab250b91cd70ba8c701f0f2b7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 9 Apr 2014 11:55:07 -0700 Subject: futex: update documentation for ordering guarantees Commits 11d4616bd07f ("futex: revert back to the explicit waiter counting code") and 69cd9eba3886 ("futex: avoid race between requeue and wake") changed some of the finer details of how we think about futexes. One was a late fix and the other a consequence of overlooking the whole requeuing logic. The first change caused our documentation to be incorrect, and the second made us aware that we need to explicitly add more details to it. Signed-off-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- kernel/futex.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6801b3751a95..5f589279e462 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -70,7 +70,10 @@ #include "locking/rtmutex_common.h" /* - * Basic futex operation and ordering guarantees: + * READ this before attempting to hack on futexes! + * + * Basic futex operation and ordering guarantees + * ============================================= * * The waiter reads the futex value in user space and calls * futex_wait(). 
This function computes the hash bucket and acquires @@ -119,7 +122,7 @@ * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * - * waiters++; + * waiters++; (a) * mb(); (A) <-- paired with -. * | * lock(hash_bucket(futex)); | @@ -135,14 +138,14 @@ * unlock(hash_bucket(futex)); * schedule(); if (waiters) * lock(hash_bucket(futex)); - * wake_waiters(futex); - * unlock(hash_bucket(futex)); + * else wake_waiters(futex); + * waiters--; (b) unlock(hash_bucket(futex)); * - * Where (A) orders the waiters increment and the futex value read -- this - * is guaranteed by the head counter in the hb spinlock; and where (B) - * orders the write to futex and the waiters read -- this is done by the - * barriers in get_futex_key_refs(), through either ihold or atomic_inc, - * depending on the futex type. + * Where (A) orders the waiters increment and the futex value read through + * atomic operations (see hb_waiters_inc) and where (B) orders the write + * to futex and the waiters read -- this is done by the barriers in + * get_futex_key_refs(), through either ihold or atomic_inc, depending on the + * futex type. * * This yields the following case (where X:=waiters, Y:=futex): * @@ -155,6 +158,17 @@ * Which guarantees that x==0 && y==0 is impossible; which translates back into * the guarantee that we cannot both miss the futex variable change and the * enqueue. + * + * Note that a new waiter is accounted for in (a) even when it is possible that + * the wait call can return error, in which case we backtrack from it in (b). + * Refer to the comment in queue_lock(). + * + * Similarly, in order to account for waiters being requeued on another + * address we always increment the waiters for the destination bucket before + * acquiring the lock. It then decrements them again after releasing it - + * the code that actually moves the futex(es) between hash buckets (requeue_futex) + * will do the additional required waiter count housekeeping. 
This is done for + * double_lock_hb() and double_unlock_hb(), respectively. */ #ifndef CONFIG_HAVE_FUTEX_CMPXCHG -- cgit v1.2.3 From 2eac7648321f4a08aa4078504d7727af0af7173b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 14 Apr 2014 21:02:59 +0200 Subject: seccomp: fix populating a0-a5 syscall args in 32-bit x86 BPF Linus reports that on 32-bit x86 Chromium throws the following seccomp resp. audit log messages: audit: type=1326 audit(1397359304.356:28108): auid=500 uid=500 gid=500 ses=2 subj=unconfined_u:unconfined_r:chrome_sandbox_t:s0-s0:c0.c1023 pid=3677 comm="chrome" exe="/opt/google/chrome/chrome" sig=0 syscall=172 compat=0 ip=0xb2dd9852 code=0x30000 audit: type=1326 audit(1397359304.356:28109): auid=500 uid=500 gid=500 ses=2 subj=unconfined_u:unconfined_r:chrome_sandbox_t:s0-s0:c0.c1023 pid=3677 comm="chrome" exe="/opt/google/chrome/chrome" sig=0 syscall=5 compat=0 ip=0xb2dd9852 code=0x50000 These audit messages are being triggered via audit_seccomp() through __secure_computing() in seccomp mode (BPF) filter with seccomp return codes 0x30000 (== SECCOMP_RET_TRAP) and 0x50000 (== SECCOMP_RET_ERRNO) during filter runtime. Moreover, Linus reports that x86_64 Chromium seems fine. The underlying issue that explains this is that the implementation of populate_seccomp_data() is wrong. 
Our seccomp data structure sd that is being shared with user ABI is: struct seccomp_data { int nr; __u32 arch; __u64 instruction_pointer; __u64 args[6]; }; Therefore, a simple cast to 'unsigned long *' for storing the value of the syscall argument via syscall_get_arguments() is just wrong as on 32-bit x86 (or any other 32bit arch), it would result in storing a0-a5 at wrong offsets in args[] member, and thus i) could leak stack memory to user space and ii) tampers with the logic of seccomp BPF programs that read out and check for syscall arguments: syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]); Tested on 32-bit x86 with Google Chrome, unfortunately only via remote test machine through slow ssh X forwarding, but it fixes the issue on my side. So fix it up by storing args in type correct variables, gcc is clever and optimizes the copy away in other cases, e.g. x86_64. Fixes: bd4cf0ed331a ("net: filter: rework/optimize internal BPF interpreter's instruction set") Reported-and-bisected-by: Linus Torvalds Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Linus Torvalds Cc: Eric Paris Cc: James Morris Cc: Kees Cook Signed-off-by: David S. Miller --- kernel/seccomp.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d8d046c0726a..590c37925084 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -69,18 +69,17 @@ static void populate_seccomp_data(struct seccomp_data *sd) { struct task_struct *task = current; struct pt_regs *regs = task_pt_regs(task); + unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); sd->arch = syscall_get_arch(); - - /* Unroll syscall_get_args to help gcc on arm. 
*/ - syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]); - syscall_get_arguments(task, regs, 1, 1, (unsigned long *) &sd->args[1]); - syscall_get_arguments(task, regs, 2, 1, (unsigned long *) &sd->args[2]); - syscall_get_arguments(task, regs, 3, 1, (unsigned long *) &sd->args[3]); - syscall_get_arguments(task, regs, 4, 1, (unsigned long *) &sd->args[4]); - syscall_get_arguments(task, regs, 5, 1, (unsigned long *) &sd->args[5]); - + syscall_get_arguments(task, regs, 0, 6, args); + sd->args[0] = args[0]; + sd->args[1] = args[1]; + sd->args[2] = args[2]; + sd->args[3] = args[3]; + sd->args[4] = args[4]; + sd->args[5] = args[5]; sd->instruction_pointer = KSTK_EIP(task); } -- cgit v1.2.3 From e79323bd87808fdfbc68ce6c5371bd224d9672ee Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 14 Apr 2014 16:58:55 -0400 Subject: user namespace: fix incorrect memory barriers smp_read_barrier_depends() can be used if there is data dependency between the readers - i.e. if the read operation after the barrier uses address that was obtained from the read operation before the barrier. In this file, there is only control dependency, no data dependency, so the use of smp_read_barrier_depends() is incorrect. The code could fail in the following way: * the cpu predicts that idx < extents is true and starts executing the body of the for loop * the cpu fetches map->extent[0].first and map->extent[0].count * the cpu fetches map->nr_extents * the cpu verifies that idx < extents is true, so it commits the instructions in the body of the for loop The problem is that in this scenario, the cpu read map->extent[0].first and map->nr_extents in the wrong order. We need a full read memory barrier to prevent it.
Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0d8f6023fd8d..bf71b4b2d632 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -152,7 +152,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; @@ -176,7 +176,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; @@ -199,7 +199,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; @@ -615,9 +615,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write - * order and smp_read_barrier_depends() is guaranteed that we - * don't have crazy architectures returning stale data. - * + * order and smp_rmb() is guaranteed that we don't have crazy + * architectures returning stale data. 
*/ mutex_lock(&id_map_mutex); -- cgit v1.2.3 From 0acf07d240a84069c4a6651e6030cf35d30c7159 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Apr 2014 10:54:34 -0700 Subject: seccomp: fix memory leak on filter attach This sets the correct error code when final filter memory is unavailable, and frees the raw filter no matter what. unreferenced object 0xffff8800d6ea4000 (size 512): comm "sshd", pid 278, jiffies 4294898315 (age 46.653s) hex dump (first 32 bytes): 21 00 00 00 04 00 00 00 15 00 01 00 3e 00 00 c0 !...........>... 06 00 00 00 00 00 00 00 21 00 00 00 00 00 00 00 ........!....... backtrace: [] kmemleak_alloc+0x4e/0xb0 [] __kmalloc+0x280/0x320 [] prctl_set_seccomp+0x11e/0x3b0 [] SyS_prctl+0x3bb/0x4a0 [] system_call_fastpath+0x1a/0x1f [] 0xffffffffffffffff Reported-by: Masami Ichikawa Signed-off-by: Kees Cook Tested-by: Masami Ichikawa Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/seccomp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 590c37925084..b35c21503a36 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -255,6 +255,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) goto free_prog; /* Allocate a new seccomp_filter */ + ret = -ENOMEM; filter = kzalloc(sizeof(struct seccomp_filter) + sizeof(struct sock_filter_int) * new_len, GFP_KERNEL|__GFP_NOWARN); @@ -264,6 +265,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); if (ret) goto free_filter; + kfree(fp); atomic_set(&filter->usage, 1); filter->len = new_len; -- cgit v1.2.3