summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c14
-rw-r--r--kernel/async.c8
-rw-r--r--kernel/cgroup.c191
-rw-r--r--kernel/cpuset.c15
-rw-r--r--kernel/events/core.c7
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/irq/Kconfig3
-rw-r--r--kernel/irq/chip.c85
-rw-r--r--kernel/irq/internals.h16
-rw-r--r--kernel/irq/irqdesc.c42
-rw-r--r--kernel/irq/manage.c32
-rw-r--r--kernel/irq/pm.c159
-rw-r--r--kernel/irq_work.c15
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/power/process.c1
-rw-r--r--kernel/power/snapshot.c54
-rw-r--r--kernel/power/suspend.c51
-rw-r--r--kernel/power/suspend_test.c32
-rw-r--r--kernel/reboot.c81
-rw-r--r--kernel/resource.c70
-rw-r--r--kernel/sched/fair.c2
-rw-r--r--kernel/sys.c489
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/time/tick-common.c1
-rw-r--r--kernel/time/tick-internal.h7
-rw-r--r--kernel/time/tick-sched.c60
-rw-r--r--kernel/time/timer.c2
-rw-r--r--kernel/watchdog.c18
29 files changed, 1003 insertions, 468 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index b4c667d22e79..33738ef972f3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -472,7 +472,6 @@ static void do_acct_process(struct bsd_acct_struct *acct)
acct_t ac;
unsigned long flim;
const struct cred *orig_cred;
- struct pid_namespace *ns = acct->ns;
struct file *file = acct->file;
/*
@@ -500,10 +499,15 @@ static void do_acct_process(struct bsd_acct_struct *acct)
ac.ac_gid16 = ac.ac_gid;
#endif
#if ACCT_VERSION == 3
- ac.ac_pid = task_tgid_nr_ns(current, ns);
- rcu_read_lock();
- ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
- rcu_read_unlock();
+ {
+ struct pid_namespace *ns = acct->ns;
+
+ ac.ac_pid = task_tgid_nr_ns(current, ns);
+ rcu_read_lock();
+ ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
+ ns);
+ rcu_read_unlock();
+ }
#endif
/*
* Get freeze protection. If the fs is frozen, just skip the write
diff --git a/kernel/async.c b/kernel/async.c
index 61f023ce0228..4c3773c0bf63 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work)
/* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
+ pr_debug("calling %lli_%pF @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
@@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work)
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
- printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
+ pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
(long long)entry->cookie,
entry->func,
(long long)ktime_to_ns(delta) >> 10);
@@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
ktime_t uninitialized_var(starttime), delta, endtime;
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
+ pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
@@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
- printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
+ pr_debug("async_continuing @ %i after %lli usec\n",
task_pid_nr(current),
(long long)ktime_to_ns(delta) >> 10);
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3a73f995a81e..136eceadeed1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
-static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
@@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
@@ -331,14 +329,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
return false;
}
-static int cgroup_is_releasable(const struct cgroup *cgrp)
-{
- const int bits =
- (1 << CGRP_RELEASABLE) |
- (1 << CGRP_NOTIFY_ON_RELEASE);
- return (cgrp->flags & bits) == bits;
-}
-
static int notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -394,12 +384,7 @@ static int notify_on_release(const struct cgroup *cgrp)
; \
else
-/* the list of cgroups eligible for automatic release. Protected by
- * release_list_lock */
-static LIST_HEAD(release_list);
-static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
-static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
/*
@@ -498,7 +483,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
return key;
}
-static void put_css_set_locked(struct css_set *cset, bool taskexit)
+static void put_css_set_locked(struct css_set *cset)
{
struct cgrp_cset_link *link, *tmp_link;
struct cgroup_subsys *ss;
@@ -524,11 +509,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
/* @cgrp can't go away while we're holding css_set_rwsem */
if (list_empty(&cgrp->cset_links)) {
cgroup_update_populated(cgrp, false);
- if (notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
+ check_for_release(cgrp);
}
kfree(link);
@@ -537,7 +518,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
kfree_rcu(cset, rcu_head);
}
-static void put_css_set(struct css_set *cset, bool taskexit)
+static void put_css_set(struct css_set *cset)
{
/*
* Ensure that the refcount doesn't hit zero while any readers
@@ -548,7 +529,7 @@ static void put_css_set(struct css_set *cset, bool taskexit)
return;
down_write(&css_set_rwsem);
- put_css_set_locked(cset, taskexit);
+ put_css_set_locked(cset);
up_write(&css_set_rwsem);
}
@@ -969,14 +950,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* knows that the cgroup won't be removed, as cgroup_rmdir()
* needs that mutex.
*
- * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
- * (usually) take cgroup_mutex. These are the two most performance
- * critical pieces of code here. The exception occurs on cgroup_exit(),
- * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
- * is taken, and if the cgroup count is zero, a usermode call made
- * to the release agent with the name of the cgroup (path relative to
- * the root of cgroup file system) as the argument.
- *
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
@@ -1587,7 +1560,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
INIT_LIST_HEAD(&cgrp->cset_links);
- INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
@@ -1597,6 +1569,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
init_waitqueue_head(&cgrp->offline_waitq);
+ INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
}
static void init_cgroup_root(struct cgroup_root *root,
@@ -1634,7 +1607,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
goto out;
root_cgrp->id = ret;
- ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
+ GFP_KERNEL);
if (ret)
goto out;
@@ -2052,8 +2026,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
* task. As trading it for new_cset is protected by cgroup_mutex,
* we're safe to drop it here; it will be freed under RCU.
*/
- set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
- put_css_set_locked(old_cset, false);
+ put_css_set_locked(old_cset);
}
/**
@@ -2074,7 +2047,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
cset->mg_src_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
- put_css_set_locked(cset, false);
+ put_css_set_locked(cset);
}
up_write(&css_set_rwsem);
}
@@ -2168,8 +2141,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
list_del_init(&src_cset->mg_preload_node);
- put_css_set(src_cset, false);
- put_css_set(dst_cset, false);
+ put_css_set(src_cset);
+ put_css_set(dst_cset);
continue;
}
@@ -2178,7 +2151,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
if (list_empty(&dst_cset->mg_preload_node))
list_add(&dst_cset->mg_preload_node, &csets);
else
- put_css_set(dst_cset, false);
+ put_css_set(dst_cset);
}
list_splice_tail(&csets, preloaded_csets);
@@ -4173,7 +4146,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
- clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
else
@@ -4351,6 +4323,7 @@ static void css_free_work_fn(struct work_struct *work)
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
cgroup_pidlist_destroy_all(cgrp);
+ cancel_work_sync(&cgrp->release_agent_work);
if (cgroup_parent(cgrp)) {
/*
@@ -4510,7 +4483,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
init_and_link_css(css, ss, cgrp);
- err = percpu_ref_init(&css->refcnt, css_release);
+ err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
if (err)
goto err_free_css;
@@ -4583,7 +4556,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
goto out_unlock;
}
- ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
+ ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
if (ret)
goto out_free_cgrp;
@@ -4813,19 +4786,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
for_each_css(css, ssid, cgrp)
kill_css(css);
- /* CSS_ONLINE is clear, remove from ->release_list for the last time */
- raw_spin_lock(&release_list_lock);
- if (!list_empty(&cgrp->release_list))
- list_del_init(&cgrp->release_list);
- raw_spin_unlock(&release_list_lock);
-
/*
* Remove @cgrp directory along with the base files. @cgrp has an
* extra ref on its kn.
*/
kernfs_remove(cgrp->kn);
- set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
check_for_release(cgroup_parent(cgrp));
/* put the base reference */
@@ -4842,13 +4808,10 @@ static int cgroup_rmdir(struct kernfs_node *kn)
cgrp = cgroup_kn_lock_live(kn);
if (!cgrp)
return 0;
- cgroup_get(cgrp); /* for @kn->priv clearing */
ret = cgroup_destroy_locked(cgrp);
cgroup_kn_unlock(kn);
-
- cgroup_put(cgrp);
return ret;
}
@@ -5052,12 +5015,9 @@ core_initcall(cgroup_wq_init);
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
*/
-
-/* TODO: Use a proper seq_file iterator */
-int proc_cgroup_show(struct seq_file *m, void *v)
+int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
{
- struct pid *pid;
- struct task_struct *tsk;
char *buf, *path;
int retval;
struct cgroup_root *root;
@@ -5067,14 +5027,6 @@ int proc_cgroup_show(struct seq_file *m, void *v)
if (!buf)
goto out;
- retval = -ESRCH;
- pid = m->private;
- tsk = get_pid_task(pid, PIDTYPE_PID);
- if (!tsk)
- goto out_free;
-
- retval = 0;
-
mutex_lock(&cgroup_mutex);
down_read(&css_set_rwsem);
@@ -5104,11 +5056,10 @@ int proc_cgroup_show(struct seq_file *m, void *v)
seq_putc(m, '\n');
}
+ retval = 0;
out_unlock:
up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
- put_task_struct(tsk);
-out_free:
kfree(buf);
out:
return retval;
@@ -5179,7 +5130,7 @@ void cgroup_post_fork(struct task_struct *child)
int i;
/*
- * This may race against cgroup_enable_task_cg_links(). As that
+ * This may race against cgroup_enable_task_cg_lists(). As that
* function sets use_task_css_set_links before grabbing
* tasklist_lock and we just went through tasklist_lock to add
* @child, it's guaranteed that either we see the set
@@ -5194,7 +5145,7 @@ void cgroup_post_fork(struct task_struct *child)
* when implementing operations which need to migrate all tasks of
* a cgroup to another.
*
- * Note that if we lose to cgroup_enable_task_cg_links(), @child
+ * Note that if we lose to cgroup_enable_task_cg_lists(), @child
* will remain in init_css_set. This is safe because all tasks are
* in the init_css_set before cg_links is enabled and there's no
* operation which transfers all tasks out of init_css_set.
@@ -5278,30 +5229,14 @@ void cgroup_exit(struct task_struct *tsk)
}
if (put_cset)
- put_css_set(cset, true);
+ put_css_set(cset);
}
static void check_for_release(struct cgroup *cgrp)
{
- if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
- !css_has_online_children(&cgrp->self)) {
- /*
- * Control Group is currently removeable. If it's not
- * already queued for a userspace notification, queue
- * it now
- */
- int need_schedule_work = 0;
-
- raw_spin_lock(&release_list_lock);
- if (!cgroup_is_dead(cgrp) &&
- list_empty(&cgrp->release_list)) {
- list_add(&cgrp->release_list, &release_list);
- need_schedule_work = 1;
- }
- raw_spin_unlock(&release_list_lock);
- if (need_schedule_work)
- schedule_work(&release_agent_work);
- }
+ if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
+ !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+ schedule_work(&cgrp->release_agent_work);
}
/*
@@ -5329,52 +5264,39 @@ static void check_for_release(struct cgroup *cgrp)
*/
static void cgroup_release_agent(struct work_struct *work)
{
- BUG_ON(work != &release_agent_work);
+ struct cgroup *cgrp =
+ container_of(work, struct cgroup, release_agent_work);
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
+ char *argv[3], *envp[3];
+
mutex_lock(&cgroup_mutex);
- raw_spin_lock(&release_list_lock);
- while (!list_empty(&release_list)) {
- char *argv[3], *envp[3];
- int i;
- char *pathbuf = NULL, *agentbuf = NULL, *path;
- struct cgroup *cgrp = list_entry(release_list.next,
- struct cgroup,
- release_list);
- list_del_init(&cgrp->release_list);
- raw_spin_unlock(&release_list_lock);
- pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!pathbuf)
- goto continue_free;
- path = cgroup_path(cgrp, pathbuf, PATH_MAX);
- if (!path)
- goto continue_free;
- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!agentbuf)
- goto continue_free;
-
- i = 0;
- argv[i++] = agentbuf;
- argv[i++] = path;
- argv[i] = NULL;
-
- i = 0;
- /* minimal command environment */
- envp[i++] = "HOME=/";
- envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[i] = NULL;
-
- /* Drop the lock while we invoke the usermode helper,
- * since the exec could involve hitting disk and hence
- * be a slow process */
- mutex_unlock(&cgroup_mutex);
- call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
- mutex_lock(&cgroup_mutex);
- continue_free:
- kfree(pathbuf);
- kfree(agentbuf);
- raw_spin_lock(&release_list_lock);
- }
- raw_spin_unlock(&release_list_lock);
+
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+ if (!pathbuf || !agentbuf)
+ goto out;
+
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
+ goto out;
+
+ argv[0] = agentbuf;
+ argv[1] = path;
+ argv[2] = NULL;
+
+ /* minimal command environment */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ mutex_unlock(&cgroup_mutex);
+ call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ goto out_free;
+out:
mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(agentbuf);
+ kfree(pathbuf);
}
static int __init cgroup_disable(char *str)
@@ -5562,7 +5484,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+ return (!cgroup_has_tasks(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
}
static struct cftype debug_files[] = {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 52cb04c993b7..1f107c74087b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2730,10 +2730,9 @@ void __cpuset_memory_pressure_bump(void)
* and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
-int proc_cpuset_show(struct seq_file *m, void *unused_v)
+int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
{
- struct pid *pid;
- struct task_struct *tsk;
char *buf, *p;
struct cgroup_subsys_state *css;
int retval;
@@ -2743,24 +2742,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!buf)
goto out;
- retval = -ESRCH;
- pid = m->private;
- tsk = get_pid_task(pid, PIDTYPE_PID);
- if (!tsk)
- goto out_free;
-
retval = -ENAMETOOLONG;
rcu_read_lock();
css = task_css(tsk, cpuset_cgrp_id);
p = cgroup_path(css->cgroup, buf, PATH_MAX);
rcu_read_unlock();
if (!p)
- goto out_put_task;
+ goto out_free;
seq_puts(m, p);
seq_putc(m, '\n');
retval = 0;
-out_put_task:
- put_task_struct(tsk);
out_free:
kfree(buf);
out:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 963bf139e2b2..b1c663593f5c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -392,14 +392,9 @@ perf_cgroup_match(struct perf_event *event)
event->cgrp->css.cgroup);
}
-static inline void perf_put_cgroup(struct perf_event *event)
-{
- css_put(&event->cgrp->css);
-}
-
static inline void perf_detach_cgroup(struct perf_event *event)
{
- perf_put_cgroup(event);
+ css_put(&event->cgrp->css);
event->cgrp = NULL;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index a91e47d86de2..8c162d102740 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -601,9 +601,8 @@ static void check_mm(struct mm_struct *mm)
printk(KERN_ALERT "BUG: Bad rss-counter state "
"mm:%p idx:%d val:%ld\n", mm, i, x);
}
-
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- VM_BUG_ON(mm->pmd_huge_pte);
+ VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d269cecdfbf0..225086b2652e 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -55,6 +55,9 @@ config GENERIC_IRQ_CHIP
config IRQ_DOMAIN
bool
+config HANDLE_DOMAIN_IRQ
+ bool
+
config IRQ_DOMAIN_DEBUG
bool "Expose hardware/virtual IRQ mapping via debugfs"
depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6223fab9a9d2..8fb52e9bddc1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -342,6 +342,31 @@ static bool irq_check_poll(struct irq_desc *desc)
return irq_wait_for_poll(desc);
}
+static bool irq_may_run(struct irq_desc *desc)
+{
+ unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
+
+ /*
+ * If the interrupt is not in progress and is not an armed
+ * wakeup interrupt, proceed.
+ */
+ if (!irqd_has_set(&desc->irq_data, mask))
+ return true;
+
+ /*
+ * If the interrupt is an armed wakeup source, mark it pending
+ * and suspended, disable it and notify the pm core about the
+ * event.
+ */
+ if (irq_pm_check_wakeup(desc))
+ return false;
+
+ /*
+ * Handle a potential concurrent poll on a different core.
+ */
+ return irq_check_poll(desc);
+}
+
/**
* handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number
@@ -359,9 +384,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
- if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
- if (!irq_check_poll(desc))
- goto out_unlock;
+ if (!irq_may_run(desc))
+ goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
@@ -412,9 +436,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
mask_ack_irq(desc);
- if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
- if (!irq_check_poll(desc))
- goto out_unlock;
+ if (!irq_may_run(desc))
+ goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
@@ -485,9 +508,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
- if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
- if (!irq_check_poll(desc))
- goto out;
+ if (!irq_may_run(desc))
+ goto out;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
@@ -541,19 +563,23 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (!irq_may_run(desc)) {
+ desc->istate |= IRQS_PENDING;
+ mask_ack_irq(desc);
+ goto out_unlock;
+ }
+
/*
- * If we're currently running this IRQ, or its disabled,
- * we shouldn't process the IRQ. Mark it pending, handle
- * the necessary masking and go out
+ * If its disabled or no action available then mask it and get
+ * out of here.
*/
- if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
- irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
- if (!irq_check_poll(desc)) {
- desc->istate |= IRQS_PENDING;
- mask_ack_irq(desc);
- goto out_unlock;
- }
+ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ desc->istate |= IRQS_PENDING;
+ mask_ack_irq(desc);
+ goto out_unlock;
}
+
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
@@ -602,18 +628,21 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (!irq_may_run(desc)) {
+ desc->istate |= IRQS_PENDING;
+ goto out_eoi;
+ }
+
/*
- * If we're currently running this IRQ, or its disabled,
- * we shouldn't process the IRQ. Mark it pending, handle
- * the necessary masking and go out
+ * If its disabled or no action available then mask it and get
+ * out of here.
*/
- if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
- irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
- if (!irq_check_poll(desc)) {
- desc->istate |= IRQS_PENDING;
- goto out_eoi;
- }
+ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ desc->istate |= IRQS_PENDING;
+ goto out_eoi;
}
+
kstat_incr_irqs_this_cpu(irq, desc);
do {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 099ea2e0eb88..4332d766619d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -63,8 +63,8 @@ enum {
extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags);
-extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
-extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
+extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
+extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
extern int irq_startup(struct irq_desc *desc, bool resend);
extern void irq_shutdown(struct irq_desc *desc);
@@ -194,3 +194,15 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
}
+
+#ifdef CONFIG_PM_SLEEP
+bool irq_pm_check_wakeup(struct irq_desc *desc);
+void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
+void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
+#else
+static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
+static inline void
+irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
+static inline void
+irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
+#endif
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1487a123db5c..a1782f88f0af 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -14,6 +14,7 @@
#include <linux/kernel_stat.h>
#include <linux/radix-tree.h>
#include <linux/bitmap.h>
+#include <linux/irqdomain.h>
#include "internals.h"
@@ -336,6 +337,47 @@ int generic_handle_irq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
+#ifdef CONFIG_HANDLE_DOMAIN_IRQ
+/**
+ * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ * @lookup: Whether to perform the domain lookup or not
+ * @regs: Register file coming from the low-level handling code
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ */
+int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
+ bool lookup, struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ unsigned int irq = hwirq;
+ int ret = 0;
+
+ irq_enter();
+
+#ifdef CONFIG_IRQ_DOMAIN
+ if (lookup)
+ irq = irq_find_mapping(domain, hwirq);
+#endif
+
+ /*
+ * Some hardware gives randomly wrong interrupts. Rather
+ * than crashing, do something sensible.
+ */
+ if (unlikely(!irq || irq >= nr_irqs)) {
+ ack_bad_irq(irq);
+ ret = -EINVAL;
+ } else {
+ generic_handle_irq(irq);
+ }
+
+ irq_exit();
+ set_irq_regs(old_regs);
+ return ret;
+}
+#endif
+
/* Dynamic interrupt handling */
/**
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3dc6a61bf06a..0a9104b4608b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -382,14 +382,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
}
#endif
-void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
+void __disable_irq(struct irq_desc *desc, unsigned int irq)
{
- if (suspend) {
- if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
- return;
- desc->istate |= IRQS_SUSPENDED;
- }
-
if (!desc->depth++)
irq_disable(desc);
}
@@ -401,7 +395,7 @@ static int __disable_irq_nosync(unsigned int irq)
if (!desc)
return -EINVAL;
- __disable_irq(desc, irq, false);
+ __disable_irq(desc, irq);
irq_put_desc_busunlock(desc, flags);
return 0;
}
@@ -442,20 +436,8 @@ void disable_irq(unsigned int irq)
}
EXPORT_SYMBOL(disable_irq);
-void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
+void __enable_irq(struct irq_desc *desc, unsigned int irq)
{
- if (resume) {
- if (!(desc->istate & IRQS_SUSPENDED)) {
- if (!desc->action)
- return;
- if (!(desc->action->flags & IRQF_FORCE_RESUME))
- return;
- /* Pretend that it got disabled ! */
- desc->depth++;
- }
- desc->istate &= ~IRQS_SUSPENDED;
- }
-
switch (desc->depth) {
case 0:
err_out:
@@ -497,7 +479,7 @@ void enable_irq(unsigned int irq)
KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
goto out;
- __enable_irq(desc, irq, false);
+ __enable_irq(desc, irq);
out:
irq_put_desc_busunlock(desc, flags);
}
@@ -1218,6 +1200,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->irq = irq;
*old_ptr = new;
+ irq_pm_install_action(desc, new);
+
/* Reset broken irq detection when installing new handler */
desc->irq_count = 0;
desc->irqs_unhandled = 0;
@@ -1228,7 +1212,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
desc->istate &= ~IRQS_SPURIOUS_DISABLED;
- __enable_irq(desc, irq, false);
+ __enable_irq(desc, irq);
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -1336,6 +1320,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* Found it - now remove it from the list of entries: */
*action_ptr = action->next;
+ irq_pm_remove_action(desc, action);
+
/* If this was the last handler, shut down the IRQ line: */
if (!desc->action) {
irq_shutdown(desc);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index abcd6ca86cb7..3ca532592704 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,17 +9,105 @@
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include "internals.h"
+bool irq_pm_check_wakeup(struct irq_desc *desc)
+{
+ if (irqd_is_wakeup_armed(&desc->irq_data)) {
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
+ desc->depth++;
+ irq_disable(desc);
+ pm_system_wakeup();
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Called from __setup_irq() with desc->lock held after @action has
+ * been installed in the action chain.
+ */
+void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
+{
+ desc->nr_actions++;
+
+ if (action->flags & IRQF_FORCE_RESUME)
+ desc->force_resume_depth++;
+
+ WARN_ON_ONCE(desc->force_resume_depth &&
+ desc->force_resume_depth != desc->nr_actions);
+
+ if (action->flags & IRQF_NO_SUSPEND)
+ desc->no_suspend_depth++;
+
+ WARN_ON_ONCE(desc->no_suspend_depth &&
+ desc->no_suspend_depth != desc->nr_actions);
+}
+
+/*
+ * Called from __free_irq() with desc->lock held after @action has
+ * been removed from the action chain.
+ */
+void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
+{
+ desc->nr_actions--;
+
+ if (action->flags & IRQF_FORCE_RESUME)
+ desc->force_resume_depth--;
+
+ if (action->flags & IRQF_NO_SUSPEND)
+ desc->no_suspend_depth--;
+}
+
+static bool suspend_device_irq(struct irq_desc *desc, int irq)
+{
+ if (!desc->action || desc->no_suspend_depth)
+ return false;
+
+ if (irqd_is_wakeup_set(&desc->irq_data)) {
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ /*
+ * We return true here to force the caller to issue
+ * synchronize_irq(). We need to make sure that the
+ * IRQD_WAKEUP_ARMED is visible before we return from
+ * suspend_device_irqs().
+ */
+ return true;
+ }
+
+ desc->istate |= IRQS_SUSPENDED;
+ __disable_irq(desc, irq);
+
+ /*
+ * Hardware which has no wakeup source configuration facility
+ * requires that the non wakeup interrupts are masked at the
+ * chip level. The chip implementation indicates that with
+ * IRQCHIP_MASK_ON_SUSPEND.
+ */
+ if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+ mask_irq(desc);
+ return true;
+}
+
/**
* suspend_device_irqs - disable all currently enabled interrupt lines
*
- * During system-wide suspend or hibernation device drivers need to be prevented
- * from receiving interrupts and this function is provided for this purpose.
- * It marks all interrupt lines in use, except for the timer ones, as disabled
- * and sets the IRQS_SUSPENDED flag for each of them.
+ * During system-wide suspend or hibernation device drivers need to be
+ * prevented from receiving interrupts and this function is provided
+ * for this purpose.
+ *
+ * So we disable all interrupts and mark them IRQS_SUSPENDED except
+ * for those which are unused, those which are marked as not
+ * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND
+ * set and those which are marked as active wakeup sources.
+ *
+ * The active wakeup sources are handled by the flow handler entry
+ * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the
+ * interrupt and notifies the pm core about the wakeup.
*/
void suspend_device_irqs(void)
{
@@ -28,18 +116,36 @@ void suspend_device_irqs(void)
for_each_irq_desc(irq, desc) {
unsigned long flags;
+ bool sync;
raw_spin_lock_irqsave(&desc->lock, flags);
- __disable_irq(desc, irq, true);
+ sync = suspend_device_irq(desc, irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
- }
- for_each_irq_desc(irq, desc)
- if (desc->istate & IRQS_SUSPENDED)
+ if (sync)
synchronize_irq(irq);
+ }
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
+static void resume_irq(struct irq_desc *desc, int irq)
+{
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+
+ if (desc->istate & IRQS_SUSPENDED)
+ goto resume;
+
+ /* Force resume the interrupt? */
+ if (!desc->force_resume_depth)
+ return;
+
+ /* Pretend that it got disabled ! */
+ desc->depth++;
+resume:
+ desc->istate &= ~IRQS_SUSPENDED;
+ __enable_irq(desc, irq);
+}
+
static void resume_irqs(bool want_early)
{
struct irq_desc *desc;
@@ -54,7 +160,7 @@ static void resume_irqs(bool want_early)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
- __enable_irq(desc, irq, true);
+ resume_irq(desc, irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -93,38 +199,3 @@ void resume_device_irqs(void)
resume_irqs(false);
}
EXPORT_SYMBOL_GPL(resume_device_irqs);
-
-/**
- * check_wakeup_irqs - check if any wake-up interrupts are pending
- */
-int check_wakeup_irqs(void)
-{
- struct irq_desc *desc;
- int irq;
-
- for_each_irq_desc(irq, desc) {
- /*
- * Only interrupts which are marked as wakeup source
- * and have not been disabled before the suspend check
- * can abort suspend.
- */
- if (irqd_is_wakeup_set(&desc->irq_data)) {
- if (desc->depth == 1 && desc->istate & IRQS_PENDING)
- return -EBUSY;
- continue;
- }
- /*
- * Check the non wakeup interrupts whether they need
- * to be masked before finally going into suspend
- * state. That's for hardware which has no wakeup
- * source configuration facility. The chip
- * implementation indicates that with
- * IRQCHIP_MASK_ON_SUSPEND.
- */
- if (desc->istate & IRQS_SUSPENDED &&
- irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
- mask_irq(desc);
- }
-
- return 0;
-}
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index e6bcbe756663..385b85aded19 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -115,8 +115,10 @@ bool irq_work_needs_cpu(void)
raised = &__get_cpu_var(raised_list);
lazy = &__get_cpu_var(lazy_list);
- if (llist_empty(raised) && llist_empty(lazy))
- return false;
+
+ if (llist_empty(raised) || arch_irq_work_has_interrupt())
+ if (llist_empty(lazy))
+ return false;
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -171,6 +173,15 @@ void irq_work_run(void)
}
EXPORT_SYMBOL_GPL(irq_work_run);
+void irq_work_tick(void)
+{
+ struct llist_head *raised = &__get_cpu_var(raised_list);
+
+ if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
+ irq_work_run_list(raised);
+ irq_work_run_list(&__get_cpu_var(lazy_list));
+}
+
/*
* Synchronize against the irq_work @entry, ensures the entry is not
* currently in use.
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ef483220e855..10e489c448fe 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
{
struct task_struct *p;
- p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt,
+ p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
cpu);
if (IS_ERR(p))
return p;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e4e4121fa327..bbef57f5bdfd 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -302,6 +302,10 @@ config PM_GENERIC_DOMAINS_RUNTIME
def_bool y
depends on PM_RUNTIME && PM_GENERIC_DOMAINS
+config PM_GENERIC_DOMAINS_OF
+ def_bool y
+ depends on PM_GENERIC_DOMAINS && OF
+
config CPU_PM
bool
depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 4ee194eb524b..7b323221b9ee 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -129,6 +129,7 @@ int freeze_processes(void)
if (!pm_freezing)
atomic_inc(&system_freezing_cnt);
+ pm_wakeup_clear();
printk("Freezing user space processes ... ");
pm_freezing = true;
error = try_to_freeze_tasks(true);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f1604d8cf489..791a61892bb5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -725,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
clear_bit(bit, addr);
}
+static void memory_bm_clear_current(struct memory_bitmap *bm)
+{
+ int bit;
+
+ bit = max(bm->cur.node_bit - 1, 0);
+ clear_bit(bit, bm->cur.node->data);
+}
+
static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
@@ -1333,23 +1341,39 @@ static struct memory_bitmap copy_bm;
void swsusp_free(void)
{
- struct zone *zone;
- unsigned long pfn, max_zone_pfn;
+ unsigned long fb_pfn, fr_pfn;
- for_each_populated_zone(zone) {
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
-
- if (swsusp_page_is_forbidden(page) &&
- swsusp_page_is_free(page)) {
- swsusp_unset_page_forbidden(page);
- swsusp_unset_page_free(page);
- __free_page(page);
- }
- }
+ if (!forbidden_pages_map || !free_pages_map)
+ goto out;
+
+ memory_bm_position_reset(forbidden_pages_map);
+ memory_bm_position_reset(free_pages_map);
+
+loop:
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+
+ /*
+ * Find the next bit set in both bitmaps. This is guaranteed to
+ * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP.
+ */
+ do {
+ if (fb_pfn < fr_pfn)
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+ if (fr_pfn < fb_pfn)
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ } while (fb_pfn != fr_pfn);
+
+ if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
+ struct page *page = pfn_to_page(fr_pfn);
+
+ memory_bm_clear_current(forbidden_pages_map);
+ memory_bm_clear_current(free_pages_map);
+ __free_page(page);
+ goto loop;
}
+
+out:
nr_copy_pages = 0;
nr_meta_pages = 0;
restore_pblist = NULL;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 18c62195660f..4ca9a33ff620 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -146,17 +146,29 @@ static int platform_suspend_prepare(suspend_state_t state)
static int platform_suspend_prepare_late(suspend_state_t state)
{
+ return state == PM_SUSPEND_FREEZE && freeze_ops->prepare ?
+ freeze_ops->prepare() : 0;
+}
+
+static int platform_suspend_prepare_noirq(suspend_state_t state)
+{
return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
suspend_ops->prepare_late() : 0;
}
-static void platform_suspend_wake(suspend_state_t state)
+static void platform_resume_noirq(suspend_state_t state)
{
if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
suspend_ops->wake();
}
-static void platform_suspend_finish(suspend_state_t state)
+static void platform_resume_early(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops->restore)
+ freeze_ops->restore();
+}
+
+static void platform_resume_finish(suspend_state_t state)
{
if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
suspend_ops->finish();
@@ -172,7 +184,7 @@ static int platform_suspend_begin(suspend_state_t state)
return 0;
}
-static void platform_suspend_end(suspend_state_t state)
+static void platform_resume_end(suspend_state_t state)
{
if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
freeze_ops->end();
@@ -180,7 +192,7 @@ static void platform_suspend_end(suspend_state_t state)
suspend_ops->end();
}
-static void platform_suspend_recover(suspend_state_t state)
+static void platform_recover(suspend_state_t state)
{
if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
suspend_ops->recover();
@@ -265,13 +277,22 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (error)
goto Platform_finish;
- error = dpm_suspend_end(PMSG_SUSPEND);
+ error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
- printk(KERN_ERR "PM: Some devices failed to power down\n");
+ printk(KERN_ERR "PM: late suspend of devices failed\n");
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
if (error)
+ goto Devices_early_resume;
+
+ error = dpm_suspend_noirq(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "PM: noirq suspend of devices failed\n");
+ goto Platform_early_resume;
+ }
+ error = platform_suspend_prepare_noirq(state);
+ if (error)
goto Platform_wake;
if (suspend_test(TEST_PLATFORM))
@@ -318,11 +339,17 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
enable_nonboot_cpus();
Platform_wake:
- platform_suspend_wake(state);
- dpm_resume_start(PMSG_RESUME);
+ platform_resume_noirq(state);
+ dpm_resume_noirq(PMSG_RESUME);
+
+ Platform_early_resume:
+ platform_resume_early(state);
+
+ Devices_early_resume:
+ dpm_resume_early(PMSG_RESUME);
Platform_finish:
- platform_suspend_finish(state);
+ platform_resume_finish(state);
return error;
}
@@ -361,14 +388,16 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
+ trace_suspend_resume(TPS("resume_console"), state, true);
resume_console();
+ trace_suspend_resume(TPS("resume_console"), state, false);
Close:
- platform_suspend_end(state);
+ platform_resume_end(state);
return error;
Recover_platform:
- platform_suspend_recover(state);
+ platform_recover(state);
goto Resume_devices;
}
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index bd91bc177c93..084452e34a12 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -22,6 +22,8 @@
#define TEST_SUSPEND_SECONDS 10
static unsigned long suspend_test_start_time;
+static u32 test_repeat_count_max = 1;
+static u32 test_repeat_count_current;
void suspend_test_start(void)
{
@@ -74,6 +76,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
int status;
/* this may fail if the RTC hasn't been initialized */
+repeat:
status = rtc_read_time(rtc, &alm.time);
if (status < 0) {
printk(err_readtime, dev_name(&rtc->dev), status);
@@ -100,10 +103,21 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
if (state == PM_SUSPEND_STANDBY) {
printk(info_test, pm_states[state]);
status = pm_suspend(state);
+ if (status < 0)
+ state = PM_SUSPEND_FREEZE;
}
+ if (state == PM_SUSPEND_FREEZE) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ }
+
if (status < 0)
printk(err_suspend, status);
+ test_repeat_count_current++;
+ if (test_repeat_count_current < test_repeat_count_max)
+ goto repeat;
+
/* Some platforms can't detect that the alarm triggered the
* wakeup, or (accordingly) disable it after it afterwards.
* It's supposed to give oneshot behavior; cope.
@@ -137,16 +151,28 @@ static char warn_bad_state[] __initdata =
static int __init setup_test_suspend(char *value)
{
int i;
+ char *repeat;
+ char *suspend_type;
- /* "=mem" ==> "mem" */
+ /* example : "=mem[,N]" ==> "mem[,N]" */
value++;
+ suspend_type = strsep(&value, ",");
+ if (!suspend_type)
+ return 0;
+
+ repeat = strsep(&value, ",");
+ if (repeat) {
+ if (kstrtou32(repeat, 0, &test_repeat_count_max))
+ return 0;
+ }
+
for (i = 0; pm_labels[i]; i++)
- if (!strcmp(pm_labels[i], value)) {
+ if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
return 0;
}
- printk(warn_bad_state, value);
+ printk(warn_bad_state, suspend_type);
return 0;
}
__setup("test_suspend", setup_test_suspend);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index a3a9e240fcdb..5925f5ae8dff 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,87 @@ int unregister_reboot_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_reboot_notifier);
+/*
+ * Notifier list for kernel code which wants to be called
+ * to restart the system.
+ */
+static ATOMIC_NOTIFIER_HEAD(restart_handler_list);
+
+/**
+ * register_restart_handler - Register function to be called to reset
+ * the system
+ * @nb: Info about handler function to be called
+ * @nb->priority: Handler priority. Handlers should follow the
+ * following guidelines for setting priorities.
+ * 0: Restart handler of last resort,
+ * with limited restart capabilities
+ * 128: Default restart handler; use if no other
+ * restart handler is expected to be available,
+ * and/or if restart functionality is
+ * sufficient to restart the entire system
+ * 255: Highest priority restart handler, will
+ * preempt all other restart handlers
+ *
+ * Registers a function with code to be called to restart the
+ * system.
+ *
+ * Registered functions will be called from machine_restart as last
+ * step of the restart sequence (if the architecture specific
+ * machine_restart function calls do_kernel_restart - see below
+ * for details).
+ * Registered functions are expected to restart the system immediately.
+ * If more than one function is registered, the restart handler priority
+ * selects which function will be called first.
+ *
+ * Restart handlers are expected to be registered from non-architecture
+ * code, typically from drivers. A typical use case would be a system
+ * where restart functionality is provided through a watchdog. Multiple
+ * restart handlers may exist; for example, one restart handler might
+ * restart the entire system, while another only restarts the CPU.
+ * In such cases, the restart handler which only restarts part of the
+ * hardware is expected to register with low priority to ensure that
+ * it only runs if no other means to restart the system is available.
+ *
+ * Currently always returns zero, as atomic_notifier_chain_register()
+ * always returns zero.
+ */
+int register_restart_handler(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&restart_handler_list, nb);
+}
+EXPORT_SYMBOL(register_restart_handler);
+
+/**
+ * unregister_restart_handler - Unregister previously registered
+ * restart handler
+ * @nb: Hook to be unregistered
+ *
+ * Unregisters a previously registered restart handler function.
+ *
+ * Returns zero on success, or %-ENOENT on failure.
+ */
+int unregister_restart_handler(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&restart_handler_list, nb);
+}
+EXPORT_SYMBOL(unregister_restart_handler);
+
+/**
+ * do_kernel_restart - Execute kernel restart handler call chain
+ *
+ * Calls functions registered with register_restart_handler.
+ *
+ * Expected to be called from machine_restart as last step of the restart
+ * sequence.
+ *
+ * Restarts the system immediately if a restart handler function has been
+ * registered. Otherwise does nothing.
+ */
+void do_kernel_restart(char *cmd)
+{
+ atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
+}
+
void migrate_to_reboot_cpu(void)
{
/* The boot cpu is always logical cpu 0 */
diff --git a/kernel/resource.c b/kernel/resource.c
index 60c5a3856ab7..46322019ab7d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1245,6 +1245,76 @@ int release_mem_region_adjustable(struct resource *parent,
/*
* Managed region resource
*/
+static void devm_resource_release(struct device *dev, void *ptr)
+{
+ struct resource **r = ptr;
+
+ release_resource(*r);
+}
+
+/**
+ * devm_request_resource() - request and reserve an I/O or memory resource
+ * @dev: device for which to request the resource
+ * @root: root of the resource tree from which to request the resource
+ * @new: descriptor of the resource to request
+ *
+ * This is a device-managed version of request_resource(). There is usually
+ * no need to release resources requested by this function explicitly since
+ * that will be taken care of when the device is unbound from its driver.
+ * If for some reason the resource needs to be released explicitly, because
+ * of ordering issues for example, drivers must call devm_release_resource()
+ * rather than the regular release_resource().
+ *
+ * When a conflict is detected between any existing resources and the newly
+ * requested resource, an error message will be printed.
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */
+int devm_request_resource(struct device *dev, struct resource *root,
+ struct resource *new)
+{
+ struct resource *conflict, **ptr;
+
+ ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+
+ *ptr = new;
+
+ conflict = request_resource_conflict(root, new);
+ if (conflict) {
+ dev_err(dev, "resource collision: %pR conflicts with %s %pR\n",
+ new, conflict->name, conflict);
+ devres_free(ptr);
+ return -EBUSY;
+ }
+
+ devres_add(dev, ptr);
+ return 0;
+}
+EXPORT_SYMBOL(devm_request_resource);
+
+static int devm_resource_match(struct device *dev, void *res, void *data)
+{
+ struct resource **ptr = res;
+
+ return *ptr == data;
+}
+
+/**
+ * devm_release_resource() - release a previously requested resource
+ * @dev: device for which to release the resource
+ * @new: descriptor of the resource to release
+ *
+ * Releases a resource previously requested using devm_request_resource().
+ */
+void devm_release_resource(struct device *dev, struct resource *new)
+{
+ WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match,
+ new));
+}
+EXPORT_SYMBOL(devm_release_resource);
+
struct region_devres {
struct resource *parent;
resource_size_t start;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86d0d68..82088b29704e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1946,7 +1946,7 @@ void task_numa_work(struct callback_head *work)
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
+ if (!vma_migratable(vma) || !vma_policy_mof(vma))
continue;
/*
diff --git a/kernel/sys.c b/kernel/sys.c
index ce8129192a26..dfce4debd138 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -62,28 +62,28 @@
#include <asm/unistd.h>
#ifndef SET_UNALIGN_CTL
-# define SET_UNALIGN_CTL(a,b) (-EINVAL)
+# define SET_UNALIGN_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_UNALIGN_CTL
-# define GET_UNALIGN_CTL(a,b) (-EINVAL)
+# define GET_UNALIGN_CTL(a, b) (-EINVAL)
#endif
#ifndef SET_FPEMU_CTL
-# define SET_FPEMU_CTL(a,b) (-EINVAL)
+# define SET_FPEMU_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_FPEMU_CTL
-# define GET_FPEMU_CTL(a,b) (-EINVAL)
+# define GET_FPEMU_CTL(a, b) (-EINVAL)
#endif
#ifndef SET_FPEXC_CTL
-# define SET_FPEXC_CTL(a,b) (-EINVAL)
+# define SET_FPEXC_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_FPEXC_CTL
-# define GET_FPEXC_CTL(a,b) (-EINVAL)
+# define GET_FPEXC_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_ENDIAN
-# define GET_ENDIAN(a,b) (-EINVAL)
+# define GET_ENDIAN(a, b) (-EINVAL)
#endif
#ifndef SET_ENDIAN
-# define SET_ENDIAN(a,b) (-EINVAL)
+# define SET_ENDIAN(a, b) (-EINVAL)
#endif
#ifndef GET_TSC_CTL
# define GET_TSC_CTL(a) (-EINVAL)
@@ -182,39 +182,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
- case PRIO_PROCESS:
- if (who)
- p = find_task_by_vpid(who);
- else
- p = current;
- if (p)
- error = set_one_prio(p, niceval, error);
- break;
- case PRIO_PGRP:
- if (who)
- pgrp = find_vpid(who);
- else
- pgrp = task_pgrp(current);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- error = set_one_prio(p, niceval, error);
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case PRIO_USER:
- uid = make_kuid(cred->user_ns, who);
- user = cred->user;
- if (!who)
- uid = cred->uid;
- else if (!uid_eq(uid, cred->uid) &&
- !(user = find_user(uid)))
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p)
+ error = set_one_prio(p, niceval, error);
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ error = set_one_prio(p, niceval, error);
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
+ if (!who)
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid)) {
+ user = find_user(uid);
+ if (!user)
goto out_unlock; /* No processes for this user */
-
- do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid))
- error = set_one_prio(p, niceval, error);
- } while_each_thread(g, p);
- if (!uid_eq(uid, cred->uid))
- free_uid(user); /* For find_user() */
- break;
+ }
+ do_each_thread(g, p) {
+ if (uid_eq(task_uid(p), uid))
+ error = set_one_prio(p, niceval, error);
+ } while_each_thread(g, p);
+ if (!uid_eq(uid, cred->uid))
+ free_uid(user); /* For find_user() */
+ break;
}
out_unlock:
read_unlock(&tasklist_lock);
@@ -244,47 +245,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
- case PRIO_PROCESS:
- if (who)
- p = find_task_by_vpid(who);
- else
- p = current;
- if (p) {
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ }
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
+ if (!who)
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid)) {
+ user = find_user(uid);
+ if (!user)
+ goto out_unlock; /* No processes for this user */
+ }
+ do_each_thread(g, p) {
+ if (uid_eq(task_uid(p), uid)) {
niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
- break;
- case PRIO_PGRP:
- if (who)
- pgrp = find_vpid(who);
- else
- pgrp = task_pgrp(current);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- niceval = nice_to_rlimit(task_nice(p));
- if (niceval > retval)
- retval = niceval;
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case PRIO_USER:
- uid = make_kuid(cred->user_ns, who);
- user = cred->user;
- if (!who)
- uid = cred->uid;
- else if (!uid_eq(uid, cred->uid) &&
- !(user = find_user(uid)))
- goto out_unlock; /* No processes for this user */
-
- do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid)) {
- niceval = nice_to_rlimit(task_nice(p));
- if (niceval > retval)
- retval = niceval;
- }
- } while_each_thread(g, p);
- if (!uid_eq(uid, cred->uid))
- free_uid(user); /* for find_user() */
- break;
+ } while_each_thread(g, p);
+ if (!uid_eq(uid, cred->uid))
+ free_uid(user); /* for find_user() */
+ break;
}
out_unlock:
read_unlock(&tasklist_lock);
@@ -306,7 +308,7 @@ out_unlock:
*
* The general idea is that a program which uses just setregid() will be
* 100% compatible with BSD. A program which uses just setgid() will be
- * 100% compatible with POSIX with saved IDs.
+ * 100% compatible with POSIX with saved IDs.
*
* SMP: There are not races, the GIDs are checked only by filesystem
* operations (as far as semantic preservation is concerned).
@@ -364,7 +366,7 @@ error:
}
/*
- * setgid() is implemented like SysV w/ SAVED_IDS
+ * setgid() is implemented like SysV w/ SAVED_IDS
*
* SMP: Same implicit races as above.
*/
@@ -442,7 +444,7 @@ static int set_user(struct cred *new)
*
* The general idea is that a program which uses just setreuid() will be
* 100% compatible with BSD. A program which uses just setuid() will be
- * 100% compatible with POSIX with saved IDs.
+ * 100% compatible with POSIX with saved IDs.
*/
SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
{
@@ -503,17 +505,17 @@ error:
abort_creds(new);
return retval;
}
-
+
/*
- * setuid() is implemented like SysV with SAVED_IDS
- *
+ * setuid() is implemented like SysV with SAVED_IDS
+ *
* Note that SAVED_ID's is deficient in that a setuid root program
- * like sendmail, for example, cannot set its uid to be a normal
+ * like sendmail, for example, cannot set its uid to be a normal
* user and then switch back, because if you're root, setuid() sets
* the saved uid too. If you don't like this, blame the bright people
* in the POSIX committee and/or USG. Note that the BSD-style setreuid()
* will allow a root program to temporarily drop privileges and be able to
- * regain them by swapping the real and effective uid.
+ * regain them by swapping the real and effective uid.
*/
SYSCALL_DEFINE1(setuid, uid_t, uid)
{
@@ -637,10 +639,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
euid = from_kuid_munged(cred->user_ns, cred->euid);
suid = from_kuid_munged(cred->user_ns, cred->suid);
- if (!(retval = put_user(ruid, ruidp)) &&
- !(retval = put_user(euid, euidp)))
- retval = put_user(suid, suidp);
-
+ retval = put_user(ruid, ruidp);
+ if (!retval) {
+ retval = put_user(euid, euidp);
+ if (!retval)
+ return put_user(suid, suidp);
+ }
return retval;
}
@@ -709,9 +713,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _
egid = from_kgid_munged(cred->user_ns, cred->egid);
sgid = from_kgid_munged(cred->user_ns, cred->sgid);
- if (!(retval = put_user(rgid, rgidp)) &&
- !(retval = put_user(egid, egidp)))
- retval = put_user(sgid, sgidp);
+ retval = put_user(rgid, rgidp);
+ if (!retval) {
+ retval = put_user(egid, egidp);
+ if (!retval)
+ retval = put_user(sgid, sgidp);
+ }
return retval;
}
@@ -1284,7 +1291,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
/*
* Back compatibility for getrlimit. Needed for some apps.
*/
-
SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
struct rlimit __user *, rlim)
{
@@ -1299,7 +1305,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
x.rlim_cur = 0x7FFFFFFF;
if (x.rlim_max > 0x7FFFFFFF)
x.rlim_max = 0x7FFFFFFF;
- return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
+ return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
}
#endif
@@ -1527,7 +1533,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
cputime_t tgutime, tgstime, utime, stime;
unsigned long maxrss = 0;
- memset((char *) r, 0, sizeof *r);
+ memset((char *)r, 0, sizeof (*r));
utime = stime = 0;
if (who == RUSAGE_THREAD) {
@@ -1541,41 +1547,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
return;
switch (who) {
- case RUSAGE_BOTH:
- case RUSAGE_CHILDREN:
- utime = p->signal->cutime;
- stime = p->signal->cstime;
- r->ru_nvcsw = p->signal->cnvcsw;
- r->ru_nivcsw = p->signal->cnivcsw;
- r->ru_minflt = p->signal->cmin_flt;
- r->ru_majflt = p->signal->cmaj_flt;
- r->ru_inblock = p->signal->cinblock;
- r->ru_oublock = p->signal->coublock;
- maxrss = p->signal->cmaxrss;
-
- if (who == RUSAGE_CHILDREN)
- break;
-
- case RUSAGE_SELF:
- thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- utime += tgutime;
- stime += tgstime;
- r->ru_nvcsw += p->signal->nvcsw;
- r->ru_nivcsw += p->signal->nivcsw;
- r->ru_minflt += p->signal->min_flt;
- r->ru_majflt += p->signal->maj_flt;
- r->ru_inblock += p->signal->inblock;
- r->ru_oublock += p->signal->oublock;
- if (maxrss < p->signal->maxrss)
- maxrss = p->signal->maxrss;
- t = p;
- do {
- accumulate_thread_rusage(t, r);
- } while_each_thread(p, t);
+ case RUSAGE_BOTH:
+ case RUSAGE_CHILDREN:
+ utime = p->signal->cutime;
+ stime = p->signal->cstime;
+ r->ru_nvcsw = p->signal->cnvcsw;
+ r->ru_nivcsw = p->signal->cnivcsw;
+ r->ru_minflt = p->signal->cmin_flt;
+ r->ru_majflt = p->signal->cmaj_flt;
+ r->ru_inblock = p->signal->cinblock;
+ r->ru_oublock = p->signal->coublock;
+ maxrss = p->signal->cmaxrss;
+
+ if (who == RUSAGE_CHILDREN)
break;
- default:
- BUG();
+ case RUSAGE_SELF:
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
+ r->ru_nvcsw += p->signal->nvcsw;
+ r->ru_nivcsw += p->signal->nivcsw;
+ r->ru_minflt += p->signal->min_flt;
+ r->ru_majflt += p->signal->maj_flt;
+ r->ru_inblock += p->signal->inblock;
+ r->ru_oublock += p->signal->oublock;
+ if (maxrss < p->signal->maxrss)
+ maxrss = p->signal->maxrss;
+ t = p;
+ do {
+ accumulate_thread_rusage(t, r);
+ } while_each_thread(p, t);
+ break;
+
+ default:
+ BUG();
}
unlock_task_sighand(p, &flags);
@@ -1585,6 +1591,7 @@ out:
if (who != RUSAGE_CHILDREN) {
struct mm_struct *mm = get_task_mm(p);
+
if (mm) {
setmax_mm_hiwater_rss(&maxrss, mm);
mmput(mm);
@@ -1596,6 +1603,7 @@ out:
int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
{
struct rusage r;
+
k_getrusage(p, who, &r);
return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
}
@@ -1628,12 +1636,14 @@ SYSCALL_DEFINE1(umask, int, mask)
return mask;
}
-static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
+static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
struct inode *inode;
int err;
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
+
exe = fdget(fd);
if (!exe.file)
return -EBADF;
@@ -1654,8 +1664,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (err)
goto exit;
- down_write(&mm->mmap_sem);
-
/*
* Forbid mm->exe_file change if old file still mapped.
*/
@@ -1667,7 +1675,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (vma->vm_file &&
path_equal(&vma->vm_file->f_path,
&mm->exe_file->f_path))
- goto exit_unlock;
+ goto exit;
}
/*
@@ -1678,34 +1686,222 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
*/
err = -EPERM;
if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
- goto exit_unlock;
+ goto exit;
err = 0;
set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
-exit_unlock:
- up_write(&mm->mmap_sem);
-
exit:
fdput(exe);
return err;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * WARNING: we don't require any capability here so be very careful
+ * in what is allowed for modification from userspace.
+ */
+static int validate_prctl_map(struct prctl_mm_map *prctl_map)
+{
+ unsigned long mmap_max_addr = TASK_SIZE;
+ struct mm_struct *mm = current->mm;
+ int error = -EINVAL, i;
+
+ static const unsigned char offsets[] = {
+ offsetof(struct prctl_mm_map, start_code),
+ offsetof(struct prctl_mm_map, end_code),
+ offsetof(struct prctl_mm_map, start_data),
+ offsetof(struct prctl_mm_map, end_data),
+ offsetof(struct prctl_mm_map, start_brk),
+ offsetof(struct prctl_mm_map, brk),
+ offsetof(struct prctl_mm_map, start_stack),
+ offsetof(struct prctl_mm_map, arg_start),
+ offsetof(struct prctl_mm_map, arg_end),
+ offsetof(struct prctl_mm_map, env_start),
+ offsetof(struct prctl_mm_map, env_end),
+ };
+
+ /*
+ * Make sure the members are not somewhere outside
+ * of allowed address space.
+ */
+ for (i = 0; i < ARRAY_SIZE(offsets); i++) {
+ u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
+
+ if ((unsigned long)val >= mmap_max_addr ||
+ (unsigned long)val < mmap_min_addr)
+ goto out;
+ }
+
+ /*
+ * Make sure the pairs are ordered.
+ */
+#define __prctl_check_order(__m1, __op, __m2) \
+ ((unsigned long)prctl_map->__m1 __op \
+ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
+ error = __prctl_check_order(start_code, <, end_code);
+ error |= __prctl_check_order(start_data, <, end_data);
+ error |= __prctl_check_order(start_brk, <=, brk);
+ error |= __prctl_check_order(arg_start, <=, arg_end);
+ error |= __prctl_check_order(env_start, <=, env_end);
+ if (error)
+ goto out;
+#undef __prctl_check_order
+
+ error = -EINVAL;
+
+ /*
+ * @brk should be after @end_data in traditional maps.
+ */
+ if (prctl_map->start_brk <= prctl_map->end_data ||
+ prctl_map->brk <= prctl_map->end_data)
+ goto out;
+
+ /*
+ * Neither we should allow to override limits if they set.
+ */
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
+ prctl_map->start_brk, prctl_map->end_data,
+ prctl_map->start_data))
+ goto out;
+
+ /*
+ * Someone is trying to cheat the auxv vector.
+ */
+ if (prctl_map->auxv_size) {
+ if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
+ goto out;
+ }
+
+ /*
+ * Finally, make sure the caller has the rights to
+ * change /proc/pid/exe link: only local root should
+ * be allowed to.
+ */
+ if (prctl_map->exe_fd != (u32)-1) {
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *cred = current_cred();
+
+ if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
+ !gid_eq(cred->gid, make_kgid(ns, 0)))
+ goto out;
+ }
+
+ error = 0;
+out:
+ return error;
+}
+
+static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
+{
+ struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+ struct mm_struct *mm = current->mm;
+ int error;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+ BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
+
+ if (opt == PR_SET_MM_MAP_SIZE)
+ return put_user((unsigned int)sizeof(prctl_map),
+ (unsigned int __user *)addr);
+
+ if (data_size != sizeof(prctl_map))
+ return -EINVAL;
+
+ if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
+ return -EFAULT;
+
+ error = validate_prctl_map(&prctl_map);
+ if (error)
+ return error;
+
+ if (prctl_map.auxv_size) {
+ memset(user_auxv, 0, sizeof(user_auxv));
+ if (copy_from_user(user_auxv,
+ (const void __user *)prctl_map.auxv,
+ prctl_map.auxv_size))
+ return -EFAULT;
+
+ /* Last entry must be AT_NULL as specification requires */
+ user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
+ user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
+ }
+
+ down_write(&mm->mmap_sem);
+ if (prctl_map.exe_fd != (u32)-1)
+ error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
+ downgrade_write(&mm->mmap_sem);
+ if (error)
+ goto out;
+
+ /*
+ * We don't validate if these members are pointing to
+ * real present VMAs because application may have correspond
+ * VMAs already unmapped and kernel uses these members for statistics
+ * output in procfs mostly, except
+ *
+ * - @start_brk/@brk which are used in do_brk but kernel lookups
+ * for VMAs when updating these memvers so anything wrong written
+ * here cause kernel to swear at userspace program but won't lead
+ * to any problem in kernel itself
+ */
+
+ mm->start_code = prctl_map.start_code;
+ mm->end_code = prctl_map.end_code;
+ mm->start_data = prctl_map.start_data;
+ mm->end_data = prctl_map.end_data;
+ mm->start_brk = prctl_map.start_brk;
+ mm->brk = prctl_map.brk;
+ mm->start_stack = prctl_map.start_stack;
+ mm->arg_start = prctl_map.arg_start;
+ mm->arg_end = prctl_map.arg_end;
+ mm->env_start = prctl_map.env_start;
+ mm->env_end = prctl_map.env_end;
+
+ /*
+ * Note this update of @saved_auxv is lockless thus
+ * if someone reads this member in procfs while we're
+ * updating -- it may get partly updated results. It's
+ * known and acceptable trade off: we leave it as is to
+ * not introduce additional locks here making the kernel
+ * more complex.
+ */
+ if (prctl_map.auxv_size)
+ memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
+
+ error = 0;
+out:
+ up_read(&mm->mmap_sem);
+ return error;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
- unsigned long rlim = rlimit(RLIMIT_DATA);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int error;
- if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
+ if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
+ opt != PR_SET_MM_MAP &&
+ opt != PR_SET_MM_MAP_SIZE)))
return -EINVAL;
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
+ return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
+#endif
+
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (opt == PR_SET_MM_EXE_FILE)
- return prctl_set_mm_exe_file(mm, (unsigned int)addr);
+ if (opt == PR_SET_MM_EXE_FILE) {
+ down_write(&mm->mmap_sem);
+ error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
+ up_write(&mm->mmap_sem);
+ return error;
+ }
if (addr >= TASK_SIZE || addr < mmap_min_addr)
return -EINVAL;
@@ -1733,9 +1929,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (addr <= mm->end_data)
goto out;
- if (rlim < RLIM_INFINITY &&
- (mm->brk - addr) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
+ mm->end_data, mm->start_data))
goto out;
mm->start_brk = addr;
@@ -1745,9 +1940,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (addr <= mm->end_data)
goto out;
- if (rlim < RLIM_INFINITY &&
- (addr - mm->start_brk) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
+ mm->end_data, mm->start_data))
goto out;
mm->brk = addr;
@@ -2023,6 +2217,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
{
int err = 0;
int cpu = raw_smp_processor_id();
+
if (cpup)
err |= put_user(cpu, cpup);
if (nodep)
@@ -2135,7 +2330,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
/* Check to see if any memory value is too large for 32-bit and scale
* down if needed
*/
- if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+ if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
int bitcount = 0;
while (s.mem_unit < PAGE_SIZE) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 75875a741b5e..91180987e40e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1460,13 +1460,6 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
- {
- .procname = "scan_unevictable_pages",
- .data = &scan_unevictable_pages,
- .maxlen = sizeof(scan_unevictable_pages),
- .mode = 0644,
- .proc_handler = scan_unevictable_handler,
- },
#ifdef CONFIG_MEMORY_FAILURE
{
.procname = "memory_failure_early_kill",
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 0a0608edeb26..052b4b53c3d6 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -400,4 +400,5 @@ void tick_resume(void)
void __init tick_init(void)
{
tick_broadcast_init();
+ tick_nohz_init();
}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index c19c1d84b6f3..366aeb4f2c66 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -99,6 +99,13 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; }
static inline bool tick_broadcast_oneshot_available(void) { return false; }
#endif /* !TICK_ONESHOT */
+/* NO_HZ_FULL internal */
+#ifdef CONFIG_NO_HZ_FULL
+extern void tick_nohz_init(void);
+# else
+static inline void tick_nohz_init(void) { }
+#endif
+
/*
* Broadcasting support
*/
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f654a8a298fa..7c1412ea2d29 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -295,22 +295,12 @@ out:
/* Parse the boot-time nohz CPU list from the kernel parameters. */
static int __init tick_nohz_full_setup(char *str)
{
- int cpu;
-
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
- alloc_bootmem_cpumask_var(&housekeeping_mask);
if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
+ free_bootmem_cpumask_var(tick_nohz_full_mask);
return 1;
}
-
- cpu = smp_processor_id();
- if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
- pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
- cpumask_clear_cpu(cpu, tick_nohz_full_mask);
- }
- cpumask_andnot(housekeeping_mask,
- cpu_possible_mask, tick_nohz_full_mask);
tick_nohz_full_running = true;
return 1;
@@ -349,18 +339,11 @@ static int tick_nohz_init_all(void)
#ifdef CONFIG_NO_HZ_FULL_ALL
if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
- pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
- return err;
- }
- if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
- pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n");
+ WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
return err;
}
err = 0;
cpumask_setall(tick_nohz_full_mask);
- cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
- cpumask_clear(housekeeping_mask);
- cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
tick_nohz_full_running = true;
#endif
return err;
@@ -375,6 +358,37 @@ void __init tick_nohz_init(void)
return;
}
+ if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
+ WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
+ cpumask_clear(tick_nohz_full_mask);
+ tick_nohz_full_running = false;
+ return;
+ }
+
+ /*
+ * Full dynticks uses irq work to drive the tick rescheduling on safe
+ * locking contexts. But then we need irq work to raise its own
+ * interrupts to avoid circular dependency on the tick
+ */
+ if (!arch_irq_work_has_interrupt()) {
+ pr_warning("NO_HZ: Can't run full dynticks because arch doesn't "
+ "support irq work self-IPIs\n");
+ cpumask_clear(tick_nohz_full_mask);
+ cpumask_copy(housekeeping_mask, cpu_possible_mask);
+ tick_nohz_full_running = false;
+ return;
+ }
+
+ cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
+ pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
+ cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+ }
+
+ cpumask_andnot(housekeeping_mask,
+ cpu_possible_mask, tick_nohz_full_mask);
+
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
@@ -982,6 +996,10 @@ static void tick_nohz_handler(struct clock_event_device *dev)
tick_sched_do_timer(now);
tick_sched_handle(ts, regs);
+ /* No need to reprogram if we are running tickless */
+ if (unlikely(ts->tick_stopped))
+ return;
+
while (tick_nohz_reprogram(ts, now)) {
now = ktime_get();
tick_do_update_jiffies64(now);
@@ -1109,6 +1127,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
if (regs)
tick_sched_handle(ts, regs);
+ /* No need to reprogram if we are in idle or full dynticks mode */
+ if (unlikely(ts->tick_stopped))
+ return HRTIMER_NORESTART;
+
hrtimer_forward(timer, now, tick_period);
return HRTIMER_RESTART;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index aca5dfe2fa3d..9bbb8344ed3b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1385,7 +1385,7 @@ void update_process_times(int user_tick)
rcu_check_callbacks(cpu, user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
- irq_work_run();
+ irq_work_tick();
#endif
scheduler_tick();
run_posix_cpu_timers(p);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a8d6914030fe..7b223b212683 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -47,6 +47,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
+static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
@@ -333,8 +334,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
/* only warn once */
- if (__this_cpu_read(soft_watchdog_warn) == true)
+ if (__this_cpu_read(soft_watchdog_warn) == true) {
+ /*
+ * When multiple processes are causing softlockups the
+ * softlockup detector only warns on the first one
+ * because the code relies on a full quiet cycle to
+ * re-arm. The second process prevents the quiet cycle
+ * and never gets reported. Use task pointers to detect
+ * this.
+ */
+ if (__this_cpu_read(softlockup_task_ptr_saved) !=
+ current) {
+ __this_cpu_write(soft_watchdog_warn, false);
+ __touch_watchdog();
+ }
return HRTIMER_RESTART;
+ }
if (softlockup_all_cpu_backtrace) {
/* Prevent multiple soft-lockup reports if one cpu is already
@@ -350,6 +365,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
+ __this_cpu_write(softlockup_task_ptr_saved, current);
print_modules();
print_irqtrace_events(current);
if (regs)