diff options
Diffstat (limited to 'kernel')
87 files changed, 4111 insertions, 2649 deletions
| diff --git a/kernel/Makefile b/kernel/Makefile index 2d9de86b7e76..cb41b9547c9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,7 +27,6 @@ obj-y += power/  obj-$(CONFIG_FREEZER) += freezer.o  obj-$(CONFIG_PROFILING) += profile.o -obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o  obj-$(CONFIG_STACKTRACE) += stacktrace.o  obj-y += time/  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o diff --git a/kernel/audit.c b/kernel/audit.c index bb0eb5bb9a0a..1c7f2c61416b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1418,7 +1418,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)  /* This is a helper-function to print the escaped d_path */  void audit_log_d_path(struct audit_buffer *ab, const char *prefix, -		      struct path *path) +		      const struct path *path)  {  	char *p, *pathname; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5d3b5325f77..f4ea4b6f3cf1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -818,7 +818,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)  	for_each_subsys(cgrp->root, ss)  		if (ss->pre_destroy) { -			ret = ss->pre_destroy(ss, cgrp); +			ret = ss->pre_destroy(cgrp);  			if (ret)  				break;  		} @@ -846,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)  		 * Release the subsystem state objects.  		 
*/  		for_each_subsys(cgrp->root, ss) -			ss->destroy(ss, cgrp); +			ss->destroy(cgrp);  		cgrp->root->number_of_cgroups--;  		mutex_unlock(&cgroup_mutex); @@ -1015,7 +1015,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,  			list_move(&ss->sibling, &root->subsys_list);  			ss->root = root;  			if (ss->bind) -				ss->bind(ss, cgrp); +				ss->bind(cgrp);  			mutex_unlock(&ss->hierarchy_mutex);  			/* refcount was already taken, and we're keeping it */  		} else if (bit & removed_bits) { @@ -1025,7 +1025,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,  			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);  			mutex_lock(&ss->hierarchy_mutex);  			if (ss->bind) -				ss->bind(ss, dummytop); +				ss->bind(dummytop);  			dummytop->subsys[i]->cgroup = dummytop;  			cgrp->subsys[i] = NULL;  			subsys[i]->root = &rootnode; @@ -1472,7 +1472,6 @@ static int cgroup_get_rootdir(struct super_block *sb)  	struct inode *inode =  		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); -	struct dentry *dentry;  	if (!inode)  		return -ENOMEM; @@ -1481,12 +1480,9 @@ static int cgroup_get_rootdir(struct super_block *sb)  	inode->i_op = &cgroup_dir_inode_operations;  	/* directories start off with i_nlink == 2 (for "." entry) */  	inc_nlink(inode); -	dentry = d_alloc_root(inode); -	if (!dentry) { -		iput(inode); +	sb->s_root = d_make_root(inode); +	if (!sb->s_root)  		return -ENOMEM; -	} -	sb->s_root = dentry;  	/* for everything else we want ->d_op set */  	sb->s_d_op = &cgroup_dops;  	return 0; @@ -1763,6 +1759,7 @@ EXPORT_SYMBOL_GPL(cgroup_path);  struct task_and_cgroup {  	struct task_struct	*task;  	struct cgroup		*cgrp; +	struct css_set		*cg;  };  struct cgroup_taskset { @@ -1843,11 +1840,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);   * will already exist. If not set, this function might sleep, and can fail with   * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.   
*/ -static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, -			       struct task_struct *tsk, bool guarantee) +static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, +				struct task_struct *tsk, struct css_set *newcg)  {  	struct css_set *oldcg; -	struct css_set *newcg;  	/*  	 * We are synchronized through threadgroup_lock() against PF_EXITING @@ -1857,23 +1853,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,  	WARN_ON_ONCE(tsk->flags & PF_EXITING);  	oldcg = tsk->cgroups; -	/* locate or allocate a new css_set for this task. */ -	if (guarantee) { -		/* we know the css_set we want already exists. */ -		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; -		read_lock(&css_set_lock); -		newcg = find_existing_css_set(oldcg, cgrp, template); -		BUG_ON(!newcg); -		get_css_set(newcg); -		read_unlock(&css_set_lock); -	} else { -		might_sleep(); -		/* find_css_set will give us newcg already referenced. */ -		newcg = find_css_set(oldcg, cgrp); -		if (!newcg) -			return -ENOMEM; -	} -  	task_lock(tsk);  	rcu_assign_pointer(tsk->cgroups, newcg);  	task_unlock(tsk); @@ -1892,7 +1871,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,  	put_css_set(oldcg);  	set_bit(CGRP_RELEASABLE, &oldcgrp->flags); -	return 0;  }  /** @@ -1910,6 +1888,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  	struct cgroup *oldcgrp;  	struct cgroupfs_root *root = cgrp->root;  	struct cgroup_taskset tset = { }; +	struct css_set *newcg;  	/* @tsk either already exited or can't exit until the end */  	if (tsk->flags & PF_EXITING) @@ -1925,7 +1904,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  	for_each_subsys(root, ss) {  		if (ss->can_attach) { -			retval = ss->can_attach(ss, cgrp, &tset); +			retval = ss->can_attach(cgrp, &tset);  			if (retval) {  				/*  				 * Remember on which subsystem the can_attach() @@ -1939,13 +1918,17 @@ int 
cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)  		}  	} -	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); -	if (retval) +	newcg = find_css_set(tsk->cgroups, cgrp); +	if (!newcg) { +		retval = -ENOMEM;  		goto out; +	} + +	cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);  	for_each_subsys(root, ss) {  		if (ss->attach) -			ss->attach(ss, cgrp, &tset); +			ss->attach(cgrp, &tset);  	}  	synchronize_rcu(); @@ -1967,7 +1950,7 @@ out:  				 */  				break;  			if (ss->cancel_attach) -				ss->cancel_attach(ss, cgrp, &tset); +				ss->cancel_attach(cgrp, &tset);  		}  	}  	return retval; @@ -1997,66 +1980,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)  }  EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -/* - * cgroup_attach_proc works in two stages, the first of which prefetches all - * new css_sets needed (to make sure we have enough memory before committing - * to the move) and stores them in a list of entries of the following type. - * TODO: possible optimization: use css_set->rcu_head for chaining instead - */ -struct cg_list_entry { -	struct css_set *cg; -	struct list_head links; -}; - -static bool css_set_check_fetched(struct cgroup *cgrp, -				  struct task_struct *tsk, struct css_set *cg, -				  struct list_head *newcg_list) -{ -	struct css_set *newcg; -	struct cg_list_entry *cg_entry; -	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - -	read_lock(&css_set_lock); -	newcg = find_existing_css_set(cg, cgrp, template); -	read_unlock(&css_set_lock); - -	/* doesn't exist at all? */ -	if (!newcg) -		return false; -	/* see if it's already in the list */ -	list_for_each_entry(cg_entry, newcg_list, links) -		if (cg_entry->cg == newcg) -			return true; - -	/* not found */ -	return false; -} - -/* - * Find the new css_set and store it in the list in preparation for moving the - * given task to the given cgroup. Returns 0 or -ENOMEM. 
- */ -static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, -			    struct list_head *newcg_list) -{ -	struct css_set *newcg; -	struct cg_list_entry *cg_entry; - -	/* ensure a new css_set will exist for this thread */ -	newcg = find_css_set(cg, cgrp); -	if (!newcg) -		return -ENOMEM; -	/* add it to the list */ -	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); -	if (!cg_entry) { -		put_css_set(newcg); -		return -ENOMEM; -	} -	cg_entry->cg = newcg; -	list_add(&cg_entry->links, newcg_list); -	return 0; -} -  /**   * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup   * @cgrp: the cgroup to attach to @@ -2070,20 +1993,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  	int retval, i, group_size;  	struct cgroup_subsys *ss, *failed_ss = NULL;  	/* guaranteed to be initialized later, but the compiler needs this */ -	struct css_set *oldcg;  	struct cgroupfs_root *root = cgrp->root;  	/* threadgroup list cursor and array */  	struct task_struct *tsk;  	struct task_and_cgroup *tc;  	struct flex_array *group;  	struct cgroup_taskset tset = { }; -	/* -	 * we need to make sure we have css_sets for all the tasks we're -	 * going to move -before- we actually start moving them, so that in -	 * case we get an ENOMEM we can bail out before making any changes. -	 */ -	struct list_head newcg_list; -	struct cg_list_entry *cg_entry, *temp_nobe;  	/*  	 * step 0: in order to do expensive, possibly blocking operations for @@ -2102,23 +2017,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  	if (retval)  		goto out_free_group_list; -	/* prevent changes to the threadgroup list while we take a snapshot. */ -	read_lock(&tasklist_lock); -	if (!thread_group_leader(leader)) { -		/* -		 * a race with de_thread from another thread's exec() may strip -		 * us of our leadership, making while_each_thread unsafe to use -		 * on this task. 
if this happens, there is no choice but to -		 * throw this task away and try again (from cgroup_procs_write); -		 * this is "double-double-toil-and-trouble-check locking". -		 */ -		read_unlock(&tasklist_lock); -		retval = -EAGAIN; -		goto out_free_group_list; -	} -  	tsk = leader;  	i = 0; +	/* +	 * Prevent freeing of tasks while we take a snapshot. Tasks that are +	 * already PF_EXITING could be freed from underneath us unless we +	 * take an rcu_read_lock. +	 */ +	rcu_read_lock();  	do {  		struct task_and_cgroup ent; @@ -2128,24 +2034,24 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  		/* as per above, nr_threads may decrease, but not increase. */  		BUG_ON(i >= group_size); -		/* -		 * saying GFP_ATOMIC has no effect here because we did prealloc -		 * earlier, but it's good form to communicate our expectations. -		 */  		ent.task = tsk;  		ent.cgrp = task_cgroup_from_root(tsk, root);  		/* nothing to do if this task is already in the cgroup */  		if (ent.cgrp == cgrp)  			continue; +		/* +		 * saying GFP_ATOMIC has no effect here because we did prealloc +		 * earlier, but it's good form to communicate our expectations. +		 */  		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);  		BUG_ON(retval != 0);  		i++;  	} while_each_thread(leader, tsk); +	rcu_read_unlock();  	/* remember the number of threads in the array for later. 
*/  	group_size = i;  	tset.tc_array = group;  	tset.tc_array_len = group_size; -	read_unlock(&tasklist_lock);  	/* methods shouldn't be called if no task is actually migrating */  	retval = 0; @@ -2157,7 +2063,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  	 */  	for_each_subsys(root, ss) {  		if (ss->can_attach) { -			retval = ss->can_attach(ss, cgrp, &tset); +			retval = ss->can_attach(cgrp, &tset);  			if (retval) {  				failed_ss = ss;  				goto out_cancel_attach; @@ -2169,17 +2075,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  	 * step 2: make sure css_sets exist for all threads to be migrated.  	 * we use find_css_set, which allocates a new one if necessary.  	 */ -	INIT_LIST_HEAD(&newcg_list);  	for (i = 0; i < group_size; i++) {  		tc = flex_array_get(group, i); -		oldcg = tc->task->cgroups; - -		/* if we don't already have it in the list get a new one */ -		if (!css_set_check_fetched(cgrp, tc->task, oldcg, -					   &newcg_list)) { -			retval = css_set_prefetch(cgrp, oldcg, &newcg_list); -			if (retval) -				goto out_list_teardown; +		tc->cg = find_css_set(tc->task->cgroups, cgrp); +		if (!tc->cg) { +			retval = -ENOMEM; +			goto out_put_css_set_refs;  		}  	} @@ -2190,8 +2091,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  	 */  	for (i = 0; i < group_size; i++) {  		tc = flex_array_get(group, i); -		retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); -		BUG_ON(retval); +		cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);  	}  	/* nothing is sensitive to fork() after this point. 
*/ @@ -2200,7 +2100,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  	 */  	for_each_subsys(root, ss) {  		if (ss->attach) -			ss->attach(ss, cgrp, &tset); +			ss->attach(cgrp, &tset);  	}  	/* @@ -2209,21 +2109,22 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)  	synchronize_rcu();  	cgroup_wakeup_rmdir_waiter(cgrp);  	retval = 0; -out_list_teardown: -	/* clean up the list of prefetched css_sets. */ -	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { -		list_del(&cg_entry->links); -		put_css_set(cg_entry->cg); -		kfree(cg_entry); +out_put_css_set_refs: +	if (retval) { +		for (i = 0; i < group_size; i++) { +			tc = flex_array_get(group, i); +			if (!tc->cg) +				break; +			put_css_set(tc->cg); +		}  	}  out_cancel_attach: -	/* same deal as in cgroup_attach_task */  	if (retval) {  		for_each_subsys(root, ss) {  			if (ss == failed_ss)  				break;  			if (ss->cancel_attach) -				ss->cancel_attach(ss, cgrp, &tset); +				ss->cancel_attach(cgrp, &tset);  		}  	}  out_free_group_list: @@ -2245,22 +2146,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)  	if (!cgroup_lock_live_group(cgrp))  		return -ENODEV; +retry_find_task: +	rcu_read_lock();  	if (pid) { -		rcu_read_lock();  		tsk = find_task_by_vpid(pid);  		if (!tsk) {  			rcu_read_unlock(); -			cgroup_unlock(); -			return -ESRCH; -		} -		if (threadgroup) { -			/* -			 * RCU protects this access, since tsk was found in the -			 * tid map. a race with de_thread may cause group_leader -			 * to stop being the leader, but cgroup_attach_proc will -			 * detect it later. 
-			 */ -			tsk = tsk->group_leader; +			ret= -ESRCH; +			goto out_unlock_cgroup;  		}  		/*  		 * even if we're attaching all tasks in the thread group, we @@ -2271,29 +2164,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)  		    cred->euid != tcred->uid &&  		    cred->euid != tcred->suid) {  			rcu_read_unlock(); -			cgroup_unlock(); -			return -EACCES; +			ret = -EACCES; +			goto out_unlock_cgroup;  		} -		get_task_struct(tsk); -		rcu_read_unlock(); -	} else { -		if (threadgroup) -			tsk = current->group_leader; -		else -			tsk = current; -		get_task_struct(tsk); -	} - -	threadgroup_lock(tsk); +	} else +		tsk = current;  	if (threadgroup) +		tsk = tsk->group_leader; +	get_task_struct(tsk); +	rcu_read_unlock(); + +	threadgroup_lock(tsk); +	if (threadgroup) { +		if (!thread_group_leader(tsk)) { +			/* +			 * a race with de_thread from another thread's exec() +			 * may strip us of our leadership, if this happens, +			 * there is no choice but to throw this task away and +			 * try again; this is +			 * "double-double-toil-and-trouble-check locking". +			 */ +			threadgroup_unlock(tsk); +			put_task_struct(tsk); +			goto retry_find_task; +		}  		ret = cgroup_attach_proc(cgrp, tsk); -	else +	} else  		ret = cgroup_attach_task(cgrp, tsk); -  	threadgroup_unlock(tsk);  	put_task_struct(tsk); +out_unlock_cgroup:  	cgroup_unlock();  	return ret;  } @@ -2305,16 +2207,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)  static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)  { -	int ret; -	do { -		/* -		 * attach_proc fails with -EAGAIN if threadgroup leadership -		 * changes in the middle of the operation, in which case we need -		 * to find the task_struct for the new leader and start over. 
-		 */ -		ret = attach_task_by_pid(cgrp, tgid, true); -	} while (ret == -EAGAIN); -	return ret; +	return attach_task_by_pid(cgrp, tgid, true);  }  /** @@ -2804,15 +2697,20 @@ static void cgroup_advance_iter(struct cgroup *cgrp,   * using their cgroups capability, we don't maintain the lists running   * through each css_set to its tasks until we see the list actually   * used - in other words after the first call to cgroup_iter_start(). - * - * The tasklist_lock is not held here, as do_each_thread() and - * while_each_thread() are protected by RCU.   */  static void cgroup_enable_task_cg_lists(void)  {  	struct task_struct *p, *g;  	write_lock(&css_set_lock);  	use_task_css_set_links = 1; +	/* +	 * We need tasklist_lock because RCU is not safe against +	 * while_each_thread(). Besides, a forking task that has passed +	 * cgroup_post_fork() without seeing use_task_css_set_links = 1 +	 * is not guaranteed to have its child immediately visible in the +	 * tasklist if we walk through it with RCU. +	 */ +	read_lock(&tasklist_lock);  	do_each_thread(g, p) {  		task_lock(p);  		/* @@ -2824,6 +2722,7 @@ static void cgroup_enable_task_cg_lists(void)  			list_add(&p->cg_list, &p->cgroups->tasks);  		task_unlock(p);  	} while_each_thread(g, p); +	read_unlock(&tasklist_lock);  	write_unlock(&css_set_lock);  } @@ -3043,6 +2942,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)   *   */ +/* which pidlist file are we talking about? */ +enum cgroup_filetype { +	CGROUP_FILE_PROCS, +	CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { +	/* +	 * used to find which pidlist is wanted. doesn't change as long as +	 * this particular list stays in the list. 
+	*/ +	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; +	/* array of xids */ +	pid_t *list; +	/* how many elements the above list has */ +	int length; +	/* how many files are using the current array */ +	int use_count; +	/* each of these stored in a list by its cgroup */ +	struct list_head links; +	/* pointer to the cgroup we belong to, for list removal purposes */ +	struct cgroup *owner; +	/* protects the other fields */ +	struct rw_semaphore mutex; +}; +  /*   * The following two functions "fix" the issue where there are more pids   * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. @@ -3827,7 +3758,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);  	for_each_subsys(root, ss) { -		struct cgroup_subsys_state *css = ss->create(ss, cgrp); +		struct cgroup_subsys_state *css = ss->create(cgrp);  		if (IS_ERR(css)) {  			err = PTR_ERR(css); @@ -3841,7 +3772,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  		}  		/* At error, ->destroy() callback has to free assigned ID. 
*/  		if (clone_children(parent) && ss->post_clone) -			ss->post_clone(ss, cgrp); +			ss->post_clone(cgrp);  	}  	cgroup_lock_hierarchy(root); @@ -3875,7 +3806,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	for_each_subsys(root, ss) {  		if (cgrp->subsys[ss->subsys_id]) -			ss->destroy(ss, cgrp); +			ss->destroy(cgrp);  	}  	mutex_unlock(&cgroup_mutex); @@ -4099,7 +4030,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)  	/* Create the top cgroup state for this subsystem */  	list_add(&ss->sibling, &rootnode.subsys_list);  	ss->root = &rootnode; -	css = ss->create(ss, dummytop); +	css = ss->create(dummytop);  	/* We don't handle early failures gracefully */  	BUG_ON(IS_ERR(css));  	init_cgroup_css(css, ss, dummytop); @@ -4188,7 +4119,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  	 * no ss->create seems to need anything important in the ss struct, so  	 * this can happen first (i.e. before the rootnode attachment).  	 */ -	css = ss->create(ss, dummytop); +	css = ss->create(dummytop);  	if (IS_ERR(css)) {  		/* failure case - need to deassign the subsys[] slot. */  		subsys[i] = NULL; @@ -4206,7 +4137,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  		int ret = cgroup_init_idr(ss, css);  		if (ret) {  			dummytop->subsys[ss->subsys_id] = NULL; -			ss->destroy(ss, dummytop); +			ss->destroy(dummytop);  			subsys[i] = NULL;  			mutex_unlock(&cgroup_mutex);  			return ret; @@ -4304,7 +4235,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)  	 * pointer to find their state. note that this also takes care of  	 * freeing the css_id.  	 
*/ -	ss->destroy(ss, dummytop); +	ss->destroy(dummytop);  	dummytop->subsys[ss->subsys_id] = NULL;  	mutex_unlock(&cgroup_mutex); @@ -4580,7 +4511,7 @@ void cgroup_fork_callbacks(struct task_struct *child)  		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {  			struct cgroup_subsys *ss = subsys[i];  			if (ss->fork) -				ss->fork(ss, child); +				ss->fork(child);  		}  	}  } @@ -4596,6 +4527,17 @@ void cgroup_fork_callbacks(struct task_struct *child)   */  void cgroup_post_fork(struct task_struct *child)  { +	/* +	 * use_task_css_set_links is set to 1 before we walk the tasklist +	 * under the tasklist_lock and we read it here after we added the child +	 * to the tasklist under the tasklist_lock as well. If the child wasn't +	 * yet in the tasklist when we walked through it from +	 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value +	 * should be visible now due to the paired locking and barriers implied +	 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock +	 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock +	 * lock on fork. 
+	 */  	if (use_task_css_set_links) {  		write_lock(&css_set_lock);  		if (list_empty(&child->cg_list)) { @@ -4682,7 +4624,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)  				struct cgroup *old_cgrp =  					rcu_dereference_raw(cg->subsys[i])->cgroup;  				struct cgroup *cgrp = task_cgroup(tsk, i); -				ss->exit(ss, cgrp, old_cgrp, tsk); +				ss->exit(cgrp, old_cgrp, tsk);  			}  		}  	} @@ -4939,9 +4881,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)  	rcu_assign_pointer(id->css, NULL);  	rcu_assign_pointer(css->id, NULL); -	write_lock(&ss->id_lock); +	spin_lock(&ss->id_lock);  	idr_remove(&ss->idr, id->id); -	write_unlock(&ss->id_lock); +	spin_unlock(&ss->id_lock);  	kfree_rcu(id, rcu_head);  }  EXPORT_SYMBOL_GPL(free_css_id); @@ -4967,10 +4909,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)  		error = -ENOMEM;  		goto err_out;  	} -	write_lock(&ss->id_lock); +	spin_lock(&ss->id_lock);  	/* Don't use 0. allocates an ID of 1-65535 */  	error = idr_get_new_above(&ss->idr, newid, 1, &myid); -	write_unlock(&ss->id_lock); +	spin_unlock(&ss->id_lock);  	/* Returns error when there are no free spaces for new ID.*/  	if (error) { @@ -4985,9 +4927,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)  	return newid;  remove_idr:  	error = -ENOSPC; -	write_lock(&ss->id_lock); +	spin_lock(&ss->id_lock);  	idr_remove(&ss->idr, myid); -	write_unlock(&ss->id_lock); +	spin_unlock(&ss->id_lock);  err_out:  	kfree(newid);  	return ERR_PTR(error); @@ -4999,7 +4941,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,  {  	struct css_id *newid; -	rwlock_init(&ss->id_lock); +	spin_lock_init(&ss->id_lock);  	idr_init(&ss->idr);  	newid = get_new_cssid(ss, 0); @@ -5087,6 +5029,8 @@ css_get_next(struct cgroup_subsys *ss, int id,  		return NULL;  	BUG_ON(!ss->use_id); +	WARN_ON_ONCE(!rcu_read_lock_held()); +  	/* fill start point for scan */  	tmpid = id;  	
while (1) { @@ -5094,10 +5038,7 @@ css_get_next(struct cgroup_subsys *ss, int id,  		 * scan next entry from bitmap(tree), tmpid is updated after  		 * idr_get_next().  		 */ -		read_lock(&ss->id_lock);  		tmp = idr_get_next(&ss->idr, &tmpid); -		read_unlock(&ss->id_lock); -  		if (!tmp)  			break;  		if (tmp->depth >= depth && tmp->stack[depth] == rootid) { @@ -5137,8 +5078,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)  }  #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, -						   struct cgroup *cont) +static struct cgroup_subsys_state *debug_create(struct cgroup *cont)  {  	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5148,7 +5088,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,  	return css;  } -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void debug_destroy(struct cgroup *cont)  {  	kfree(cont->subsys[debug_subsys_id]);  } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fc0646b78a64..f86e93920b62 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys;   *    task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())   *     sighand->siglock   */ -static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, -						  struct cgroup *cgroup) +static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)  {  	struct freezer *freezer; @@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,  	return &freezer->css;  } -static void freezer_destroy(struct cgroup_subsys *ss, -			    struct cgroup *cgroup) +static void freezer_destroy(struct cgroup *cgroup)  {  	struct freezer *freezer = cgroup_freezer(cgroup); @@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task)   * a write to that file racing against 
an attach, and hence the   * can_attach() result will remain valid until the attach completes.   */ -static int freezer_can_attach(struct cgroup_subsys *ss, -			      struct cgroup *new_cgroup, +static int freezer_can_attach(struct cgroup *new_cgroup,  			      struct cgroup_taskset *tset)  {  	struct freezer *freezer; @@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss,  	return 0;  } -static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +static void freezer_fork(struct task_struct *task)  {  	struct freezer *freezer; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a09ac2b9a661..1010cc61931f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,  {  	bool need_loop; -repeat:  	/*  	 * Allow tasks that have access to memory reserves because they have  	 * been OOM killed to get memory anywhere. @@ -983,45 +982,19 @@ repeat:  	 */  	need_loop = task_has_mempolicy(tsk) ||  			!nodes_intersects(*newmems, tsk->mems_allowed); -	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); -	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); -	/* -	 * ensure checking ->mems_allowed_change_disable after setting all new -	 * allowed nodes. -	 * -	 * the read-side task can see an nodemask with new allowed nodes and -	 * old allowed nodes. and if it allocates page when cpuset clears newly -	 * disallowed ones continuous, it can see the new allowed bits. -	 * -	 * And if setting all new allowed nodes is after the checking, setting -	 * all new allowed nodes and clearing newly disallowed ones will be done -	 * continuous, and the read-side task may find no node to alloc page. -	 */ -	smp_mb(); +	if (need_loop) +		write_seqcount_begin(&tsk->mems_allowed_seq); -	/* -	 * Allocation of memory is very fast, we needn't sleep when waiting -	 * for the read-side. 
-	 */ -	while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { -		task_unlock(tsk); -		if (!task_curr(tsk)) -			yield(); -		goto repeat; -	} - -	/* -	 * ensure checking ->mems_allowed_change_disable before clearing all new -	 * disallowed nodes. -	 * -	 * if clearing newly disallowed bits before the checking, the read-side -	 * task may find no node to alloc page. -	 */ -	smp_mb(); +	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); +	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);  	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);  	tsk->mems_allowed = *newmems; + +	if (need_loop) +		write_seqcount_end(&tsk->mems_allowed_seq); +  	task_unlock(tsk);  } @@ -1399,8 +1372,7 @@ static nodemask_t cpuset_attach_nodemask_from;  static nodemask_t cpuset_attach_nodemask_to;  /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, -			     struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  {  	struct cpuset *cs = cgroup_cs(cgrp);  	struct task_struct *task; @@ -1436,8 +1408,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,  	return 0;  } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, -			  struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  {  	struct mm_struct *mm;  	struct task_struct *task; @@ -1833,8 +1804,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)   * (and likewise for mems) to the new cgroup. Called with cgroup_mutex   * held.   
*/ -static void cpuset_post_clone(struct cgroup_subsys *ss, -			      struct cgroup *cgroup) +static void cpuset_post_clone(struct cgroup *cgroup)  {  	struct cgroup *parent, *child;  	struct cpuset *cs, *parent_cs; @@ -1857,13 +1827,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,  /*   *	cpuset_create - create a cpuset - *	ss:	cpuset cgroup subsystem   *	cont:	control group that the new cpuset will be part of   */ -static struct cgroup_subsys_state *cpuset_create( -	struct cgroup_subsys *ss, -	struct cgroup *cont) +static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)  {  	struct cpuset *cs;  	struct cpuset *parent; @@ -1902,7 +1869,7 @@ static struct cgroup_subsys_state *cpuset_create(   * will call async_rebuild_sched_domains().   */ -static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void cpuset_destroy(struct cgroup *cont)  {  	struct cpuset *cs = cgroup_cs(cont); diff --git a/kernel/cred.c b/kernel/cred.c index 5791612a4045..97b36eeca4c9 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -16,6 +16,7 @@  #include <linux/keyctl.h>  #include <linux/init_task.h>  #include <linux/security.h> +#include <linux/binfmts.h>  #include <linux/cn_proc.h>  #if 0 diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0d7c08784efb..3f88a45e6f0a 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -41,6 +41,7 @@  #include <linux/delay.h>  #include <linux/sched.h>  #include <linux/sysrq.h> +#include <linux/reboot.h>  #include <linux/init.h>  #include <linux/kgdb.h>  #include <linux/kdb.h> @@ -75,6 +76,8 @@ static int			exception_level;  struct kgdb_io		*dbg_io_ops;  static DEFINE_SPINLOCK(kgdb_registration_lock); +/* Action for the reboot notifiter, a global allow kdb to change it */ +static int kgdbreboot;  /* kgdb console driver is loaded */  static int kgdb_con_registered;  /* determine if kgdb console output should be used */ @@ -96,6 +99,7 @@ static int __init 
opt_kgdb_con(char *str)  early_param("kgdbcon", opt_kgdb_con);  module_param(kgdb_use_con, int, 0644); +module_param(kgdbreboot, int, 0644);  /*   * Holds information about breakpoints in a kernel. These breakpoints are @@ -784,6 +788,33 @@ void __init dbg_late_init(void)  	kdb_init(KDB_INIT_FULL);  } +static int +dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x) +{ +	/* +	 * Take the following action on reboot notify depending on value: +	 *    1 == Enter debugger +	 *    0 == [the default] detatch debug client +	 *   -1 == Do nothing... and use this until the board resets +	 */ +	switch (kgdbreboot) { +	case 1: +		kgdb_breakpoint(); +	case -1: +		goto done; +	} +	if (!dbg_kdb_mode) +		gdbstub_exit(code); +done: +	return NOTIFY_DONE; +} + +static struct notifier_block dbg_reboot_notifier = { +	.notifier_call		= dbg_notify_reboot, +	.next			= NULL, +	.priority		= INT_MAX, +}; +  static void kgdb_register_callbacks(void)  {  	if (!kgdb_io_module_registered) { @@ -791,6 +822,7 @@ static void kgdb_register_callbacks(void)  		kgdb_arch_init();  		if (!dbg_is_early)  			kgdb_arch_late(); +		register_reboot_notifier(&dbg_reboot_notifier);  		atomic_notifier_chain_register(&panic_notifier_list,  					       &kgdb_panic_event_nb);  #ifdef CONFIG_MAGIC_SYSRQ @@ -812,6 +844,7 @@ static void kgdb_unregister_callbacks(void)  	 */  	if (kgdb_io_module_registered) {  		kgdb_io_module_registered = 0; +		unregister_reboot_notifier(&dbg_reboot_notifier);  		atomic_notifier_chain_unregister(&panic_notifier_list,  					       &kgdb_panic_event_nb);  		kgdb_arch_exit(); diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index c22d8c28ad84..ce615e064482 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1111,6 +1111,13 @@ void gdbstub_exit(int status)  	unsigned char checksum, ch, buffer[3];  	int loop; +	if (!kgdb_connected) +		return; +	kgdb_connected = 0; + +	if (!dbg_io_ops || dbg_kdb_mode) +		return; +  	buffer[0] = 'W';  	
buffer[1] = hex_asc_hi(status);  	buffer[2] = hex_asc_lo(status); @@ -1129,5 +1136,6 @@ void gdbstub_exit(int status)  	dbg_io_ops->write_char(hex_asc_lo(checksum));  	/* make sure the output is flushed, lest the bootloader clobber it */ -	dbg_io_ops->flush(); +	if (dbg_io_ops->flush) +		dbg_io_ops->flush();  } diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 20059ef4459a..8418c2f8ec5d 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)  	} else {  		kdb_printf("%s: failed to set breakpoint at 0x%lx\n",  			   __func__, bp->bp_addr); +#ifdef CONFIG_DEBUG_RODATA +		if (!bp->bp_type) { +			kdb_printf("Software breakpoints are unavailable.\n" +				   "  Change the kernel CONFIG_DEBUG_RODATA=n\n" +				   "  OR use hw breaks: help bph\n"); +		} +#endif  		return 1;  	}  	return 0; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 4802eb5840e1..9b5f17da1c56 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -689,7 +689,7 @@ kdb_printit:  	if (!dbg_kdb_mode && kgdb_connected) {  		gdbstub_msg_write(kdb_buffer, retlen);  	} else { -		if (!dbg_io_ops->is_console) { +		if (dbg_io_ops && !dbg_io_ops->is_console) {  			len = strlen(kdb_buffer);  			cp = kdb_buffer;  			while (len--) { diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 4bca634975c0..118527aa60ea 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -25,6 +25,7 @@  #define KBD_STAT_MOUSE_OBF	0x20	/* Mouse output buffer full */  static int kbd_exists; +static int kbd_last_ret;  /*   * Check if the keyboard controller has a keypress for us. 
@@ -90,8 +91,11 @@ int kdb_get_kbd_char(void)  		return -1;  	} -	if ((scancode & 0x80) != 0) +	if ((scancode & 0x80) != 0) { +		if (scancode == 0x9c) +			kbd_last_ret = 0;  		return -1; +	}  	scancode &= 0x7f; @@ -178,35 +182,82 @@ int kdb_get_kbd_char(void)  		return -1;	/* ignore unprintables */  	} -	if ((scancode & 0x7f) == 0x1c) { -		/* -		 * enter key.  All done.  Absorb the release scancode. -		 */ +	if (scancode == 0x1c) { +		kbd_last_ret = 1; +		return 13; +	} + +	return keychar & 0xff; +} +EXPORT_SYMBOL_GPL(kdb_get_kbd_char); + +/* + * Best effort cleanup of ENTER break codes on leaving KDB. Called on + * exiting KDB, when we know we processed an ENTER or KP ENTER scan + * code. + */ +void kdb_kbd_cleanup_state(void) +{ +	int scancode, scanstatus; + +	/* +	 * Nothing to clean up, since either +	 * ENTER was never pressed, or has already +	 * gotten cleaned up. +	 */ +	if (!kbd_last_ret) +		return; + +	kbd_last_ret = 0; +	/* +	 * Enter key. Need to absorb the break code here, lest it gets +	 * leaked out if we exit KDB as the result of processing 'g'. +	 * +	 * This has several interesting implications: +	 * + Need to handle KP ENTER, which has break code 0xe0 0x9c. +	 * + Need to handle repeat ENTER and repeat KP ENTER. Repeats +	 *   only get a break code at the end of the repeated +	 *   sequence. This means we can't propagate the repeated key +	 *   press, and must swallow it away. +	 * + Need to handle possible PS/2 mouse input. +	 * + Need to handle mashed keys. +	 */ + +	while (1) {  		while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) -			; +			cpu_relax();  		/* -		 * Fetch the scancode +		 * Fetch the scancode.  		 */  		scancode = inb(KBD_DATA_REG);  		scanstatus = inb(KBD_STATUS_REG); -		while (scanstatus & KBD_STAT_MOUSE_OBF) { -			scancode = inb(KBD_DATA_REG); -			scanstatus = inb(KBD_STATUS_REG); -		} +		/* +		 * Skip mouse input. 
+		 */ +		if (scanstatus & KBD_STAT_MOUSE_OBF) +			continue; -		if (scancode != 0x9c) { -			/* -			 * Wasn't an enter-release,  why not? -			 */ -			kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", -			       scancode, scanstatus); -		} +		/* +		 * If we see 0xe0, this is either a break code for KP +		 * ENTER, or a repeat make for KP ENTER. Either way, +		 * since the second byte is equivalent to an ENTER, +		 * skip the 0xe0 and try again. +		 * +		 * If we see 0x1c, this must be a repeat ENTER or KP +		 * ENTER (and we swallowed 0xe0 before). Try again. +		 * +		 * We can also see make and break codes for other keys +		 * mashed before or after pressing ENTER. Thus, if we +		 * see anything other than 0x9c, we have to try again. +		 * +		 * Note, if you held some key as ENTER was depressed, +		 * that break code would get leaked out. +		 */ +		if (scancode != 0x9c) +			continue; -		return 13; +		return;  	} - -	return keychar & 0xff;  } -EXPORT_SYMBOL_GPL(kdb_get_kbd_char); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index e2ae7349437f..67b847dfa2bb 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,  	if (KDB_STATE(DOING_SS))  		KDB_STATE_CLEAR(SSBPT); +	/* Clean up any keyboard devices before leaving */ +	kdb_kbd_cleanup_state(); +  	return result;  } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index e381d105b40b..47c4e56e513b 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -246,6 +246,13 @@ extern void debug_kusage(void);  extern void kdb_set_current_task(struct task_struct *);  extern struct task_struct *kdb_current_task; + +#ifdef CONFIG_KDB_KEYBOARD +extern void kdb_kbd_cleanup_state(void); +#else /* ! CONFIG_KDB_KEYBOARD */ +#define kdb_kbd_cleanup_state() +#endif /* ! 
CONFIG_KDB_KEYBOARD */ +  #ifdef CONFIG_MODULES  extern struct list_head *kdb_modules;  #endif /* CONFIG_MODULES */ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 7d6fb40d2188..d35cc2d3a4cc 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)  	if (!pfn_valid(pfn))  		return 1;  	page = pfn_to_page(pfn); -	vaddr = kmap_atomic(page, KM_KDB); +	vaddr = kmap_atomic(page);  	memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); -	kunmap_atomic(vaddr, KM_KDB); +	kunmap_atomic(vaddr);  	return 0;  } diff --git a/kernel/events/core.c b/kernel/events/core.c index 1b5c081d8b9f..4b50357914fb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)  		       PERF_FLAG_FD_OUTPUT  |\  		       PERF_FLAG_PID_CGROUP) +/* + * branch priv levels that need permission checks + */ +#define PERF_SAMPLE_BRANCH_PERM_PLM \ +	(PERF_SAMPLE_BRANCH_KERNEL |\ +	 PERF_SAMPLE_BRANCH_HV) +  enum event_type_t {  	EVENT_FLEXIBLE = 0x1,  	EVENT_PINNED = 0x2, @@ -128,8 +135,9 @@ enum event_type_t {   * perf_sched_events : >0 events exist   * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu   */ -struct jump_label_key_deferred perf_sched_events __read_mostly; +struct static_key_deferred perf_sched_events __read_mostly;  static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);  static atomic_t nr_mmap_events __read_mostly;  static atomic_t nr_comm_events __read_mostly; @@ -881,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)  	if (is_cgroup_event(event))  		ctx->nr_cgroups++; +	if (has_branch_stack(event)) +		ctx->nr_branch_stack++; +  	list_add_rcu(&event->event_entry, &ctx->event_list);  	if (!ctx->nr_events)  		perf_pmu_rotate_start(ctx->pmu); @@ 
-1020,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)  			cpuctx->cgrp = NULL;  	} +	if (has_branch_stack(event)) +		ctx->nr_branch_stack--; +  	ctx->nr_events--;  	if (event->attr.inherit_stat)  		ctx->nr_stat--; @@ -2195,6 +2209,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,  }  /* + * When sampling the branch stack in system-wide mode, it may be necessary + * to flush the stack on context switch. This happens when the branch + * stack does not tag its entries with the pid of the current task. + * Otherwise it becomes impossible to associate a branch entry with a + * task. This ambiguity is more likely to appear when the branch stack + * supports priv level filtering and the user sets it to monitor only + * at the user level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch stack with + * branches from multiple tasks. Flushing may mean dropping the existing + * entries or stashing them somewhere in the PMU specific code layer. + * + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when there is at least one system-wide context + * with at least one active event using taken branch sampling. 
+ */ +static void perf_branch_stack_sched_in(struct task_struct *prev, +				       struct task_struct *task) +{ +	struct perf_cpu_context *cpuctx; +	struct pmu *pmu; +	unsigned long flags; + +	/* no need to flush branch stack if not changing task */ +	if (prev == task) +		return; + +	local_irq_save(flags); + +	rcu_read_lock(); + +	list_for_each_entry_rcu(pmu, &pmus, entry) { +		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + +		/* +		 * check if the context has at least one +		 * event using PERF_SAMPLE_BRANCH_STACK +		 */ +		if (cpuctx->ctx.nr_branch_stack > 0 +		    && pmu->flush_branch_stack) { + +			pmu = cpuctx->ctx.pmu; + +			perf_ctx_lock(cpuctx, cpuctx->task_ctx); + +			perf_pmu_disable(pmu); + +			pmu->flush_branch_stack(); + +			perf_pmu_enable(pmu); + +			perf_ctx_unlock(cpuctx, cpuctx->task_ctx); +		} +	} + +	rcu_read_unlock(); + +	local_irq_restore(flags); +} + +/*   * Called from scheduler to add the events of the current task   * with interrupts disabled.   * @@ -2225,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,  	 */  	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))  		perf_cgroup_sched_in(prev, task); + +	/* check for system-wide branch_stack events */ +	if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) +		perf_branch_stack_sched_in(prev, task);  }  static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2778,7 +2856,7 @@ static void free_event(struct perf_event *event)  	if (!event->parent) {  		if (event->attach_state & PERF_ATTACH_TASK) -			jump_label_dec_deferred(&perf_sched_events); +			static_key_slow_dec_deferred(&perf_sched_events);  		if (event->attr.mmap || event->attr.mmap_data)  			atomic_dec(&nr_mmap_events);  		if (event->attr.comm) @@ -2789,7 +2867,15 @@ static void free_event(struct perf_event *event)  			put_callchain_buffers();  		if (is_cgroup_event(event)) {  			atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); -			
jump_label_dec_deferred(&perf_sched_events); +			static_key_slow_dec_deferred(&perf_sched_events); +		} + +		if (has_branch_stack(event)) { +			static_key_slow_dec_deferred(&perf_sched_events); +			/* is system-wide event */ +			if (!(event->attach_state & PERF_ATTACH_TASK)) +				atomic_dec(&per_cpu(perf_branch_stack_events, +						    event->cpu));  		}  	} @@ -3238,10 +3324,6 @@ int perf_event_task_disable(void)  	return 0;  } -#ifndef PERF_EVENT_INDEX_OFFSET -# define PERF_EVENT_INDEX_OFFSET 0 -#endif -  static int perf_event_index(struct perf_event *event)  {  	if (event->hw.state & PERF_HES_STOPPED) @@ -3250,21 +3332,26 @@ static int perf_event_index(struct perf_event *event)  	if (event->state != PERF_EVENT_STATE_ACTIVE)  		return 0; -	return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; +	return event->pmu->event_idx(event);  }  static void calc_timer_values(struct perf_event *event, +				u64 *now,  				u64 *enabled,  				u64 *running)  { -	u64 now, ctx_time; +	u64 ctx_time; -	now = perf_clock(); -	ctx_time = event->shadow_ctx_time + now; +	*now = perf_clock(); +	ctx_time = event->shadow_ctx_time + *now;  	*enabled = ctx_time - event->tstamp_enabled;  	*running = ctx_time - event->tstamp_running;  } +void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ +} +  /*   * Callers need to ensure there can be no nesting of this function, otherwise   * the seqlock logic goes bad. 
We can not serialize this because the arch @@ -3274,7 +3361,7 @@ void perf_event_update_userpage(struct perf_event *event)  {  	struct perf_event_mmap_page *userpg;  	struct ring_buffer *rb; -	u64 enabled, running; +	u64 enabled, running, now;  	rcu_read_lock();  	/* @@ -3286,7 +3373,7 @@ void perf_event_update_userpage(struct perf_event *event)  	 * because of locking issue as we can be called in  	 * NMI context  	 */ -	calc_timer_values(event, &enabled, &running); +	calc_timer_values(event, &now, &enabled, &running);  	rb = rcu_dereference(event->rb);  	if (!rb)  		goto unlock; @@ -3302,7 +3389,7 @@ void perf_event_update_userpage(struct perf_event *event)  	barrier();  	userpg->index = perf_event_index(event);  	userpg->offset = perf_event_count(event); -	if (event->state == PERF_EVENT_STATE_ACTIVE) +	if (userpg->index)  		userpg->offset -= local64_read(&event->hw.prev_count);  	userpg->time_enabled = enabled + @@ -3311,6 +3398,8 @@ void perf_event_update_userpage(struct perf_event *event)  	userpg->time_running = running +  			atomic64_read(&event->child_total_time_running); +	perf_update_user_clock(userpg, now); +  	barrier();  	++userpg->lock;  	preempt_enable(); @@ -3568,6 +3657,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)  	event->mmap_user = get_current_user();  	vma->vm_mm->pinned_vm += event->mmap_locked; +	perf_event_update_userpage(event); +  unlock:  	if (!ret)  		atomic_inc(&event->mmap_count); @@ -3799,7 +3890,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,  static void perf_output_read(struct perf_output_handle *handle,  			     struct perf_event *event)  { -	u64 enabled = 0, running = 0; +	u64 enabled = 0, running = 0, now;  	u64 read_format = event->attr.read_format;  	/* @@ -3812,7 +3903,7 @@ static void perf_output_read(struct perf_output_handle *handle,  	 * NMI context  	 */  	if (read_format & PERF_FORMAT_TOTAL_TIMES) -		calc_timer_values(event, &enabled, &running); +		
calc_timer_values(event, &now, &enabled, &running);  	if (event->attr.read_format & PERF_FORMAT_GROUP)  		perf_output_read_group(handle, event, enabled, running); @@ -3902,6 +3993,24 @@ void perf_output_sample(struct perf_output_handle *handle,  			}  		}  	} + +	if (sample_type & PERF_SAMPLE_BRANCH_STACK) { +		if (data->br_stack) { +			size_t size; + +			size = data->br_stack->nr +			     * sizeof(struct perf_branch_entry); + +			perf_output_put(handle, data->br_stack->nr); +			perf_output_copy(handle, data->br_stack->entries, size); +		} else { +			/* +			 * we always store at least the value of nr +			 */ +			u64 nr = 0; +			perf_output_put(handle, nr); +		} +	}  }  void perf_prepare_sample(struct perf_event_header *header, @@ -3944,6 +4053,15 @@ void perf_prepare_sample(struct perf_event_header *header,  		WARN_ON_ONCE(size & (sizeof(u64)-1));  		header->size += size;  	} + +	if (sample_type & PERF_SAMPLE_BRANCH_STACK) { +		int size = sizeof(u64); /* nr */ +		if (data->br_stack) { +			size += data->br_stack->nr +			      * sizeof(struct perf_branch_entry); +		} +		header->size += size; +	}  }  static void perf_event_output(struct perf_event *event, @@ -4986,7 +5104,7 @@ fail:  	return err;  } -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];  static void sw_perf_event_destroy(struct perf_event *event)  { @@ -4994,7 +5112,7 @@ static void sw_perf_event_destroy(struct perf_event *event)  	WARN_ON(event->parent); -	jump_label_dec(&perf_swevent_enabled[event_id]); +	static_key_slow_dec(&perf_swevent_enabled[event_id]);  	swevent_hlist_put(event);  } @@ -5005,6 +5123,12 @@ static int perf_swevent_init(struct perf_event *event)  	if (event->attr.type != PERF_TYPE_SOFTWARE)  		return -ENOENT; +	/* +	 * no branch sampling for software events +	 */ +	if (has_branch_stack(event)) +		return -EOPNOTSUPP; +  	switch (event_id) {  	case PERF_COUNT_SW_CPU_CLOCK:  	case PERF_COUNT_SW_TASK_CLOCK: @@ 
-5024,13 +5148,18 @@ static int perf_swevent_init(struct perf_event *event)  		if (err)  			return err; -		jump_label_inc(&perf_swevent_enabled[event_id]); +		static_key_slow_inc(&perf_swevent_enabled[event_id]);  		event->destroy = sw_perf_event_destroy;  	}  	return 0;  } +static int perf_swevent_event_idx(struct perf_event *event) +{ +	return 0; +} +  static struct pmu perf_swevent = {  	.task_ctx_nr	= perf_sw_context, @@ -5040,6 +5169,8 @@ static struct pmu perf_swevent = {  	.start		= perf_swevent_start,  	.stop		= perf_swevent_stop,  	.read		= perf_swevent_read, + +	.event_idx	= perf_swevent_event_idx,  };  #ifdef CONFIG_EVENT_TRACING @@ -5108,6 +5239,12 @@ static int perf_tp_event_init(struct perf_event *event)  	if (event->attr.type != PERF_TYPE_TRACEPOINT)  		return -ENOENT; +	/* +	 * no branch sampling for tracepoint events +	 */ +	if (has_branch_stack(event)) +		return -EOPNOTSUPP; +  	err = perf_trace_init(event);  	if (err)  		return err; @@ -5126,6 +5263,8 @@ static struct pmu perf_tracepoint = {  	.start		= perf_swevent_start,  	.stop		= perf_swevent_stop,  	.read		= perf_swevent_read, + +	.event_idx	= perf_swevent_event_idx,  };  static inline void perf_tp_register(void) @@ -5331,6 +5470,12 @@ static int cpu_clock_event_init(struct perf_event *event)  	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)  		return -ENOENT; +	/* +	 * no branch sampling for software events +	 */ +	if (has_branch_stack(event)) +		return -EOPNOTSUPP; +  	perf_swevent_init_hrtimer(event);  	return 0; @@ -5345,6 +5490,8 @@ static struct pmu perf_cpu_clock = {  	.start		= cpu_clock_event_start,  	.stop		= cpu_clock_event_stop,  	.read		= cpu_clock_event_read, + +	.event_idx	= perf_swevent_event_idx,  };  /* @@ -5403,6 +5550,12 @@ static int task_clock_event_init(struct perf_event *event)  	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)  		return -ENOENT; +	/* +	 * no branch sampling for software events +	 */ +	if (has_branch_stack(event)) +		return -EOPNOTSUPP; +  	
perf_swevent_init_hrtimer(event);  	return 0; @@ -5417,6 +5570,8 @@ static struct pmu perf_task_clock = {  	.start		= task_clock_event_start,  	.stop		= task_clock_event_stop,  	.read		= task_clock_event_read, + +	.event_idx	= perf_swevent_event_idx,  };  static void perf_pmu_nop_void(struct pmu *pmu) @@ -5444,6 +5599,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)  	perf_pmu_enable(pmu);  } +static int perf_event_idx_default(struct perf_event *event) +{ +	return event->hw.idx + 1; +} +  /*   * Ensures all contexts with the same task_ctx_nr have the same   * pmu_cpu_context too. @@ -5530,6 +5690,7 @@ static int pmu_dev_alloc(struct pmu *pmu)  	if (!pmu->dev)  		goto out; +	pmu->dev->groups = pmu->attr_groups;  	device_initialize(pmu->dev);  	ret = dev_set_name(pmu->dev, "%s", pmu->name);  	if (ret) @@ -5633,6 +5794,9 @@ got_cpu_context:  		pmu->pmu_disable = perf_pmu_nop_void;  	} +	if (!pmu->event_idx) +		pmu->event_idx = perf_event_idx_default; +  	list_add_rcu(&pmu->entry, &pmus);  	ret = 0;  unlock: @@ -5825,7 +5989,7 @@ done:  	if (!event->parent) {  		if (event->attach_state & PERF_ATTACH_TASK) -			jump_label_inc(&perf_sched_events.key); +			static_key_slow_inc(&perf_sched_events.key);  		if (event->attr.mmap || event->attr.mmap_data)  			atomic_inc(&nr_mmap_events);  		if (event->attr.comm) @@ -5839,6 +6003,12 @@ done:  				return ERR_PTR(err);  			}  		} +		if (has_branch_stack(event)) { +			static_key_slow_inc(&perf_sched_events.key); +			if (!(event->attach_state & PERF_ATTACH_TASK)) +				atomic_inc(&per_cpu(perf_branch_stack_events, +						    event->cpu)); +		}  	}  	return event; @@ -5908,6 +6078,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,  	if (attr->read_format & ~(PERF_FORMAT_MAX-1))  		return -EINVAL; +	if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { +		u64 mask = attr->branch_sample_type; + +		/* only using defined bits */ +		if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) +			return -EINVAL; + +		/* at least one 
branch bit must be set */ +		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) +			return -EINVAL; + +		/* kernel level capture: check permissions */ +		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) +		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) +			return -EACCES; + +		/* propagate priv level, when not set for branch */ +		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { + +			/* exclude_kernel checked on syscall entry */ +			if (!attr->exclude_kernel) +				mask |= PERF_SAMPLE_BRANCH_KERNEL; + +			if (!attr->exclude_user) +				mask |= PERF_SAMPLE_BRANCH_USER; + +			if (!attr->exclude_hv) +				mask |= PERF_SAMPLE_BRANCH_HV; +			/* +			 * adjust user setting (for HW filter setup) +			 */ +			attr->branch_sample_type = mask; +		} +	}  out:  	return ret; @@ -6063,7 +6267,7 @@ SYSCALL_DEFINE5(perf_event_open,  		 * - that may need work on context switch  		 */  		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); -		jump_label_inc(&perf_sched_events.key); +		static_key_slow_inc(&perf_sched_events.key);  	}  	/* @@ -6943,8 +7147,7 @@ unlock:  device_initcall(perf_event_sysfs_init);  #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create( -	struct cgroup_subsys *ss, struct cgroup *cont) +static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)  {  	struct perf_cgroup *jc; @@ -6961,8 +7164,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(  	return &jc->css;  } -static void perf_cgroup_destroy(struct cgroup_subsys *ss, -				struct cgroup *cont) +static void perf_cgroup_destroy(struct cgroup *cont)  {  	struct perf_cgroup *jc;  	jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), @@ -6978,8 +7180,7 @@ static int __perf_cgroup_move(void *info)  	return 0;  } -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, -			       struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)  {  	struct task_struct *task; @@ -6987,8 +7188,8 @@ 
static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,  		task_function_call(task, __perf_cgroup_move, task);  } -static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, -		struct cgroup *old_cgrp, struct task_struct *task) +static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, +			     struct task_struct *task)  {  	/*  	 * cgroup_exit() is called in the copy_process() failure path. diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b7971d6f38bf..bb38c4d3ee12 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp)  	if (bp->attr.type != PERF_TYPE_BREAKPOINT)  		return -ENOENT; +	/* +	 * no branch sampling for breakpoint events +	 */ +	if (has_branch_stack(bp)) +		return -EOPNOTSUPP; +  	err = register_perf_hw_breakpoint(bp);  	if (err)  		return err; @@ -613,6 +619,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)  	bp->hw.state = PERF_HES_STOPPED;  } +static int hw_breakpoint_event_idx(struct perf_event *bp) +{ +	return 0; +} +  static struct pmu perf_breakpoint = {  	.task_ctx_nr	= perf_sw_context, /* could eventually get its own */ @@ -622,6 +633,8 @@ static struct pmu perf_breakpoint = {  	.start		= hw_breakpoint_start,  	.stop		= hw_breakpoint_stop,  	.read		= hw_breakpoint_pmu_read, + +	.event_idx	= hw_breakpoint_event_idx,  };  int __init init_hw_breakpoint(void) @@ -651,10 +664,10 @@ int __init init_hw_breakpoint(void)   err_alloc:  	for_each_possible_cpu(err_cpu) { -		if (err_cpu == cpu) -			break;  		for (i = 0; i < TYPE_MAX; i++)  			kfree(per_cpu(nr_task_bp_pinned[i], cpu)); +		if (err_cpu == cpu) +			break;  	}  	return -ENOMEM; diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f9bc6a..3db1909faed9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -52,6 +52,7 @@  #include <linux/hw_breakpoint.h>  #include <linux/oom.h>  #include 
<linux/writeback.h> +#include <linux/shm.h>  #include <asm/uaccess.h>  #include <asm/unistd.h> @@ -424,7 +425,7 @@ void daemonize(const char *name, ...)  	 */  	exit_mm(current);  	/* -	 * We don't want to have TIF_FREEZE set if the system-wide hibernation +	 * We don't want to get frozen, in case system-wide hibernation  	 * or suspend transition begins right now.  	 */  	current->flags |= (PF_NOFREEZE | PF_KTHREAD); @@ -686,11 +687,11 @@ static void exit_mm(struct task_struct * tsk)  }  /* - * When we die, we re-parent all our children. - * Try to give them to another thread in our thread - * group, and if no such member exists, give it to - * the child reaper process (ie "init") in our pid - * space. + * When we die, we re-parent all our children, and try to: + * 1. give them to another thread in our thread group, if such a member exists + * 2. give it to the first ancestor process which prctl'd itself as a + *    child_subreaper for its children (like a service manager) + * 3. give it to the init process (PID 1) in our pid namespace   */  static struct task_struct *find_new_reaper(struct task_struct *father)  	__releases(&tasklist_lock) @@ -710,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father)  	if (unlikely(pid_ns->child_reaper == father)) {  		write_unlock_irq(&tasklist_lock); -		if (unlikely(pid_ns == &init_pid_ns)) -			panic("Attempted to kill init!"); +		if (unlikely(pid_ns == &init_pid_ns)) { +			panic("Attempted to kill init! exitcode=0x%08x\n", +				father->signal->group_exit_code ?: +					father->exit_code); +		}  		zap_pid_ns_processes(pid_ns);  		write_lock_irq(&tasklist_lock); @@ -721,6 +725,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father)  		 * forget_original_parent() must move them somewhere.  		 
*/  		pid_ns->child_reaper = init_pid_ns.child_reaper; +	} else if (father->signal->has_child_subreaper) { +		struct task_struct *reaper; + +		/* +		 * Find the first ancestor marked as child_subreaper. +		 * Note that the code below checks same_thread_group(reaper, +		 * pid_ns->child_reaper).  This is what we need to DTRT in a +		 * PID namespace. However we still need the check above, see +		 * http://marc.info/?l=linux-kernel&m=131385460420380 +		 */ +		for (reaper = father->real_parent; +		     reaper != &init_task; +		     reaper = reaper->real_parent) { +			if (same_thread_group(reaper, pid_ns->child_reaper)) +				break; +			if (!reaper->signal->is_child_subreaper) +				continue; +			thread = reaper; +			do { +				if (!(thread->flags & PF_EXITING)) +					return reaper; +			} while_each_thread(reaper, thread); +		}  	}  	return pid_ns->child_reaper; @@ -818,25 +845,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)  	if (group_dead)  		kill_orphaned_pgrp(tsk->group_leader, NULL); -	/* Let father know we died -	 * -	 * Thread signals are configurable, but you aren't going to use -	 * that to send signals to arbitrary processes. -	 * That stops right now. -	 * -	 * If the parent exec id doesn't match the exec id we saved -	 * when we started then we know the parent has changed security -	 * domain. -	 * -	 * If our self_exec id doesn't match our parent_exec_id then -	 * we have changed execution domain as these two values started -	 * the same after a fork. 
-	 */ -	if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && -	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id || -	     tsk->self_exec_id != tsk->parent_exec_id)) -		tsk->exit_signal = SIGCHLD; -  	if (unlikely(tsk->ptrace)) {  		int sig = thread_group_leader(tsk) &&  				thread_group_empty(tsk) && @@ -935,8 +943,6 @@ void do_exit(long code)  		schedule();  	} -	exit_irq_thread(); -  	exit_signals(tsk);  /* sets PF_EXITING */  	/*  	 * tsk->flags are checked in the futex code to protect against @@ -945,6 +951,8 @@ void do_exit(long code)  	smp_mb();  	raw_spin_unlock_wait(&tsk->pi_lock); +	exit_irq_thread(); +  	if (unlikely(in_atomic()))  		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",  				current->comm, task_pid_nr(current), @@ -953,7 +961,7 @@ void do_exit(long code)  	acct_update_integrals(tsk);  	/* sync mm's RSS info before statistics gathering */  	if (tsk->mm) -		sync_mm_rss(tsk, tsk->mm); +		sync_mm_rss(tsk->mm);  	group_dead = atomic_dec_and_test(&tsk->signal->live);  	if (group_dead) {  		hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/fork.c b/kernel/fork.c index e2cd3e2a5ae8..b9372a0bff18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -193,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk)  	WARN_ON(atomic_read(&tsk->usage));  	WARN_ON(tsk == current); +	security_task_free(tsk);  	exit_creds(tsk);  	delayacct_tsk_free(tsk);  	put_signal_struct(tsk->signal); @@ -355,7 +356,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)  		charge = 0;  		if (mpnt->vm_flags & VM_ACCOUNT) {  			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; -			if (security_vm_enough_memory(len)) +			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */  				goto fail_nomem;  			charge = len;  		} @@ -511,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)  	return NULL;  } +static void check_mm(struct mm_struct *mm) +{ +	int i; + +	for (i 
= 0; i < NR_MM_COUNTERS; i++) { +		long x = atomic_long_read(&mm->rss_stat.count[i]); + +		if (unlikely(x)) +			printk(KERN_ALERT "BUG: Bad rss-counter state " +					  "mm:%p idx:%d val:%ld\n", mm, i, x); +	} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +	VM_BUG_ON(mm->pmd_huge_pte); +#endif +} +  /*   * Allocate and initialize an mm_struct.   */ @@ -538,9 +556,7 @@ void __mmdrop(struct mm_struct *mm)  	mm_free_pgd(mm);  	destroy_context(mm);  	mmu_notifier_mm_destroy(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -	VM_BUG_ON(mm->pmd_huge_pte); -#endif +	check_mm(mm);  	free_mm(mm);  }  EXPORT_SYMBOL_GPL(__mmdrop); @@ -668,6 +684,38 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)  	return mm;  } +static void complete_vfork_done(struct task_struct *tsk) +{ +	struct completion *vfork; + +	task_lock(tsk); +	vfork = tsk->vfork_done; +	if (likely(vfork)) { +		tsk->vfork_done = NULL; +		complete(vfork); +	} +	task_unlock(tsk); +} + +static int wait_for_vfork_done(struct task_struct *child, +				struct completion *vfork) +{ +	int killed; + +	freezer_do_not_count(); +	killed = wait_for_completion_killable(vfork); +	freezer_count(); + +	if (killed) { +		task_lock(child); +		child->vfork_done = NULL; +		task_unlock(child); +	} + +	put_task_struct(child); +	return killed; +} +  /* Please note the differences between mmput and mm_release.   * mmput is called whenever we stop holding onto a mm_struct,   * error success whatever. 
@@ -683,8 +731,6 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)   */  void mm_release(struct task_struct *tsk, struct mm_struct *mm)  { -	struct completion *vfork_done = tsk->vfork_done; -  	/* Get rid of any futexes when releasing the mm */  #ifdef CONFIG_FUTEX  	if (unlikely(tsk->robust_list)) { @@ -704,17 +750,15 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)  	/* Get rid of any cached register state */  	deactivate_mm(tsk, mm); -	/* notify parent sleeping on vfork() */ -	if (vfork_done) { -		tsk->vfork_done = NULL; -		complete(vfork_done); -	} +	if (tsk->vfork_done) +		complete_vfork_done(tsk);  	/*  	 * If we're exiting normally, clear a user-space tid field if  	 * requested.  We leave this alone when dying by signal, to leave  	 * the value intact in a core dump, and to save the unnecessary -	 * trouble otherwise.  Userland only wants this done for a sys_exit. +	 * trouble, say, a killed vfork parent shouldn't touch this mm. +	 * Userland only wants this done for a sys_exit.  	 
*/  	if (tsk->clear_child_tid) {  		if (!(tsk->flags & PF_SIGNALED) && @@ -1007,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)  	sig->oom_score_adj = current->signal->oom_score_adj;  	sig->oom_score_adj_min = current->signal->oom_score_adj_min; +	sig->has_child_subreaper = current->signal->has_child_subreaper || +				   current->signal->is_child_subreaper; +  	mutex_init(&sig->cred_guard_mutex);  	return 0; @@ -1018,7 +1065,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)  	new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);  	new_flags |= PF_FORKNOEXEC; -	new_flags |= PF_STARTING;  	p->flags = new_flags;  } @@ -1195,6 +1241,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  #ifdef CONFIG_CPUSETS  	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;  	p->cpuset_slab_spread_rotor = NUMA_NO_NODE; +	seqcount_init(&p->mems_allowed_seq);  #endif  #ifdef CONFIG_TRACE_IRQFLAGS  	p->irq_events = 0; @@ -1313,7 +1360,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	clear_all_latency_tracing(p);  	/* ok, now we should be set up.. */ -	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); +	if (clone_flags & CLONE_THREAD) +		p->exit_signal = -1; +	else if (clone_flags & CLONE_PARENT) +		p->exit_signal = current->group_leader->exit_signal; +	else +		p->exit_signal = (clone_flags & CSIGNAL); +  	p->pdeath_signal = 0;  	p->exit_state = 0; @@ -1548,16 +1601,9 @@ long do_fork(unsigned long clone_flags,  		if (clone_flags & CLONE_VFORK) {  			p->vfork_done = &vfork;  			init_completion(&vfork); +			get_task_struct(p);  		} -		/* -		 * We set PF_STARTING at creation in case tracing wants to -		 * use this to distinguish a fully live task from one that -		 * hasn't finished SIGSTOP raising yet.  Now we clear it -		 * and set the child going. 
-		 */ -		p->flags &= ~PF_STARTING; -  		wake_up_new_task(p);  		/* forking complete and child started to run, tell ptracer */ @@ -1565,10 +1611,8 @@ long do_fork(unsigned long clone_flags,  			ptrace_event(trace, nr);  		if (clone_flags & CLONE_VFORK) { -			freezer_do_not_count(); -			wait_for_completion(&vfork); -			freezer_count(); -			ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); +			if (!wait_for_vfork_done(p, &vfork)) +				ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);  		}  	} else {  		nr = PTR_ERR(p); diff --git a/kernel/freezer.c b/kernel/freezer.c index 9815b8d1eed5..11f82a4d4eae 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p)   * freeze_task - send a freeze request to given task   * @p: task to send the request to   * - * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE - * flag and either sending a fake signal to it or waking it up, depending - * on whether it has %PF_FREEZER_NOSIG set. + * If @p is freezing, the freeze request is sent either by sending a fake + * signal (if it's not a kernel thread) or waking it up (if it's a kernel + * thread).   
*   * RETURNS:   * %false, if @p is not freezing or already frozen; %true, otherwise diff --git a/kernel/futex.c b/kernel/futex.c index 1614be20173d..72efa1e4359a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2628,7 +2628,7 @@ void exit_robust_list(struct task_struct *curr)  long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,  		u32 __user *uaddr2, u32 val2, u32 val3)  { -	int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; +	int cmd = op & FUTEX_CMD_MASK;  	unsigned int flags = 0;  	if (!(op & FUTEX_PRIVATE_FLAG)) @@ -2641,49 +2641,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,  	}  	switch (cmd) { +	case FUTEX_LOCK_PI: +	case FUTEX_UNLOCK_PI: +	case FUTEX_TRYLOCK_PI: +	case FUTEX_WAIT_REQUEUE_PI: +	case FUTEX_CMP_REQUEUE_PI: +		if (!futex_cmpxchg_enabled) +			return -ENOSYS; +	} + +	switch (cmd) {  	case FUTEX_WAIT:  		val3 = FUTEX_BITSET_MATCH_ANY;  	case FUTEX_WAIT_BITSET: -		ret = futex_wait(uaddr, flags, val, timeout, val3); -		break; +		return futex_wait(uaddr, flags, val, timeout, val3);  	case FUTEX_WAKE:  		val3 = FUTEX_BITSET_MATCH_ANY;  	case FUTEX_WAKE_BITSET: -		ret = futex_wake(uaddr, flags, val, val3); -		break; +		return futex_wake(uaddr, flags, val, val3);  	case FUTEX_REQUEUE: -		ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); -		break; +		return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);  	case FUTEX_CMP_REQUEUE: -		ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); -		break; +		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);  	case FUTEX_WAKE_OP: -		ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); -		break; +		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);  	case FUTEX_LOCK_PI: -		if (futex_cmpxchg_enabled) -			ret = futex_lock_pi(uaddr, flags, val, timeout, 0); -		break; +		return futex_lock_pi(uaddr, flags, val, timeout, 0);  	case FUTEX_UNLOCK_PI: -		if (futex_cmpxchg_enabled) -			ret = futex_unlock_pi(uaddr, 
flags); -		break; +		return futex_unlock_pi(uaddr, flags);  	case FUTEX_TRYLOCK_PI: -		if (futex_cmpxchg_enabled) -			ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); -		break; +		return futex_lock_pi(uaddr, flags, 0, timeout, 1);  	case FUTEX_WAIT_REQUEUE_PI:  		val3 = FUTEX_BITSET_MATCH_ANY; -		ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, -					    uaddr2); -		break; +		return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, +					     uaddr2);  	case FUTEX_CMP_REQUEUE_PI: -		ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); -		break; -	default: -		ret = -ENOSYS; +		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);  	} -	return ret; +	return -ENOSYS;  } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 2e48ec0c2e91..c21449f85a2a 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -119,15 +119,20 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)   * For preemptible RCU it is sufficient to call rcu_read_unlock in order   * to exit the grace period. For classic RCU, a reschedule is required.   */ -static void rcu_lock_break(struct task_struct *g, struct task_struct *t) +static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)  { +	bool can_cont; +  	get_task_struct(g);  	get_task_struct(t);  	rcu_read_unlock();  	cond_resched();  	rcu_read_lock(); +	can_cont = pid_alive(g) && pid_alive(t);  	put_task_struct(t);  	put_task_struct(g); + +	return can_cont;  }  /* @@ -154,9 +159,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)  			goto unlock;  		if (!--batch_count) {  			batch_count = HUNG_TASK_BATCHING; -			rcu_lock_break(g, t); -			/* Exit if t or g was unhashed during refresh. 
*/ -			if (t->state == TASK_DEAD || g->state == TASK_DEAD) +			if (!rcu_lock_break(g, t))  				goto unlock;  		}  		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 342d8f44e401..0119b9d467ae 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -53,7 +53,7 @@ unsigned long probe_irq_on(void)  			if (desc->irq_data.chip->irq_set_type)  				desc->irq_data.chip->irq_set_type(&desc->irq_data,  							 IRQ_TYPE_PROBE); -			irq_startup(desc); +			irq_startup(desc, false);  		}  		raw_spin_unlock_irq(&desc->lock);  	} @@ -70,7 +70,7 @@ unsigned long probe_irq_on(void)  		raw_spin_lock_irq(&desc->lock);  		if (!desc->action && irq_settings_can_probe(desc)) {  			desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; -			if (irq_startup(desc)) +			if (irq_startup(desc, false))  				desc->istate |= IRQS_PENDING;  		}  		raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f7c543a801d9..6080f6bc8c33 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -16,6 +16,8 @@  #include <linux/interrupt.h>  #include <linux/kernel_stat.h> +#include <trace/events/irq.h> +  #include "internals.h"  /** @@ -61,8 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)  		return -EINVAL;  	type &= IRQ_TYPE_SENSE_MASK; -	if (type != IRQ_TYPE_NONE) -		ret = __irq_set_trigger(desc, irq, type); +	ret = __irq_set_trigger(desc, irq, type);  	irq_put_desc_busunlock(desc, flags);  	return ret;  } @@ -157,19 +158,22 @@ static void irq_state_set_masked(struct irq_desc *desc)  	irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);  } -int irq_startup(struct irq_desc *desc) +int irq_startup(struct irq_desc *desc, bool resend)  { +	int ret = 0; +  	irq_state_clr_disabled(desc);  	desc->depth = 0;  	if (desc->irq_data.chip->irq_startup) { -		int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); +		ret = desc->irq_data.chip->irq_startup(&desc->irq_data);  		
irq_state_clr_masked(desc); -		return ret; +	} else { +		irq_enable(desc);  	} - -	irq_enable(desc); -	return 0; +	if (resend) +		check_irq_resend(desc, desc->irq_data.irq); +	return ret;  }  void irq_shutdown(struct irq_desc *desc) @@ -330,6 +334,24 @@ out_unlock:  }  EXPORT_SYMBOL_GPL(handle_simple_irq); +/* + * Called unconditionally from handle_level_irq() and only for oneshot + * interrupts from handle_fasteoi_irq() + */ +static void cond_unmask_irq(struct irq_desc *desc) +{ +	/* +	 * We need to unmask in the following cases: +	 * - Standard level irq (IRQF_ONESHOT is not set) +	 * - Oneshot irq which did not wake the thread (caused by a +	 *   spurious interrupt or a primary handler handling it +	 *   completely). +	 */ +	if (!irqd_irq_disabled(&desc->irq_data) && +	    irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) +		unmask_irq(desc); +} +  /**   *	handle_level_irq - Level type irq handler   *	@irq:	the interrupt number @@ -362,8 +384,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)  	handle_irq_event(desc); -	if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) -		unmask_irq(desc); +	cond_unmask_irq(desc); +  out_unlock:  	raw_spin_unlock(&desc->lock);  } @@ -417,6 +439,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)  	preflow_handler(desc);  	handle_irq_event(desc); +	if (desc->istate & IRQS_ONESHOT) +		cond_unmask_irq(desc); +  out_eoi:  	desc->irq_data.chip->irq_eoi(&desc->irq_data);  out_unlock: @@ -625,7 +650,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,  		irq_settings_set_noprobe(desc);  		irq_settings_set_norequest(desc);  		irq_settings_set_nothread(desc); -		irq_startup(desc); +		irq_startup(desc, true);  	}  out:  	irq_put_desc_busunlock(desc, flags); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 470d08c82bbe..6ff84e6a954c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -60,7 +60,7 @@ static void 
irq_wake_thread(struct irq_desc *desc, struct irqaction *action)  	 * device interrupt, so no irq storm is lurking. If the  	 * RUNTHREAD bit is already set, nothing to do.  	 */ -	if (test_bit(IRQTF_DIED, &action->thread_flags) || +	if ((action->thread->flags & PF_EXITING) ||  	    test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))  		return; @@ -110,6 +110,18 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)  	 * threads_oneshot untouched and runs the thread another time.  	 */  	desc->threads_oneshot |= action->thread_mask; + +	/* +	 * We increment the threads_active counter in case we wake up +	 * the irq thread. The irq thread decrements the counter when +	 * it returns from the handler or in the exit path and wakes +	 * up waiters which are stuck in synchronize_irq() when the +	 * active count becomes zero. synchronize_irq() is serialized +	 * against this code (hard irq handler) via IRQS_INPROGRESS +	 * like the finalize_oneshot() code. See comment above. 
+	 */ +	atomic_inc(&desc->threads_active); +  	wake_up_process(action->thread);  } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b7952316016a..8e5c56b3b7d9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -20,14 +20,12 @@ extern bool noirqdebug;  /*   * Bits used by threaded handlers:   * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run - * IRQTF_DIED      - handler thread died   * IRQTF_WARNED    - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed   * IRQTF_AFFINITY  - irq thread is requested to adjust affinity   * IRQTF_FORCED_THREAD  - irq action is force threaded   */  enum {  	IRQTF_RUNTHREAD, -	IRQTF_DIED,  	IRQTF_WARNED,  	IRQTF_AFFINITY,  	IRQTF_FORCED_THREAD, @@ -67,7 +65,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,  extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);  extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); -extern int irq_startup(struct irq_desc *desc); +extern int irq_startup(struct irq_desc *desc, bool resend);  extern void irq_shutdown(struct irq_desc *desc);  extern void irq_enable(struct irq_desc *desc);  extern void irq_disable(struct irq_desc *desc); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1f9e26526b69..af48e59bc2ff 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,189 +1,793 @@ +#include <linux/debugfs.h> +#include <linux/hardirq.h> +#include <linux/interrupt.h>  #include <linux/irq.h> +#include <linux/irqdesc.h>  #include <linux/irqdomain.h>  #include <linux/module.h>  #include <linux/mutex.h>  #include <linux/of.h>  #include <linux/of_address.h> +#include <linux/seq_file.h>  #include <linux/slab.h> +#include <linux/smp.h> +#include <linux/fs.h> + +#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. +				 * ie. 
legacy 8259, gets irqs 1..15 */ +#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ +#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ +#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */  static LIST_HEAD(irq_domain_list);  static DEFINE_MUTEX(irq_domain_mutex); +static DEFINE_MUTEX(revmap_trees_mutex); +static unsigned int irq_virq_count = NR_IRQS; +static struct irq_domain *irq_default_domain; +  /** - * irq_domain_add() - Register an irq_domain - * @domain: ptr to initialized irq_domain structure + * irq_domain_alloc() - Allocate a new irq_domain data structure + * @of_node: optional device-tree node of the interrupt controller + * @revmap_type: type of reverse mapping to use + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer   * - * Registers an irq_domain structure.  The irq_domain must at a minimum be - * initialized with an ops structure pointer, and either a ->to_irq hook or - * a valid irq_base value.  Everything else is optional. + * Allocates and initialize and irq_domain structure.  Caller is expected to + * register allocated irq_domain with irq_domain_register().  Returns pointer + * to IRQ domain, or NULL on failure.   */ -void irq_domain_add(struct irq_domain *domain) +static struct irq_domain *irq_domain_alloc(struct device_node *of_node, +					   unsigned int revmap_type, +					   const struct irq_domain_ops *ops, +					   void *host_data)  { -	struct irq_data *d; -	int hwirq, irq; +	struct irq_domain *domain; -	/* -	 * This assumes that the irq_domain owner has already allocated -	 * the irq_descs.  This block will be removed when support for dynamic -	 * allocation of irq_descs is added to irq_domain. 
-	 */ -	irq_domain_for_each_irq(domain, hwirq, irq) { -		d = irq_get_irq_data(irq); -		if (!d) { -			WARN(1, "error: assigning domain to non existant irq_desc"); -			return; -		} -		if (d->domain) { -			/* things are broken; just report, don't clean up */ -			WARN(1, "error: irq_desc already assigned to a domain"); -			return; +	domain = kzalloc(sizeof(*domain), GFP_KERNEL); +	if (WARN_ON(!domain)) +		return NULL; + +	/* Fill structure */ +	domain->revmap_type = revmap_type; +	domain->ops = ops; +	domain->host_data = host_data; +	domain->of_node = of_node_get(of_node); + +	return domain; +} + +static void irq_domain_add(struct irq_domain *domain) +{ +	mutex_lock(&irq_domain_mutex); +	list_add(&domain->link, &irq_domain_list); +	mutex_unlock(&irq_domain_mutex); +	pr_debug("irq: Allocated domain of type %d @0x%p\n", +		 domain->revmap_type, domain); +} + +static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, +					     irq_hw_number_t hwirq) +{ +	irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; +	int size = domain->revmap_data.legacy.size; + +	if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) +		return 0; +	return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; +} + +/** + * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @size: total number of irqs in legacy mapping + * @first_irq: first number of irq block assigned to the domain + * @first_hwirq: first hwirq number to use for the translation. Should normally + *               be '0', but a positive integer can be used if the effective + *               hwirqs numbering does not begin at zero. + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + * + * Note: the map() callback will be called before this function returns + * for all legacy interrupts except 0 (which is always the invalid irq for + * a legacy controller). 
+ */ +struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, +					 unsigned int size, +					 unsigned int first_irq, +					 irq_hw_number_t first_hwirq, +					 const struct irq_domain_ops *ops, +					 void *host_data) +{ +	struct irq_domain *domain; +	unsigned int i; + +	domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); +	if (!domain) +		return NULL; + +	domain->revmap_data.legacy.first_irq = first_irq; +	domain->revmap_data.legacy.first_hwirq = first_hwirq; +	domain->revmap_data.legacy.size = size; + +	mutex_lock(&irq_domain_mutex); +	/* Verify that all the irqs are available */ +	for (i = 0; i < size; i++) { +		int irq = first_irq + i; +		struct irq_data *irq_data = irq_get_irq_data(irq); + +		if (WARN_ON(!irq_data || irq_data->domain)) { +			mutex_unlock(&irq_domain_mutex); +			of_node_put(domain->of_node); +			kfree(domain); +			return NULL;  		} -		d->domain = domain; -		d->hwirq = hwirq;  	} -	mutex_lock(&irq_domain_mutex); -	list_add(&domain->list, &irq_domain_list); +	/* Claim all of the irqs before registering a legacy domain */ +	for (i = 0; i < size; i++) { +		struct irq_data *irq_data = irq_get_irq_data(first_irq + i); +		irq_data->hwirq = first_hwirq + i; +		irq_data->domain = domain; +	}  	mutex_unlock(&irq_domain_mutex); + +	for (i = 0; i < size; i++) { +		int irq = first_irq + i; +		int hwirq = first_hwirq + i; + +		/* IRQ0 gets ignored */ +		if (!irq) +			continue; + +		/* Legacy flags are left to default at this point, +		 * one can then use irq_create_mapping() to +		 * explicitly change them +		 */ +		ops->map(domain, irq, hwirq); + +		/* Clear norequest flags */ +		irq_clear_status_flags(irq, IRQ_NOREQUEST); +	} + +	irq_domain_add(domain); +	return domain; +} + +/** + * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. 
+ * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + */ +struct irq_domain *irq_domain_add_linear(struct device_node *of_node, +					 unsigned int size, +					 const struct irq_domain_ops *ops, +					 void *host_data) +{ +	struct irq_domain *domain; +	unsigned int *revmap; + +	revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); +	if (WARN_ON(!revmap)) +		return NULL; + +	domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); +	if (!domain) { +		kfree(revmap); +		return NULL; +	} +	domain->revmap_data.linear.size = size; +	domain->revmap_data.linear.revmap = revmap; +	irq_domain_add(domain); +	return domain; +} + +struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, +					 const struct irq_domain_ops *ops, +					 void *host_data) +{ +	struct irq_domain *domain = irq_domain_alloc(of_node, +					IRQ_DOMAIN_MAP_NOMAP, ops, host_data); +	if (domain) +		irq_domain_add(domain); +	return domain; +} + +/** + * irq_domain_add_tree() + * @of_node: pointer to interrupt controller's device tree node. + * @ops: map/unmap domain callbacks + * + * Note: The radix tree will be allocated later during boot automatically + * (the reverse mapping will use the slow path until that happens). + */ +struct irq_domain *irq_domain_add_tree(struct device_node *of_node, +					 const struct irq_domain_ops *ops, +					 void *host_data) +{ +	struct irq_domain *domain = irq_domain_alloc(of_node, +					IRQ_DOMAIN_MAP_TREE, ops, host_data); +	if (domain) { +		INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); +		irq_domain_add(domain); +	} +	return domain;  }  /** - * irq_domain_del() - Unregister an irq_domain - * @domain: ptr to registered irq_domain. 
+ * irq_find_host() - Locates a domain for a given device node + * @node: device-tree node of the interrupt controller   */ -void irq_domain_del(struct irq_domain *domain) +struct irq_domain *irq_find_host(struct device_node *node)  { -	struct irq_data *d; -	int hwirq, irq; +	struct irq_domain *h, *found = NULL; +	int rc; +	/* We might want to match the legacy controller last since +	 * it might potentially be set to match all interrupts in +	 * the absence of a device node. This isn't a problem so far +	 * yet though... +	 */  	mutex_lock(&irq_domain_mutex); -	list_del(&domain->list); +	list_for_each_entry(h, &irq_domain_list, link) { +		if (h->ops->match) +			rc = h->ops->match(h, node); +		else +			rc = (h->of_node != NULL) && (h->of_node == node); + +		if (rc) { +			found = h; +			break; +		} +	}  	mutex_unlock(&irq_domain_mutex); +	return found; +} +EXPORT_SYMBOL_GPL(irq_find_host); + +/** + * irq_set_default_host() - Set a "default" irq domain + * @domain: default domain pointer + * + * For convenience, it's possible to set a "default" domain that will be used + * whenever NULL is passed to irq_create_mapping(). It makes life easier for + * platforms that want to manipulate a few hard coded interrupt numbers that + * aren't properly represented in the device-tree. 
+ */ +void irq_set_default_host(struct irq_domain *domain) +{ +	pr_debug("irq: Default domain set to @0x%p\n", domain); + +	irq_default_domain = domain; +} + +/** + * irq_set_virq_count() - Set the maximum number of linux irqs + * @count: number of linux irqs, capped with NR_IRQS + * + * This is mainly for use by platforms like iSeries who want to program + * the virtual irq number in the controller to avoid the reverse mapping + */ +void irq_set_virq_count(unsigned int count) +{ +	pr_debug("irq: Trying to set virq count to %d\n", count); -	/* Clear the irq_domain assignments */ -	irq_domain_for_each_irq(domain, hwirq, irq) { -		d = irq_get_irq_data(irq); -		d->domain = NULL; +	BUG_ON(count < NUM_ISA_INTERRUPTS); +	if (count < NR_IRQS) +		irq_virq_count = count; +} + +static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, +			    irq_hw_number_t hwirq) +{ +	struct irq_data *irq_data = irq_get_irq_data(virq); + +	irq_data->hwirq = hwirq; +	irq_data->domain = domain; +	if (domain->ops->map(domain, virq, hwirq)) { +		pr_debug("irq: -> mapping failed, freeing\n"); +		irq_data->domain = NULL; +		irq_data->hwirq = 0; +		return -1;  	} + +	irq_clear_status_flags(virq, IRQ_NOREQUEST); + +	return 0;  } -#if defined(CONFIG_OF_IRQ)  /** - * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec + * irq_create_direct_mapping() - Allocate an irq for direct mapping + * @domain: domain to allocate the irq for or NULL for default domain   * - * Used by the device tree interrupt mapping code to translate a device tree - * interrupt specifier to a valid linux irq number.  Returns either a valid - * linux IRQ number or 0. + * This routine is used for irq controllers which can choose the hardware + * interrupt numbers they generate. In such a case it's simplest to use + * the linux irq as the hardware interrupt number. 
+ */ +unsigned int irq_create_direct_mapping(struct irq_domain *domain) +{ +	unsigned int virq; + +	if (domain == NULL) +		domain = irq_default_domain; + +	BUG_ON(domain == NULL); +	WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); + +	virq = irq_alloc_desc_from(1, 0); +	if (!virq) { +		pr_debug("irq: create_direct virq allocation failed\n"); +		return 0; +	} +	if (virq >= irq_virq_count) { +		pr_err("ERROR: no free irqs available below %i maximum\n", +			irq_virq_count); +		irq_free_desc(virq); +		return 0; +	} + +	pr_debug("irq: create_direct obtained virq %d\n", virq); + +	if (irq_setup_virq(domain, virq, virq)) { +		irq_free_desc(virq); +		return 0; +	} + +	return virq; +} + +/** + * irq_create_mapping() - Map a hardware interrupt into linux irq space + * @domain: domain owning this hardware interrupt or NULL for default domain + * @hwirq: hardware irq number in that domain space   * - * When the caller no longer need the irq number returned by this function it - * should arrange to call irq_dispose_mapping(). + * Only one mapping per hardware interrupt is permitted. Returns a linux + * irq number. + * If the sense/trigger is to be specified, set_irq_type() should be called + * on the number returned from that call.   
*/ +unsigned int irq_create_mapping(struct irq_domain *domain, +				irq_hw_number_t hwirq) +{ +	unsigned int virq, hint; + +	pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); + +	/* Look for default domain if nececssary */ +	if (domain == NULL) +		domain = irq_default_domain; +	if (domain == NULL) { +		printk(KERN_WARNING "irq_create_mapping called for" +		       " NULL domain, hwirq=%lx\n", hwirq); +		WARN_ON(1); +		return 0; +	} +	pr_debug("irq: -> using domain @%p\n", domain); + +	/* Check if mapping already exists */ +	virq = irq_find_mapping(domain, hwirq); +	if (virq) { +		pr_debug("irq: -> existing mapping on virq %d\n", virq); +		return virq; +	} + +	/* Get a virtual interrupt number */ +	if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) +		return irq_domain_legacy_revmap(domain, hwirq); + +	/* Allocate a virtual interrupt number */ +	hint = hwirq % irq_virq_count; +	if (hint == 0) +		hint++; +	virq = irq_alloc_desc_from(hint, 0); +	if (!virq) +		virq = irq_alloc_desc_from(1, 0); +	if (!virq) { +		pr_debug("irq: -> virq allocation failed\n"); +		return 0; +	} + +	if (irq_setup_virq(domain, virq, hwirq)) { +		if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) +			irq_free_desc(virq); +		return 0; +	} + +	pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", +		hwirq, domain->of_node ? 
domain->of_node->full_name : "null", virq); + +	return virq; +} +EXPORT_SYMBOL_GPL(irq_create_mapping); +  unsigned int irq_create_of_mapping(struct device_node *controller,  				   const u32 *intspec, unsigned int intsize)  {  	struct irq_domain *domain; -	unsigned long hwirq; -	unsigned int irq, type; -	int rc = -EINVAL; +	irq_hw_number_t hwirq; +	unsigned int type = IRQ_TYPE_NONE; +	unsigned int virq; -	/* Find a domain which can translate the irq spec */ -	mutex_lock(&irq_domain_mutex); -	list_for_each_entry(domain, &irq_domain_list, list) { -		if (!domain->ops->dt_translate) -			continue; -		rc = domain->ops->dt_translate(domain, controller, -					intspec, intsize, &hwirq, &type); -		if (rc == 0) -			break; +	domain = controller ? irq_find_host(controller) : irq_default_domain; +	if (!domain) { +#ifdef CONFIG_MIPS +		/* +		 * Workaround to avoid breaking interrupt controller drivers +		 * that don't yet register an irq_domain.  This is temporary +		 * code. ~~~gcl, Feb 24, 2012 +		 * +		 * Scheduled for removal in Linux v3.6.  That should be enough +		 * time. 
+		 */ +		if (intsize > 0) +			return intspec[0]; +#endif +		printk(KERN_WARNING "irq: no irq domain found for %s !\n", +		       controller->full_name); +		return 0;  	} -	mutex_unlock(&irq_domain_mutex); -	if (rc != 0) -		return 0; +	/* If domain has no translation, then we assume interrupt line */ +	if (domain->ops->xlate == NULL) +		hwirq = intspec[0]; +	else { +		if (domain->ops->xlate(domain, controller, intspec, intsize, +				     &hwirq, &type)) +			return 0; +	} + +	/* Create mapping */ +	virq = irq_create_mapping(domain, hwirq); +	if (!virq) +		return virq; -	irq = irq_domain_to_irq(domain, hwirq); -	if (type != IRQ_TYPE_NONE) -		irq_set_irq_type(irq, type); -	pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", -		 controller->full_name, (int)hwirq, irq, type); -	return irq; +	/* Set type if specified and different than the current one */ +	if (type != IRQ_TYPE_NONE && +	    type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) +		irq_set_irq_type(virq, type); +	return virq;  }  EXPORT_SYMBOL_GPL(irq_create_of_mapping);  /** - * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() - * @irq: linux irq number to be discarded + * irq_dispose_mapping() - Unmap an interrupt + * @virq: linux irq number of the interrupt to unmap + */ +void irq_dispose_mapping(unsigned int virq) +{ +	struct irq_data *irq_data = irq_get_irq_data(virq); +	struct irq_domain *domain; +	irq_hw_number_t hwirq; + +	if (!virq || !irq_data) +		return; + +	domain = irq_data->domain; +	if (WARN_ON(domain == NULL)) +		return; + +	/* Never unmap legacy interrupts */ +	if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) +		return; + +	irq_set_status_flags(virq, IRQ_NOREQUEST); + +	/* remove chip and handler */ +	irq_set_chip_and_handler(virq, NULL, NULL); + +	/* Make sure it's completed */ +	synchronize_irq(virq); + +	/* Tell the PIC about it */ +	if (domain->ops->unmap) +		domain->ops->unmap(domain, virq); +	smp_mb(); + +	/* Clear reverse map */ +	hwirq = 
irq_data->hwirq; +	switch(domain->revmap_type) { +	case IRQ_DOMAIN_MAP_LINEAR: +		if (hwirq < domain->revmap_data.linear.size) +			domain->revmap_data.linear.revmap[hwirq] = 0; +		break; +	case IRQ_DOMAIN_MAP_TREE: +		mutex_lock(&revmap_trees_mutex); +		radix_tree_delete(&domain->revmap_data.tree, hwirq); +		mutex_unlock(&revmap_trees_mutex); +		break; +	} + +	irq_free_desc(virq); +} +EXPORT_SYMBOL_GPL(irq_dispose_mapping); + +/** + * irq_find_mapping() - Find a linux irq from an hw irq number. + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space + * + * This is a slow path, for use by generic code. It's expected that an + * irq controller implementation directly calls the appropriate low level + * mapping function. + */ +unsigned int irq_find_mapping(struct irq_domain *domain, +			      irq_hw_number_t hwirq) +{ +	unsigned int i; +	unsigned int hint = hwirq % irq_virq_count; + +	/* Look for default domain if nececssary */ +	if (domain == NULL) +		domain = irq_default_domain; +	if (domain == NULL) +		return 0; + +	/* legacy -> bail early */ +	if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) +		return irq_domain_legacy_revmap(domain, hwirq); + +	/* Slow path does a linear search of the map */ +	if (hint == 0) +		hint = 1; +	i = hint; +	do { +		struct irq_data *data = irq_get_irq_data(i); +		if (data && (data->domain == domain) && (data->hwirq == hwirq)) +			return i; +		i++; +		if (i >= irq_virq_count) +			i = 1; +	} while(i != hint); +	return 0; +} +EXPORT_SYMBOL_GPL(irq_find_mapping); + +/** + * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space   * - * Calling this function indicates the caller no longer needs a reference to - * the linux irq number returned by a prior call to irq_create_of_mapping(). 
+ * This is a fast path, for use by irq controller code that uses radix tree + * revmaps   */ -void irq_dispose_mapping(unsigned int irq) +unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, +				     irq_hw_number_t hwirq)  { +	struct irq_data *irq_data; + +	if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) +		return irq_find_mapping(domain, hwirq); + +	/* +	 * Freeing an irq can delete nodes along the path to +	 * do the lookup via call_rcu. +	 */ +	rcu_read_lock(); +	irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); +	rcu_read_unlock(); +  	/* -	 * nothing yet; will be filled when support for dynamic allocation of -	 * irq_descs is added to irq_domain +	 * If found in radix tree, then fine. +	 * Else fallback to linear lookup - this should not happen in practice +	 * as it means that we failed to insert the node in the radix tree.  	 */ +	return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);  } -EXPORT_SYMBOL_GPL(irq_dispose_mapping); -int irq_domain_simple_dt_translate(struct irq_domain *d, -			    struct device_node *controller, -			    const u32 *intspec, unsigned int intsize, -			    unsigned long *out_hwirq, unsigned int *out_type) +/** + * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. + * @domain: domain owning this hardware interrupt + * @virq: linux irq number + * @hwirq: hardware irq number in that domain space + * + * This is for use by irq controllers that use a radix tree reverse + * mapping for fast lookup. 
+ */ +void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, +			     irq_hw_number_t hwirq)  { -	if (d->of_node != controller) -		return -EINVAL; -	if (intsize < 1) -		return -EINVAL; -	if (d->nr_irq && ((intspec[0] < d->hwirq_base) || -	    (intspec[0] >= d->hwirq_base + d->nr_irq))) -		return -EINVAL; +	struct irq_data *irq_data = irq_get_irq_data(virq); + +	if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) +		return; + +	if (virq) { +		mutex_lock(&revmap_trees_mutex); +		radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); +		mutex_unlock(&revmap_trees_mutex); +	} +} + +/** + * irq_linear_revmap() - Find a linux irq from a hw irq number. + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space + * + * This is a fast path, for use by irq controller code that uses linear + * revmaps. It does fallback to the slow path if the revmap doesn't exist + * yet and will create the revmap entry with appropriate locking + */ +unsigned int irq_linear_revmap(struct irq_domain *domain, +			       irq_hw_number_t hwirq) +{ +	unsigned int *revmap; + +	if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) +		return irq_find_mapping(domain, hwirq); + +	/* Check revmap bounds */ +	if (unlikely(hwirq >= domain->revmap_data.linear.size)) +		return irq_find_mapping(domain, hwirq); + +	/* Check if revmap was allocated */ +	revmap = domain->revmap_data.linear.revmap; +	if (unlikely(revmap == NULL)) +		return irq_find_mapping(domain, hwirq); + +	/* Fill up revmap with slow path if no mapping found */ +	if (unlikely(!revmap[hwirq])) +		revmap[hwirq] = irq_find_mapping(domain, hwirq); + +	return revmap[hwirq]; +} + +#ifdef CONFIG_VIRQ_DEBUG +static int virq_debug_show(struct seq_file *m, void *private) +{ +	unsigned long flags; +	struct irq_desc *desc; +	const char *p; +	static const char none[] = "none"; +	void *data; +	int i; + +	seq_printf(m, "%-5s  %-7s  %-15s  %-18s  %s\n", "virq", "hwirq", +	
	      "chip name", "chip data", "domain name"); + +	for (i = 1; i < nr_irqs; i++) { +		desc = irq_to_desc(i); +		if (!desc) +			continue; + +		raw_spin_lock_irqsave(&desc->lock, flags); + +		if (desc->action && desc->action->handler) { +			struct irq_chip *chip; + +			seq_printf(m, "%5d  ", i); +			seq_printf(m, "0x%05lx  ", desc->irq_data.hwirq); + +			chip = irq_desc_get_chip(desc); +			if (chip && chip->name) +				p = chip->name; +			else +				p = none; +			seq_printf(m, "%-15s  ", p); + +			data = irq_desc_get_chip_data(desc); +			seq_printf(m, "0x%16p  ", data); + +			if (desc->irq_data.domain->of_node) +				p = desc->irq_data.domain->of_node->full_name; +			else +				p = none; +			seq_printf(m, "%s\n", p); +		} + +		raw_spin_unlock_irqrestore(&desc->lock, flags); +	} + +	return 0; +} +static int virq_debug_open(struct inode *inode, struct file *file) +{ +	return single_open(file, virq_debug_show, inode->i_private); +} + +static const struct file_operations virq_debug_fops = { +	.open = virq_debug_open, +	.read = seq_read, +	.llseek = seq_lseek, +	.release = single_release, +}; + +static int __init irq_debugfs_init(void) +{ +	if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root, +				 NULL, &virq_debug_fops) == NULL) +		return -ENOMEM; + +	return 0; +} +__initcall(irq_debugfs_init); +#endif /* CONFIG_VIRQ_DEBUG */ + +int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, +			  irq_hw_number_t hwirq) +{ +	return 0; +} + +/** + * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings + * + * Device Tree IRQ specifier translation function which works with one cell + * bindings where the cell value maps directly to the hwirq number. 
+ */ +int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, +			     const u32 *intspec, unsigned int intsize, +			     unsigned long *out_hwirq, unsigned int *out_type) +{ +	if (WARN_ON(intsize < 1)) +		return -EINVAL;  	*out_hwirq = intspec[0];  	*out_type = IRQ_TYPE_NONE; -	if (intsize > 1) -		*out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;  	return 0;  } +EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);  /** - * irq_domain_create_simple() - Set up a 'simple' translation range + * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings + * + * Device Tree IRQ specifier translation function which works with two cell + * bindings where the cell values map directly to the hwirq number + * and linux irq flags.   */ -void irq_domain_add_simple(struct device_node *controller, int irq_base) +int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, +			const u32 *intspec, unsigned int intsize, +			irq_hw_number_t *out_hwirq, unsigned int *out_type)  { -	struct irq_domain *domain; - -	domain = kzalloc(sizeof(*domain), GFP_KERNEL); -	if (!domain) { -		WARN_ON(1); -		return; -	} +	if (WARN_ON(intsize < 2)) +		return -EINVAL; +	*out_hwirq = intspec[0]; +	*out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; +	return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); -	domain->irq_base = irq_base; -	domain->of_node = of_node_get(controller); -	domain->ops = &irq_domain_simple_ops; -	irq_domain_add(domain); +/** + * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings + * + * Device Tree IRQ specifier translation function which works with either one + * or two cell bindings where the cell values map directly to the hwirq number + * and linux irq flags. + * + * Note: don't use this function unless your interrupt controller explicitly + * supports both one and two cell bindings.  For the majority of controllers + * the _onecell() or _twocell() variants above should be used. 
+ */ +int irq_domain_xlate_onetwocell(struct irq_domain *d, +				struct device_node *ctrlr, +				const u32 *intspec, unsigned int intsize, +				unsigned long *out_hwirq, unsigned int *out_type) +{ +	if (WARN_ON(intsize < 1)) +		return -EINVAL; +	*out_hwirq = intspec[0]; +	*out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; +	return 0;  } -EXPORT_SYMBOL_GPL(irq_domain_add_simple); +EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); +const struct irq_domain_ops irq_domain_simple_ops = { +	.map = irq_domain_simple_map, +	.xlate = irq_domain_xlate_onetwocell, +}; +EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +#ifdef CONFIG_OF_IRQ  void irq_domain_generate_simple(const struct of_device_id *match,  				u64 phys_base, unsigned int irq_start)  {  	struct device_node *node; -	pr_info("looking for phys_base=%llx, irq_start=%i\n", +	pr_debug("looking for phys_base=%llx, irq_start=%i\n",  		(unsigned long long) phys_base, (int) irq_start);  	node = of_find_matching_node_by_address(NULL, match, phys_base);  	if (node) -		irq_domain_add_simple(node, irq_start); -	else -		pr_info("no node found\n"); +		irq_domain_add_legacy(node, 32, irq_start, 0, +				      &irq_domain_simple_ops, NULL);  }  EXPORT_SYMBOL_GPL(irq_domain_generate_simple); -#endif /* CONFIG_OF_IRQ */ - -struct irq_domain_ops irq_domain_simple_ops = { -#ifdef CONFIG_OF_IRQ -	.dt_translate = irq_domain_simple_dt_translate, -#endif /* CONFIG_OF_IRQ */ -}; -EXPORT_SYMBOL_GPL(irq_domain_simple_ops); +#endif diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a9a9dbe49fea..b0ccd1ac2d6a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -759,6 +759,13 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,  	return ret;  } +static void wake_threads_waitq(struct irq_desc *desc) +{ +	if (atomic_dec_and_test(&desc->threads_active) && +	    waitqueue_active(&desc->wait_for_threads)) +		wake_up(&desc->wait_for_threads); +} +  /*   * Interrupt handler thread   */ @@ -771,57 +778,41 @@ static int 
irq_thread(void *data)  	struct irq_desc *desc = irq_to_desc(action->irq);  	irqreturn_t (*handler_fn)(struct irq_desc *desc,  			struct irqaction *action); -	int wake; -	if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, +	if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,  					&action->thread_flags))  		handler_fn = irq_forced_thread_fn;  	else  		handler_fn = irq_thread_fn;  	sched_setscheduler(current, SCHED_FIFO, &param); -	current->irqaction = action; +	current->irq_thread = 1;  	while (!irq_wait_for_interrupt(action)) { +		irqreturn_t action_ret;  		irq_thread_check_affinity(desc, action); -		atomic_inc(&desc->threads_active); - -		raw_spin_lock_irq(&desc->lock); -		if (unlikely(irqd_irq_disabled(&desc->irq_data))) { -			/* -			 * CHECKME: We might need a dedicated -			 * IRQ_THREAD_PENDING flag here, which -			 * retriggers the thread in check_irq_resend() -			 * but AFAICT IRQS_PENDING should be fine as it -			 * retriggers the interrupt itself --- tglx -			 */ -			desc->istate |= IRQS_PENDING; -			raw_spin_unlock_irq(&desc->lock); -		} else { -			irqreturn_t action_ret; - -			raw_spin_unlock_irq(&desc->lock); -			action_ret = handler_fn(desc, action); -			if (!noirqdebug) -				note_interrupt(action->irq, desc, action_ret); -		} +		action_ret = handler_fn(desc, action); +		if (!noirqdebug) +			note_interrupt(action->irq, desc, action_ret); -		wake = atomic_dec_and_test(&desc->threads_active); - -		if (wake && waitqueue_active(&desc->wait_for_threads)) -			wake_up(&desc->wait_for_threads); +		wake_threads_waitq(desc);  	} -	/* Prevent a stale desc->threads_oneshot */ -	irq_finalize_oneshot(desc, action, true); -  	/* -	 * Clear irqaction. Otherwise exit_irq_thread() would make +	 * This is the regular exit path. __free_irq() is stopping the +	 * thread via kthread_stop() after calling +	 * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the +	 * oneshot mask bit can be set. 
We cannot verify that as we +	 * cannot touch the oneshot mask at this point anymore as +	 * __setup_irq() might have given out currents thread_mask +	 * again. +	 * +	 * Clear irq_thread. Otherwise exit_irq_thread() would make  	 * fuzz about an active irq thread going into nirvana.  	 */ -	current->irqaction = NULL; +	current->irq_thread = 0;  	return 0;  } @@ -832,27 +823,28 @@ void exit_irq_thread(void)  {  	struct task_struct *tsk = current;  	struct irq_desc *desc; +	struct irqaction *action; -	if (!tsk->irqaction) +	if (!tsk->irq_thread)  		return; +	action = kthread_data(tsk); +  	printk(KERN_ERR  	       "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", -	       tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); +	       tsk->comm ? tsk->comm : "", tsk->pid, action->irq); -	desc = irq_to_desc(tsk->irqaction->irq); +	desc = irq_to_desc(action->irq);  	/* -	 * Prevent a stale desc->threads_oneshot. Must be called -	 * before setting the IRQTF_DIED flag. +	 * If IRQTF_RUNTHREAD is set, we need to decrement +	 * desc->threads_active and wake possible waiters.  	 */ -	irq_finalize_oneshot(desc, tsk->irqaction, true); +	if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) +		wake_threads_waitq(desc); -	/* -	 * Set the THREAD DIED flag to prevent further wakeups of the -	 * soon to be gone threaded handler. -	 */ -	set_bit(IRQTF_DIED, &tsk->irqaction->flags); +	/* Prevent a stale desc->threads_oneshot */ +	irq_finalize_oneshot(desc, action, true);  }  static void irq_setup_forced_threading(struct irqaction *new) @@ -985,6 +977,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  		/* add new interrupt at end of irq queue */  		do { +			/* +			 * Or all existing action->thread_mask bits, +			 * so we can find the next zero bit for this +			 * new action. 
+			 */  			thread_mask |= old->thread_mask;  			old_ptr = &old->next;  			old = *old_ptr; @@ -993,14 +990,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  	}  	/* -	 * Setup the thread mask for this irqaction. Unlikely to have -	 * 32 resp 64 irqs sharing one line, but who knows. +	 * Setup the thread mask for this irqaction for ONESHOT. For +	 * !ONESHOT irqs the thread mask is 0 so we can avoid a +	 * conditional in irq_wake_thread().  	 */ -	if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { -		ret = -EBUSY; -		goto out_mask; +	if (new->flags & IRQF_ONESHOT) { +		/* +		 * Unlikely to have 32 resp 64 irqs sharing one line, +		 * but who knows. +		 */ +		if (thread_mask == ~0UL) { +			ret = -EBUSY; +			goto out_mask; +		} +		/* +		 * The thread_mask for the action is or'ed to +		 * desc->thread_active to indicate that the +		 * IRQF_ONESHOT thread handler has been woken, but not +		 * yet finished. The bit is cleared when a thread +		 * completes. When all threads of a shared interrupt +		 * line have completed desc->threads_active becomes +		 * zero and the interrupt line is unmasked. See +		 * handle.c:irq_wake_thread() for further information. +		 * +		 * If no thread is woken by primary (hard irq context) +		 * interrupt handlers, then desc->threads_active is +		 * also checked for zero to unmask the irq line in the +		 * affected hard irq flow handlers +		 * (handle_[fasteoi|level]_irq). +		 * +		 * The new action gets the first zero bit of +		 * thread_mask assigned. See the loop above which or's +		 * all existing action->thread_mask bits. 
+		 */ +		new->thread_mask = 1 << ffz(thread_mask);  	} -	new->thread_mask = 1 << ffz(thread_mask);  	if (!shared) {  		init_waitqueue_head(&desc->wait_for_threads); @@ -1027,7 +1051,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  			desc->istate |= IRQS_ONESHOT;  		if (irq_settings_can_autoenable(desc)) -			irq_startup(desc); +			irq_startup(desc, true);  		else  			/* Undo nested disables: */  			desc->depth = 1; @@ -1103,8 +1127,7 @@ out_thread:  		struct task_struct *t = new->thread;  		new->thread = NULL; -		if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) -			kthread_stop(t); +		kthread_stop(t);  		put_task_struct(t);  	}  out_mput: @@ -1214,8 +1237,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)  #endif  	if (action->thread) { -		if (!test_bit(IRQTF_DIED, &action->thread_flags)) -			kthread_stop(action->thread); +		kthread_stop(action->thread);  		put_task_struct(action->thread);  	} diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 01d3b70fc98a..43049192b5ec 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -12,7 +12,7 @@  #include <linux/slab.h>  #include <linux/sort.h>  #include <linux/err.h> -#include <linux/jump_label.h> +#include <linux/static_key.h>  #ifdef HAVE_JUMP_LABEL @@ -29,11 +29,6 @@ void jump_label_unlock(void)  	mutex_unlock(&jump_label_mutex);  } -bool jump_label_enabled(struct jump_label_key *key) -{ -	return !!atomic_read(&key->enabled); -} -  static int jump_label_cmp(const void *a, const void *b)  {  	const struct jump_entry *jea = a; @@ -58,56 +53,66 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)  	sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);  } -static void jump_label_update(struct jump_label_key *key, int enable); +static void jump_label_update(struct static_key *key, int enable); -void jump_label_inc(struct jump_label_key *key) +void static_key_slow_inc(struct static_key *key)  {  	if 
(atomic_inc_not_zero(&key->enabled))  		return;  	jump_label_lock(); -	if (atomic_read(&key->enabled) == 0) -		jump_label_update(key, JUMP_LABEL_ENABLE); +	if (atomic_read(&key->enabled) == 0) { +		if (!jump_label_get_branch_default(key)) +			jump_label_update(key, JUMP_LABEL_ENABLE); +		else +			jump_label_update(key, JUMP_LABEL_DISABLE); +	}  	atomic_inc(&key->enabled);  	jump_label_unlock();  } -EXPORT_SYMBOL_GPL(jump_label_inc); +EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __jump_label_dec(struct jump_label_key *key, +static void __static_key_slow_dec(struct static_key *key,  		unsigned long rate_limit, struct delayed_work *work)  { -	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) +	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { +		WARN(atomic_read(&key->enabled) < 0, +		     "jump label: negative count!\n");  		return; +	}  	if (rate_limit) {  		atomic_inc(&key->enabled);  		schedule_delayed_work(work, rate_limit); -	} else -		jump_label_update(key, JUMP_LABEL_DISABLE); - +	} else { +		if (!jump_label_get_branch_default(key)) +			jump_label_update(key, JUMP_LABEL_DISABLE); +		else +			jump_label_update(key, JUMP_LABEL_ENABLE); +	}  	jump_label_unlock();  } -EXPORT_SYMBOL_GPL(jump_label_dec);  static void jump_label_update_timeout(struct work_struct *work)  { -	struct jump_label_key_deferred *key = -		container_of(work, struct jump_label_key_deferred, work.work); -	__jump_label_dec(&key->key, 0, NULL); +	struct static_key_deferred *key = +		container_of(work, struct static_key_deferred, work.work); +	__static_key_slow_dec(&key->key, 0, NULL);  } -void jump_label_dec(struct jump_label_key *key) +void static_key_slow_dec(struct static_key *key)  { -	__jump_label_dec(key, 0, NULL); +	__static_key_slow_dec(key, 0, NULL);  } +EXPORT_SYMBOL_GPL(static_key_slow_dec); -void jump_label_dec_deferred(struct jump_label_key_deferred *key) +void static_key_slow_dec_deferred(struct static_key_deferred *key)  { -	
__jump_label_dec(&key->key, key->timeout, &key->work); +	__static_key_slow_dec(&key->key, key->timeout, &key->work);  } +EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); - -void jump_label_rate_limit(struct jump_label_key_deferred *key, +void jump_label_rate_limit(struct static_key_deferred *key,  		unsigned long rl)  {  	key->timeout = rl; @@ -150,7 +155,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry  	arch_jump_label_transform(entry, type);	  } -static void __jump_label_update(struct jump_label_key *key, +static void __jump_label_update(struct static_key *key,  				struct jump_entry *entry,  				struct jump_entry *stop, int enable)  { @@ -167,27 +172,40 @@ static void __jump_label_update(struct jump_label_key *key,  	}  } +static enum jump_label_type jump_label_type(struct static_key *key) +{ +	bool true_branch = jump_label_get_branch_default(key); +	bool state = static_key_enabled(key); + +	if ((!true_branch && state) || (true_branch && !state)) +		return JUMP_LABEL_ENABLE; + +	return JUMP_LABEL_DISABLE; +} +  void __init jump_label_init(void)  {  	struct jump_entry *iter_start = __start___jump_table;  	struct jump_entry *iter_stop = __stop___jump_table; -	struct jump_label_key *key = NULL; +	struct static_key *key = NULL;  	struct jump_entry *iter;  	jump_label_lock();  	jump_label_sort_entries(iter_start, iter_stop);  	for (iter = iter_start; iter < iter_stop; iter++) { -		struct jump_label_key *iterk; +		struct static_key *iterk; -		iterk = (struct jump_label_key *)(unsigned long)iter->key; -		arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? -						 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); +		iterk = (struct static_key *)(unsigned long)iter->key; +		arch_jump_label_transform_static(iter, jump_label_type(iterk));  		if (iterk == key)  			continue;  		key = iterk; -		key->entries = iter; +		/* +		 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. 
+		 */ +		*((unsigned long *)&key->entries) += (unsigned long)iter;  #ifdef CONFIG_MODULES  		key->next = NULL;  #endif @@ -197,8 +215,8 @@ void __init jump_label_init(void)  #ifdef CONFIG_MODULES -struct jump_label_mod { -	struct jump_label_mod *next; +struct static_key_mod { +	struct static_key_mod *next;  	struct jump_entry *entries;  	struct module *mod;  }; @@ -218,9 +236,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end)  				start, end);  } -static void __jump_label_mod_update(struct jump_label_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key, int enable)  { -	struct jump_label_mod *mod = key->next; +	struct static_key_mod *mod = key->next;  	while (mod) {  		struct module *m = mod->mod; @@ -251,11 +269,7 @@ void jump_label_apply_nops(struct module *mod)  		return;  	for (iter = iter_start; iter < iter_stop; iter++) { -		struct jump_label_key *iterk; - -		iterk = (struct jump_label_key *)(unsigned long)iter->key; -		arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? 
-				JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); +		arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);  	}  } @@ -264,8 +278,8 @@ static int jump_label_add_module(struct module *mod)  	struct jump_entry *iter_start = mod->jump_entries;  	struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;  	struct jump_entry *iter; -	struct jump_label_key *key = NULL; -	struct jump_label_mod *jlm; +	struct static_key *key = NULL; +	struct static_key_mod *jlm;  	/* if the module doesn't have jump label entries, just return */  	if (iter_start == iter_stop) @@ -274,28 +288,30 @@ static int jump_label_add_module(struct module *mod)  	jump_label_sort_entries(iter_start, iter_stop);  	for (iter = iter_start; iter < iter_stop; iter++) { -		if (iter->key == (jump_label_t)(unsigned long)key) -			continue; +		struct static_key *iterk; -		key = (struct jump_label_key *)(unsigned long)iter->key; +		iterk = (struct static_key *)(unsigned long)iter->key; +		if (iterk == key) +			continue; +		key = iterk;  		if (__module_address(iter->key) == mod) { -			atomic_set(&key->enabled, 0); -			key->entries = iter; +			/* +			 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. 
+			 */ +			*((unsigned long *)&key->entries) += (unsigned long)iter;  			key->next = NULL;  			continue;  		} - -		jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); +		jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);  		if (!jlm)  			return -ENOMEM; -  		jlm->mod = mod;  		jlm->entries = iter;  		jlm->next = key->next;  		key->next = jlm; -		if (jump_label_enabled(key)) +		if (jump_label_type(key) == JUMP_LABEL_ENABLE)  			__jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);  	} @@ -307,14 +323,14 @@ static void jump_label_del_module(struct module *mod)  	struct jump_entry *iter_start = mod->jump_entries;  	struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;  	struct jump_entry *iter; -	struct jump_label_key *key = NULL; -	struct jump_label_mod *jlm, **prev; +	struct static_key *key = NULL; +	struct static_key_mod *jlm, **prev;  	for (iter = iter_start; iter < iter_stop; iter++) {  		if (iter->key == (jump_label_t)(unsigned long)key)  			continue; -		key = (struct jump_label_key *)(unsigned long)iter->key; +		key = (struct static_key *)(unsigned long)iter->key;  		if (__module_address(iter->key) == mod)  			continue; @@ -416,12 +432,13 @@ int jump_label_text_reserved(void *start, void *end)  	return ret;  } -static void jump_label_update(struct jump_label_key *key, int enable) +static void jump_label_update(struct static_key *key, int enable)  { -	struct jump_entry *entry = key->entries, *stop = __stop___jump_table; +	struct jump_entry *stop = __stop___jump_table; +	struct jump_entry *entry = jump_label_get_entries(key);  #ifdef CONFIG_MODULES -	struct module *mod = __module_address((jump_label_t)key); +	struct module *mod = __module_address((unsigned long)key);  	__jump_label_mod_update(key, enable); diff --git a/kernel/kexec.c b/kernel/kexec.c index 7b0886786701..a6a675cb9818 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1546,13 +1546,13 @@ int kernel_kexec(void)  		if (error)  			goto Resume_console;  		/* At this 
point, dpm_suspend_start() has been called, -		 * but *not* dpm_suspend_noirq(). We *must* call -		 * dpm_suspend_noirq() now.  Otherwise, drivers for +		 * but *not* dpm_suspend_end(). We *must* call +		 * dpm_suspend_end() now.  Otherwise, drivers for  		 * some devices (e.g. interrupt controllers) become  		 * desynchronized with the actual state of the  		 * hardware at resume time, and evil weirdness ensues.  		 */ -		error = dpm_suspend_noirq(PMSG_FREEZE); +		error = dpm_suspend_end(PMSG_FREEZE);  		if (error)  			goto Resume_devices;  		error = disable_nonboot_cpus(); @@ -1579,7 +1579,7 @@ int kernel_kexec(void)  		local_irq_enable();   Enable_cpus:  		enable_nonboot_cpus(); -		dpm_resume_noirq(PMSG_RESTORE); +		dpm_resume_start(PMSG_RESTORE);   Resume_devices:  		dpm_resume_end(PMSG_RESTORE);   Resume_console: diff --git a/kernel/kmod.c b/kernel/kmod.c index a0a88543934e..957a7aab8ebc 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -60,6 +60,43 @@ static DECLARE_RWSEM(umhelper_sem);  */  char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +static void free_modprobe_argv(struct subprocess_info *info) +{ +	kfree(info->argv[3]); /* check call_modprobe() */ +	kfree(info->argv); +} + +static int call_modprobe(char *module_name, int wait) +{ +	static char *envp[] = { +		"HOME=/", +		"TERM=linux", +		"PATH=/sbin:/usr/sbin:/bin:/usr/bin", +		NULL +	}; + +	char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); +	if (!argv) +		goto out; + +	module_name = kstrdup(module_name, GFP_KERNEL); +	if (!module_name) +		goto free_argv; + +	argv[0] = modprobe_path; +	argv[1] = "-q"; +	argv[2] = "--"; +	argv[3] = module_name;	/* check free_modprobe_argv() */ +	argv[4] = NULL; + +	return call_usermodehelper_fns(modprobe_path, argv, envp, +		wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); +free_argv: +	kfree(argv); +out: +	return -ENOMEM; +} +  /**   * __request_module - try to load a kernel module   * @wait: wait (or not) for the operation to complete @@ -81,11 +118,6 @@ 
int __request_module(bool wait, const char *fmt, ...)  	char module_name[MODULE_NAME_LEN];  	unsigned int max_modprobes;  	int ret; -	char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; -	static char *envp[] = { "HOME=/", -				"TERM=linux", -				"PATH=/sbin:/usr/sbin:/bin:/usr/bin", -				NULL };  	static atomic_t kmod_concurrent = ATOMIC_INIT(0);  #define MAX_KMOD_CONCURRENT 50	/* Completely arbitrary value - KAO */  	static int kmod_loop_msg; @@ -128,9 +160,7 @@ int __request_module(bool wait, const char *fmt, ...)  	trace_module_request(module_name, wait, _RET_IP_); -	ret = call_usermodehelper_fns(modprobe_path, argv, envp, -			wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, -			NULL, NULL, NULL); +	ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);  	atomic_dec(&kmod_concurrent);  	return ret; @@ -188,7 +218,7 @@ static int ____call_usermodehelper(void *data)  	/* Exec failed? */  fail:  	sub_info->retval = retval; -	do_exit(0); +	return 0;  }  void call_usermodehelper_freeinfo(struct subprocess_info *info) @@ -199,6 +229,19 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)  }  EXPORT_SYMBOL(call_usermodehelper_freeinfo); +static void umh_complete(struct subprocess_info *sub_info) +{ +	struct completion *comp = xchg(&sub_info->complete, NULL); +	/* +	 * See call_usermodehelper_exec(). If xchg() returns NULL +	 * we own sub_info, the UMH_KILLABLE caller has gone away. +	 */ +	if (comp) +		complete(comp); +	else +		call_usermodehelper_freeinfo(sub_info); +} +  /* Keventd can't block, but this (a child) can. 
*/  static int wait_for_helper(void *data)  { @@ -235,7 +278,7 @@ static int wait_for_helper(void *data)  			sub_info->retval = ret;  	} -	complete(sub_info->complete); +	umh_complete(sub_info);  	return 0;  } @@ -244,7 +287,7 @@ static void __call_usermodehelper(struct work_struct *work)  {  	struct subprocess_info *sub_info =  		container_of(work, struct subprocess_info, work); -	enum umh_wait wait = sub_info->wait; +	int wait = sub_info->wait & ~UMH_KILLABLE;  	pid_t pid;  	/* CLONE_VFORK: wait until the usermode helper has execve'd @@ -269,7 +312,7 @@ static void __call_usermodehelper(struct work_struct *work)  	case UMH_WAIT_EXEC:  		if (pid < 0)  			sub_info->retval = pid; -		complete(sub_info->complete); +		umh_complete(sub_info);  	}  } @@ -435,8 +478,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns);   * asynchronously if wait is not set, and runs as a child of keventd.   * (ie. it runs with full root capabilities).   */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, -			     enum umh_wait wait) +int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)  {  	DECLARE_COMPLETION_ONSTACK(done);  	int retval = 0; @@ -456,9 +498,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,  	queue_work(khelper_wq, &sub_info->work);  	if (wait == UMH_NO_WAIT)	/* task has freed sub_info */  		goto unlock; + +	if (wait & UMH_KILLABLE) { +		retval = wait_for_completion_killable(&done); +		if (!retval) +			goto wait_done; + +		/* umh_complete() will see NULL and free sub_info */ +		if (xchg(&sub_info->complete, NULL)) +			goto unlock; +		/* fallthrough, umh_complete() was already called */ +	} +  	wait_for_completion(&done); +wait_done:  	retval = sub_info->retval; -  out:  	call_usermodehelper_freeinfo(sub_info);  unlock: diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9788c0ec6f43..c62b8546cc90 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1334,8 +1334,10 @@ int __kprobes register_kprobe(struct kprobe *p)  	if 
(!kernel_text_address((unsigned long) p->addr) ||  	    in_kprobes_functions((unsigned long) p->addr) ||  	    ftrace_text_reserved(p->addr, p->addr) || -	    jump_label_text_reserved(p->addr, p->addr)) -		goto fail_with_jump_label; +	    jump_label_text_reserved(p->addr, p->addr)) { +		ret = -EINVAL; +		goto cannot_probe; +	}  	/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */  	p->flags &= KPROBE_FLAG_DISABLED; @@ -1352,7 +1354,7 @@ int __kprobes register_kprobe(struct kprobe *p)  		 * its code to prohibit unexpected unloading.  		 */  		if (unlikely(!try_module_get(probed_mod))) -			goto fail_with_jump_label; +			goto cannot_probe;  		/*  		 * If the module freed .init.text, we couldn't insert @@ -1361,7 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p)  		if (within_module_init((unsigned long)p->addr, probed_mod) &&  		    probed_mod->state != MODULE_STATE_COMING) {  			module_put(probed_mod); -			goto fail_with_jump_label; +			goto cannot_probe;  		}  		/* ret will be updated by following code */  	} @@ -1409,7 +1411,7 @@ out:  	return ret; -fail_with_jump_label: +cannot_probe:  	preempt_enable();  	jump_label_unlock();  	return ret; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8889f7dd7c46..ea9ee4518c35 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4176,7 +4176,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)  	printk("-------------------------------\n");  	printk("%s:%d %s!\n", file, line, s);  	printk("\nother info that might help us debug this:\n\n"); -	printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); +	printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", +	       !rcu_lockdep_current_cpu_online() +			? "RCU used illegally from offline CPU!\n" +			: rcu_is_cpu_idle() +				? 
"RCU used illegally from idle CPU!\n" +				: "", +	       rcu_scheduler_active, debug_locks);  	/*  	 * If a CPU is in the RCU-free window in idle (ie: in the section diff --git a/kernel/mutex.c b/kernel/mutex.c index 89096dd8786f..a307cc9c9526 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		/* didn't get the lock, go to sleep: */  		spin_unlock_mutex(&lock->wait_lock, flags); -		preempt_enable_no_resched(); -		schedule(); -		preempt_disable(); +		schedule_preempt_disabled();  		spin_lock_mutex(&lock->wait_lock, flags);  	} diff --git a/kernel/padata.c b/kernel/padata.c index b45259931512..6f10eb285ece 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -29,7 +29,6 @@  #include <linux/sysfs.h>  #include <linux/rcupdate.h> -#define MAX_SEQ_NR (INT_MAX - NR_CPUS)  #define MAX_OBJ_NUM 1000  static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) @@ -43,18 +42,19 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)  	return target_cpu;  } -static int padata_cpu_hash(struct padata_priv *padata) +static int padata_cpu_hash(struct parallel_data *pd)  {  	int cpu_index; -	struct parallel_data *pd; - -	pd =  padata->pd;  	/*  	 * Hash the sequence numbers to the cpus by taking  	 * seq_nr mod. number of cpus in use.  	 
*/ -	cpu_index =  padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); + +	spin_lock(&pd->seq_lock); +	cpu_index =  pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); +	pd->seq_nr++; +	spin_unlock(&pd->seq_lock);  	return padata_index_to_cpu(pd, cpu_index);  } @@ -132,12 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst,  	padata->pd = pd;  	padata->cb_cpu = cb_cpu; -	if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) -		atomic_set(&pd->seq_nr, -1); - -	padata->seq_nr = atomic_inc_return(&pd->seq_nr); - -	target_cpu = padata_cpu_hash(padata); +	target_cpu = padata_cpu_hash(pd);  	queue = per_cpu_ptr(pd->pqueue, target_cpu);  	spin_lock(&queue->parallel.lock); @@ -173,7 +168,7 @@ EXPORT_SYMBOL(padata_do_parallel);  static struct padata_priv *padata_get_next(struct parallel_data *pd)  {  	int cpu, num_cpus; -	int next_nr, next_index; +	unsigned int next_nr, next_index;  	struct padata_parallel_queue *queue, *next_queue;  	struct padata_priv *padata;  	struct padata_list *reorder; @@ -189,14 +184,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)  	cpu = padata_index_to_cpu(pd, next_index);  	next_queue = per_cpu_ptr(pd->pqueue, cpu); -	if (unlikely(next_nr > pd->max_seq_nr)) { -		next_nr = next_nr - pd->max_seq_nr - 1; -		next_index = next_nr % num_cpus; -		cpu = padata_index_to_cpu(pd, next_index); -		next_queue = per_cpu_ptr(pd->pqueue, cpu); -		pd->processed = 0; -	} -  	padata = NULL;  	reorder = &next_queue->reorder; @@ -205,8 +192,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)  		padata = list_entry(reorder->list.next,  				    struct padata_priv, list); -		BUG_ON(next_nr != padata->seq_nr); -  		spin_lock(&reorder->lock);  		list_del_init(&padata->list);  		atomic_dec(&pd->reorder_objects); @@ -230,6 +215,7 @@ out:  static void padata_reorder(struct parallel_data *pd)  { +	int cb_cpu;  	struct padata_priv *padata;  	struct padata_serial_queue *squeue;  	struct padata_instance *pinst = pd->pinst; 
@@ -270,13 +256,14 @@ static void padata_reorder(struct parallel_data *pd)  			return;  		} -		squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); +		cb_cpu = padata->cb_cpu; +		squeue = per_cpu_ptr(pd->squeue, cb_cpu);  		spin_lock(&squeue->serial.lock);  		list_add_tail(&padata->list, &squeue->serial.list);  		spin_unlock(&squeue->serial.lock); -		queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); +		queue_work_on(cb_cpu, pinst->wq, &squeue->work);  	}  	spin_unlock_bh(&pd->lock); @@ -400,7 +387,7 @@ static void padata_init_squeues(struct parallel_data *pd)  /* Initialize all percpu queues used by parallel workers */  static void padata_init_pqueues(struct parallel_data *pd)  { -	int cpu_index, num_cpus, cpu; +	int cpu_index, cpu;  	struct padata_parallel_queue *pqueue;  	cpu_index = 0; @@ -415,9 +402,6 @@ static void padata_init_pqueues(struct parallel_data *pd)  		INIT_WORK(&pqueue->work, padata_parallel_worker);  		atomic_set(&pqueue->num_obj, 0);  	} - -	num_cpus = cpumask_weight(pd->cpumask.pcpu); -	pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;  }  /* Allocate and initialize the internal cpumask dependend resources. */ @@ -444,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,  	padata_init_pqueues(pd);  	padata_init_squeues(pd);  	setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); -	atomic_set(&pd->seq_nr, -1); +	pd->seq_nr = 0;  	atomic_set(&pd->reorder_objects, 0);  	atomic_set(&pd->refcnt, 0);  	pd->pinst = pinst; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a8968396046d..17b232869a04 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -168,13 +168,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  	while (nr > 0) {  		rcu_read_lock(); -		/* -		 * Any nested-container's init processes won't ignore the -		 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). 
-		 */  		task = pid_task(find_vpid(nr), PIDTYPE_PID); -		if (task) -			send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); +		if (task && !__fatal_signal_pending(task)) +			send_sig_info(SIGKILL, SEND_SIG_FORCED, task);  		rcu_read_unlock(); diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 07e0e28ffba7..66d808ec5252 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,7 +1,8 @@  ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG -obj-$(CONFIG_PM)		+= main.o qos.o +obj-y				+= qos.o +obj-$(CONFIG_PM)		+= main.o  obj-$(CONFIG_VT_CONSOLE_SLEEP)	+= console.o  obj-$(CONFIG_FREEZER)		+= process.o  obj-$(CONFIG_SUSPEND)		+= suspend.o diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 6d6d28870335..0a186cfde788 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,   * create_image - Create a hibernation image.   * @platform_mode: Whether or not to use the platform driver.   * - * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image - * and execute the drivers' .thaw_noirq() callbacks. + * Execute device drivers' "late" and "noirq" freeze callbacks, create a + * hibernation image and run the drivers' "noirq" and "early" thaw callbacks.   *   * Control reappears in this routine after the subsequent restore.   */ @@ -254,7 +254,7 @@ static int create_image(int platform_mode)  {  	int error; -	error = dpm_suspend_noirq(PMSG_FREEZE); +	error = dpm_suspend_end(PMSG_FREEZE);  	if (error) {  		printk(KERN_ERR "PM: Some devices failed to power down, "  			"aborting hibernation\n"); @@ -306,7 +306,7 @@ static int create_image(int platform_mode)   Platform_finish:  	platform_finish(platform_mode); -	dpm_resume_noirq(in_suspend ? +	dpm_resume_start(in_suspend ?  		(error ? 
PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);  	return error; @@ -343,13 +343,13 @@ int hibernation_snapshot(int platform_mode)  		 * successful freezer test.  		 */  		freezer_test_done = true; -		goto Cleanup; +		goto Thaw;  	}  	error = dpm_prepare(PMSG_FREEZE);  	if (error) {  		dpm_complete(PMSG_RECOVER); -		goto Cleanup; +		goto Thaw;  	}  	suspend_console(); @@ -385,6 +385,8 @@ int hibernation_snapshot(int platform_mode)  	platform_end(platform_mode);  	return error; + Thaw: +	thaw_kernel_threads();   Cleanup:  	swsusp_free();  	goto Close; @@ -394,16 +396,16 @@ int hibernation_snapshot(int platform_mode)   * resume_target_kernel - Restore system state from a hibernation image.   * @platform_mode: Whether or not to use the platform driver.   * - * Execute device drivers' .freeze_noirq() callbacks, restore the contents of - * highmem that have not been restored yet from the image and run the low-level - * code that will restore the remaining contents of memory and switch to the - * just restored target kernel. + * Execute device drivers' "noirq" and "late" freeze callbacks, restore the + * contents of highmem that have not been restored yet from the image and run + * the low-level code that will restore the remaining contents of memory and + * switch to the just restored target kernel.   
*/  static int resume_target_kernel(bool platform_mode)  {  	int error; -	error = dpm_suspend_noirq(PMSG_QUIESCE); +	error = dpm_suspend_end(PMSG_QUIESCE);  	if (error) {  		printk(KERN_ERR "PM: Some devices failed to power down, "  			"aborting resume\n"); @@ -460,7 +462,7 @@ static int resume_target_kernel(bool platform_mode)   Cleanup:  	platform_restore_cleanup(platform_mode); -	dpm_resume_noirq(PMSG_RECOVER); +	dpm_resume_start(PMSG_RECOVER);  	return error;  } @@ -518,7 +520,7 @@ int hibernation_platform_enter(void)  		goto Resume_devices;  	} -	error = dpm_suspend_noirq(PMSG_HIBERNATE); +	error = dpm_suspend_end(PMSG_HIBERNATE);  	if (error)  		goto Resume_devices; @@ -549,7 +551,7 @@ int hibernation_platform_enter(void)   Platform_finish:  	hibernation_ops->finish(); -	dpm_resume_noirq(PMSG_RESTORE); +	dpm_resume_start(PMSG_RESTORE);   Resume_devices:  	entering_platform_hibernation = false; @@ -616,7 +618,7 @@ int hibernate(void)  	/* Allocate memory management structures */  	error = create_basic_memory_bitmaps();  	if (error) -		goto Exit; +		goto Enable_umh;  	printk(KERN_INFO "PM: Syncing filesystems ... 
");  	sys_sync(); @@ -624,15 +626,11 @@ int hibernate(void)  	error = freeze_processes();  	if (error) -		goto Finish; +		goto Free_bitmaps;  	error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); -	if (error) -		goto Thaw; -	if (freezer_test_done) { -		freezer_test_done = false; +	if (error || freezer_test_done)  		goto Thaw; -	}  	if (in_suspend) {  		unsigned int flags = 0; @@ -657,8 +655,13 @@ int hibernate(void)   Thaw:  	thaw_processes(); - Finish: + +	/* Don't bother checking whether freezer_test_done is true */ +	freezer_test_done = false; + + Free_bitmaps:  	free_basic_memory_bitmaps(); + Enable_umh:  	usermodehelper_enable();   Exit:  	pm_notifier_call_chain(PM_POST_HIBERNATION); diff --git a/kernel/power/main.c b/kernel/power/main.c index 9824b41e5a18..1c12581f1c62 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused)  	last_errno %= REC_FAILED_NUM;  	last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;  	last_step %= REC_FAILED_NUM; -	seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" -			"%s: %d\n%s: %d\n%s: %d\n%s: %d\n", +	seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" +			"%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",  			"success", suspend_stats.success,  			"fail", suspend_stats.fail,  			"failed_freeze", suspend_stats.failed_freeze,  			"failed_prepare", suspend_stats.failed_prepare,  			"failed_suspend", suspend_stats.failed_suspend, +			"failed_suspend_late", +				suspend_stats.failed_suspend_late,  			"failed_suspend_noirq",  				suspend_stats.failed_suspend_noirq,  			"failed_resume", suspend_stats.failed_resume, +			"failed_resume_early", +				suspend_stats.failed_resume_early,  			"failed_resume_noirq",  				suspend_stats.failed_resume_noirq);  	seq_printf(s,	"failures:\n  last_failed_dev:\t%-s\n", @@ -287,16 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,  #ifdef 
CONFIG_SUSPEND  	for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { -		if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) +		if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { +			error = pm_suspend(state);  			break; -	} -	if (state < PM_SUSPEND_MAX && *s) { -		error = enter_state(state); -		if (error) { -			suspend_stats.fail++; -			dpm_save_failed_errno(error); -		} else -			suspend_stats.success++; +		}  	}  #endif diff --git a/kernel/power/power.h b/kernel/power/power.h index 21724eee5206..98f3622d7407 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -177,13 +177,11 @@ extern const char *const pm_states[];  extern bool valid_state(suspend_state_t state);  extern int suspend_devices_and_enter(suspend_state_t state); -extern int enter_state(suspend_state_t state);  #else /* !CONFIG_SUSPEND */  static inline int suspend_devices_and_enter(suspend_state_t state)  {  	return -ENOSYS;  } -static inline int enter_state(suspend_state_t state) { return -ENOSYS; }  static inline bool valid_state(suspend_state_t state) { return false; }  #endif /* !CONFIG_SUSPEND */ @@ -234,16 +232,14 @@ static inline int suspend_freeze_processes(void)  	int error;  	error = freeze_processes(); -  	/*  	 * freeze_processes() automatically thaws every task if freezing  	 * fails. So we need not do anything extra upon error.  	 */  	if (error) -		goto Finish; +		return error;  	error = freeze_kernel_threads(); -  	/*  	 * freeze_kernel_threads() thaws only kernel threads upon freezing  	 * failure. So we have to thaw the userspace tasks ourselves. @@ -251,7 +247,6 @@ static inline int suspend_freeze_processes(void)  	if (error)  		thaw_processes(); - Finish:  	return error;  } diff --git a/kernel/power/process.c b/kernel/power/process.c index 7e426459e60a..0d2aeb226108 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only)  			 * It is "frozen enough".  
If the task does wake  			 * up, it will immediately call try_to_freeze.  			 * -			 * Because freeze_task() goes through p's -			 * scheduler lock after setting TIF_FREEZE, it's -			 * guaranteed that either we see TASK_RUNNING or -			 * try_to_stop() after schedule() in ptrace/signal -			 * stop sees TIF_FREEZE. +			 * Because freeze_task() goes through p's scheduler lock, it's +			 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING +			 * transition can't race with task state testing here.  			 */  			if (!task_is_stopped_or_traced(p) &&  			    !freezer_should_skip(p)) @@ -98,13 +96,15 @@ static int try_to_freeze_tasks(bool user_only)  		       elapsed_csecs / 100, elapsed_csecs % 100,  		       todo - wq_busy, wq_busy); -		read_lock(&tasklist_lock); -		do_each_thread(g, p) { -			if (!wakeup && !freezer_should_skip(p) && -			    p != current && freezing(p) && !frozen(p)) -				sched_show_task(p); -		} while_each_thread(g, p); -		read_unlock(&tasklist_lock); +		if (!wakeup) { +			read_lock(&tasklist_lock); +			do_each_thread(g, p) { +				if (p != current && !freezer_should_skip(p) +				    && freezing(p) && !frozen(p)) +					sched_show_task(p); +			} while_each_thread(g, p); +			read_unlock(&tasklist_lock); +		}  	} else {  		printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,  			elapsed_csecs % 100); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 995e3bd3417b..d6d6dbd1ecc0 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -469,21 +469,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,  static int __init pm_qos_power_init(void)  {  	int ret = 0; +	int i; -	ret = register_pm_qos_misc(&cpu_dma_pm_qos); -	if (ret < 0) { -		printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); -		return ret; -	} -	ret = register_pm_qos_misc(&network_lat_pm_qos); -	if (ret < 0) { -		printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); -		return ret; +	BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != 
PM_QOS_NUM_CLASSES); + +	for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { +		ret = register_pm_qos_misc(pm_qos_array[i]); +		if (ret < 0) { +			printk(KERN_ERR "pm_qos_param: %s setup failed\n", +			       pm_qos_array[i]->name); +			return ret; +		}  	} -	ret = register_pm_qos_misc(&network_throughput_pm_qos); -	if (ret < 0) -		printk(KERN_ERR -			"pm_qos_param: network_throughput setup failed\n");  	return ret;  } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 6a768e537001..0de28576807d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm)  	list_for_each_entry(region, &nosave_regions, list) {  		unsigned long pfn; -		pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", -				region->start_pfn << PAGE_SHIFT, -				region->end_pfn << PAGE_SHIFT); +		pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", +			 (unsigned long long) region->start_pfn << PAGE_SHIFT, +			 ((unsigned long long) region->end_pfn << PAGE_SHIFT) +				- 1);  		for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)  			if (pfn_valid(pfn)) { @@ -1000,20 +1001,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)  	s_page = pfn_to_page(src_pfn);  	d_page = pfn_to_page(dst_pfn);  	if (PageHighMem(s_page)) { -		src = kmap_atomic(s_page, KM_USER0); -		dst = kmap_atomic(d_page, KM_USER1); +		src = kmap_atomic(s_page); +		dst = kmap_atomic(d_page);  		do_copy_page(dst, src); -		kunmap_atomic(dst, KM_USER1); -		kunmap_atomic(src, KM_USER0); +		kunmap_atomic(dst); +		kunmap_atomic(src);  	} else {  		if (PageHighMem(d_page)) {  			/* Page pointed to by src may contain some kernel  			 * data modified by kmap_atomic()  			 */  			safe_copy_page(buffer, s_page); -			dst = kmap_atomic(d_page, KM_USER0); +			dst = kmap_atomic(d_page);  			copy_page(dst, buffer); -			kunmap_atomic(dst, KM_USER0); +			kunmap_atomic(dst);  		} else {  			
safe_copy_page(page_address(d_page), s_page);  		} @@ -1728,9 +1729,9 @@ int snapshot_read_next(struct snapshot_handle *handle)  			 */  			void *kaddr; -			kaddr = kmap_atomic(page, KM_USER0); +			kaddr = kmap_atomic(page);  			copy_page(buffer, kaddr); -			kunmap_atomic(kaddr, KM_USER0); +			kunmap_atomic(kaddr);  			handle->buffer = buffer;  		} else {  			handle->buffer = page_address(page); @@ -2014,9 +2015,9 @@ static void copy_last_highmem_page(void)  	if (last_highmem_page) {  		void *dst; -		dst = kmap_atomic(last_highmem_page, KM_USER0); +		dst = kmap_atomic(last_highmem_page);  		copy_page(dst, buffer); -		kunmap_atomic(dst, KM_USER0); +		kunmap_atomic(dst);  		last_highmem_page = NULL;  	}  } @@ -2309,13 +2310,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)  {  	void *kaddr1, *kaddr2; -	kaddr1 = kmap_atomic(p1, KM_USER0); -	kaddr2 = kmap_atomic(p2, KM_USER1); +	kaddr1 = kmap_atomic(p1); +	kaddr2 = kmap_atomic(p2);  	copy_page(buf, kaddr1);  	copy_page(kaddr1, kaddr2);  	copy_page(kaddr2, buf); -	kunmap_atomic(kaddr2, KM_USER1); -	kunmap_atomic(kaddr1, KM_USER0); +	kunmap_atomic(kaddr2); +	kunmap_atomic(kaddr1);  }  /** diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4fd51beed879..88e5c967370d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -37,8 +37,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = {  static const struct platform_suspend_ops *suspend_ops;  /** - *	suspend_set_ops - Set the global suspend method table. - *	@ops:	Pointer to ops structure. + * suspend_set_ops - Set the global suspend method table. + * @ops: Suspend operations to use.   */  void suspend_set_ops(const struct platform_suspend_ops *ops)  { @@ -58,11 +58,11 @@ bool valid_state(suspend_state_t state)  }  /** - * suspend_valid_only_mem - generic memory-only valid callback + * suspend_valid_only_mem - Generic memory-only valid callback.   
* - * Platform drivers that implement mem suspend only and only need - * to check for that in their .valid callback can use this instead - * of rolling their own .valid callback. + * Platform drivers that implement mem suspend only and only need to check for + * that in their .valid() callback can use this instead of rolling their own + * .valid() callback.   */  int suspend_valid_only_mem(suspend_state_t state)  { @@ -83,10 +83,11 @@ static int suspend_test(int level)  }  /** - *	suspend_prepare - Do prep work before entering low-power state. + * suspend_prepare - Prepare for entering system sleep state.   * - *	This is common code that is called for each state that we're entering. - *	Run suspend notifiers, allocate a console and stop all processes. + * Common code run for every system sleep state that can be entered (except for + * hibernation).  Run suspend notifiers, allocate the "suspend" console and + * freeze processes.   */  static int suspend_prepare(void)  { @@ -131,9 +132,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)  }  /** - * suspend_enter - enter the desired system sleep state. - * @state: State to enter - * @wakeup: Returns information that suspend should not be entered again. + * suspend_enter - Make the system enter the given sleep state. + * @state: System sleep state to enter. + * @wakeup: Returns information that the sleep state should not be re-entered.   *   * This function should be called after devices have been suspended.   
*/ @@ -147,7 +148,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  			goto Platform_finish;  	} -	error = dpm_suspend_noirq(PMSG_SUSPEND); +	error = dpm_suspend_end(PMSG_SUSPEND);  	if (error) {  		printk(KERN_ERR "PM: Some devices failed to power down\n");  		goto Platform_finish; @@ -189,7 +190,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  	if (suspend_ops->wake)  		suspend_ops->wake(); -	dpm_resume_noirq(PMSG_RESUME); +	dpm_resume_start(PMSG_RESUME);   Platform_finish:  	if (suspend_ops->finish) @@ -199,9 +200,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  }  /** - *	suspend_devices_and_enter - suspend devices and enter the desired system - *				    sleep state. - *	@state:		  state to enter + * suspend_devices_and_enter - Suspend devices and enter system sleep state. + * @state: System sleep state to enter.   */  int suspend_devices_and_enter(suspend_state_t state)  { @@ -251,10 +251,10 @@ int suspend_devices_and_enter(suspend_state_t state)  }  /** - *	suspend_finish - Do final work before exiting suspend sequence. + * suspend_finish - Clean up before finishing the suspend sequence.   * - *	Call platform code to clean up, restart processes, and free the - *	console that we've allocated. This is not called for suspend-to-disk. + * Call platform code to clean up, restart processes, and free the console that + * we've allocated. This routine is not called for hibernation.   */  static void suspend_finish(void)  { @@ -265,16 +265,14 @@ static void suspend_finish(void)  }  /** - *	enter_state - Do common work of entering low-power state. - *	@state:		pm_state structure for state we're entering. + * enter_state - Do common work needed to enter system sleep state. + * @state: System sleep state to enter.   * - *	Make sure we're the only ones trying to enter a sleep state. Fail - *	if someone has beat us to it, since we don't want anything weird to - *	happen when we wake up. 
- *	Then, do the setup for suspend, enter the state, and cleaup (after - *	we've woken up). + * Make sure that no one else is trying to put the system into a sleep state. + * Fail if that's not the case.  Otherwise, prepare for system suspend, make the + * system enter the given sleep state and clean up after wakeup.   */ -int enter_state(suspend_state_t state) +static int enter_state(suspend_state_t state)  {  	int error; @@ -310,24 +308,26 @@ int enter_state(suspend_state_t state)  }  /** - *	pm_suspend - Externally visible function for suspending system. - *	@state:		Enumerated value of state to enter. + * pm_suspend - Externally visible function for suspending the system. + * @state: System sleep state to enter.   * - *	Determine whether or not value is within range, get state - *	structure, and enter (above). + * Check if the value of @state represents one of the supported states, + * execute enter_state() and update system suspend statistics.   */  int pm_suspend(suspend_state_t state)  { -	int ret; -	if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { -		ret = enter_state(state); -		if (ret) { -			suspend_stats.fail++; -			dpm_save_failed_errno(ret); -		} else -			suspend_stats.success++; -		return ret; +	int error; + +	if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) +		return -EINVAL; + +	error = enter_state(state); +	if (error) { +		suspend_stats.fail++; +		dpm_save_failed_errno(error); +	} else { +		suspend_stats.success++;  	} -	return -EINVAL; +	return error;  }  EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/user.c b/kernel/power/user.c index 3e100075b13c..33c4329205af 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -249,16 +249,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  		}  		pm_restore_gfp_mask();  		error = hibernation_snapshot(data->platform_support); -		if (error) { -			thaw_kernel_threads(); -		} else { +		if (!error) {  			error = put_user(in_suspend, (int __user *)arg); -			if (!error 
&& !freezer_test_done) -				data->ready = 1; -			if (freezer_test_done) { -				freezer_test_done = false; -				thaw_kernel_threads(); -			} +			data->ready = !freezer_test_done && !error; +			freezer_test_done = false;  		}  		break; diff --git a/kernel/printk.c b/kernel/printk.c index 13c0a1143f49..b663c2c95d39 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -44,6 +44,9 @@  #include <asm/uaccess.h> +#define CREATE_TRACE_POINTS +#include <trace/events/printk.h> +  /*   * Architectures can override it:   */ @@ -542,6 +545,8 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"  static void _call_console_drivers(unsigned start,  				unsigned end, int msg_log_level)  { +	trace_console(&LOG_BUF(0), start, end, log_buf_len); +  	if ((msg_log_level < console_loglevel || ignore_loglevel) &&  			console_drivers && start != end) {  		if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { @@ -702,6 +707,9 @@ static bool printk_time = 0;  #endif  module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); +static bool always_kmsg_dump; +module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); +  /* Check if we have any console registered that can be called early in boot. 
*/  static int have_callable_console(void)  { @@ -1208,13 +1216,27 @@ int is_console_locked(void)  	return console_locked;  } +/* + * Delayed printk facility, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE		512 + +#define PRINTK_PENDING_WAKEUP	0x01 +#define PRINTK_PENDING_SCHED	0x02 +  static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);  void printk_tick(void)  {  	if (__this_cpu_read(printk_pending)) { -		__this_cpu_write(printk_pending, 0); -		wake_up_interruptible(&log_wait); +		int pending = __this_cpu_xchg(printk_pending, 0); +		if (pending & PRINTK_PENDING_SCHED) { +			char *buf = __get_cpu_var(printk_sched_buf); +			printk(KERN_WARNING "[sched_delayed] %s", buf); +		} +		if (pending & PRINTK_PENDING_WAKEUP) +			wake_up_interruptible(&log_wait);  	}  } @@ -1228,7 +1250,7 @@ int printk_needs_cpu(int cpu)  void wake_up_klogd(void)  {  	if (waitqueue_active(&log_wait)) -		this_cpu_write(printk_pending, 1); +		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);  }  /** @@ -1621,6 +1643,26 @@ late_initcall(printk_late_init);  #if defined CONFIG_PRINTK +int printk_sched(const char *fmt, ...) +{ +	unsigned long flags; +	va_list args; +	char *buf; +	int r; + +	local_irq_save(flags); +	buf = __get_cpu_var(printk_sched_buf); + +	va_start(args, fmt); +	r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); +	va_end(args); + +	__this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); +	local_irq_restore(flags); + +	return r; +} +  /*   * printk rate limiting, lifted from the networking subsystem.   * @@ -1732,6 +1774,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)  	unsigned long l1, l2;  	unsigned long flags; +	if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) +		return; +  	/* Theoretically, the log could move on after we do this, but  	   there's not a lot we can do about that. The new messages  	   will overwrite the start of what we dump. 
*/ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 00ab2ca5ed11..ee8d49b9c309 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -231,26 +231,22 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)  }  static int ptrace_attach(struct task_struct *task, long request, +			 unsigned long addr,  			 unsigned long flags)  {  	bool seize = (request == PTRACE_SEIZE);  	int retval; -	/* -	 * SEIZE will enable new ptrace behaviors which will be implemented -	 * gradually.  SEIZE_DEVEL is used to prevent applications -	 * expecting full SEIZE behaviors trapping on kernel commits which -	 * are still in the process of implementing them. -	 * -	 * Only test programs for new ptrace behaviors being implemented -	 * should set SEIZE_DEVEL.  If unset, SEIZE will fail with -EIO. -	 * -	 * Once SEIZE behaviors are completely implemented, this flag and -	 * the following test will be removed. -	 */  	retval = -EIO; -	if (seize && !(flags & PTRACE_SEIZE_DEVEL)) -		goto out; +	if (seize) { +		if (addr != 0) +			goto out; +		if (flags & ~(unsigned long)PTRACE_O_MASK) +			goto out; +		flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); +	} else { +		flags = PT_PTRACED; +	}  	audit_ptrace(task); @@ -262,7 +258,7 @@ static int ptrace_attach(struct task_struct *task, long request,  	/*  	 * Protect exec's credential calculations against our interference; -	 * interference; SUID, SGID and LSM creds get determined differently +	 * SUID, SGID and LSM creds get determined differently  	 * under ptrace.  	 
*/  	retval = -ERESTARTNOINTR; @@ -282,11 +278,11 @@ static int ptrace_attach(struct task_struct *task, long request,  	if (task->ptrace)  		goto unlock_tasklist; -	task->ptrace = PT_PTRACED;  	if (seize) -		task->ptrace |= PT_SEIZED; +		flags |= PT_SEIZED;  	if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) -		task->ptrace |= PT_PTRACE_CAP; +		flags |= PT_PTRACE_CAP; +	task->ptrace = flags;  	__ptrace_link(task, current); @@ -528,30 +524,18 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds  static int ptrace_setoptions(struct task_struct *child, unsigned long data)  { -	child->ptrace &= ~PT_TRACE_MASK; +	unsigned flags; -	if (data & PTRACE_O_TRACESYSGOOD) -		child->ptrace |= PT_TRACESYSGOOD; - -	if (data & PTRACE_O_TRACEFORK) -		child->ptrace |= PT_TRACE_FORK; - -	if (data & PTRACE_O_TRACEVFORK) -		child->ptrace |= PT_TRACE_VFORK; - -	if (data & PTRACE_O_TRACECLONE) -		child->ptrace |= PT_TRACE_CLONE; - -	if (data & PTRACE_O_TRACEEXEC) -		child->ptrace |= PT_TRACE_EXEC; - -	if (data & PTRACE_O_TRACEVFORKDONE) -		child->ptrace |= PT_TRACE_VFORK_DONE; +	if (data & ~(unsigned long)PTRACE_O_MASK) +		return -EINVAL; -	if (data & PTRACE_O_TRACEEXIT) -		child->ptrace |= PT_TRACE_EXIT; +	/* Avoid intermediate state when all opts are cleared */ +	flags = child->ptrace; +	flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); +	flags |= (data << PT_OPT_FLAG_SHIFT); +	child->ptrace = flags; -	return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; +	return 0;  }  static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) @@ -891,7 +875,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,  	}  	if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { -		ret = ptrace_attach(child, request, data); +		ret = ptrace_attach(child, request, addr, data);  		/*  		 * Some architectures need to do book-keeping after  		 * a ptrace attach. 
@@ -1034,7 +1018,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,  	}  	if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { -		ret = ptrace_attach(child, request, data); +		ret = ptrace_attach(child, request, addr, data);  		/*  		 * Some architectures need to do book-keeping after  		 * a ptrace attach. diff --git a/kernel/rcu.h b/kernel/rcu.h index aa88baab5f78..8ba99cdc6515 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -33,8 +33,27 @@   * Process-level increment to ->dynticks_nesting field.  This allows for   * architectures that use half-interrupts and half-exceptions from   * process context. + * + * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH + * that counts the number of process-based reasons why RCU cannot + * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE + * is the value used to increment or decrement this field. + * + * The rest of the bits could in principle be used to count interrupts, + * but this would mean that a negative-one value in the interrupt + * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field. + * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK + * that is set to DYNTICK_TASK_FLAG upon initial exit from idle. + * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon + * initial exit from idle.   
*/ -#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) +#define DYNTICK_TASK_NEST_WIDTH 7 +#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1) +#define DYNTICK_TASK_NEST_MASK  (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1) +#define DYNTICK_TASK_FLAG	   ((DYNTICK_TASK_NEST_VALUE / 8) * 2) +#define DYNTICK_TASK_MASK	   ((DYNTICK_TASK_NEST_VALUE / 8) * 3) +#define DYNTICK_TASK_EXIT_IDLE	   (DYNTICK_TASK_NEST_VALUE + \ +				    DYNTICK_TASK_FLAG)  /*   * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally @@ -50,7 +69,6 @@ extern struct debug_obj_descr rcuhead_debug_descr;  static inline void debug_rcu_head_queue(struct rcu_head *head)  { -	WARN_ON_ONCE((unsigned long)head & 0x3);  	debug_object_activate(head, &rcuhead_debug_descr);  	debug_object_active_state(head, &rcuhead_debug_descr,  				  STATE_RCU_HEAD_READY, @@ -76,16 +94,18 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)  extern void kfree(const void *); -static inline void __rcu_reclaim(char *rn, struct rcu_head *head) +static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)  {  	unsigned long offset = (unsigned long)head->func;  	if (__is_kfree_rcu_offset(offset)) {  		RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));  		kfree((void *)head - offset); +		return 1;  	} else {  		RCU_TRACE(trace_rcu_invoke_callback(rn, head));  		head->func(head); +		return 0;  	}  } diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2bc4e135ff23..a86f1741cc27 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -88,6 +88,9 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);   * section.   *   * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. + * + * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * offline from an RCU perspective, so check for those as well.   
*/  int rcu_read_lock_bh_held(void)  { @@ -95,6 +98,8 @@ int rcu_read_lock_bh_held(void)  		return 1;  	if (rcu_is_cpu_idle())  		return 0; +	if (!rcu_lockdep_current_cpu_online()) +		return 0;  	return in_softirq() || irqs_disabled();  }  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 977296dca0a4..37a5444204d2 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -53,7 +53,7 @@ static void __call_rcu(struct rcu_head *head,  #include "rcutiny_plugin.h" -static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; +static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;  /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */  static void rcu_idle_enter_common(long long oldval) @@ -88,10 +88,16 @@ void rcu_idle_enter(void)  	local_irq_save(flags);  	oldval = rcu_dynticks_nesting; -	rcu_dynticks_nesting = 0; +	WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); +	if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == +	    DYNTICK_TASK_NEST_VALUE) +		rcu_dynticks_nesting = 0; +	else +		rcu_dynticks_nesting  -= DYNTICK_TASK_NEST_VALUE;  	rcu_idle_enter_common(oldval);  	local_irq_restore(flags);  } +EXPORT_SYMBOL_GPL(rcu_idle_enter);  /*   * Exit an interrupt handler towards idle. @@ -140,11 +146,15 @@ void rcu_idle_exit(void)  	local_irq_save(flags);  	oldval = rcu_dynticks_nesting; -	WARN_ON_ONCE(oldval != 0); -	rcu_dynticks_nesting = DYNTICK_TASK_NESTING; +	WARN_ON_ONCE(rcu_dynticks_nesting < 0); +	if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) +		rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; +	else +		rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;  	rcu_idle_exit_common(oldval);  	local_irq_restore(flags);  } +EXPORT_SYMBOL_GPL(rcu_idle_exit);  /*   * Enter an interrupt handler, moving away from idle. @@ -258,7 +268,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)  	/* If no RCU callbacks ready to invoke, just return. 
*/  	if (&rcp->rcucblist == rcp->donetail) { -		RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); +		RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));  		RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,  					      ACCESS_ONCE(rcp->rcucblist),  					      need_resched(), @@ -269,7 +279,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)  	/* Move the ready-to-invoke callbacks to a local list. */  	local_irq_save(flags); -	RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); +	RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));  	list = rcp->rcucblist;  	rcp->rcucblist = *rcp->donetail;  	*rcp->donetail = NULL; @@ -319,6 +329,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)   */  void synchronize_sched(void)  { +	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && +			   !lock_is_held(&rcu_lock_map) && +			   !lock_is_held(&rcu_sched_lock_map), +			   "Illegal synchronize_sched() in RCU read-side critical section");  	cond_resched();  }  EXPORT_SYMBOL_GPL(synchronize_sched); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 9cb1ae4aabdd..22ecea0dfb62 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -132,6 +132,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {  	RCU_TRACE(.rcb.name = "rcu_preempt")  }; +static void rcu_read_unlock_special(struct task_struct *t);  static int rcu_preempted_readers_exp(void);  static void rcu_report_exp_done(void); @@ -146,6 +147,16 @@ static int rcu_cpu_blocking_cur_gp(void)  /*   * Check for a running RCU reader.  Because there is only one CPU,   * there can be but one running RCU reader at a time.  ;-) + * + * Returns zero if there are no running readers.  Returns a positive + * number if there is at least one reader within its RCU read-side + * critical section.  
Returns a negative number if an outermost reader + * is in the midst of exiting from its RCU read-side critical section + * + * Returns zero if there are no running readers.  Returns a positive + * number if there is at least one reader within its RCU read-side + * critical section.  Returns a negative number if an outermost reader + * is in the midst of exiting from its RCU read-side critical section.   */  static int rcu_preempt_running_reader(void)  { @@ -307,7 +318,6 @@ static int rcu_boost(void)  	t = container_of(tb, struct task_struct, rcu_node_entry);  	rt_mutex_init_proxy_locked(&mtx, t);  	t->rcu_boost_mutex = &mtx; -	t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;  	raw_local_irq_restore(flags);  	rt_mutex_lock(&mtx);  	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */ @@ -475,7 +485,7 @@ void rcu_preempt_note_context_switch(void)  	unsigned long flags;  	local_irq_save(flags); /* must exclude scheduler_tick(). */ -	if (rcu_preempt_running_reader() && +	if (rcu_preempt_running_reader() > 0 &&  	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {  		/* Possibly blocking in an RCU read-side critical section. */ @@ -494,6 +504,13 @@ void rcu_preempt_note_context_switch(void)  		list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);  		if (rcu_cpu_blocking_cur_gp())  			rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; +	} else if (rcu_preempt_running_reader() < 0 && +		   t->rcu_read_unlock_special) { +		/* +		 * Complete exit from RCU read-side critical section on +		 * behalf of preempted instance of __rcu_read_unlock(). +		 */ +		rcu_read_unlock_special(t);  	}  	/* @@ -526,12 +543,15 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);   * notify RCU core processing or task having blocked during the RCU   * read-side critical section.   
*/ -static void rcu_read_unlock_special(struct task_struct *t) +static noinline void rcu_read_unlock_special(struct task_struct *t)  {  	int empty;  	int empty_exp;  	unsigned long flags;  	struct list_head *np; +#ifdef CONFIG_RCU_BOOST +	struct rt_mutex *rbmp = NULL; +#endif /* #ifdef CONFIG_RCU_BOOST */  	int special;  	/* @@ -552,7 +572,7 @@ static void rcu_read_unlock_special(struct task_struct *t)  		rcu_preempt_cpu_qs();  	/* Hardware IRQ handlers cannot block. */ -	if (in_irq()) { +	if (in_irq() || in_serving_softirq()) {  		local_irq_restore(flags);  		return;  	} @@ -597,10 +617,10 @@ static void rcu_read_unlock_special(struct task_struct *t)  	}  #ifdef CONFIG_RCU_BOOST  	/* Unboost self if was boosted. */ -	if (special & RCU_READ_UNLOCK_BOOSTED) { -		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; -		rt_mutex_unlock(t->rcu_boost_mutex); +	if (t->rcu_boost_mutex != NULL) { +		rbmp = t->rcu_boost_mutex;  		t->rcu_boost_mutex = NULL; +		rt_mutex_unlock(rbmp);  	}  #endif /* #ifdef CONFIG_RCU_BOOST */  	local_irq_restore(flags); @@ -618,13 +638,22 @@ void __rcu_read_unlock(void)  	struct task_struct *t = current;  	barrier();  /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ -	--t->rcu_read_lock_nesting; -	barrier();  /* decrement before load of ->rcu_read_unlock_special */ -	if (t->rcu_read_lock_nesting == 0 && -	    unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) -		rcu_read_unlock_special(t); +	if (t->rcu_read_lock_nesting != 1) +		--t->rcu_read_lock_nesting; +	else { +		t->rcu_read_lock_nesting = INT_MIN; +		barrier();  /* assign before ->rcu_read_unlock_special load */ +		if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) +			rcu_read_unlock_special(t); +		barrier();  /* ->rcu_read_unlock_special load before assign */ +		t->rcu_read_lock_nesting = 0; +	}  #ifdef CONFIG_PROVE_LOCKING -	WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); +	{ +		int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + +		WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN 
/ 2); +	}  #endif /* #ifdef CONFIG_PROVE_LOCKING */  }  EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -649,7 +678,7 @@ static void rcu_preempt_check_callbacks(void)  		invoke_rcu_callbacks();  	if (rcu_preempt_gp_in_progress() &&  	    rcu_cpu_blocking_cur_gp() && -	    rcu_preempt_running_reader()) +	    rcu_preempt_running_reader() > 0)  		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;  } @@ -706,6 +735,11 @@ EXPORT_SYMBOL_GPL(call_rcu);   */  void synchronize_rcu(void)  { +	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && +			   !lock_is_held(&rcu_lock_map) && +			   !lock_is_held(&rcu_sched_lock_map), +			   "Illegal synchronize_rcu() in RCU read-side critical section"); +  #ifdef CONFIG_DEBUG_LOCK_ALLOC  	if (!rcu_scheduler_active)  		return; @@ -882,7 +916,8 @@ static void rcu_preempt_process_callbacks(void)  static void invoke_rcu_callbacks(void)  {  	have_rcu_kthread_work = 1; -	wake_up(&rcu_kthread_wq); +	if (rcu_kthread_task != NULL) +		wake_up(&rcu_kthread_wq);  }  #ifdef CONFIG_RCU_TRACE @@ -943,12 +978,16 @@ early_initcall(rcu_spawn_kthreads);  #else /* #ifdef CONFIG_RCU_BOOST */ +/* Hold off callback invocation until early_initcall() time. */ +static int rcu_scheduler_fully_active __read_mostly; +  /*   * Start up softirq processing of callbacks.   */  void invoke_rcu_callbacks(void)  { -	raise_softirq(RCU_SOFTIRQ); +	if (rcu_scheduler_fully_active) +		raise_softirq(RCU_SOFTIRQ);  }  #ifdef CONFIG_RCU_TRACE @@ -963,10 +1002,14 @@ static bool rcu_is_callbacks_kthread(void)  #endif /* #ifdef CONFIG_RCU_TRACE */ -void rcu_init(void) +static int __init rcu_scheduler_really_started(void)  { +	rcu_scheduler_fully_active = 1;  	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +	raise_softirq(RCU_SOFTIRQ);  /* Invoke any callbacks from early boot. 
*/ +	return 0;  } +early_initcall(rcu_scheduler_really_started);  #endif /* #else #ifdef CONFIG_RCU_BOOST */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a58ac285fc69..a89b381a8c6e 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -65,7 +65,10 @@ static int fqs_duration;	/* Duration of bursts (us), 0 to disable. */  static int fqs_holdoff;		/* Hold time within burst (us). */  static int fqs_stutter = 3;	/* Wait time between bursts (s). */  static int onoff_interval;	/* Wait time between CPU hotplugs, 0=disable. */ +static int onoff_holdoff;	/* Seconds after boot before CPU hotplugs. */  static int shutdown_secs;	/* Shutdown time (s).  <=0 for no shutdown. */ +static int stall_cpu;		/* CPU-stall duration (s).  0 for no stall. */ +static int stall_cpu_holdoff = 10; /* Time to wait until stall (s).  */  static int test_boost = 1;	/* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */  static int test_boost_interval = 7; /* Interval between boost tests, seconds. */  static int test_boost_duration = 4; /* Duration of each boost test, seconds. 
*/ @@ -95,8 +98,14 @@ module_param(fqs_stutter, int, 0444);  MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");  module_param(onoff_interval, int, 0444);  MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +module_param(onoff_holdoff, int, 0444); +MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");  module_param(shutdown_secs, int, 0444);  MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); +module_param(stall_cpu, int, 0444); +MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); +module_param(stall_cpu_holdoff, int, 0444); +MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");  module_param(test_boost, int, 0444);  MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");  module_param(test_boost_interval, int, 0444); @@ -129,6 +138,7 @@ static struct task_struct *shutdown_task;  #ifdef CONFIG_HOTPLUG_CPU  static struct task_struct *onoff_task;  #endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static struct task_struct *stall_task;  #define RCU_TORTURE_PIPE_LEN 10 @@ -990,12 +1000,12 @@ static void rcu_torture_timer(unsigned long unused)  				  rcu_read_lock_bh_held() ||  				  rcu_read_lock_sched_held() ||  				  srcu_read_lock_held(&srcu_ctl)); -	do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);  	if (p == NULL) {  		/* Leave because rcu_torture_writer is not yet underway */  		cur_ops->readunlock(idx);  		return;  	} +	do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);  	if (p->rtort_mbtest == 0)  		atomic_inc(&n_rcu_torture_mberror);  	spin_lock(&rand_lock); @@ -1053,13 +1063,13 @@ rcu_torture_reader(void *arg)  					  rcu_read_lock_bh_held() ||  					  rcu_read_lock_sched_held() ||  					  srcu_read_lock_held(&srcu_ctl)); -		do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);  		if (p == NULL) {  			/* Wait for rcu_torture_writer to get underway */  			cur_ops->readunlock(idx);  			
schedule_timeout_interruptible(HZ);  			continue;  		} +		do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);  		if (p->rtort_mbtest == 0)  			atomic_inc(&n_rcu_torture_mberror);  		cur_ops->read_delay(&rand); @@ -1300,13 +1310,13 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)  		"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "  		"test_boost=%d/%d test_boost_interval=%d "  		"test_boost_duration=%d shutdown_secs=%d " -		"onoff_interval=%d\n", +		"onoff_interval=%d onoff_holdoff=%d\n",  		torture_type, tag, nrealreaders, nfakewriters,  		stat_interval, verbose, test_no_idle_hz, shuffle_interval,  		stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,  		test_boost, cur_ops->can_boost,  		test_boost_interval, test_boost_duration, shutdown_secs, -		onoff_interval); +		onoff_interval, onoff_holdoff);  }  static struct notifier_block rcutorture_shutdown_nb = { @@ -1410,6 +1420,11 @@ rcu_torture_onoff(void *arg)  	for_each_online_cpu(cpu)  		maxcpu = cpu;  	WARN_ON(maxcpu < 0); +	if (onoff_holdoff > 0) { +		VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); +		schedule_timeout_interruptible(onoff_holdoff * HZ); +		VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); +	}  	while (!kthread_should_stop()) {  		cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);  		if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { @@ -1450,12 +1465,15 @@ rcu_torture_onoff(void *arg)  static int __cpuinit  rcu_torture_onoff_init(void)  { +	int ret; +  	if (onoff_interval <= 0)  		return 0;  	onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");  	if (IS_ERR(onoff_task)) { +		ret = PTR_ERR(onoff_task);  		onoff_task = NULL; -		return PTR_ERR(onoff_task); +		return ret;  	}  	return 0;  } @@ -1481,6 +1499,63 @@ static void rcu_torture_onoff_cleanup(void)  #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ +/* + * CPU-stall kthread.  
It waits as specified by stall_cpu_holdoff, then + * induces a CPU stall for the time specified by stall_cpu. + */ +static int __cpuinit rcu_torture_stall(void *args) +{ +	unsigned long stop_at; + +	VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); +	if (stall_cpu_holdoff > 0) { +		VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); +		schedule_timeout_interruptible(stall_cpu_holdoff * HZ); +		VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); +	} +	if (!kthread_should_stop()) { +		stop_at = get_seconds() + stall_cpu; +		/* RCU CPU stall is expected behavior in following code. */ +		printk(KERN_ALERT "rcu_torture_stall start.\n"); +		rcu_read_lock(); +		preempt_disable(); +		while (ULONG_CMP_LT(get_seconds(), stop_at)) +			continue;  /* Induce RCU CPU stall warning. */ +		preempt_enable(); +		rcu_read_unlock(); +		printk(KERN_ALERT "rcu_torture_stall end.\n"); +	} +	rcutorture_shutdown_absorb("rcu_torture_stall"); +	while (!kthread_should_stop()) +		schedule_timeout_interruptible(10 * HZ); +	return 0; +} + +/* Spawn CPU-stall kthread, if stall_cpu specified. */ +static int __init rcu_torture_stall_init(void) +{ +	int ret; + +	if (stall_cpu <= 0) +		return 0; +	stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); +	if (IS_ERR(stall_task)) { +		ret = PTR_ERR(stall_task); +		stall_task = NULL; +		return ret; +	} +	return 0; +} + +/* Clean up after the CPU-stall kthread, if one was spawned. 
*/ +static void rcu_torture_stall_cleanup(void) +{ +	if (stall_task == NULL) +		return; +	VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); +	kthread_stop(stall_task); +} +  static int rcutorture_cpu_notify(struct notifier_block *self,  				 unsigned long action, void *hcpu)  { @@ -1523,6 +1598,7 @@ rcu_torture_cleanup(void)  	fullstop = FULLSTOP_RMMOD;  	mutex_unlock(&fullstop_mutex);  	unregister_reboot_notifier(&rcutorture_shutdown_nb); +	rcu_torture_stall_cleanup();  	if (stutter_task) {  		VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");  		kthread_stop(stutter_task); @@ -1602,6 +1678,10 @@ rcu_torture_cleanup(void)  		cur_ops->cleanup();  	if (atomic_read(&n_rcu_torture_error))  		rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); +	else if (n_online_successes != n_online_attempts || +		 n_offline_successes != n_offline_attempts) +		rcu_torture_print_module_parms(cur_ops, +					       "End of test: RCU_HOTPLUG");  	else  		rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");  } @@ -1819,6 +1899,7 @@ rcu_torture_init(void)  	}  	rcu_torture_onoff_init();  	register_reboot_notifier(&rcutorture_shutdown_nb); +	rcu_torture_stall_init();  	rcutorture_record_test_transition();  	mutex_unlock(&fullstop_mutex);  	return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6c4a6722abfd..1050d6d3922c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -50,6 +50,8 @@  #include <linux/wait.h>  #include <linux/kthread.h>  #include <linux/prefetch.h> +#include <linux/delay.h> +#include <linux/stop_machine.h>  #include "rcutree.h"  #include <trace/events/rcu.h> @@ -196,7 +198,7 @@ void rcu_note_context_switch(int cpu)  EXPORT_SYMBOL_GPL(rcu_note_context_switch);  DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { -	.dynticks_nesting = DYNTICK_TASK_NESTING, +	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,  	.dynticks = ATOMIC_INIT(1),  }; @@ -208,8 +210,11 @@ module_param(blimit, int, 0);  module_param(qhimark, int, 0);  
module_param(qlowmark, int, 0); -int rcu_cpu_stall_suppress __read_mostly; +int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; +  module_param(rcu_cpu_stall_suppress, int, 0644); +module_param(rcu_cpu_stall_timeout, int, 0644);  static void force_quiescent_state(struct rcu_state *rsp, int relaxed);  static int rcu_pending(int cpu); @@ -301,8 +306,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)  	return &rsp->node[0];  } -#ifdef CONFIG_SMP -  /*   * If the specified CPU is offline, tell the caller that it is in   * a quiescent state.  Otherwise, whack it with a reschedule IPI. @@ -317,30 +320,21 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)  static int rcu_implicit_offline_qs(struct rcu_data *rdp)  {  	/* -	 * If the CPU is offline, it is in a quiescent state.  We can -	 * trust its state not to change because interrupts are disabled. +	 * If the CPU is offline for more than a jiffy, it is in a quiescent +	 * state.  We can trust its state not to change because interrupts +	 * are disabled.  The reason for the jiffy's worth of slack is to +	 * handle CPUs initializing on the way up and finding their way +	 * to the idle loop on the way down.  	 */ -	if (cpu_is_offline(rdp->cpu)) { +	if (cpu_is_offline(rdp->cpu) && +	    ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) {  		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");  		rdp->offline_fqs++;  		return 1;  	} - -	/* -	 * The CPU is online, so send it a reschedule IPI.  This forces -	 * it through the scheduler, and (inefficiently) also handles cases -	 * where idle loops fail to inform RCU about the CPU being idle. 
-	 */ -	if (rdp->cpu != smp_processor_id()) -		smp_send_reschedule(rdp->cpu); -	else -		set_need_resched(); -	rdp->resched_ipi++;  	return 0;  } -#endif /* #ifdef CONFIG_SMP */ -  /*   * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle   * @@ -366,6 +360,17 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)  	atomic_inc(&rdtp->dynticks);  	smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */  	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + +	/* +	 * The idle task is not permitted to enter the idle loop while +	 * in an RCU read-side critical section. +	 */ +	rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), +			   "Illegal idle entry in RCU read-side critical section."); +	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), +			   "Illegal idle entry in RCU-bh read-side critical section."); +	rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), +			   "Illegal idle entry in RCU-sched read-side critical section.");  }  /** @@ -389,10 +394,15 @@ void rcu_idle_enter(void)  	local_irq_save(flags);  	rdtp = &__get_cpu_var(rcu_dynticks);  	oldval = rdtp->dynticks_nesting; -	rdtp->dynticks_nesting = 0; +	WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); +	if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) +		rdtp->dynticks_nesting = 0; +	else +		rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;  	rcu_idle_enter_common(rdtp, oldval);  	local_irq_restore(flags);  } +EXPORT_SYMBOL_GPL(rcu_idle_enter);  /**   * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -462,7 +472,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)   * Exit idle mode, in other words, -enter- the mode in which RCU   * read-side critical sections can occur.   
* - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to + * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to   * allow for the possibility of usermode upcalls messing up our count   * of interrupt nesting level during the busy period that is just   * now starting. @@ -476,11 +486,15 @@ void rcu_idle_exit(void)  	local_irq_save(flags);  	rdtp = &__get_cpu_var(rcu_dynticks);  	oldval = rdtp->dynticks_nesting; -	WARN_ON_ONCE(oldval != 0); -	rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; +	WARN_ON_ONCE(oldval < 0); +	if (oldval & DYNTICK_TASK_NEST_MASK) +		rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; +	else +		rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;  	rcu_idle_exit_common(rdtp, oldval);  	local_irq_restore(flags);  } +EXPORT_SYMBOL_GPL(rcu_idle_exit);  /**   * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -581,6 +595,49 @@ int rcu_is_cpu_idle(void)  }  EXPORT_SYMBOL(rcu_is_cpu_idle); +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Is the current CPU online?  Disable preemption to avoid false positives + * that could otherwise happen due to the current CPU number being sampled, + * this task being preempted, its old CPU being taken offline, resuming + * on some other CPU, then determining that its old CPU is now offline. + * It is OK to use RCU on an offline processor during initial boot, hence + * the check for rcu_scheduler_fully_active.  Note also that it is OK + * for a CPU coming online to use RCU for one jiffy prior to marking itself + * online in the cpu_online_mask.  Similarly, it is OK for a CPU going + * offline to continue to use RCU for one jiffy after marking itself + * offline in the cpu_online_mask.  This leniency is necessary given the + * non-atomic nature of the online and offline processing, for example, + * the fact that a CPU enters the scheduler after completing the CPU_DYING + * notifiers. 
+ * + * This is also why RCU internally marks CPUs online during the + * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. + * + * Disable checking if in an NMI handler because we cannot safely report + * errors from NMI handlers anyway. + */ +bool rcu_lockdep_current_cpu_online(void) +{ +	struct rcu_data *rdp; +	struct rcu_node *rnp; +	bool ret; + +	if (in_nmi()) +		return 1; +	preempt_disable(); +	rdp = &__get_cpu_var(rcu_sched_data); +	rnp = rdp->mynode; +	ret = (rdp->grpmask & rnp->qsmaskinit) || +	      !rcu_scheduler_fully_active; +	preempt_enable(); +	return ret; +} +EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +  #endif /* #ifdef CONFIG_PROVE_RCU */  /** @@ -595,8 +652,6 @@ int rcu_is_cpu_rrupt_from_idle(void)  	return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;  } -#ifdef CONFIG_SMP -  /*   * Snapshot the specified CPU's dynticks counter so that we can later   * credit them with an implicit quiescent state.  Return 1 if this CPU @@ -640,12 +695,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)  	return rcu_implicit_offline_qs(rdp);  } -#endif /* #ifdef CONFIG_SMP */ +static int jiffies_till_stall_check(void) +{ +	int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + +	/* +	 * Limit check must be consistent with the Kconfig limits +	 * for CONFIG_RCU_CPU_STALL_TIMEOUT. 
+	 */ +	if (till_stall_check < 3) { +		ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; +		till_stall_check = 3; +	} else if (till_stall_check > 300) { +		ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; +		till_stall_check = 300; +	} +	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +}  static void record_gp_stall_check_time(struct rcu_state *rsp)  {  	rsp->gp_start = jiffies; -	rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +	rsp->jiffies_stall = jiffies + jiffies_till_stall_check();  }  static void print_other_cpu_stall(struct rcu_state *rsp) @@ -664,13 +735,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  		raw_spin_unlock_irqrestore(&rnp->lock, flags);  		return;  	} -	rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - -	/* -	 * Now rat on any tasks that got kicked up to the root rcu_node -	 * due to CPU offlining. -	 */ -	ndetected = rcu_print_task_stall(rnp); +	rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;  	raw_spin_unlock_irqrestore(&rnp->lock, flags);  	/* @@ -678,8 +743,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  	 * See Documentation/RCU/stallwarn.txt for info on how to debug  	 * RCU CPU stall warnings.  	 */ -	printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", +	printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:",  	       rsp->name); +	print_cpu_stall_info_begin();  	rcu_for_each_leaf_node(rsp, rnp) {  		raw_spin_lock_irqsave(&rnp->lock, flags);  		ndetected += rcu_print_task_stall(rnp); @@ -688,11 +754,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  			continue;  		for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)  			if (rnp->qsmask & (1UL << cpu)) { -				printk(" %d", rnp->grplo + cpu); +				print_cpu_stall_info(rsp, rnp->grplo + cpu);  				ndetected++;  			}  	} -	printk("} (detected by %d, t=%ld jiffies)\n", + +	/* +	 * Now rat on any tasks that got kicked up to the root rcu_node +	 * due to CPU offlining. 
+	 */ +	rnp = rcu_get_root(rsp); +	raw_spin_lock_irqsave(&rnp->lock, flags); +	ndetected = rcu_print_task_stall(rnp); +	raw_spin_unlock_irqrestore(&rnp->lock, flags); + +	print_cpu_stall_info_end(); +	printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n",  	       smp_processor_id(), (long)(jiffies - rsp->gp_start));  	if (ndetected == 0)  		printk(KERN_ERR "INFO: Stall ended before state dump start\n"); @@ -716,15 +793,18 @@ static void print_cpu_stall(struct rcu_state *rsp)  	 * See Documentation/RCU/stallwarn.txt for info on how to debug  	 * RCU CPU stall warnings.  	 */ -	printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", -	       rsp->name, smp_processor_id(), jiffies - rsp->gp_start); +	printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); +	print_cpu_stall_info_begin(); +	print_cpu_stall_info(rsp, smp_processor_id()); +	print_cpu_stall_info_end(); +	printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start);  	if (!trigger_all_cpu_backtrace())  		dump_stack();  	raw_spin_lock_irqsave(&rnp->lock, flags);  	if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) -		rsp->jiffies_stall = -			jiffies + RCU_SECONDS_TILL_STALL_RECHECK; +		rsp->jiffies_stall = jiffies + +				     3 * jiffies_till_stall_check() + 3;  	raw_spin_unlock_irqrestore(&rnp->lock, flags);  	set_need_resched();  /* kick ourselves to get things going. */ @@ -807,6 +887,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct  			rdp->passed_quiesce = 0;  		} else  			rdp->qs_pending = 0; +		zero_cpu_stall_ticks(rdp);  	}  } @@ -943,6 +1024,10 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat   * in preparation for detecting the next grace period.  The caller must hold   * the root node's ->lock, which is released before return.  Hard irqs must   * be disabled. + * + * Note that it is legal for a dying CPU (which is marked as offline) to + * invoke this function.  
This can happen when the dying CPU reports its + * quiescent state.   */  static void  rcu_start_gp(struct rcu_state *rsp, unsigned long flags) @@ -980,26 +1065,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)  	rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */  	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;  	record_gp_stall_check_time(rsp); - -	/* Special-case the common single-level case. */ -	if (NUM_RCU_NODES == 1) { -		rcu_preempt_check_blocked_tasks(rnp); -		rnp->qsmask = rnp->qsmaskinit; -		rnp->gpnum = rsp->gpnum; -		rnp->completed = rsp->completed; -		rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ -		rcu_start_gp_per_cpu(rsp, rnp, rdp); -		rcu_preempt_boost_start_gp(rnp); -		trace_rcu_grace_period_init(rsp->name, rnp->gpnum, -					    rnp->level, rnp->grplo, -					    rnp->grphi, rnp->qsmask); -		raw_spin_unlock_irqrestore(&rnp->lock, flags); -		return; -	} -  	raw_spin_unlock(&rnp->lock);  /* leave irqs disabled. */ -  	/* Exclude any concurrent CPU-hotplug operations. */  	raw_spin_lock(&rsp->onofflock);  /* irqs already disabled. */ @@ -1245,53 +1312,115 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)  /*   * Move a dying CPU's RCU callbacks to online CPU's callback list. - * Synchronization is not required because this function executes - * in stop_machine() context. + * Also record a quiescent state for this CPU for the current grace period. + * Synchronization and interrupt disabling are not required because + * this function executes in stop_machine() context.  Therefore, cleanup + * operations that might block must be done later from the CPU_DEAD + * notifier. + * + * Note that the outgoing CPU's bit has already been cleared in the + * cpu_online_mask.  This allows us to randomly pick a callback + * destination from the bits set in that mask.   
*/ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)  {  	int i; -	/* current DYING CPU is cleared in the cpu_online_mask */ +	unsigned long mask;  	int receive_cpu = cpumask_any(cpu_online_mask);  	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);  	struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); +	RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ + +	/* First, adjust the counts. */ +	if (rdp->nxtlist != NULL) { +		receive_rdp->qlen_lazy += rdp->qlen_lazy; +		receive_rdp->qlen += rdp->qlen; +		rdp->qlen_lazy = 0; +		rdp->qlen = 0; +	} -	if (rdp->nxtlist == NULL) -		return;  /* irqs disabled, so comparison is stable. */ +	/* +	 * Next, move ready-to-invoke callbacks to be invoked on some +	 * other CPU.  These will not be required to pass through another +	 * grace period:  They are done, regardless of CPU. +	 */ +	if (rdp->nxtlist != NULL && +	    rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { +		struct rcu_head *oldhead; +		struct rcu_head **oldtail; +		struct rcu_head **newtail; + +		oldhead = rdp->nxtlist; +		oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; +		rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; +		*rdp->nxttail[RCU_DONE_TAIL] = *oldtail; +		*receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; +		newtail = rdp->nxttail[RCU_DONE_TAIL]; +		for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { +			if (receive_rdp->nxttail[i] == oldtail) +				receive_rdp->nxttail[i] = newtail; +			if (rdp->nxttail[i] == newtail) +				rdp->nxttail[i] = &rdp->nxtlist; +		} +	} -	*receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; -	receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; -	receive_rdp->qlen += rdp->qlen; -	receive_rdp->n_cbs_adopted += rdp->qlen; -	rdp->n_cbs_orphaned += rdp->qlen; +	/* +	 * Finally, put the rest of the callbacks at the end of the list. 
+	 * The ones that made it partway through get to start over:  We +	 * cannot assume that grace periods are synchronized across CPUs. +	 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but +	 * this does not seem compelling.  Not yet, anyway.) +	 */ +	if (rdp->nxtlist != NULL) { +		*receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; +		receive_rdp->nxttail[RCU_NEXT_TAIL] = +				rdp->nxttail[RCU_NEXT_TAIL]; +		receive_rdp->n_cbs_adopted += rdp->qlen; +		rdp->n_cbs_orphaned += rdp->qlen; + +		rdp->nxtlist = NULL; +		for (i = 0; i < RCU_NEXT_SIZE; i++) +			rdp->nxttail[i] = &rdp->nxtlist; +	} -	rdp->nxtlist = NULL; -	for (i = 0; i < RCU_NEXT_SIZE; i++) -		rdp->nxttail[i] = &rdp->nxtlist; -	rdp->qlen = 0; +	/* +	 * Record a quiescent state for the dying CPU.  This is safe +	 * only because we have already cleared out the callbacks. +	 * (Otherwise, the RCU core might try to schedule the invocation +	 * of callbacks on this now-offline CPU, which would be bad.) +	 */ +	mask = rdp->grpmask;	/* rnp->grplo is constant. */ +	trace_rcu_grace_period(rsp->name, +			       rnp->gpnum + 1 - !!(rnp->qsmask & mask), +			       "cpuofl"); +	rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); +	/* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */  }  /* - * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy - * and move all callbacks from the outgoing CPU to the current one. + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context.  Do the remainder of the cleanup.   * There can only be one CPU hotplug operation at a time, so no other   * CPU can be attempting to update rcu_cpu_kthread_task.   
*/ -static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)  {  	unsigned long flags;  	unsigned long mask;  	int need_report = 0;  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); -	struct rcu_node *rnp; +	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rnp. */ +	/* Adjust any no-longer-needed kthreads. */  	rcu_stop_cpu_kthread(cpu); +	rcu_node_kthread_setaffinity(rnp, -1); + +	/* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */  	/* Exclude any attempts to start a new grace period. */  	raw_spin_lock_irqsave(&rsp->onofflock, flags);  	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ -	rnp = rdp->mynode;	/* this is the outgoing CPU's rnp. */  	mask = rdp->grpmask;	/* rnp->grplo is constant. */  	do {  		raw_spin_lock(&rnp->lock);	/* irqs already disabled. */ @@ -1299,20 +1428,11 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)  		if (rnp->qsmaskinit != 0) {  			if (rnp != rdp->mynode)  				raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ -			else -				trace_rcu_grace_period(rsp->name, -						       rnp->gpnum + 1 - -						       !!(rnp->qsmask & mask), -						       "cpuofl");  			break;  		} -		if (rnp == rdp->mynode) { -			trace_rcu_grace_period(rsp->name, -					       rnp->gpnum + 1 - -					       !!(rnp->qsmask & mask), -					       "cpuofl"); +		if (rnp == rdp->mynode)  			need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); -		} else +		else  			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/  		mask = rnp->grpmask;  		rnp = rnp->parent; @@ -1332,29 +1452,15 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)  		raw_spin_unlock_irqrestore(&rnp->lock, flags);  	if (need_report & RCU_OFL_TASKS_EXP_GP)  		rcu_report_exp_rnp(rsp, rnp, true); -	rcu_node_kthread_setaffinity(rnp, -1); -} - -/* - * Remove the specified CPU from the RCU hierarchy and move any pending - * callbacks that it might have to the current CPU.  This code assumes - * that at least one CPU in the system will remain running at all times. - * Any attempt to offline -all- CPUs is likely to strand RCU callbacks. - */ -static void rcu_offline_cpu(int cpu) -{ -	__rcu_offline_cpu(cpu, &rcu_sched_state); -	__rcu_offline_cpu(cpu, &rcu_bh_state); -	rcu_preempt_offline_cpu(cpu);  }  #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)  {  } -static void rcu_offline_cpu(int cpu) +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)  {  } @@ -1368,11 +1474,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  {  	unsigned long flags;  	struct rcu_head *next, *list, **tail; -	int bl, count; +	int bl, count, count_lazy;  	/* If no callbacks are ready, just return.*/  	if (!cpu_has_callbacks_ready_to_invoke(rdp)) { -		trace_rcu_batch_start(rsp->name, 0, 0); +		trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);  		trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),  				    need_resched(), is_idle_task(current),  				    rcu_is_callbacks_kthread()); @@ -1384,8 +1490,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  	 * races with call_rcu() from interrupt handlers.  	 
*/  	local_irq_save(flags); +	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));  	bl = rdp->blimit; -	trace_rcu_batch_start(rsp->name, rdp->qlen, bl); +	trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);  	list = rdp->nxtlist;  	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];  	*rdp->nxttail[RCU_DONE_TAIL] = NULL; @@ -1396,12 +1503,13 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  	local_irq_restore(flags);  	/* Invoke callbacks. */ -	count = 0; +	count = count_lazy = 0;  	while (list) {  		next = list->next;  		prefetch(next);  		debug_rcu_head_unqueue(list); -		__rcu_reclaim(rsp->name, list); +		if (__rcu_reclaim(rsp->name, list)) +			count_lazy++;  		list = next;  		/* Stop only if limit reached and CPU has something to do. */  		if (++count >= bl && @@ -1416,6 +1524,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  			    rcu_is_callbacks_kthread());  	/* Update count, and requeue any remaining callbacks. */ +	rdp->qlen_lazy -= count_lazy;  	rdp->qlen -= count;  	rdp->n_cbs_invoked += count;  	if (list != NULL) { @@ -1458,6 +1567,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  void rcu_check_callbacks(int cpu, int user)  {  	trace_rcu_utilization("Start scheduler-tick"); +	increment_cpu_stall_ticks();  	if (user || rcu_is_cpu_rrupt_from_idle()) {  		/* @@ -1492,8 +1602,6 @@ void rcu_check_callbacks(int cpu, int user)  	trace_rcu_utilization("End scheduler-tick");  } -#ifdef CONFIG_SMP -  /*   * Scan the leaf rcu_node structures, processing dyntick state for any that   * have not yet encountered a quiescent state, using the function specified. 
@@ -1616,15 +1724,6 @@ unlock_fqs_ret:  	trace_rcu_utilization("End fqs");  } -#else /* #ifdef CONFIG_SMP */ - -static void force_quiescent_state(struct rcu_state *rsp, int relaxed) -{ -	set_need_resched(); -} - -#endif /* #else #ifdef CONFIG_SMP */ -  /*   * This does the RCU core processing work for the specified rcu_state   * and rcu_data structures.  This may be called only from the CPU to @@ -1702,11 +1801,12 @@ static void invoke_rcu_core(void)  static void  __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), -	   struct rcu_state *rsp) +	   struct rcu_state *rsp, bool lazy)  {  	unsigned long flags;  	struct rcu_data *rdp; +	WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */  	debug_rcu_head_queue(head);  	head->func = func;  	head->next = NULL; @@ -1720,18 +1820,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),  	 * a quiescent state betweentimes.  	 */  	local_irq_save(flags); +	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));  	rdp = this_cpu_ptr(rsp->rda);  	/* Add the callback to our list. */  	*rdp->nxttail[RCU_NEXT_TAIL] = head;  	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;  	rdp->qlen++; +	if (lazy) +		rdp->qlen_lazy++;  	if (__is_kfree_rcu_offset((unsigned long)func))  		trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, -					 rdp->qlen); +					 rdp->qlen_lazy, rdp->qlen);  	else -		trace_rcu_callback(rsp->name, head, rdp->qlen); +		trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);  	/* If interrupts were disabled, don't dive into RCU core. */  	if (irqs_disabled_flags(flags)) { @@ -1778,16 +1881,16 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),   */  void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))  { -	__call_rcu(head, func, &rcu_sched_state); +	__call_rcu(head, func, &rcu_sched_state, 0);  }  EXPORT_SYMBOL_GPL(call_rcu_sched);  /* - * Queue an RCU for invocation after a quicker grace period. 
+ * Queue an RCU callback for invocation after a quicker grace period.   */  void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))  { -	__call_rcu(head, func, &rcu_bh_state); +	__call_rcu(head, func, &rcu_bh_state, 0);  }  EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -1816,6 +1919,10 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);   */  void synchronize_sched(void)  { +	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && +			   !lock_is_held(&rcu_lock_map) && +			   !lock_is_held(&rcu_sched_lock_map), +			   "Illegal synchronize_sched() in RCU-sched read-side critical section");  	if (rcu_blocking_is_gp())  		return;  	wait_rcu_gp(call_rcu_sched); @@ -1833,12 +1940,137 @@ EXPORT_SYMBOL_GPL(synchronize_sched);   */  void synchronize_rcu_bh(void)  { +	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && +			   !lock_is_held(&rcu_lock_map) && +			   !lock_is_held(&rcu_sched_lock_map), +			   "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");  	if (rcu_blocking_is_gp())  		return;  	wait_rcu_gp(call_rcu_bh);  }  EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ +	/* +	 * There must be a full memory barrier on each affected CPU +	 * between the time that try_stop_cpus() is called and the +	 * time that it returns. +	 * +	 * In the current initial implementation of cpu_stop, the +	 * above condition is already met when the control reaches +	 * this point and the following smp_mb() is not strictly +	 * necessary.  Do smp_mb() anyway for documentation and +	 * robustness against future implementation changes. +	 */ +	smp_mb(); /* See above comment block. */ +	return 0; +} + +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. 
 This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code.  In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal + * to call this function from a CPU-hotplug notifier.  Failing to observe + * these restriction will result in deadlock. + * + * This implementation can be thought of as an application of ticket + * locking to RCU, with sync_sched_expedited_started and + * sync_sched_expedited_done taking on the roles of the halves + * of the ticket-lock word.  Each task atomically increments + * sync_sched_expedited_started upon entry, snapshotting the old value, + * then attempts to stop all the CPUs.  If this succeeds, then each + * CPU will have executed a context switch, resulting in an RCU-sched + * grace period.  We are then done, so we use atomic_cmpxchg() to + * update sync_sched_expedited_done to match our snapshot -- but + * only if someone else has not already advanced past our snapshot. + * + * On the other hand, if try_stop_cpus() fails, we check the value + * of sync_sched_expedited_done.  If it has advanced past our + * initial snapshot, then someone else must have forced a grace period + * some time after we took our snapshot.  In this case, our work is + * done for us, and we can simply return.  Otherwise, we try again, + * but keep our initial snapshot for purposes of checking for someone + * doing our work for us. + * + * If we fail too many times in a row, we fall back to synchronize_sched(). + */ +void synchronize_sched_expedited(void) +{ +	int firstsnap, s, snap, trycount = 0; + +	/* Note that atomic_inc_return() implies full memory barrier. 
*/ +	firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); +	get_online_cpus(); +	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); + +	/* +	 * Each pass through the following loop attempts to force a +	 * context switch on each CPU. +	 */ +	while (try_stop_cpus(cpu_online_mask, +			     synchronize_sched_expedited_cpu_stop, +			     NULL) == -EAGAIN) { +		put_online_cpus(); + +		/* No joy, try again later.  Or just synchronize_sched(). */ +		if (trycount++ < 10) +			udelay(trycount * num_online_cpus()); +		else { +			synchronize_sched(); +			return; +		} + +		/* Check to see if someone else did our work for us. */ +		s = atomic_read(&sync_sched_expedited_done); +		if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { +			smp_mb(); /* ensure test happens before caller kfree */ +			return; +		} + +		/* +		 * Refetching sync_sched_expedited_started allows later +		 * callers to piggyback on our grace period.  We subtract +		 * 1 to get the same token that the last incrementer got. +		 * We retry after they started, so our grace period works +		 * for them, and they started after our first try, so their +		 * grace period works for us. +		 */ +		get_online_cpus(); +		snap = atomic_read(&sync_sched_expedited_started); +		smp_mb(); /* ensure read is before try_stop_cpus(). */ +	} + +	/* +	 * Everyone up to our most recent fetch is covered by our grace +	 * period.  Update the counter, but only if our work is still +	 * relevant -- which it won't be if someone who started later +	 * than we did beat us to the punch. 
+	 */ +	do { +		s = atomic_read(&sync_sched_expedited_done); +		if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { +			smp_mb(); /* ensure test happens before caller kfree */ +			break; +		} +	} while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + +	put_online_cpus(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); +  /*   * Check to see if there is any immediate RCU-related work to be done   * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -1932,7 +2164,7 @@ static int rcu_cpu_has_callbacks(int cpu)  	/* RCU callbacks either ready or pending? */  	return per_cpu(rcu_sched_data, cpu).nxtlist ||  	       per_cpu(rcu_bh_data, cpu).nxtlist || -	       rcu_preempt_needs_cpu(cpu); +	       rcu_preempt_cpu_has_callbacks(cpu);  }  static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; @@ -2027,9 +2259,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)  	rdp->nxtlist = NULL;  	for (i = 0; i < RCU_NEXT_SIZE; i++)  		rdp->nxttail[i] = &rdp->nxtlist; +	rdp->qlen_lazy = 0;  	rdp->qlen = 0;  	rdp->dynticks = &per_cpu(rcu_dynticks, cpu); -	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); +	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);  	WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);  	rdp->cpu = cpu;  	rdp->rsp = rsp; @@ -2057,7 +2290,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)  	rdp->qlen_last_fqs_check = 0;  	rdp->n_force_qs_snap = rsp->n_force_qs;  	rdp->blimit = blimit; -	rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; +	rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;  	atomic_set(&rdp->dynticks->dynticks,  		   (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);  	rcu_prepare_for_idle_init(cpu); @@ -2139,16 +2372,18 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,  		 * touch any data without introducing corruption. 
We send the  		 * dying CPU's callbacks to an arbitrarily chosen online CPU.  		 */ -		rcu_send_cbs_to_online(&rcu_bh_state); -		rcu_send_cbs_to_online(&rcu_sched_state); -		rcu_preempt_send_cbs_to_online(); +		rcu_cleanup_dying_cpu(&rcu_bh_state); +		rcu_cleanup_dying_cpu(&rcu_sched_state); +		rcu_preempt_cleanup_dying_cpu();  		rcu_cleanup_after_idle(cpu);  		break;  	case CPU_DEAD:  	case CPU_DEAD_FROZEN:  	case CPU_UP_CANCELED:  	case CPU_UP_CANCELED_FROZEN: -		rcu_offline_cpu(cpu); +		rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); +		rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); +		rcu_preempt_cleanup_dead_cpu(cpu);  		break;  	default:  		break; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index fddff92d6676..cdd1be0a4072 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -239,6 +239,12 @@ struct rcu_data {  	bool		preemptible;	/* Preemptible RCU? */  	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */  	unsigned long grpmask;		/* Mask to apply to leaf qsmask. */ +#ifdef CONFIG_RCU_CPU_STALL_INFO +	unsigned long	ticks_this_gp;	/* The number of scheduling-clock */ +					/*  ticks this CPU has handled */ +					/*  during and after the last grace */ +					/* period it is aware of. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */  	/* 2) batch handling */  	/* @@ -265,7 +271,8 @@ struct rcu_data {  	 */  	struct rcu_head *nxtlist;  	struct rcu_head **nxttail[RCU_NEXT_SIZE]; -	long		qlen;		/* # of queued callbacks */ +	long		qlen_lazy;	/* # of lazy queued callbacks */ +	long		qlen;		/* # of queued callbacks, incl lazy */  	long		qlen_last_fqs_check;  					/* qlen at last check for QS forcing */  	unsigned long	n_cbs_invoked;	/* count of RCU cbs invoked. */ @@ -282,7 +289,6 @@ struct rcu_data {  	/* 4) reasons this CPU needed to be kicked by force_quiescent_state */  	unsigned long dynticks_fqs;	/* Kicked due to dynticks idle. */  	unsigned long offline_fqs;	/* Kicked due to being offline. */ -	unsigned long resched_ipi;	/* Sent a resched IPI. 
*/  	/* 5) __rcu_pending() statistics. */  	unsigned long n_rcu_pending;	/* rcu_pending() calls since boot. */ @@ -313,12 +319,6 @@ struct rcu_data {  #else  #define RCU_STALL_DELAY_DELTA	       0  #endif - -#define RCU_SECONDS_TILL_STALL_CHECK   (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ -					RCU_STALL_DELAY_DELTA) -						/* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) -						/* for rsp->jiffies_stall */  #define RCU_STALL_RAT_DELAY		2	/* Allow other CPUs time */  						/*  to take at least one */  						/*  scheduling clock irq */ @@ -438,8 +438,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);  static int rcu_preempt_offline_tasks(struct rcu_state *rsp,  				     struct rcu_node *rnp,  				     struct rcu_data *rdp); -static void rcu_preempt_offline_cpu(int cpu);  #endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_preempt_cleanup_dead_cpu(int cpu);  static void rcu_preempt_check_callbacks(int cpu);  static void rcu_preempt_process_callbacks(void);  void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); @@ -448,9 +448,9 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,  			       bool wake);  #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */  static int rcu_preempt_pending(int cpu); -static int rcu_preempt_needs_cpu(int cpu); +static int rcu_preempt_cpu_has_callbacks(int cpu);  static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void rcu_preempt_send_cbs_to_online(void); +static void rcu_preempt_cleanup_dying_cpu(void);  static void __init __rcu_init_preempt(void);  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); @@ -471,5 +471,10 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);  static void rcu_prepare_for_idle_init(int cpu);  static void rcu_cleanup_after_idle(int cpu);  static void 
rcu_prepare_for_idle(int cpu); +static void print_cpu_stall_info_begin(void); +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); +static void print_cpu_stall_info_end(void); +static void zero_cpu_stall_ticks(struct rcu_data *rdp); +static void increment_cpu_stall_ticks(void);  #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8bb35d73e1f9..c023464816be 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,7 +25,6 @@   */  #include <linux/delay.h> -#include <linux/stop_machine.h>  #define RCU_KTHREAD_PRIO 1 @@ -63,7 +62,10 @@ static void __init rcu_bootup_announce_oddness(void)  	printk(KERN_INFO "\tRCU torture testing starts during boot.\n");  #endif  #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) -	printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); +	printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); +#endif +#if defined(CONFIG_RCU_CPU_STALL_INFO) +	printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");  #endif  #if NUM_RCU_LVL_4 != 0  	printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); @@ -490,6 +492,31 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)  #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + +static void rcu_print_task_stall_begin(struct rcu_node *rnp) +{ +	printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", +	       rnp->level, rnp->grplo, rnp->grphi); +} + +static void rcu_print_task_stall_end(void) +{ +	printk(KERN_CONT "\n"); +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +static void rcu_print_task_stall_begin(struct rcu_node *rnp) +{ +} + +static void rcu_print_task_stall_end(void) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ +  /*   * Scan the current list of tasks blocked within RCU read-side critical   * sections, printing out the tid of each. 
@@ -501,12 +528,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp)  	if (!rcu_preempt_blocked_readers_cgp(rnp))  		return 0; +	rcu_print_task_stall_begin(rnp);  	t = list_entry(rnp->gp_tasks,  		       struct task_struct, rcu_node_entry);  	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { -		printk(" P%d", t->pid); +		printk(KERN_CONT " P%d", t->pid);  		ndetected++;  	} +	rcu_print_task_stall_end();  	return ndetected;  } @@ -581,7 +610,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,  	 * absolutely necessary, but this is a good performance/complexity  	 * tradeoff.  	 */ -	if (rcu_preempt_blocked_readers_cgp(rnp)) +	if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)  		retval |= RCU_OFL_TASKS_NORM_GP;  	if (rcu_preempted_readers_exp(rnp))  		retval |= RCU_OFL_TASKS_EXP_GP; @@ -618,16 +647,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,  	return retval;  } +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +  /*   * Do CPU-offline processing for preemptible RCU.   */ -static void rcu_preempt_offline_cpu(int cpu) +static void rcu_preempt_cleanup_dead_cpu(int cpu)  { -	__rcu_offline_cpu(cpu, &rcu_preempt_state); +	rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state);  } -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -  /*   * Check for a quiescent state from the current CPU.  When a task blocks,   * the task is recorded in the corresponding CPU's rcu_node structure, @@ -671,10 +700,24 @@ static void rcu_preempt_do_callbacks(void)   */  void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))  { -	__call_rcu(head, func, &rcu_preempt_state); +	__call_rcu(head, func, &rcu_preempt_state, 0);  }  EXPORT_SYMBOL_GPL(call_rcu); +/* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks.  
Until then, this + * function may only be called from __kfree_rcu(). + */ +void kfree_call_rcu(struct rcu_head *head, +		    void (*func)(struct rcu_head *rcu)) +{ +	__call_rcu(head, func, &rcu_preempt_state, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); +  /**   * synchronize_rcu - wait until a grace period has elapsed.   * @@ -688,6 +731,10 @@ EXPORT_SYMBOL_GPL(call_rcu);   */  void synchronize_rcu(void)  { +	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && +			   !lock_is_held(&rcu_lock_map) && +			   !lock_is_held(&rcu_sched_lock_map), +			   "Illegal synchronize_rcu() in RCU read-side critical section");  	if (!rcu_scheduler_active)  		return;  	wait_rcu_gp(call_rcu); @@ -788,10 +835,22 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)  		rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */  } -/* - * Wait for an rcu-preempt grace period, but expedite it.  The basic idea - * is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU-preempt grace period, but expedite it.  The basic + * idea is to invoke synchronize_sched_expedited() to push all the tasks to + * the ->blkd_tasks lists and wait for this list to drain.  This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. + * In fact, if you are using synchronize_rcu_expedited() in a loop, + * please restructure your code to batch your updates, and then Use a + * single synchronize_rcu() instead. + * + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal + * to call this function from a CPU-hotplug notifier.  Failing to observe + * these restriction will result in deadlock.   
*/  void synchronize_rcu_expedited(void)  { @@ -869,9 +928,9 @@ static int rcu_preempt_pending(int cpu)  }  /* - * Does preemptible RCU need the CPU to stay out of dynticks mode? + * Does preemptible RCU have callbacks on this CPU?   */ -static int rcu_preempt_needs_cpu(int cpu) +static int rcu_preempt_cpu_has_callbacks(int cpu)  {  	return !!per_cpu(rcu_preempt_data, cpu).nxtlist;  } @@ -894,11 +953,12 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)  }  /* - * Move preemptible RCU's callbacks from dying CPU to other online CPU. + * Move preemptible RCU's callbacks from dying CPU to other online CPU + * and record a quiescent state.   */ -static void rcu_preempt_send_cbs_to_online(void) +static void rcu_preempt_cleanup_dying_cpu(void)  { -	rcu_send_cbs_to_online(&rcu_preempt_state); +	rcu_cleanup_dying_cpu(&rcu_preempt_state);  }  /* @@ -1034,16 +1094,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,  	return 0;  } +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +  /*   * Because preemptible RCU does not exist, it never needs CPU-offline   * processing.   */ -static void rcu_preempt_offline_cpu(int cpu) +static void rcu_preempt_cleanup_dead_cpu(int cpu)  {  } -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -  /*   * Because preemptible RCU does not exist, it never has any callbacks   * to check. @@ -1061,6 +1121,22 @@ static void rcu_preempt_process_callbacks(void)  }  /* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks.  Until then, this + * function may only be called from __kfree_rcu(). + * + * Because there is no preemptible RCU, we use RCU-sched instead. 
+ */ +void kfree_call_rcu(struct rcu_head *head, +		    void (*func)(struct rcu_head *rcu)) +{ +	__call_rcu(head, func, &rcu_sched_state, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + +/*   * Wait for an rcu-preempt grace period, but make it happen quickly.   * But because preemptible RCU does not exist, map to rcu-sched.   */ @@ -1093,9 +1169,9 @@ static int rcu_preempt_pending(int cpu)  }  /* - * Because preemptible RCU does not exist, it never needs any CPU. + * Because preemptible RCU does not exist, it never has callbacks   */ -static int rcu_preempt_needs_cpu(int cpu) +static int rcu_preempt_cpu_has_callbacks(int cpu)  {  	return 0;  } @@ -1119,9 +1195,9 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)  }  /* - * Because there is no preemptible RCU, there are no callbacks to move. + * Because there is no preemptible RCU, there is no cleanup to do.   */ -static void rcu_preempt_send_cbs_to_online(void) +static void rcu_preempt_cleanup_dying_cpu(void)  {  } @@ -1823,132 +1899,6 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)  #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#ifndef CONFIG_SMP - -void synchronize_sched_expedited(void) -{ -	cond_resched(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#else /* #ifndef CONFIG_SMP */ - -static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); -static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); - -static int synchronize_sched_expedited_cpu_stop(void *data) -{ -	/* -	 * There must be a full memory barrier on each affected CPU -	 * between the time that try_stop_cpus() is called and the -	 * time that it returns. -	 * -	 * In the current initial implementation of cpu_stop, the -	 * above condition is already met when the control reaches -	 * this point and the following smp_mb() is not strictly -	 * necessary.  Do smp_mb() anyway for documentation and -	 * robustness against future implementation changes. -	 */ -	smp_mb(); /* See above comment block. 
*/ -	return 0; -} - -/* - * Wait for an rcu-sched grace period to elapse, but use "big hammer" - * approach to force grace period to end quickly.  This consumes - * significant time on all CPUs, and is thus not recommended for - * any sort of common-case code. - * - * Note that it is illegal to call this function while holding any - * lock that is acquired by a CPU-hotplug notifier.  Failing to - * observe this restriction will result in deadlock. - * - * This implementation can be thought of as an application of ticket - * locking to RCU, with sync_sched_expedited_started and - * sync_sched_expedited_done taking on the roles of the halves - * of the ticket-lock word.  Each task atomically increments - * sync_sched_expedited_started upon entry, snapshotting the old value, - * then attempts to stop all the CPUs.  If this succeeds, then each - * CPU will have executed a context switch, resulting in an RCU-sched - * grace period.  We are then done, so we use atomic_cmpxchg() to - * update sync_sched_expedited_done to match our snapshot -- but - * only if someone else has not already advanced past our snapshot. - * - * On the other hand, if try_stop_cpus() fails, we check the value - * of sync_sched_expedited_done.  If it has advanced past our - * initial snapshot, then someone else must have forced a grace period - * some time after we took our snapshot.  In this case, our work is - * done for us, and we can simply return.  Otherwise, we try again, - * but keep our initial snapshot for purposes of checking for someone - * doing our work for us. - * - * If we fail too many times in a row, we fall back to synchronize_sched(). - */ -void synchronize_sched_expedited(void) -{ -	int firstsnap, s, snap, trycount = 0; - -	/* Note that atomic_inc_return() implies full memory barrier. */ -	firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); -	get_online_cpus(); - -	/* -	 * Each pass through the following loop attempts to force a -	 * context switch on each CPU. 
-	 */ -	while (try_stop_cpus(cpu_online_mask, -			     synchronize_sched_expedited_cpu_stop, -			     NULL) == -EAGAIN) { -		put_online_cpus(); - -		/* No joy, try again later.  Or just synchronize_sched(). */ -		if (trycount++ < 10) -			udelay(trycount * num_online_cpus()); -		else { -			synchronize_sched(); -			return; -		} - -		/* Check to see if someone else did our work for us. */ -		s = atomic_read(&sync_sched_expedited_done); -		if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { -			smp_mb(); /* ensure test happens before caller kfree */ -			return; -		} - -		/* -		 * Refetching sync_sched_expedited_started allows later -		 * callers to piggyback on our grace period.  We subtract -		 * 1 to get the same token that the last incrementer got. -		 * We retry after they started, so our grace period works -		 * for them, and they started after our first try, so their -		 * grace period works for us. -		 */ -		get_online_cpus(); -		snap = atomic_read(&sync_sched_expedited_started); -		smp_mb(); /* ensure read is before try_stop_cpus(). */ -	} - -	/* -	 * Everyone up to our most recent fetch is covered by our grace -	 * period.  Update the counter, but only if our work is still -	 * relevant -- which it won't be if someone who started later -	 * than we did beat us to the punch. -	 */ -	do { -		s = atomic_read(&sync_sched_expedited_done); -		if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { -			smp_mb(); /* ensure test happens before caller kfree */ -			break; -		} -	} while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); - -	put_online_cpus(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#endif /* #else #ifndef CONFIG_SMP */ -  #if !defined(CONFIG_RCU_FAST_NO_HZ)  /* @@ -1981,7 +1931,7 @@ static void rcu_cleanup_after_idle(int cpu)  }  /* - * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, + * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,   * is nothing.   
*/  static void rcu_prepare_for_idle(int cpu) @@ -2015,6 +1965,9 @@ static void rcu_prepare_for_idle(int cpu)   *	number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your   *	system.  And if you are -that- concerned about energy efficiency,   *	just power the system down and be done with it! + * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is + *	permitted to sleep in dyntick-idle mode with only lazy RCU + *	callbacks pending.  Setting this too high can OOM your system.   *   * The values below work well in practice.  If future workloads require   * adjustment, they can be converted into kernel config parameters, though @@ -2023,11 +1976,13 @@ static void rcu_prepare_for_idle(int cpu)  #define RCU_IDLE_FLUSHES 5		/* Number of dyntick-idle tries. */  #define RCU_IDLE_OPT_FLUSHES 3		/* Optional dyntick-idle tries. */  #define RCU_IDLE_GP_DELAY 6		/* Roughly one grace period. */ +#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)	/* Roughly six seconds. */  static DEFINE_PER_CPU(int, rcu_dyntick_drain);  static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);  static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); -static ktime_t rcu_idle_gp_wait; +static ktime_t rcu_idle_gp_wait;	/* If some non-lazy callbacks. */ +static ktime_t rcu_idle_lazy_gp_wait;	/* If only lazy callbacks. */  /*   * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -2048,6 +2003,48 @@ int rcu_needs_cpu(int cpu)  }  /* + * Does the specified flavor of RCU have non-lazy callbacks pending on + * the specified CPU?  Both RCU flavor and CPU are specified by the + * rcu_data structure. + */ +static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) +{ +	return rdp->qlen != rdp->qlen_lazy; +} + +#ifdef CONFIG_TREE_PREEMPT_RCU + +/* + * Are there non-lazy RCU-preempt callbacks?  (There cannot be if there + * is no RCU-preempt in the kernel.) 
+ */ +static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +{ +	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); + +	return __rcu_cpu_has_nonlazy_callbacks(rdp); +} + +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +{ +	return 0; +} + +#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ + +/* + * Does any flavor of RCU have non-lazy callbacks on the specified CPU? + */ +static bool rcu_cpu_has_nonlazy_callbacks(int cpu) +{ +	return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || +	       __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || +	       rcu_preempt_cpu_has_nonlazy_callbacks(cpu); +} + +/*   * Timer handler used to force CPU to start pushing its remaining RCU   * callbacks in the case where it entered dyntick-idle mode with callbacks   * pending.  The hander doesn't really need to do anything because the @@ -2074,6 +2071,8 @@ static void rcu_prepare_for_idle_init(int cpu)  		unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);  		rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); +		upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); +		rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);  		firsttime = 0;  	}  } @@ -2109,10 +2108,6 @@ static void rcu_cleanup_after_idle(int cpu)   */  static void rcu_prepare_for_idle(int cpu)  { -	unsigned long flags; - -	local_irq_save(flags); -  	/*  	 * If there are no callbacks on this CPU, enter dyntick-idle mode.  	 * Also reset state to avoid prejudicing later attempts. @@ -2120,7 +2115,6 @@ static void rcu_prepare_for_idle(int cpu)  	if (!rcu_cpu_has_callbacks(cpu)) {  		per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;  		per_cpu(rcu_dyntick_drain, cpu) = 0; -		local_irq_restore(flags);  		trace_rcu_prep_idle("No callbacks");  		return;  	} @@ -2130,7 +2124,6 @@ static void rcu_prepare_for_idle(int cpu)  	 * refrained from disabling the scheduling-clock tick.  	 
*/  	if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { -		local_irq_restore(flags);  		trace_rcu_prep_idle("In holdoff");  		return;  	} @@ -2140,18 +2133,22 @@ static void rcu_prepare_for_idle(int cpu)  		/* First time through, initialize the counter. */  		per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;  	} else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && -		   !rcu_pending(cpu)) { +		   !rcu_pending(cpu) && +		   !local_softirq_pending()) {  		/* Can we go dyntick-idle despite still having callbacks? */  		trace_rcu_prep_idle("Dyntick with callbacks");  		per_cpu(rcu_dyntick_drain, cpu) = 0; -		per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; -		hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), -			      rcu_idle_gp_wait, HRTIMER_MODE_REL); +		per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; +		if (rcu_cpu_has_nonlazy_callbacks(cpu)) +			hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), +				      rcu_idle_gp_wait, HRTIMER_MODE_REL); +		else +			hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), +				      rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL);  		return; /* Nothing more to do immediately. */  	} else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {  		/* We have hit the limit, so time to give up. */  		per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; -		local_irq_restore(flags);  		trace_rcu_prep_idle("Begin holdoff");  		invoke_rcu_core();  /* Force the CPU out of dyntick-idle. 
*/  		return; @@ -2163,23 +2160,17 @@ static void rcu_prepare_for_idle(int cpu)  	 */  #ifdef CONFIG_TREE_PREEMPT_RCU  	if (per_cpu(rcu_preempt_data, cpu).nxtlist) { -		local_irq_restore(flags);  		rcu_preempt_qs(cpu);  		force_quiescent_state(&rcu_preempt_state, 0); -		local_irq_save(flags);  	}  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */  	if (per_cpu(rcu_sched_data, cpu).nxtlist) { -		local_irq_restore(flags);  		rcu_sched_qs(cpu);  		force_quiescent_state(&rcu_sched_state, 0); -		local_irq_save(flags);  	}  	if (per_cpu(rcu_bh_data, cpu).nxtlist) { -		local_irq_restore(flags);  		rcu_bh_qs(cpu);  		force_quiescent_state(&rcu_bh_state, 0); -		local_irq_save(flags);  	}  	/* @@ -2187,13 +2178,124 @@ static void rcu_prepare_for_idle(int cpu)  	 * So try forcing the callbacks through the grace period.  	 */  	if (rcu_cpu_has_callbacks(cpu)) { -		local_irq_restore(flags);  		trace_rcu_prep_idle("More callbacks");  		invoke_rcu_core(); -	} else { -		local_irq_restore(flags); +	} else  		trace_rcu_prep_idle("Callbacks drained"); -	}  }  #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ + +#ifdef CONFIG_RCU_CPU_STALL_INFO + +#ifdef CONFIG_RCU_FAST_NO_HZ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ +	struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); + +	sprintf(cp, "drain=%d %c timer=%lld", +		per_cpu(rcu_dyntick_drain, cpu), +		per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', +		hrtimer_active(hrtp) +			? ktime_to_us(hrtimer_get_remaining(hrtp)) +			: -1); +} + +#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ + +/* Initiate the stall-info list. */ +static void print_cpu_stall_info_begin(void) +{ +	printk(KERN_CONT "\n"); +} + +/* + * Print out diagnostic information for the specified stalled CPU. 
+ * + * If the specified CPU is aware of the current RCU grace period + * (flavor specified by rsp), then print the number of scheduling + * clock interrupts the CPU has taken during the time that it has + * been aware.  Otherwise, print the number of RCU grace periods + * that this CPU is ignorant of, for example, "1" if the CPU was + * aware of the previous grace period. + * + * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + */ +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) +{ +	char fast_no_hz[72]; +	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); +	struct rcu_dynticks *rdtp = rdp->dynticks; +	char *ticks_title; +	unsigned long ticks_value; + +	if (rsp->gpnum == rdp->gpnum) { +		ticks_title = "ticks this GP"; +		ticks_value = rdp->ticks_this_gp; +	} else { +		ticks_title = "GPs behind"; +		ticks_value = rsp->gpnum - rdp->gpnum; +	} +	print_cpu_stall_fast_no_hz(fast_no_hz, cpu); +	printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", +	       cpu, ticks_value, ticks_title, +	       atomic_read(&rdtp->dynticks) & 0xfff, +	       rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, +	       fast_no_hz); +} + +/* Terminate the stall-info list. */ +static void print_cpu_stall_info_end(void) +{ +	printk(KERN_ERR "\t"); +} + +/* Zero ->ticks_this_gp for all flavors of RCU. */ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ +	rdp->ticks_this_gp = 0; +} + +/* Increment ->ticks_this_gp for all flavors of RCU. 
*/ +static void increment_cpu_stall_ticks(void) +{ +	__get_cpu_var(rcu_sched_data).ticks_this_gp++; +	__get_cpu_var(rcu_bh_data).ticks_this_gp++; +#ifdef CONFIG_TREE_PREEMPT_RCU +	__get_cpu_var(rcu_preempt_data).ticks_this_gp++; +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +static void print_cpu_stall_info_begin(void) +{ +	printk(KERN_CONT " {"); +} + +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) +{ +	printk(KERN_CONT " %d", cpu); +} + +static void print_cpu_stall_info_end(void) +{ +	printk(KERN_CONT "} "); +} + +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ +} + +static void increment_cpu_stall_ticks(void) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 654cfe67f0d1..ed459edeff43 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -72,9 +72,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)  		   rdp->dynticks->dynticks_nesting,  		   rdp->dynticks->dynticks_nmi_nesting,  		   rdp->dynticks_fqs); -	seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); -	seq_printf(m, " ql=%ld qs=%c%c%c%c", -		   rdp->qlen, +	seq_printf(m, " of=%lu", rdp->offline_fqs); +	seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", +		   rdp->qlen_lazy, rdp->qlen,  		   ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=  			rdp->nxttail[RCU_NEXT_TAIL]],  		   ".R"[rdp->nxttail[RCU_WAIT_TAIL] != @@ -144,8 +144,8 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)  		   rdp->dynticks->dynticks_nesting,  		   rdp->dynticks->dynticks_nmi_nesting,  		   rdp->dynticks_fqs); -	seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); -	seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, +	seq_printf(m, ",%lu", rdp->offline_fqs); +	seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen,  		   ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=  			
rdp->nxttail[RCU_NEXT_TAIL]],  		   ".R"[rdp->nxttail[RCU_WAIT_TAIL] != @@ -168,7 +168,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)  {  	seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");  	seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); -	seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); +	seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");  #ifdef CONFIG_RCU_BOOST  	seq_puts(m, "\"kt\",\"ktl\"");  #endif /* #ifdef CONFIG_RCU_BOOST */ diff --git a/kernel/resource.c b/kernel/resource.c index 7640b3a947d0..7e8ea66a8c01 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -749,6 +749,7 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t  	write_unlock(&resource_lock);  	return result;  } +EXPORT_SYMBOL(adjust_resource);  static void __init __reserve_region_with_split(struct resource *root,  		resource_size_t start, resource_size_t end, @@ -792,8 +793,6 @@ void __init reserve_region_with_split(struct resource *root,  	write_unlock(&resource_lock);  } -EXPORT_SYMBOL(adjust_resource); -  /**   * resource_alignment - calculate resource's alignment   * @res: resource pointer diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e8a1f83ee0e7..0984a21076a3 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup);  #ifdef CONFIG_PROC_FS -int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)  {  	static unsigned long next = INITIAL_JIFFIES;  	struct autogroup *ag;  	int err; -	if (*nice < -20 || *nice > 19) +	if (nice < -20 || nice > 19)  		return -EINVAL; -	err = security_task_setnice(current, *nice); +	err = security_task_setnice(current, nice);  	if (err)  		return err; -	if (*nice < 0 && !can_nice(current, *nice)) +	if (nice < 0 && !can_nice(current, nice))  		return -EPERM;  	/* this is a heavy operation taking 
global locks.. */ @@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)  	ag = autogroup_task_get(p);  	down_write(&ag->lock); -	err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); +	err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);  	if (!err) -		ag->nice = *nice; +		ag->nice = nice;  	up_write(&ag->lock);  	autogroup_kref_put(ag); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..503d6426126d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -71,6 +71,7 @@  #include <linux/ftrace.h>  #include <linux/slab.h>  #include <linux/init_task.h> +#include <linux/binfmts.h>  #include <asm/tlb.h>  #include <asm/irq_regs.h> @@ -162,13 +163,13 @@ static int sched_feat_show(struct seq_file *m, void *v)  #ifdef HAVE_JUMP_LABEL -#define jump_label_key__true  jump_label_key_enabled -#define jump_label_key__false jump_label_key_disabled +#define jump_label_key__true  STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE  #define SCHED_FEAT(name, enabled)	\  	jump_label_key__##enabled , -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {  #include "features.h"  }; @@ -176,14 +177,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {  static void sched_feat_disable(int i)  { -	if (jump_label_enabled(&sched_feat_keys[i])) -		jump_label_dec(&sched_feat_keys[i]); +	if (static_key_enabled(&sched_feat_keys[i])) +		static_key_slow_dec(&sched_feat_keys[i]);  }  static void sched_feat_enable(int i)  { -	if (!jump_label_enabled(&sched_feat_keys[i])) -		jump_label_inc(&sched_feat_keys[i]); +	if (!static_key_enabled(&sched_feat_keys[i])) +		static_key_slow_inc(&sched_feat_keys[i]);  }  #else  static void sched_feat_disable(int i) { }; @@ -894,7 +895,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)  	delta -= irq_delta;  #endif  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -	if 
(static_branch((&paravirt_steal_rq_enabled))) { +	if (static_key_false((&paravirt_steal_rq_enabled))) {  		u64 st;  		steal = paravirt_steal_clock(cpu_of(rq)); @@ -1284,7 +1285,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)  	 * leave kernel.  	 */  	if (p->mm && printk_ratelimit()) { -		printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", +		printk_sched("process %d (%s) no longer affine to cpu%d\n",  				task_pid_nr(p), p->comm, cpu);  	} @@ -1507,7 +1508,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)  }  #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -static inline int ttwu_share_cache(int this_cpu, int that_cpu) +bool cpus_share_cache(int this_cpu, int that_cpu)  {  	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);  } @@ -1518,7 +1519,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)  	struct rq *rq = cpu_rq(cpu);  #if defined(CONFIG_SMP) -	if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { +	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {  		sched_clock_cpu(cpu); /* sync clocks x-cpu */  		ttwu_queue_remote(p, cpu);  		return; @@ -1932,7 +1933,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  	local_irq_enable();  #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */  	finish_lock_switch(rq, prev); -	trace_sched_stat_sleeptime(current, rq->clock);  	fire_sched_in_preempt_notifiers(current);  	if (mm) @@ -2267,13 +2267,10 @@ calc_load_n(unsigned long load, unsigned long exp,   * Once we've updated the global active value, we need to apply the exponential   * weights adjusted to the number of cycles missed.   
*/ -static void calc_global_nohz(unsigned long ticks) +static void calc_global_nohz(void)  {  	long delta, active, n; -	if (time_before(jiffies, calc_load_update)) -		return; -  	/*  	 * If we crossed a calc_load_update boundary, make sure to fold  	 * any pending idle changes, the respective CPUs might have @@ -2285,31 +2282,25 @@ static void calc_global_nohz(unsigned long ticks)  		atomic_long_add(delta, &calc_load_tasks);  	/* -	 * If we were idle for multiple load cycles, apply them. +	 * It could be the one fold was all it took, we done!  	 */ -	if (ticks >= LOAD_FREQ) { -		n = ticks / LOAD_FREQ; +	if (time_before(jiffies, calc_load_update + 10)) +		return; -		active = atomic_long_read(&calc_load_tasks); -		active = active > 0 ? active * FIXED_1 : 0; +	/* +	 * Catch-up, fold however many we are behind still +	 */ +	delta = jiffies - calc_load_update - 10; +	n = 1 + (delta / LOAD_FREQ); -		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); -		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); -		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); +	active = atomic_long_read(&calc_load_tasks); +	active = active > 0 ? active * FIXED_1 : 0; -		calc_load_update += n * LOAD_FREQ; -	} +	avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); +	avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); +	avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); -	/* -	 * Its possible the remainder of the above division also crosses -	 * a LOAD_FREQ period, the regular check in calc_global_load() -	 * which comes after this will take care of that. -	 * -	 * Consider us being 11 ticks before a cycle completion, and us -	 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will -	 * age us 4 cycles, and the test in calc_global_load() will -	 * pick up the final one. 
-	 */ +	calc_load_update += n * LOAD_FREQ;  }  #else  void calc_load_account_idle(struct rq *this_rq) @@ -2321,7 +2312,7 @@ static inline long calc_load_fold_idle(void)  	return 0;  } -static void calc_global_nohz(unsigned long ticks) +static void calc_global_nohz(void)  {  }  #endif @@ -2349,8 +2340,6 @@ void calc_global_load(unsigned long ticks)  {  	long active; -	calc_global_nohz(ticks); -  	if (time_before(jiffies, calc_load_update + 10))  		return; @@ -2362,6 +2351,16 @@ void calc_global_load(unsigned long ticks)  	avenrun[2] = calc_load(avenrun[2], EXP_15, active);  	calc_load_update += LOAD_FREQ; + +	/* +	 * Account one period with whatever state we found before +	 * folding in the nohz state and ageing the entire idle period. +	 * +	 * This avoids loosing a sample when we go idle between  +	 * calc_load_account_active() (10 ticks ago) and now and thus +	 * under-accounting. +	 */ +	calc_global_nohz();  }  /* @@ -2756,7 +2755,7 @@ void account_idle_time(cputime_t cputime)  static __always_inline bool steal_account_process_tick(void)  {  #ifdef CONFIG_PARAVIRT -	if (static_branch(&paravirt_steal_enabled)) { +	if (static_key_false(&paravirt_steal_enabled)) {  		u64 steal, st = 0;  		steal = paravirt_steal_clock(smp_processor_id()); @@ -3221,14 +3220,14 @@ need_resched:  	post_schedule(rq); -	preempt_enable_no_resched(); +	sched_preempt_enable_no_resched();  	if (need_resched())  		goto need_resched;  }  static inline void sched_submit_work(struct task_struct *tsk)  { -	if (!tsk->state) +	if (!tsk->state || tsk_is_pi_blocked(tsk))  		return;  	/*  	 * If we are going to sleep and we have plugged IO queued, @@ -3247,6 +3246,18 @@ asmlinkage void __sched schedule(void)  }  EXPORT_SYMBOL(schedule); +/** + * schedule_preempt_disabled - called with preemption disabled + * + * Returns with preemption disabled. 
Note: preempt_count must be 1 + */ +void __sched schedule_preempt_disabled(void) +{ +	sched_preempt_enable_no_resched(); +	schedule(); +	preempt_disable(); +} +  #ifdef CONFIG_MUTEX_SPIN_ON_OWNER  static inline bool owner_running(struct mutex *lock, struct task_struct *owner) @@ -3407,9 +3418,9 @@ EXPORT_SYMBOL(__wake_up);  /*   * Same as __wake_up but called with the spinlock in wait_queue_head_t held.   */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)  { -	__wake_up_common(q, mode, 1, 0, NULL); +	__wake_up_common(q, mode, nr, 0, NULL);  }  EXPORT_SYMBOL_GPL(__wake_up_locked); @@ -3768,6 +3779,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	rq = __task_rq_lock(p); +	/* +	 * Idle task boosting is a nono in general. There is one +	 * exception, when PREEMPT_RT and NOHZ is active: +	 * +	 * The idle task calls get_next_timer_interrupt() and holds +	 * the timer wheel base->lock on the CPU and another CPU wants +	 * to access the timer (probably to cancel it). We can safely +	 * ignore the boosting request, as the idle CPU runs this code +	 * with interrupts disabled and will complete the lock +	 * protected section without being interrupted. So there is no +	 * real need to boost. +	 */ +	if (unlikely(p == rq->idle)) { +		WARN_ON(p != rq->curr); +		WARN_ON(p->pi_blocked_on); +		goto out_unlock; +	} +  	trace_sched_pi_setprio(p, prio);  	oldprio = p->prio;  	prev_class = p->sched_class; @@ -3791,11 +3820,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  		enqueue_task(rq, p, oldprio < prio ? 
ENQUEUE_HEAD : 0);  	check_class_changed(rq, p, prev_class, oldprio); +out_unlock:  	__task_rq_unlock(rq);  } -  #endif -  void set_user_nice(struct task_struct *p, long nice)  {  	int old_prio, delta, on_rq; @@ -4475,7 +4503,7 @@ SYSCALL_DEFINE0(sched_yield)  	__release(rq->lock);  	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);  	do_raw_spin_unlock(&rq->lock); -	preempt_enable_no_resched(); +	sched_preempt_enable_no_resched();  	schedule(); @@ -4549,8 +4577,24 @@ EXPORT_SYMBOL(__cond_resched_softirq);  /**   * yield - yield the current processor to other threads.   * - * This is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, its already broken. + * + * Typical broken usage is: + * + * while (!event) + * 	yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. Never use yield() as a progress guarantee!! + * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not!   
*/  void __sched yield(void)  { @@ -5382,7 +5426,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,  				      unsigned long action, void *hcpu)  {  	switch (action & ~CPU_TASKS_FROZEN) { -	case CPU_ONLINE: +	case CPU_STARTING:  	case CPU_DOWN_FAILED:  		set_cpu_active((long)hcpu, true);  		return NOTIFY_OK; @@ -5754,7 +5798,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)   *   * Also keep a unique ID per domain (we use the first cpu number in   * the cpumask of the domain), this allows us to quickly tell if - * two cpus are in the same cache domain, see ttwu_share_cache(). + * two cpus are in the same cache domain, see cpus_share_cache().   */  DEFINE_PER_CPU(struct sched_domain *, sd_llc);  DEFINE_PER_CPU(int, sd_llc_id); @@ -6931,6 +6975,9 @@ void __init sched_init(void)  		rq->online = 0;  		rq->idle_stamp = 0;  		rq->avg_idle = 2*sysctl_sched_migration_cost; + +		INIT_LIST_HEAD(&rq->cfs_tasks); +  		rq_attach_root(rq, &def_root_domain);  #ifdef CONFIG_NO_HZ  		rq->nohz_flags = 0; @@ -7525,8 +7572,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)  			    struct task_group, css);  } -static struct cgroup_subsys_state * -cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)  {  	struct task_group *tg, *parent; @@ -7543,15 +7589,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)  	return &tg->css;  } -static void -cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpu_cgroup_destroy(struct cgroup *cgrp)  {  	struct task_group *tg = cgroup_tg(cgrp);  	sched_destroy_group(tg);  } -static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static int cpu_cgroup_can_attach(struct cgroup *cgrp,  				 struct cgroup_taskset *tset)  {  	struct task_struct *task; @@ -7569,7 +7614,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup 
*cgrp,  	return 0;  } -static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup *cgrp,  			      struct cgroup_taskset *tset)  {  	struct task_struct *task; @@ -7579,8 +7624,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,  }  static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, -		struct cgroup *old_cgrp, struct task_struct *task) +cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, +		struct task_struct *task)  {  	/*  	 * cgroup_exit() is called in the copy_process() failure path. @@ -7930,8 +7975,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {   */  /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create( -	struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)  {  	struct cpuacct *ca; @@ -7961,8 +8005,7 @@ out:  }  /* destroy an existing cpu accounting group */ -static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpuacct_destroy(struct cgroup *cgrp)  {  	struct cpuacct *ca = cgroup_ca(cgrp); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a075e10004b..09acaa15161d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -288,7 +288,6 @@ static void print_cpu(struct seq_file *m, int cpu)  	P(yld_count); -	P(sched_switch);  	P(sched_count);  	P(sched_goidle);  #ifdef CONFIG_SMP diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669d..94340c7544a9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)   * Scheduling class queueing methods:   */ -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ -	cfs_rq->task_weight += weight; -} -#else -static inline void 
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ -} -#endif -  static void  account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)  {  	update_load_add(&cfs_rq->load, se->load.weight);  	if (!parent_entity(se))  		update_load_add(&rq_of(cfs_rq)->load, se->load.weight); -	if (entity_is_task(se)) { -		add_cfs_task_weight(cfs_rq, se->load.weight); -		list_add(&se->group_node, &cfs_rq->tasks); -	} +#ifdef CONFIG_SMP +	if (entity_is_task(se)) +		list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); +#endif  	cfs_rq->nr_running++;  } @@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)  	update_load_sub(&cfs_rq->load, se->load.weight);  	if (!parent_entity(se))  		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); -	if (entity_is_task(se)) { -		add_cfs_task_weight(cfs_rq, -se->load.weight); +	if (entity_is_task(se))  		list_del_init(&se->group_node); -	}  	cfs_rq->nr_running--;  } @@ -1003,6 +988,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  		if (unlikely(delta > se->statistics.sleep_max))  			se->statistics.sleep_max = delta; +		se->statistics.sleep_start = 0;  		se->statistics.sum_sleep_runtime += delta;  		if (tsk) { @@ -1019,6 +1005,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  		if (unlikely(delta > se->statistics.block_max))  			se->statistics.block_max = delta; +		se->statistics.block_start = 0;  		se->statistics.sum_sleep_runtime += delta;  		if (tsk) { @@ -1399,20 +1386,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)  #ifdef CONFIG_CFS_BANDWIDTH  #ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; +static struct static_key __cfs_bandwidth_used;  static inline bool cfs_bandwidth_used(void)  { -	return static_branch(&__cfs_bandwidth_used); +	return static_key_false(&__cfs_bandwidth_used);  }  void account_cfs_bandwidth_used(int enabled, int was_enabled)  
{  	/* only need to count groups transitioning between enabled/!enabled */  	if (enabled && !was_enabled) -		jump_label_inc(&__cfs_bandwidth_used); +		static_key_slow_inc(&__cfs_bandwidth_used);  	else if (!enabled && was_enabled) -		jump_label_dec(&__cfs_bandwidth_used); +		static_key_slow_dec(&__cfs_bandwidth_used);  }  #else /* HAVE_JUMP_LABEL */  static bool cfs_bandwidth_used(void) @@ -2670,8 +2657,6 @@ static int select_idle_sibling(struct task_struct *p, int target)  	/*  	 * Otherwise, iterate the domains and find an elegible idle cpu.  	 */ -	rcu_read_lock(); -  	sd = rcu_dereference(per_cpu(sd_llc, target));  	for_each_lower_domain(sd) {  		sg = sd->groups; @@ -2693,8 +2678,6 @@ next:  		} while (sg != sd->groups);  	}  done: -	rcu_read_unlock(); -  	return target;  } @@ -2920,7 +2903,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_  		return;  	/* -	 * This is possible from callers such as pull_task(), in which we +	 * This is possible from callers such as move_task(), in which we  	 * unconditionally check_prempt_curr() after an enqueue (which may have  	 * lead to a throttle).  This both saves work and prevents false  	 * next-buddy nomination below. @@ -3084,17 +3067,39 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp   * Fair scheduling class load-balancing methods:   */ +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + +#define LBF_ALL_PINNED	0x01 +#define LBF_NEED_BREAK	0x02 + +struct lb_env { +	struct sched_domain	*sd; + +	int			src_cpu; +	struct rq		*src_rq; + +	int			dst_cpu; +	struct rq		*dst_rq; + +	enum cpu_idle_type	idle; +	long			load_move; +	unsigned int		flags; + +	unsigned int		loop; +	unsigned int		loop_break; +	unsigned int		loop_max; +}; +  /* - * pull_task - move a task from a remote runqueue to the local runqueue. + * move_task - move a task from one runqueue to another runqueue.   * Both runqueues must be locked.   
*/ -static void pull_task(struct rq *src_rq, struct task_struct *p, -		      struct rq *this_rq, int this_cpu) +static void move_task(struct task_struct *p, struct lb_env *env)  { -	deactivate_task(src_rq, p, 0); -	set_task_cpu(p, this_cpu); -	activate_task(this_rq, p, 0); -	check_preempt_curr(this_rq, p, 0); +	deactivate_task(env->src_rq, p, 0); +	set_task_cpu(p, env->dst_cpu); +	activate_task(env->dst_rq, p, 0); +	check_preempt_curr(env->dst_rq, p, 0);  }  /* @@ -3129,19 +3134,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)  	return delta < (s64)sysctl_sched_migration_cost;  } -#define LBF_ALL_PINNED	0x01 -#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */ -#define LBF_HAD_BREAK	0x04 -#define LBF_HAD_BREAKS	0x0C	/* count HAD_BREAKs overflows into ABORT */ -#define LBF_ABORT	0x10 -  /*   * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?   */  static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, -		     struct sched_domain *sd, enum cpu_idle_type idle, -		     int *lb_flags) +int can_migrate_task(struct task_struct *p, struct lb_env *env)  {  	int tsk_cache_hot = 0;  	/* @@ -3150,13 +3147,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,  	 * 2) cannot be migrated to this CPU due to cpus_allowed, or  	 * 3) are cache-hot on their current CPU.  	 */ -	if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { +	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {  		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);  		return 0;  	} -	*lb_flags &= ~LBF_ALL_PINNED; +	env->flags &= ~LBF_ALL_PINNED; -	if (task_running(rq, p)) { +	if (task_running(env->src_rq, p)) {  		schedstat_inc(p, se.statistics.nr_failed_migrations_running);  		return 0;  	} @@ -3167,12 +3164,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,  	 * 2) too many balance attempts have failed.  	 
*/ -	tsk_cache_hot = task_hot(p, rq->clock_task, sd); +	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);  	if (!tsk_cache_hot || -		sd->nr_balance_failed > sd->cache_nice_tries) { +		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {  #ifdef CONFIG_SCHEDSTATS  		if (tsk_cache_hot) { -			schedstat_inc(sd, lb_hot_gained[idle]); +			schedstat_inc(env->sd, lb_hot_gained[env->idle]);  			schedstat_inc(p, se.statistics.nr_forced_migrations);  		}  #endif @@ -3193,65 +3190,80 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,   *   * Called with both runqueues locked.   */ -static int -move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, -	      struct sched_domain *sd, enum cpu_idle_type idle) +static int move_one_task(struct lb_env *env)  {  	struct task_struct *p, *n; -	struct cfs_rq *cfs_rq; -	int pinned = 0; -	for_each_leaf_cfs_rq(busiest, cfs_rq) { -		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { -			if (throttled_lb_pair(task_group(p), -					      busiest->cpu, this_cpu)) -				break; +	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { +		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) +			continue; -			if (!can_migrate_task(p, busiest, this_cpu, -						sd, idle, &pinned)) -				continue; +		if (!can_migrate_task(p, env)) +			continue; -			pull_task(busiest, p, this_rq, this_cpu); -			/* -			 * Right now, this is only the second place pull_task() -			 * is called, so we can safely collect pull_task() -			 * stats here rather than inside pull_task(). -			 */ -			schedstat_inc(sd, lb_gained[idle]); -			return 1; -		} +		move_task(p, env); +		/* +		 * Right now, this is only the second place move_task() +		 * is called, so we can safely collect move_task() +		 * stats here rather than inside move_task(). 
+		 */ +		schedstat_inc(env->sd, lb_gained[env->idle]); +		return 1;  	} -  	return 0;  } -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, -	      unsigned long max_load_move, struct sched_domain *sd, -	      enum cpu_idle_type idle, int *lb_flags, -	      struct cfs_rq *busiest_cfs_rq) +static unsigned long task_h_load(struct task_struct *p); + +/* + * move_tasks tries to move up to load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct lb_env *env)  { -	int loops = 0, pulled = 0; -	long rem_load_move = max_load_move; -	struct task_struct *p, *n; +	struct list_head *tasks = &env->src_rq->cfs_tasks; +	struct task_struct *p; +	unsigned long load; +	int pulled = 0; + +	if (env->load_move <= 0) +		return 0; -	if (max_load_move == 0) -		goto out; +	while (!list_empty(tasks)) { +		p = list_first_entry(tasks, struct task_struct, se.group_node); -	list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { -		if (loops++ > sysctl_sched_nr_migrate) { -			*lb_flags |= LBF_NEED_BREAK; +		env->loop++; +		/* We've more or less seen every task there is, call it quits */ +		if (env->loop > env->loop_max) +			break; + +		/* take a breather every nr_migrate tasks */ +		if (env->loop > env->loop_break) { +			env->loop_break += sysctl_sched_nr_migrate; +			env->flags |= LBF_NEED_BREAK;  			break;  		} -		if ((p->se.load.weight >> 1) > rem_load_move || -		    !can_migrate_task(p, busiest, this_cpu, sd, idle, -				      lb_flags)) -			continue; +		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) +			goto next; + +		load = task_h_load(p); + +		if (load < 16 && !env->sd->nr_balance_failed) +			goto next; + +		if ((load / 2) > env->load_move) +			goto next; -		pull_task(busiest, p, this_rq, this_cpu); +		if (!can_migrate_task(p, env)) +			goto 
next; + +		move_task(p, env);  		pulled++; -		rem_load_move -= p->se.load.weight; +		env->load_move -= load;  #ifdef CONFIG_PREEMPT  		/* @@ -3259,28 +3271,30 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,  		 * kernels will stop after the first task is pulled to minimize  		 * the critical section.  		 */ -		if (idle == CPU_NEWLY_IDLE) { -			*lb_flags |= LBF_ABORT; +		if (env->idle == CPU_NEWLY_IDLE)  			break; -		}  #endif  		/*  		 * We only want to steal up to the prescribed amount of  		 * weighted load.  		 */ -		if (rem_load_move <= 0) +		if (env->load_move <= 0)  			break; + +		continue; +next: +		list_move_tail(&p->se.group_node, tasks);  	} -out: +  	/* -	 * Right now, this is one of only two places pull_task() is called, -	 * so we can safely collect pull_task() stats here rather than -	 * inside pull_task(). +	 * Right now, this is one of only two places move_task() is called, +	 * so we can safely collect move_task() stats here rather than +	 * inside move_task().  	 
*/ -	schedstat_add(sd, lb_gained[idle], pulled); +	schedstat_add(env->sd, lb_gained[env->idle], pulled); -	return max_load_move - rem_load_move; +	return pulled;  }  #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3360,113 +3374,35 @@ static int tg_load_down(struct task_group *tg, void *data)  static void update_h_load(long cpu)  { +	rcu_read_lock();  	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); +	rcu_read_unlock();  } -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, -		  unsigned long max_load_move, -		  struct sched_domain *sd, enum cpu_idle_type idle, -		  int *lb_flags) +static unsigned long task_h_load(struct task_struct *p)  { -	long rem_load_move = max_load_move; -	struct cfs_rq *busiest_cfs_rq; - -	rcu_read_lock(); -	update_h_load(cpu_of(busiest)); - -	for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { -		unsigned long busiest_h_load = busiest_cfs_rq->h_load; -		unsigned long busiest_weight = busiest_cfs_rq->load.weight; -		u64 rem_load, moved_load; - -		if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) -			break; - -		/* -		 * empty group or part of a throttled hierarchy -		 */ -		if (!busiest_cfs_rq->task_weight || -		    throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) -			continue; - -		rem_load = (u64)rem_load_move * busiest_weight; -		rem_load = div_u64(rem_load, busiest_h_load + 1); - -		moved_load = balance_tasks(this_rq, this_cpu, busiest, -				rem_load, sd, idle, lb_flags, -				busiest_cfs_rq); - -		if (!moved_load) -			continue; +	struct cfs_rq *cfs_rq = task_cfs_rq(p); +	unsigned long load; -		moved_load *= busiest_h_load; -		moved_load = div_u64(moved_load, busiest_weight + 1); +	load = p->se.load.weight; +	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); -		rem_load_move -= moved_load; -		if (rem_load_move < 0) -			break; -	} -	rcu_read_unlock(); - -	return max_load_move - rem_load_move; +	return load;  }  #else  static inline void update_shares(int cpu)  {  } -static unsigned long 
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, -		  unsigned long max_load_move, -		  struct sched_domain *sd, enum cpu_idle_type idle, -		  int *lb_flags) +static inline void update_h_load(long cpu)  { -	return balance_tasks(this_rq, this_cpu, busiest, -			max_load_move, sd, idle, lb_flags, -			&busiest->cfs);  } -#endif -/* - * move_tasks tries to move up to max_load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, -		      unsigned long max_load_move, -		      struct sched_domain *sd, enum cpu_idle_type idle, -		      int *lb_flags) +static unsigned long task_h_load(struct task_struct *p)  { -	unsigned long total_load_moved = 0, load_moved; - -	do { -		load_moved = load_balance_fair(this_rq, this_cpu, busiest, -				max_load_move - total_load_moved, -				sd, idle, lb_flags); - -		total_load_moved += load_moved; - -		if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) -			break; - -#ifdef CONFIG_PREEMPT -		/* -		 * NEWIDLE balancing is a source of latency, so preemptible -		 * kernels will stop after the first task is pulled to minimize -		 * the critical section. 
-		 */ -		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { -			*lb_flags |= LBF_ABORT; -			break; -		} -#endif -	} while (load_moved && max_load_move > total_load_moved); - -	return total_load_moved > 0; +	return p->se.load.weight;  } +#endif  /********** Helpers for find_busiest_group ************************/  /* @@ -3776,6 +3712,11 @@ void update_group_power(struct sched_domain *sd, int cpu)  	struct sched_domain *child = sd->child;  	struct sched_group *group, *sdg = sd->groups;  	unsigned long power; +	unsigned long interval; + +	interval = msecs_to_jiffies(sd->balance_interval); +	interval = clamp(interval, 1UL, max_load_balance_interval); +	sdg->sgp->next_update = jiffies + interval;  	if (!child) {  		update_cpu_power(sd, cpu); @@ -3883,12 +3824,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,  	 * domains. In the newly idle case, we will allow all the cpu's  	 * to do the newly idle load balance.  	 */ -	if (idle != CPU_NEWLY_IDLE && local_group) { -		if (balance_cpu != this_cpu) { -			*balance = 0; -			return; -		} -		update_group_power(sd, this_cpu); +	if (local_group) { +		if (idle != CPU_NEWLY_IDLE) { +			if (balance_cpu != this_cpu) { +				*balance = 0; +				return; +			} +			update_group_power(sd, this_cpu); +		} else if (time_after_eq(jiffies, group->sgp->next_update)) +			update_group_power(sd, this_cpu);  	}  	/* Adjust by relative CPU power of the group */ @@ -4451,13 +4395,21 @@ static int load_balance(int this_cpu, struct rq *this_rq,  			struct sched_domain *sd, enum cpu_idle_type idle,  			int *balance)  { -	int ld_moved, lb_flags = 0, active_balance = 0; +	int ld_moved, active_balance = 0;  	struct sched_group *group;  	unsigned long imbalance;  	struct rq *busiest;  	unsigned long flags;  	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); +	struct lb_env env = { +		.sd		= sd, +		.dst_cpu	= this_cpu, +		.dst_rq		= this_rq, +		.idle		= idle, +		.loop_break	= sysctl_sched_nr_migrate, +	}; +  	
cpumask_copy(cpus, cpu_active_mask);  	schedstat_inc(sd, lb_count[idle]); @@ -4492,32 +4444,34 @@ redo:  		 * still unbalanced. ld_moved simply stays zero, so it is  		 * correctly treated as an imbalance.  		 */ -		lb_flags |= LBF_ALL_PINNED; +		env.flags |= LBF_ALL_PINNED; +		env.load_move = imbalance; +		env.src_cpu = busiest->cpu; +		env.src_rq = busiest; +		env.loop_max = busiest->nr_running; + +more_balance:  		local_irq_save(flags);  		double_rq_lock(this_rq, busiest); -		ld_moved = move_tasks(this_rq, this_cpu, busiest, -				      imbalance, sd, idle, &lb_flags); +		if (!env.loop) +			update_h_load(env.src_cpu); +		ld_moved += move_tasks(&env);  		double_rq_unlock(this_rq, busiest);  		local_irq_restore(flags); +		if (env.flags & LBF_NEED_BREAK) { +			env.flags &= ~LBF_NEED_BREAK; +			goto more_balance; +		} +  		/*  		 * some other cpu did the load balance for us.  		 */  		if (ld_moved && this_cpu != smp_processor_id())  			resched_cpu(this_cpu); -		if (lb_flags & LBF_ABORT) -			goto out_balanced; - -		if (lb_flags & LBF_NEED_BREAK) { -			lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; -			if (lb_flags & LBF_ABORT) -				goto out_balanced; -			goto redo; -		} -  		/* All tasks on this runqueue were pinned by CPU affinity */ -		if (unlikely(lb_flags & LBF_ALL_PINNED)) { +		if (unlikely(env.flags & LBF_ALL_PINNED)) {  			cpumask_clear_cpu(cpu_of(busiest), cpus);  			if (!cpumask_empty(cpus))  				goto redo; @@ -4547,7 +4501,7 @@ redo:  					tsk_cpus_allowed(busiest->curr))) {  				raw_spin_unlock_irqrestore(&busiest->lock,  							    flags); -				lb_flags |= LBF_ALL_PINNED; +				env.flags |= LBF_ALL_PINNED;  				goto out_one_pinned;  			} @@ -4600,7 +4554,7 @@ out_balanced:  out_one_pinned:  	/* tune up the balancing interval */ -	if (((lb_flags & LBF_ALL_PINNED) && +	if (((env.flags & LBF_ALL_PINNED) &&  			sd->balance_interval < MAX_PINNED_INTERVAL) ||  			(sd->balance_interval < sd->max_interval))  		sd->balance_interval *= 2; @@ -4710,10 +4664,18 @@ static 
int active_load_balance_cpu_stop(void *data)  	}  	if (likely(sd)) { +		struct lb_env env = { +			.sd		= sd, +			.dst_cpu	= target_cpu, +			.dst_rq		= target_rq, +			.src_cpu	= busiest_rq->cpu, +			.src_rq		= busiest_rq, +			.idle		= CPU_IDLE, +		}; +  		schedstat_inc(sd, alb_count); -		if (move_one_task(target_rq, target_cpu, busiest_rq, -				  sd, CPU_IDLE)) +		if (move_one_task(&env))  			schedstat_inc(sd, alb_pushed);  		else  			schedstat_inc(sd, alb_failed); @@ -4945,8 +4907,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,  static DEFINE_SPINLOCK(balancing); -static unsigned long __read_mostly max_load_balance_interval = HZ/10; -  /*   * Scale the max load_balance interval with the number of CPUs in the system.   * This trades load-balance latency on larger machines for less cross talk. @@ -5340,7 +5300,6 @@ static void set_curr_task_fair(struct rq *rq)  void init_cfs_rq(struct cfs_rq *cfs_rq)  {  	cfs_rq->tasks_timeline = RB_ROOT; -	INIT_LIST_HEAD(&cfs_rq->tasks);  	cfs_rq->min_vruntime = (u64)(-(1LL << 20));  #ifndef CONFIG_64BIT  	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; @@ -5612,6 +5571,7 @@ __init void init_sched_fair_class(void)  	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);  #ifdef CONFIG_NO_HZ +	nohz.next_balance = jiffies;  	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);  	cpu_notifier(sched_ilb_notifier, 0);  #endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f42ae7fb5ec5..b60dad720173 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -778,12 +778,9 @@ static inline int balance_runtime(struct rt_rq *rt_rq)  static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)  { -	int i, idle = 1; +	int i, idle = 1, throttled = 0;  	const struct cpumask *span; -	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) -		return 1; -  	span = sched_rt_period_mask();  	for_each_cpu(i, span) {  		int enqueue = 0; @@ -818,12 +815,17 @@ static int 
do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)  			if (!rt_rq_throttled(rt_rq))  				enqueue = 1;  		} +		if (rt_rq->rt_throttled) +			throttled = 1;  		if (enqueue)  			sched_rt_rq_enqueue(rt_rq);  		raw_spin_unlock(&rq->lock);  	} +	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) +		return 1; +  	return idle;  } @@ -855,8 +857,30 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)  		return 0;  	if (rt_rq->rt_time > runtime) { -		rt_rq->rt_throttled = 1; -		printk_once(KERN_WARNING "sched: RT throttling activated\n"); +		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + +		/* +		 * Don't actually throttle groups that have no runtime assigned +		 * but accrue some time due to boosting. +		 */ +		if (likely(rt_b->rt_runtime)) { +			static bool once = false; + +			rt_rq->rt_throttled = 1; + +			if (!once) { +				once = true; +				printk_sched("sched: RT throttling activated\n"); +			} +		} else { +			/* +			 * In case we did anyway, make it go away, +			 * replenishment is a joke, since it will replenish us +			 * with exactly 0 ns. 
+			 */ +			rt_rq->rt_time = 0; +		} +  		if (rt_rq_throttled(rt_rq)) {  			sched_rt_rq_dequeue(rt_rq);  			return 1; @@ -884,7 +908,8 @@ static void update_curr_rt(struct rq *rq)  	if (unlikely((s64)delta_exec < 0))  		delta_exec = 0; -	schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); +	schedstat_set(curr->se.statistics.exec_max, +		      max(curr->se.statistics.exec_max, delta_exec));  	curr->se.sum_exec_runtime += delta_exec;  	account_group_exec_runtime(curr, delta_exec); @@ -1972,7 +1997,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)  	if (--p->rt.time_slice)  		return; -	p->rt.time_slice = DEF_TIMESLICE; +	p->rt.time_slice = RR_TIMESLICE;  	/*  	 * Requeue to the end of queue if we are not the only element @@ -2000,7 +2025,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)  	 * Time slice is 0 for SCHED_FIFO tasks  	 */  	if (task->policy == SCHED_RR) -		return DEF_TIMESLICE; +		return RR_TIMESLICE;  	else  		return 0;  } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..42b1f304b044 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -36,11 +36,7 @@ extern __read_mostly int scheduler_running;  /*   * These are the 'tuning knobs' of the scheduler: - * - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire.   */ -#define DEF_TIMESLICE		(100 * HZ / 1000)  /*   * single value that denotes runtime == period, ie unlimited time. @@ -216,9 +212,6 @@ struct cfs_rq {  	struct rb_root tasks_timeline;  	struct rb_node *rb_leftmost; -	struct list_head tasks; -	struct list_head *balance_iterator; -  	/*  	 * 'curr' points to currently running entity on this cfs_rq.  	 * It is set to NULL otherwise (i.e when none are currently running). 
@@ -246,11 +239,6 @@ struct cfs_rq {  #ifdef CONFIG_SMP  	/* -	 * the part of load.weight contributed by tasks -	 */ -	unsigned long task_weight; - -	/*  	 *   h_load = weight * f(tg)  	 *  	 * Where f(tg) is the recursive weight fraction assigned to @@ -424,6 +412,8 @@ struct rq {  	int cpu;  	int online; +	struct list_head cfs_tasks; +  	u64 rt_avg;  	u64 age_stamp;  	u64 idle_stamp; @@ -462,7 +452,6 @@ struct rq {  	unsigned int yld_count;  	/* schedule() stats */ -	unsigned int sched_switch;  	unsigned int sched_count;  	unsigned int sched_goidle; @@ -611,7 +600,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)   * Tunables that become constants when CONFIG_SCHED_DEBUG is off:   */  #ifdef CONFIG_SCHED_DEBUG -# include <linux/jump_label.h> +# include <linux/static_key.h>  # define const_debug __read_mostly  #else  # define const_debug const @@ -630,18 +619,18 @@ enum {  #undef SCHED_FEAT  #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct jump_label_key *key) +static __always_inline bool static_branch__true(struct static_key *key)  { -	return likely(static_branch(key)); /* Not out of line branch. */ +	return static_key_true(key); /* Not out of line branch. */  } -static __always_inline bool static_branch__false(struct jump_label_key *key) +static __always_inline bool static_branch__false(struct static_key *key)  { -	return unlikely(static_branch(key)); /* Out of line branch. */ +	return static_key_false(key); /* Out of line branch. 
*/  }  #define SCHED_FEAT(name, enabled)					\ -static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +static __always_inline bool static_branch_##name(struct static_key *key) \  {									\  	return static_branch__##enabled(key);				\  } @@ -650,7 +639,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \  #undef SCHED_FEAT -extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];  #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))  #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */  #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 2a581ba8e190..903ffa9e8872 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v)  		/* runqueue-specific stats */  		seq_printf(seq, -		    "cpu%d %u %u %u %u %u %u %llu %llu %lu", +		    "cpu%d %u 0 %u %u %u %u %llu %llu %lu",  		    cpu, rq->yld_count, -		    rq->sched_switch, rq->sched_count, rq->sched_goidle, +		    rq->sched_count, rq->sched_goidle,  		    rq->ttwu_count, rq->ttwu_local,  		    rq->rq_cpu_time,  		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); diff --git a/kernel/signal.c b/kernel/signal.c index c73c4284160e..d523da02dd14 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -58,21 +58,20 @@ static int sig_handler_ignored(void __user *handler, int sig)  		(handler == SIG_DFL && sig_kernel_ignore(sig));  } -static int sig_task_ignored(struct task_struct *t, int sig, -		int from_ancestor_ns) +static int sig_task_ignored(struct task_struct *t, int sig, bool force)  {  	void __user *handler;  	handler = sig_handler(t, sig);  	if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && -			handler == SIG_DFL && !from_ancestor_ns) +			handler == SIG_DFL && !force)  		return 1;  	return 
sig_handler_ignored(handler, sig);  } -static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) +static int sig_ignored(struct task_struct *t, int sig, bool force)  {  	/*  	 * Blocked signals are never ignored, since the @@ -82,7 +81,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)  	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))  		return 0; -	if (!sig_task_ignored(t, sig, from_ancestor_ns)) +	if (!sig_task_ignored(t, sig, force))  		return 0;  	/* @@ -855,7 +854,7 @@ static void ptrace_trap_notify(struct task_struct *t)   * Returns true if the signal should be actually delivered, otherwise   * it should be dropped.   */ -static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) +static int prepare_signal(int sig, struct task_struct *p, bool force)  {  	struct signal_struct *signal = p->signal;  	struct task_struct *t; @@ -915,7 +914,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)  		}  	} -	return !sig_ignored(p, sig, from_ancestor_ns); +	return !sig_ignored(p, sig, force);  }  /* @@ -1054,13 +1053,14 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,  	struct sigpending *pending;  	struct sigqueue *q;  	int override_rlimit; - -	trace_signal_generate(sig, info, t); +	int ret = 0, result;  	assert_spin_locked(&t->sighand->siglock); -	if (!prepare_signal(sig, t, from_ancestor_ns)) -		return 0; +	result = TRACE_SIGNAL_IGNORED; +	if (!prepare_signal(sig, t, +			from_ancestor_ns || (info == SEND_SIG_FORCED))) +		goto ret;  	pending = group ? &t->signal->shared_pending : &t->pending;  	/* @@ -1068,8 +1068,11 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,  	 * exactly one non-rt signal, so that we can get more  	 * detailed information about the cause of the signal.  	 
*/ +	result = TRACE_SIGNAL_ALREADY_PENDING;  	if (legacy_queue(pending, sig)) -		return 0; +		goto ret; + +	result = TRACE_SIGNAL_DELIVERED;  	/*  	 * fast-pathed signals for kernel-internal things like SIGSTOP  	 * or SIGKILL. @@ -1127,14 +1130,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,  			 * signal was rt and sent by user using something  			 * other than kill().  			 */ -			trace_signal_overflow_fail(sig, group, info); -			return -EAGAIN; +			result = TRACE_SIGNAL_OVERFLOW_FAIL; +			ret = -EAGAIN; +			goto ret;  		} else {  			/*  			 * This is a silent loss of information.  We still  			 * send the signal, but the *info bits are lost.  			 */ -			trace_signal_lose_info(sig, group, info); +			result = TRACE_SIGNAL_LOSE_INFO;  		}  	} @@ -1142,7 +1146,9 @@ out_set:  	signalfd_notify(t, sig);  	sigaddset(&pending->signal, sig);  	complete_signal(sig, t, group); -	return 0; +ret: +	trace_signal_generate(sig, info, t, group, result); +	return ret;  }  static int send_signal(int sig, struct siginfo *info, struct task_struct *t, @@ -1585,7 +1591,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)  	int sig = q->info.si_signo;  	struct sigpending *pending;  	unsigned long flags; -	int ret; +	int ret, result;  	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); @@ -1594,7 +1600,8 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)  		goto ret;  	ret = 1; /* the signal is ignored */ -	if (!prepare_signal(sig, t, 0)) +	result = TRACE_SIGNAL_IGNORED; +	if (!prepare_signal(sig, t, false))  		goto out;  	ret = 0; @@ -1605,6 +1612,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)  		 */  		BUG_ON(q->info.si_code != SI_TIMER);  		q->info.si_overrun++; +		result = TRACE_SIGNAL_ALREADY_PENDING;  		goto out;  	}  	q->info.si_overrun = 0; @@ -1614,7 +1622,9 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)  	list_add_tail(&q->list, &pending->list);  	
sigaddset(&pending->signal, sig);  	complete_signal(sig, t, group); +	result = TRACE_SIGNAL_DELIVERED;  out: +	trace_signal_generate(sig, &q->info, t, group, result);  	unlock_task_sighand(t, &flags);  ret:  	return ret; @@ -1642,6 +1652,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig)  	BUG_ON(!tsk->ptrace &&  	       (tsk->group_leader != tsk || !thread_group_empty(tsk))); +	if (sig != SIGCHLD) { +		/* +		 * This is only possible if parent == real_parent. +		 * Check if it has changed security domain. +		 */ +		if (tsk->parent_exec_id != tsk->parent->self_exec_id) +			sig = SIGCHLD; +	} +  	info.si_signo = sig;  	info.si_errno = 0;  	/* diff --git a/kernel/softirq.c b/kernel/softirq.c index 4eb3a0fa351e..671f9594e368 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -297,7 +297,7 @@ void irq_enter(void)  	int cpu = smp_processor_id();  	rcu_irq_enter(); -	if (idle_cpu(cpu) && !in_interrupt()) { +	if (is_idle_task(current) && !in_interrupt()) {  		/*  		 * Prevent raise_softirq from needlessly waking up ksoftirqd  		 * here, as softirq will be serviced on return from interrupt. @@ -310,31 +310,21 @@ void irq_enter(void)  	__irq_enter();  } -#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED  static inline void invoke_softirq(void)  { -	if (!force_irqthreads) +	if (!force_irqthreads) { +#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED  		__do_softirq(); -	else { -		__local_bh_disable((unsigned long)__builtin_return_address(0), -				SOFTIRQ_OFFSET); -		wakeup_softirqd(); -		__local_bh_enable(SOFTIRQ_OFFSET); -	} -}  #else -static inline void invoke_softirq(void) -{ -	if (!force_irqthreads)  		do_softirq(); -	else { +#endif +	} else {  		__local_bh_disable((unsigned long)__builtin_return_address(0),  				SOFTIRQ_OFFSET);  		wakeup_softirqd();  		__local_bh_enable(SOFTIRQ_OFFSET);  	}  } -#endif  /*   * Exit an interrupt context. 
Process softirqs if needed and possible: @@ -353,7 +343,7 @@ void irq_exit(void)  		tick_nohz_irq_exit();  #endif  	rcu_irq_exit(); -	preempt_enable_no_resched(); +	sched_preempt_enable_no_resched();  }  /* @@ -385,6 +375,12 @@ void raise_softirq(unsigned int nr)  	local_irq_restore(flags);  } +void __raise_softirq_irqoff(unsigned int nr) +{ +	trace_softirq_raise(nr); +	or_softirq_pending(1UL << nr); +} +  void open_softirq(int nr, void (*action)(struct softirq_action *))  {  	softirq_vec[nr].action = action; @@ -744,9 +740,7 @@ static int run_ksoftirqd(void * __bind_cpu)  	while (!kthread_should_stop()) {  		preempt_disable();  		if (!local_softirq_pending()) { -			preempt_enable_no_resched(); -			schedule(); -			preempt_disable(); +			schedule_preempt_disabled();  		}  		__set_current_state(TASK_RUNNING); @@ -761,7 +755,7 @@ static int run_ksoftirqd(void * __bind_cpu)  			if (local_softirq_pending())  				__do_softirq();  			local_irq_enable(); -			preempt_enable_no_resched(); +			sched_preempt_enable_no_resched();  			cond_resched();  			preempt_disable();  			rcu_note_context_switch((long)__bind_cpu); diff --git a/kernel/srcu.c b/kernel/srcu.c index 0febf61e1aa3..ba35f3a4a1f4 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -172,6 +172,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))  {  	int idx; +	rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && +			   !lock_is_held(&rcu_bh_lock_map) && +			   !lock_is_held(&rcu_lock_map) && +			   !lock_is_held(&rcu_sched_lock_map), +			   "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); +  	idx = sp->completed;  	mutex_lock(&sp->mutex); @@ -280,19 +286,26 @@ void synchronize_srcu(struct srcu_struct *sp)  EXPORT_SYMBOL_GPL(synchronize_srcu);  /** - * synchronize_srcu_expedited - like synchronize_srcu, but less patient + * synchronize_srcu_expedited - Brute-force SRCU grace period   * @sp: srcu_struct with which to synchronize.   
* - * Flip the completed counter, and wait for the old count to drain to zero. - * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates.  Can block; must be called from - * process context. + * Wait for an SRCU grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly.  This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code.  In fact, + * if you are using synchronize_srcu_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_srcu() instead.   * - * Note that it is illegal to call synchronize_srcu_expedited() - * from the corresponding SRCU read-side critical section; doing so - * will result in deadlock.  However, it is perfectly legal to call - * synchronize_srcu_expedited() on one srcu_struct from some other - * srcu_struct's read-side critical section. + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal + * to call this function from a CPU-hotplug notifier.  Failing to observe + * these restriction will result in deadlock.  It is also illegal to call + * synchronize_srcu_expedited() from the corresponding SRCU read-side + * critical section; doing so will result in deadlock.  However, it is + * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct + * from some other srcu_struct's read-side critical section, as long as + * the resulting graph of srcu_structs is acyclic.   
*/  void synchronize_srcu_expedited(struct srcu_struct *sp)  { diff --git a/kernel/sys.c b/kernel/sys.c index 40701538fbd1..9eb7fcab8df6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1706,7 +1706,7 @@ static int prctl_set_mm(int opt, unsigned long addr,  	if (arg4 | arg5)  		return -EINVAL; -	if (!capable(CAP_SYS_ADMIN)) +	if (!capable(CAP_SYS_RESOURCE))  		return -EPERM;  	if (addr >= TASK_SIZE) @@ -1962,6 +1962,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  		case PR_SET_MM:  			error = prctl_set_mm(arg2, arg3, arg4, arg5);  			break; +		case PR_SET_CHILD_SUBREAPER: +			me->signal->is_child_subreaper = !!arg2; +			error = 0; +			break; +		case PR_GET_CHILD_SUBREAPER: +			error = put_user(me->signal->is_child_subreaper, +					 (int __user *) arg2); +			break;  		default:  			error = -EINVAL;  			break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f487f257e05e..d48ff4fd44c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -58,6 +58,7 @@  #include <linux/oom.h>  #include <linux/kmod.h>  #include <linux/capability.h> +#include <linux/binfmts.h>  #include <asm/uaccess.h>  #include <asm/processor.h> @@ -192,20 +193,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,  #endif -static struct ctl_table root_table[]; -static struct ctl_table_root sysctl_table_root; -static struct ctl_table_header root_table_header = { -	{{.count = 1, -	.ctl_table = root_table, -	.ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, -	.root = &sysctl_table_root, -	.set = &sysctl_table_root.default_set, -}; -static struct ctl_table_root sysctl_table_root = { -	.root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), -	.default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), -}; -  static struct ctl_table kern_table[];  static struct ctl_table vm_table[];  static struct ctl_table fs_table[]; @@ -222,7 +209,7 @@ int sysctl_legacy_va_layout;  /* The default sysctl tables: */ -static struct ctl_table 
root_table[] = { +static struct ctl_table sysctl_base_table[] = {  	{  		.procname	= "kernel",  		.mode		= 0555, @@ -1559,490 +1546,12 @@ static struct ctl_table dev_table[] = {  	{ }  }; -static DEFINE_SPINLOCK(sysctl_lock); - -/* called under sysctl_lock */ -static int use_table(struct ctl_table_header *p) -{ -	if (unlikely(p->unregistering)) -		return 0; -	p->used++; -	return 1; -} - -/* called under sysctl_lock */ -static void unuse_table(struct ctl_table_header *p) -{ -	if (!--p->used) -		if (unlikely(p->unregistering)) -			complete(p->unregistering); -} - -/* called under sysctl_lock, will reacquire if has to wait */ -static void start_unregistering(struct ctl_table_header *p) -{ -	/* -	 * if p->used is 0, nobody will ever touch that entry again; -	 * we'll eliminate all paths to it before dropping sysctl_lock -	 */ -	if (unlikely(p->used)) { -		struct completion wait; -		init_completion(&wait); -		p->unregistering = &wait; -		spin_unlock(&sysctl_lock); -		wait_for_completion(&wait); -		spin_lock(&sysctl_lock); -	} else { -		/* anything non-NULL; we'll never dereference it */ -		p->unregistering = ERR_PTR(-EINVAL); -	} -	/* -	 * do not remove from the list until nobody holds it; walking the -	 * list in do_sysctl() relies on that. 
-	 */ -	list_del_init(&p->ctl_entry); -} - -void sysctl_head_get(struct ctl_table_header *head) -{ -	spin_lock(&sysctl_lock); -	head->count++; -	spin_unlock(&sysctl_lock); -} - -void sysctl_head_put(struct ctl_table_header *head) -{ -	spin_lock(&sysctl_lock); -	if (!--head->count) -		kfree_rcu(head, rcu); -	spin_unlock(&sysctl_lock); -} - -struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) -{ -	if (!head) -		BUG(); -	spin_lock(&sysctl_lock); -	if (!use_table(head)) -		head = ERR_PTR(-ENOENT); -	spin_unlock(&sysctl_lock); -	return head; -} - -void sysctl_head_finish(struct ctl_table_header *head) -{ -	if (!head) -		return; -	spin_lock(&sysctl_lock); -	unuse_table(head); -	spin_unlock(&sysctl_lock); -} - -static struct ctl_table_set * -lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) -{ -	struct ctl_table_set *set = &root->default_set; -	if (root->lookup) -		set = root->lookup(root, namespaces); -	return set; -} - -static struct list_head * -lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) -{ -	struct ctl_table_set *set = lookup_header_set(root, namespaces); -	return &set->list; -} - -struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, -					    struct ctl_table_header *prev) -{ -	struct ctl_table_root *root; -	struct list_head *header_list; -	struct ctl_table_header *head; -	struct list_head *tmp; - -	spin_lock(&sysctl_lock); -	if (prev) { -		head = prev; -		tmp = &prev->ctl_entry; -		unuse_table(prev); -		goto next; -	} -	tmp = &root_table_header.ctl_entry; -	for (;;) { -		head = list_entry(tmp, struct ctl_table_header, ctl_entry); - -		if (!use_table(head)) -			goto next; -		spin_unlock(&sysctl_lock); -		return head; -	next: -		root = head->root; -		tmp = tmp->next; -		header_list = lookup_header_list(root, namespaces); -		if (tmp != header_list) -			continue; - -		do { -			root = list_entry(root->root_list.next, -					struct ctl_table_root, root_list); -			if (root == 
&sysctl_table_root) -				goto out; -			header_list = lookup_header_list(root, namespaces); -		} while (list_empty(header_list)); -		tmp = header_list->next; -	} -out: -	spin_unlock(&sysctl_lock); -	return NULL; -} - -struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) -{ -	return __sysctl_head_next(current->nsproxy, prev); -} - -void register_sysctl_root(struct ctl_table_root *root) -{ -	spin_lock(&sysctl_lock); -	list_add_tail(&root->root_list, &sysctl_table_root.root_list); -	spin_unlock(&sysctl_lock); -} - -/* - * sysctl_perm does NOT grant the superuser all rights automatically, because - * some sysctl variables are readonly even to root. - */ - -static int test_perm(int mode, int op) -{ -	if (!current_euid()) -		mode >>= 6; -	else if (in_egroup_p(0)) -		mode >>= 3; -	if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) -		return 0; -	return -EACCES; -} - -int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) -{ -	int mode; - -	if (root->permissions) -		mode = root->permissions(root, current->nsproxy, table); -	else -		mode = table->mode; - -	return test_perm(mode, op); -} - -static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) -{ -	for (; table->procname; table++) { -		table->parent = parent; -		if (table->child) -			sysctl_set_parent(table, table->child); -	} -} - -static __init int sysctl_init(void) +int __init sysctl_init(void)  { -	sysctl_set_parent(NULL, root_table); -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK -	sysctl_check_table(current->nsproxy, root_table); -#endif +	register_sysctl_table(sysctl_base_table);  	return 0;  } -core_initcall(sysctl_init); - -static struct ctl_table *is_branch_in(struct ctl_table *branch, -				      struct ctl_table *table) -{ -	struct ctl_table *p; -	const char *s = branch->procname; - -	/* branch should have named subdirectory as its first element */ -	if (!s || !branch->child) -		return NULL; - -	/* ... 
and nothing else */ -	if (branch[1].procname) -		return NULL; - -	/* table should contain subdirectory with the same name */ -	for (p = table; p->procname; p++) { -		if (!p->child) -			continue; -		if (p->procname && strcmp(p->procname, s) == 0) -			return p; -	} -	return NULL; -} - -/* see if attaching q to p would be an improvement */ -static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) -{ -	struct ctl_table *to = p->ctl_table, *by = q->ctl_table; -	struct ctl_table *next; -	int is_better = 0; -	int not_in_parent = !p->attached_by; - -	while ((next = is_branch_in(by, to)) != NULL) { -		if (by == q->attached_by) -			is_better = 1; -		if (to == p->attached_by) -			not_in_parent = 1; -		by = by->child; -		to = next->child; -	} - -	if (is_better && not_in_parent) { -		q->attached_by = by; -		q->attached_to = to; -		q->parent = p; -	} -} - -/** - * __register_sysctl_paths - register a sysctl hierarchy - * @root: List of sysctl headers to register on - * @namespaces: Data to compute which lists of sysctl entries are visible - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * The members of the &struct ctl_table structure are used as follows: - * - * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not - *            enter a sysctl file - * - * data - a pointer to data for use by proc_handler - * - * maxlen - the maximum size in bytes of the data - * - * mode - the file permissions for the /proc/sys file, and for sysctl(2) - * - * child - a pointer to the child sysctl table if this entry is a directory, or - *         %NULL. 
- * - * proc_handler - the text handler routine (described below) - * - * de - for internal use by the sysctl routines - * - * extra1, extra2 - extra pointers usable by the proc handler routines - * - * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. - * - * sysctl(2) can automatically manage read and write requests through - * the sysctl table.  The data and maxlen fields of the ctl_table - * struct enable minimal validation of the values being written to be - * performed, and the mode field allows minimal authentication. - * - * There must be a proc_handler routine for any terminal nodes - * mirrored under /proc/sys (non-terminals are handled by a built-in - * directory handler).  Several default handlers are available to - * cover common cases - - * - * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), - * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),  - * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() - * - * It is the handler's job to read the input buffer from user memory - * and process it. The handler should return 0 on success. - * - * This routine returns %NULL on a failure to register, and a pointer - * to the table header on success. - */ -struct ctl_table_header *__register_sysctl_paths( -	struct ctl_table_root *root, -	struct nsproxy *namespaces, -	const struct ctl_path *path, struct ctl_table *table) -{ -	struct ctl_table_header *header; -	struct ctl_table *new, **prevp; -	unsigned int n, npath; -	struct ctl_table_set *set; - -	/* Count the path components */ -	for (npath = 0; path[npath].procname; ++npath) -		; - -	/* -	 * For each path component, allocate a 2-element ctl_table array. -	 * The first array element will be filled with the sysctl entry -	 * for this, the second will be the sentinel (procname == 0). 
-	 * -	 * We allocate everything in one go so that we don't have to -	 * worry about freeing additional memory in unregister_sysctl_table. -	 */ -	header = kzalloc(sizeof(struct ctl_table_header) + -			 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); -	if (!header) -		return NULL; - -	new = (struct ctl_table *) (header + 1); - -	/* Now connect the dots */ -	prevp = &header->ctl_table; -	for (n = 0; n < npath; ++n, ++path) { -		/* Copy the procname */ -		new->procname = path->procname; -		new->mode     = 0555; - -		*prevp = new; -		prevp = &new->child; - -		new += 2; -	} -	*prevp = table; -	header->ctl_table_arg = table; - -	INIT_LIST_HEAD(&header->ctl_entry); -	header->used = 0; -	header->unregistering = NULL; -	header->root = root; -	sysctl_set_parent(NULL, header->ctl_table); -	header->count = 1; -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK -	if (sysctl_check_table(namespaces, header->ctl_table)) { -		kfree(header); -		return NULL; -	} -#endif -	spin_lock(&sysctl_lock); -	header->set = lookup_header_set(root, namespaces); -	header->attached_by = header->ctl_table; -	header->attached_to = root_table; -	header->parent = &root_table_header; -	for (set = header->set; set; set = set->parent) { -		struct ctl_table_header *p; -		list_for_each_entry(p, &set->list, ctl_entry) { -			if (p->unregistering) -				continue; -			try_attach(p, header); -		} -	} -	header->parent->count++; -	list_add_tail(&header->ctl_entry, &header->set->list); -	spin_unlock(&sysctl_lock); - -	return header; -} - -/** - * register_sysctl_table_path - register a sysctl table hierarchy - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_paths for more details. 
- */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, -						struct ctl_table *table) -{ -	return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, -					path, table); -} - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ -	static const struct ctl_path null_path[] = { {} }; - -	return register_sysctl_paths(null_path, table); -} - -/** - * unregister_sysctl_table - unregister a sysctl table hierarchy - * @header: the header returned from register_sysctl_table - * - * Unregisters the sysctl table and all children. proc entries may not - * actually be removed until they are no longer used by anyone. - */ -void unregister_sysctl_table(struct ctl_table_header * header) -{ -	might_sleep(); - -	if (header == NULL) -		return; - -	spin_lock(&sysctl_lock); -	start_unregistering(header); -	if (!--header->parent->count) { -		WARN_ON(1); -		kfree_rcu(header->parent, rcu); -	} -	if (!--header->count) -		kfree_rcu(header, rcu); -	spin_unlock(&sysctl_lock); -} - -int sysctl_is_seen(struct ctl_table_header *p) -{ -	struct ctl_table_set *set = p->set; -	int res; -	spin_lock(&sysctl_lock); -	if (p->unregistering) -		res = 0; -	else if (!set->is_seen) -		res = 1; -	else -		res = set->is_seen(set); -	spin_unlock(&sysctl_lock); -	return res; -} - -void setup_sysctl_set(struct ctl_table_set *p, -	struct ctl_table_set *parent, -	int (*is_seen)(struct ctl_table_set *)) -{ -	INIT_LIST_HEAD(&p->list); -	p->parent = parent ? 
parent : &sysctl_table_root.default_set; -	p->is_seen = is_seen; -} - -#else /* !CONFIG_SYSCTL */ -struct ctl_table_header *register_sysctl_table(struct ctl_table * table) -{ -	return NULL; -} - -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, -						    struct ctl_table *table) -{ -	return NULL; -} - -void unregister_sysctl_table(struct ctl_table_header * table) -{ -} - -void setup_sysctl_set(struct ctl_table_set *p, -	struct ctl_table_set *parent, -	int (*is_seen)(struct ctl_table_set *)) -{ -} - -void sysctl_head_put(struct ctl_table_header *head) -{ -} -  #endif /* CONFIG_SYSCTL */  /* @@ -3008,6 +2517,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);  EXPORT_SYMBOL(proc_dostring);  EXPORT_SYMBOL(proc_doulongvec_minmax);  EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); -EXPORT_SYMBOL(register_sysctl_table); -EXPORT_SYMBOL(register_sysctl_paths); -EXPORT_SYMBOL(unregister_sysctl_table); diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c deleted file mode 100644 index 362da653813d..000000000000 --- a/kernel/sysctl_check.c +++ /dev/null @@ -1,160 +0,0 @@ -#include <linux/stat.h> -#include <linux/sysctl.h> -#include "../fs/xfs/xfs_sysctl.h" -#include <linux/sunrpc/debug.h> -#include <linux/string.h> -#include <net/ip_vs.h> - - -static int sysctl_depth(struct ctl_table *table) -{ -	struct ctl_table *tmp; -	int depth; - -	depth = 0; -	for (tmp = table; tmp->parent; tmp = tmp->parent) -		depth++; - -	return depth; -} - -static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) -{ -	int i; - -	for (i = 0; table && i < n; i++) -		table = table->parent; - -	return table; -} - - -static void sysctl_print_path(struct ctl_table *table) -{ -	struct ctl_table *tmp; -	int depth, i; -	depth = sysctl_depth(table); -	if (table->procname) { -		for (i = depth; i >= 0; i--) { -			tmp = sysctl_parent(table, i); -			printk("/%s", tmp->procname?tmp->procname:""); -		} -	} -	printk(" "); -} - -static struct ctl_table 
*sysctl_check_lookup(struct nsproxy *namespaces, -						struct ctl_table *table) -{ -	struct ctl_table_header *head; -	struct ctl_table *ref, *test; -	int depth, cur_depth; - -	depth = sysctl_depth(table); - -	for (head = __sysctl_head_next(namespaces, NULL); head; -	     head = __sysctl_head_next(namespaces, head)) { -		cur_depth = depth; -		ref = head->ctl_table; -repeat: -		test = sysctl_parent(table, cur_depth); -		for (; ref->procname; ref++) { -			int match = 0; -			if (cur_depth && !ref->child) -				continue; - -			if (test->procname && ref->procname && -			    (strcmp(test->procname, ref->procname) == 0)) -					match++; - -			if (match) { -				if (cur_depth != 0) { -					cur_depth--; -					ref = ref->child; -					goto repeat; -				} -				goto out; -			} -		} -	} -	ref = NULL; -out: -	sysctl_head_finish(head); -	return ref; -} - -static void set_fail(const char **fail, struct ctl_table *table, const char *str) -{ -	if (*fail) { -		printk(KERN_ERR "sysctl table check failed: "); -		sysctl_print_path(table); -		printk(" %s\n", *fail); -		dump_stack(); -	} -	*fail = str; -} - -static void sysctl_check_leaf(struct nsproxy *namespaces, -				struct ctl_table *table, const char **fail) -{ -	struct ctl_table *ref; - -	ref = sysctl_check_lookup(namespaces, table); -	if (ref && (ref != table)) -		set_fail(fail, table, "Sysctl already exists"); -} - -int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) -{ -	int error = 0; -	for (; table->procname; table++) { -		const char *fail = NULL; - -		if (table->parent) { -			if (!table->parent->procname) -				set_fail(&fail, table, "Parent without procname"); -		} -		if (table->child) { -			if (table->data) -				set_fail(&fail, table, "Directory with data?"); -			if (table->maxlen) -				set_fail(&fail, table, "Directory with maxlen?"); -			if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) -				set_fail(&fail, table, "Writable sysctl directory"); -			if (table->proc_handler) -				set_fail(&fail, table, 
"Directory with proc_handler"); -			if (table->extra1) -				set_fail(&fail, table, "Directory with extra1"); -			if (table->extra2) -				set_fail(&fail, table, "Directory with extra2"); -		} else { -			if ((table->proc_handler == proc_dostring) || -			    (table->proc_handler == proc_dointvec) || -			    (table->proc_handler == proc_dointvec_minmax) || -			    (table->proc_handler == proc_dointvec_jiffies) || -			    (table->proc_handler == proc_dointvec_userhz_jiffies) || -			    (table->proc_handler == proc_dointvec_ms_jiffies) || -			    (table->proc_handler == proc_doulongvec_minmax) || -			    (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { -				if (!table->data) -					set_fail(&fail, table, "No data"); -				if (!table->maxlen) -					set_fail(&fail, table, "No maxlen"); -			} -#ifdef CONFIG_PROC_SYSCTL -			if (!table->proc_handler) -				set_fail(&fail, table, "No proc_handler"); -#endif -			sysctl_check_leaf(namespaces, table, &fail); -		} -		if (table->mode > 0777) -			set_fail(&fail, table, "bogus .mode"); -		if (fail) { -			set_fail(&fail, table, NULL); -			error = -EINVAL; -		} -		if (table->child) -			error |= sysctl_check_table(namespaces, table->child); -	} -	return error; -} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f6117a4c7cb8..6e039b144daf 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -22,13 +22,16 @@   * NTP timekeeping variables:   */ +DEFINE_SPINLOCK(ntp_lock); + +  /* USER_HZ period (usecs): */  unsigned long			tick_usec = TICK_USEC;  /* ACTHZ period (nsecs): */  unsigned long			tick_nsec; -u64				tick_length; +static u64			tick_length;  static u64			tick_length_base;  static struct hrtimer		leap_timer; @@ -49,7 +52,7 @@ static struct hrtimer		leap_timer;  static int			time_state = TIME_OK;  /* clock status bits:							*/ -int				time_status = STA_UNSYNC; +static int			time_status = STA_UNSYNC;  /* TAI offset (secs):							*/  static long			time_tai; @@ -133,7 +136,7 @@ static inline void 
pps_reset_freq_interval(void)  /**   * pps_clear - Clears the PPS state variables   * - * Must be called while holding a write on the xtime_lock + * Must be called while holding a write on the ntp_lock   */  static inline void pps_clear(void)  { @@ -149,7 +152,7 @@ static inline void pps_clear(void)   * the last PPS signal. When it reaches 0, indicate that PPS signal is   * missing.   * - * Must be called while holding a write on the xtime_lock + * Must be called while holding a write on the ntp_lock   */  static inline void pps_dec_valid(void)  { @@ -233,6 +236,17 @@ static inline void pps_fill_timex(struct timex *txc)  #endif /* CONFIG_NTP_PPS */ + +/** + * ntp_synced - Returns 1 if the NTP status is not UNSYNC + * + */ +static inline int ntp_synced(void) +{ +	return !(time_status & STA_UNSYNC); +} + +  /*   * NTP methods:   */ @@ -275,7 +289,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs)  	time_status |= STA_MODE; -	return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); +	return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);  }  static void ntp_update_offset(long offset) @@ -330,11 +344,13 @@ static void ntp_update_offset(long offset)  /**   * ntp_clear - Clears the NTP state variables - * - * Must be called while holding a write on the xtime_lock   */  void ntp_clear(void)  { +	unsigned long flags; + +	spin_lock_irqsave(&ntp_lock, flags); +  	time_adjust	= 0;		/* stop active adjtime() */  	time_status	|= STA_UNSYNC;  	time_maxerror	= NTP_PHASE_LIMIT; @@ -347,8 +363,23 @@ void ntp_clear(void)  	/* Clear PPS state variables */  	pps_clear(); +	spin_unlock_irqrestore(&ntp_lock, flags); +  } + +u64 ntp_tick_length(void) +{ +	unsigned long flags; +	s64 ret; + +	spin_lock_irqsave(&ntp_lock, flags); +	ret = tick_length; +	spin_unlock_irqrestore(&ntp_lock, flags); +	return ret; +} + +  /*   * Leap second processing. 
If in leap-insert state at the end of the   * day, the system clock is set back one second; if in leap-delete @@ -357,14 +388,15 @@ void ntp_clear(void)  static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)  {  	enum hrtimer_restart res = HRTIMER_NORESTART; +	unsigned long flags; +	int leap = 0; -	write_seqlock(&xtime_lock); - +	spin_lock_irqsave(&ntp_lock, flags);  	switch (time_state) {  	case TIME_OK:  		break;  	case TIME_INS: -		timekeeping_leap_insert(-1); +		leap = -1;  		time_state = TIME_OOP;  		printk(KERN_NOTICE  			"Clock: inserting leap second 23:59:60 UTC\n"); @@ -372,7 +404,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)  		res = HRTIMER_RESTART;  		break;  	case TIME_DEL: -		timekeeping_leap_insert(1); +		leap = 1;  		time_tai--;  		time_state = TIME_WAIT;  		printk(KERN_NOTICE @@ -387,8 +419,14 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)  			time_state = TIME_OK;  		break;  	} +	spin_unlock_irqrestore(&ntp_lock, flags); -	write_sequnlock(&xtime_lock); +	/* +	 * We have to call this outside of the ntp_lock to keep +	 * the proper locking hierarchy +	 */ +	if (leap) +		timekeeping_leap_insert(leap);  	return res;  } @@ -404,6 +442,9 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)  void second_overflow(void)  {  	s64 delta; +	unsigned long flags; + +	spin_lock_irqsave(&ntp_lock, flags);  	/* Bump the maxerror field */  	time_maxerror += MAXFREQ / NSEC_PER_USEC; @@ -423,23 +464,25 @@ void second_overflow(void)  	pps_dec_valid();  	if (!time_adjust) -		return; +		goto out;  	if (time_adjust > MAX_TICKADJ) {  		time_adjust -= MAX_TICKADJ;  		tick_length += MAX_TICKADJ_SCALED; -		return; +		goto out;  	}  	if (time_adjust < -MAX_TICKADJ) {  		time_adjust += MAX_TICKADJ;  		tick_length -= MAX_TICKADJ_SCALED; -		return; +		goto out;  	}  	tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)  							 << NTP_SCALE_SHIFT;  	time_adjust = 0; +out: +	
spin_unlock_irqrestore(&ntp_lock, flags);  }  #ifdef CONFIG_GENERIC_CMOS_UPDATE @@ -663,7 +706,7 @@ int do_adjtimex(struct timex *txc)  	getnstimeofday(&ts); -	write_seqlock_irq(&xtime_lock); +	spin_lock_irq(&ntp_lock);  	if (txc->modes & ADJ_ADJTIME) {  		long save_adjust = time_adjust; @@ -705,7 +748,7 @@ int do_adjtimex(struct timex *txc)  	/* fill PPS status fields */  	pps_fill_timex(txc); -	write_sequnlock_irq(&xtime_lock); +	spin_unlock_irq(&ntp_lock);  	txc->time.tv_sec = ts.tv_sec;  	txc->time.tv_usec = ts.tv_nsec; @@ -903,7 +946,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  	pts_norm = pps_normalize_ts(*phase_ts); -	write_seqlock_irqsave(&xtime_lock, flags); +	spin_lock_irqsave(&ntp_lock, flags);  	/* clear the error bits, they will be set again if needed */  	time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -916,7 +959,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  	 * just start the frequency interval */  	if (unlikely(pps_fbase.tv_sec == 0)) {  		pps_fbase = *raw_ts; -		write_sequnlock_irqrestore(&xtime_lock, flags); +		spin_unlock_irqrestore(&ntp_lock, flags);  		return;  	} @@ -931,7 +974,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  		time_status |= STA_PPSJITTER;  		/* restart the frequency calibration interval */  		pps_fbase = *raw_ts; -		write_sequnlock_irqrestore(&xtime_lock, flags); +		spin_unlock_irqrestore(&ntp_lock, flags);  		pr_err("hardpps: PPSJITTER: bad pulse\n");  		return;  	} @@ -948,7 +991,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  	hardpps_update_phase(pts_norm.nsec); -	write_sequnlock_irqrestore(&xtime_lock, flags); +	spin_unlock_irqrestore(&ntp_lock, flags);  }  EXPORT_SYMBOL(hardpps); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index fd4a7b1625a2..e883f57a3cd3 100644 --- a/kernel/time/tick-broadcast.c +++ 
b/kernel/time/tick-broadcast.c @@ -575,11 +575,15 @@ void tick_broadcast_switch_to_oneshot(void)  	unsigned long flags;  	raw_spin_lock_irqsave(&tick_broadcast_lock, flags); +	if (cpumask_empty(tick_get_broadcast_mask())) +		goto end;  	tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;  	bc = tick_broadcast_device.evtdev;  	if (bc)  		tick_broadcast_setup_oneshot(bc); + +end:  	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);  } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7656642e4b8e..3526038f2836 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -182,11 +182,7 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)  static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)  { -	ktime_t now; - -	now = ktime_get(); - -	update_ts_time_stats(cpu, ts, now, NULL); +	ktime_t now = ktime_get();  	ts->idle_entrytime = now;  	ts->idle_active = 1; @@ -562,20 +558,21 @@ void tick_nohz_idle_exit(void)  	local_irq_disable(); -	if (ts->idle_active || (ts->inidle && ts->tick_stopped)) +	WARN_ON_ONCE(!ts->inidle); + +	ts->inidle = 0; + +	if (ts->idle_active || ts->tick_stopped)  		now = ktime_get();  	if (ts->idle_active)  		tick_nohz_stop_idle(cpu, now); -	if (!ts->inidle || !ts->tick_stopped) { -		ts->inidle = 0; +	if (!ts->tick_stopped) {  		local_irq_enable();  		return;  	} -	ts->inidle = 0; -  	/* Update jiffies first */  	select_nohz_load_balancer(0);  	tick_do_update_jiffies64(now); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0c6358186401..15be32e19c6e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,6 +25,8 @@  struct timekeeper {  	/* Current clocksource used for timekeeping. */  	struct clocksource *clock; +	/* NTP adjusted clock multiplier */ +	u32	mult;  	/* The shift value of the current clocksource. 
*/  	int	shift; @@ -45,12 +47,47 @@ struct timekeeper {  	/* Shift conversion between clock shifted nano seconds and  	 * ntp shifted nano seconds. */  	int	ntp_error_shift; -	/* NTP adjusted clock multiplier */ -	u32	mult; + +	/* The current time */ +	struct timespec xtime; +	/* +	 * wall_to_monotonic is what we need to add to xtime (or xtime corrected +	 * for sub jiffie times) to get to monotonic time.  Monotonic is pegged +	 * at zero at system boot time, so wall_to_monotonic will be negative, +	 * however, we will ALWAYS keep the tv_nsec part positive so we can use +	 * the usual normalization. +	 * +	 * wall_to_monotonic is moved after resume from suspend for the +	 * monotonic time not to jump. We need to add total_sleep_time to +	 * wall_to_monotonic to get the real boot based time offset. +	 * +	 * - wall_to_monotonic is no longer the boot time, getboottime must be +	 * used instead. +	 */ +	struct timespec wall_to_monotonic; +	/* time spent in suspend */ +	struct timespec total_sleep_time; +	/* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ +	struct timespec raw_time; + +	/* Seqlock for all timekeeper values */ +	seqlock_t lock;  };  static struct timekeeper timekeeper; +/* + * This read-write spinlock protects us from races in SMP while + * playing with xtime. + */ +__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); + + +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + + +  /**   * timekeeper_setup_internals - Set up internals to use clocksource clock.   * @@ -135,47 +172,28 @@ static inline s64 timekeeping_get_ns_raw(void)  	return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);  } -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime. 
- */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - - -/* - * The current time - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time.  Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - * - * wall_to_monotonic is moved after resume from suspend for the monotonic - * time not to jump. We need to add total_sleep_time to wall_to_monotonic - * to get the real boot based time offset. - * - * - wall_to_monotonic is no longer the boot time, getboottime must be - * used instead. - */ -static struct timespec xtime __attribute__ ((aligned (16))); -static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static struct timespec total_sleep_time; - -/* - * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. - */ -static struct timespec raw_time; +/* must hold write on timekeeper.lock */ +static void timekeeping_update(bool clearntp) +{ +	if (clearntp) { +		timekeeper.ntp_error = 0; +		ntp_clear(); +	} +	update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, +			 timekeeper.clock, timekeeper.mult); +} -/* flag for if timekeeping is suspended */ -int __read_mostly timekeeping_suspended; -/* must hold xtime_lock */  void timekeeping_leap_insert(int leapsecond)  { -	xtime.tv_sec += leapsecond; -	wall_to_monotonic.tv_sec -= leapsecond; -	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, -			timekeeper.mult); +	unsigned long flags; + +	write_seqlock_irqsave(&timekeeper.lock, flags); +	timekeeper.xtime.tv_sec += leapsecond; +	timekeeper.wall_to_monotonic.tv_sec -= leapsecond; +	timekeeping_update(false); +	write_sequnlock_irqrestore(&timekeeper.lock, flags); +  }  /** @@ -202,10 +220,10 @@ static void timekeeping_forward_now(void)  	/* If arch requires, add in gettimeoffset() */  	nsec += arch_gettimeoffset(); 
-	timespec_add_ns(&xtime, nsec); +	timespec_add_ns(&timekeeper.xtime, nsec);  	nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); -	timespec_add_ns(&raw_time, nsec); +	timespec_add_ns(&timekeeper.raw_time, nsec);  }  /** @@ -222,15 +240,15 @@ void getnstimeofday(struct timespec *ts)  	WARN_ON(timekeeping_suspended);  	do { -		seq = read_seqbegin(&xtime_lock); +		seq = read_seqbegin(&timekeeper.lock); -		*ts = xtime; +		*ts = timekeeper.xtime;  		nsecs = timekeeping_get_ns();  		/* If arch requires, add in gettimeoffset() */  		nsecs += arch_gettimeoffset(); -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqretry(&timekeeper.lock, seq));  	timespec_add_ns(ts, nsecs);  } @@ -245,14 +263,16 @@ ktime_t ktime_get(void)  	WARN_ON(timekeeping_suspended);  	do { -		seq = read_seqbegin(&xtime_lock); -		secs = xtime.tv_sec + wall_to_monotonic.tv_sec; -		nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; +		seq = read_seqbegin(&timekeeper.lock); +		secs = timekeeper.xtime.tv_sec + +				timekeeper.wall_to_monotonic.tv_sec; +		nsecs = timekeeper.xtime.tv_nsec + +				timekeeper.wall_to_monotonic.tv_nsec;  		nsecs += timekeeping_get_ns();  		/* If arch requires, add in gettimeoffset() */  		nsecs += arch_gettimeoffset(); -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqretry(&timekeeper.lock, seq));  	/*  	 * Use ktime_set/ktime_add_ns to create a proper ktime on  	 * 32-bit architectures without CONFIG_KTIME_SCALAR. 
@@ -278,14 +298,14 @@ void ktime_get_ts(struct timespec *ts)  	WARN_ON(timekeeping_suspended);  	do { -		seq = read_seqbegin(&xtime_lock); -		*ts = xtime; -		tomono = wall_to_monotonic; +		seq = read_seqbegin(&timekeeper.lock); +		*ts = timekeeper.xtime; +		tomono = timekeeper.wall_to_monotonic;  		nsecs = timekeeping_get_ns();  		/* If arch requires, add in gettimeoffset() */  		nsecs += arch_gettimeoffset(); -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqretry(&timekeeper.lock, seq));  	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,  				ts->tv_nsec + tomono.tv_nsec + nsecs); @@ -313,10 +333,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)  	do {  		u32 arch_offset; -		seq = read_seqbegin(&xtime_lock); +		seq = read_seqbegin(&timekeeper.lock); -		*ts_raw = raw_time; -		*ts_real = xtime; +		*ts_raw = timekeeper.raw_time; +		*ts_real = timekeeper.xtime;  		nsecs_raw = timekeeping_get_ns_raw();  		nsecs_real = timekeeping_get_ns(); @@ -326,7 +346,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)  		nsecs_raw += arch_offset;  		nsecs_real += arch_offset; -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqretry(&timekeeper.lock, seq));  	timespec_add_ns(ts_raw, nsecs_raw);  	timespec_add_ns(ts_real, nsecs_real); @@ -365,23 +385,19 @@ int do_settimeofday(const struct timespec *tv)  	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)  		return -EINVAL; -	write_seqlock_irqsave(&xtime_lock, flags); +	write_seqlock_irqsave(&timekeeper.lock, flags);  	timekeeping_forward_now(); -	ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; -	ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; -	wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); - -	xtime = *tv; +	ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec; +	ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec; +	timekeeper.wall_to_monotonic = +			timespec_sub(timekeeper.wall_to_monotonic, ts_delta); -	
timekeeper.ntp_error = 0; -	ntp_clear(); - -	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, -				timekeeper.mult); +	timekeeper.xtime = *tv; +	timekeeping_update(true); -	write_sequnlock_irqrestore(&xtime_lock, flags); +	write_sequnlock_irqrestore(&timekeeper.lock, flags);  	/* signal hrtimers about time change */  	clock_was_set(); @@ -405,20 +421,17 @@ int timekeeping_inject_offset(struct timespec *ts)  	if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)  		return -EINVAL; -	write_seqlock_irqsave(&xtime_lock, flags); +	write_seqlock_irqsave(&timekeeper.lock, flags);  	timekeeping_forward_now(); -	xtime = timespec_add(xtime, *ts); -	wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); - -	timekeeper.ntp_error = 0; -	ntp_clear(); +	timekeeper.xtime = timespec_add(timekeeper.xtime, *ts); +	timekeeper.wall_to_monotonic = +				timespec_sub(timekeeper.wall_to_monotonic, *ts); -	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, -				timekeeper.mult); +	timekeeping_update(true); -	write_sequnlock_irqrestore(&xtime_lock, flags); +	write_sequnlock_irqrestore(&timekeeper.lock, flags);  	/* signal hrtimers about time change */  	clock_was_set(); @@ -490,11 +503,11 @@ void getrawmonotonic(struct timespec *ts)  	s64 nsecs;  	do { -		seq = read_seqbegin(&xtime_lock); +		seq = read_seqbegin(&timekeeper.lock);  		nsecs = timekeeping_get_ns_raw(); -		*ts = raw_time; +		*ts = timekeeper.raw_time; -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqretry(&timekeeper.lock, seq));  	timespec_add_ns(ts, nsecs);  } @@ -510,24 +523,30 @@ int timekeeping_valid_for_hres(void)  	int ret;  	do { -		seq = read_seqbegin(&xtime_lock); +		seq = read_seqbegin(&timekeeper.lock);  		ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqretry(&timekeeper.lock, seq));  	return ret;  }  /**   * timekeeping_max_deferment - Returns max time the clocksource can be deferred - * - * Caller must 
observe xtime_lock via read_seqbegin/read_seqretry to - * ensure that the clocksource does not change!   */  u64 timekeeping_max_deferment(void)  { -	return timekeeper.clock->max_idle_ns; +	unsigned long seq; +	u64 ret; +	do { +		seq = read_seqbegin(&timekeeper.lock); + +		ret = timekeeper.clock->max_idle_ns; + +	} while (read_seqretry(&timekeeper.lock, seq)); + +	return ret;  }  /** @@ -572,28 +591,29 @@ void __init timekeeping_init(void)  	read_persistent_clock(&now);  	read_boot_clock(&boot); -	write_seqlock_irqsave(&xtime_lock, flags); +	seqlock_init(&timekeeper.lock);  	ntp_init(); +	write_seqlock_irqsave(&timekeeper.lock, flags);  	clock = clocksource_default_clock();  	if (clock->enable)  		clock->enable(clock);  	timekeeper_setup_internals(clock); -	xtime.tv_sec = now.tv_sec; -	xtime.tv_nsec = now.tv_nsec; -	raw_time.tv_sec = 0; -	raw_time.tv_nsec = 0; +	timekeeper.xtime.tv_sec = now.tv_sec; +	timekeeper.xtime.tv_nsec = now.tv_nsec; +	timekeeper.raw_time.tv_sec = 0; +	timekeeper.raw_time.tv_nsec = 0;  	if (boot.tv_sec == 0 && boot.tv_nsec == 0) { -		boot.tv_sec = xtime.tv_sec; -		boot.tv_nsec = xtime.tv_nsec; +		boot.tv_sec = timekeeper.xtime.tv_sec; +		boot.tv_nsec = timekeeper.xtime.tv_nsec;  	} -	set_normalized_timespec(&wall_to_monotonic, +	set_normalized_timespec(&timekeeper.wall_to_monotonic,  				-boot.tv_sec, -boot.tv_nsec); -	total_sleep_time.tv_sec = 0; -	total_sleep_time.tv_nsec = 0; -	write_sequnlock_irqrestore(&xtime_lock, flags); +	timekeeper.total_sleep_time.tv_sec = 0; +	timekeeper.total_sleep_time.tv_nsec = 0; +	write_sequnlock_irqrestore(&timekeeper.lock, flags);  }  /* time in seconds when suspend began */ @@ -614,9 +634,11 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)  		return;  	} -	xtime = timespec_add(xtime, *delta); -	wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); -	total_sleep_time = timespec_add(total_sleep_time, *delta); +	timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); +	
timekeeper.wall_to_monotonic = +			timespec_sub(timekeeper.wall_to_monotonic, *delta); +	timekeeper.total_sleep_time = timespec_add( +					timekeeper.total_sleep_time, *delta);  } @@ -640,17 +662,15 @@ void timekeeping_inject_sleeptime(struct timespec *delta)  	if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))  		return; -	write_seqlock_irqsave(&xtime_lock, flags); +	write_seqlock_irqsave(&timekeeper.lock, flags); +  	timekeeping_forward_now();  	__timekeeping_inject_sleeptime(delta); -	timekeeper.ntp_error = 0; -	ntp_clear(); -	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, -				timekeeper.mult); +	timekeeping_update(true); -	write_sequnlock_irqrestore(&xtime_lock, flags); +	write_sequnlock_irqrestore(&timekeeper.lock, flags);  	/* signal hrtimers about time change */  	clock_was_set(); @@ -673,7 +693,7 @@ static void timekeeping_resume(void)  	clocksource_resume(); -	write_seqlock_irqsave(&xtime_lock, flags); +	write_seqlock_irqsave(&timekeeper.lock, flags);  	if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {  		ts = timespec_sub(ts, timekeeping_suspend_time); @@ -683,7 +703,7 @@ static void timekeeping_resume(void)  	timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);  	timekeeper.ntp_error = 0;  	timekeeping_suspended = 0; -	write_sequnlock_irqrestore(&xtime_lock, flags); +	write_sequnlock_irqrestore(&timekeeper.lock, flags);  	touch_softlockup_watchdog(); @@ -701,7 +721,7 @@ static int timekeeping_suspend(void)  	read_persistent_clock(&timekeeping_suspend_time); -	write_seqlock_irqsave(&xtime_lock, flags); +	write_seqlock_irqsave(&timekeeper.lock, flags);  	timekeeping_forward_now();  	timekeeping_suspended = 1; @@ -711,7 +731,7 @@ static int timekeeping_suspend(void)  	 * try to compensate so the difference in system time  	 * and persistent_clock time stays close to constant.  	 
*/ -	delta = timespec_sub(xtime, timekeeping_suspend_time); +	delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time);  	delta_delta = timespec_sub(delta, old_delta);  	if (abs(delta_delta.tv_sec)  >= 2) {  		/* @@ -724,7 +744,7 @@ static int timekeeping_suspend(void)  		timekeeping_suspend_time =  			timespec_add(timekeeping_suspend_time, delta_delta);  	} -	write_sequnlock_irqrestore(&xtime_lock, flags); +	write_sequnlock_irqrestore(&timekeeper.lock, flags);  	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);  	clocksource_suspend(); @@ -775,7 +795,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,  	 * Now calculate the error in (1 << look_ahead) ticks, but first  	 * remove the single look ahead already included in the error.  	 */ -	tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); +	tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1);  	tick_error -= timekeeper.xtime_interval >> 1;  	error = ((error - tick_error) >> look_ahead) + tick_error; @@ -943,22 +963,22 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)  	timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;  	while (timekeeper.xtime_nsec >= nsecps) {  		timekeeper.xtime_nsec -= nsecps; -		xtime.tv_sec++; +		timekeeper.xtime.tv_sec++;  		second_overflow();  	}  	/* Accumulate raw time */  	raw_nsecs = timekeeper.raw_interval << shift; -	raw_nsecs += raw_time.tv_nsec; +	raw_nsecs += timekeeper.raw_time.tv_nsec;  	if (raw_nsecs >= NSEC_PER_SEC) {  		u64 raw_secs = raw_nsecs;  		raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); -		raw_time.tv_sec += raw_secs; +		timekeeper.raw_time.tv_sec += raw_secs;  	} -	raw_time.tv_nsec = raw_nsecs; +	timekeeper.raw_time.tv_nsec = raw_nsecs;  	/* Accumulate error between NTP and clock interval */ -	timekeeper.ntp_error += tick_length << shift; +	timekeeper.ntp_error += ntp_tick_length() << shift;  	timekeeper.ntp_error -=  	    (timekeeper.xtime_interval + 
timekeeper.xtime_remainder) <<  				(timekeeper.ntp_error_shift + shift); @@ -970,17 +990,19 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)  /**   * update_wall_time - Uses the current clocksource to increment the wall time   * - * Called from the timer interrupt, must hold a write on xtime_lock.   */  static void update_wall_time(void)  {  	struct clocksource *clock;  	cycle_t offset;  	int shift = 0, maxshift; +	unsigned long flags; + +	write_seqlock_irqsave(&timekeeper.lock, flags);  	/* Make sure we're fully resumed: */  	if (unlikely(timekeeping_suspended)) -		return; +		goto out;  	clock = timekeeper.clock; @@ -989,7 +1011,8 @@ static void update_wall_time(void)  #else  	offset = (clock->read(clock) - clock->cycle_last) & clock->mask;  #endif -	timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; +	timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec << +						timekeeper.shift;  	/*  	 * With NO_HZ we may have to accumulate many cycle_intervals @@ -1002,7 +1025,7 @@ static void update_wall_time(void)  	shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);  	shift = max(0, shift);  	/* Bound shift to one less then what overflows tick_length */ -	maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; +	maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;  	shift = min(shift, maxshift);  	while (offset >= timekeeper.cycle_interval) {  		offset = logarithmic_accumulation(offset, shift); @@ -1040,8 +1063,10 @@ static void update_wall_time(void)  	 * Store full nanoseconds into xtime after rounding it up and  	 * add the remainder to the error difference.  	 
*/ -	xtime.tv_nsec =	((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; -	timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; +	timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> +						timekeeper.shift) + 1; +	timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec << +						timekeeper.shift;  	timekeeper.ntp_error +=	timekeeper.xtime_nsec <<  				timekeeper.ntp_error_shift; @@ -1049,15 +1074,17 @@ static void update_wall_time(void)  	 * Finally, make sure that after the rounding  	 * xtime.tv_nsec isn't larger then NSEC_PER_SEC  	 */ -	if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { -		xtime.tv_nsec -= NSEC_PER_SEC; -		xtime.tv_sec++; +	if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { +		timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; +		timekeeper.xtime.tv_sec++;  		second_overflow();  	} -	/* check to see if there is a new clocksource to use */ -	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, -				timekeeper.mult); +	timekeeping_update(false); + +out: +	write_sequnlock_irqrestore(&timekeeper.lock, flags); +  }  /** @@ -1074,8 +1101,10 @@ static void update_wall_time(void)  void getboottime(struct timespec *ts)  {  	struct timespec boottime = { -		.tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, -		.tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec +		.tv_sec = timekeeper.wall_to_monotonic.tv_sec + +				timekeeper.total_sleep_time.tv_sec, +		.tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + +				timekeeper.total_sleep_time.tv_nsec  	};  	set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); @@ -1101,13 +1130,13 @@ void get_monotonic_boottime(struct timespec *ts)  	WARN_ON(timekeeping_suspended);  	do { -		seq = read_seqbegin(&xtime_lock); -		*ts = xtime; -		tomono = wall_to_monotonic; -		sleep = total_sleep_time; +		seq = read_seqbegin(&timekeeper.lock); +		*ts = timekeeper.xtime; +		tomono = timekeeper.wall_to_monotonic; +		sleep = timekeeper.total_sleep_time;  		nsecs = 
timekeeping_get_ns(); -	} while (read_seqretry(&xtime_lock, seq)); +	} while (read_seqretry(&timekeeper.lock, seq));  	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,  			ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); @@ -1137,19 +1166,19 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);   */  void monotonic_to_bootbased(struct timespec *ts)  { -	*ts = timespec_add(*ts, total_sleep_time); +	*ts = timespec_add(*ts, timekeeper.total_sleep_time);  }  EXPORT_SYMBOL_GPL(monotonic_to_bootbased);  unsigned long get_seconds(void)  { -	return xtime.tv_sec; +	return timekeeper.xtime.tv_sec;  }  EXPORT_SYMBOL(get_seconds);  struct timespec __current_kernel_time(void)  { -	return xtime; +	return timekeeper.xtime;  }  struct timespec current_kernel_time(void) @@ -1158,10 +1187,10 @@ struct timespec current_kernel_time(void)  	unsigned long seq;  	do { -		seq = read_seqbegin(&xtime_lock); +		seq = read_seqbegin(&timekeeper.lock); -		now = xtime; -	} while (read_seqretry(&xtime_lock, seq)); +		now = timekeeper.xtime; +	} while (read_seqretry(&timekeeper.lock, seq));  	return now;  } @@ -1173,11 +1202,11 @@ struct timespec get_monotonic_coarse(void)  	unsigned long seq;  	do { -		seq = read_seqbegin(&xtime_lock); +		seq = read_seqbegin(&timekeeper.lock); -		now = xtime; -		mono = wall_to_monotonic; -	} while (read_seqretry(&xtime_lock, seq)); +		now = timekeeper.xtime; +		mono = timekeeper.wall_to_monotonic; +	} while (read_seqretry(&timekeeper.lock, seq));  	set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,  				now.tv_nsec + mono.tv_nsec); @@ -1209,11 +1238,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,  	unsigned long seq;  	do { -		seq = read_seqbegin(&xtime_lock); -		*xtim = xtime; -		*wtom = wall_to_monotonic; -		*sleep = total_sleep_time; -	} while (read_seqretry(&xtime_lock, seq)); +		seq = read_seqbegin(&timekeeper.lock); +		*xtim = timekeeper.xtime; +		*wtom = timekeeper.wall_to_monotonic; +		*sleep = 
timekeeper.total_sleep_time; +	} while (read_seqretry(&timekeeper.lock, seq));  }  /** @@ -1225,11 +1254,14 @@ ktime_t ktime_get_monotonic_offset(void)  	struct timespec wtom;  	do { -		seq = read_seqbegin(&xtime_lock); -		wtom = wall_to_monotonic; -	} while (read_seqretry(&xtime_lock, seq)); +		seq = read_seqbegin(&timekeeper.lock); +		wtom = timekeeper.wall_to_monotonic; +	} while (read_seqretry(&timekeeper.lock, seq)); +  	return timespec_to_ktime(wtom);  } +EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); +  /**   * xtime_update() - advances the timekeeping infrastructure diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 683d559a0eef..867bd1dd2dd0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,6 +62,8 @@  #define FTRACE_HASH_DEFAULT_BITS 10  #define FTRACE_HASH_MAX_BITS 12 +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +  /* ftrace_enabled is a method to turn ftrace on or off */  int ftrace_enabled __read_mostly;  static int last_ftrace_enabled; @@ -89,12 +91,14 @@ static struct ftrace_ops ftrace_list_end __read_mostly = {  };  static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;  static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;  ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;  static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;  ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;  ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;  static struct ftrace_ops global_ops; +static struct ftrace_ops control_ops;  static void  ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); @@ -168,6 +172,32 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)  }  #endif +static void control_ops_disable_all(struct ftrace_ops *ops) +{ +	int cpu; + +	
for_each_possible_cpu(cpu) +		*per_cpu_ptr(ops->disabled, cpu) = 1; +} + +static int control_ops_alloc(struct ftrace_ops *ops) +{ +	int __percpu *disabled; + +	disabled = alloc_percpu(int); +	if (!disabled) +		return -ENOMEM; + +	ops->disabled = disabled; +	control_ops_disable_all(ops); +	return 0; +} + +static void control_ops_free(struct ftrace_ops *ops) +{ +	free_percpu(ops->disabled); +} +  static void update_global_ops(void)  {  	ftrace_func_t func; @@ -259,6 +289,26 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)  	return 0;  } +static void add_ftrace_list_ops(struct ftrace_ops **list, +				struct ftrace_ops *main_ops, +				struct ftrace_ops *ops) +{ +	int first = *list == &ftrace_list_end; +	add_ftrace_ops(list, ops); +	if (first) +		add_ftrace_ops(&ftrace_ops_list, main_ops); +} + +static int remove_ftrace_list_ops(struct ftrace_ops **list, +				  struct ftrace_ops *main_ops, +				  struct ftrace_ops *ops) +{ +	int ret = remove_ftrace_ops(list, ops); +	if (!ret && *list == &ftrace_list_end) +		ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); +	return ret; +} +  static int __register_ftrace_function(struct ftrace_ops *ops)  {  	if (ftrace_disabled) @@ -270,15 +320,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops)  	if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))  		return -EBUSY; +	/* We don't support both control and global flags set. 
*/ +	if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) +		return -EINVAL; +  	if (!core_kernel_data((unsigned long)ops))  		ops->flags |= FTRACE_OPS_FL_DYNAMIC;  	if (ops->flags & FTRACE_OPS_FL_GLOBAL) { -		int first = ftrace_global_list == &ftrace_list_end; -		add_ftrace_ops(&ftrace_global_list, ops); +		add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);  		ops->flags |= FTRACE_OPS_FL_ENABLED; -		if (first) -			add_ftrace_ops(&ftrace_ops_list, &global_ops); +	} else if (ops->flags & FTRACE_OPS_FL_CONTROL) { +		if (control_ops_alloc(ops)) +			return -ENOMEM; +		add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);  	} else  		add_ftrace_ops(&ftrace_ops_list, ops); @@ -302,11 +357,23 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)  		return -EINVAL;  	if (ops->flags & FTRACE_OPS_FL_GLOBAL) { -		ret = remove_ftrace_ops(&ftrace_global_list, ops); -		if (!ret && ftrace_global_list == &ftrace_list_end) -			ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); +		ret = remove_ftrace_list_ops(&ftrace_global_list, +					     &global_ops, ops);  		if (!ret)  			ops->flags &= ~FTRACE_OPS_FL_ENABLED; +	} else if (ops->flags & FTRACE_OPS_FL_CONTROL) { +		ret = remove_ftrace_list_ops(&ftrace_control_list, +					     &control_ops, ops); +		if (!ret) { +			/* +			 * The ftrace_ops is now removed from the list, +			 * so there'll be no new users. We must ensure +			 * all current users are done before we free +			 * the control data. 
+			 */ +			synchronize_sched(); +			control_ops_free(ops); +		}  	} else  		ret = remove_ftrace_ops(&ftrace_ops_list, ops); @@ -1119,6 +1186,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)  	call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);  } +void ftrace_free_filter(struct ftrace_ops *ops) +{ +	free_ftrace_hash(ops->filter_hash); +	free_ftrace_hash(ops->notrace_hash); +} +  static struct ftrace_hash *alloc_ftrace_hash(int size_bits)  {  	struct ftrace_hash *hash; @@ -1129,7 +1202,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)  		return NULL;  	size = 1 << size_bits; -	hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); +	hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL);  	if (!hash->buckets) {  		kfree(hash); @@ -3146,8 +3219,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,  	mutex_lock(&ftrace_regex_lock);  	if (reset)  		ftrace_filter_reset(hash); -	if (buf) -		ftrace_match_records(hash, buf, len); +	if (buf && !ftrace_match_records(hash, buf, len)) { +		ret = -EINVAL; +		goto out_regex_unlock; +	}  	mutex_lock(&ftrace_lock);  	ret = ftrace_hash_move(ops, enable, orig_hash, hash); @@ -3157,6 +3232,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,  	mutex_unlock(&ftrace_lock); + out_regex_unlock:  	mutex_unlock(&ftrace_regex_lock);  	free_ftrace_hash(hash); @@ -3173,10 +3249,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,   * Filters denote which functions should be enabled when tracing is enabled.   * If @buf is NULL and reset is set, all functions will be enabled for tracing.   
*/ -void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,  		       int len, int reset)  { -	ftrace_set_regex(ops, buf, len, reset, 1); +	return ftrace_set_regex(ops, buf, len, reset, 1);  }  EXPORT_SYMBOL_GPL(ftrace_set_filter); @@ -3191,10 +3267,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);   * is enabled. If @buf is NULL and reset is set, all functions will be enabled   * for tracing.   */ -void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,  			int len, int reset)  { -	ftrace_set_regex(ops, buf, len, reset, 0); +	return ftrace_set_regex(ops, buf, len, reset, 0);  }  EXPORT_SYMBOL_GPL(ftrace_set_notrace);  /** @@ -3871,6 +3947,36 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)  #endif /* CONFIG_DYNAMIC_FTRACE */  static void +ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) +{ +	struct ftrace_ops *op; + +	if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) +		return; + +	/* +	 * Some of the ops may be dynamically allocated, +	 * they must be freed after a synchronize_sched(). 
+	 */ +	preempt_disable_notrace(); +	trace_recursion_set(TRACE_CONTROL_BIT); +	op = rcu_dereference_raw(ftrace_control_list); +	while (op != &ftrace_list_end) { +		if (!ftrace_function_local_disabled(op) && +		    ftrace_ops_test(op, ip)) +			op->func(ip, parent_ip); + +		op = rcu_dereference_raw(op->next); +	}; +	trace_recursion_clear(TRACE_CONTROL_BIT); +	preempt_enable_notrace(); +} + +static struct ftrace_ops control_ops = { +	.func = ftrace_ops_control_func, +}; + +static void  ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)  {  	struct ftrace_ops *op; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3f1bc5d2a00..10d5503f0d04 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2764,12 +2764,12 @@ static const char readme_msg[] =  	"tracing mini-HOWTO:\n\n"  	"# mount -t debugfs nodev /sys/kernel/debug\n\n"  	"# cat /sys/kernel/debug/tracing/available_tracers\n" -	"wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" +	"wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n"  	"# cat /sys/kernel/debug/tracing/current_tracer\n"  	"nop\n" -	"# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" +	"# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n"  	"# cat /sys/kernel/debug/tracing/current_tracer\n" -	"sched_switch\n" +	"wakeup\n"  	"# cat /sys/kernel/debug/tracing/trace_options\n"  	"noprint-parent nosym-offset nosym-addr noverbose\n"  	"# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b93ecbadad6d..54faec790bc1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -56,17 +56,23 @@ enum trace_type {  #define F_STRUCT(args...)		
args  #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)	\ -	struct struct_name {					\ -		struct trace_entry	ent;			\ -		tstruct						\ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\ +	struct struct_name {						\ +		struct trace_entry	ent;				\ +		tstruct							\  	}  #undef TP_ARGS  #define TP_ARGS(args...)	args  #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) + +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print,	\ +			 filter, regfn) \ +	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ +		     filter)  #include "trace_entries.h" @@ -288,6 +294,8 @@ struct tracer {  /* for function tracing recursion */  #define TRACE_INTERNAL_BIT		(1<<11)  #define TRACE_GLOBAL_BIT		(1<<12) +#define TRACE_CONTROL_BIT		(1<<13) +  /*   * Abuse of the trace_recursion.   * As we need a way to maintain state if we are tracing the function @@ -589,6 +597,8 @@ static inline int ftrace_trace_task(struct task_struct *task)  static inline int ftrace_is_dead(void) { return 0; }  #endif +int ftrace_event_is_function(struct ftrace_event_call *call); +  /*   * struct trace_parser - servers for reading the user input separated by spaces   * @cont: set if the input is not complete - no final space char was found @@ -766,9 +776,7 @@ struct filter_pred {  	u64 			val;  	struct regex		regex;  	unsigned short		*ops; -#ifdef CONFIG_FTRACE_STARTUP_TEST  	struct ftrace_event_field *field; -#endif  	int 			offset;  	int 			not;  	int 			op; @@ -818,12 +826,22 @@ extern const char *__start___trace_bprintk_fmt[];  extern const char *__stop___trace_bprintk_fmt[];  #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, id, tstruct, print)		\ +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter)	\  	extern struct ftrace_event_call					\  	
__attribute__((__aligned__(4))) event_##call;  #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print)		\ -	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter)	\ +	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ +		     filter)  #include "trace_entries.h" +#ifdef CONFIG_PERF_EVENTS +#ifdef CONFIG_FUNCTION_TRACER +int perf_ftrace_event_register(struct ftrace_event_call *call, +			       enum trace_reg type, void *data); +#else +#define perf_ftrace_event_register NULL +#endif /* CONFIG_FUNCTION_TRACER */ +#endif /* CONFIG_PERF_EVENTS */ +  #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 93365907f219..d91eb0541b3a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -55,7 +55,7 @@  /*   * Function trace entry - function address and parent function address:   */ -FTRACE_ENTRY(function, ftrace_entry, +FTRACE_ENTRY_REG(function, ftrace_entry,  	TRACE_FN, @@ -64,7 +64,11 @@ FTRACE_ENTRY(function, ftrace_entry,  		__field(	unsigned long,	parent_ip	)  	), -	F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) +	F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + +	FILTER_TRACE_FN, + +	perf_ftrace_event_register  );  /* Function call entry */ @@ -78,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,  		__field_desc(	int,		graph_ent,	depth		)  	), -	F_printk("--> %lx (%d)", __entry->func, __entry->depth) +	F_printk("--> %lx (%d)", __entry->func, __entry->depth), + +	FILTER_OTHER  );  /* Function return entry */ @@ -98,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,  	F_printk("<-- %lx (%d) (start: %llx  end: %llx) over: %d",  		 __entry->func, __entry->depth,  		 __entry->calltime, __entry->rettime, -		 __entry->depth) +		 __entry->depth), + +	FILTER_OTHER  );  /* @@ -127,8 +135,9 @@ 
FTRACE_ENTRY(context_switch, ctx_switch_entry,  	F_printk("%u:%u:%u  ==> %u:%u:%u [%03u]",  		 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,  		 __entry->next_pid, __entry->next_prio, __entry->next_state, -		 __entry->next_cpu -		) +		 __entry->next_cpu), + +	FILTER_OTHER  );  /* @@ -146,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,  	F_printk("%u:%u:%u  ==+ %u:%u:%u [%03u]",  		 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,  		 __entry->next_pid, __entry->next_prio, __entry->next_state, -		 __entry->next_cpu -		) +		 __entry->next_cpu), + +	FILTER_OTHER  );  /* @@ -169,7 +179,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry,  		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",  		 __entry->caller[0], __entry->caller[1], __entry->caller[2],  		 __entry->caller[3], __entry->caller[4], __entry->caller[5], -		 __entry->caller[6], __entry->caller[7]) +		 __entry->caller[6], __entry->caller[7]), + +	FILTER_OTHER  );  FTRACE_ENTRY(user_stack, userstack_entry, @@ -185,7 +197,9 @@ FTRACE_ENTRY(user_stack, userstack_entry,  		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",  		 __entry->caller[0], __entry->caller[1], __entry->caller[2],  		 __entry->caller[3], __entry->caller[4], __entry->caller[5], -		 __entry->caller[6], __entry->caller[7]) +		 __entry->caller[6], __entry->caller[7]), + +	FILTER_OTHER  );  /* @@ -202,7 +216,9 @@ FTRACE_ENTRY(bprint, bprint_entry,  	),  	F_printk("%08lx fmt:%p", -		 __entry->ip, __entry->fmt) +		 __entry->ip, __entry->fmt), + +	FILTER_OTHER  );  FTRACE_ENTRY(print, print_entry, @@ -215,7 +231,9 @@ FTRACE_ENTRY(print, print_entry,  	),  	F_printk("%08lx %s", -		 __entry->ip, __entry->buf) +		 __entry->ip, __entry->buf), + +	FILTER_OTHER  );  FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, @@ -234,7 +252,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,  	F_printk("%lx %lx %lx %d %x %x",  		 (unsigned long)__entry->phys, __entry->value, __entry->pc, -		 __entry->map_id, __entry->opcode, 
__entry->width) +		 __entry->map_id, __entry->opcode, __entry->width), + +	FILTER_OTHER  );  FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, @@ -252,7 +272,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,  	F_printk("%lx %lx %lx %d %x",  		 (unsigned long)__entry->phys, __entry->virt, __entry->len, -		 __entry->map_id, __entry->opcode) +		 __entry->map_id, __entry->opcode), + +	FILTER_OTHER  ); @@ -272,6 +294,8 @@ FTRACE_ENTRY(branch, trace_branch,  	F_printk("%u:%s:%s (%u)",  		 __entry->line, -		 __entry->func, __entry->file, __entry->correct) +		 __entry->func, __entry->file, __entry->correct), + +	FILTER_OTHER  ); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 19a359d5e6d5..fee3752ae8f6 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,6 +24,11 @@ static int	total_ref_count;  static int perf_trace_event_perm(struct ftrace_event_call *tp_event,  				 struct perf_event *p_event)  { +	/* The ftrace function trace is allowed only for root. 
*/ +	if (ftrace_event_is_function(tp_event) && +	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) +		return -EPERM; +  	/* No tracing, just counting, so no obvious leak */  	if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))  		return 0; @@ -44,23 +49,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,  	return 0;  } -static int perf_trace_event_init(struct ftrace_event_call *tp_event, -				 struct perf_event *p_event) +static int perf_trace_event_reg(struct ftrace_event_call *tp_event, +				struct perf_event *p_event)  {  	struct hlist_head __percpu *list; -	int ret; +	int ret = -ENOMEM;  	int cpu; -	ret = perf_trace_event_perm(tp_event, p_event); -	if (ret) -		return ret; -  	p_event->tp_event = tp_event;  	if (tp_event->perf_refcount++ > 0)  		return 0; -	ret = -ENOMEM; -  	list = alloc_percpu(struct hlist_head);  	if (!list)  		goto fail; @@ -83,7 +82,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,  		}  	} -	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); +	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);  	if (ret)  		goto fail; @@ -108,6 +107,69 @@ fail:  	return ret;  } +static void perf_trace_event_unreg(struct perf_event *p_event) +{ +	struct ftrace_event_call *tp_event = p_event->tp_event; +	int i; + +	if (--tp_event->perf_refcount > 0) +		goto out; + +	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); + +	/* +	 * Ensure our callback won't be called anymore. The buffers +	 * will be freed after that. 
+	 */ +	tracepoint_synchronize_unregister(); + +	free_percpu(tp_event->perf_events); +	tp_event->perf_events = NULL; + +	if (!--total_ref_count) { +		for (i = 0; i < PERF_NR_CONTEXTS; i++) { +			free_percpu(perf_trace_buf[i]); +			perf_trace_buf[i] = NULL; +		} +	} +out: +	module_put(tp_event->mod); +} + +static int perf_trace_event_open(struct perf_event *p_event) +{ +	struct ftrace_event_call *tp_event = p_event->tp_event; +	return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); +} + +static void perf_trace_event_close(struct perf_event *p_event) +{ +	struct ftrace_event_call *tp_event = p_event->tp_event; +	tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); +} + +static int perf_trace_event_init(struct ftrace_event_call *tp_event, +				 struct perf_event *p_event) +{ +	int ret; + +	ret = perf_trace_event_perm(tp_event, p_event); +	if (ret) +		return ret; + +	ret = perf_trace_event_reg(tp_event, p_event); +	if (ret) +		return ret; + +	ret = perf_trace_event_open(p_event); +	if (ret) { +		perf_trace_event_unreg(p_event); +		return ret; +	} + +	return 0; +} +  int perf_trace_init(struct perf_event *p_event)  {  	struct ftrace_event_call *tp_event; @@ -130,6 +192,14 @@ int perf_trace_init(struct perf_event *p_event)  	return ret;  } +void perf_trace_destroy(struct perf_event *p_event) +{ +	mutex_lock(&event_mutex); +	perf_trace_event_close(p_event); +	perf_trace_event_unreg(p_event); +	mutex_unlock(&event_mutex); +} +  int perf_trace_add(struct perf_event *p_event, int flags)  {  	struct ftrace_event_call *tp_event = p_event->tp_event; @@ -146,43 +216,14 @@ int perf_trace_add(struct perf_event *p_event, int flags)  	list = this_cpu_ptr(pcpu_list);  	hlist_add_head_rcu(&p_event->hlist_entry, list); -	return 0; +	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);  }  void perf_trace_del(struct perf_event *p_event, int flags)  { -	hlist_del_rcu(&p_event->hlist_entry); -} - -void perf_trace_destroy(struct perf_event *p_event) 
-{  	struct ftrace_event_call *tp_event = p_event->tp_event; -	int i; - -	mutex_lock(&event_mutex); -	if (--tp_event->perf_refcount > 0) -		goto out; - -	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - -	/* -	 * Ensure our callback won't be called anymore. The buffers -	 * will be freed after that. -	 */ -	tracepoint_synchronize_unregister(); - -	free_percpu(tp_event->perf_events); -	tp_event->perf_events = NULL; - -	if (!--total_ref_count) { -		for (i = 0; i < PERF_NR_CONTEXTS; i++) { -			free_percpu(perf_trace_buf[i]); -			perf_trace_buf[i] = NULL; -		} -	} -out: -	module_put(tp_event->mod); -	mutex_unlock(&event_mutex); +	hlist_del_rcu(&p_event->hlist_entry); +	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);  }  __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, @@ -214,3 +255,86 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,  	return raw_data;  }  EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); + +#ifdef CONFIG_FUNCTION_TRACER +static void +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) +{ +	struct ftrace_entry *entry; +	struct hlist_head *head; +	struct pt_regs regs; +	int rctx; + +#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ +		    sizeof(u64)) - sizeof(u32)) + +	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + +	perf_fetch_caller_regs(&regs); + +	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); +	if (!entry) +		return; + +	entry->ip = ip; +	entry->parent_ip = parent_ip; + +	head = this_cpu_ptr(event_function.perf_events); +	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, +			      1, &regs, head); + +#undef ENTRY_SIZE +} + +static int perf_ftrace_function_register(struct perf_event *event) +{ +	struct ftrace_ops *ops = &event->ftrace_ops; + +	ops->flags |= FTRACE_OPS_FL_CONTROL; +	ops->func = perf_ftrace_function_call; +	return register_ftrace_function(ops); +} + +static int perf_ftrace_function_unregister(struct perf_event *event) 
+{ +	struct ftrace_ops *ops = &event->ftrace_ops; +	int ret = unregister_ftrace_function(ops); +	ftrace_free_filter(ops); +	return ret; +} + +static void perf_ftrace_function_enable(struct perf_event *event) +{ +	ftrace_function_local_enable(&event->ftrace_ops); +} + +static void perf_ftrace_function_disable(struct perf_event *event) +{ +	ftrace_function_local_disable(&event->ftrace_ops); +} + +int perf_ftrace_event_register(struct ftrace_event_call *call, +			       enum trace_reg type, void *data) +{ +	switch (type) { +	case TRACE_REG_REGISTER: +	case TRACE_REG_UNREGISTER: +		break; +	case TRACE_REG_PERF_REGISTER: +	case TRACE_REG_PERF_UNREGISTER: +		return 0; +	case TRACE_REG_PERF_OPEN: +		return perf_ftrace_function_register(data); +	case TRACE_REG_PERF_CLOSE: +		return perf_ftrace_function_unregister(data); +	case TRACE_REG_PERF_ADD: +		perf_ftrace_function_enable(data); +		return 0; +	case TRACE_REG_PERF_DEL: +		perf_ftrace_function_disable(data); +		return 0; +	} + +	return -EINVAL; +} +#endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c212a7f934ec..079a93ae8a9d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -147,7 +147,8 @@ int trace_event_raw_init(struct ftrace_event_call *call)  }  EXPORT_SYMBOL_GPL(trace_event_raw_init); -int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +int ftrace_event_reg(struct ftrace_event_call *call, +		     enum trace_reg type, void *data)  {  	switch (type) {  	case TRACE_REG_REGISTER: @@ -170,6 +171,11 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)  					    call->class->perf_probe,  					    call);  		return 0; +	case TRACE_REG_PERF_OPEN: +	case TRACE_REG_PERF_CLOSE: +	case TRACE_REG_PERF_ADD: +	case TRACE_REG_PERF_DEL: +		return 0;  #endif  	}  	return 0; @@ -209,7 +215,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,  				tracing_stop_cmdline_record();  
				call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;  			} -			call->class->reg(call, TRACE_REG_UNREGISTER); +			call->class->reg(call, TRACE_REG_UNREGISTER, NULL);  		}  		break;  	case 1: @@ -218,7 +224,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,  				tracing_start_cmdline_record();  				call->flags |= TRACE_EVENT_FL_RECORDED_CMD;  			} -			ret = call->class->reg(call, TRACE_REG_REGISTER); +			ret = call->class->reg(call, TRACE_REG_REGISTER, NULL);  			if (ret) {  				tracing_stop_cmdline_record();  				pr_info("event trace: Could not enable event " diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 24aee7127451..431dba8b7542 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -81,6 +81,7 @@ enum {  	FILT_ERR_TOO_MANY_PREDS,  	FILT_ERR_MISSING_FIELD,  	FILT_ERR_INVALID_FILTER, +	FILT_ERR_IP_FIELD_ONLY,  };  static char *err_text[] = { @@ -96,6 +97,7 @@ static char *err_text[] = {  	"Too many terms in predicate expression",  	"Missing field name and/or value",  	"Meaningless filter expression", +	"Only 'ip' field is supported for function trace",  };  struct opstack_op { @@ -685,7 +687,7 @@ find_event_field(struct ftrace_event_call *call, char *name)  static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)  { -	stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); +	stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);  	if (!stack->preds)  		return -ENOMEM;  	stack->index = n_preds; @@ -826,8 +828,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)  	if (filter->preds)  		__free_preds(filter); -	filter->preds = -		kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); +	filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL);  	if (!filter->preds)  		return -ENOMEM; @@ -900,6 +901,11 @@ int filter_assign_type(const char *type)  	return FILTER_OTHER;  } +static bool 
is_function_field(struct ftrace_event_field *field) +{ +	return field->filter_type == FILTER_TRACE_FN; +} +  static bool is_string_field(struct ftrace_event_field *field)  {  	return field->filter_type == FILTER_DYN_STRING || @@ -987,6 +993,11 @@ static int init_pred(struct filter_parse_state *ps,  			fn = filter_pred_strloc;  		else  			fn = filter_pred_pchar; +	} else if (is_function_field(field)) { +		if (strcmp(field->name, "ip")) { +			parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); +			return -EINVAL; +		}  	} else {  		if (field->is_signed)  			ret = strict_strtoll(pred->regex.pattern, 0, &val); @@ -1334,10 +1345,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,  	strcpy(pred.regex.pattern, operand2);  	pred.regex.len = strlen(pred.regex.pattern); - -#ifdef CONFIG_FTRACE_STARTUP_TEST  	pred.field = field; -#endif  	return init_pred(ps, field, &pred) ? NULL : &pred;  } @@ -1486,7 +1494,7 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)  	children = count_leafs(preds, &preds[root->left]);  	children += count_leafs(preds, &preds[root->right]); -	root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); +	root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL);  	if (!root->ops)  		return -ENOMEM; @@ -1950,6 +1958,148 @@ void ftrace_profile_free_filter(struct perf_event *event)  	__free_filter(filter);  } +struct function_filter_data { +	struct ftrace_ops *ops; +	int first_filter; +	int first_notrace; +}; + +#ifdef CONFIG_FUNCTION_TRACER +static char ** +ftrace_function_filter_re(char *buf, int len, int *count) +{ +	char *str, *sep, **re; + +	str = kstrndup(buf, len, GFP_KERNEL); +	if (!str) +		return NULL; + +	/* +	 * The argv_split function takes white space +	 * as a separator, so convert ',' into spaces. 
+	 */ +	while ((sep = strchr(str, ','))) +		*sep = ' '; + +	re = argv_split(GFP_KERNEL, str, count); +	kfree(str); +	return re; +} + +static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, +				      int reset, char *re, int len) +{ +	int ret; + +	if (filter) +		ret = ftrace_set_filter(ops, re, len, reset); +	else +		ret = ftrace_set_notrace(ops, re, len, reset); + +	return ret; +} + +static int __ftrace_function_set_filter(int filter, char *buf, int len, +					struct function_filter_data *data) +{ +	int i, re_cnt, ret; +	int *reset; +	char **re; + +	reset = filter ? &data->first_filter : &data->first_notrace; + +	/* +	 * The 'ip' field could have multiple filters set, separated +	 * either by space or comma. We first cut the filter and apply +	 * all pieces separately. +	 */ +	re = ftrace_function_filter_re(buf, len, &re_cnt); +	if (!re) +		return -EINVAL; + +	for (i = 0; i < re_cnt; i++) { +		ret = ftrace_function_set_regexp(data->ops, filter, *reset, +						 re[i], strlen(re[i])); +		if (ret) +			break; + +		if (*reset) +			*reset = 0; +	} + +	argv_free(re); +	return ret; +} + +static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) +{ +	struct ftrace_event_field *field = pred->field; + +	if (leaf) { +		/* +		 * Check the leaf predicate for function trace, verify: +		 *  - only '==' and '!=' is used +		 *  - the 'ip' field is used +		 */ +		if ((pred->op != OP_EQ) && (pred->op != OP_NE)) +			return -EINVAL; + +		if (strcmp(field->name, "ip")) +			return -EINVAL; +	} else { +		/* +		 * Check the non leaf predicate for function trace, verify: +		 *  - only '||' is used +		*/ +		if (pred->op != OP_OR) +			return -EINVAL; +	} + +	return 0; +} + +static int ftrace_function_set_filter_cb(enum move_type move, +					 struct filter_pred *pred, +					 int *err, void *data) +{ +	/* Checking the node is valid for function trace. 
*/ +	if ((move != MOVE_DOWN) || +	    (pred->left != FILTER_PRED_INVALID)) { +		*err = ftrace_function_check_pred(pred, 0); +	} else { +		*err = ftrace_function_check_pred(pred, 1); +		if (*err) +			return WALK_PRED_ABORT; + +		*err = __ftrace_function_set_filter(pred->op == OP_EQ, +						    pred->regex.pattern, +						    pred->regex.len, +						    data); +	} + +	return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT; +} + +static int ftrace_function_set_filter(struct perf_event *event, +				      struct event_filter *filter) +{ +	struct function_filter_data data = { +		.first_filter  = 1, +		.first_notrace = 1, +		.ops           = &event->ftrace_ops, +	}; + +	return walk_pred_tree(filter->preds, filter->root, +			      ftrace_function_set_filter_cb, &data); +} +#else +static int ftrace_function_set_filter(struct perf_event *event, +				      struct event_filter *filter) +{ +	return -ENODEV; +} +#endif /* CONFIG_FUNCTION_TRACER */ +  int ftrace_profile_set_filter(struct perf_event *event, int event_id,  			      char *filter_str)  { @@ -1970,9 +2120,16 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,  		goto out_unlock;  	err = create_filter(call, filter_str, false, &filter); -	if (!err) -		event->filter = filter; +	if (err) +		goto free_filter; + +	if (ftrace_event_is_function(call)) +		err = ftrace_function_set_filter(event, filter);  	else +		event->filter = filter; + +free_filter: +	if (err || ftrace_event_is_function(call))  		__free_filter(filter);  out_unlock: diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index bbeec31e0ae3..7b46c9bd22ae 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -18,6 +18,16 @@  #undef TRACE_SYSTEM  #define TRACE_SYSTEM	ftrace +/* + * The FTRACE_ENTRY_REG macro allows ftrace entry to define register + * function and thus become accessible via perf. 
+ */ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ +			 filter, regfn) \ +	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ +		     filter) +  /* not needed for this file */  #undef __field_struct  #define __field_struct(type, item) @@ -44,21 +54,22 @@  #define F_printk(fmt, args...) fmt, args  #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)	\ -struct ____ftrace_##name {					\ -	tstruct							\ -};								\ -static void __always_unused ____ftrace_check_##name(void)	\ -{								\ -	struct ____ftrace_##name *__entry = NULL;		\ -								\ -	/* force compile-time check on F_printk() */		\ -	printk(print);						\ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\ +struct ____ftrace_##name {						\ +	tstruct								\ +};									\ +static void __always_unused ____ftrace_check_##name(void)		\ +{									\ +	struct ____ftrace_##name *__entry = NULL;			\ +									\ +	/* force compile-time check on F_printk() */			\ +	printk(print);							\  }  #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print)	\ -	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter)	\ +	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ +		     filter)  #include "trace_entries.h" @@ -67,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void)	\  	ret = trace_define_field(event_call, #type, #item,		\  				 offsetof(typeof(field), item),		\  				 sizeof(field.item),			\ -				 is_signed_type(type), FILTER_OTHER);	\ +				 is_signed_type(type), filter_type);	\  	if (ret)							\  		return ret; @@ -77,7 +88,7 @@ static void __always_unused ____ftrace_check_##name(void)	\  				 offsetof(typeof(field),		\  					  container.item),		\  				 sizeof(field.container.item),		\ -				 is_signed_type(type), FILTER_OTHER);	\ +				 is_signed_type(type), 
filter_type);	\  	if (ret)							\  		return ret; @@ -91,7 +102,7 @@ static void __always_unused ____ftrace_check_##name(void)	\  		ret = trace_define_field(event_call, event_storage, #item, \  				 offsetof(typeof(field), item),		\  				 sizeof(field.item),			\ -				 is_signed_type(type), FILTER_OTHER);	\ +				 is_signed_type(type), filter_type);	\  		mutex_unlock(&event_storage_mutex);			\  		if (ret)						\  			return ret;					\ @@ -104,7 +115,7 @@ static void __always_unused ____ftrace_check_##name(void)	\  				 offsetof(typeof(field),		\  					  container.item),		\  				 sizeof(field.container.item),		\ -				 is_signed_type(type), FILTER_OTHER);	\ +				 is_signed_type(type), filter_type);	\  	if (ret)							\  		return ret; @@ -112,17 +123,18 @@ static void __always_unused ____ftrace_check_##name(void)	\  #define __dynamic_array(type, item)					\  	ret = trace_define_field(event_call, #type, #item,		\  				 offsetof(typeof(field), item),		\ -				 0, is_signed_type(type), FILTER_OTHER);\ +				 0, is_signed_type(type), filter_type);\  	if (ret)							\  		return ret;  #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)		\ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\  int									\  ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\  {									\  	struct struct_name field;					\  	int ret;							\ +	int filter_type = filter;					\  									\  	tstruct;							\  									\ @@ -152,13 +164,15 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\  #undef F_printk  #define F_printk(fmt, args...) 
#fmt ", "  __stringify(args) -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print)		\ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ +			 regfn)						\  									\  struct ftrace_event_class event_class_ftrace_##call = {			\  	.system			= __stringify(TRACE_SYSTEM),		\  	.define_fields		= ftrace_define_fields_##call,		\  	.fields			= LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ +	.reg			= regfn,				\  };									\  									\  struct ftrace_event_call __used event_##call = {			\ @@ -170,4 +184,14 @@ struct ftrace_event_call __used event_##call = {			\  struct ftrace_event_call __used						\  __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter)	\ +	FTRACE_ENTRY_REG(call, struct_name, etype,			\ +			 PARAMS(tstruct), PARAMS(print), filter, NULL) + +int ftrace_event_is_function(struct ftrace_event_call *call) +{ +	return call == &event_function; +} +  #include "trace_entries.h" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 00d527c945a4..580a05ec926b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1892,7 +1892,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,  #endif	/* CONFIG_PERF_EVENTS */  static __kprobes -int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +int kprobe_register(struct ftrace_event_call *event, +		    enum trace_reg type, void *data)  {  	struct trace_probe *tp = (struct trace_probe *)event->data; @@ -1909,6 +1910,11 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)  	case TRACE_REG_PERF_UNREGISTER:  		disable_trace_probe(tp, TP_FLAG_PROFILE);  		return 0; +	case TRACE_REG_PERF_OPEN: +	case TRACE_REG_PERF_CLOSE: +	case TRACE_REG_PERF_ADD: +	case TRACE_REG_PERF_DEL: +		return 0;  #endif  	}  	return 0; diff 
--git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0d6ff3555942..859fae6b1825 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)  	return ret;  } -int trace_seq_path(struct trace_seq *s, struct path *path) +int trace_seq_path(struct trace_seq *s, const struct path *path)  {  	unsigned char *p; @@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,  	unsigned long mask;  	const char *str;  	const char *ret = p->buffer + p->len; -	int i; +	int i, first = 1;  	for (i = 0;  flag_array[i].name && flags; i++) { @@ -310,14 +310,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,  		str = flag_array[i].name;  		flags &= ~mask; -		if (p->len && delim) +		if (!first && delim)  			trace_seq_puts(p, delim); +		else +			first = 0;  		trace_seq_puts(p, str);  	}  	/* check for left over flags */  	if (flags) { -		if (p->len && delim) +		if (!first && delim)  			trace_seq_puts(p, delim);  		trace_seq_printf(p, "0x%lx", flags);  	} @@ -344,7 +346,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,  		break;  	} -	if (!p->len) +	if (ret == (const char *)(p->buffer + p->len))  		trace_seq_printf(p, "0x%lx", val);  	trace_seq_putc(p, 0); @@ -370,7 +372,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,  		break;  	} -	if (!p->len) +	if (ret == (const char *)(p->buffer + p->len))  		trace_seq_printf(p, "0x%llx", val);  	trace_seq_putc(p, 0); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index cb654542c1a1..96fc73369099 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -17,9 +17,9 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);  static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);  static int syscall_enter_register(struct ftrace_event_call *event, -				 enum trace_reg type); +				 enum trace_reg 
type, void *data);  static int syscall_exit_register(struct ftrace_event_call *event, -				 enum trace_reg type); +				 enum trace_reg type, void *data);  static int syscall_enter_define_fields(struct ftrace_event_call *call);  static int syscall_exit_define_fields(struct ftrace_event_call *call); @@ -468,8 +468,8 @@ int __init init_ftrace_syscalls(void)  	unsigned long addr;  	int i; -	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * -					NR_syscalls, GFP_KERNEL); +	syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), +				    GFP_KERNEL);  	if (!syscalls_metadata) {  		WARN_ON(1);  		return -ENOMEM; @@ -649,7 +649,7 @@ void perf_sysexit_disable(struct ftrace_event_call *call)  #endif /* CONFIG_PERF_EVENTS */  static int syscall_enter_register(struct ftrace_event_call *event, -				 enum trace_reg type) +				 enum trace_reg type, void *data)  {  	switch (type) {  	case TRACE_REG_REGISTER: @@ -664,13 +664,18 @@ static int syscall_enter_register(struct ftrace_event_call *event,  	case TRACE_REG_PERF_UNREGISTER:  		perf_sysenter_disable(event);  		return 0; +	case TRACE_REG_PERF_OPEN: +	case TRACE_REG_PERF_CLOSE: +	case TRACE_REG_PERF_ADD: +	case TRACE_REG_PERF_DEL: +		return 0;  #endif  	}  	return 0;  }  static int syscall_exit_register(struct ftrace_event_call *event, -				 enum trace_reg type) +				 enum trace_reg type, void *data)  {  	switch (type) {  	case TRACE_REG_REGISTER: @@ -685,6 +690,11 @@ static int syscall_exit_register(struct ftrace_event_call *event,  	case TRACE_REG_PERF_UNREGISTER:  		perf_sysexit_disable(event);  		return 0; +	case TRACE_REG_PERF_OPEN: +	case TRACE_REG_PERF_CLOSE: +	case TRACE_REG_PERF_ADD: +	case TRACE_REG_PERF_DEL: +		return 0;  #endif  	}  	return 0; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f1539decd99d..d96ba22dabfa 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,7 +25,7 @@  #include <linux/err.h>  #include <linux/slab.h>  #include <linux/sched.h> -#include 
<linux/jump_label.h> +#include <linux/static_key.h>  extern struct tracepoint * const __start___tracepoints_ptrs[];  extern struct tracepoint * const __stop___tracepoints_ptrs[]; @@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,  {  	WARN_ON(strcmp((*entry)->name, elem->name) != 0); -	if (elem->regfunc && !jump_label_enabled(&elem->key) && active) +	if (elem->regfunc && !static_key_enabled(&elem->key) && active)  		elem->regfunc(); -	else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) +	else if (elem->unregfunc && static_key_enabled(&elem->key) && !active)  		elem->unregfunc();  	/* @@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,  	 * is used.  	 */  	rcu_assign_pointer(elem->funcs, (*entry)->funcs); -	if (active && !jump_label_enabled(&elem->key)) -		jump_label_inc(&elem->key); -	else if (!active && jump_label_enabled(&elem->key)) -		jump_label_dec(&elem->key); +	if (active && !static_key_enabled(&elem->key)) +		static_key_slow_inc(&elem->key); +	else if (!active && static_key_enabled(&elem->key)) +		static_key_slow_dec(&elem->key);  }  /* @@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,   */  static void disable_tracepoint(struct tracepoint *elem)  { -	if (elem->unregfunc && jump_label_enabled(&elem->key)) +	if (elem->unregfunc && static_key_enabled(&elem->key))  		elem->unregfunc(); -	if (jump_label_enabled(&elem->key)) -		jump_label_dec(&elem->key); +	if (static_key_enabled(&elem->key)) +		static_key_slow_dec(&elem->key);  	rcu_assign_pointer(elem->funcs, NULL);  } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d117262deba3..df30ee08bdd4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -3,15 +3,14 @@   *   * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.   * - * this code detects hard lockups: incidents in where on a CPU - * the kernel does not respond to anything except NMI. 
- * - * Note: Most of this code is borrowed heavily from softlockup.c, - * so thanks to Ingo for the initial implementation. - * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * Note: Most of this code is borrowed heavily from the original softlockup + * detector, so thanks to Ingo for the initial implementation. + * Some chunks also taken from the old x86-specific nmi watchdog code, thanks   * to those contributors as well.   */ +#define pr_fmt(fmt) "NMI watchdog: " fmt +  #include <linux/mm.h>  #include <linux/cpu.h>  #include <linux/nmi.h> @@ -117,9 +116,10 @@ static unsigned long get_sample_period(void)  {  	/*  	 * convert watchdog_thresh from seconds to ns -	 * the divide by 5 is to give hrtimer 5 chances to -	 * increment before the hardlockup detector generates -	 * a warning +	 * the divide by 5 is to give hrtimer several chances (two +	 * or three with the current relation between the soft +	 * and hard thresholds) to increment before the +	 * hardlockup detector generates a warning  	 */  	return get_softlockup_thresh() * (NSEC_PER_SEC / 5);  } @@ -321,11 +321,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)   */  static int watchdog(void *unused)  { -	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; +	struct sched_param param = { .sched_priority = 0 };  	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); -	sched_setscheduler(current, SCHED_FIFO, &param); -  	/* initialize timestamp */  	__touch_watchdog(); @@ -336,9 +334,11 @@ static int watchdog(void *unused)  	set_current_state(TASK_INTERRUPTIBLE);  	/* -	 * Run briefly once per second to reset the softlockup timestamp. -	 * If this gets delayed for more than 60 seconds then the -	 * debug-printout triggers in watchdog_timer_fn(). +	 * Run briefly (kicked by the hrtimer callback function) once every +	 * get_sample_period() seconds (4 seconds by default) to reset the +	 * softlockup timestamp. 
If this gets delayed for more than +	 * 2*watchdog_thresh seconds then the debug-printout triggers in +	 * watchdog_timer_fn().  	 */  	while (!kthread_should_stop()) {  		__touch_watchdog(); @@ -349,8 +349,11 @@ static int watchdog(void *unused)  		set_current_state(TASK_INTERRUPTIBLE);  	} +	/* +	 * Drop the policy/priority elevation during thread exit to avoid a +	 * scheduling latency spike. +	 */  	__set_current_state(TASK_RUNNING); -	param.sched_priority = 0;  	sched_setscheduler(current, SCHED_NORMAL, &param);  	return 0;  } @@ -376,18 +379,20 @@ static int watchdog_nmi_enable(int cpu)  	/* Try to register using hardware perf events */  	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);  	if (!IS_ERR(event)) { -		printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); +		pr_info("enabled, takes one hw-pmu counter.\n");  		goto out_save;  	}  	/* vary the KERN level based on the returned errno */  	if (PTR_ERR(event) == -EOPNOTSUPP) -		printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); +		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);  	else if (PTR_ERR(event) == -ENOENT) -		printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); +		pr_warning("disabled (cpu%i): hardware events not enabled\n", +			 cpu);  	else -		printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); +		pr_err("disabled (cpu%i): unable to create perf event: %ld\n", +			cpu, PTR_ERR(event));  	return PTR_ERR(event);  	/* success path */ @@ -439,9 +444,10 @@ static int watchdog_enable(int cpu)  	/* create the watchdog thread */  	if (!p) { +		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };  		p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);  		if (IS_ERR(p)) { -			printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); +			pr_err("softlockup watchdog 
for %i failed\n", cpu);  			if (!err) {  				/* if hardlockup hasn't already set this */  				err = PTR_ERR(p); @@ -450,6 +456,7 @@ static int watchdog_enable(int cpu)  			}  			goto out;  		} +		sched_setscheduler(p, SCHED_FIFO, &param);  		kthread_bind(p, cpu);  		per_cpu(watchdog_touch_ts, cpu) = 0;  		per_cpu(softlockup_watchdog, cpu) = p; @@ -496,7 +503,7 @@ static void watchdog_enable_all_cpus(void)  			watchdog_enabled = 1;  	if (!watchdog_enabled) -		printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); +		pr_err("failed to be enabled on some cpus\n");  } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bec7b5b53e03..5abf42f63c08 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -253,11 +253,13 @@ struct workqueue_struct *system_long_wq __read_mostly;  struct workqueue_struct *system_nrt_wq __read_mostly;  struct workqueue_struct *system_unbound_wq __read_mostly;  struct workqueue_struct *system_freezable_wq __read_mostly; +struct workqueue_struct *system_nrt_freezable_wq __read_mostly;  EXPORT_SYMBOL_GPL(system_wq);  EXPORT_SYMBOL_GPL(system_long_wq);  EXPORT_SYMBOL_GPL(system_nrt_wq);  EXPORT_SYMBOL_GPL(system_unbound_wq);  EXPORT_SYMBOL_GPL(system_freezable_wq); +EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);  #define CREATE_TRACE_POINTS  #include <trace/events/workqueue.h> @@ -474,13 +476,8 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,  					    struct workqueue_struct *wq)  {  	if (!(wq->flags & WQ_UNBOUND)) { -		if (likely(cpu < nr_cpu_ids)) { -#ifdef CONFIG_SMP +		if (likely(cpu < nr_cpu_ids))  			return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); -#else -			return wq->cpu_wq.single; -#endif -		}  	} else if (likely(cpu == WORK_CPU_UNBOUND))  		return wq->cpu_wq.single;  	return NULL; @@ -2897,13 +2894,8 @@ static int alloc_cwqs(struct workqueue_struct *wq)  	const size_t size = sizeof(struct cpu_workqueue_struct);  	const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,  				   __alignof__(unsigned long 
long)); -#ifdef CONFIG_SMP -	bool percpu = !(wq->flags & WQ_UNBOUND); -#else -	bool percpu = false; -#endif -	if (percpu) +	if (!(wq->flags & WQ_UNBOUND))  		wq->cpu_wq.pcpu = __alloc_percpu(size, align);  	else {  		void *ptr; @@ -2927,13 +2919,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)  static void free_cwqs(struct workqueue_struct *wq)  { -#ifdef CONFIG_SMP -	bool percpu = !(wq->flags & WQ_UNBOUND); -#else -	bool percpu = false; -#endif - -	if (percpu) +	if (!(wq->flags & WQ_UNBOUND))  		free_percpu(wq->cpu_wq.pcpu);  	else if (wq->cpu_wq.single) {  		/* the pointer to free is stored right after the cwq */ @@ -3833,8 +3819,11 @@ static int __init init_workqueues(void)  					    WQ_UNBOUND_MAX_ACTIVE);  	system_freezable_wq = alloc_workqueue("events_freezable",  					      WQ_FREEZABLE, 0); +	system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", +			WQ_NON_REENTRANT | WQ_FREEZABLE, 0);  	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || -	       !system_unbound_wq || !system_freezable_wq); +	       !system_unbound_wq || !system_freezable_wq || +		!system_nrt_freezable_wq);  	return 0;  }  early_initcall(init_workqueues); | 
