Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  674
1 file changed, 405 insertions, 269 deletions
| diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 228d6461c12a..b2ee6df0e9bb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -89,7 +89,6 @@ enum mem_cgroup_stat_index {  	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */  	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */  	MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ -	MEM_CGROUP_ON_MOVE,	/* someone is moving account between groups */  	MEM_CGROUP_STAT_NSTATS,  }; @@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter {   */  struct mem_cgroup_per_zone {  	struct lruvec		lruvec; -	unsigned long		count[NR_LRU_LISTS]; +	unsigned long		lru_size[NR_LRU_LISTS];  	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; @@ -144,11 +143,9 @@ struct mem_cgroup_per_zone {  	unsigned long long	usage_in_excess;/* Set to the value by which */  						/* the soft limit is exceeded*/  	bool			on_tree; -	struct mem_cgroup	*mem;		/* Back pointer, we cannot */ +	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */  						/* use container_of	   */  }; -/* Macro for accessing counter */ -#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])  struct mem_cgroup_per_node {  	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; @@ -230,10 +227,30 @@ struct mem_cgroup {  	 * the counter to account for memory usage  	 */  	struct res_counter res; -	/* -	 * the counter to account for mem+swap usage. -	 */ -	struct res_counter memsw; + +	union { +		/* +		 * the counter to account for mem+swap usage. +		 */ +		struct res_counter memsw; + +		/* +		 * rcu_freeing is used only when freeing struct mem_cgroup, +		 * so put it into a union to avoid wasting more memory. +		 * It must be disjoint from the css field.  It could be +		 * in a union with the res field, but res plays a much +		 * larger part in mem_cgroup life than memsw, and might +		 * be of interest, even at time of free, when debugging. +		 * So share rcu_head with the less interesting memsw. +		 */ +		struct rcu_head rcu_freeing; +		/* +		 * But when using vfree(), that cannot be done at +		 * interrupt time, so we must then queue the work. +		 */ +		struct work_struct work_freeing; +	}; +  	/*  	 * Per cgroup active and inactive list, similar to the  	 * per zone LRU lists. @@ -280,6 +297,12 @@ struct mem_cgroup {  	 */  	unsigned long 	move_charge_at_immigrate;  	/* +	 * set > 0 if pages under this cgroup are moving to other cgroup. +	 */ +	atomic_t	moving_account; +	/* taken only while moving_account > 0 */ +	spinlock_t	move_lock; +	/*  	 * percpu counter.  	 */  	struct mem_cgroup_stat_cpu *stat; @@ -592,9 +615,9 @@ retry:  	 * we will to add it back at the end of reclaim to its correct  	 * position in the tree.  	 */ -	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz); -	if (!res_counter_soft_limit_excess(&mz->mem->res) || -		!css_tryget(&mz->mem->css)) +	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); +	if (!res_counter_soft_limit_excess(&mz->memcg->res) || +		!css_tryget(&mz->memcg->css))  		goto retry;  done:  	return mz; @@ -672,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,  }  static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, -					 bool file, int nr_pages) +					 bool anon, int nr_pages)  {  	preempt_disable(); -	if (file) -		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], +	/* +	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is +	 * counted as CACHE even if it's on ANON LRU. 
+	 */ +	if (anon) +		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],  				nr_pages);  	else -		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], +		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],  				nr_pages);  	/* pagein of a big page is an event. So, ignore page size */ @@ -701,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,  			unsigned int lru_mask)  {  	struct mem_cgroup_per_zone *mz; -	enum lru_list l; +	enum lru_list lru;  	unsigned long ret = 0;  	mz = mem_cgroup_zoneinfo(memcg, nid, zid); -	for_each_lru(l) { -		if (BIT(l) & lru_mask) -			ret += MEM_CGROUP_ZSTAT(mz, l); +	for_each_lru(lru) { +		if (BIT(lru) & lru_mask) +			ret += mz->lru_size[lru];  	}  	return ret;  } @@ -1042,9 +1069,22 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,  	pc = lookup_page_cgroup(page);  	memcg = pc->mem_cgroup; + +	/* +	 * Surreptitiously switch any uncharged page to root: +	 * an uncharged page off lru does nothing to secure +	 * its former mem_cgroup from sudden removal. +	 * +	 * Our caller holds lru_lock, and PageCgroupUsed is updated +	 * under page_cgroup lock: between them, they make all uses +	 * of pc->mem_cgroup safe. +	 */ +	if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) +		pc->mem_cgroup = memcg = root_mem_cgroup; +  	mz = page_cgroup_zoneinfo(memcg, page);  	/* compound_order() is stabilized through lru_lock */ -	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); +	mz->lru_size[lru] += 1 << compound_order(page);  	return &mz->lruvec;  } @@ -1072,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)  	VM_BUG_ON(!memcg);  	mz = page_cgroup_zoneinfo(memcg, page);  	/* huge page split is done under lru_lock. so, we have no races. */ -	VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); -	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); +	VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page))); +	mz->lru_size[lru] -= 1 << compound_order(page);  }  void mem_cgroup_lru_del(struct page *page) @@ -1252,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)  	return memcg->swappiness;  } -static void mem_cgroup_start_move(struct mem_cgroup *memcg) -{ -	int cpu; +/* + * memcg->moving_account is used for checking possibility that some thread is + * calling move_account(). When a thread on CPU-A starts moving pages under + * a memcg, other threads should check memcg->moving_account under + * rcu_read_lock(), like this: + * + *         CPU-A                                    CPU-B + *                                              rcu_read_lock() + *         memcg->moving_account+1              if (memcg->mocing_account) + *                                                   take heavy locks. + *         synchronize_rcu()                    update something. + *                                              rcu_read_unlock() + *         start move here. 
+ */ -	get_online_cpus(); -	spin_lock(&memcg->pcp_counter_lock); -	for_each_online_cpu(cpu) -		per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; -	memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; -	spin_unlock(&memcg->pcp_counter_lock); -	put_online_cpus(); +/* for quick checking without looking up memcg */ +atomic_t memcg_moving __read_mostly; +static void mem_cgroup_start_move(struct mem_cgroup *memcg) +{ +	atomic_inc(&memcg_moving); +	atomic_inc(&memcg->moving_account);  	synchronize_rcu();  }  static void mem_cgroup_end_move(struct mem_cgroup *memcg)  { -	int cpu; - -	if (!memcg) -		return; -	get_online_cpus(); -	spin_lock(&memcg->pcp_counter_lock); -	for_each_online_cpu(cpu) -		per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; -	memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; -	spin_unlock(&memcg->pcp_counter_lock); -	put_online_cpus(); +	/* +	 * Now, mem_cgroup_clear_mc() may call this function with NULL. +	 * We check NULL in callee rather than caller. +	 */ +	if (memcg) { +		atomic_dec(&memcg_moving); +		atomic_dec(&memcg->moving_account); +	}  } +  /*   * 2 routines for checking "mem" is under move_account() or not.   * - * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used - *			  for avoiding race in accounting. If true, + * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This + *			  is used for avoiding races in accounting.  If true,   *			  pc->mem_cgroup may be overwritten.   *   * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or @@ -1293,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)   *			  waiting at hith-memory prressure caused by "move".   */ -static bool mem_cgroup_stealed(struct mem_cgroup *memcg) +static bool mem_cgroup_stolen(struct mem_cgroup *memcg)  {  	VM_BUG_ON(!rcu_read_lock_held()); -	return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; +	return atomic_read(&memcg->moving_account) > 0;  }  static bool mem_cgroup_under_move(struct mem_cgroup *memcg) @@ -1337,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)  	return false;  } +/* + * Take this lock when + * - a code tries to modify page's memcg while it's USED. + * - a code tries to modify page state accounting in a memcg. + * see mem_cgroup_stolen(), too. + */ +static void move_lock_mem_cgroup(struct mem_cgroup *memcg, +				  unsigned long *flags) +{ +	spin_lock_irqsave(&memcg->move_lock, *flags); +} + +static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, +				unsigned long *flags) +{ +	spin_unlock_irqrestore(&memcg->move_lock, *flags); +} +  /**   * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.   
* @memcg: The memory cgroup that went over limit @@ -1360,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)  	if (!memcg || !p)  		return; -  	rcu_read_lock();  	mem_cgrp = memcg->css.cgroup; @@ -1739,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock);  static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);  struct oom_wait_info { -	struct mem_cgroup *mem; +	struct mem_cgroup *memcg;  	wait_queue_t	wait;  };  static int memcg_oom_wake_function(wait_queue_t *wait,  	unsigned mode, int sync, void *arg)  { -	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, -			  *oom_wait_memcg; +	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; +	struct mem_cgroup *oom_wait_memcg;  	struct oom_wait_info *oom_wait_info;  	oom_wait_info = container_of(wait, struct oom_wait_info, wait); -	oom_wait_memcg = oom_wait_info->mem; +	oom_wait_memcg = oom_wait_info->memcg;  	/* -	 * Both of oom_wait_info->mem and wake_mem are stable under us. +	 * Both of oom_wait_info->memcg and wake_memcg are stable under us.  	 * Then we can use css_is_ancestor without taking care of RCU.  	 */  	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) @@ -1778,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)  /*   * try to call OOM killer. returns false if we should exit memory-reclaim loop.   */ -bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) +bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)  {  	struct oom_wait_info owait;  	bool locked, need_to_kill; -	owait.mem = memcg; +	owait.memcg = memcg;  	owait.wait.flags = 0;  	owait.wait.func = memcg_oom_wake_function;  	owait.wait.private = current; @@ -1808,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)  	if (need_to_kill) {  		finish_wait(&memcg_oom_waitq, &owait.wait); -		mem_cgroup_out_of_memory(memcg, mask); +		mem_cgroup_out_of_memory(memcg, mask, order);  	} else {  		schedule();  		finish_wait(&memcg_oom_waitq, &owait.wait); @@ -1848,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)   * by flags.   *   * Considering "move", this is an only case we see a race. To make the race - * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are - * possibility of race condition. If there is, we take a lock. + * small, we check mm->moving_account and detect there are possibility of race + * If there is, we take a lock.   */ +void __mem_cgroup_begin_update_page_stat(struct page *page, +				bool *locked, unsigned long *flags) +{ +	struct mem_cgroup *memcg; +	struct page_cgroup *pc; + +	pc = lookup_page_cgroup(page); +again: +	memcg = pc->mem_cgroup; +	if (unlikely(!memcg || !PageCgroupUsed(pc))) +		return; +	/* +	 * If this memory cgroup is not under account moving, we don't +	 * need to take move_lock_page_cgroup(). Because we already hold +	 * rcu_read_lock(), any calls to move_account will be delayed until +	 * rcu_read_unlock() if mem_cgroup_stolen() == true. +	 */ +	if (!mem_cgroup_stolen(memcg)) +		return; + +	move_lock_mem_cgroup(memcg, flags); +	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { +		move_unlock_mem_cgroup(memcg, flags); +		goto again; +	} +	*locked = true; +} + +void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) +{ +	struct page_cgroup *pc = lookup_page_cgroup(page); + +	/* +	 * It's guaranteed that pc->mem_cgroup never changes while +	 * lock is held because a routine modifies pc->mem_cgroup +	 * should take move_lock_page_cgroup(). 
+	 */ +	move_unlock_mem_cgroup(pc->mem_cgroup, flags); +} +  void mem_cgroup_update_page_stat(struct page *page,  				 enum mem_cgroup_page_stat_item idx, int val)  {  	struct mem_cgroup *memcg;  	struct page_cgroup *pc = lookup_page_cgroup(page); -	bool need_unlock = false;  	unsigned long uninitialized_var(flags);  	if (mem_cgroup_disabled())  		return; -	rcu_read_lock();  	memcg = pc->mem_cgroup;  	if (unlikely(!memcg || !PageCgroupUsed(pc))) -		goto out; -	/* pc->mem_cgroup is unstable ? */ -	if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) { -		/* take a lock against to access pc->mem_cgroup */ -		move_lock_page_cgroup(pc, &flags); -		need_unlock = true; -		memcg = pc->mem_cgroup; -		if (!memcg || !PageCgroupUsed(pc)) -			goto out; -	} +		return;  	switch (idx) {  	case MEMCG_NR_FILE_MAPPED: -		if (val > 0) -			SetPageCgroupFileMapped(pc); -		else if (!page_mapped(page)) -			ClearPageCgroupFileMapped(pc);  		idx = MEM_CGROUP_STAT_FILE_MAPPED;  		break;  	default: @@ -1890,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page,  	}  	this_cpu_add(memcg->stat->count[idx], val); - -out: -	if (unlikely(need_unlock)) -		move_unlock_page_cgroup(pc, &flags); -	rcu_read_unlock(); -	return;  } -EXPORT_SYMBOL(mem_cgroup_update_page_stat);  /*   * size of first charge trial. "32" comes from vmscan.c's magic value. @@ -2068,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)  		per_cpu(memcg->stat->events[i], cpu) = 0;  		memcg->nocpu_base.events[i] += x;  	} -	/* need to clear ON_MOVE value, works as a kind of lock. */ -	per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; -	spin_unlock(&memcg->pcp_counter_lock); -} - -static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu) -{ -	int idx = MEM_CGROUP_ON_MOVE; - -	spin_lock(&memcg->pcp_counter_lock); -	per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];  	spin_unlock(&memcg->pcp_counter_lock);  } @@ -2090,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,  	struct memcg_stock_pcp *stock;  	struct mem_cgroup *iter; -	if ((action == CPU_ONLINE)) { -		for_each_mem_cgroup(iter) -			synchronize_mem_cgroup_on_move(iter, cpu); +	if (action == CPU_ONLINE)  		return NOTIFY_OK; -	}  	if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)  		return NOTIFY_OK; @@ -2179,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,  	if (!oom_check)  		return CHARGE_NOMEM;  	/* check OOM */ -	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) +	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))  		return CHARGE_OOM_DIE;  	return CHARGE_RETRY; @@ -2408,8 +2477,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  				       struct page *page,  				       unsigned int nr_pages,  				       struct page_cgroup *pc, -				       enum charge_type ctype) +				       enum charge_type ctype, +				       bool lrucare)  { +	struct zone *uninitialized_var(zone); +	bool was_on_lru = false; +	bool anon; +  	lock_page_cgroup(pc);  	if (unlikely(PageCgroupUsed(pc))) {  		unlock_page_cgroup(pc); @@ -2420,6 +2494,21 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  	 * we don't need page_cgroup_lock about tail pages, becase they are not  	 * accessed by any other context at this point.  	 */ + +	/* +	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page +	 * may already be on some other mem_cgroup's LRU.  Take care of it. 
+	 */ +	if (lrucare) { +		zone = page_zone(page); +		spin_lock_irq(&zone->lru_lock); +		if (PageLRU(page)) { +			ClearPageLRU(page); +			del_page_from_lru_list(zone, page, page_lru(page)); +			was_on_lru = true; +		} +	} +  	pc->mem_cgroup = memcg;  	/*  	 * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2429,23 +2518,25 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  	 * See mem_cgroup_add_lru_list(), etc.   	 */  	smp_wmb(); -	switch (ctype) { -	case MEM_CGROUP_CHARGE_TYPE_CACHE: -	case MEM_CGROUP_CHARGE_TYPE_SHMEM: -		SetPageCgroupCache(pc); -		SetPageCgroupUsed(pc); -		break; -	case MEM_CGROUP_CHARGE_TYPE_MAPPED: -		ClearPageCgroupCache(pc); -		SetPageCgroupUsed(pc); -		break; -	default: -		break; +	SetPageCgroupUsed(pc); + +	if (lrucare) { +		if (was_on_lru) { +			VM_BUG_ON(PageLRU(page)); +			SetPageLRU(page); +			add_page_to_lru_list(zone, page, page_lru(page)); +		} +		spin_unlock_irq(&zone->lru_lock);  	} -	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); +	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) +		anon = true; +	else +		anon = false; + +	mem_cgroup_charge_statistics(memcg, anon, nr_pages);  	unlock_page_cgroup(pc); -	WARN_ON_ONCE(PageLRU(page)); +  	/*  	 * "charge_statistics" updated event counter. Then, check it.  	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. @@ -2456,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ -			(1 << PCG_MIGRATION)) +#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))  /*   * Because tail pages are not marked as "used", set it. We're under   * zone->lru_lock, 'splitting on pmd' and compound_lock. @@ -2508,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page,  {  	unsigned long flags;  	int ret; +	bool anon = PageAnon(page);  	VM_BUG_ON(from == to);  	VM_BUG_ON(PageLRU(page)); @@ -2527,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page,  	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)  		goto unlock; -	move_lock_page_cgroup(pc, &flags); +	move_lock_mem_cgroup(from, &flags); -	if (PageCgroupFileMapped(pc)) { +	if (!anon && page_mapped(page)) {  		/* Update mapped_file data for mem_cgroup */  		preempt_disable();  		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);  		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);  		preempt_enable();  	} -	mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); +	mem_cgroup_charge_statistics(from, anon, -nr_pages);  	if (uncharge)  		/* This is not "cancel", but cancel_charge does all we need. */  		__mem_cgroup_cancel_charge(from, nr_pages);  	/* caller should have done css_get */  	pc->mem_cgroup = to; -	mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); +	mem_cgroup_charge_statistics(to, anon, nr_pages);  	/*  	 * We charges against "to" which may not have any tasks. Then, "to"  	 * can be under rmdir(). But in current implementation, caller of @@ -2551,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page,  	 * guaranteed that "to" is never removed. So, we don't check rmdir  	 * status here.  	 
*/ -	move_unlock_page_cgroup(pc, &flags); +	move_unlock_mem_cgroup(from, &flags);  	ret = 0;  unlock:  	unlock_page_cgroup(pc); @@ -2643,7 +2734,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,  	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);  	if (ret == -ENOMEM)  		return ret; -	__mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); +	__mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false);  	return 0;  } @@ -2663,35 +2754,6 @@ static void  __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,  					enum charge_type ctype); -static void -__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, -					enum charge_type ctype) -{ -	struct page_cgroup *pc = lookup_page_cgroup(page); -	struct zone *zone = page_zone(page); -	unsigned long flags; -	bool removed = false; - -	/* -	 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page -	 * is already on LRU. It means the page may on some other page_cgroup's -	 * LRU. Take care of it. -	 */ -	spin_lock_irqsave(&zone->lru_lock, flags); -	if (PageLRU(page)) { -		del_page_from_lru_list(zone, page, page_lru(page)); -		ClearPageLRU(page); -		removed = true; -	} -	__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); -	if (removed) { -		add_page_to_lru_list(zone, page, page_lru(page)); -		SetPageLRU(page); -	} -	spin_unlock_irqrestore(&zone->lru_lock, flags); -	return; -} -  int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,  				gfp_t gfp_mask)  { @@ -2769,13 +2831,16 @@ static void  __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,  					enum charge_type ctype)  { +	struct page_cgroup *pc; +  	if (mem_cgroup_disabled())  		return;  	if (!memcg)  		return;  	cgroup_exclude_rmdir(&memcg->css); -	__mem_cgroup_commit_charge_lrucare(page, memcg, ctype); +	pc = lookup_page_cgroup(page); +	__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);  	/*  	 * Now swap is on-memory. This means this page may be  	 * counted both as mem and swap....double count. @@ -2879,7 +2944,6 @@ direct_uncharge:  		res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);  	if (unlikely(batch->memcg != memcg))  		memcg_oom_recover(memcg); -	return;  }  /* @@ -2891,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)  	struct mem_cgroup *memcg = NULL;  	unsigned int nr_pages = 1;  	struct page_cgroup *pc; +	bool anon;  	if (mem_cgroup_disabled())  		return NULL; @@ -2916,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)  	if (!PageCgroupUsed(pc))  		goto unlock_out; +	anon = PageAnon(page); +  	switch (ctype) {  	case MEM_CGROUP_CHARGE_TYPE_MAPPED: +		/* +		 * Generally PageAnon tells if it's the anon statistics to be +		 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is +		 * used before page reached the stage of being marked PageAnon. 
+		 */ +		anon = true; +		/* fallthrough */  	case MEM_CGROUP_CHARGE_TYPE_DROP:  		/* See mem_cgroup_prepare_migration() */  		if (page_mapped(page) || PageCgroupMigration(pc)) @@ -2934,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)  		break;  	} -	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); +	mem_cgroup_charge_statistics(memcg, anon, -nr_pages);  	ClearPageCgroupUsed(pc);  	/* @@ -3027,23 +3101,6 @@ void mem_cgroup_uncharge_end(void)  	batch->memcg = NULL;  } -/* - * A function for resetting pc->mem_cgroup for newly allocated pages. - * This function should be called if the newpage will be added to LRU - * before start accounting. - */ -void mem_cgroup_reset_owner(struct page *newpage) -{ -	struct page_cgroup *pc; - -	if (mem_cgroup_disabled()) -		return; - -	pc = lookup_page_cgroup(newpage); -	VM_BUG_ON(PageCgroupUsed(pc)); -	pc->mem_cgroup = root_mem_cgroup; -} -  #ifdef CONFIG_SWAP  /*   * called after __delete_from_swap_cache() and drop "page" account. @@ -3248,7 +3305,7 @@ int mem_cgroup_prepare_migration(struct page *page,  		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;  	else  		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; -	__mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype); +	__mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false);  	return ret;  } @@ -3258,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,  {  	struct page *used, *unused;  	struct page_cgroup *pc; +	bool anon;  	if (!memcg)  		return; @@ -3279,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,  	lock_page_cgroup(pc);  	ClearPageCgroupMigration(pc);  	unlock_page_cgroup(pc); - -	__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); +	anon = PageAnon(used); +	__mem_cgroup_uncharge_common(unused, +		anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED +		     : MEM_CGROUP_CHARGE_TYPE_CACHE);  	/*  	 * If a page is a file cache, radix-tree replacement is very atomic @@ -3290,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,  	 * and USED bit check in mem_cgroup_uncharge_page() will do enough  	 * check. (see prepare_charge() also)  	 */ -	if (PageAnon(used)) +	if (anon)  		mem_cgroup_uncharge_page(used);  	/*  	 * At migration, we may charge account against cgroup which has no @@ -3320,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,  	/* fix accounting on old pages */  	lock_page_cgroup(pc);  	memcg = pc->mem_cgroup; -	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); +	mem_cgroup_charge_statistics(memcg, false, -1);  	ClearPageCgroupUsed(pc);  	unlock_page_cgroup(pc); @@ -3332,7 +3392,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,  	 * the newpage may be on LRU(or pagevec for LRU) already. We lock  	 * LRU while we overwrite pc->mem_cgroup.  	 
*/ -	__mem_cgroup_commit_charge_lrucare(newpage, memcg, type); +	__mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true);  }  #ifdef CONFIG_DEBUG_VM @@ -3531,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,  			break;  		nr_scanned = 0; -		reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, +		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,  						    gfp_mask, &nr_scanned);  		nr_reclaimed += reclaimed;  		*total_scanned += nr_scanned; @@ -3558,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,  				next_mz =  				__mem_cgroup_largest_soft_limit_node(mctz);  				if (next_mz == mz) -					css_put(&next_mz->mem->css); +					css_put(&next_mz->memcg->css);  				else /* next_mz == NULL or other memcg */  					break;  			} while (1);  		} -		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz); -		excess = res_counter_soft_limit_excess(&mz->mem->res); +		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); +		excess = res_counter_soft_limit_excess(&mz->memcg->res);  		/*  		 * One school of thought says that we should not add  		 * back the node to the tree if reclaim returns 0. @@ -3574,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,  		 * term TODO.  		 */  		/* If excess == 0, no tree ops */ -		__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); +		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);  		spin_unlock(&mctz->lock); -		css_put(&mz->mem->css); +		css_put(&mz->memcg->css);  		loop++;  		/*  		 * Could not reclaim anything and there are no more @@ -3589,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,  			break;  	} while (!nr_reclaimed);  	if (next_mz) -		css_put(&next_mz->mem->css); +		css_put(&next_mz->memcg->css);  	return nr_reclaimed;  } @@ -3611,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,  	mz = mem_cgroup_zoneinfo(memcg, node, zid);  	list = &mz->lruvec.lists[lru]; -	loop = MEM_CGROUP_ZSTAT(mz, lru); +	loop = mz->lru_size[lru];  	/* give some margin against EBUSY etc...*/  	loop += 256;  	busy = NULL; @@ -3685,10 +3745,10 @@ move_account:  		mem_cgroup_start_move(memcg);  		for_each_node_state(node, N_HIGH_MEMORY) {  			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { -				enum lru_list l; -				for_each_lru(l) { +				enum lru_list lru; +				for_each_lru(lru) {  					ret = mem_cgroup_force_empty_list(memcg, -							node, zid, l); +							node, zid, lru);  					if (ret)  						break;  				} @@ -3842,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)  		break;  	default:  		BUG(); -		break;  	}  	return val;  } @@ -3921,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,  out:  	*mem_limit = min_limit;  	*memsw_limit = min_memsw_limit; -	return;  }  static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) @@ -4080,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)  	unsigned long total_nr, file_nr, anon_nr, unevictable_nr;  	unsigned long node_nr;  	struct cgroup *cont = m->private; -	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); -	total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); +	total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);  	seq_printf(m, "total=%lu", total_nr);  	for_each_node_state(nid, N_HIGH_MEMORY) { -		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); +		node_nr = 
mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);  		seq_printf(m, " N%d=%lu", nid, node_nr);  	}  	seq_putc(m, '\n'); -	file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); +	file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);  	seq_printf(m, "file=%lu", file_nr);  	for_each_node_state(nid, N_HIGH_MEMORY) { -		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, +		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,  				LRU_ALL_FILE);  		seq_printf(m, " N%d=%lu", nid, node_nr);  	}  	seq_putc(m, '\n'); -	anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); +	anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);  	seq_printf(m, "anon=%lu", anon_nr);  	for_each_node_state(nid, N_HIGH_MEMORY) { -		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, +		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,  				LRU_ALL_ANON);  		seq_printf(m, " N%d=%lu", nid, node_nr);  	}  	seq_putc(m, '\n'); -	unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); +	unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));  	seq_printf(m, "unevictable=%lu", unevictable_nr);  	for_each_node_state(nid, N_HIGH_MEMORY) { -		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, +		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,  				BIT(LRU_UNEVICTABLE));  		seq_printf(m, " N%d=%lu", nid, node_nr);  	} @@ -4123,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)  static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,  				 struct cgroup_map_cb *cb)  { -	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); +	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);  	struct mcs_total_stat mystat;  	int i;  	memset(&mystat, 0, sizeof(mystat)); -	mem_cgroup_get_local_stat(mem_cont, &mystat); +	mem_cgroup_get_local_stat(memcg, &mystat);  	for (i = 0; i < NR_MCS_STAT; i++) { @@ -4140,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,  	/* Hierarchical information */  	{  		unsigned long long limit, memsw_limit; -		memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); +		memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);  		cb->fill(cb, "hierarchical_memory_limit", limit);  		if (do_swap_account)  			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);  	}  	memset(&mystat, 0, sizeof(mystat)); -	mem_cgroup_get_total_stat(mem_cont, &mystat); +	mem_cgroup_get_total_stat(memcg, &mystat);  	for (i = 0; i < NR_MCS_STAT; i++) {  		if (i == MCS_SWAP && !do_swap_account)  			continue; @@ -4163,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,  		for_each_online_node(nid)  			for (zid = 0; zid < MAX_NR_ZONES; zid++) { -				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); +				mz = mem_cgroup_zoneinfo(memcg, nid, zid);  				recent_rotated[0] +=  					mz->reclaim_stat.recent_rotated[0]; @@ -4408,12 +4466,6 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,  	else  		BUG(); -	/* -	 * Something went wrong if we trying to unregister a threshold -	 * if we don't have thresholds -	 */ -	BUG_ON(!thresholds); -  	if (!thresholds->primary)  		goto unlock; @@ -4584,10 +4636,9 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)  	return mem_cgroup_sockets_init(cont, ss);  }; -static void kmem_cgroup_destroy(struct cgroup_subsys *ss, -				struct cgroup *cont) +static void kmem_cgroup_destroy(struct cgroup *cont)  { -	mem_cgroup_sockets_destroy(cont, ss); +	mem_cgroup_sockets_destroy(cont);  }  #else  
static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) @@ -4595,8 +4646,7 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)  	return 0;  } -static void kmem_cgroup_destroy(struct cgroup_subsys *ss, -				struct cgroup *cont) +static void kmem_cgroup_destroy(struct cgroup *cont)  {  }  #endif @@ -4720,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)  {  	struct mem_cgroup_per_node *pn;  	struct mem_cgroup_per_zone *mz; -	enum lru_list l; +	enum lru_list lru;  	int zone, tmp = node;  	/*  	 * This routine is called against possible nodes. @@ -4738,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {  		mz = &pn->zoneinfo[zone]; -		for_each_lru(l) -			INIT_LIST_HEAD(&mz->lruvec.lists[l]); +		for_each_lru(lru) +			INIT_LIST_HEAD(&mz->lruvec.lists[lru]);  		mz->usage_in_excess = 0;  		mz->on_tree = false; -		mz->mem = memcg; +		mz->memcg = memcg;  	}  	memcg->info.nodeinfo[node] = pn;  	return 0; @@ -4755,33 +4805,54 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)  static struct mem_cgroup *mem_cgroup_alloc(void)  { -	struct mem_cgroup *mem; +	struct mem_cgroup *memcg;  	int size = sizeof(struct mem_cgroup);  	/* Can be very big if MAX_NUMNODES is very big */  	if (size < PAGE_SIZE) -		mem = kzalloc(size, GFP_KERNEL); +		memcg = kzalloc(size, GFP_KERNEL);  	else -		mem = vzalloc(size); +		memcg = vzalloc(size); -	if (!mem) +	if (!memcg)  		return NULL; -	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); -	if (!mem->stat) +	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); +	if (!memcg->stat)  		goto out_free; -	spin_lock_init(&mem->pcp_counter_lock); -	return mem; +	spin_lock_init(&memcg->pcp_counter_lock); +	return memcg;  out_free:  	if (size < PAGE_SIZE) -		kfree(mem); +		kfree(memcg);  	else -		vfree(mem); +		vfree(memcg);  	return NULL;  }  /* + * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, + * but in process context.  The work_freeing structure is overlaid + * on the rcu_freeing structure, which itself is overlaid on memsw. + */ +static void vfree_work(struct work_struct *work) +{ +	struct mem_cgroup *memcg; + +	memcg = container_of(work, struct mem_cgroup, work_freeing); +	vfree(memcg); +} +static void vfree_rcu(struct rcu_head *rcu_head) +{ +	struct mem_cgroup *memcg; + +	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); +	INIT_WORK(&memcg->work_freeing, vfree_work); +	schedule_work(&memcg->work_freeing); +} + +/*   * At destroying mem_cgroup, references from swap_cgroup can remain.   * (scanning all at force_empty is too costly...)   
* @@ -4804,9 +4875,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)  	free_percpu(memcg->stat);  	if (sizeof(struct mem_cgroup) < PAGE_SIZE) -		kfree(memcg); +		kfree_rcu(memcg, rcu_freeing);  	else -		vfree(memcg); +		call_rcu(&memcg->rcu_freeing, vfree_rcu);  }  static void mem_cgroup_get(struct mem_cgroup *memcg) @@ -4888,7 +4959,7 @@ err_cleanup:  }  static struct cgroup_subsys_state * __ref -mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) +mem_cgroup_create(struct cgroup *cont)  {  	struct mem_cgroup *memcg, *parent;  	long error = -ENOMEM; @@ -4944,26 +5015,25 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)  	atomic_set(&memcg->refcnt, 1);  	memcg->move_charge_at_immigrate = 0;  	mutex_init(&memcg->thresholds_lock); +	spin_lock_init(&memcg->move_lock);  	return &memcg->css;  free_out:  	__mem_cgroup_free(memcg);  	return ERR_PTR(error);  } -static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, -					struct cgroup *cont) +static int mem_cgroup_pre_destroy(struct cgroup *cont)  {  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);  	return mem_cgroup_force_empty(memcg, false);  } -static void mem_cgroup_destroy(struct cgroup_subsys *ss, -				struct cgroup *cont) +static void mem_cgroup_destroy(struct cgroup *cont)  {  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); -	kmem_cgroup_destroy(ss, cont); +	kmem_cgroup_destroy(cont);  	mem_cgroup_put(memcg);  } @@ -5040,7 +5110,7 @@ one_by_one:  }  /** - * is_target_pte_for_mc - check a pte whether it is valid for move charge + * get_mctgt_type - get target type of moving charge   * @vma: the vma the pte to be checked belongs   * @addr: the address corresponding to the pte to be checked   * @ptent: the pte to be checked @@ -5063,7 +5133,7 @@ union mc_target {  };  enum mc_target_type { -	MC_TARGET_NONE,	/* not used */ +	MC_TARGET_NONE = 0,  	MC_TARGET_PAGE,  	MC_TARGET_SWAP,  }; @@ -5144,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,  	return page;  } -static int is_target_pte_for_mc(struct vm_area_struct *vma, +static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,  		unsigned long addr, pte_t ptent, union mc_target *target)  {  	struct page *page = NULL;  	struct page_cgroup *pc; -	int ret = 0; +	enum mc_target_type ret = MC_TARGET_NONE;  	swp_entry_t ent = { .val = 0 };  	if (pte_present(ptent)) @@ -5160,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,  		page = mc_handle_file_pte(vma, addr, ptent, &ent);  	if (!page && !ent.val) -		return 0; +		return ret;  	if (page) {  		pc = lookup_page_cgroup(page);  		/* @@ -5186,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,  	return ret;  } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * We don't consider swapping or file mapped pages because THP does not + * support them for now. + * Caller should make sure that pmd_trans_huge(pmd) is true. 
+ */ +static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, +		unsigned long addr, pmd_t pmd, union mc_target *target) +{ +	struct page *page = NULL; +	struct page_cgroup *pc; +	enum mc_target_type ret = MC_TARGET_NONE; + +	page = pmd_page(pmd); +	VM_BUG_ON(!page || !PageHead(page)); +	if (!move_anon()) +		return ret; +	pc = lookup_page_cgroup(page); +	if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { +		ret = MC_TARGET_PAGE; +		if (target) { +			get_page(page); +			target->page = page; +		} +	} +	return ret; +} +#else +static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, +		unsigned long addr, pmd_t pmd, union mc_target *target) +{ +	return MC_TARGET_NONE; +} +#endif +  static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,  					unsigned long addr, unsigned long end,  					struct mm_walk *walk) @@ -5194,11 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,  	pte_t *pte;  	spinlock_t *ptl; -	split_huge_page_pmd(walk->mm, pmd); +	if (pmd_trans_huge_lock(pmd, vma) == 1) { +		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) +			mc.precharge += HPAGE_PMD_NR; +		spin_unlock(&vma->vm_mm->page_table_lock); +		return 0; +	}  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);  	for (; addr != end; pte++, addr += PAGE_SIZE) -		if (is_target_pte_for_mc(vma, addr, *pte, NULL)) +		if (get_mctgt_type(vma, addr, *pte, NULL))  			mc.precharge++;	/* increment precharge temporarily */  	pte_unmap_unlock(pte - 1, ptl);  	cond_resched(); @@ -5300,9 +5410,8 @@ static void mem_cgroup_clear_mc(void)  	mem_cgroup_end_move(from);  } -static int mem_cgroup_can_attach(struct cgroup_subsys *ss, -				struct cgroup *cgroup, -				struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup *cgroup, +				 struct cgroup_taskset *tset)  {  	struct task_struct *p = cgroup_taskset_first(tset);  	int ret = 0; @@ -5340,9 +5449,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,  	return ret;  } -static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, -				struct cgroup *cgroup, -				struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup *cgroup, +				     struct cgroup_taskset *tset)  {  	mem_cgroup_clear_mc();  } @@ -5355,23 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,  	struct vm_area_struct *vma = walk->private;  	pte_t *pte;  	spinlock_t *ptl; +	enum mc_target_type target_type; +	union mc_target target; +	struct page *page; +	struct page_cgroup *pc; + +	/* +	 * We don't take compound_lock() here but no race with splitting thp +	 * happens because: +	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not +	 *    under splitting, which means there's no concurrent thp split, +	 *  - if another thread runs into split_huge_page() just after we +	 *    entered this if-block, the thread must wait for page table lock +	 *    to be unlocked in __split_huge_page_splitting(), where the main +	 *    part of thp split is not executed yet. 
+	 */ +	if (pmd_trans_huge_lock(pmd, vma) == 1) { +		if (!mc.precharge) { +			spin_unlock(&vma->vm_mm->page_table_lock); +			return 0; +		} +		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); +		if (target_type == MC_TARGET_PAGE) { +			page = target.page; +			if (!isolate_lru_page(page)) { +				pc = lookup_page_cgroup(page); +				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, +							     pc, mc.from, mc.to, +							     false)) { +					mc.precharge -= HPAGE_PMD_NR; +					mc.moved_charge += HPAGE_PMD_NR; +				} +				putback_lru_page(page); +			} +			put_page(page); +		} +		spin_unlock(&vma->vm_mm->page_table_lock); +		return 0; +	} -	split_huge_page_pmd(walk->mm, pmd);  retry:  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);  	for (; addr != end; addr += PAGE_SIZE) {  		pte_t ptent = *(pte++); -		union mc_target target; -		int type; -		struct page *page; -		struct page_cgroup *pc;  		swp_entry_t ent;  		if (!mc.precharge)  			break; -		type = is_target_pte_for_mc(vma, addr, ptent, &target); -		switch (type) { +		switch (get_mctgt_type(vma, addr, ptent, &target)) {  		case MC_TARGET_PAGE:  			page = target.page;  			if (isolate_lru_page(page)) @@ -5384,7 +5524,7 @@ retry:  				mc.moved_charge++;  			}  			putback_lru_page(page); -put:			/* is_target_pte_for_mc() gets the page */ +put:			/* get_mctgt_type() gets the page */  			put_page(page);  			break;  		case MC_TARGET_SWAP: @@ -5457,9 +5597,8 @@ retry:  	up_read(&mm->mmap_sem);  } -static void mem_cgroup_move_task(struct cgroup_subsys *ss, -				struct cgroup *cont, -				struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup *cont, +				 struct cgroup_taskset *tset)  {  	struct task_struct *p = cgroup_taskset_first(tset);  	struct mm_struct *mm = get_task_mm(p); @@ -5474,20 +5613,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,  		mem_cgroup_clear_mc();  }  #else	/* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys *ss, -				struct cgroup *cgroup, -				struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup *cgroup, +				 struct cgroup_taskset *tset)  {  	return 0;  } -static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, -				struct cgroup *cgroup, -				struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup *cgroup, +				     struct cgroup_taskset *tset)  {  } -static void mem_cgroup_move_task(struct cgroup_subsys *ss, -				struct cgroup *cont, -				struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup *cont, +				 struct cgroup_taskset *tset)  {  }  #endif | 
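The moving_account handshake introduced above replaces the old MEM_CGROUP_ON_MOVE per-cpu counter: mem_cgroup_start_move() bumps an atomic counter and then calls synchronize_rcu(), while statistic updaters run under rcu_read_lock() and fall back to move_lock only when they see the counter raised. The userspace model below is just a sketch of that two-phase check, not kernel code: fake_memcg, stat_update() and start_move() are made-up names, and a pthread rwlock stands in for the RCU read-side section and the grace-period wait.

/* Toy model of the moving_account check; build with: cc -pthread model.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct fake_memcg {
	atomic_int moving_account;	/* > 0 while an account move is in flight */
	pthread_mutex_t move_lock;	/* heavy lock, taken only during a move */
	pthread_rwlock_t grace;		/* stand-in for RCU read side + grace period */
	long file_mapped;		/* one sample statistic */
};

/* Fast path: update a per-memcg statistic. */
static void stat_update(struct fake_memcg *memcg, long delta)
{
	pthread_rwlock_rdlock(&memcg->grace);		/* ~ rcu_read_lock() */
	if (atomic_load(&memcg->moving_account) > 0) {
		/* a mover may rewrite page ownership: take the heavy lock */
		pthread_mutex_lock(&memcg->move_lock);
		memcg->file_mapped += delta;
		pthread_mutex_unlock(&memcg->move_lock);
	} else {
		/* no move can start until our "grace period" below ends */
		memcg->file_mapped += delta;
	}
	pthread_rwlock_unlock(&memcg->grace);		/* ~ rcu_read_unlock() */
}

/* Slow path: begin moving pages out of this memcg. */
static void start_move(struct fake_memcg *memcg)
{
	atomic_fetch_add(&memcg->moving_account, 1);
	/* wait out updaters that read the counter before we raised it,
	 * playing the role of synchronize_rcu() */
	pthread_rwlock_wrlock(&memcg->grace);
	pthread_rwlock_unlock(&memcg->grace);
	/* every stat_update() from here on sees the counter and locks */
}

static void end_move(struct fake_memcg *memcg)
{
	atomic_fetch_sub(&memcg->moving_account, 1);
}

int main(void)
{
	struct fake_memcg m = {
		.moving_account = 0,
		.move_lock = PTHREAD_MUTEX_INITIALIZER,
		.grace = PTHREAD_RWLOCK_INITIALIZER,
	};

	stat_update(&m, 1);	/* fast, lockless path */
	start_move(&m);
	stat_update(&m, 1);	/* takes move_lock */
	end_move(&m);
	printf("file_mapped = %ld\n", m.file_mapped);
	return 0;
}

The payoff of the scheme is that the common case, with no move in flight, costs a single atomic read; the spinlock is paid only while mc.from/mc.to pages are actually being re-accounted.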
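The union of memsw with rcu_freeing and work_freeing lets struct mem_cgroup be freed after an RCU grace period without spending extra space: a kmalloc'ed instance goes through kfree_rcu(), while a vmalloc'ed one is handed to call_rcu() and from there to a workqueue, because vfree() cannot run at interrupt time. The sketch below shows only the shape of that pattern under assumed names (struct big_obj and the big_obj_* helpers are hypothetical), and it keys off is_vmalloc_addr() where the patch keys off sizeof(struct mem_cgroup) < PAGE_SIZE.

/* Sketch of "free by RCU, but vfree() only from process context". */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>

struct big_obj {
	/* payload ... */
	union {
		struct rcu_head rcu_freeing;     /* used during the RCU grace period */
		struct work_struct work_freeing; /* used only for the deferred vfree() */
	};
};

/* Runs in process context, so vfree() is allowed here. */
static void big_obj_vfree_work(struct work_struct *work)
{
	struct big_obj *obj = container_of(work, struct big_obj, work_freeing);

	vfree(obj);
}

/* RCU callback: may run from softirq context, so only queue the real free. */
static void big_obj_vfree_rcu(struct rcu_head *rcu)
{
	struct big_obj *obj = container_of(rcu, struct big_obj, rcu_freeing);

	INIT_WORK(&obj->work_freeing, big_obj_vfree_work);
	schedule_work(&obj->work_freeing);
}

static void big_obj_free(struct big_obj *obj)
{
	if (is_vmalloc_addr(obj))
		call_rcu(&obj->rcu_freeing, big_obj_vfree_rcu);
	else
		kfree_rcu(obj, rcu_freeing);
}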
