summaryrefslogtreecommitdiff
path: root/mm/mempolicy.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--mm/mempolicy.c104
1 files changed, 73 insertions, 31 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 87a177917cb2..63c19c24dfe1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -720,7 +720,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
- new_pol, vma->vm_userfaultfd_ctx);
+ new_pol, vma->vm_userfaultfd_ctx,
+ vma_get_anon_name(vma));
if (prev) {
vma = prev;
next = vma->vm_next;
@@ -818,7 +819,7 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
struct page *p;
int err;
- err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
+ err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, &p, NULL);
if (err >= 0) {
err = page_to_nid(p);
put_page(p);
@@ -894,11 +895,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
*policy |= (pol->flags & MPOL_MODE_FLAGS);
}
- if (vma) {
- up_read(&current->mm->mmap_sem);
- vma = NULL;
- }
-
err = 0;
if (nmask) {
if (mpol_store_user_nodemask(pol)) {
@@ -1237,6 +1233,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned long k;
+ unsigned long t;
unsigned long nlongs;
unsigned long endmask;
@@ -1253,13 +1250,19 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
else
endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
- /* When the user specified more nodes than supported just check
- if the non supported part is all zero. */
+ /*
+ * When the user specified more nodes than supported just check
+ * if the non supported part is all zero.
+ *
+ * If maxnode have more longs than MAX_NUMNODES, check
+ * the bits in that area first. And then go through to
+ * check the rest bits which equal or bigger than MAX_NUMNODES.
+ * Otherwise, just check bits [MAX_NUMNODES, maxnode).
+ */
if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
if (nlongs > PAGE_SIZE/sizeof(long))
return -EINVAL;
for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
- unsigned long t;
if (get_user(t, nmask + k))
return -EFAULT;
if (k == nlongs - 1) {
@@ -1272,6 +1275,16 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
endmask = ~0UL;
}
+ if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
+ unsigned long valid_mask = endmask;
+
+ valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
+ if (get_user(t, nmask + nlongs - 1))
+ return -EFAULT;
+ if (t & valid_mask)
+ return -EINVAL;
+ }
+
if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
return -EFAULT;
nodes_addr(*nodes)[nlongs-1] &= endmask;
@@ -1398,10 +1411,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
goto out_put;
}
- if (!nodes_subset(*new, node_states[N_MEMORY])) {
- err = -EINVAL;
+ task_nodes = cpuset_mems_allowed(current);
+ nodes_and(*new, *new, task_nodes);
+ if (nodes_empty(*new))
+ goto out_put;
+
+ nodes_and(*new, *new, node_states[N_MEMORY]);
+ if (nodes_empty(*new))
goto out_put;
- }
err = security_task_movememory(task);
if (err)
@@ -1492,7 +1509,6 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
compat_ulong_t, maxnode)
{
- long err = 0;
unsigned long __user *nm = NULL;
unsigned long nr_bits, alloc_size;
DECLARE_BITMAP(bm, MAX_NUMNODES);
@@ -1501,14 +1517,13 @@ COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask) {
- err = compat_get_bitmap(bm, nmask, nr_bits);
+ if (compat_get_bitmap(bm, nmask, nr_bits))
+ return -EFAULT;
nm = compat_alloc_user_space(alloc_size);
- err |= copy_to_user(nm, bm, alloc_size);
+ if (copy_to_user(nm, bm, alloc_size))
+ return -EFAULT;
}
- if (err)
- return -EFAULT;
-
return sys_set_mempolicy(mode, nm, nr_bits+1);
}
@@ -1516,7 +1531,6 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
compat_ulong_t, mode, compat_ulong_t __user *, nmask,
compat_ulong_t, maxnode, compat_ulong_t, flags)
{
- long err = 0;
unsigned long __user *nm = NULL;
unsigned long nr_bits, alloc_size;
nodemask_t bm;
@@ -1525,14 +1539,13 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask) {
- err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
+ if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
+ return -EFAULT;
nm = compat_alloc_user_space(alloc_size);
- err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
+ if (copy_to_user(nm, nodes_addr(bm), alloc_size))
+ return -EFAULT;
}
- if (err)
- return -EFAULT;
-
return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}
@@ -1998,16 +2011,44 @@ retry_cpuset:
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
mpol_cond_put(pol);
- page = __alloc_pages_node(hpage_node,
- gfp | __GFP_THISNODE, order);
+ /*
+ * We cannot invoke reclaim if __GFP_THISNODE
+ * is set. Invoking reclaim with
+ * __GFP_THISNODE set, would cause THP
+ * allocations to trigger heavy swapping
+ * despite there may be tons of free memory
+ * (including potentially plenty of THP
+ * already available in the buddy) on all the
+ * other NUMA nodes.
+ *
+ * At most we could invoke compaction when
+ * __GFP_THISNODE is set (but we would need to
+ * refrain from invoking reclaim even if
+ * compaction returned COMPACT_SKIPPED because
+ * there wasn't not enough memory to succeed
+ * compaction). For now just avoid
+ * __GFP_THISNODE instead of limiting the
+ * allocation path to a strict and single
+ * compaction invocation.
+ *
+ * Supposedly if direct reclaim was enabled by
+ * the caller, the app prefers THP regardless
+ * of the node it comes from so this would be
+ * more desiderable behavior than only
+ * providing THP originated from the local
+ * node in such case.
+ */
+ if (!(gfp & __GFP_DIRECT_RECLAIM))
+ gfp |= __GFP_THISNODE;
+ page = __alloc_pages_node(hpage_node, gfp, order);
goto out;
}
}
nmask = policy_nodemask(gfp, pol);
zl = policy_zonelist(gfp, pol, node);
- mpol_cond_put(pol);
page = __alloc_pages_nodemask(gfp, order, zl, nmask);
+ mpol_cond_put(pol);
out:
if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
@@ -2130,6 +2171,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_INTERLEAVE:
return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
+ /* a's ->flags is the same as b's */
+ if (a->flags & MPOL_F_LOCAL)
+ return true;
return a->v.preferred_node == b->v.preferred_node;
default:
BUG();
@@ -2522,9 +2566,7 @@ static void __init check_numabalancing_enable(void)
set_numabalancing_state(numabalancing_override == 1);
if (num_online_nodes() > 1 && !numabalancing_override) {
- pr_info("%s automatic NUMA balancing. "
- "Configure with numa_balancing= or the "
- "kernel.numa_balancing sysctl",
+ pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}