summaryrefslogtreecommitdiff
path: root/mm/rmap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/rmap.c')
-rw-r--r--mm/rmap.c120
1 files changed, 99 insertions, 21 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index b577fbb98d4b..488dda209431 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -587,19 +587,6 @@ vma_address(struct page *page, struct vm_area_struct *vma)
}
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-static void percpu_flush_tlb_batch_pages(void *data)
-{
- /*
- * All TLB entries are flushed on the assumption that it is
- * cheaper to flush all TLBs and let them be refilled than
- * flushing individual PFNs. Note that we do not track mm's
- * to flush as that might simply be multiple full TLB flushes
- * for no gain.
- */
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
- flush_tlb_local();
-}
-
/*
* Flush TLB entries for recently unmapped pages from remote CPUs. It is
* important if a PTE was dirty when it was unmapped that it's flushed
@@ -616,15 +603,14 @@ void try_to_unmap_flush(void)
cpu = get_cpu();
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
-
- if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
- percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
-
- if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
- smp_call_function_many(&tlb_ubc->cpumask,
- percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+ if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ local_flush_tlb();
+ trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
}
+
+ if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
+ flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
cpumask_clear(&tlb_ubc->cpumask);
tlb_ubc->flush_required = false;
tlb_ubc->writable = false;
@@ -649,6 +635,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
tlb_ubc->flush_required = true;
/*
+ * Ensure compiler does not re-order the setting of tlb_flush_batched
+ * before the PTE is cleared.
+ */
+ barrier();
+ mm->tlb_flush_batched = true;
+
+ /*
* If the PTE was dirty then it's best to assume it's writable. The
* caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
* before the page is queued for IO.
@@ -675,6 +668,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
return should_defer;
}
+
+/*
+ * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and mumap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+ if (mm->tlb_flush_batched) {
+ flush_tlb_mm(mm);
+
+ /*
+ * Do not allow the compiler to re-order the clearing of
+ * tlb_flush_batched before the tlb is flushed.
+ */
+ barrier();
+ mm->tlb_flush_batched = false;
+ }
+}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
struct page *page, bool writable)
@@ -1302,12 +1324,41 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pte_t pteval;
spinlock_t *ptl;
int ret = SWAP_AGAIN;
+ unsigned long sh_address;
+ bool pmd_sharing_possible = false;
+ unsigned long spmd_start, spmd_end;
enum ttu_flags flags = (enum ttu_flags)arg;
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
goto out;
+ /*
+ * Only use the range_start/end mmu notifiers if huge pmd sharing
+ * is possible. In the normal case, mmu_notifier_invalidate_page
+ * is sufficient as we only unmap a page. However, if we unshare
+ * a pmd, we will unmap a PUD_SIZE range.
+ */
+ if (PageHuge(page)) {
+ spmd_start = address;
+ spmd_end = spmd_start + vma_mmu_pagesize(vma);
+
+ /*
+ * Check if pmd sharing is possible. If possible, we could
+ * unmap a PUD_SIZE range. spmd_start/spmd_end will be
+ * modified if sharing is possible.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &spmd_start,
+ &spmd_end);
+ if (spmd_end - spmd_start != vma_mmu_pagesize(vma)) {
+ sh_address = address;
+
+ pmd_sharing_possible = true;
+ mmu_notifier_invalidate_range_start(vma->vm_mm,
+ spmd_start, spmd_end);
+ }
+ }
+
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@@ -1334,6 +1385,30 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
}
+ /*
+ * Call huge_pmd_unshare to potentially unshare a huge pmd. Pass
+ * sh_address as it will be modified if unsharing is successful.
+ */
+ if (PageHuge(page) && huge_pmd_unshare(mm, &sh_address, pte)) {
+ /*
+ * huge_pmd_unshare unmapped an entire PMD page. There is
+ * no way of knowing exactly which PMDs may be cached for
+ * this mm, so flush them all. spmd_start/spmd_end cover
+ * this PUD_SIZE range.
+ */
+ flush_cache_range(vma, spmd_start, spmd_end);
+ flush_tlb_range(vma, spmd_start, spmd_end);
+
+ /*
+ * The ref count of the PMD page was dropped which is part
+ * of the way map counting is done for shared PMDs. When
+ * there is no other sharing, huge_pmd_unshare returns false
+ * and we will unmap the actual page and drop map count
+ * to zero.
+ */
+ goto out_unmap;
+ }
+
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
if (should_defer_flush(mm, flags)) {
@@ -1428,6 +1503,9 @@ out_unmap:
if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK))
mmu_notifier_invalidate_page(mm, address);
out:
+ if (pmd_sharing_possible)
+ mmu_notifier_invalidate_range_end(vma->vm_mm,
+ spmd_start, spmd_end);
return ret;
}