1 files changed, 230 insertions, 84 deletions
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index cf1bb922d467..960071d258c3 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -1,160 +1,306 @@
-
-
+#include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/bug.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/mm.h>
-
 #include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+#include <asm/kaiser.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/desc.h>
 #ifdef CONFIG_KAISER
 
 __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes.  No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte().  We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
 
-/**
- * Get the real ppn from a address in kernel mapping.
- * @param address The virtual adrress
- * @return the physical address
+/*
+ * Returns -1 on error.
  */
-static inline unsigned long get_pa_from_mapping (unsigned long address)
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
 
-	pgd = pgd_offset_k(address);
-	BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
-
-	pud = pud_offset(pgd, address);
-	BUG_ON(pud_none(*pud));
+	pgd = pgd_offset_k(vaddr);
+	/*
+	 * We made all the kernel PGDs present in kaiser_init().
+	 * We expect them to stay that way.
+	 */
+	BUG_ON(pgd_none(*pgd));
+	/*
+	 * PGDs are either 512GB or 128TB on all x86_64
+	 * configurations.  We don't handle these.
+	 */
+	BUG_ON(pgd_large(*pgd));
 
-	if (pud_large(*pud)) {
-		return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
+	pud = pud_offset(pgd, vaddr);
+	if (pud_none(*pud)) {
+		WARN_ON_ONCE(1);
+		return -1;
 	}
 
-	pmd = pmd_offset(pud, address);
-	BUG_ON(pmd_none(*pmd));
+	if (pud_large(*pud))
+		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
 
-	if (pmd_large(*pmd)) {
-		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
+	pmd = pmd_offset(pud, vaddr);
+	if (pmd_none(*pmd)) {
+		WARN_ON_ONCE(1);
+		return -1;
 	}
 
-	pte = pte_offset_kernel(pmd, address);
-	BUG_ON(pte_none(*pte));
+	if (pmd_large(*pmd))
+		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
 
-	return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
+	pte = pte_offset_kernel(pmd, vaddr);
+	if (pte_none(*pte)) {
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+
+	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
 }
 
-void _kaiser_copy (unsigned long start_addr, unsigned long size,
-					unsigned long flags)
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
 {
-	pgd_t *pgd;
-	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte;
-	unsigned long address;
-	unsigned long end_addr = start_addr + size;
-	unsigned long target_address;
+	pud_t *pud;
+	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
 
-	for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1));
-			address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) {
-		target_address = get_pa_from_mapping(address);
+	might_sleep();
+	if (is_atomic) {
+		gfp &= ~GFP_KERNEL;
+		gfp |= __GFP_HIGH | __GFP_ATOMIC;
+	}
 
-		pgd = native_get_shadow_pgd(pgd_offset_k(address));
+	if (pgd_none(*pgd)) {
+		WARN_ONCE(1, "All shadow pgds should have been populated");
+		return NULL;
+	}
+	BUILD_BUG_ON(pgd_large(*pgd) != 0);
 
-		BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
-		BUG_ON(pgd_large(*pgd));
+	pud = pud_offset(pgd, address);
+	/* The shadow page tables do not use large mappings: */
+	if (pud_large(*pud)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pud_none(*pud)) {
+		unsigned long new_pmd_page = __get_free_page(gfp);
+		if (!new_pmd_page)
+			return NULL;
+		spin_lock(&shadow_table_allocation_lock);
+		if (pud_none(*pud))
+			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+		else
+			free_page(new_pmd_page);
+		spin_unlock(&shadow_table_allocation_lock);
+	}
 
-		pud = pud_offset(pgd, address);
-		if (pud_none(*pud)) {
-			set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address))));
-		}
-		BUG_ON(pud_large(*pud));
+	pmd = pmd_offset(pud, address);
+	/* The shadow page tables do not use large mappings: */
+	if (pmd_large(*pmd)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pmd_none(*pmd)) {
+		unsigned long new_pte_page = __get_free_page(gfp);
+		if (!new_pte_page)
+			return NULL;
+		spin_lock(&shadow_table_allocation_lock);
+		if (pmd_none(*pmd))
+			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+		else
+			free_page(new_pte_page);
+		spin_unlock(&shadow_table_allocation_lock);
+	}
 
-		pmd = pmd_offset(pud, address);
-		if (pmd_none(*pmd)) {
-			set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
-		}
-		BUG_ON(pmd_large(*pmd));
+	return pte_offset_kernel(pmd, address);
+}
 
-		pte = pte_offset_kernel(pmd, address);
+int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+			unsigned long flags)
+{
+	int ret = 0;
+	pte_t *pte;
+	unsigned long start_addr = (unsigned long )__start_addr;
+	unsigned long address = start_addr & PAGE_MASK;
+	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+	unsigned long target_address;
+
+	for (;address < end_addr; address += PAGE_SIZE) {
+		target_address = get_pa_from_mapping(address);
+		if (target_address == -1) {
+			ret = -EIO;
+			break;
+		}
+		pte = kaiser_pagetable_walk(address, false);
 		if (pte_none(*pte)) {
 			set_pte(pte, __pte(flags | target_address));
 		} else {
-			BUG_ON(__pa(pte_page(*pte)) != target_address);
+			pte_t tmp;
+			set_pte(&tmp, __pte(flags | target_address));
+			WARN_ON_ONCE(!pte_same(*pte, tmp));
 		}
 	}
+	return ret;
+}
+
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+	unsigned long size = end - start;
+
+	return kaiser_add_user_map(start, size, flags);
 }
 
-// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping
-static inline void __init _kaiser_init(void)
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated.  This ensures that all processes that get
+ * forked have the same entries.  This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
 {
 	pgd_t *pgd;
 	int i = 0;
 
 	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
 	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
-		set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0))));
+		pgd_t new_pgd;
+		pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE);
+		if (!pud) {
+			WARN_ON(1);
+			break;
+		}
+		new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
+		/*
+		 * Make sure not to stomp on some other pgd entry.
+		 */
+		if (!pgd_none(pgd[i])) {
+			WARN_ON(1);
+			continue;
+		}
+		set_pgd(pgd + i, new_pgd);
 	}
 }
 
+#define kaiser_add_user_map_early(start, size, flags) do {	\
+	int __ret = kaiser_add_user_map(start, size, flags);	\
+	WARN_ON(__ret);						\
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
+	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
+	WARN_ON(__ret);							\
+} while (0)
+
 extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
-spinlock_t shadow_table_lock;
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die.  But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it.  If we BUG_ON() here, we run
+ * the risk of being before we have good console output.
+ */
 void __init kaiser_init(void)
 {
 	int cpu;
-	spin_lock_init(&shadow_table_lock);
-
-	spin_lock(&shadow_table_lock);
 
-	_kaiser_init();
+	kaiser_init_all_pgds();
 
 	for_each_possible_cpu(cpu) {
-		// map the per cpu user variables
-		_kaiser_copy(
-				(unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
-				(unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
-				__PAGE_KERNEL);
+		void *percpu_vaddr = __per_cpu_user_mapped_start +
+				     per_cpu_offset(cpu);
+		unsigned long percpu_sz = __per_cpu_user_mapped_end -
+					  __per_cpu_user_mapped_start;
+		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+					  __PAGE_KERNEL);
 	}
 
-	// map the entry/exit text section, which is responsible to switch between user- and kernel mode
-	_kaiser_copy(
-			(unsigned long) __entry_text_start,
-			(unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
-			__PAGE_KERNEL_RX);
+	/*
+	 * Map the entry/exit text section, which is needed at
+	 * switches from user to and from kernel.
+	 */
+	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+				       __PAGE_KERNEL_RX);
 
-	// the fixed map address of the idt_table
-	_kaiser_copy(
-			(unsigned long) idt_descr.address,
-			sizeof(gate_desc) * NR_VECTORS,
-			__PAGE_KERNEL_RO);
-
-	spin_unlock(&shadow_table_lock);
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+				       __irqentry_text_end,
+				       __PAGE_KERNEL_RX);
+#endif
+	kaiser_add_user_map_early((void *)idt_descr.address,
+				  sizeof(gate_desc) * NR_VECTORS,
+				  __PAGE_KERNEL_RO);
+#ifdef CONFIG_TRACING
+	kaiser_add_user_map_early(&trace_idt_descr,
+				  sizeof(trace_idt_descr),
+				  __PAGE_KERNEL);
+	kaiser_add_user_map_early(&trace_idt_table,
+				  sizeof(gate_desc) * NR_VECTORS,
+				  __PAGE_KERNEL);
+#endif
+	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
+				  __PAGE_KERNEL);
+	kaiser_add_user_map_early(&debug_idt_table,
+				  sizeof(gate_desc) * NR_VECTORS,
+				  __PAGE_KERNEL);
 }
 
+extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end);
 // add a mapping to the shadow-mapping, and synchronize the mappings
-void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
 {
-	spin_lock(&shadow_table_lock);
-	_kaiser_copy(addr, size, flags);
-	spin_unlock(&shadow_table_lock);
+	return kaiser_add_user_map((const void *)addr, size, flags);
 }
 
-extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
 void kaiser_remove_mapping(unsigned long start, unsigned long size)
 {
-	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
-	spin_lock(&shadow_table_lock);
-	do {
-		unmap_pud_range(pgd, start, start + size);
-	} while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size)));
-	spin_unlock(&shadow_table_lock);
+	unsigned long end = start + size;
+	unsigned long addr;
+
+	for (addr = start; addr < end; addr += PGDIR_SIZE) {
+		pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
+		/*
+		 * unmap_p4d_range() handles > P4D_SIZE unmaps,
+		 * so no need to trim 'end'.
+		 */
+		unmap_pud_range_nofree(pgd, addr, end);
+	}
 }
 #endif /* CONFIG_KAISER */