From adbe236b953f4537f9e5ce86d1c7ace613dec38c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 6 Apr 2016 10:05:16 +0200 Subject: x86/mce: Avoid using object after free in genpool commit a3125494cff084b098c80bb36fbe2061ffed9d52 upstream. When we loop over all queued machine check error records to pass them to the registered notifiers we use llist_for_each_entry(). But the loop calls gen_pool_free() for the entry in the body of the loop - and then the iterator looks at node->next after the free. Use llist_for_each_entry_safe() instead. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Cc: Gong Chen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-edac Link: http://lkml.kernel.org/r/0205920@agluck-desk.sc.intel.com Link: http://lkml.kernel.org/r/1459929916-12852-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mcheck/mce-genpool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 0a850100c594..2658e2af74ec 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -29,7 +29,7 @@ static char gen_pool_buf[MCE_POOLSZ]; void mce_gen_pool_process(void) { struct llist_node *head; - struct mce_evt_llist *node; + struct mce_evt_llist *node, *tmp; struct mce *mce; head = llist_del_all(&mce_event_llist); @@ -37,7 +37,7 @@ void mce_gen_pool_process(void) return; head = llist_reverse_order(head); - llist_for_each_entry(node, head, llnode) { + llist_for_each_entry_safe(node, tmp, head, llnode) { mce = &node->mce; atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); -- cgit v1.2.3 From 1c8497d2035d95e4e26bdf6cec34150bfb972776 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 30 Mar 2016 12:24:47 -0700 Subject: kvm: x86: do not leak guest xcr0 into host interrupt handlers commit fc5b7f3bf1e1414bd4e91db6918c85ace0c873a5 upstream. An interrupt handler that uses the fpu can kill a KVM VM, if it runs under the following conditions: - the guest's xcr0 register is loaded on the cpu - the guest's fpu context is not loaded - the host is using eagerfpu Note that the guest's xcr0 register and fpu context are not loaded as part of the atomic world switch into "guest mode". They are loaded by KVM while the cpu is still in "host mode". Usage of the fpu in interrupt context is gated by irq_fpu_usable(). The interrupt handler will look something like this: if (irq_fpu_usable()) { kernel_fpu_begin(); [... code that uses the fpu ...] kernel_fpu_end(); } As long as the guest's fpu is not loaded and the host is using eager fpu, irq_fpu_usable() returns true (interrupted_kernel_fpu_idle() returns true). The interrupt handler proceeds to use the fpu with the guest's xcr0 live. kernel_fpu_begin() saves the current fpu context. If this uses XSAVE[OPT], it may leave the xsave area in an undesirable state. According to the SDM, during XSAVE bit i of XSTATE_BV is not modified if bit i is 0 in xcr0. So it's possible that XSTATE_BV[i] == 1 and xcr0[i] == 0 following an XSAVE. kernel_fpu_end() restores the fpu context. Now if any bit i in XSTATE_BV == 1 while xcr0[i] == 0, XRSTOR generates a #GP. The fault is trapped and SIGSEGV is delivered to the current process. Only pre-4.2 kernels appear to be vulnerable to this sequence of events. 
Commit 653f52c ("kvm,x86: load guest FPU context more eagerly") from 4.2 forces the guest's fpu to always be loaded on eagerfpu hosts. This patch fixes the bug by keeping the host's xcr0 loaded outside of the interrupts-disabled region where KVM switches into guest mode. Suggested-by: Andy Lutomirski Signed-off-by: David Matlack [Move load after goto cancel_injection. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/x86.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7eb4ebd3ebea..605cea75eb0d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -697,7 +697,6 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) return 1; } - kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) @@ -6495,8 +6494,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->prepare_guest_switch(vcpu); if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); - kvm_load_guest_xcr0(vcpu); - vcpu->mode = IN_GUEST_MODE; srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -6519,6 +6516,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto cancel_injection; } + kvm_load_guest_xcr0(vcpu); + if (req_immediate_exit) smp_send_reschedule(vcpu->cpu); @@ -6568,6 +6567,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + kvm_put_guest_xcr0(vcpu); + /* Interrupt is enabled by handle_external_intr() */ kvm_x86_ops->handle_external_intr(vcpu); @@ -7215,7 +7216,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) * and assume host would use all available bits. * Guest xcr0 would be loaded later. */ - kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; __kernel_fpu_begin(); __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state); @@ -7224,8 +7224,6 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - kvm_put_guest_xcr0(vcpu); - if (!vcpu->guest_fpu_loaded) { vcpu->fpu_counter = 0; return; -- cgit v1.2.3 From 4bb48b5f95a9e40451e259e295d03cd301740440 Mon Sep 17 00:00:00 2001 From: Lokesh Vutla Date: Tue, 8 Mar 2016 12:24:35 +0530 Subject: ARM: dts: AM43x-epos: Fix clk parent for synctimer commit cfe1580a6415bc37fd62d79eb8102a618f7650b2 upstream. commit 55ee7017ee31 ("arm: omap2: board-generic: use omap4_local_timer_init for AM437x") makes synctimer32k the clocksource on AM43xx. By default the synctimer32k is clocked by the 32K RTC OSC on AM43xx. But this 32K RTC OSC is not available on epos boards, which makes them fail to boot. Synctimer32k can also be clocked by a peripheral PLL, so make that the clock parent of synctimer32k on epos boards.
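For reference, the assigned-clocks/assigned-clock-parents properties used in the diff below are the declarative device-tree equivalent of reparenting a clock from C through the common clock framework. A minimal sketch of that runtime form, with illustrative lookup names (not part of this patch):

	#include <linux/clk.h>
	#include <linux/device.h>
	#include <linux/err.h>

	static int epos_reparent_synctimer(struct device *dev)
	{
		/* lookup names are illustrative; the patch expresses this in DT */
		struct clk *timer  = devm_clk_get(dev, "synctimer_32kclk");
		struct clk *parent = devm_clk_get(dev, "clkdiv32k_ick");

		if (IS_ERR(timer) || IS_ERR(parent))
			return -ENODEV;

		/* switch away from the 32K RTC OSC that epos boards do not have */
		return clk_set_parent(timer, parent);
	}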
Fixes: 55ee7017ee31 ("arm: omap2: board-generic: use omap4_local_timer_init for AM437x") Reported-by: Nishanth Menon Signed-off-by: Lokesh Vutla Signed-off-by: Tony Lindgren Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/am43x-epos-evm.dts | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/am43x-epos-evm.dts b/arch/arm/boot/dts/am43x-epos-evm.dts index 47954ed990f8..00707aac72fc 100644 --- a/arch/arm/boot/dts/am43x-epos-evm.dts +++ b/arch/arm/boot/dts/am43x-epos-evm.dts @@ -792,3 +792,8 @@ tx-num-evt = <32>; rx-num-evt = <32>; }; + +&synctimer_32kclk { + assigned-clocks = <&mux_synctimer32k_ck>; + assigned-clock-parents = <&clkdiv32k_ick>; +}; -- cgit v1.2.3 From c6cf3b71df047f61b568795fe926f107806eae82 Mon Sep 17 00:00:00 2001 From: Patrick Uiterwijk Date: Tue, 29 Mar 2016 16:57:40 +0000 Subject: ARM: mvebu: Correct unit address for linksys commit 199831c77c50e6913e893b6bc268ba9f4a9a2bf8 upstream. The USB2 port for Armada 38x is defined to be at 58000, not at 50000. Fixes: 2d0a7addbd10 ("ARM: Kirkwood: Add support for many Synology NAS devices") Signed-off-by: Patrick Uiterwijk Acked-by: Imre Kaloz Signed-off-by: Gregory CLEMENT Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/armada-385-linksys.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/armada-385-linksys.dtsi b/arch/arm/boot/dts/armada-385-linksys.dtsi index 3710755c6d76..85d2c377c332 100644 --- a/arch/arm/boot/dts/armada-385-linksys.dtsi +++ b/arch/arm/boot/dts/armada-385-linksys.dtsi @@ -117,7 +117,7 @@ }; /* USB part of the eSATA/USB 2.0 port */ - usb@50000 { + usb@58000 { status = "okay"; }; -- cgit v1.2.3 From 81b5ed00246258100df11c87f118a27f0ebceba3 Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Fri, 11 Mar 2016 10:12:28 -0600 Subject: ARM: OMAP2: Fix up interconnect barrier initialization for DRA7 commit 456e8d53482537616899a146b706eccd095404e6 upstream. The following commits: commit 3fa609755c11 ("ARM: omap2: restore OMAP4 barrier behaviour") commit f746929ffdc8 ("Revert "ARM: OMAP4: remove dead kconfig option OMAP4_ERRATA_I688"") and commit ea827ad5ffbb ("ARM: DRA7: Provide proper IO map table") came in around the same time; unfortunately, these seem to have missed initializing the barrier for DRA7 platforms - omap5_map_io was reused for dra7 until it was split out by the last patch. The barrier initialization hence needs to be carried forward, as it is as valid for the DRA7 family of processors as it is for OMAP5. Fixes: ea827ad5ffbb7 ("ARM: DRA7: Provide proper IO map table") Reported-by: Laurent Pinchart Reported-by: Tomi Valkeinen Cc: Russell King Signed-off-by: Nishanth Menon Reviewed-by: Laurent Pinchart Signed-off-by: Tony Lindgren Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-omap2/io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/mach-omap2/io.c b/arch/arm/mach-omap2/io.c index 3eaeaca5da05..3a911d8dea8b 100644 --- a/arch/arm/mach-omap2/io.c +++ b/arch/arm/mach-omap2/io.c @@ -368,6 +368,7 @@ void __init omap5_map_io(void) void __init dra7xx_map_io(void) { iotable_init(dra7xx_io_desc, ARRAY_SIZE(dra7xx_io_desc)); + omap_barriers_init(); } #endif /* -- cgit v1.2.3 From 882e790b572a7dadf5323e373d0139bfbc6dce15 Mon Sep 17 00:00:00 2001 From: Lokesh Vutla Date: Sat, 26 Mar 2016 23:08:55 -0600 Subject: ARM: OMAP2+: hwmod: Fix updating of sysconfig register commit 3ca4a238106dedc285193ee47f494a6584b6fd2f upstream.
Commit 127500ccb766f ("ARM: OMAP2+: Only write the sysconfig on idle when necessary") describes verifying the cached sysconfig value before updating it, but only in the idle path. The patch, however, added the verification in the enable path. Move the check to its proper place, matching the commit description. The check is not kept in the enable path because context may have been lost there; it is safe on idle, as the register's context is never lost while the device is active. Signed-off-by: Lokesh Vutla Acked-by: Tero Kristo Cc: Jon Hunter Fixes: commit 127500ccb766 "ARM: OMAP2+: Only write the sysconfig on idle when necessary" [paul@pwsan.com: appears to have been caused by my own mismerge of the originally posted patch] Signed-off-by: Paul Walmsley Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-omap2/omap_hwmod.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index 8e0bd5939e5a..147c90e70b2e 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -1416,9 +1416,7 @@ static void _enable_sysc(struct omap_hwmod *oh) (sf & SYSC_HAS_CLOCKACTIVITY)) _set_clockactivity(oh, oh->class->sysc->clockact, &v); - /* If the cached value is the same as the new value, skip the write */ - if (oh->_sysc_cache != v) - _write_sysconfig(v, oh); + _write_sysconfig(v, oh); /* * Set the autoidle bit only after setting the smartidle bit @@ -1481,7 +1479,9 @@ static void _idle_sysc(struct omap_hwmod *oh) _set_master_standbymode(oh, idlemode, &v); } - _write_sysconfig(v, oh); + /* If the cached value is the same as the new value, skip the write */ + if (oh->_sysc_cache != v) + _write_sysconfig(v, oh); } /** -- cgit v1.2.3 From bdb0618ad1b9ea6ec6926450c687d133ccddf28c Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 7 Jan 2016 16:07:20 +0000 Subject: arm64: Honour !PTE_WRITE in set_pte_at() for kernel mappings commit ac15bd63bbb24238f763ec5b24ee175ec301e8cd upstream. Currently, set_pte_at() only checks the software PTE_WRITE bit for user mappings when it sets or clears the hardware PTE_RDONLY accordingly. The kernel ptes are written directly without any modification, relying solely on the protection bits in macros like PAGE_KERNEL. However, modifying kernel pte attributes via pte_wrprotect() would be ignored by set_pte_at(). Since pte_wrprotect() does not set PTE_RDONLY (it only clears PTE_WRITE), the new permission is not taken into account. This patch changes set_pte_at() to adjust the read-only permission for kernel ptes as well. As a side effect, existing PROT_* definitions used for kernel ioremap*() need to include PTE_DIRTY | PTE_WRITE.
(additionally, white space fix for PTE_KERNEL_ROX) Acked-by: Andrey Ryabinin Tested-by: Ard Biesheuvel Signed-off-by: Catalin Marinas Reported-by: Ard Biesheuvel Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/pgtable.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index eaa9cabf4066..298474933ef3 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -69,11 +69,11 @@ extern void __pgd_error(const char *file, int line, unsigned long val); #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) -#define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) -#define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE)) -#define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_NC)) -#define PROT_NORMAL_WT (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_WT)) -#define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL)) +#define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) +#define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) +#define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC)) +#define PROT_NORMAL_WT (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_WT)) +#define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL)) #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) @@ -83,7 +83,7 @@ extern void __pgd_error(const char *file, int line, unsigned long val); #define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) #define PAGE_KERNEL_RO __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_RDONLY) -#define PAGE_KERNEL_ROX __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_RDONLY) +#define PAGE_KERNEL_ROX __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_RDONLY) #define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE) #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT) @@ -155,6 +155,7 @@ extern struct page *empty_zero_page; #define pte_write(pte) (!!(pte_val(pte) & PTE_WRITE)) #define pte_exec(pte) (!(pte_val(pte) & PTE_UXN)) #define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT)) +#define pte_user(pte) (!!(pte_val(pte) & PTE_USER)) #ifdef CONFIG_ARM64_HW_AFDBM #define pte_hw_dirty(pte) (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY)) @@ -165,8 +166,6 @@ extern struct page *empty_zero_page; #define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte)) #define pte_valid(pte) (!!(pte_val(pte) & PTE_VALID)) -#define pte_valid_user(pte) \ - ((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER)) == PTE_VALID) @@ -264,13 +263,13 @@ extern void __sync_icache_dcache(pte_t pteval, unsigned long addr); static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - if 
(pte_valid_user(pte)) { - if (!pte_special(pte) && pte_exec(pte)) - __sync_icache_dcache(pte, addr); + if (pte_valid(pte)) { if (pte_sw_dirty(pte) && pte_write(pte)) pte_val(pte) &= ~PTE_RDONLY; else pte_val(pte) |= PTE_RDONLY; + if (pte_user(pte) && pte_exec(pte) && !pte_special(pte)) + __sync_icache_dcache(pte, addr); } /* -- cgit v1.2.3 From 70d65587f0a82f50952cb29af133a5b6b8538611 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 9 Mar 2016 16:31:29 +0000 Subject: arm64: Update PTE_RDONLY in set_pte_at() for PROT_NONE permission commit fdc69e7df3cb24f18a93192641786e5b7ecd1dfe upstream. The set_pte_at() function must update the hardware PTE_RDONLY bit depending on the state of the PTE_WRITE and PTE_DIRTY bits of the given entry value. However, it currently only performs this for pte_valid() entries, ignoring PTE_PROT_NONE. The side-effect is that PROT_NONE mappings would not have the PTE_RDONLY bit set. Without CONFIG_ARM64_HW_AFDBM, this is not an issue since such PROT_NONE pages are not accessible anyway. With commit 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits"), the ptep_set_wrprotect() function was re-written to cope with automatic hardware updates of the dirty state. As an optimisation, only PTE_RDONLY is checked to assess the "dirty" status. Since set_pte_at() does not set this bit for PROT_NONE mappings, such pages may be considered "dirty" as a result of ptep_set_wrprotect(). This patch updates the pte_valid() check to pte_present() in set_pte_at(). It also adds PTE_PROT_NONE to the swap entry bits comment. Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Signed-off-by: Catalin Marinas Reported-by: Ganapatrao Kulkarni Tested-by: Ganapatrao Kulkarni Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/pgtable.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 298474933ef3..c63868ae9a4a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -263,7 +263,7 @@ extern void __sync_icache_dcache(pte_t pteval, unsigned long addr); static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - if (pte_valid(pte)) { + if (pte_present(pte)) { if (pte_sw_dirty(pte) && pte_write(pte)) pte_val(pte) &= ~PTE_RDONLY; else @@ -640,6 +640,7 @@ extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; * bits 0-1: present (must be zero) * bits 2-7: swap type * bits 8-57: swap offset + * bit 58: PTE_PROT_NONE (must be zero) */ #define __SWP_TYPE_SHIFT 2 #define __SWP_TYPE_BITS 6 -- cgit v1.2.3 From 27b3cc048a5275c53e26c15ffcab3fcf9a03cda0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 21 Apr 2016 00:27:04 -0600 Subject: x86/mm/xen: Suppress hugetlbfs in PV guests commit 103f6112f253017d7062cd74d17f4a514ed4485c upstream. Huge pages are not normally available to PV guests. Not suppressing hugetlbfs use results in an endless loop of page faults when user mode code tries to access a hugetlbfs mapped area (since the hypervisor denies such PTEs to be created, but error indications can't be propagated out of xen_set_pte_at(), just like for various of its siblings), and - once killed in an oops like this: kernel BUG at .../fs/hugetlbfs/inode.c:428! invalid opcode: 0000 [#1] SMP ... RIP: e030:[] [] remove_inode_hugepages+0x25b/0x320 ... 
Call Trace: [] hugetlbfs_evict_inode+0x15/0x40 [] evict+0xbd/0x1b0 [] __dentry_kill+0x19a/0x1f0 [] dput+0x1fe/0x220 [] __fput+0x155/0x200 [] task_work_run+0x60/0xa0 [] do_exit+0x160/0x400 [] do_group_exit+0x3b/0xa0 [] get_signal+0x1ed/0x470 [] do_signal+0x14/0x110 [] prepare_exit_to_usermode+0xe9/0xf0 [] retint_user+0x8/0x13 This is CVE-2016-3961 / XSA-174. Reported-by: Vitaly Kuznetsov Signed-off-by: Jan Beulich Cc: Andrew Morton Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Vrabel Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: xen-devel Link: http://lkml.kernel.org/r/57188ED802000078000E431C@prv-mh.provo.novell.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index f8a29d2c97b0..e6a8613fbfb0 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -4,6 +4,7 @@ #include #include +#define hugepages_supported() cpu_has_pse static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, -- cgit v1.2.3 From d22ac3a9403bc2a60662ec117dc83f72564d61f9 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 31 Mar 2016 11:48:31 +0200 Subject: s390/pci: add extra padding to function measurement block commit 9d89d9e61d361f3adb75e1aebe4bb367faf16cfa upstream. Newer machines might use a different (larger) format for function measurement blocks. To ensure that we comply with the alignment requirement on these machines and prevent memory corruption (when firmware writes more data than we expect) add 16 padding bytes at the end of the fmb. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/include/asm/pci.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 2b2ced9dc00a..6dafabb6ae1a 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -45,7 +45,8 @@ struct zpci_fmb { u64 rpcit_ops; u64 dma_rbytes; u64 dma_wbytes; -} __packed __aligned(64); + u64 pad[2]; +} __packed __aligned(128); enum zpci_state { ZPCI_FN_STATE_RESERVED, -- cgit v1.2.3 From 9a10dfc8bf95270073c6c2e2be3de26a652d730a Mon Sep 17 00:00:00 2001 From: Xiaodong Liu Date: Tue, 12 Apr 2016 09:45:51 +0000 Subject: crypto: sha1-mb - use correct pointer while completing jobs commit 0851561d9c965df086ef8a53f981f5f95a57c2c8 upstream. In sha_complete_job, an incorrect mcryptd_hash_request_ctx pointer is used when checking and completing other jobs. If the memory of the first completed req is freed while the function is still completing other jobs, the kernel will crash, since a NULL pointer is assigned to RIP.
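Schematically, the crash is a use-after-free through a stale loop variable; a simplified sketch of the pattern (sha_get_comp_job() stands in for the driver's real job-fetching logic):

	/* rctx points at the *first* completed request, whose completion
	 * callback may free it; req_ctx walks the remaining finished jobs */
	while (req_ctx) {
		req = cast_mcryptd_ctx_to_req(req_ctx);
		rctx->complete(&req->base, ret);	/* BUG: stale rctx; must be req_ctx->complete */
		req_ctx = sha_get_comp_job(cstate);	/* hypothetical next-job fetch */
	}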
Signed-off-by: Xiaodong Liu Acked-by: Tim Chen Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- arch/x86/crypto/sha-mb/sha1_mb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index a841e9765bd6..8381c09d2870 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -453,10 +453,10 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, req = cast_mcryptd_ctx_to_req(req_ctx); if (irqs_disabled()) - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); else { local_bh_disable(); - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); local_bh_enable(); } } -- cgit v1.2.3 From 53f3e26b3d09ae40318877b74e0d6c1af767a07f Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 15 Apr 2016 12:06:13 +1000 Subject: powerpc: scan_features() updates incorrect bits for REAL_LE commit 6997e57d693b07289694239e52a10d2f02c3a46f upstream. The REAL_LE feature entry in the ibm_pa_feature struct is missing an MMU feature value, meaning all the remaining elements initialise the wrong values. This means instead of checking for byte 5, bit 0, we check for byte 0, bit 0, and then we incorrectly set the CPU feature bit as well as MMU feature bit 1 and CPU user feature bits 0 and 2 (5). Checking byte 0 bit 0 (IBM numbering), means we're looking at the "Memory Management Unit (MMU)" feature - ie. does the CPU have an MMU. In practice that bit is set on all platforms which have the property. This means we set CPU_FTR_REAL_LE always. In practice that seems not to matter because all the modern cpus which have this property also implement REAL_LE, and we've never needed to disable it. We're also incorrectly setting MMU feature bit 1, which is: #define MMU_FTR_TYPE_8xx 0x00000002 Luckily the only place that looks for MMU_FTR_TYPE_8xx is in Book3E code, which can't run on the same cpus as scan_features(). So this also doesn't matter in practice. Finally in the CPU user feature mask, we're setting bits 0 and 2. Bit 2 is not currently used, and bit 0 is: #define PPC_FEATURE_PPC_LE 0x00000001 Which says the CPU supports the old style "PPC Little Endian" mode. Again this should be harmless in practice as no 64-bit CPUs implement that mode. Fix the code by adding the missing initialisation of the MMU feature. Also add a comment marking CPU user feature bit 2 (0x4) as reserved. It would be unsafe to start using it as old kernels incorrectly set it. 
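The underlying mistake is a positional struct initializer with one field left out, which shifts every later value one slot to the left. Using the ibm_pa_feature fields shown in the diff below:

	/* {cpu_features, mmu_features, cpu_user_ftrs, pabyte, pabit, invert} */
	{CPU_FTR_REAL_LE, 0, PPC_FEATURE_TRUE_LE, 5, 0, 0},	/* intended: probe byte 5, bit 0 */
	{CPU_FTR_REAL_LE,    PPC_FEATURE_TRUE_LE, 5, 0, 0},	/* buggy: mmu_features = 0x2, cpu_user_ftrs = 5, pabyte = 0, pabit = 0 */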
Fixes: 44ae3ab3358e ("powerpc: Free up some CPU feature bits by moving out MMU-related features") Signed-off-by: Anton Blanchard [mpe: Flesh out changelog, add comment reserving 0x4] Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/uapi/asm/cputable.h | 1 + arch/powerpc/kernel/prom.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/include/uapi/asm/cputable.h b/arch/powerpc/include/uapi/asm/cputable.h index 43686043e297..2734c005da21 100644 --- a/arch/powerpc/include/uapi/asm/cputable.h +++ b/arch/powerpc/include/uapi/asm/cputable.h @@ -31,6 +31,7 @@ #define PPC_FEATURE_PSERIES_PERFMON_COMPAT \ 0x00000040 +/* Reserved - do not use 0x00000004 */ #define PPC_FEATURE_TRUE_LE 0x00000002 #define PPC_FEATURE_PPC_LE 0x00000001 diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 7030b035905d..080c96b44a7f 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -158,7 +158,7 @@ static struct ibm_pa_feature { {CPU_FTR_NOEXECUTE, 0, 0, 0, 6, 0}, {CPU_FTR_NODSISRALIGN, 0, 0, 1, 1, 1}, {0, MMU_FTR_CI_LARGE_PAGE, 0, 1, 2, 0}, - {CPU_FTR_REAL_LE, PPC_FEATURE_TRUE_LE, 5, 0, 0}, + {CPU_FTR_REAL_LE, 0, PPC_FEATURE_TRUE_LE, 5, 0, 0}, /* * If the kernel doesn't support TM (ie. CONFIG_PPC_TRANSACTIONAL_MEM=n), * we don't want to turn on CPU_FTR_TM here, so we use CPU_FTR_TM_COMP -- cgit v1.2.3 From 08c9b94505bbe09ed42f658de2a4dbe274fa7468 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 15 Apr 2016 12:07:24 +1000 Subject: powerpc: Update cpu_user_features2 in scan_features() commit beff82374b259d726e2625ec6c518a5f2613f0ae upstream. scan_features() updates cpu_user_features but not cpu_user_features2. Amongst other things, cpu_user_features2 contains the user TM feature bits which we must keep in sync with the kernel TM feature bit. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/prom.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 080c96b44a7f..03fce77e441d 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -148,23 +148,24 @@ static struct ibm_pa_feature { unsigned long cpu_features; /* CPU_FTR_xxx bit */ unsigned long mmu_features; /* MMU_FTR_xxx bit */ unsigned int cpu_user_ftrs; /* PPC_FEATURE_xxx bit */ + unsigned int cpu_user_ftrs2; /* PPC_FEATURE2_xxx bit */ unsigned char pabyte; /* byte number in ibm,pa-features */ unsigned char pabit; /* bit number (big-endian) */ unsigned char invert; /* if 1, pa bit set => clear feature */ } ibm_pa_features[] __initdata = { - {0, 0, PPC_FEATURE_HAS_MMU, 0, 0, 0}, - {0, 0, PPC_FEATURE_HAS_FPU, 0, 1, 0}, - {CPU_FTR_CTRL, 0, 0, 0, 3, 0}, - {CPU_FTR_NOEXECUTE, 0, 0, 0, 6, 0}, - {CPU_FTR_NODSISRALIGN, 0, 0, 1, 1, 1}, - {0, MMU_FTR_CI_LARGE_PAGE, 0, 1, 2, 0}, - {CPU_FTR_REAL_LE, 0, PPC_FEATURE_TRUE_LE, 5, 0, 0}, + {0, 0, PPC_FEATURE_HAS_MMU, 0, 0, 0, 0}, + {0, 0, PPC_FEATURE_HAS_FPU, 0, 0, 1, 0}, + {CPU_FTR_CTRL, 0, 0, 0, 0, 3, 0}, + {CPU_FTR_NOEXECUTE, 0, 0, 0, 0, 6, 0}, + {CPU_FTR_NODSISRALIGN, 0, 0, 0, 1, 1, 1}, + {0, MMU_FTR_CI_LARGE_PAGE, 0, 0, 1, 2, 0}, + {CPU_FTR_REAL_LE, 0, PPC_FEATURE_TRUE_LE, 0, 5, 0, 0}, /* * If the kernel doesn't support TM (ie. CONFIG_PPC_TRANSACTIONAL_MEM=n), * we don't want to turn on CPU_FTR_TM here, so we use CPU_FTR_TM_COMP * which is 0 if the kernel doesn't support TM. 
*/ - {CPU_FTR_TM_COMP, 0, 0, 22, 0, 0}, + {CPU_FTR_TM_COMP, 0, 0, 0, 22, 0, 0}, }; static void __init scan_features(unsigned long node, const unsigned char *ftrs, @@ -195,10 +196,12 @@ static void __init scan_features(unsigned long node, const unsigned char *ftrs, if (bit ^ fp->invert) { cur_cpu_spec->cpu_features |= fp->cpu_features; cur_cpu_spec->cpu_user_features |= fp->cpu_user_ftrs; + cur_cpu_spec->cpu_user_features2 |= fp->cpu_user_ftrs2; cur_cpu_spec->mmu_features |= fp->mmu_features; } else { cur_cpu_spec->cpu_features &= ~fp->cpu_features; cur_cpu_spec->cpu_user_features &= ~fp->cpu_user_ftrs; + cur_cpu_spec->cpu_user_features2 &= ~fp->cpu_user_ftrs2; cur_cpu_spec->mmu_features &= ~fp->mmu_features; } } -- cgit v1.2.3 From c89c3225062d64c63532c127c374ea962f336e6b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 15 Apr 2016 12:08:19 +1000 Subject: powerpc: Update TM user feature bits in scan_features() commit 4705e02498d6d5a7ab98dfee9595cd5e91db2017 upstream. We need to update the user TM feature bits (PPC_FEATURE2_HTM and PPC_FEATURE2_HTM) to mirror what we do with the kernel TM feature bit. At the moment, if firmware reports TM is not available we turn off the kernel TM feature bit but leave the userspace ones on. Userspace thinks it can execute TM instructions and it dies trying. This (together with a QEMU patch) fixes PR KVM, which doesn't currently support TM. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/prom.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 03fce77e441d..a15fe1d4e84a 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -161,11 +161,12 @@ static struct ibm_pa_feature { {0, MMU_FTR_CI_LARGE_PAGE, 0, 0, 1, 2, 0}, {CPU_FTR_REAL_LE, 0, PPC_FEATURE_TRUE_LE, 0, 5, 0, 0}, /* - * If the kernel doesn't support TM (ie. CONFIG_PPC_TRANSACTIONAL_MEM=n), - * we don't want to turn on CPU_FTR_TM here, so we use CPU_FTR_TM_COMP - * which is 0 if the kernel doesn't support TM. + * If the kernel doesn't support TM (ie CONFIG_PPC_TRANSACTIONAL_MEM=n), + * we don't want to turn on TM here, so we use the *_COMP versions + * which are 0 if the kernel doesn't support TM. */ - {CPU_FTR_TM_COMP, 0, 0, 0, 22, 0, 0}, + {CPU_FTR_TM_COMP, 0, 0, + PPC_FEATURE2_HTM_COMP|PPC_FEATURE2_HTM_NOSC_COMP, 22, 0, 0}, }; static void __init scan_features(unsigned long node, const unsigned char *ftrs, -- cgit v1.2.3 From 01d5ccd341290e771ac6b94b08c220df6f81a630 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Apr 2016 14:22:32 -0600 Subject: x86/apic: Handle zero vector gracefully in clear_vector_irq() commit 1bdb8970392a68489b469c3a330a1adb5ef61beb upstream. If x86_vector_alloc_irq() fails x86_vector_free_irqs() is invoked to cleanup the already allocated vectors. This subsequently calls clear_vector_irq(). The failed irq has no vector assigned, which triggers the BUG_ON(!vector) in clear_vector_irq(). We cannot suppress the call to x86_vector_free_irqs() for the failed interrupt, because the other data related to this irq must be cleaned up as well. So calling clear_vector_irq() with vector == 0 is legitimate. 
Remove the BUG_ON and return if the vector is zero. [ tglx: Massaged changelog ] Fixes: b5dc8e6c21e7 "x86/irq: Use hierarchical irqdomain to manage CPU interrupt vectors" Signed-off-by: Keith Busch Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/apic/vector.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 7af2505f20c2..df6b4eeac0bd 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -254,7 +254,8 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data) struct irq_desc *desc; int cpu, vector; - BUG_ON(!data->cfg.vector); + if (!data->cfg.vector) + return; vector = data->cfg.vector; for_each_cpu_and(cpu, data->domain, cpu_online_mask) -- cgit v1.2.3 From 8481fdf6dc13e3a2b3f7e75e414b5eab3771329d Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Thu, 3 Mar 2016 02:03:11 +0100 Subject: x86/mm/kmmio: Fix mmiotrace for hugepages commit cfa52c0cfa4d727aa3e457bf29aeff296c528a08 upstream. Because Linux might use bigger pages than the 4K pages to handle those mmio ioremaps, the kmmio code shouldn't rely on the page id as it currently does. Using the memory address instead of the page id lets us look up how big the page is and what its base address is, so that we won't get a page fault within the same page twice anymore. Tested-by: Pierre Moreau Signed-off-by: Karol Herbst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Cc: linux-x86_64@vger.kernel.org Cc: nouveau@lists.freedesktop.org Cc: pq@iki.fi Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1456966991-6861-1-git-send-email-nouveau@karolherbst.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kmmio.c | 88 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 637ab34ed632..ddb2244b06a1 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -33,7 +33,7 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; - unsigned long page; /* location of the fault page */ + unsigned long addr; /* the requested address */ pteval_t old_presence; /* page presence prior to arming */ bool armed; @@ -70,9 +70,16 @@ unsigned int kmmio_count; static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); -static struct list_head *kmmio_page_list(unsigned long page) +static struct list_head *kmmio_page_list(unsigned long addr) { - return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + + if (!pte) + return NULL; + addr &= page_level_mask(l); + + return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)]; } /* Accessed per-cpu */ @@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) } /* You must be holding RCU read lock. 
*/ -static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) { struct list_head *head; struct kmmio_fault_page *f; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); - page &= PAGE_MASK; - head = kmmio_page_list(page); + if (!pte) + return NULL; + addr &= page_level_mask(l); + head = kmmio_page_list(addr); list_for_each_entry_rcu(f, head, list) { - if (f->page == page) + if (f->addr == addr) return f; } return NULL; @@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) static int clear_page_presence(struct kmmio_fault_page *f, bool clear) { unsigned int level; - pte_t *pte = lookup_address(f->page, &level); + pte_t *pte = lookup_address(f->addr, &level); if (!pte) { - pr_err("no pte for page 0x%08lx\n", f->page); + pr_err("no pte for addr 0x%08lx\n", f->addr); return -1; } @@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) return -1; } - __flush_tlb_one(f->page); + __flush_tlb_one(f->addr); return 0; } @@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) int ret; WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, !!f->old_presence); + pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); - WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"), - f->page); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), + f->addr); f->armed = true; return ret; } @@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { int ret = clear_page_presence(f, false); WARN_ONCE(ret < 0, - KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); + KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr); f->armed = false; } @@ -215,6 +226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) struct kmmio_context *ctx; struct kmmio_fault_page *faultpage; int ret = 0; /* default to fault not handled */ + unsigned long page_base = addr; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + if (!pte) + return -EINVAL; + page_base &= page_level_mask(l); /* * Preemption is now disabled to prevent process switch during @@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) preempt_disable(); rcu_read_lock(); - faultpage = get_kmmio_fault_page(addr); + faultpage = get_kmmio_fault_page(page_base); if (!faultpage) { /* * Either this page fault is not caused by kmmio, or @@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { - if (addr == ctx->addr) { + if (page_base == ctx->addr) { /* * A second fault on the same page means some other * condition needs handling by do_page_fault(), the @@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx->active++; ctx->fpage = faultpage; - ctx->probe = get_kmmio_probe(addr); + ctx->probe = get_kmmio_probe(page_base); ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - ctx->addr = addr; + ctx->addr = page_base; if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); @@ -354,12 +371,11 @@ out: } /* You must be holding kmmio_lock. 
*/ -static int add_kmmio_fault_page(unsigned long page) +static int add_kmmio_fault_page(unsigned long addr) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (f) { if (!f->count) arm_kmmio_fault_page(f); @@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page) return -1; f->count = 1; - f->page = page; + f->addr = addr; if (arm_kmmio_fault_page(f)) { kfree(f); return -1; } - list_add_rcu(&f->list, kmmio_page_list(f->page)); + list_add_rcu(&f->list, kmmio_page_list(f->addr)); return 0; } /* You must be holding kmmio_lock. */ -static void release_kmmio_fault_page(unsigned long page, +static void release_kmmio_fault_page(unsigned long addr, struct kmmio_fault_page **release_list) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (!f) return; @@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p) int ret = 0; unsigned long size = 0; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + unsigned int l; + pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { ret = -EEXIST; goto out; } + + pte = lookup_address(p->addr, &l); + if (!pte) { + ret = -EINVAL; + goto out; + } + kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { if (add_kmmio_fault_page(p->addr + size)) pr_err("Unable to set page fault.\n"); - size += PAGE_SIZE; + size += page_level_size(l); } out: spin_unlock_irqrestore(&kmmio_lock, flags); @@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p) const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; + unsigned int l; + pte_t *pte; + + pte = lookup_address(p->addr, &l); + if (!pte) + return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { release_kmmio_fault_page(p->addr + size, &release_list); - size += PAGE_SIZE; + size += page_level_size(l); } list_del_rcu(&p->list); kmmio_count--; -- cgit v1.2.3 From 40cab474b47b2fc87911812687e83f8cd21aea1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Fri, 19 Feb 2016 10:35:39 -0800 Subject: ARM: OMAP3: Add cpuidle parameters table for omap3430 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 98f42221501353067251fbf11e732707dbb68ce3 upstream. Based on CPU type choose generic omap3 or omap3430 specific cpuidle parameters. Parameters for omap3430 were measured on Nokia N900 device and added by commit 5a1b1d3a9efa ("OMAP3: RX-51: Pass cpu idle parameters") which were later removed by commit 231900afba52 ("ARM: OMAP3: cpuidle - remove rx51 cpuidle parameters table") due to huge code complexity. This patch brings cpuidle parameters for omap3430 devices again, but uses simple condition based on CPU type. 
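Each exit_latency in the table below is the sum of the measured sleep and wakeup latencies in microseconds, and target_residency is the minimum idle period for which entering the state pays off. A cpuidle governor consumes these fields roughly as follows (a conceptual sketch, not the actual menu-governor code):

	#include <linux/cpuidle.h>

	/* a C-state is usable when the predicted idle time is long enough and
	 * the wakeup cost fits within the current PM QoS latency requirement */
	static bool state_usable(const struct cpuidle_state *s,
				 unsigned int predicted_us,
				 unsigned int latency_req_us)
	{
		return predicted_us >= s->target_residency &&
		       s->exit_latency <= latency_req_us;
	}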
Fixes: 231900afba52 ("ARM: OMAP3: cpuidle - remove rx51 cpuidle parameters table") Signed-off-by: Pali Rohár Acked-by: Daniel Lezcano Signed-off-by: Tony Lindgren Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-omap2/cpuidle34xx.c | 69 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c index aa7b379e2661..2a3db0bd9e15 100644 --- a/arch/arm/mach-omap2/cpuidle34xx.c +++ b/arch/arm/mach-omap2/cpuidle34xx.c @@ -34,6 +34,7 @@ #include "pm.h" #include "control.h" #include "common.h" +#include "soc.h" /* Mach specific information to be recorded in the C-state driver_data */ struct omap3_idle_statedata { @@ -315,6 +316,69 @@ static struct cpuidle_driver omap3_idle_driver = { .safe_state_index = 0, }; +/* + * Numbers based on measurements made in October 2009 for PM optimized kernel + * with CPU freq enabled on device Nokia N900. Assumes OPP2 (main idle OPP, + * and worst case latencies). + */ +static struct cpuidle_driver omap3430_idle_driver = { + .name = "omap3430_idle", + .owner = THIS_MODULE, + .states = { + { + .enter = omap3_enter_idle_bm, + .exit_latency = 110 + 162, + .target_residency = 5, + .name = "C1", + .desc = "MPU ON + CORE ON", + }, + { + .enter = omap3_enter_idle_bm, + .exit_latency = 106 + 180, + .target_residency = 309, + .name = "C2", + .desc = "MPU ON + CORE ON", + }, + { + .enter = omap3_enter_idle_bm, + .exit_latency = 107 + 410, + .target_residency = 46057, + .name = "C3", + .desc = "MPU RET + CORE ON", + }, + { + .enter = omap3_enter_idle_bm, + .exit_latency = 121 + 3374, + .target_residency = 46057, + .name = "C4", + .desc = "MPU OFF + CORE ON", + }, + { + .enter = omap3_enter_idle_bm, + .exit_latency = 855 + 1146, + .target_residency = 46057, + .name = "C5", + .desc = "MPU RET + CORE RET", + }, + { + .enter = omap3_enter_idle_bm, + .exit_latency = 7580 + 4134, + .target_residency = 484329, + .name = "C6", + .desc = "MPU OFF + CORE RET", + }, + { + .enter = omap3_enter_idle_bm, + .exit_latency = 7505 + 15274, + .target_residency = 484329, + .name = "C7", + .desc = "MPU OFF + CORE OFF", + }, + }, + .state_count = ARRAY_SIZE(omap3_idle_data), + .safe_state_index = 0, +}; + /* Public functions */ /** @@ -333,5 +397,8 @@ int __init omap3_idle_init(void) if (!mpu_pd || !core_pd || !per_pd || !cam_pd) return -ENODEV; - return cpuidle_register(&omap3_idle_driver, NULL); + if (cpu_is_omap3430()) + return cpuidle_register(&omap3430_idle_driver, NULL); + else + return cpuidle_register(&omap3_idle_driver, NULL); } -- cgit v1.2.3 From 159c52e15f95712dd22aa5d64b17a79a7fd8f939 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 28 Nov 2015 23:56:47 +0100 Subject: ARM: prima2: always enable reset controller commit ef2b1d777d643af227a22309d8b79898b90b123c upstream. The atlas7 clock controller driver registers a reset controller for itself, which causes a link error when the subsystem is disabled: drivers/built-in.o: In function `atlas7_clk_init': drivers/clk/sirf/clk-atlas7.c:1681: undefined reference to `reset_controller_register' As the clk driver does not have a Kconfig symbol for itself but it always built-in when the platform is enabled, we have to ensure that the reset controller subsystem is also built-in in this case. 
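The unconditional dependency exists because the clock driver registers a reset controller at boot; in outline (a reduced sketch with illustrative names, not the driver's exact code):

	#include <linux/init.h>
	#include <linux/reset-controller.h>

	static const struct reset_control_ops demo_rst_ops; /* assert/deassert callbacks elided */

	static struct reset_controller_dev demo_rst_ctlr = {
		.ops		= &demo_rst_ops,
		.nr_resets	= 1,
	};

	static int __init demo_rst_init(void)
	{
		/* undefined symbol at link time when CONFIG_RESET_CONTROLLER=n */
		return reset_controller_register(&demo_rst_ctlr);
	}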
Signed-off-by: Arnd Bergmann Acked-by: Philipp Zabel Fixes: 301c5d29402e ("clk: sirf: add CSR atlas7 clk and reset support") Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-prima2/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/mach-prima2/Kconfig b/arch/arm/mach-prima2/Kconfig index 9ab8932403e5..56e55fd37d13 100644 --- a/arch/arm/mach-prima2/Kconfig +++ b/arch/arm/mach-prima2/Kconfig @@ -1,6 +1,7 @@ menuconfig ARCH_SIRF bool "CSR SiRF" if ARCH_MULTI_V7 select ARCH_HAS_RESET_CONTROLLER + select RESET_CONTROLLER select ARCH_REQUIRE_GPIOLIB select GENERIC_IRQ_CHIP select NO_IOPORT_MAP -- cgit v1.2.3 From abc48d066b7b5063db56f4a81e367c84b9582882 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 29 Jan 2016 15:50:38 +0100 Subject: ARM: EXYNOS: select THERMAL_OF commit dc7eb9d589e595954792cc192bcbb92932e5c2ff upstream. We cannot select a symbol that has disabled dependencies, so we get a warning if we ever enable EXYNOS_THERMAL without also turning on THERMAL_OF: warning: (ARCH_EXYNOS) selects EXYNOS_THERMAL which has unmet direct dependencies (THERMAL && (ARCH_EXYNOS || COMPILE_TEST) && THERMAL_OF) This adds another 'select' in the platform code to avoid that case. Alternatively, we could decide to not select EXYNOS_THERMAL here and instead make it a user option. Signed-off-by: Arnd Bergmann Fixes: f87e6bd3f740 ("thermal: exynos: Add the dependency of CONFIG_THERMAL_OF instead of CONFIG_OF") Signed-off-by: Krzysztof Kozlowski Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-exynos/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/mach-exynos/Kconfig b/arch/arm/mach-exynos/Kconfig index 3a10f1a8317a..bfd8bb371477 100644 --- a/arch/arm/mach-exynos/Kconfig +++ b/arch/arm/mach-exynos/Kconfig @@ -26,6 +26,7 @@ menuconfig ARCH_EXYNOS select S5P_DEV_MFC select SRAM select THERMAL + select THERMAL_OF select MFD_SYSCON help Support for SAMSUNG EXYNOS SoCs (EXYNOS4/5) -- cgit v1.2.3 From ea075ae7f00c6416b12d68abf29b6a57a15b3916 Mon Sep 17 00:00:00 2001 From: Lior Amsalem Date: Wed, 10 Feb 2016 17:29:15 +0100 Subject: ARM: dts: armada-375: use armada-370-sata for SATA commit b3a7f31eb7375633cd6a742f19488fc5a4208b36 upstream. The Armada 375 has the same SATA IP as Armada 370 and Armada XP, which requires the PHY speed to be set in the LP_PHY_CTL register for SATA hotplug to work. Therefore, this commit updates the compatible string used to describe the SATA IP in Armada 375 from marvell,orion-sata to marvell,armada-370-sata. 
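The compatible string matters because the SATA driver keys its PHY handling off its of_device_id table; schematically (simplified, assuming the sata_mv driver's matching, which binds both strings):

	#include <linux/mod_devicetable.h>

	static const struct of_device_id mv_sata_dt_ids[] = {
		{ .compatible = "marvell,armada-370-sata" },	/* PHY speed set via LP_PHY_CTL */
		{ .compatible = "marvell,orion-sata" },		/* no LP_PHY_CTL programming */
		{ /* sentinel */ }
	};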
Fixes: 4de59085091f753d08c8429d756b46756ab94665 ("ARM: mvebu: add Device Tree description of the Armada 375 SoC") Signed-off-by: Lior Amsalem Signed-off-by: Thomas Petazzoni Reviewed-by: Andrew Lunn Signed-off-by: Gregory CLEMENT Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/armada-375.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/armada-375.dtsi b/arch/arm/boot/dts/armada-375.dtsi index 7ccce7529b0c..cc952cf8ec30 100644 --- a/arch/arm/boot/dts/armada-375.dtsi +++ b/arch/arm/boot/dts/armada-375.dtsi @@ -529,7 +529,7 @@ }; sata@a0000 { - compatible = "marvell,orion-sata"; + compatible = "marvell,armada-370-sata"; reg = <0xa0000 0x5000>; interrupts = ; clocks = <&gateclk 14>, <&gateclk 20>; -- cgit v1.2.3 From eb7f1c5fb5c8e888ca8b728e17e71426ea809590 Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Sat, 13 Feb 2016 00:49:20 +0100 Subject: ARM: dts: pxa: fix dma engine node to pxa3xx-nand commit 07c6b2d01d351f0512ed7145625265e435ab3240 upstream. Since the switch from mmp_pdma to pxa_dma driver for pxa architectures, the pxa_dma requires 2 arguments, namely the requestor line and the requested priority. Fix the only left device node which was still passing only one argument, making the pxa3xx-nand driver misbehave in a device-tree configuration, ie. failing all data transfers. Fixes: c943646d1f49 ("ARM: dts: pxa: add dma engine node to pxa3xx-nand") Signed-off-by: Robert Jarzmik Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/pxa3xx.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/pxa3xx.dtsi b/arch/arm/boot/dts/pxa3xx.dtsi index cf6998a0804d..564341af7e97 100644 --- a/arch/arm/boot/dts/pxa3xx.dtsi +++ b/arch/arm/boot/dts/pxa3xx.dtsi @@ -30,7 +30,7 @@ reg = <0x43100000 90>; interrupts = <45>; clocks = <&clks CLK_NAND>; - dmas = <&pdma 97>; + dmas = <&pdma 97 3>; dma-names = "data"; #address-cells = <1>; #size-cells = <1>; -- cgit v1.2.3 From cc798dcca00a8b71e5c09a38d3921461f931c85f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Nov 2015 13:26:19 +0000 Subject: arm64: mm: detect bad __create_mapping uses If a caller of __create_mapping provides a PA and VA which have different sub-page offsets, it is not clear which offset they expect to apply to the mapping, and is indicative of a bad caller. In some cases, the region we wish to map may validly have a sub-page offset in the physical and virtual addresses. For example, EFI runtime regions have 4K granularity, yet may be mapped by a 64K page kernel. So long as the physical and virtual offsets are the same, the region will be mapped at the expected VAs. Disallow calls with differing sub-page offsets, and WARN when they are encountered, so that we can detect and fix such cases. 
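The check added below reduces to one XOR: any bit in which the physical and virtual sub-page offsets disagree survives phys ^ virt and is isolated by masking with ~PAGE_MASK (the offset bits). A worked example with 4K pages and illustrative addresses:

	/* matching 0x200 offsets: (phys ^ virt) & 0xfff == 0, so the mapping proceeds */
	phys = 0x80001200UL;	virt = 0xffff000000001200UL;

	/* mismatched 0x200 vs 0x800: (phys ^ virt) & 0xfff == 0xa00, WARN_ON fires and we bail */
	phys = 0x80001200UL;	virt = 0xffff000000001800UL;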
Cc: Laura Abbott Acked-by: Ard Biesheuvel Acked-by: Catalin Marinas Reviewed-by: Steve Capper Signed-off-by: Mark Rutland Signed-off-by: Will Deacon (cherry picked from commit cc5d2b3b95cdbb3fed4e38e667d17b9ac7250f7a) Signed-off-by: Alex Shi --- arch/arm64/mm/mmu.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 116ad654dd59..61a82d68330d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -251,6 +251,13 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, { unsigned long addr, length, end, next; + /* + * If the virtual and physical address don't have the same offset + * within a page, we cannot map the region as the caller expects. + */ + if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) + return; + addr = virt & PAGE_MASK; length = PAGE_ALIGN(size + (virt & ~PAGE_MASK)); -- cgit v1.2.3 From 6329e5d3b74540578dbbf4550fdc53c52c706c94 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Nov 2015 13:26:20 +0000 Subject: arm64: mm: allow sections for unaligned bases Callees of __create_mapping may decide to create section mappings if sufficient low bits of the physical and virtual addresses they were passed are zero. While __create_mapping rounds the virtual base address down, it does not similarly round the physical base address down, and hence non-zero bits in the physical address can prevent use of a section mapping, even where a whole next-level table would be used instead. Round down the physical base address in __create_mapping to enable all callees to always create section mappings when such a mapping is possible. Cc: Laura Abbott Acked-by: Ard Biesheuvel Acked-by: Catalin Marinas Reviewed-by: Steve Capper Signed-off-by: Mark Rutland Signed-off-by: Will Deacon (cherry picked from commit 9c4e08a3022b6df90d31ef4007291faabfce5431) Signed-off-by: Alex Shi --- arch/arm64/mm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 61a82d68330d..d2a6194f4bec 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -258,6 +258,7 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) return; + phys &= PAGE_MASK; addr = virt & PAGE_MASK; length = PAGE_ALIGN(size + (virt & ~PAGE_MASK)); -- cgit v1.2.3 From 6e8ef09edf4a9e61a04fad753ffeebaecc60b568 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Oct 2015 18:56:19 +0000 Subject: arm64: pgtable: implement pte_accessible() This patch implements the pte_accessible() macro, which can be used to test whether or not a given pte is a candidate for allocation in the TLB. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon (cherry picked from commit 76c714be0e5e60c935a53b31be58939510ba1d0f) Signed-off-by: Alex Shi --- arch/arm64/include/asm/pgtable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c63868ae9a4a..cd5dfc97268e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -168,6 +168,16 @@ extern struct page *empty_zero_page; #define pte_valid(pte) (!!(pte_val(pte) & PTE_VALID)) #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER)) == PTE_VALID) +#define pte_valid_young(pte) \ + ((pte_val(pte) & (PTE_VALID | PTE_AF)) == (PTE_VALID | PTE_AF)) + +/* + * Could the pte be present in the TLB? 
We must check mm_tlb_flush_pending + * so that we don't erroneously return false for pages that have been + * remapped as PROT_NONE but are yet to be flushed from the TLB. + */ +#define pte_accessible(mm, pte) \ + (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid_young(pte)) static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) { -- cgit v1.2.3 From 1a9cc42c0a812241f6cd679a19aefba2900437a7 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 20 Nov 2015 17:59:10 +0800 Subject: arm64: add __init/__initdata section marker to some functions/variables These functions/variables are not needed after booting, so mark them as __init or __initdata. Signed-off-by: Jisheng Zhang Signed-off-by: Will Deacon (cherry picked from commit a7c61a3452d39078919f0e1f493ff966fb64f0db) Signed-off-by: Alex Shi --- arch/arm64/kernel/armv8_deprecated.c | 6 +++--- arch/arm64/kernel/cpufeature.c | 9 +++++---- arch/arm64/kernel/fpsimd.c | 2 +- arch/arm64/mm/dma-mapping.c | 4 ++-- arch/arm64/mm/init.c | 6 +++--- 5 files changed, 14 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 937f5e58a4d3..3e01207917b1 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -62,7 +62,7 @@ struct insn_emulation { }; static LIST_HEAD(insn_emulation); -static int nr_insn_emulated; +static int nr_insn_emulated __initdata; static DEFINE_RAW_SPINLOCK(insn_emulation_lock); static void register_emulation_hooks(struct insn_emulation_ops *ops) @@ -173,7 +173,7 @@ static int update_insn_emulation_mode(struct insn_emulation *insn, return ret; } -static void register_insn_emulation(struct insn_emulation_ops *ops) +static void __init register_insn_emulation(struct insn_emulation_ops *ops) { unsigned long flags; struct insn_emulation *insn; @@ -237,7 +237,7 @@ static struct ctl_table ctl_abi[] = { { } }; -static void register_insn_emulation_sysctl(struct ctl_table *table) +static void __init register_insn_emulation_sysctl(struct ctl_table *table) { unsigned long flags; int i = 0; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 0669c63281ea..5c90aa490a2b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -684,7 +684,7 @@ static const struct arm64_cpu_capabilities arm64_hwcaps[] = { {}, }; -static void cap_set_hwcap(const struct arm64_cpu_capabilities *cap) +static void __init cap_set_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case CAP_HWCAP: @@ -729,7 +729,7 @@ static bool __maybe_unused cpus_have_hwcap(const struct arm64_cpu_capabilities * return rc; } -static void setup_cpu_hwcaps(void) +static void __init setup_cpu_hwcaps(void) { int i; const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps; @@ -758,7 +758,8 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, * Run through the enabled capabilities and enable() it on all active * CPUs */ -static void enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) +static void __init +enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) { int i; @@ -897,7 +898,7 @@ static inline void set_sys_caps_initialised(void) #endif /* CONFIG_HOTPLUG_CPU */ -static void setup_feature_capabilities(void) +static void __init setup_feature_capabilities(void) { update_cpu_capabilities(arm64_features, "detected feature:"); enable_cpu_capabilities(arm64_features); diff --git a/arch/arm64/kernel/fpsimd.c 
b/arch/arm64/kernel/fpsimd.c index 4c46c54a3ad7..acc1afd5c749 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -289,7 +289,7 @@ static struct notifier_block fpsimd_cpu_pm_notifier_block = { .notifier_call = fpsimd_cpu_pm_notifier, }; -static void fpsimd_pm_init(void) +static void __init fpsimd_pm_init(void) { cpu_pm_register_notifier(&fpsimd_cpu_pm_notifier_block); } diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 354144e33218..a6e757cbab77 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -40,7 +40,7 @@ static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot, static struct gen_pool *atomic_pool; #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K -static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE; +static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE; static int __init early_coherent_pool(char *p) { @@ -896,7 +896,7 @@ static int __iommu_attach_notifier(struct notifier_block *nb, return 0; } -static int register_iommu_dma_ops_notifier(struct bus_type *bus) +static int __init register_iommu_dma_ops_notifier(struct bus_type *bus) { struct notifier_block *nb = kzalloc(sizeof(*nb), GFP_KERNEL); int ret; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 4cb98aa8c27b..10fab52eed95 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -71,7 +71,7 @@ early_param("initrd", early_initrd); * currently assumes that for memory starting above 4G, 32-bit devices will * use a DMA offset. */ -static phys_addr_t max_zone_dma_phys(void) +static phys_addr_t __init max_zone_dma_phys(void) { phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, 32); return min(offset + (1ULL << 32), memblock_end_of_DRAM()); @@ -126,11 +126,11 @@ EXPORT_SYMBOL(pfn_valid); #endif #ifndef CONFIG_SPARSEMEM -static void arm64_memory_present(void) +static void __init arm64_memory_present(void) { } #else -static void arm64_memory_present(void) +static void __init arm64_memory_present(void) { struct memblock_region *reg; -- cgit v1.2.3 From 2e310797997214e0cc606013ac308167b6b72dc0 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 2 Dec 2015 14:00:10 +0000 Subject: arm64: fix COMPAT_SHMLBA definition for large pages ARM glibc uses (4 * __getpagesize()) for SHMLBA, which is correct for 4KB pages and works fine for 64KB pages, but the kernel uses a hardcoded 16KB that is too small for 64KB page based kernels. This changes the definition to what user space sees when using 64KB pages. Acked-by: Arnd Bergmann Signed-off-by: Yury Norov Signed-off-by: Will Deacon (cherry picked from commit b9b7aebb42d1b1392f3111de61136bb6cf3aae3f) Signed-off-by: Alex Shi --- arch/arm64/include/asm/shmparam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/shmparam.h b/arch/arm64/include/asm/shmparam.h index 4df608a8459e..e368a55ebd22 100644 --- a/arch/arm64/include/asm/shmparam.h +++ b/arch/arm64/include/asm/shmparam.h @@ -21,7 +21,7 @@ * alignment value. Since we don't have aliasing D-caches, the rest of * the time we can safely use PAGE_SIZE. 
*/ -#define COMPAT_SHMLBA 0x4000 +#define COMPAT_SHMLBA (4 * PAGE_SIZE) #include <asm-generic/shmparam.h> -- cgit v1.2.3 From 4ea9dd702768f3d1c3ab346f30fdce8b8d1a8ef9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 23 Nov 2015 15:12:59 +0000 Subject: arm64: enable HAVE_IRQ_TIME_ACCOUNTING arm64 relies on the arm_arch_timer for sched_clock, so we can select HAVE_IRQ_TIME_ACCOUNTING and have the core sched-clock code enable the feature at runtime based on the rate. Reported-by: Mario Smarduch Signed-off-by: Will Deacon (cherry picked from commit 24da208db32ee1e4757ceaba898c47add8e5361e) Signed-off-by: Alex Shi --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 871f21783866..4876459c0838 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -70,6 +70,7 @@ config ARM64 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_GENERIC_DMA_COHERENT select HAVE_HW_BREAKPOINT if PERF_EVENTS + select HAVE_IRQ_TIME_ACCOUNTING select HAVE_MEMBLOCK select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS -- cgit v1.2.3 From 0348dff2c49b25c7b5702ef887f03177f4b0c0fd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 19 Nov 2015 17:48:31 +0000 Subject: arm64: spinlock: serialise spin_unlock_wait against concurrent lockers Boqun Feng reported a rather nasty ordering issue with spin_unlock_wait on architectures implementing spin_lock with LL/SC sequences and acquire semantics: | CPU 1 CPU 2 CPU 3 | ================== ==================== ============== | spin_unlock(&lock); | spin_lock(&lock): | r1 = *lock; // r1 == 0; | o = READ_ONCE(object); // reordered here | object = NULL; | smp_mb(); | spin_unlock_wait(&lock); | *lock = 1; | smp_mb(); | o->dead = true; | if (o) // true | BUG_ON(o->dead); // true!! The crux of the problem is that spin_unlock_wait(&lock) can return on CPU 1 whilst CPU 2 is in the process of taking the lock. This can be resolved by upgrading spin_unlock_wait to a LOCK operation, forcing it to serialise against a concurrent locker and giving it acquire semantics in the process (although it is not at all clear whether this is needed - different callers seem to assume different things about the barrier semantics and architectures are similarly disjoint in their implementations of the macro). This patch implements spin_unlock_wait using an LL/SC sequence with acquire semantics on arm64. For v8.1 systems with the LSE atomics, the exclusive writeback is omitted, since the spin_lock operation is indivisible and no intermediate state can be observed. Signed-off-by: Will Deacon (cherry picked from commit d86b8da04dfa4771a68bdbad6c424d40f22f0d14) Signed-off-by: Alex Shi --- arch/arm64/include/asm/spinlock.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index c85e96d174a5..fc9682bfe002 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -26,9 +26,28 @@ * The memory barriers are implicit with the load-acquire and store-release * instructions.
*/ +static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) +{ + unsigned int tmp; + arch_spinlock_t lockval; -#define arch_spin_unlock_wait(lock) \ - do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0) + asm volatile( +" sevl\n" +"1: wfe\n" +"2: ldaxr %w0, %2\n" +" eor %w1, %w0, %w0, ror #16\n" +" cbnz %w1, 1b\n" + ARM64_LSE_ATOMIC_INSN( + /* LL/SC */ +" stxr %w1, %w0, %2\n" +" cbnz %w1, 2b\n", /* Serialise against any concurrent lockers */ + /* LSE atomics */ +" nop\n" +" nop\n") + : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) + : + : "memory"); +} #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -- cgit v1.2.3 From 2ef8b1f56c1989157c8ef929f4f89bd6a3ac7950 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Fri, 4 Dec 2015 11:38:39 +0800 Subject: arm64: ftrace: stop using kstop_machine to enable/disable tracing For ftrace on arm64, kstop_machine, which is hugely disruptive to a running system, is not needed to convert nops to ftrace calls or back. The instructions to be modified (NOP, B or BL) are all safe for "concurrent modification and execution of instructions": they can be executed by one thread of execution as they are being modified by another thread of execution, without requiring explicit synchronization. Signed-off-by: Li Bin Reviewed-by: Steven Rostedt Signed-off-by: Will Deacon (cherry picked from commit 81a6a146e88eca5d6726569779778d61489d85aa) Signed-off-by: Alex Shi --- arch/arm64/kernel/ftrace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index c851be795080..9669b331a23b 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -93,6 +93,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, return ftrace_modify_code(pc, old, new, true); } +void arch_ftrace_update_code(int command) +{ + ftrace_modify_all_code(command); +} + int __init ftrace_dyn_arch_init(void) { return 0; -- cgit v1.2.3 From 42c1d121864549d11fb0f002df8dc8ef35219107 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Fri, 4 Dec 2015 11:38:40 +0800 Subject: arm64: ftrace: fix the comments for ftrace_modify_code There is no need to worry about module or __init text disappearing, because ftrace has a module notifier that is called when a module is being unloaded, before its text goes away: it grabs the ftrace_lock mutex and removes the module's functions from the ftrace list, so that no further modifications are made to that module's text. The updates that make functions traced or untraced are done under the ftrace_lock mutex as well. And __init section code should not be modified by ftrace in the first place, because it is blacklisted in recordmcount.c and ignored by ftrace. Suggested-by: Steven Rostedt Signed-off-by: Li Bin Signed-off-by: Will Deacon (cherry picked from commit 004ab584e028093996cf5b8e220b8bc50c5111cf) Signed-off-by: Alex Shi --- arch/arm64/kernel/ftrace.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 9669b331a23b..8f7005bc35bd 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -29,12 +29,11 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, /* * Note: - * Due to modules and __init, code can disappear and change, - * we need to protect against faulting as well as code changing. - * We do this by aarch64_insn_*() which use the probe_kernel_*().
- * - * No lock is held here because all the modifications are run - * through stop_machine(). + * We are paranoid about modifying text, as if a bug were to happen, it + * could cause us to read or write to someplace that could cause harm. + * Carefully read and modify the code with aarch64_insn_*() which uses + * probe_kernel_*(), and make sure what we read is what we expected it + * to be before modifying it. */ if (validate) { if (aarch64_insn_read((void *)pc, &replaced)) -- cgit v1.2.3 From a5b499e62f4070c7bfe2d322516a937e6f48d04f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 4 Dec 2015 12:42:29 +0000 Subject: arm64: Add trace_hardirqs_off annotation in ret_to_user When a kernel is built with CONFIG_TRACE_IRQFLAGS the following warning is produced when entering userspace for the first time: WARNING: at /work/Linux/linux-2.6-aarch64/kernel/locking/lockdep.c:3519 Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc3+ #639 Hardware name: Juno (DT) task: ffffffc9768a0000 ti: ffffffc9768a8000 task.ti: ffffffc9768a8000 PC is at check_flags.part.22+0x19c/0x1a8 LR is at check_flags.part.22+0x19c/0x1a8 pc : [] lr : [] pstate: 600001c5 sp : ffffffc9768abe10 x29: ffffffc9768abe10 x28: ffffffc9768a8000 x27: 0000000000000000 x26: 0000000000000001 x25: 00000000000000a6 x24: ffffffc00064be6c x23: ffffffc0009f249e x22: ffffffc9768a0000 x21: ffffffc97fea5480 x20: 00000000000001c0 x19: ffffffc00169a000 x18: 0000005558cc7b58 x17: 0000007fb78e3180 x16: 0000005558d2e238 x15: ffffffffffffffff x14: 0ffffffffffffffd x13: 0000000000000008 x12: 0101010101010101 x11: 7f7f7f7f7f7f7f7f x10: fefefefefefeff63 x9 : 7f7f7f7f7f7f7f7f x8 : 6e655f7371726964 x7 : 0000000000000001 x6 : ffffffc0001079c4 x5 : 0000000000000000 x4 : 0000000000000001 x3 : ffffffc001698438 x2 : 0000000000000000 x1 : ffffffc9768a0000 x0 : 000000000000002e Call trace: [] check_flags.part.22+0x19c/0x1a8 [] lock_is_held+0x80/0x98 [] __schedule+0x404/0x730 [] schedule+0x44/0xb8 [] ret_to_user+0x0/0x24 possible reason: unannotated irqs-off. irq event stamp: 502169 hardirqs last enabled at (502169): [] el0_irq_naked+0x1c/0x24 hardirqs last disabled at (502167): [] __do_softirq+0x17c/0x298 softirqs last enabled at (502168): [] __do_softirq+0x1fc/0x298 softirqs last disabled at (502143): [] irq_exit+0xa0/0xf0 This happens because we disable interrupts in ret_to_user before calling schedule() in work_resched. This patch adds the necessary trace_hardirqs_off annotation. Signed-off-by: Catalin Marinas Reported-by: Mark Rutland Cc: Will Deacon Signed-off-by: Will Deacon (cherry picked from commit db3899a6477a4dccd26cbfb7f408b6be2cc068e0) Signed-off-by: Alex Shi --- arch/arm64/kernel/entry.S | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7ed3d75f6304..e5b25389c48f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -634,6 +634,9 @@ work_pending: bl do_notify_resume b ret_to_user work_resched: +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off // the IRQs are off here, inform the tracing code +#endif bl schedule /* -- cgit v1.2.3 From c0f49bdceea7d2f0cfe1c6f857e4e1cfa919382a Mon Sep 17 00:00:00 2001 From: Jungseok Lee Date: Fri, 4 Dec 2015 11:02:25 +0000 Subject: arm64: Store struct thread_info in sp_el0 There is need for figuring out how to manage struct thread_info data when IRQ stack is introduced. 
struct thread_info information should be copied to IRQ stack under the current thread_info calculation logic whenever context switching is invoked. This is too expensive to keep supporting the approach. Instead, this patch pays attention to sp_el0 which is an unused scratch register in EL1 context. sp_el0 utilization not only simplifies the management, but also prevents text section size from being increased largely due to static allocated IRQ stack as removing masking operation using THREAD_SIZE in many places. Reviewed-by: Catalin Marinas Signed-off-by: Jungseok Lee Signed-off-by: James Morse Signed-off-by: Will Deacon (cherry picked from commit 6cdf9c7ca687e01840d0215437620a20263012fc) Signed-off-by: Alex Shi --- arch/arm64/include/asm/thread_info.h | 10 ++++++++-- arch/arm64/kernel/entry.S | 15 ++++++++++++--- arch/arm64/kernel/head.S | 5 +++++ arch/arm64/kernel/sleep.S | 3 +++ 4 files changed, 28 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 90c7ff233735..abd64bd1f6d9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -73,10 +73,16 @@ register unsigned long current_stack_pointer asm ("sp"); */ static inline struct thread_info *current_thread_info(void) __attribute_const__; +/* + * struct thread_info can be accessed directly via sp_el0. + */ static inline struct thread_info *current_thread_info(void) { - return (struct thread_info *) - (current_stack_pointer & ~(THREAD_SIZE - 1)); + unsigned long sp_el0; + + asm ("mrs %0, sp_el0" : "=r" (sp_el0)); + + return (struct thread_info *)sp_el0; } #define thread_saved_pc(tsk) \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e5b25389c48f..245fa6837880 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -88,7 +88,8 @@ .if \el == 0 mrs x21, sp_el0 - get_thread_info tsk // Ensure MDSCR_EL1.SS is clear, + mov tsk, sp + and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug disable_step_tsk x19, x20 // exceptions when scheduling. .else @@ -107,6 +108,13 @@ str x21, [sp, #S_SYSCALLNO] .endif + /* + * Set sp_el0 to current thread_info. 
+ */ + .if \el == 0 + msr sp_el0, tsk + .endif + /* * Registers that may be useful after this macro is invoked: * @@ -164,8 +172,7 @@ alternative_endif .endm .macro get_thread_info, rd - mov \rd, sp - and \rd, \rd, #~(THREAD_SIZE - 1) // top of stack + mrs \rd, sp_el0 .endm /* @@ -599,6 +606,8 @@ ENTRY(cpu_switch_to) ldp x29, x9, [x8], #16 ldr lr, [x8] mov sp, x9 + and x9, x9, #~(THREAD_SIZE - 1) + msr sp_el0, x9 ret ENDPROC(cpu_switch_to) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index b685257926f0..17ce7285bb12 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -424,6 +424,9 @@ __mmap_switched: b 1b 2: adr_l sp, initial_sp, x4 + mov x4, sp + and x4, x4, #~(THREAD_SIZE - 1) + msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer str_l x24, memstart_addr, x6 // Save PHYS_OFFSET mov x29, #0 @@ -611,6 +614,8 @@ ENDPROC(secondary_startup) ENTRY(__secondary_switched) ldr x0, [x21] // get secondary_data.stack mov sp, x0 + and x0, x0, #~(THREAD_SIZE - 1) + msr sp_el0, x0 // save thread_info mov x29, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index f586f7c875e2..e33fe33876ab 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -173,6 +173,9 @@ ENTRY(cpu_resume) /* load physical address of identity map page table in x1 */ adrp x1, idmap_pg_dir mov sp, x2 + /* save thread_info */ + and x2, x2, #~(THREAD_SIZE - 1) + msr sp_el0, x2 /* * cpu_do_resume expects x0 to contain context physical address * pointer and x1 to contain physical address of 1:1 page tables -- cgit v1.2.3 From 2f478c1abcc07611671c5fee4bd10974dde476ab Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 4 Dec 2015 11:02:26 +0000 Subject: arm64: Modify stack trace and dump for use with irq_stack This patch allows unwind_frame() to traverse from interrupt stack to task stack correctly. It requires data from a dummy stack frame, created during irq_stack_entry(), added by a later patch. A similar approach is taken to modify dump_backtrace(), which expects to find struct pt_regs underneath any call to functions marked __exception. When on an irq_stack, the struct pt_regs is stored on the old task stack, the location of which is stored in the dummy stack frame. Reviewed-by: Catalin Marinas Signed-off-by: AKASHI Takahiro [james.morse: merged two patches, reworked for per_cpu irq_stacks, and no alignment guarantees, added irq_stack definitions] Signed-off-by: James Morse Signed-off-by: Will Deacon (cherry picked from commit 132cd887b5c54758d04bf25c52fa48f45e843a30) Signed-off-by: Alex Shi --- arch/arm64/include/asm/irq.h | 32 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/irq.c | 3 +++ arch/arm64/kernel/stacktrace.c | 29 +++++++++++++++++++++++++++-- arch/arm64/kernel/traps.c | 14 +++++++++++++- 4 files changed, 75 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 8e8d30684392..e2f3f135a3bc 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -1,10 +1,32 @@ #ifndef __ASM_IRQ_H #define __ASM_IRQ_H +#define IRQ_STACK_SIZE THREAD_SIZE +#define IRQ_STACK_START_SP THREAD_START_SP + +#ifndef __ASSEMBLER__ + +#include + #include +#include struct pt_regs; +DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); + +/* + * The highest address on the stack, and the first to be used. 
Used to + * find the dummy-stack frame put down by el?_irq() in entry.S. + */ +#define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) + +/* + * The offset from irq_stack_ptr where entry.S will store the original + * stack pointer. Used by unwind_frame() and dump_backtrace(). + */ +#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x10)); + extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); static inline int nr_legacy_irqs(void) @@ -12,4 +34,14 @@ static inline int nr_legacy_irqs(void) return 0; } +static inline bool on_irq_stack(unsigned long sp, int cpu) +{ + /* variable names the same as kernel/stacktrace.c */ + unsigned long low = (unsigned long)per_cpu(irq_stack, cpu); + unsigned long high = low + IRQ_STACK_START_SP; + + return (low <= sp && sp <= high); +} + +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 9f17ec071ee0..1e3cef578e21 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -30,6 +30,9 @@ unsigned long irq_err_count; +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned */ +DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); + int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index ccb6078ed9f2..b947eeffa5b2 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -20,6 +20,7 @@ #include #include +#include #include /* @@ -39,17 +40,41 @@ int notrace unwind_frame(struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; + unsigned long irq_stack_ptr; + + /* + * Use raw_smp_processor_id() to avoid false-positives from + * CONFIG_DEBUG_PREEMPT. get_wchan() calls unwind_frame() on sleeping + * task stacks, we can be pre-empted in this case, so + * {raw_,}smp_processor_id() may give us the wrong value. Sleeping + * tasks can't ever be on an interrupt stack, so regardless of cpu, + * the checks will always fail. + */ + irq_stack_ptr = IRQ_STACK_PTR(raw_smp_processor_id()); low = frame->sp; - high = ALIGN(low, THREAD_SIZE); + /* irq stacks are not THREAD_SIZE aligned */ + if (on_irq_stack(frame->sp, raw_smp_processor_id())) + high = irq_stack_ptr; + else + high = ALIGN(low, THREAD_SIZE) - 0x20; - if (fp < low || fp > high - 0x18 || fp & 0xf) + if (fp < low || fp > high || fp & 0xf) return -EINVAL; frame->sp = fp + 0x10; frame->fp = *(unsigned long *)(fp); frame->pc = *(unsigned long *)(fp + 8); + /* + * Check whether we are going to walk through from interrupt stack + * to task stack. + * If we reach the end of the stack - and its an interrupt stack, + * read the original task stack pointer from the dummy frame. 
+ */ + if (frame->sp == irq_stack_ptr) + frame->sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + return 0; } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index e9b9b5364393..8a0084541f84 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -146,6 +146,7 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; + unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); @@ -180,9 +181,20 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (ret < 0) break; stack = frame.sp; - if (in_exception_text(where)) + if (in_exception_text(where)) { + /* + * If we switched to the irq_stack before calling this + * exception handler, then the pt_regs will be on the + * task stack. The easiest way to tell is if the large + * pt_regs would overlap with the end of the irq_stack. + */ + if (stack < irq_stack_ptr && + (stack + sizeof(struct pt_regs)) > irq_stack_ptr) + stack = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + dump_mem("", "Exception stack", stack, stack + sizeof(struct pt_regs), false); + } } } -- cgit v1.2.3 From ea288f7a80b63d6956d23f50dd04fa70f8e7368f Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 4 Dec 2015 11:02:27 +0000 Subject: arm64: Add do_softirq_own_stack() and enable irq_stacks entry.S is modified to switch to the per_cpu irq_stack during el{0,1}_irq. irq_count is used to detect recursive interrupts on the irq_stack, it is updated late by do_softirq_own_stack(), when called on the irq_stack, before __do_softirq() re-enables interrupts to process softirqs. do_softirq_own_stack() is added by this patch, but does not yet switch stack. This patch adds the dummy stack frame and data needed by the previous stack tracing patches. Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Will Deacon (cherry picked from commit 8e23dacd12a48e58125b84c817da50850b73280a) Signed-off-by: Alex Shi --- arch/arm64/include/asm/irq.h | 2 ++ arch/arm64/kernel/entry.S | 42 ++++++++++++++++++++++++++++++++++++++++-- arch/arm64/kernel/irq.c | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 79 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index e2f3f135a3bc..fa2a8d0e4792 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -11,6 +11,8 @@ #include #include +#define __ARCH_HAS_DO_SOFTIRQ + struct pt_regs; DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 245fa6837880..8f7e737949fe 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -175,6 +176,42 @@ alternative_endif mrs \rd, sp_el0 .endm + .macro irq_stack_entry, dummy_lr + mov x19, sp // preserve the original sp + + adr_l x25, irq_stack + mrs x26, tpidr_el1 + add x25, x25, x26 + + /* + * Check the lowest address on irq_stack for the irq_count value, + * incremented by do_softirq_own_stack if we have re-enabled irqs + * while on the irq_stack. + */ + ldr x26, [x25] + cbnz x26, 9998f // recursive use? + + /* switch to the irq stack */ + mov x26, #IRQ_STACK_START_SP + add x26, x25, x26 + mov sp, x26 + + /* Add a dummy stack frame */ + stp x29, \dummy_lr, [sp, #-16]! 
// dummy stack frame + mov x29, sp + stp xzr, x19, [sp, #-16]! + +9998: + .endm + + /* + * x19 should be preserved between irq_stack_entry and + * irq_stack_exit. + */ + .macro irq_stack_exit + mov sp, x19 + .endm + /* * These are the registers used in the syscall handler, and allow us to * have in theory up to 7 arguments to a function - x0 to x6. @@ -190,10 +227,11 @@ tsk .req x28 // current thread_info * Interrupt handling. */ .macro irq_handler - adrp x1, handle_arch_irq - ldr x1, [x1, #:lo12:handle_arch_irq] + ldr_l x1, handle_arch_irq mov x0, sp + irq_stack_entry x22 blr x1 + irq_stack_exit .endm .text diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 1e3cef578e21..ff7ebb710e51 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -25,14 +25,24 @@ #include #include #include +#include #include #include unsigned long irq_err_count; -/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned */ +/* + * irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. + * irq_stack[0] is used as irq_count, a non-zero value indicates the stack + * is in use, and el?_irq() shouldn't switch to it. This is used to detect + * recursive use of the irq_stack, it is lazily updated by + * do_softirq_own_stack(), which is called on the irq_stack, before + * re-enabling interrupts to process softirqs. + */ DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); +#define IRQ_COUNT() (*per_cpu(irq_stack, smp_processor_id())) + int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); @@ -56,3 +66,29 @@ void __init init_IRQ(void) if (!handle_arch_irq) panic("No interrupt controller found."); } + +/* + * do_softirq_own_stack() is called from irq_exit() before __do_softirq() + * re-enables interrupts, at which point we may re-enter el?_irq(). We + * increase irq_count here so that el1_irq() knows that it is already on the + * irq stack. + * + * Called with interrupts disabled, so we don't worry about moving cpu, or + * being interrupted while modifying irq_count. + * + * This function doesn't actually switch stack. + */ +void do_softirq_own_stack(void) +{ + int cpu = smp_processor_id(); + + WARN_ON_ONCE(!irqs_disabled()); + + if (on_irq_stack(current_stack_pointer, cpu)) { + IRQ_COUNT()++; + __do_softirq(); + IRQ_COUNT()--; + } else { + __do_softirq(); + } +} -- cgit v1.2.3 From 306fe6c320ec846544b25130a2fe61e3d394cb6f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Dec 2015 13:58:42 +0000 Subject: arm64: irq: fix walking from irq stack to task stack Running with CONFIG_DEBUG_SPINLOCK=y can trigger a BUG with the new IRQ stack code: BUG: spinlock lockup suspected on CPU#1 This is due to the IRQ_STACK_TO_TASK_STACK macro incorrectly retrieving the task stack pointer stashed at the top of the IRQ stack. Sayeth James: | Yup, this is what is happening. Its an off-by-one due to broken | thinking about how the stack works. My broken thinking was: | | > top ------------ | > | dummy_lr | <- irq_stack_ptr | > ------------ | > | x29 | | > ------------ | > | x19 | <- irq_stack_ptr - 0x10 | > ------------ | > | xzr | | > ------------ | | But the stack-pointer is decreased before use. 
So it actually looks | like this: | | > ------------ | > | | <- irq_stack_ptr | > top ------------ | > | dummy_lr | | > ------------ | > | x29 | <- irq_stack_ptr - 0x10 | > ------------ | > | x19 | | > ------------ | > | xzr | <- irq_stack_ptr - 0x20 | > ------------ | | The value being used as the original stack is x29, which in all the | tests is sp but without the current frames data, hence there are no | missing frames in the output. | | Jungseok Lee picked it up with a 32bit user space because aarch32 | can't use x29, so it remains 0 forever. The fix he posted is correct. This patch fixes the macro and adds some of this wisdom to a comment, so that the layout of the IRQ stack is well understood. Cc: James Morse Reported-by: Jungseok Lee Signed-off-by: Will Deacon (cherry picked from commit 7596abf2e5661d52c4f414f37addeed54e098880) Signed-off-by: Alex Shi --- arch/arm64/include/asm/irq.h | 20 ++++++++++++++++++-- arch/arm64/kernel/entry.S | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index fa2a8d0e4792..877c7e358384 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -19,7 +19,23 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); /* * The highest address on the stack, and the first to be used. Used to - * find the dummy-stack frame put down by el?_irq() in entry.S. + * find the dummy-stack frame put down by el?_irq() in entry.S, which + * is structured as follows: + * + * ------------ + * | | <- irq_stack_ptr + * top ------------ + * | elr_el1 | + * ------------ + * | x29 | <- irq_stack_ptr - 0x10 + * ------------ + * | xzr | + * ------------ + * | x19 | <- irq_stack_ptr - 0x20 + * ------------ + * + * where x19 holds a copy of the task stack pointer. + * */ #define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) @@ -27,7 +43,7 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * The offset from irq_stack_ptr where entry.S will store the original * stack pointer. Used by unwind_frame() and dump_backtrace(). */ -#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x10)); +#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x20)); extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8f7e737949fe..be7ec544b540 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -199,7 +199,7 @@ alternative_endif /* Add a dummy stack frame */ stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame mov x29, sp - stp xzr, x19, [sp, #-16]! + stp x19, xzr, [sp, #-16]! 9998: .endm -- cgit v1.2.3 From 95e1db8bd78d2b3f15f7d4e7896735a041c775f6 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:39 +0000 Subject: arm64: Add this_cpu_ptr() assembler macro for use in entry.S irq_stack is a per_cpu variable that needs to be accessed from entry.S. Use an assembler macro instead of the unreadable details.
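As an aside on what the macro below computes: arm64 keeps the current CPU's per-cpu offset in the tpidr_el1 system register, so a per-cpu pointer is simply the variable's link-time address plus that offset. A rough, non-authoritative C sketch of the same computation (sketch_this_cpu_ptr is an invented name, not a kernel interface, and like the macro it can only run in kernel context, where tpidr_el1 is readable):

static inline void *sketch_this_cpu_ptr(void *sym)
{
	unsigned long off;

	/* per-cpu offset of the CPU we are currently running on */
	asm ("mrs %0, tpidr_el1" : "=r" (off));
	return (void *)((unsigned long)sym + off);
}

The assembler macro in the hunk below does the same in three instructions: adr_l to take the symbol's address, mrs to fetch tpidr_el1, and an add.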
Signed-off-by: James Morse Signed-off-by: Will Deacon (cherry picked from commit aa4d5d3cbc258c355151a3903211b27359390ec5) Signed-off-by: Alex Shi --- arch/arm64/include/asm/assembler.h | 11 +++++++++++ arch/arm64/kernel/entry.S | 4 +--- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 12eff928ef8b..bb7b72734c24 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -193,6 +193,17 @@ lr .req x30 // link register str \src, [\tmp, :lo12:\sym] .endm + /* + * @sym: The name of the per-cpu variable + * @reg: Result of per_cpu(sym, smp_processor_id()) + * @tmp: scratch register + */ + .macro this_cpu_ptr, sym, reg, tmp + adr_l \reg, \sym + mrs \tmp, tpidr_el1 + add \reg, \reg, \tmp + .endm + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index be7ec544b540..e394f8c9595a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -179,9 +179,7 @@ alternative_endif .macro irq_stack_entry, dummy_lr mov x19, sp // preserve the original sp - adr_l x25, irq_stack - mrs x26, tpidr_el1 - add x25, x25, x26 + this_cpu_ptr irq_stack, x25, x26 /* * Check the lowest address on irq_stack for the irq_count value, -- cgit v1.2.3 From e330d15430acce6073bb2c8486fba7555be1e923 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:40 +0000 Subject: arm64: when walking onto the task stack, check sp & fp are in current->stack When unwind_frame() reaches the bottom of the irq_stack, the last fp points to the original task stack. unwind_frame() uses IRQ_STACK_TO_TASK_STACK() to find the sp value. If either values is wrong, we may end up walking a corrupt stack. Check these values are sane by testing if they are both on the stack pointed to by current->stack. Signed-off-by: James Morse Signed-off-by: Will Deacon (cherry picked from commit 1ffe199b1c9b72a8e752a9ae2a7af10128ab2ca1) Signed-off-by: Alex Shi --- arch/arm64/kernel/stacktrace.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b947eeffa5b2..d916d5b6aef6 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -71,9 +71,17 @@ int notrace unwind_frame(struct stackframe *frame) * to task stack. * If we reach the end of the stack - and its an interrupt stack, * read the original task stack pointer from the dummy frame. + * + * Check the frame->fp we read from the bottom of the irq_stack, + * and the original task stack pointer are both in current->stack. */ - if (frame->sp == irq_stack_ptr) - frame->sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + if (frame->sp == irq_stack_ptr) { + unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + + if(object_is_on_stack((void *)orig_sp) && + object_is_on_stack((void *)frame->fp)) + frame->sp = orig_sp; + } return 0; } -- cgit v1.2.3 From 70dfc6968ad22e057520da92b7b4da86041d3ea7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:41 +0000 Subject: arm64: don't call C code with el0's fp register On entry from el0, we save all the registers on the kernel stack, and restore them before returning. x29 remains unchanged when we call out to C code, which will store x29 as the frame-pointer on the stack. 
Instead, write 0 into x29 after entry from el0, to avoid any risk of tracing into user space. Signed-off-by: James Morse Signed-off-by: Will Deacon (cherry picked from commit 49003a8d6b35e128ef5e51433e60e783a46fbe5f) Signed-off-by: Alex Shi --- arch/arm64/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e394f8c9595a..2284c296e3f7 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -93,6 +93,8 @@ and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug disable_step_tsk x19, x20 // exceptions when scheduling. + + mov x29, xzr // fp pointed to user-space .else add x21, sp, #S_FRAME_SIZE .endif -- cgit v1.2.3 From 2949f7a8a1516b704750cd343aebb36de2428d38 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:36 +0000 Subject: arm64: mm: remove pointless PAGE_MASKing As pgd_offset{,_k} shift the input address by PGDIR_SHIFT, the sub-page bits will always be shifted out. There is no need to apply PAGE_MASK before this. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon (cherry picked from commit e2c30ee320eb96304896c7ab84499e5bc5e5fb6e) Signed-off-by: Alex Shi --- arch/arm64/mm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d2a6194f4bec..c5bd5bca8e3d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -288,7 +288,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK), phys, virt, + __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, size, prot, early_alloc); } @@ -309,7 +309,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, return; } - return __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK), + return __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, size, prot, late_alloc); } -- cgit v1.2.3 From a2151a0e23afcaf1fbb8f2e63bf2335f61e12172 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:37 +0000 Subject: arm64: Remove redundant padding from linker script Currently we place an ALIGN_DEBUG_RO between text and data for the .text and .init sections, and depending on configuration each of these may result in up to SECTION_SIZE bytes worth of padding (for DEBUG_RODATA_ALIGN). We make no distinction between the text and data in each of these sections at any point when creating the initial page tables in head.S. We also make no distinction when modifying the tables; __map_memblock, fixup_executable, mark_rodata_ro, and fixup_init only work at section granularity. Thus this padding is unnecessary. For the split between init text and data we impose a minimum alignment of 16 bytes, but this is also unnecessary. The init data is output immediately after the padding before any symbols are defined, so this is not required to keep a symbol for a linker section array correctly associated with the data. Any objects within the section will be given at least their usual alignment regardless. This patch removes the redundant padding.
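Returning to the PAGE_MASKing commit above: the claim that pre-masking is pointless is easy to check outside the kernel. A minimal user-space sketch, with illustrative 4KB-granule constants standing in for the kernel's real definitions, and pgd_index as a simplified stand-in for the arithmetic behind pgd_offset_k():

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))
#define PGDIR_SHIFT	39
#define PTRS_PER_PGD	512UL

static unsigned long pgd_index(unsigned long addr)
{
	/* every sub-page bit sits below PGDIR_SHIFT and is shifted out */
	return (addr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
}

int main(void)
{
	unsigned long virt = 0xffffff8000201234UL;

	printf("%lu\n", pgd_index(virt));
	printf("%lu\n", pgd_index(virt & PAGE_MASK));
	return 0;
}

Both calls print the same index, which is why the kernel can drop the mask.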
Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon (cherry picked from commit 5b28cd9d084eca8ddc46270d2720305bfd40e348) Signed-off-by: Alex Shi --- arch/arm64/kernel/vmlinux.lds.S | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 71426a78db12..cc2572db32a6 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -113,7 +113,6 @@ SECTIONS *(.got) /* Global offset table */ } - ALIGN_DEBUG_RO RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(8) NOTES @@ -128,7 +127,6 @@ SECTIONS ARM_EXIT_KEEP(EXIT_TEXT) } - ALIGN_DEBUG_RO_MIN(16) .init.data : { INIT_DATA INIT_SETUP(16) -- cgit v1.2.3 From 9250d09be9e3f467145584fd476a6132e04ddaef Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:38 +0000 Subject: arm64: mm: fold alternatives into .init Currently we treat the alternatives separately from other data that's only used during initialisation, using separate .altinstructions and .altinstr_replacement linker sections. These are freed for general allocation separately from .init*. This is problematic as: * We do not remove execute permissions, as we do for .init, leaving the memory executable. * We pad between them, making the kernel Image binary up to PAGE_SIZE bytes larger than necessary. This patch moves the two sections into the contiguous region used for .init*. This saves some memory, ensures that we remove execute permissions, and allows us to remove some code made redundant by this reorganisation. Signed-off-by: Mark Rutland Cc: Andre Przywara Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon (cherry picked from commit 9aa4ec1571da62366cfddc20f3b923609604fe63) Signed-off-by: Alex Shi --- arch/arm64/include/asm/alternative.h | 1 - arch/arm64/kernel/alternative.c | 6 ------ arch/arm64/kernel/vmlinux.lds.S | 5 ++--- arch/arm64/mm/init.c | 1 - 4 files changed, 2 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index d56ec0715157..e4962f04201e 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -19,7 +19,6 @@ struct alt_instr { void __init apply_alternatives_all(void); void apply_alternatives(void *start, size_t length); -void free_alternatives_memory(void); #define ALTINSTR_ENTRY(feature) \ " .word 661b - .\n" /* label */ \ diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index ab9db0e9818c..d2ee1b21a10d 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -158,9 +158,3 @@ void apply_alternatives(void *start, size_t length) __apply_alternatives(&region); } - -void free_alternatives_memory(void) -{ - free_reserved_area(__alt_instructions, __alt_instructions_end, - 0, "alternatives"); -} diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index cc2572db32a6..e3928f578891 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -141,9 +141,6 @@ SECTIONS PERCPU_SECTION(L1_CACHE_BYTES) - . = ALIGN(PAGE_SIZE); - __init_end = .; - . = ALIGN(4); .altinstructions : { __alt_instructions = .; @@ -155,6 +152,8 @@ SECTIONS } .
= ALIGN(PAGE_SIZE); + __init_end = .; + _data = .; _sdata = .; RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 10fab52eed95..dba32ceff17a 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -360,7 +360,6 @@ void free_initmem(void) { fixup_init(); free_initmem_default(0); - free_alternatives_memory(); } #ifdef CONFIG_BLK_DEV_INITRD -- cgit v1.2.3 From c92251d1a82c7c071662ca90300ebf488ab3d6f1 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 10 Dec 2015 16:54:32 +0000 Subject: arm64: cmpxchg: Don't include linux/mmdebug.h The arm64 asm/cmpxchg.h includes linux/mmdebug.h but, so far as I can tell, doesn't actually use anything from it. Removing the inclusion reduces spurious header dependency rebuilds, and also avoids the build breaks that recursive header inclusion can cause (attempts to use things before they are defined) if linux/mmdebug.h starts pulling in more low-level headers. Such errors have happened in -next recently, for example: In file included from include/linux/completion.h:11:0, from include/linux/rcupdate.h:43, from include/linux/tracepoint.h:19, from include/linux/mmdebug.h:6, from ./arch/arm64/include/asm/cmpxchg.h:22, from ./arch/arm64/include/asm/atomic.h:41, from include/linux/atomic.h:4, from include/linux/spinlock.h:406, from include/linux/seqlock.h:35, from include/linux/time.h:5, from include/uapi/linux/timex.h:56, from include/linux/timex.h:56, from include/linux/sched.h:19, from arch/arm64/kernel/asm-offsets.c:21: include/linux/wait.h: In function 'wait_on_atomic_t': include/linux/wait.h:1218:2: error: implicit declaration of function 'atomic_read' [-Werror=implicit-function-declaration] if (atomic_read(val) == 0) Signed-off-by: Mark Brown Signed-off-by: Will Deacon (cherry picked from commit 4a6ccf30263f4e265c0f171561bf4c40bed5f273) Signed-off-by: Alex Shi --- arch/arm64/include/asm/cmpxchg.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h index 9ea611ea69df..510c7b404454 100644 --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h @@ -19,7 +19,6 @@ #define __ASM_CMPXCHG_H #include <linux/bug.h> -#include <linux/mmdebug.h> #include <asm/atomic.h> #include <asm/barrier.h> -- cgit v1.2.3 From 9429ab599551a430b8e6d9d8bfccfc9f31288211 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 11 Dec 2015 11:04:31 +0000 Subject: arm64: mm: place __cpu_setup in .text We currently place __cpu_setup in .text.init, which ends up being part of .text. The .text.init section is a legacy section name that has been unused elsewhere for a long time. The ".text.init" name is misleading if read as a synonym for ".init.text". Any CPU may execute __cpu_setup before turning the MMU on, so it should simply live in .text. Remove the pointless section assignment. This will leave __cpu_setup in the .text section.
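Stepping back to the cmpxchg.h commit above: the "use before defined" failure mode it mentions comes from include guards cutting recursive inclusion short. A contrived three-file sketch (plain user-space C, nothing kernel-specific) that compiles as shown, but breaks the moment b.h tries to reference struct a:

/* a.h */
#ifndef A_H
#define A_H
#include "b.h"		/* b.h is fully processed before the rest of a.h */
struct a { int x; struct b y; };
#endif

/* b.h */
#ifndef B_H
#define B_H
#include "a.h"		/* no-op while a.h is still being processed: its
			   guard is already set, so nothing from a.h is
			   visible here yet */
struct b { int z; };
#endif

/* main.c */
#include "a.h"
/* This compiles only because b.h happens not to use struct a. Add a
 * 'struct a' member to struct b and the build breaks whenever a.h is
 * included first -- the hazard the commit message describes. */
int main(void) { struct a v = { 0, { 0 } }; return v.x; }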
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Will Deacon (cherry picked from commit f00083cae331e5d3eecade6b4fdc35d0825e73ef) Signed-off-by: Alex Shi --- arch/arm64/mm/proc.S | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index b8f04b3f2786..c164d2cb35c0 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -140,8 +140,6 @@ ENTRY(cpu_do_switch_mm) ret ENDPROC(cpu_do_switch_mm) - .section ".text.init", #alloc, #execinstr - /* * __cpu_setup * -- cgit v1.2.3 From a79c216b06b18f1545d55e8f238dd9b49896f347 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 15 Dec 2015 11:21:25 +0000 Subject: arm64: reduce stack use in irq_handler The code for switching to irq_stack stores three pieces of information on the stack, fp+lr, as a fake stack frame (that lets us walk back onto the interrupted tasks stack frame), and the address of the struct pt_regs that contains the register values from kernel entry. (which dump_backtrace() will print in any stack trace). To reduce this, we store fp, and the pointer to the struct pt_regs. unwind_frame() can recognise this as the irq_stack dummy frame, (as it only appears at the top of the irq_stack), and use the struct pt_regs values to find the missing interrupted link-register. Suggested-by: Will Deacon Signed-off-by: James Morse Signed-off-by: Will Deacon (cherry picked from commit 971c67ce37cfeeaf560e792a2c3bc21d8b67163a) Signed-off-by: Alex Shi --- arch/arm64/include/asm/irq.h | 11 ++++------- arch/arm64/kernel/entry.S | 12 +++++++----- arch/arm64/kernel/stacktrace.c | 19 ++++++++++++++++--- 3 files changed, 27 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 877c7e358384..3bece4379bd9 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -25,16 +25,13 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * ------------ * | | <- irq_stack_ptr * top ------------ - * | elr_el1 | + * | x19 | <- irq_stack_ptr - 0x08 * ------------ * | x29 | <- irq_stack_ptr - 0x10 * ------------ - * | xzr | - * ------------ - * | x19 | <- irq_stack_ptr - 0x20 - * ------------ * - * where x19 holds a copy of the task stack pointer. + * where x19 holds a copy of the task stack pointer where the struct pt_regs + * from kernel_entry can be found. * */ #define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) @@ -43,7 +40,7 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * The offset from irq_stack_ptr where entry.S will store the original * stack pointer. Used by unwind_frame() and dump_backtrace(). */ -#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x20)); +#define IRQ_STACK_TO_TASK_STACK(ptr) (*((unsigned long *)((ptr) - 0x08))) extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2284c296e3f7..0667fb7d8bb1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -178,7 +178,7 @@ alternative_endif mrs \rd, sp_el0 .endm - .macro irq_stack_entry, dummy_lr + .macro irq_stack_entry mov x19, sp // preserve the original sp this_cpu_ptr irq_stack, x25, x26 @@ -196,10 +196,12 @@ alternative_endif add x26, x25, x26 mov sp, x26 - /* Add a dummy stack frame */ - stp x29, \dummy_lr, [sp, #-16]! 
// dummy stack frame + /* + * Add a dummy stack frame, this non-standard format is fixed up + * by unwind_frame() + */ + stp x29, x19, [sp, #-16]! mov x29, sp - stp x19, xzr, [sp, #-16]! 9998: .endm @@ -229,7 +231,7 @@ tsk .req x28 // current thread_info .macro irq_handler ldr_l x1, handle_arch_irq mov x0, sp - irq_stack_entry x22 + irq_stack_entry blr x1 irq_stack_exit .endm diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d916d5b6aef6..b9fd3a8abfc1 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -70,17 +70,30 @@ int notrace unwind_frame(struct stackframe *frame) * Check whether we are going to walk through from interrupt stack * to task stack. * If we reach the end of the stack - and its an interrupt stack, - * read the original task stack pointer from the dummy frame. + * unpack the dummy frame to find the original elr. * * Check the frame->fp we read from the bottom of the irq_stack, * and the original task stack pointer are both in current->stack. */ if (frame->sp == irq_stack_ptr) { + struct pt_regs *irq_args; unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); - if(object_is_on_stack((void *)orig_sp) && - object_is_on_stack((void *)frame->fp)) + if (object_is_on_stack((void *)orig_sp) && + object_is_on_stack((void *)frame->fp)) { frame->sp = orig_sp; + + /* orig_sp is the saved pt_regs, find the elr */ + irq_args = (struct pt_regs *)orig_sp; + frame->pc = irq_args->pc; + } else { + /* + * This frame has a non-standard format, and we + * didn't fix it, because the data looked wrong. + * Refuse to output this frame. + */ + return -EINVAL; + } } return 0; -- cgit v1.2.3 From ac7406c28c8bada863d36c46ca246bb7b76f3e9f Mon Sep 17 00:00:00 2001 From: Ashok Kumar Date: Thu, 17 Dec 2015 01:38:31 -0800 Subject: arm64: Defer dcache flush in __cpu_copy_user_page Defer dcache flushing to __sync_icache_dcache by calling flush_dcache_page which clears PG_dcache_clean flag. Acked-by: Catalin Marinas Signed-off-by: Ashok Kumar Signed-off-by: Will Deacon (cherry picked from commit e6b1185f77351aa154e63bd54b05d07ff99d4ffa) Signed-off-by: Alex Shi --- arch/arm64/mm/copypage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 13bbc3be6f5a..22e4cb4d6f53 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -24,8 +24,9 @@ void __cpu_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr) { + struct page *page = virt_to_page(kto); copy_page(kto, kfrom); - __flush_dcache_area(kto, PAGE_SIZE); + flush_dcache_page(page); } EXPORT_SYMBOL_GPL(__cpu_copy_user_page); -- cgit v1.2.3 From 358e3c80a223c4d79a786be2e71e51cab91c2e7e Mon Sep 17 00:00:00 2001 From: Ashok Kumar Date: Thu, 17 Dec 2015 01:38:32 -0800 Subject: arm64: Use PoU cache instr for I/D coherency In systems with three levels of cache(PoU at L1 and PoC at L3), PoC cache flush instructions flushes L2 and L3 caches which could affect performance. For cache flushes for I and D coherency, PoU should suffice. So changing all I and D coherency related cache flushes to PoU. Introduced a new __clean_dcache_area_pou API for dcache flush till PoU and provided a common macro for __flush_dcache_area and __clean_dcache_area_pou. Also, now in __sync_icache_dcache, icache invalidation for non-aliasing VIPT icache is done only for that particular page instead of the earlier __flush_icache_all. 
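Since the flattened diff below is hard to read, here is the sync_icache_aliases() helper the patch adds to flush.c, restated as plain C with explanatory comments added (transcribed from the hunk; the patch itself is the authoritative version):

static void sync_icache_aliases(void *kaddr, unsigned long len)
{
	unsigned long addr = (unsigned long)kaddr;

	if (icache_is_aliasing()) {
		/* Aliasing VIPT I-cache: maintenance by VA is not enough,
		 * so clean the D-cache to the PoU (cheaper than the PoC)
		 * and invalidate the whole I-cache. */
		__clean_dcache_area_pou(kaddr, len);
		__flush_icache_all();
	} else {
		/* Non-aliasing I-cache: only the affected lines need to
		 * be cleaned and invalidated. */
		flush_icache_range(addr, addr + len);
	}
}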
Reviewed-by: Catalin Marinas Reviewed-by: Mark Rutland Signed-off-by: Ashok Kumar Signed-off-by: Will Deacon (cherry picked from commit 0a28714c53fd4f7aea709be7577dfbe0095c8c3e) Signed-off-by: Alex Shi Conflicts: included reset_pmuserenr_el0 in arch/arm64/mm/proc-macros.S --- arch/arm64/include/asm/cacheflush.h | 1 + arch/arm64/mm/cache.S | 28 +++++++++++++++++----------- arch/arm64/mm/flush.c | 33 ++++++++++++++++++--------------- arch/arm64/mm/proc-macros.S | 22 ++++++++++++++++++++++ 4 files changed, 58 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 54efedaf331f..7fc294c3bc5b 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -68,6 +68,7 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void flush_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); +extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); static inline void flush_cache_mm(struct mm_struct *mm) diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index cfa44a6adc0a..6df07069a025 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -81,25 +81,31 @@ ENDPROC(__flush_cache_user_range) /* * __flush_dcache_area(kaddr, size) * - * Ensure that the data held in the page kaddr is written back to the - * page in question. + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned and invalidated to the PoC. * * - kaddr - kernel address * - size - size in question */ ENTRY(__flush_dcache_area) - dcache_line_size x2, x3 - add x1, x0, x1 - sub x3, x2, #1 - bic x0, x0, x3 -1: dc civac, x0 // clean & invalidate D line / unified line - add x0, x0, x2 - cmp x0, x1 - b.lo 1b - dsb sy + dcache_by_line_op civac, sy, x0, x1, x2, x3 ret ENDPIPROC(__flush_dcache_area) +/* + * __clean_dcache_area_pou(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned to the PoU. 
+ * + * - kaddr - kernel address + * - size - size in question + */ +ENTRY(__clean_dcache_area_pou) + dcache_by_line_op cvau, ish, x0, x1, x2, x3 + ret +ENDPROC(__clean_dcache_area_pou) + /* * __inval_cache_range(start, end) * - start - start address of region diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index c26b804015e8..46649d6e6c5a 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -34,19 +34,24 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, __flush_icache_all(); } +static void sync_icache_aliases(void *kaddr, unsigned long len) +{ + unsigned long addr = (unsigned long)kaddr; + + if (icache_is_aliasing()) { + __clean_dcache_area_pou(kaddr, len); + __flush_icache_all(); + } else { + flush_icache_range(addr, addr + len); + } +} + static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *kaddr, unsigned long len) { - if (vma->vm_flags & VM_EXEC) { - unsigned long addr = (unsigned long)kaddr; - if (icache_is_aliasing()) { - __flush_dcache_area(kaddr, len); - __flush_icache_all(); - } else { - flush_icache_range(addr, addr + len); - } - } + if (vma->vm_flags & VM_EXEC) + sync_icache_aliases(kaddr, len); } /* @@ -74,13 +79,11 @@ void __sync_icache_dcache(pte_t pte, unsigned long addr) if (!page_mapping(page)) return; - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) { - __flush_dcache_area(page_address(page), - PAGE_SIZE << compound_order(page)); + if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + sync_icache_aliases(page_address(page), + PAGE_SIZE << compound_order(page)); + else if (icache_is_aivivt()) __flush_icache_all(); - } else if (icache_is_aivivt()) { - __flush_icache_all(); - } } /* diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S index d69dffffaa89..984edcda1850 100644 --- a/arch/arm64/mm/proc-macros.S +++ b/arch/arm64/mm/proc-macros.S @@ -74,3 +74,25 @@ msr pmuserenr_el0, xzr // Disable PMU access from EL0 9000: .endm + +/* + * Macro to perform a data cache maintenance for the interval + * [kaddr, kaddr + size) + * + * op: operation passed to dc instruction + * domain: domain used in dsb instruciton + * kaddr: starting virtual address of the region + * size: size of the region + * Corrupts: kaddr, size, tmp1, tmp2 + */ + .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 + dcache_line_size \tmp1, \tmp2 + add \size, \kaddr, \size + sub \tmp2, \tmp1, #1 + bic \kaddr, \kaddr, \tmp2 +9998: dc \op, \kaddr + add \kaddr, \kaddr, \tmp1 + cmp \kaddr, \size + b.lo 9998b + dsb \domain + .endm -- cgit v1.2.3 From 720089ef0ba8007c34dfa7d80d96e27d6d23088f Mon Sep 17 00:00:00 2001 From: David Woods Date: Thu, 17 Dec 2015 14:31:26 -0500 Subject: arm64: hugetlb: add support for PTE contiguous bit The arm64 MMU supports a Contiguous bit which is a hint that the TTE is one of a set of contiguous entries which can be cached in a single TLB entry. Supporting this bit adds new intermediate huge page sizes. The set of huge page sizes available depends on the base page size. Without using contiguous pages the huge page sizes are as follows. 4KB: 2MB 1GB 64KB: 512MB With a 4KB granule, the contiguous bit groups together sets of 16 pages and with a 64KB granule it groups sets of 32 pages. This enables two new huge page sizes in each case, so that the full set of available sizes is as follows. 4KB: 64KB 2MB 32MB 1GB 64KB: 2MB 512MB 16GB If a 16KB granule is used then the contiguous bit groups 128 pages at the PTE level and 32 pages at the PMD level. 
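To sanity-check the size table above, a small user-space sketch: the contiguous-shift values are transcribed from the pgtable-hwdef.h hunk later in this patch, and PMD_SHIFT is derived assuming each translation level resolves PAGE_SHIFT - 3 bits:

#include <stdio.h>

static void show(const char *what, unsigned long bytes)
{
	if (bytes >= (1UL << 30))
		printf("  %-9s %luGB\n", what, bytes >> 30);
	else if (bytes >= (1UL << 20))
		printf("  %-9s %luMB\n", what, bytes >> 20);
	else
		printf("  %-9s %luKB\n", what, bytes >> 10);
}

int main(void)
{
	const struct {
		int page_shift, cont_pte_shift, cont_pmd_shift;
	} g[] = {
		{ 12, 4, 4 },	/* 4KB granule: 16 contiguous PTEs/PMDs */
		{ 14, 7, 5 },	/* 16KB granule: 128 PTEs, 32 PMDs */
		{ 16, 5, 5 },	/* 64KB granule: 32 contiguous PTEs/PMDs */
	};

	for (int i = 0; i < 3; i++) {
		unsigned long page = 1UL << g[i].page_shift;
		/* one PMD maps PTRS_PER_PTE pages */
		unsigned long pmd = page << (g[i].page_shift - 3);

		printf("%luKB granule:\n", page >> 10);
		show("cont-PTE", page << g[i].cont_pte_shift);
		show("PMD", pmd);
		show("cont-PMD", pmd << g[i].cont_pmd_shift);
	}
	return 0;
}

For the 4KB granule this prints 64KB, 2MB and 32MB, and for the 64KB granule 2MB, 512MB and 16GB, matching the sizes listed above.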
If the base page size is set to 64KB then 2MB pages are enabled by default. It is possible in the future to make 2MB the default huge page size for both 4KB and 64KB granules. Reviewed-by: Chris Metcalf Reviewed-by: Steve Capper Signed-off-by: David Woods Signed-off-by: Will Deacon (cherry picked from commit 66b3923a1a0f77a563b43f43f6ad091354abbfe9) Signed-off-by: Alex Shi --- arch/arm64/Kconfig | 3 - arch/arm64/include/asm/hugetlb.h | 44 ++---- arch/arm64/include/asm/pgtable-hwdef.h | 18 ++- arch/arm64/include/asm/pgtable.h | 10 +- arch/arm64/mm/hugetlbpage.c | 274 ++++++++++++++++++++++++++++++++- 5 files changed, 313 insertions(+), 36 deletions(-) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4876459c0838..ffa3c549a4ba 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -530,9 +530,6 @@ config HW_PERF_EVENTS config SYS_SUPPORTS_HUGETLBFS def_bool y -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config ARCH_WANT_HUGE_PMD_SHARE def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index bb4052e85dba..bbc1e35aa601 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -26,36 +26,7 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return *ptep; } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - ptep_clear_flush(vma, addr, ptep); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -97,4 +68,19 @@ static inline void arch_clear_hugepage_flags(struct page *page) clear_bit(PG_dcache_clean, &page->flags); } +extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable); +#define arch_make_huge_pte arch_make_huge_pte +extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty); +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); +extern void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); +extern void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index d6739e836f7b..5c25b831273d 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -90,7 +90,23 @@ /* * Contiguous page definitions. 
*/ -#define CONT_PTES (_AC(1, UL) << CONT_SHIFT) +#ifdef CONFIG_ARM64_64K_PAGES +#define CONT_PTE_SHIFT 5 +#define CONT_PMD_SHIFT 5 +#elif defined(CONFIG_ARM64_16K_PAGES) +#define CONT_PTE_SHIFT 7 +#define CONT_PMD_SHIFT 5 +#else +#define CONT_PTE_SHIFT 4 +#define CONT_PMD_SHIFT 4 +#endif + +#define CONT_PTES (1 << CONT_PTE_SHIFT) +#define CONT_PTE_SIZE (CONT_PTES * PAGE_SIZE) +#define CONT_PTE_MASK (~(CONT_PTE_SIZE - 1)) +#define CONT_PMDS (1 << CONT_PMD_SHIFT) +#define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE) +#define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1)) /* the the numerical offset of the PTE within a range of CONT_PTES */ #define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1)) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index cd5dfc97268e..fd3d7c177c5f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -228,7 +228,8 @@ static inline pte_t pte_mkspecial(pte_t pte) static inline pte_t pte_mkcont(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_CONT)); + pte = set_pte_bit(pte, __pgprot(PTE_CONT)); + return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE)); } static inline pte_t pte_mknoncont(pte_t pte) @@ -236,6 +237,11 @@ static inline pte_t pte_mknoncont(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_CONT)); } +static inline pmd_t pmd_mkcont(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) | PMD_SECT_CONT); +} + static inline void set_pte(pte_t *ptep, pte_t pte) { *ptep = pte; @@ -309,7 +315,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, /* * Hugetlb definitions. */ -#define HUGE_MAX_HSTATE 2 +#define HUGE_MAX_HSTATE 4 #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 383b03ff38f8..82d607c3614e 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -41,17 +41,289 @@ int pud_huge(pud_t pud) #endif } +static int find_num_contig(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, size_t *pgsize) +{ + pgd_t *pgd = pgd_offset(mm, addr); + pud_t *pud; + pmd_t *pmd; + + *pgsize = PAGE_SIZE; + if (!pte_cont(pte)) + return 1; + if (!pgd_present(*pgd)) { + VM_BUG_ON(!pgd_present(*pgd)); + return 1; + } + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) { + VM_BUG_ON(!pud_present(*pud)); + return 1; + } + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + VM_BUG_ON(!pmd_present(*pmd)); + return 1; + } + if ((pte_t *)pmd == ptep) { + *pgsize = PMD_SIZE; + return CONT_PMDS; + } + return CONT_PTES; +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + size_t pgsize; + int i; + int ncontig = find_num_contig(mm, addr, ptep, pte, &pgsize); + unsigned long pfn; + pgprot_t hugeprot; + + if (ncontig == 1) { + set_pte_at(mm, addr, ptep, pte); + return; + } + + pfn = pte_pfn(pte); + hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); + for (i = 0; i < ncontig; i++) { + pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep, + pte_val(pfn_pte(pfn, hugeprot))); + set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + ptep++; + pfn += pgsize >> PAGE_SHIFT; + addr += pgsize; + } +} + +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgd; + pud_t *pud; + pte_t *pte = NULL; + + pr_debug("%s: addr:0x%lx sz:0x%lx\n", __func__, addr, sz); + pgd = pgd_offset(mm, addr); + pud = pud_alloc(mm, pgd, addr); + if (!pud) + 
return NULL; + + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + } else if (sz == (PAGE_SIZE * CONT_PTES)) { + pmd_t *pmd = pmd_alloc(mm, pud, addr); + + WARN_ON(addr & (sz - 1)); + /* + * Note that if this code were ever ported to the + * 32-bit arm platform then it will cause trouble in + * the case where CONFIG_HIGHPTE is set, since there + * will be no pte_unmap() to correspond with this + * pte_alloc_map(). + */ + pte = pte_alloc_map(mm, NULL, pmd, addr); + } else if (sz == PMD_SIZE) { + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && + pud_none(*pud)) + pte = huge_pmd_share(mm, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); + } else if (sz == (PMD_SIZE * CONT_PMDS)) { + pmd_t *pmd; + + pmd = pmd_alloc(mm, pud, addr); + WARN_ON(addr & (sz - 1)); + return (pte_t *)pmd; + } + + pr_debug("%s: addr:0x%lx sz:0x%lx ret pte=%p/0x%llx\n", __func__, addr, + sz, pte, pte_val(*pte)); + return pte; +} + +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); + pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd); + if (!pgd_present(*pgd)) + return NULL; + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return NULL; + + if (pud_huge(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return NULL; + + if (pte_cont(pmd_pte(*pmd))) { + pmd = pmd_offset( + pud, (addr & CONT_PMD_MASK)); + return (pte_t *)pmd; + } + if (pmd_huge(*pmd)) + return (pte_t *)pmd; + pte = pte_offset_kernel(pmd, addr); + if (pte_present(*pte) && pte_cont(*pte)) { + pte = pte_offset_kernel( + pmd, (addr & CONT_PTE_MASK)); + return pte; + } + return NULL; +} + +pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable) +{ + size_t pagesize = huge_page_size(hstate_vma(vma)); + + if (pagesize == CONT_PTE_SIZE) { + entry = pte_mkcont(entry); + } else if (pagesize == CONT_PMD_SIZE) { + entry = pmd_pte(pmd_mkcont(pte_pmd(entry))); + } else if (pagesize != PUD_SIZE && pagesize != PMD_SIZE) { + pr_warn("%s: unrecognized huge page size 0x%lx\n", + __func__, pagesize); + } + return entry; +} + +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte; + + if (pte_cont(*ptep)) { + int ncontig, i; + size_t pgsize; + pte_t *cpte; + bool is_dirty = false; + + cpte = huge_pte_offset(mm, addr); + ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + /* save the 1st pte to return */ + pte = ptep_get_and_clear(mm, addr, cpte); + for (i = 1; i < ncontig; ++i) { + /* + * If HW_AFDBM is enabled, then the HW could + * turn on the dirty bit for any of the page + * in the set, so check them all. 
+ */ + ++cpte; + if (pte_dirty(ptep_get_and_clear(mm, addr, cpte))) + is_dirty = true; + } + if (is_dirty) + return pte_mkdirty(pte); + else + return pte; + } else { + return ptep_get_and_clear(mm, addr, ptep); + } +} + +int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) +{ + pte_t *cpte; + + if (pte_cont(pte)) { + int ncontig, i, changed = 0; + size_t pgsize = 0; + unsigned long pfn = pte_pfn(pte); + /* Select all bits except the pfn */ + pgprot_t hugeprot = + __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ + pte_val(pte)); + + cpte = huge_pte_offset(vma->vm_mm, addr); + pfn = pte_pfn(*cpte); + ncontig = find_num_contig(vma->vm_mm, addr, cpte, + *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) { + changed = ptep_set_access_flags(vma, addr, cpte, + pfn_pte(pfn, + hugeprot), + dirty); + pfn += pgsize >> PAGE_SHIFT; + } + return changed; + } else { + return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + } +} + +void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + if (pte_cont(*ptep)) { + int ncontig, i; + pte_t *cpte; + size_t pgsize = 0; + + cpte = huge_pte_offset(mm, addr); + ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) + ptep_set_wrprotect(mm, addr, cpte); + } else { + ptep_set_wrprotect(mm, addr, ptep); + } +} + +void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + if (pte_cont(*ptep)) { + int ncontig, i; + pte_t *cpte; + size_t pgsize = 0; + + cpte = huge_pte_offset(vma->vm_mm, addr); + ncontig = find_num_contig(vma->vm_mm, addr, cpte, + *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) + ptep_clear_flush(vma, addr, cpte); + } else { + ptep_clear_flush(vma, addr, ptep); + } +} + static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); + if (ps == PMD_SIZE) { hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); } else if (ps == PUD_SIZE) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + } else if (ps == (PAGE_SIZE * CONT_PTES)) { + hugetlb_add_hstate(CONT_PTE_SHIFT); + } else if (ps == (PMD_SIZE * CONT_PMDS)) { + hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT); } else { - pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20); + pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); return 0; } return 1; } __setup("hugepagesz=", setup_hugepagesz); + +#ifdef CONFIG_ARM64_64K_PAGES +static __init int add_default_hugepagesz(void) +{ + if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) + hugetlb_add_hstate(CONT_PMD_SHIFT); + return 0; +} +arch_initcall(add_default_hugepagesz); +#endif -- cgit v1.2.3 From da604e8646f553a4a2c7613491112b0c9b3ec400 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 18 Dec 2015 16:01:47 +0000 Subject: arm64: remove irq_count and do_softirq_own_stack() sysrq_handle_reboot() re-enables interrupts while on the irq stack. The irq_stack implementation wrongly assumed this would only ever happen via the softirq path, allowing it to update irq_count late, in do_softirq_own_stack(). This means if an irq occurs in sysrq_handle_reboot(), during emergency_restart() the stack will be corrupted, as irq_count wasn't updated. Lose the optimisation, and instead of moving the adding/subtracting of irq_count into irq_stack_entry/irq_stack_exit, remove it, and compare sp_el0 (struct thread_info) with sp & ~(THREAD_SIZE - 1). This tells us if we are on a task stack, if so, we can safely switch to the irq stack. 
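Expressed in C, the new entry-time test is roughly the following sketch (illustrative only; the real check is the assembly added to irq_stack_entry in the hunk below):

    /*
     * Task stacks are THREAD_SIZE aligned and sp_el0 holds the current
     * thread_info pointer, so masking sp identifies the stack we are on.
     * Only switch to the irq stack when arriving from a task stack.
     */
    static inline bool on_task_stack(unsigned long sp, struct thread_info *ti)
    {
        return (sp & ~(THREAD_SIZE - 1)) == (unsigned long)ti;
    }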
Finally, remove do_softirq_own_stack(), we don't need it anymore. Reported-by: Will Deacon Signed-off-by: James Morse [will: use get_thread_info macro] Signed-off-by: Will Deacon (cherry picked from commit d224a69e3d80fe08f285d1f41d21b590bae4fa9f) Signed-off-by: Alex Shi --- arch/arm64/include/asm/irq.h | 2 -- arch/arm64/kernel/entry.S | 19 ++++++++++--------- arch/arm64/kernel/irq.c | 38 +------------------------------------- 3 files changed, 11 insertions(+), 48 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 3bece4379bd9..b77197d941fc 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -11,8 +11,6 @@ #include #include -#define __ARCH_HAS_DO_SOFTIRQ - struct pt_regs; DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 0667fb7d8bb1..c0db321db7e1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -181,19 +181,20 @@ alternative_endif .macro irq_stack_entry mov x19, sp // preserve the original sp - this_cpu_ptr irq_stack, x25, x26 - /* - * Check the lowest address on irq_stack for the irq_count value, - * incremented by do_softirq_own_stack if we have re-enabled irqs - * while on the irq_stack. + * Compare sp with the current thread_info, if the top + * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and + * should switch to the irq stack. */ - ldr x26, [x25] - cbnz x26, 9998f // recursive use? + and x25, x19, #~(THREAD_SIZE - 1) + cmp x25, tsk + b.ne 9998f - /* switch to the irq stack */ + this_cpu_ptr irq_stack, x25, x26 mov x26, #IRQ_STACK_START_SP add x26, x25, x26 + + /* switch to the irq stack */ mov sp, x26 /* @@ -405,10 +406,10 @@ el1_irq: bl trace_hardirqs_off #endif + get_thread_info tsk irq_handler #ifdef CONFIG_PREEMPT - get_thread_info tsk ldr w24, [tsk, #TI_PREEMPT] // get preempt count cbnz w24, 1f // preempt count != 0 ldr x0, [tsk, #TI_FLAGS] // get flags diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index ff7ebb710e51..2386b26c0712 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -25,24 +25,14 @@ #include #include #include -#include #include #include unsigned long irq_err_count; -/* - * irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. - * irq_stack[0] is used as irq_count, a non-zero value indicates the stack - * is in use, and el?_irq() shouldn't switch to it. This is used to detect - * recursive use of the irq_stack, it is lazily updated by - * do_softirq_own_stack(), which is called on the irq_stack, before - * re-enabling interrupts to process softirqs. - */ +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */ DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); -#define IRQ_COUNT() (*per_cpu(irq_stack, smp_processor_id())) - int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); @@ -66,29 +56,3 @@ void __init init_IRQ(void) if (!handle_arch_irq) panic("No interrupt controller found."); } - -/* - * do_softirq_own_stack() is called from irq_exit() before __do_softirq() - * re-enables interrupts, at which point we may re-enter el?_irq(). We - * increase irq_count here so that el1_irq() knows that it is already on the - * irq stack. - * - * Called with interrupts disabled, so we don't worry about moving cpu, or - * being interrupted while modifying irq_count. - * - * This function doesn't actually switch stack. 
- */ -void do_softirq_own_stack(void) -{ - int cpu = smp_processor_id(); - - WARN_ON_ONCE(!irqs_disabled()); - - if (on_irq_stack(current_stack_pointer, cpu)) { - IRQ_COUNT()++; - __do_softirq(); - IRQ_COUNT()--; - } else { - __do_softirq(); - } -} -- cgit v1.2.3 From 76f2d0af233200abc487122d0002f3c14d796676 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:39 +0900 Subject: arm64: ftrace: modify a stack frame in a safe way Function graph tracer modifies a return address (LR) in a stack frame by calling ftrace_prepare_return() in a traced function's function prologue. The current code does this modification before preserving an original address at ftrace_push_return_trace() and there is always a small window of inconsistency when an interrupt occurs. This doesn't matter, as far as an interrupt stack is introduced, because stack tracer won't be invoked in an interrupt context. But it would be better to proactively minimize such a window by moving the LR modification after ftrace_push_return_trace(). Signed-off-by: AKASHI Takahiro Signed-off-by: Will Deacon (cherry picked from commit 79fdee9b6355c9720f14717e1ad66af51bb331b5) Signed-off-by: Alex Shi --- arch/arm64/kernel/ftrace.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 8f7005bc35bd..ebecf9aa33d1 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -129,23 +129,20 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, * on other archs. It's unlikely on AArch64. */ old = *parent; - *parent = return_hooker; trace.func = self_addr; trace.depth = current->curr_ret_stack + 1; /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) { - *parent = old; + if (!ftrace_graph_entry(&trace)) return; - } err = ftrace_push_return_trace(old, self_addr, &trace.depth, frame_pointer); - if (err == -EBUSY) { - *parent = old; + if (err == -EBUSY) return; - } + else + *parent = return_hooker; } #ifdef CONFIG_DYNAMIC_FTRACE -- cgit v1.2.3 From 30e9fa2678d1ab96894aa1ee670c5a804fe5a29b Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:40 +0900 Subject: arm64: pass a task parameter to unwind_frame() Function graph tracer modifies a return address (LR) in a stack frame to hook a function's return. This will result in many useless entries (return_to_handler) showing up in a call stack list. We will fix this problem in a later patch ("arm64: ftrace: fix a stack tracer's output under function graph tracer"). But since real return addresses are saved in ret_stack[] array in struct task_struct, unwind functions need to be notified of, in addition to a stack pointer address, which task is being traced in order to find out real return addresses. This patch extends unwind functions' interfaces by adding an extra argument of a pointer to task_struct. 
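A caller-side sketch of the extended interface (simplified from the return_address.c hunk below):

    struct stackframe frame = {
        .fp = (unsigned long)__builtin_frame_address(0),
        .sp = current_stack_pointer,
        .pc = (unsigned long)return_address,    /* dummy */
    };

    /* The traced task is now passed explicitly, so a later patch can
     * consult its ret_stack[] while unwinding. */
    walk_stackframe(current, &frame, save_return_addr, &data);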
Signed-off-by: AKASHI Takahiro Signed-off-by: Will Deacon (cherry picked from commit fe13f95b720075327a761fe6ddb45b0c90cab504) Signed-off-by: Alex Shi --- arch/arm64/include/asm/stacktrace.h | 6 ++++-- arch/arm64/kernel/perf_callchain.c | 2 +- arch/arm64/kernel/process.c | 2 +- arch/arm64/kernel/return_address.c | 2 +- arch/arm64/kernel/stacktrace.c | 8 ++++---- arch/arm64/kernel/time.c | 2 +- arch/arm64/kernel/traps.c | 2 +- 7 files changed, 13 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 7318f6d54aa9..6fb61c5090b4 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -16,14 +16,16 @@ #ifndef __ASM_STACKTRACE_H #define __ASM_STACKTRACE_H +struct task_struct; + struct stackframe { unsigned long fp; unsigned long sp; unsigned long pc; }; -extern int unwind_frame(struct stackframe *frame); -extern void walk_stackframe(struct stackframe *frame, +extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); +extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data); #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 3aa74830cc69..797220da912b 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -165,7 +165,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, frame.sp = regs->sp; frame.pc = regs->pc; - walk_stackframe(&frame, callchain_trace, entry); + walk_stackframe(current, &frame, callchain_trace, entry); } unsigned long perf_instruction_pointer(struct pt_regs *regs) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index f75b540bc3b4..98bf5461d4b6 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -348,7 +348,7 @@ unsigned long get_wchan(struct task_struct *p) do { if (frame.sp < stack_page || frame.sp >= stack_page + THREAD_SIZE || - unwind_frame(&frame)) + unwind_frame(p, &frame)) return 0; if (!in_sched_functions(frame.pc)) return frame.pc; diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 6c4fd2810ecb..07b37ac05be4 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -44,7 +44,7 @@ void *return_address(unsigned int level) frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ - walk_stackframe(&frame, save_return_addr, &data); + walk_stackframe(current, &frame, save_return_addr, &data); if (!data.level) return data.addr; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b9fd3a8abfc1..f7ee597ec883 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -36,7 +36,7 @@ * ldp x29, x30, [sp] * add sp, sp, #0x10 */ -int notrace unwind_frame(struct stackframe *frame) +int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; @@ -99,7 +99,7 @@ int notrace unwind_frame(struct stackframe *frame) return 0; } -void notrace walk_stackframe(struct stackframe *frame, +void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data) { while (1) { @@ -107,7 +107,7 @@ void notrace walk_stackframe(struct stackframe *frame, if (fn(frame, data)) break; - ret = unwind_frame(frame); + ret = 
unwind_frame(tsk, frame); if (ret < 0) break; } @@ -159,7 +159,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) frame.pc = (unsigned long)save_stack_trace_tsk; } - walk_stackframe(&frame, save_trace, &data); + walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 13339b6ffc1a..6e5c521f123a 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -53,7 +53,7 @@ unsigned long profile_pc(struct pt_regs *regs) frame.sp = regs->sp; frame.pc = regs->pc; do { - int ret = unwind_frame(&frame); + int ret = unwind_frame(NULL, &frame); if (ret < 0) return 0; } while (in_lock_functions(frame.pc)); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8a0084541f84..937008523fa5 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -177,7 +177,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) int ret; dump_backtrace_entry(where); - ret = unwind_frame(&frame); + ret = unwind_frame(tsk, &frame); if (ret < 0) break; stack = frame.sp; -- cgit v1.2.3 From 1f18836b775f8f041dcad620c1b5d18c22c25c06 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:41 +0900 Subject: arm64: ftrace: fix a stack tracer's output under function graph tracer Function graph tracer modifies a return address (LR) in a stack frame to hook a function return. This will result in many useless entries (return_to_handler) showing up in a) a stack tracer's output b) perf call graph (with perf record -g) c) dump_backtrace (at panic et al.) For example, in case of a), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ echo 1 > /proc/sys/kernel/stack_trace_enabled $ cat /sys/kernel/debug/tracing/stack_trace Depth Size Location (54 entries) ----- ---- -------- 0) 4504 16 gic_raise_softirq+0x28/0x150 1) 4488 80 smp_cross_call+0x38/0xb8 2) 4408 48 return_to_handler+0x0/0x40 3) 4360 32 return_to_handler+0x0/0x40 ... In case of b), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ perf record -e mem:XXX:x -ag -- sleep 10 $ perf report ... | | |--0.22%-- 0x550f8 | | | 0x10888 | | | el0_svc_naked | | | sys_openat | | | return_to_handler | | | return_to_handler ... In case of c), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ echo c > /proc/sysrq-trigger ... Call trace: [] sysrq_handle_crash+0x24/0x30 [] return_to_handler+0x0/0x40 [] return_to_handler+0x0/0x40 ... This patch replaces such entries with real addresses preserved in current->ret_stack[] at unwind_frame(). This way, we can cover all the cases. 
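The heart of the fix, condensed from the stacktrace.c hunk below:

    #ifdef CONFIG_FUNCTION_GRAPH_TRACER
        /*
         * A frame whose saved LR was hooked by the graph tracer unwinds
         * to return_to_handler; substitute the real return address
         * stashed in the task's ret_stack[], consuming one entry per
         * hooked frame.
         */
        if (tsk && tsk->ret_stack &&
            (frame->pc == (unsigned long)return_to_handler))
            frame->pc = tsk->ret_stack[frame->graph--].ret;
    #endif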
Reviewed-by: Jungseok Lee Signed-off-by: AKASHI Takahiro [will: fixed minor context changes conflicting with irq stack bits] Signed-off-by: Will Deacon (cherry picked from commit 20380bb390a443b2c5c8800cec59743faf8151b4) Signed-off-by: Alex Shi --- arch/arm64/include/asm/ftrace.h | 2 ++ arch/arm64/include/asm/stacktrace.h | 3 +++ arch/arm64/kernel/perf_callchain.c | 3 +++ arch/arm64/kernel/process.c | 3 +++ arch/arm64/kernel/return_address.c | 3 +++ arch/arm64/kernel/stacktrace.c | 17 +++++++++++++++++ arch/arm64/kernel/time.c | 3 +++ arch/arm64/kernel/traps.c | 26 ++++++++++++++++++++------ 8 files changed, 54 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index c5534facf941..3c60f37e48ab 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -28,6 +28,8 @@ struct dyn_arch_ftrace { extern unsigned long ftrace_graph_call; +extern void return_to_handler(void); + static inline unsigned long ftrace_call_adjust(unsigned long addr) { /* diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 6fb61c5090b4..801a16dbbdf6 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -22,6 +22,9 @@ struct stackframe { unsigned long fp; unsigned long sp; unsigned long pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned int graph; +#endif }; extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 797220da912b..ff4665462a02 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -164,6 +164,9 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, frame.fp = regs->regs[29]; frame.sp = regs->sp; frame.pc = regs->pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = current->curr_ret_stack; +#endif walk_stackframe(current, &frame, callchain_trace, entry); } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 98bf5461d4b6..88d742ba19d5 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -344,6 +344,9 @@ unsigned long get_wchan(struct task_struct *p) frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = p->curr_ret_stack; +#endif stack_page = (unsigned long)task_stack_page(p); do { if (frame.sp < stack_page || diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 07b37ac05be4..1718706fde83 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -43,6 +43,9 @@ void *return_address(unsigned int level) frame.fp = (unsigned long)__builtin_frame_address(0); frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = current->curr_ret_stack; +#endif walk_stackframe(current, &frame, save_return_addr, &data); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index f7ee597ec883..4fad9787ab46 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -17,6 +17,7 @@ */ #include #include +#include #include #include @@ -66,6 +67,19 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->fp = *(unsigned long *)(fp); frame->pc = *(unsigned long *)(fp + 8); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (tsk && 
tsk->ret_stack && + (frame->pc == (unsigned long)return_to_handler)) { + /* + * This is a case where function graph tracer has + * modified a return address (LR) in a stack frame + * to hook a function return. + * So replace it to an original value. + */ + frame->pc = tsk->ret_stack[frame->graph--].ret; + } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Check whether we are going to walk through from interrupt stack * to task stack. @@ -158,6 +172,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) frame.sp = current_stack_pointer; frame.pc = (unsigned long)save_stack_trace_tsk; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = tsk->curr_ret_stack; +#endif walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 6e5c521f123a..59779699a1a4 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -52,6 +52,9 @@ unsigned long profile_pc(struct pt_regs *regs) frame.fp = regs->regs[29]; frame.sp = regs->sp; frame.pc = regs->pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = -1; /* no task info */ +#endif do { int ret = unwind_frame(NULL, &frame); if (ret < 0) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 937008523fa5..bdc293f6adc4 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -147,17 +147,14 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + int skip; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); if (!tsk) tsk = current; - if (regs) { - frame.fp = regs->regs[29]; - frame.sp = regs->sp; - frame.pc = regs->pc; - } else if (tsk == current) { + if (tsk == current) { frame.fp = (unsigned long)__builtin_frame_address(0); frame.sp = current_stack_pointer; frame.pc = (unsigned long)dump_backtrace; @@ -169,14 +166,31 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = tsk->curr_ret_stack; +#endif + skip = !!regs; pr_emerg("Call trace:\n"); while (1) { unsigned long where = frame.pc; unsigned long stack; int ret; - dump_backtrace_entry(where); + /* skip until specified stack frame */ + if (!skip) { + dump_backtrace_entry(where); + } else if (frame.fp == regs->regs[29]) { + skip = 0; + /* + * Mostly, this is the case where this function is + * called in panic/abort. As exception handler's + * stack frame does not contain the corresponding pc + * at which an exception has taken place, use regs->pc + * instead. + */ + dump_backtrace_entry(regs->pc); + } ret = unwind_frame(tsk, &frame); if (ret < 0) break; -- cgit v1.2.3 From d2b08280e2c1f40fa209aaaaf86c9b730de28204 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 21 Dec 2015 16:44:27 +0000 Subject: arm64: traps: address fallout from printk -> pr_* conversion Commit ac7b406c1a9d ("arm64: Use pr_* instead of printk") was a fairly mindless s/printk/pr_*/ change driven by a complaint from checkpatch. As is usual with such changes, this has led to some odd behaviour on arm64: * syslog now picks up the "pr_emerg" line from dump_backtrace, but not the actual trace, which leads to a bunch of "kernel:Call trace:" lines in the log * __{pte,pmd,pgd}_error print at KERN_CRIT, as opposed to KERN_ERR which is used by other architectures. 
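The first of these can be sketched as follows (illustrative of the behaviour, not code from the patch):

    pr_emerg("Call trace:\n");      /* KERN_EMERG, always logged       */
    dump_backtrace_entry(where);    /* default loglevel, so a console
                                     * or syslog filter can drop the
                                     * entries but keep the header     */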
This patch restores the original printk behaviour for dump_backtrace and downgrade the pgtable error macros to KERN_ERR. Signed-off-by: Will Deacon (cherry picked from commit c9cd0ed925c0b927283d4739bfe689eb9d1e9dfd) Signed-off-by: Alex Shi --- arch/arm64/kernel/traps.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index bdc293f6adc4..cbedd724f48e 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -171,7 +171,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) #endif skip = !!regs; - pr_emerg("Call trace:\n"); + printk("Call trace:\n"); while (1) { unsigned long where = frame.pc; unsigned long stack; @@ -482,22 +482,22 @@ asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr) void __pte_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pte %016lx.\n", file, line, val); + pr_err("%s:%d: bad pte %016lx.\n", file, line, val); } void __pmd_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pmd %016lx.\n", file, line, val); + pr_err("%s:%d: bad pmd %016lx.\n", file, line, val); } void __pud_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pud %016lx.\n", file, line, val); + pr_err("%s:%d: bad pud %016lx.\n", file, line, val); } void __pgd_error(const char *file, int line, unsigned long val) { - pr_crit("%s:%d: bad pgd %016lx.\n", file, line, val); + pr_err("%s:%d: bad pgd %016lx.\n", file, line, val); } /* GENERIC_BUG traps */ -- cgit v1.2.3 From 12037cecc2cfc56e01851c0fbf605be9e05bbe95 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2016 10:18:51 +0100 Subject: arm64: module: fix relocation of movz instruction with negative immediate The test whether a movz instruction with a signed immediate should be turned into a movn instruction (i.e., when the immediate is negative) is flawed, since the value of imm is always positive. Also, the subsequent bounds check is incorrect since the limit update never executes, due to the fact that the imm_type comparison will always be false for negative signed immediates. Let's fix this by performing the sign test on sval directly, and replacing the bounds check with a simple comparison against U16_MAX. 
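In outline, the corrected logic for the signed MOVW relocations is (a sketch; encode_movz()/encode_movn() are hypothetical helpers standing in for the opcode fixups in the hunk below):

    s64 sval = do_reloc(op, place, val);
    u64 imm  = sval >> lsb;             /* field to be encoded */

    if (sval >= 0) {
        insn = encode_movz(insn, imm);  /* non-negative: MOVZ */
    } else {
        imm = ~imm;                     /* MOVN stores the complement */
        insn = encode_movn(insn, imm);
    }
    if (imm > U16_MAX)                  /* single, simple bounds check */
        return -ERANGE;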
Signed-off-by: Ard Biesheuvel [will: tidied up use of sval, renamed MOVK enum value to MOVKZ] Signed-off-by: Will Deacon (cherry picked from commit b24a557527f97ad88619d5bd4c8017c635056d69) Signed-off-by: Alex Shi --- arch/arm64/kernel/module.c | 51 ++++++++++++++++------------------------------ 1 file changed, 18 insertions(+), 33 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index f4bc779e62e8..03464ab0fff2 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -30,9 +30,6 @@ #include #include -#define AARCH64_INSN_IMM_MOVNZ AARCH64_INSN_IMM_MAX -#define AARCH64_INSN_IMM_MOVK AARCH64_INSN_IMM_16 - void *module_alloc(unsigned long size) { void *p; @@ -110,16 +107,20 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) return 0; } +enum aarch64_insn_movw_imm_type { + AARCH64_INSN_IMM_MOVNZ, + AARCH64_INSN_IMM_MOVKZ, +}; + static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, - int lsb, enum aarch64_insn_imm_type imm_type) + int lsb, enum aarch64_insn_movw_imm_type imm_type) { - u64 imm, limit = 0; + u64 imm; s64 sval; u32 insn = le32_to_cpu(*(u32 *)place); sval = do_reloc(op, place, val); - sval >>= lsb; - imm = sval & 0xffff; + imm = sval >> lsb; if (imm_type == AARCH64_INSN_IMM_MOVNZ) { /* @@ -128,7 +129,7 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, * immediate is less than zero. */ insn &= ~(3 << 29); - if ((s64)imm >= 0) { + if (sval >= 0) { /* >=0: Set the instruction to MOVZ (opcode 10b). */ insn |= 2 << 29; } else { @@ -140,29 +141,13 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val, */ imm = ~imm; } - imm_type = AARCH64_INSN_IMM_MOVK; } /* Update the instruction with the new encoding. */ - insn = aarch64_insn_encode_immediate(imm_type, insn, imm); + insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm); *(u32 *)place = cpu_to_le32(insn); - /* Shift out the immediate field. */ - sval >>= 16; - - /* - * For unsigned immediates, the overflow check is straightforward. - * For signed immediates, the sign bit is actually the bit past the - * most significant bit of the field. - * The AARCH64_INSN_IMM_16 immediate type is unsigned. - */ - if (imm_type != AARCH64_INSN_IMM_16) { - sval++; - limit++; - } - - /* Check the upper bits depending on the sign of the immediate. */ - if ((u64)sval > limit) + if (imm > U16_MAX) return -ERANGE; return 0; @@ -267,25 +252,25 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, overflow_check = false; case R_AARCH64_MOVW_UABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G1_NC: overflow_check = false; case R_AARCH64_MOVW_UABS_G1: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G2_NC: overflow_check = false; case R_AARCH64_MOVW_UABS_G2: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_UABS_G3: /* We're using the top bits so we can't overflow. 
*/ overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, - AARCH64_INSN_IMM_16); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_SABS_G0: ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, @@ -302,7 +287,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_MOVW_PREL_G0_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, - AARCH64_INSN_IMM_MOVK); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_PREL_G0: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, @@ -311,7 +296,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_MOVW_PREL_G1_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, - AARCH64_INSN_IMM_MOVK); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_PREL_G1: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, @@ -320,7 +305,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_MOVW_PREL_G2_NC: overflow_check = false; ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, - AARCH64_INSN_IMM_MOVK); + AARCH64_INSN_IMM_MOVKZ); break; case R_AARCH64_MOVW_PREL_G2: ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32, -- cgit v1.2.3 From 3fd9316702a82b498fcf7055f9781589ea6c1e1c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Jan 2016 10:18:52 +0100 Subject: arm64: module: avoid undefined shift behavior in reloc_data() Compilers may engage the improbability drive when encountering shifts by a distance that is a multiple of the size of the operand type. Since the required bounds check is very simple here, we can get rid of all the fuzzy masking, shifting and comparing, and use the documented bounds directly. Reported-by: David Binderman Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon (cherry picked from commit f930896967fa3f9ab16a6f87267b92798308d48f) Signed-off-by: Alex Shi --- arch/arm64/kernel/module.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 03464ab0fff2..93e970231ca9 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -72,15 +72,18 @@ static u64 do_reloc(enum aarch64_reloc_op reloc_op, void *place, u64 val) static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) { - u64 imm_mask = (1 << len) - 1; s64 sval = do_reloc(op, place, val); switch (len) { case 16: *(s16 *)place = sval; + if (sval < S16_MIN || sval > U16_MAX) + return -ERANGE; break; case 32: *(s32 *)place = sval; + if (sval < S32_MIN || sval > U32_MAX) + return -ERANGE; break; case 64: *(s64 *)place = sval; @@ -89,21 +92,6 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len) pr_err("Invalid length (%d) for data relocation\n", len); return 0; } - - /* - * Extract the upper value bits (including the sign bit) and - * shift them to bit 0. - */ - sval = (s64)(sval & ~(imm_mask >> 1)) >> (len - 1); - - /* - * Overflow has occurred if the value is not representable in - * len bits (i.e the bottom len bits are not sign-extended and - * the top bits are not all zero). - */ - if ((u64)(sval + 1) > 2) - return -ERANGE; - return 0; } -- cgit v1.2.3 From 87e4c1f363cfd0ed3a673d47f229725a6b0946d7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 5 Jan 2016 15:36:59 +0000 Subject: arm64: mm: move pgd_cache initialisation to pgtable_cache_init Initialising the suppport for EFI runtime services requires us to allocate a pgd off the back of an early_initcall. 
On systems where the PGD_SIZE is smaller than PAGE_SIZE (e.g. 64k pages and 48-bit VA), the pgd_cache isn't initialised at this stage, and we panic with a NULL dereference during boot: Unable to handle kernel NULL pointer dereference at virtual address 00000000 __create_mapping.isra.5+0x84/0x350 create_pgd_mapping+0x20/0x28 efi_create_mapping+0x5c/0x6c arm_enable_runtime_services+0x154/0x1e4 do_one_initcall+0x8c/0x190 kernel_init_freeable+0x84/0x1ec kernel_init+0x10/0xe0 ret_from_fork+0x10/0x50 This patch fixes the problem by initialising the pgd_cache earlier, in the pgtable_cache_init callback, which sounds suspiciously like what it was intended for. Reported-by: Dennis Chen Signed-off-by: Will Deacon (cherry picked from commit 39b5be9b4233a9f212b98242bddf008f379b5122) Signed-off-by: Alex Shi --- arch/arm64/include/asm/pgtable.h | 3 ++- arch/arm64/mm/pgd.c | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index fd3d7c177c5f..76ff5d93c6c3 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -682,7 +682,8 @@ extern int kern_addr_valid(unsigned long addr); #include -#define pgtable_cache_init() do { } while (0) +void pgd_cache_init(void); +#define pgtable_cache_init pgd_cache_init /* * On AArch64, the cache coherency is handled via the set_pte_at() function. diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index cb3ba1b812e7..ae11d4e03d0e 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -46,14 +46,14 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } -static int __init pgd_cache_init(void) +void __init pgd_cache_init(void) { + if (PGD_SIZE == PAGE_SIZE) + return; + /* * Naturally aligned pgds required by the architecture. */ - if (PGD_SIZE != PAGE_SIZE) - pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE, - SLAB_PANIC, NULL); - return 0; + pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE, + SLAB_PANIC, NULL); } -core_initcall(pgd_cache_init); -- cgit v1.2.3 From ec567e8f53127f17d757c1ac30cfe09db6d1ea66 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 5 Jan 2016 17:33:34 +0000 Subject: arm64: entry: remove pointless SPSR mode check In work_pending, we may skip work if the stacked SPSR value represents anything other than an EL0 context. We then immediately invoke the kernel_exit 0 macro as part of ret_to_user, assuming a return to EL0. This is somewhat confusing. We use work_pending as part of the ret_to_user/ret_fast_syscall state machine. We only use ret_fast_syscall in the return from an SVC issued from EL0. We use ret_to_user for return from EL0 exception handlers and also for return from ret_from_fork in the case the task was not a kernel thread (i.e. it is a user task). Thus in all cases the stacked SPSR value must represent an EL0 context, and the check is redundant. This patch removes it, along with the now unused no_work_pending label. 
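In C terms, the deleted test amounted to the following (an illustrative sketch of the dead code, not taken from the patch):

    /* work_pending is only reached on the ret_to_user path, where the
     * stacked SPSR always describes an EL0 context, so this branch
     * could never be taken: */
    if (regs->pstate & PSR_MODE_MASK)   /* returning to the kernel? */
        goto no_work_pending;           /* dead code */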
Cc: Chris Metcalf Acked-by: Catalin Marinas Signed-off-by: Mark Rutland Signed-off-by: Will Deacon (cherry picked from commit ee03353bc04f8e460cc4e3da80d9721d9ecb89f1) Signed-off-by: Alex Shi --- arch/arm64/kernel/entry.S | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index c0db321db7e1..1f7f5a2b61bf 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -676,10 +676,7 @@ ret_fast_syscall_trace: work_pending: tbnz x1, #TIF_NEED_RESCHED, work_resched /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */ - ldr x2, [sp, #S_PSTATE] mov x0, sp // 'regs' - tst x2, #PSR_MODE_MASK // user mode regs? - b.ne no_work_pending // returning to kernel enable_irq // enable interrupts for do_notify_resume() bl do_notify_resume b ret_to_user @@ -698,7 +695,6 @@ ret_to_user: and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending enable_step_tsk x1, x2 -no_work_pending: kernel_exit 0 ENDPROC(ret_to_user) -- cgit v1.2.3 From 1ebc63c2d5ebd9578b625a049b6f83e9181caf8d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 6 Jan 2016 11:05:27 +0000 Subject: arm64: head.S: use memset to clear BSS Currently we use an open-coded memzero to clear the BSS. As it is a trivial implementation, it is sub-optimal. Our optimised memset doesn't use the stack, is position-independent, and for the memzero case can use of DC ZVA to clear large blocks efficiently. In __mmap_switched the MMU is on and there are no live caller-saved registers, so we can safely call an uninstrumented memset. This patch changes __mmap_switched to use memset when clearing the BSS. We use the __pi_memset alias so as to avoid any instrumentation in all kernel configurations. Cc: Catalin Marinas Cc: Marc Zyngier Reviewed-by: Ard Biesheuvel Signed-off-by: Mark Rutland Signed-off-by: Will Deacon (cherry picked from commit 2a803c4db615d85126c5c7afd5849a3cfde71422) Signed-off-by: Alex Shi --- arch/arm64/kernel/head.S | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 17ce7285bb12..917d98108b3f 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -415,14 +415,13 @@ ENDPROC(__create_page_tables) */ .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: - adr_l x6, __bss_start - adr_l x7, __bss_stop - -1: cmp x6, x7 - b.hs 2f - str xzr, [x6], #8 // Clear BSS - b 1b -2: + // Clear BSS + adr_l x0, __bss_start + mov x1, xzr + adr_l x2, __bss_stop + sub x2, x2, x0 + bl __pi_memset + adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) -- cgit v1.2.3 From 32b06020f36dd2dcfd7832ffd34a84f254b14e46 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 15 Jan 2016 13:28:57 +0100 Subject: arm64: hide __efistub_ aliases from kallsyms Commit e8f3010f7326 ("arm64/efi: isolate EFI stub from the kernel proper") isolated the EFI stub code from the kernel proper by prefixing all of its symbols with __efistub_, and selectively allowing access to core kernel symbols from the stub by emitting __efistub_ aliases for functions and variables that the stub can access legally. As an unintended side effect, these aliases are emitted into the kallsyms symbol table, which means they may turn up in backtraces, e.g., ... PC is at __efistub_memset+0x108/0x200 LR is at fixup_init+0x3c/0x48 ... 
[] __efistub_memset+0x108/0x200 [] free_initmem+0x2c/0x40 [] kernel_init+0x20/0xe0 [] ret_from_fork+0x10/0x40 The backtrace in question has nothing to do with the EFI stub, but simply returns one of the several aliases of memset() that have been recorded in the kallsyms table. This is undesirable, since it may suggest to people who are not aware of this that the issue they are seeing is somehow EFI related. So hide the __efistub_ aliases from kallsyms, by emitting them as absolute linker symbols explicitly. The distinction between those and section relative symbols is completely irrelevant to these definitions, and to the final link we are performing when these definitions are being taken into account (the distinction is only relevant to symbols defined inside a section definition when performing a partial link), and so the resulting values are identical to the original ones. Since absolute symbols are ignored by kallsyms, this will result in these values to be omitted from its symbol table. After this patch, the backtrace generated from the same address looks like this: ... PC is at __memset+0x108/0x200 LR is at fixup_init+0x3c/0x48 ... [] __memset+0x108/0x200 [] free_initmem+0x2c/0x40 [] kernel_init+0x20/0xe0 [] ret_from_fork+0x10/0x40 Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon (cherry picked from commit 75feee3d9d51775072d3a04f47d4a439a4c4590e) Signed-off-by: Alex Shi --- arch/arm64/kernel/image.h | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index bc2abb8b1599..999633bd7294 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -64,6 +64,16 @@ #ifdef CONFIG_EFI +/* + * Prevent the symbol aliases below from being emitted into the kallsyms + * table, by forcing them to be absolute symbols (which are conveniently + * ignored by scripts/kallsyms) rather than section relative symbols. + * The distinction is only relevant for partial linking, and only for symbols + * that are defined within a section declaration (which is not the case for + * the definitions below) so the resulting values will be identical. + */ +#define KALLSYMS_HIDE(sym) ABSOLUTE(sym) + /* * The EFI stub has its own symbol namespace prefixed by __efistub_, to * isolate it from the kernel proper. The following symbols are legally @@ -73,25 +83,25 @@ * linked at. 
The routines below are all implemented in assembler in a * position independent manner */ -__efistub_memcmp = __pi_memcmp; -__efistub_memchr = __pi_memchr; -__efistub_memcpy = __pi_memcpy; -__efistub_memmove = __pi_memmove; -__efistub_memset = __pi_memset; -__efistub_strlen = __pi_strlen; -__efistub_strcmp = __pi_strcmp; -__efistub_strncmp = __pi_strncmp; -__efistub___flush_dcache_area = __pi___flush_dcache_area; +__efistub_memcmp = KALLSYMS_HIDE(__pi_memcmp); +__efistub_memchr = KALLSYMS_HIDE(__pi_memchr); +__efistub_memcpy = KALLSYMS_HIDE(__pi_memcpy); +__efistub_memmove = KALLSYMS_HIDE(__pi_memmove); +__efistub_memset = KALLSYMS_HIDE(__pi_memset); +__efistub_strlen = KALLSYMS_HIDE(__pi_strlen); +__efistub_strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub_strncmp = KALLSYMS_HIDE(__pi_strncmp); +__efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); #ifdef CONFIG_KASAN -__efistub___memcpy = __pi_memcpy; -__efistub___memmove = __pi_memmove; -__efistub___memset = __pi_memset; +__efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); +__efistub___memmove = KALLSYMS_HIDE(__pi_memmove); +__efistub___memset = KALLSYMS_HIDE(__pi_memset); #endif -__efistub__text = _text; -__efistub__end = _end; -__efistub__edata = _edata; +__efistub__text = KALLSYMS_HIDE(_text); +__efistub__end = KALLSYMS_HIDE(_end); +__efistub__edata = KALLSYMS_HIDE(_edata); #endif -- cgit v1.2.3 From b87cf8adbe0d3b1998a7dafd01b70e9b118f641d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 15 Jan 2016 16:55:37 -0800 Subject: arch/arm64/include/asm/pgtable.h: add pmd_mkclean for THP MADV_FREE needs pmd_dirty and pmd_mkclean for detecting recent overwrite of the contents since MADV_FREE syscall is called for THP page. This patch adds pmd_mkclean for THP page MADV_FREE support. Signed-off-by: Minchan Kim Cc: "James E.J. Bottomley" Cc: "Kirill A. Shutemov" Cc: Shaohua Li Cc: Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Gang Cc: Chris Zankel Cc: Daniel Micay Cc: Darrick J. Wong Cc: David S. Miller Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: Jason Evans Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Kirill A. 
Shutemov Cc: Matt Turner Cc: Max Filippov Cc: Mel Gorman Cc: Michael Kerrisk Cc: Michal Hocko Cc: Mika Penttil Cc: Ralf Baechle Cc: Richard Henderson Cc: Rik van Riel Cc: Roland Dreier Cc: Russell King Cc: Shaohua Li Cc: Will Deacon Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 05ee26d9e7e29ab026995eab79be3c6e8351908c) Signed-off-by: Alex Shi --- arch/arm64/include/asm/pgtable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 76ff5d93c6c3..2daf88970731 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -369,6 +369,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, #define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd))) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) #define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) +#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK)) -- cgit v1.2.3 From 035fdc46d48ae8a7cbf7199c74bac1de36cca626 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 14:50:21 +0100 Subject: arm64: kasan: ensure that the KASAN zero page is mapped read-only When switching from the early KASAN shadow region, which maps the entire shadow space read-write, to the permanent KASAN shadow region, which uses a zero page to shadow regions that are not subject to instrumentation, the lowest level table kasan_zero_pte[] may be reused unmodified, which means that the mappings of the zero page that it contains will still be read-write. So update it explicitly to map the zero page read only when we activate the permanent mapping. Acked-by: Andrey Ryabinin Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon (cherry picked from commit 7b1af9795773d745c2a8c7d4ca5f2936e8b6adfb) Signed-off-by: Alex Shi --- arch/arm64/mm/kasan_init.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cf038c7d9fa9..cab7a5be40aa 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -120,6 +120,7 @@ static void __init cpu_set_ttbr1(unsigned long ttbr1) void __init kasan_init(void) { struct memblock_region *reg; + int i; /* * We are going to perform proper setup of shadow memory. @@ -155,6 +156,14 @@ void __init kasan_init(void) pfn_to_nid(virt_to_pfn(start))); } + /* + * KAsan may reuse the contents of kasan_zero_pte directly, so we + * should make sure that it maps the zero page read-only. + */ + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(&kasan_zero_pte[i], + pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO)); + memset(kasan_zero_page, 0, PAGE_SIZE); cpu_set_ttbr1(__pa(swapper_pg_dir)); flush_tlb_all(); -- cgit v1.2.3 From 8c226342ae45c7a2029af59cf81edc9147a60d56 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 24 Jan 2016 15:24:12 +0900 Subject: arm64: Fix an enum typo in mm/dump.c This patch fixes a typo in mm/dump.c: "MODUELS_END_NR" should be "MODULES_END_NR". 
Signed-off-by: Masanari Iida Signed-off-by: Will Deacon (cherry picked from commit b3122023df935cf14bf951da98ca598d71b9f826) Signed-off-by: Alex Shi --- arch/arm64/mm/dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 5a22a119a74c..0adbebbc2803 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -46,7 +46,7 @@ enum address_markers_idx { PCI_START_NR, PCI_END_NR, MODULES_START_NR, - MODUELS_END_NR, + MODULES_END_NR, KERNEL_SPACE_NR, }; -- cgit v1.2.3 From d2779548f7c7156686d4d88ce4ae904460952a8c Mon Sep 17 00:00:00 2001 From: William Cohen Date: Thu, 21 Jan 2016 22:56:26 -0500 Subject: Eliminate the .eh_frame sections from the aarch64 vmlinux and kernel modules By default the aarch64 gcc generates .eh_frame sections. Unlike .debug_frame sections, the .eh_frame sections are loaded into memory when the associated code is loaded. On an example kernel being built with this default the .eh_frame section in vmlinux used an extra 1.7MB of memory. The x86 disables the creation of the .eh_frame section. The aarch64 should probably do the same to save some memory. Signed-off-by: William Cohen Signed-off-by: Will Deacon (cherry picked from commit 728dabd6d1751cf5e0f8e0535891393da62396e9) Signed-off-by: Alex Shi Conflicts: pick 67dfa1751 arm64: errata: Add -mpc-relative-literal-loads in arch/arm64/Makefile --- arch/arm64/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index b6c90e5006e4..548a2939d7e6 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -28,6 +28,7 @@ endif KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) KBUILD_CFLAGS += $(call cc-option, -mpc-relative-literal-loads) +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_AFLAGS += $(lseinstr) ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) -- cgit v1.2.3 From c8403d828ed9741a4a9d820c829d71211685b659 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:56 +0000 Subject: arm64: mm: specialise pagetable allocators We pass a size parameter to early_alloc and late_alloc, but these are only ever used to allocate single pages. In late_alloc we always allocate a single page. Both allocators provide us with zeroed pages (such that all entries are invalid), but we have no barriers between allocating a page and adding that page to existing (live) tables. A concurrent page table walk may see stale data, leading to a number of issues. This patch specialises the two allocators for page tables. The size parameter is removed and the necessary dsb(ishst) is folded into each. To make it clear that the functions are intended for use for page table allocation, they are renamed to {early,late}_pgtable_alloc, with the related function pointed renamed to pgtable_alloc. As the dsb(ishst) is now in the allocator, the existing barrier for the zero page is redundant and thus is removed. The previously missing include of barrier.h is added. 
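After the change the late allocator reads as follows, in sketch form (mirrors the mmu.c hunk below):

    static void *late_pgtable_alloc(void)
    {
        /* Page tables are always exactly one zeroed page. */
        void *ptr = (void *)__get_free_page(PGALLOC_GFP);
        BUG_ON(!ptr);

        /* Publish the zeroed page to the hardware page table walker
         * before it can be linked into live tables. */
        dsb(ishst);
        return ptr;
    }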
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 21ab99c289d350f4ae454bc069870009db6df20e) Signed-off-by: Alex Shi --- arch/arm64/mm/mmu.c | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index c5bd5bca8e3d..3ed128c96618 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -62,15 +63,18 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static void __init *early_alloc(unsigned long sz) +static void __init *early_pgtable_alloc(void) { phys_addr_t phys; void *ptr; - phys = memblock_alloc(sz, sz); + phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE); BUG_ON(!phys); ptr = __va(phys); - memset(ptr, 0, sz); + memset(ptr, 0, PAGE_SIZE); + + /* Ensure the zeroed page is visible to the page table walker */ + dsb(ishst); return ptr; } @@ -95,12 +99,12 @@ static void split_pmd(pmd_t *pmd, pte_t *pte) static void alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { pte_t *pte; if (pmd_none(*pmd) || pmd_sect(*pmd)) { - pte = alloc(PTRS_PER_PTE * sizeof(pte_t)); + pte = pgtable_alloc(); if (pmd_sect(*pmd)) split_pmd(pmd, pte); __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE); @@ -130,7 +134,7 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { pmd_t *pmd; unsigned long next; @@ -139,7 +143,7 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, * Check for initial section mappings in the pgd/pud and remove them. 
*/ if (pud_none(*pud) || pud_sect(*pud)) { - pmd = alloc(PTRS_PER_PMD * sizeof(pmd_t)); + pmd = pgtable_alloc(); if (pud_sect(*pud)) { /* * need to have the 1G of mappings continue to be @@ -174,7 +178,7 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, } } else { alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), - prot, alloc); + prot, pgtable_alloc); } phys += next - addr; } while (pmd++, addr = next, addr != end); @@ -195,13 +199,13 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { pud_t *pud; unsigned long next; if (pgd_none(*pgd)) { - pud = alloc(PTRS_PER_PUD * sizeof(pud_t)); + pud = pgtable_alloc(); pgd_populate(mm, pgd, pud); } BUG_ON(pgd_bad(*pgd)); @@ -234,7 +238,8 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, } } } else { - alloc_init_pmd(mm, pud, addr, next, phys, prot, alloc); + alloc_init_pmd(mm, pud, addr, next, phys, prot, + pgtable_alloc); } phys += next - addr; } while (pud++, addr = next, addr != end); @@ -247,7 +252,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - void *(*alloc)(unsigned long size)) + void *(*pgtable_alloc)(void)) { unsigned long addr, length, end, next; @@ -265,18 +270,18 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, end = addr + length; do { next = pgd_addr_end(addr, end); - alloc_init_pud(mm, pgd, addr, next, phys, prot, alloc); + alloc_init_pud(mm, pgd, addr, next, phys, prot, pgtable_alloc); phys += next - addr; } while (pgd++, addr = next, addr != end); } -static void *late_alloc(unsigned long size) +static void *late_pgtable_alloc(void) { - void *ptr; - - BUG_ON(size > PAGE_SIZE); - ptr = (void *)__get_free_page(PGALLOC_GFP); + void *ptr = (void *)__get_free_page(PGALLOC_GFP); BUG_ON(!ptr); + + /* Ensure the zeroed page is visible to the page table walker */ + dsb(ishst); return ptr; } @@ -289,7 +294,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, return; } __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, - size, prot, early_alloc); + size, prot, early_pgtable_alloc); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, @@ -297,7 +302,7 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, pgprot_t prot) { __create_mapping(mm, pgd_offset(mm, virt), phys, virt, size, prot, - late_alloc); + late_pgtable_alloc); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, @@ -310,7 +315,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, } return __create_mapping(&init_mm, pgd_offset_k(virt), - phys, virt, size, prot, late_alloc); + phys, virt, size, prot, late_pgtable_alloc); } #ifdef CONFIG_DEBUG_RODATA @@ -458,15 +463,12 @@ void __init paging_init(void) fixup_executable(); /* allocate the zero page. */ - zero_page = early_alloc(PAGE_SIZE); + zero_page = early_pgtable_alloc(); bootmem_init(); empty_zero_page = virt_to_page(zero_page); - /* Ensure the zero page is visible to the page table walker */ - dsb(ishst); - /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. 
-- cgit v1.2.3 From a4593c91bbb59e89df1aefe677493ff364a7ddb2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:57 +0000 Subject: arm64: mm: place empty_zero_page in bss Currently the zero page is set up in paging_init, and thus we cannot use the zero page earlier. We use the zero page as a reserved TTBR value from which no TLB entries may be allocated (e.g. when uninstalling the idmap). To enable such usage earlier (as may be required for invasive changes to the kernel page tables), and to minimise the time that the idmap is active, we need to be able to use the zero page before paging_init. This patch follows the example set by x86, by allocating the zero page at compile time, in .bss. This means that the zero page itself is available immediately upon entry to start_kernel (as we zero .bss before this), and also means that the zero page takes up no space in the raw Image binary. The associated struct page is allocated in bootmem_init, and remains unavailable until this time. Outside of arch code, the only users of empty_zero_page assume that the empty_zero_page symbol refers to the zeroed memory itself, and that ZERO_PAGE(x) must be used to acquire the associated struct page, following the example of x86. This patch also brings arm64 inline with these assumptions. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 5227cfa71f9e8574373f4d0e9e754942d76cdf67) Signed-off-by: Alex Shi --- arch/arm64/include/asm/mmu_context.h | 2 +- arch/arm64/include/asm/pgtable.h | 4 ++-- arch/arm64/kernel/head.S | 1 + arch/arm64/mm/mmu.c | 9 +-------- 4 files changed, 5 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 24165784b803..600eacb9f7d5 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -48,7 +48,7 @@ static inline void contextidr_thread_switch(struct task_struct *next) */ static inline void cpu_set_reserved_ttbr0(void) { - unsigned long ttbr = page_to_phys(empty_zero_page); + unsigned long ttbr = virt_to_phys(empty_zero_page); asm( " msr ttbr0_el1, %0 // set TTBR0\n" diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 2daf88970731..8a76e603d737 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -123,8 +123,8 @@ extern void __pgd_error(const char *file, int line, unsigned long val); * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. 
*/ -extern struct page *empty_zero_page; -#define ZERO_PAGE(vaddr) (empty_zero_page) +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) #define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte)) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 917d98108b3f..53b9f9f128c2 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -421,6 +421,7 @@ __mmap_switched: adr_l x2, __bss_stop sub x2, x2, x0 bl __pi_memset + dsb ishst // Make zero page visible to PTW adr_l sp, initial_sp, x4 mov x4, sp diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 3ed128c96618..e4932aa6c6e9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -49,7 +49,7 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS); * Empty_zero_page is a special page that is used for zero-initialized data * and COW. */ -struct page *empty_zero_page; +unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, @@ -457,18 +457,11 @@ void fixup_init(void) */ void __init paging_init(void) { - void *zero_page; - map_mem(); fixup_executable(); - /* allocate the zero page. */ - zero_page = early_pgtable_alloc(); - bootmem_init(); - empty_zero_page = virt_to_page(zero_page); - /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. -- cgit v1.2.3 From 0dedc3948e315aff0f382a58c0d387547d8a35b5 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:58 +0000 Subject: arm64: unify idmap removal We currently open-code the removal of the idmap and restoration of the current task's MMU state in a few places. Before introducing yet more copies of this sequence, unify these to call a new helper, cpu_uninstall_idmap. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Lorenzo Pieralisi Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 9e8e865bbe294a69666a1996bda3e87825b258c0) Signed-off-by: Alex Shi --- arch/arm64/include/asm/mmu_context.h | 25 +++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 1 + arch/arm64/kernel/smp.c | 4 +--- arch/arm64/kernel/suspend.c | 20 ++++---------------- arch/arm64/mm/mmu.c | 4 +--- 5 files changed, 32 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 600eacb9f7d5..b1b2514d8883 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -27,6 +27,7 @@ #include #include #include +#include #ifdef CONFIG_PID_IN_CONTEXTIDR static inline void contextidr_thread_switch(struct task_struct *next) @@ -89,6 +90,30 @@ static inline void cpu_set_default_tcr_t0sz(void) : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); } +/* + * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm. + * + * The idmap lives in the same VA range as userspace, but uses global entries + * and may use a different TCR_EL1.T0SZ. To avoid issues resulting from + * speculative TLB fetches, we must temporarily install the reserved page + * tables while we invalidate the TLBs and set up the correct TCR_EL1.T0SZ. 
+ * + * If current is a not a user task, the mm covers the TTBR1_EL1 page tables, + * which should not be installed in TTBR0_EL1. In this case we can leave the + * reserved page tables in place. + */ +static inline void cpu_uninstall_idmap(void) +{ + struct mm_struct *mm = current->active_mm; + + cpu_set_reserved_ttbr0(); + local_flush_tlb_all(); + cpu_set_default_tcr_t0sz(); + + if (mm != &init_mm) + cpu_switch_mm(mm->pgd, mm); +} + /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 8119479147db..f6621ba071f9 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -62,6 +62,7 @@ #include #include #include +#include phys_addr_t __fdt_pointer __initdata; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index b1adc51b2c2e..68e7f79630d4 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -149,9 +149,7 @@ asmlinkage void secondary_start_kernel(void) * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); + cpu_uninstall_idmap(); preempt_disable(); trace_hardirqs_off(); diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 1095aa483a1c..66055392f445 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -60,7 +60,6 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *)) */ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) { - struct mm_struct *mm = current->active_mm; int ret; unsigned long flags; @@ -87,22 +86,11 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) ret = __cpu_suspend_enter(arg, fn); if (ret == 0) { /* - * We are resuming from reset with TTBR0_EL1 set to the - * idmap to enable the MMU; set the TTBR0 to the reserved - * page tables to prevent speculative TLB allocations, flush - * the local tlb and set the default tcr_el1.t0sz so that - * the TTBR0 address space set-up is properly restored. - * If the current active_mm != &init_mm we entered cpu_suspend - * with mappings in TTBR0 that must be restored, so we switch - * them back to complete the address space configuration - * restoration before returning. + * We are resuming from reset with the idmap active in TTBR0_EL1. + * We must uninstall the idmap and restore the expected MMU + * state before we can possibly return to userspace. */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); - - if (mm != &init_mm) - cpu_switch_mm(mm->pgd, mm); + cpu_uninstall_idmap(); /* * Restore per-cpu offset before any kernel diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e4932aa6c6e9..dcc06b23b37f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -466,9 +466,7 @@ void __init paging_init(void) * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. 
*/ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - cpu_set_default_tcr_t0sz(); + cpu_uninstall_idmap(); } /* -- cgit v1.2.3 From 7ed029beef4adcab0ad59356579f6fea71655895 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:44:59 +0000 Subject: arm64: unmap idmap earlier During boot we leave the idmap in place until paging_init, as we previously had to wait for the zero page to become allocated and accessible. Now that we have a statically-allocated zero page, we can uninstall the idmap much earlier in the boot process, making it far easier to spot accidental use of physical addresses. This also brings the cold boot path in line with the secondary boot path. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 86ccce896cb0aa800a7a6dcd29b41ffc4eeb1a75) Signed-off-by: Alex Shi --- arch/arm64/kernel/setup.c | 6 ++++++ arch/arm64/mm/mmu.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index f6621ba071f9..cfed56f0ad26 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -314,6 +314,12 @@ void __init setup_arch(char **cmdline_p) */ local_async_enable(); + /* + * TTBR0 is only used for the identity mapping at this stage. Make it + * point to zero page to avoid speculatively fetching new entries. + */ + cpu_uninstall_idmap(); + efi_init(); arm64_memblock_init(); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index dcc06b23b37f..8587ed9d81b6 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -461,12 +461,6 @@ void __init paging_init(void) fixup_executable(); bootmem_init(); - - /* - * TTBR0 is only used for the identity mapping at this stage. Make it - * point to zero page to avoid speculatively fetching new entries. - */ - cpu_uninstall_idmap(); } /* -- cgit v1.2.3 From 34903bb8c69405ec6eb2b2d437fabd0571df94bb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:00 +0000 Subject: arm64: add function to install the idmap In some cases (e.g. when making invasive changes to the kernel page tables) we will need to execute code from the idmap. Add a new helper which may be used to install the idmap, complementing the existing cpu_uninstall_idmap. 
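A minimal sketch of the intended usage (illustrative only; idmap_routine is a hypothetical function that must live in .idmap.text and is called through its physical address, as the following patch does for real):

	static void __init run_from_idmap_sketch(void)
	{
		/* Call by PA: once the idmap is installed, only the identity
		 * mapping is guaranteed to cover this code. */
		void (*fn)(void) = (void *)virt_to_phys(idmap_routine);

		cpu_install_idmap();    /* TTBR0_EL1 := idmap_pg_dir, idmap T0SZ */
		fn();                   /* executes via the identity mapping */
		cpu_uninstall_idmap();  /* restore the active mm's TTBR0 state */
	}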
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 609116d202a8c5fd3fe393eb85373cbee906df68) Signed-off-by: Alex Shi --- arch/arm64/include/asm/mmu_context.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index b1b2514d8883..944f2730a940 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -74,7 +74,7 @@ static inline bool __cpu_uses_extended_idmap(void) /* * Set TCR.T0SZ to its default value (based on VA_BITS) */ -static inline void cpu_set_default_tcr_t0sz(void) +static inline void __cpu_set_tcr_t0sz(unsigned long t0sz) { unsigned long tcr; @@ -87,9 +87,12 @@ static inline void cpu_set_default_tcr_t0sz(void) " msr tcr_el1, %0 ;" " isb" : "=&r" (tcr) - : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); + : "r"(t0sz), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); } +#define cpu_set_default_tcr_t0sz() __cpu_set_tcr_t0sz(TCR_T0SZ(VA_BITS)) +#define cpu_set_idmap_tcr_t0sz() __cpu_set_tcr_t0sz(idmap_t0sz) + /* * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm. * @@ -114,6 +117,15 @@ static inline void cpu_uninstall_idmap(void) cpu_switch_mm(mm->pgd, mm); } +static inline void cpu_install_idmap(void) +{ + cpu_set_reserved_ttbr0(); + local_flush_tlb_all(); + cpu_set_idmap_tcr_t0sz(); + + cpu_switch_mm(idmap_pg_dir, &init_mm); +} + /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously -- cgit v1.2.3 From 34fc059805c7dfa19d0d9d1e008fe83f7744c0ed Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:01 +0000 Subject: arm64: mm: add code to safely replace TTBR1_EL1 If page tables are modified without suitable TLB maintenance, the ARM architecture permits multiple TLB entries to be allocated for the same VA. When this occurs, it is permitted that TLB conflict aborts are raised in response to synchronous data/instruction accesses, and/or an amalgamation of the TLB entries may be used as a result of a TLB lookup. The presence of conflicting TLB entries may result in a variety of behaviours detrimental to the system (e.g. erroneous physical addresses may be used by I-cache fetches and/or page table walks). Some of these cases may result in unexpected changes of hardware state, and/or result in the (asynchronous) delivery of SError. To avoid these issues, we must avoid situations where conflicting entries may be allocated into TLBs. For user and module mappings we can follow a strict break-before-make approach, but this cannot work for modifications to the swapper page tables that cover the kernel text and data. Instead, this patch adds code which is intended to be executed from the idmap, which can safely unmap the swapper page tables as it only requires the idmap to be active. This enables us to uninstall the active TTBR1_EL1 entry, invalidate TLBs, then install a new TTBR1_EL1 entry without potentially unmapping code or data required for the sequence. This avoids the risk of conflict, but requires that updates are staged in a copy of the swapper page tables prior to being installed.
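From the caller's side, the staged-update flow this enables looks roughly like the following (a sketch; the staging pgdir shape is an assumption here, though the KASAN rework later in this series uses exactly this pattern):

	static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);

	static void __init replace_kernel_tables_sketch(void)
	{
		/* Stage a copy of the live tables, edit it while inactive... */
		memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir));
		/* ... modify tmp_pg_dir as required ... */
		dsb(ishst);                    /* make the updates visible to walkers */
		cpu_replace_ttbr1(tmp_pg_dir); /* swap TTBR1_EL1 via the idmap, with
		                                * no conflicting entries ever live */
	}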
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 50e1881ddde2a986c7d0d2150985239e5e3d7d96) Signed-off-by: Alex Shi --- arch/arm64/include/asm/mmu_context.h | 19 +++++++++++++++++++ arch/arm64/mm/proc.S | 28 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 944f2730a940..a00f7cf35bbd 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -126,6 +126,25 @@ static inline void cpu_install_idmap(void) cpu_switch_mm(idmap_pg_dir, &init_mm); } +/* + * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, + * avoiding the possibility of conflicting TLB entries being allocated. + */ +static inline void cpu_replace_ttbr1(pgd_t *pgd) +{ + typedef void (ttbr_replace_func)(phys_addr_t); + extern ttbr_replace_func idmap_cpu_replace_ttbr1; + ttbr_replace_func *replace_phys; + + phys_addr_t pgd_phys = virt_to_phys(pgd); + + replace_phys = (void *)virt_to_phys(idmap_cpu_replace_ttbr1); + + cpu_install_idmap(); + replace_phys(pgd_phys); + cpu_uninstall_idmap(); +} + /* * It would be nice to return ASIDs back to the allocator, but unfortunately * that introduces a race with a generation rollover where we could erroneously diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index c164d2cb35c0..0c19534a901e 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -140,6 +140,34 @@ ENTRY(cpu_do_switch_mm) ret ENDPROC(cpu_do_switch_mm) + .pushsection ".idmap.text", "ax" +/* + * void idmap_cpu_replace_ttbr1(phys_addr_t new_pgd) + * + * This is the low-level counterpart to cpu_replace_ttbr1, and should not be + * called by anything else. It can only be executed from a TTBR0 mapping. + */ +ENTRY(idmap_cpu_replace_ttbr1) + mrs x2, daif + msr daifset, #0xf + + adrp x1, empty_zero_page + msr ttbr1_el1, x1 + isb + + tlbi vmalle1 + dsb nsh + isb + + msr ttbr1_el1, x0 + isb + + msr daif, x2 + + ret +ENDPROC(idmap_cpu_replace_ttbr1) + .popsection + /* * __cpu_setup * -- cgit v1.2.3 From 8b8513de45f823e10fce3b1563953be70483941b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:02 +0000 Subject: arm64: kasan: avoid TLB conflicts The page table modification performed during the KASAN init risks the allocation of conflicting TLB entries, as it swaps a set of valid global entries for another without suitable TLB maintenance. The presence of conflicting TLB entries can result in the delivery of synchronous TLB conflict aborts, or may result in the use of erroneous data being returned in response to a TLB lookup. This can affect explicit data accesses from software as well as translations performed asynchronously (e.g. as part of page table walks or speculative I-cache fetches), and can therefore result in a wide variety of problems. To avoid this, use cpu_replace_ttbr1 to swap the page tables. This ensures that when the new tables are installed there are no stale entries from the old tables which may conflict. As all updates are made to the tables while they are not active, the updates themselves are safe. At the same time, add the missing barrier to ensure that the tmp_pg_dir entries updated via memcpy are visible to the page table walkers at the point the tmp_pg_dir is installed. 
All other page table updates made as part of KASAN initialisation have the requisite barriers due to the use of the standard page table accessors. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Andrey Ryabinin Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit c1a88e9124a499939ebd8069d5e4d3937f019157) Signed-off-by: Alex Shi --- arch/arm64/mm/kasan_init.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cab7a5be40aa..263b59020500 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -108,15 +109,6 @@ static void __init clear_pgds(unsigned long start, set_pgd(pgd_offset_k(start), __pgd(0)); } -static void __init cpu_set_ttbr1(unsigned long ttbr1) -{ - asm( - " msr ttbr1_el1, %0\n" - " isb" - : - : "r" (ttbr1)); -} - void __init kasan_init(void) { struct memblock_region *reg; @@ -130,8 +122,8 @@ void __init kasan_init(void) * setup will be finished. */ memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir)); - cpu_set_ttbr1(__pa(tmp_pg_dir)); - flush_tlb_all(); + dsb(ishst); + cpu_replace_ttbr1(tmp_pg_dir); clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); @@ -165,8 +157,7 @@ void __init kasan_init(void) pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO)); memset(kasan_zero_page, 0, PAGE_SIZE); - cpu_set_ttbr1(__pa(swapper_pg_dir)); - flush_tlb_all(); + cpu_replace_ttbr1(swapper_pg_dir); /* At this point kasan is fully initialized. Enable error messages */ init_task.kasan_depth = 0; -- cgit v1.2.3 From febef18a360df3f8b4b364167e8fe050b24ccf65 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:03 +0000 Subject: arm64: mm: move pte_* macros For pmd, pud, and pgd levels of table, functions including p?d_index and p?d_offset are defined after the p?d_page_vaddr function for the immediately higher level of table. The pte functions however are defined much earlier, even though several rely on the later definition of pmd_page_vaddr. While this isn't currently a problem as these are macros, it prevents the logical grouping of later C functions (which cannot rely on prototypes for functions not yet defined). Move these definitions after pmd_page_vaddr, for consistency with the placement of these functions for other levels of table. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 053520f7d3923cc6d37afb28f9887cb1e7d77454) Signed-off-by: Alex Shi --- arch/arm64/include/asm/pgtable.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 8a76e603d737..d439523a7910 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -136,16 +136,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0)) #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) -/* Find an entry in the third-level page table. 
*/ -#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) - -#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr)) - -#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) - /* * The following only work if pte_present(). Undefined behaviour otherwise. */ @@ -447,6 +437,16 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd) return __va(pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK); } +/* Find an entry in the third-level page table. */ +#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + +#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr)) + +#define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) +#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) + #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) /* -- cgit v1.2.3 From 77ea11473be30e0b90b32deb650b6403ad291a12 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:04 +0000 Subject: arm64: mm: add functions to walk page tables by PA To allow us to walk tables allocated into the fixmap, we need to acquire the physical address of a page, rather than the virtual address in the linear map. This patch adds new p??_page_paddr and p??_offset_phys functions to acquire the physical address of a next-level table, and changes p??_offset* into macros which simply convert this to a linear map VA. This renders p??_page_vaddr unused, and hence they are removed. At the pgd level, a new pgd_offset_raw function is added to find the relevant PGD entry given the base of a PGD and a virtual address. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit dca56dca7124709f3dfca81afe61b4d98eb9cacf) Signed-off-by: Alex Shi --- arch/arm64/include/asm/pgtable.h | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d439523a7910..db608e7984c6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -432,15 +432,16 @@ static inline void pmd_clear(pmd_t *pmdp) set_pmd(pmdp, __pmd(0)); } -static inline pte_t *pmd_page_vaddr(pmd_t pmd) +static inline phys_addr_t pmd_page_paddr(pmd_t pmd) { - return __va(pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK); + return pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK; } /* Find an entry in the third-level page table. 
*/ #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_kernel(dir,addr) (pmd_page_vaddr(*(dir)) + pte_index(addr)) +#define pte_offset_phys(dir,addr) (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t)) +#define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr)))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) @@ -475,21 +476,23 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); } -static inline pmd_t *pud_page_vaddr(pud_t pud) +static inline phys_addr_t pud_page_paddr(pud_t pud) { - return __va(pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK); + return pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK; } /* Find an entry in the second-level page table. */ #define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) -static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) -{ - return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr); -} +#define pmd_offset_phys(dir, addr) (pud_page_paddr(*(dir)) + pmd_index(addr) * sizeof(pmd_t)) +#define pmd_offset(dir, addr) ((pmd_t *)__va(pmd_offset_phys((dir), (addr)))) #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) +#else + +#define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) + #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -511,21 +514,23 @@ static inline void pgd_clear(pgd_t *pgdp) set_pgd(pgdp, __pgd(0)); } -static inline pud_t *pgd_page_vaddr(pgd_t pgd) +static inline phys_addr_t pgd_page_paddr(pgd_t pgd) { - return __va(pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK); + return pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK; } /* Find an entry in the frst-level page table. */ #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) -static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr) -{ - return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr); -} +#define pud_offset_phys(dir, addr) (pgd_page_paddr(*(dir)) + pud_index(addr) * sizeof(pud_t)) +#define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) +#else + +#define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) @@ -533,7 +538,9 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr) /* to find an entry in a page-table-directory */ #define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) -#define pgd_offset(mm, addr) ((mm)->pgd+pgd_index(addr)) +#define pgd_offset_raw(pgd, addr) ((pgd) + pgd_index(addr)) + +#define pgd_offset(mm, addr) (pgd_offset_raw((mm)->pgd, (addr))) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(addr) pgd_offset(&init_mm, addr) -- cgit v1.2.3 From 6f74a379d1593fe728141747c6c594758cc5d328 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:05 +0000 Subject: arm64: mm: avoid redundant __pa(__va(x)) When we "upgrade" to a section mapping, we free any table we made redundant by giving it back to memblock. To get the PA, we acquire the physical address and convert this to a VA, then subsequently convert this back to a PA. This works currently, but will not work if the tables are not accessed via linear map VAs (e.g. is we use fixmap slots). This patch uses {pmd,pud}_page_paddr to acquire the PA. 
This avoids the __pa(__va()) round trip, saving some work and avoiding reliance on the linear mapping. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 316b39db06718d59d82736df9fc65cf05b467cc7) Signed-off-by: Alex Shi --- arch/arm64/mm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8587ed9d81b6..0b6c8727bcd1 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -171,7 +171,7 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, if (!pmd_none(old_pmd)) { flush_tlb_all(); if (pmd_table(old_pmd)) { - phys_addr_t table = __pa(pte_offset_map(&old_pmd, 0)); + phys_addr_t table = pmd_page_paddr(old_pmd); if (!WARN_ON_ONCE(slab_is_available())) memblock_free(table, PAGE_SIZE); } @@ -232,7 +232,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, if (!pud_none(old_pud)) { flush_tlb_all(); if (pud_table(old_pud)) { - phys_addr_t table = __pa(pmd_offset(&old_pud, 0)); + phys_addr_t table = pud_page_paddr(old_pud); if (!WARN_ON_ONCE(slab_is_available())) memblock_free(table, PAGE_SIZE); } -- cgit v1.2.3 From 0061f781a0fe83adfe76f78f89653cbd41249a93 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:06 +0000 Subject: arm64: mm: add __{pud,pgd}_populate We currently have __pmd_populate for creating a pmd table entry given the physical address of a pte, but don't have equivalents for the pud or pgd levels of table. To enable us to manipulate tables which are mapped outside of the linear mapping (where we have a PA, but not a linear map VA), it is useful to have these functions. This patch adds __{pud,pgd}_populate. As these should not be called when the kernel uses folded {pmd,pud}s, in these cases they expand to BUILD_BUG(). So long as the appropriate checks are made on the {pud,pgd} entry prior to attempting population, these should be optimized out at compile time. 
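For illustration, a caller can now link a new table in from its physical address alone (sketch; mirrors how a later patch in this series uses the helper, where the allocator hands back a PA):

	static void link_pmd_table_sketch(pud_t *pud,
					  phys_addr_t (*pgtable_alloc)(void))
	{
		phys_addr_t pmd_phys = pgtable_alloc(); /* PA of a zeroed table */

		/* No linear-map VA for the new table is needed: the entry is
		 * built from the PA and the table type bits alone. */
		__pud_populate(pud, pmd_phys, PUD_TYPE_TABLE);
	}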
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 1e531cce68c92b46c7d29f36a72f9a3e5886678f) Signed-off-by: Alex Shi --- arch/arm64/include/asm/pgalloc.h | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index c15053902942..ff98585d085a 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -42,11 +42,20 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot) { - set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE)); + set_pud(pud, __pud(pmd | prot)); } +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + __pud_populate(pud, __pa(pmd), PMD_TYPE_TABLE); +} +#else +static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot) +{ + BUILD_BUG(); +} #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -62,11 +71,20 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) free_page((unsigned long)pud); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot) { - set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE)); + set_pgd(pgdp, __pgd(pud | prot)); } +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + __pgd_populate(pgd, __pa(pud), PUD_TYPE_TABLE); +} +#else +static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot) +{ + BUILD_BUG(); +} #endif /* CONFIG_PGTABLE_LEVELS > 3 */ extern pgd_t *pgd_alloc(struct mm_struct *mm); -- cgit v1.2.3 From 9568281b80e91974c04f6aaac50ecfd4dcf31df1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:07 +0000 Subject: arm64: mm: add functions to walk tables in fixmap As a preparatory step to allow us to allocate early page tables from unmapped memory using memblock_alloc, add new p??_{set,clear}_fixmap* functions which can be used to walk page tables outside of the linear mapping by using fixmap slots. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 961faac114819a01e627fe9c9c82b830bb3849d4) Signed-off-by: Alex Shi --- arch/arm64/include/asm/fixmap.h | 10 ++++++++++ arch/arm64/include/asm/pgtable.h | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 309704544d22..1a617d46fce9 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -62,6 +62,16 @@ enum fixed_addresses { FIX_BTMAP_END = __end_of_permanent_fixed_addresses, FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, + + /* + * Used for kernel page table creation, so unmapped memory may be used + * for tables. 
+ */ + FIX_PTE, + FIX_PMD, + FIX_PUD, + FIX_PGD, + __end_of_fixed_addresses }; diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index db608e7984c6..c99dfc588deb 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -59,6 +59,7 @@ #ifndef __ASSEMBLY__ +#include #include extern void __pte_error(const char *file, int line, unsigned long val); @@ -448,6 +449,10 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) +#define pte_set_fixmap(addr) ((pte_t *)set_fixmap_offset(FIX_PTE, addr)) +#define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr)) +#define pte_clear_fixmap() clear_fixmap(FIX_PTE) + #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) /* @@ -487,12 +492,21 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pmd_offset_phys(dir, addr) (pud_page_paddr(*(dir)) + pmd_index(addr) * sizeof(pmd_t)) #define pmd_offset(dir, addr) ((pmd_t *)__va(pmd_offset_phys((dir), (addr)))) +#define pmd_set_fixmap(addr) ((pmd_t *)set_fixmap_offset(FIX_PMD, addr)) +#define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr)) +#define pmd_clear_fixmap() clear_fixmap(FIX_PMD) + #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) #else #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) +/* Match pmd_offset folding in */ +#define pmd_set_fixmap(addr) NULL +#define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp) +#define pmd_clear_fixmap() + #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -525,12 +539,21 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pud_offset_phys(dir, addr) (pgd_page_paddr(*(dir)) + pud_index(addr) * sizeof(pud_t)) #define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) +#define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr)) +#define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr)) +#define pud_clear_fixmap() clear_fixmap(FIX_PUD) + #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) #else #define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) +/* Match pud_offset folding in */ +#define pud_set_fixmap(addr) NULL +#define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp) +#define pud_clear_fixmap() + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) @@ -545,6 +568,9 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(addr) pgd_offset(&init_mm, addr) +#define pgd_set_fixmap(addr) ((pgd_t *)set_fixmap_offset(FIX_PGD, addr)) +#define pgd_clear_fixmap() clear_fixmap(FIX_PGD) + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | -- cgit v1.2.3 From c790554bd459b56042e784162b5017c41120538e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:08 +0000 Subject: arm64: mm: use fixmap when creating page tables As a preparatory step to allow us to allocate early page tables from unmapped memory using memblock_alloc, modify the __create_mapping callees to map and unmap the tables they modify using fixmap entries. All but the top-level pgd initialisation is performed via the fixmap. Subsequent patches will inject the pgd physical address, and migrate to using the FIX_PGD slot. 
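The map/modify/unmap pattern these slots enable looks roughly like this (sketch; pte_phys may point at a table with no linear-map alias):

	static void set_pte_via_fixmap_sketch(phys_addr_t pte_phys,
					      unsigned long addr,
					      unsigned long pfn, pgprot_t prot)
	{
		/* Window the table through the FIX_PTE slot... */
		pte_t *ptep = pte_set_fixmap(pte_phys);

		/* ... edit it through the fixmap VA ... */
		set_pte(&ptep[pte_index(addr)], pfn_pte(pfn, prot));

		/* ... and tear the temporary mapping down again. */
		pte_clear_fixmap();
	}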
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit f4710445458c0a1bd1c3c014ada2e7d7dc7b882f) Signed-off-by: Alex Shi --- arch/arm64/mm/mmu.c | 61 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 0b6c8727bcd1..4f5a5fa3f8f4 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -63,19 +63,30 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static void __init *early_pgtable_alloc(void) +static phys_addr_t __init early_pgtable_alloc(void) { phys_addr_t phys; void *ptr; phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE); BUG_ON(!phys); - ptr = __va(phys); + + /* + * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE + * slot will be free, so we can (ab)use the FIX_PTE slot to initialise + * any level of table. + */ + ptr = pte_set_fixmap(phys); + memset(ptr, 0, PAGE_SIZE); - /* Ensure the zeroed page is visible to the page table walker */ - dsb(ishst); - return ptr; + /* + * Implicit barriers also ensure the zeroed page is visible to the page + * table walker + */ + pte_clear_fixmap(); + + return phys; } /* @@ -99,24 +110,28 @@ static void split_pmd(pmd_t *pmd, pte_t *pte) static void alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { pte_t *pte; if (pmd_none(*pmd) || pmd_sect(*pmd)) { - pte = pgtable_alloc(); + phys_addr_t pte_phys = pgtable_alloc(); + pte = pte_set_fixmap(pte_phys); if (pmd_sect(*pmd)) split_pmd(pmd, pte); - __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE); + __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); flush_tlb_all(); + pte_clear_fixmap(); } BUG_ON(pmd_bad(*pmd)); - pte = pte_offset_kernel(pmd, addr); + pte = pte_set_fixmap_offset(pmd, addr); do { set_pte(pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + + pte_clear_fixmap(); } static void split_pud(pud_t *old_pud, pmd_t *pmd) @@ -134,7 +149,7 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { pmd_t *pmd; unsigned long next; @@ -143,7 +158,8 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, * Check for initial section mappings in the pgd/pud and remove them. 
*/ if (pud_none(*pud) || pud_sect(*pud)) { - pmd = pgtable_alloc(); + phys_addr_t pmd_phys = pgtable_alloc(); + pmd = pmd_set_fixmap(pmd_phys); if (pud_sect(*pud)) { /* * need to have the 1G of mappings continue to be @@ -151,12 +167,13 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, */ split_pud(pud, pmd); } - pud_populate(mm, pud, pmd); + __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE); flush_tlb_all(); + pmd_clear_fixmap(); } BUG_ON(pud_bad(*pud)); - pmd = pmd_offset(pud, addr); + pmd = pmd_set_fixmap_offset(pud, addr); do { next = pmd_addr_end(addr, end); /* try section mapping first */ @@ -182,6 +199,8 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, } phys += next - addr; } while (pmd++, addr = next, addr != end); + + pmd_clear_fixmap(); } static inline bool use_1G_block(unsigned long addr, unsigned long next, @@ -199,18 +218,18 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { pud_t *pud; unsigned long next; if (pgd_none(*pgd)) { - pud = pgtable_alloc(); - pgd_populate(mm, pgd, pud); + phys_addr_t pud_phys = pgtable_alloc(); + __pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE); } BUG_ON(pgd_bad(*pgd)); - pud = pud_offset(pgd, addr); + pud = pud_set_fixmap_offset(pgd, addr); do { next = pud_addr_end(addr, end); @@ -243,6 +262,8 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, } phys += next - addr; } while (pud++, addr = next, addr != end); + + pud_clear_fixmap(); } /* @@ -252,7 +273,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - void *(*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void)) { unsigned long addr, length, end, next; @@ -275,14 +296,14 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, } while (pgd++, addr = next, addr != end); } -static void *late_pgtable_alloc(void) +static phys_addr_t late_pgtable_alloc(void) { void *ptr = (void *)__get_free_page(PGALLOC_GFP); BUG_ON(!ptr); /* Ensure the zeroed page is visible to the page table walker */ dsb(ishst); - return ptr; + return __pa(ptr); } static void __init create_mapping(phys_addr_t phys, unsigned long virt, -- cgit v1.2.3 From 8f64994ff3068b62b867188a12e366ee237da3b5 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:09 +0000 Subject: arm64: mm: allocate pagetables anywhere Now that create_mapping uses fixmap slots to modify pte, pmd, and pud entries, we can access page tables anywhere in physical memory, regardless of the extent of the linear mapping. Given that, we no longer need to limit memblock allocations during page table creation, and can leave the limit as its default MEMBLOCK_ALLOC_ANYWHERE. We never add memory which will fall outside of the linear map range given phys_offset and MAX_MEMBLOCK_ADDR are configured appropriately, so any tables we create will fall in the linear map of the final tables. 
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit cdef5f6e9e0e5ee397759b664a9f875ff59ccf01) Signed-off-by: Alex Shi --- arch/arm64/mm/mmu.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 4f5a5fa3f8f4..d50535a07c6e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -384,20 +384,6 @@ static void __init __map_memblock(phys_addr_t start, phys_addr_t end) static void __init map_mem(void) { struct memblock_region *reg; - phys_addr_t limit; - - /* - * Temporarily limit the memblock range. We need to do this as - * create_mapping requires puds, pmds and ptes to be allocated from - * memory addressable from the initial direct kernel mapping. - * - * The initial direct kernel mapping, located at swapper_pg_dir, gives - * us PUD_SIZE (with SECTION maps) or PMD_SIZE (without SECTION maps, - * memory starting from PHYS_OFFSET (which must be aligned to 2MB as - * per Documentation/arm64/booting.txt). - */ - limit = PHYS_OFFSET + SWAPPER_INIT_MAP_SIZE; - memblock_set_current_limit(limit); /* map all the memory banks */ for_each_memblock(memory, reg) { @@ -407,29 +393,8 @@ static void __init map_mem(void) if (start >= end) break; - if (ARM64_SWAPPER_USES_SECTION_MAPS) { - /* - * For the first memory bank align the start address and - * current memblock limit to prevent create_mapping() from - * allocating pte page tables from unmapped memory. With - * the section maps, if the first block doesn't end on section - * size boundary, create_mapping() will try to allocate a pte - * page, which may be returned from an unmapped area. - * When section maps are not used, the pte page table for the - * current limit is already present in swapper_pg_dir. - */ - if (start < limit) - start = ALIGN(start, SECTION_SIZE); - if (end < limit) { - limit = end & SECTION_MASK; - memblock_set_current_limit(limit); - } - } __map_memblock(start, end); } - - /* Limit no longer required. */ - memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); } static void __init fixup_executable(void) -- cgit v1.2.3 From 55ce0af58717f7651d5a393f687f2ea9a109e4f5 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:10 +0000 Subject: arm64: mm: allow passing a pgdir to alloc_init_* To allow us to initialise pgdirs which are fixmapped, allow explicitly passing a pgdir rather than an mm. A new __create_pgd_mapping function is added for this, with existing __create_mapping callers migrated to this. The mm argument was previously only used at the top level. Now that it is redundant at all levels, it is removed. To indicate its new found similarity to alloc_init_{pud,pmd,pte}, __create_mapping is renamed to init_pgd. 
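An illustrative wrapper (sketch, assuming it lives alongside the mmu.c helpers): with an explicit pgdir, the same routine can populate a fixmapped pgd as easily as an mm's.

	static void __init map_range_sketch(pgd_t *pgdir, phys_addr_t phys,
					    unsigned long virt, phys_addr_t size)
	{
		/* pgdir may be init_mm.pgd, another mm's pgd, or a pgd that is
		 * currently mapped via the FIX_PGD slot. */
		__create_pgd_mapping(pgdir, phys, virt, size, PAGE_KERNEL,
				     early_pgtable_alloc);
	}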
Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 11509a306bb6ea595878b2d246d2d56b1783e040) Signed-off-by: Alex Shi --- arch/arm64/mm/mmu.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d50535a07c6e..570ba3e3d362 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -146,8 +146,7 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd) } while (pmd++, i++, i < PTRS_PER_PMD); } -static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, +static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -215,8 +214,7 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, return true; } -static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, +static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -257,7 +255,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, } } } else { - alloc_init_pmd(mm, pud, addr, next, phys, prot, + alloc_init_pmd(pud, addr, next, phys, prot, pgtable_alloc); } phys += next - addr; @@ -270,8 +268,7 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd, * Create the page directory entries and any necessary page tables for the * mapping specified by 'md'. */ -static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, - phys_addr_t phys, unsigned long virt, +static void init_pgd(pgd_t *pgd, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -291,7 +288,7 @@ static void __create_mapping(struct mm_struct *mm, pgd_t *pgd, end = addr + length; do { next = pgd_addr_end(addr, end); - alloc_init_pud(mm, pgd, addr, next, phys, prot, pgtable_alloc); + alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc); phys += next - addr; } while (pgd++, addr = next, addr != end); } @@ -306,6 +303,14 @@ static phys_addr_t late_pgtable_alloc(void) return __pa(ptr); } +static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + pgprot_t prot, + phys_addr_t (*alloc)(void)) +{ + init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc); +} + static void __init create_mapping(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { @@ -314,16 +319,16 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_mapping(&init_mm, pgd_offset_k(virt), phys, virt, - size, prot, early_pgtable_alloc); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, + early_pgtable_alloc); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { - __create_mapping(mm, pgd_offset(mm, virt), phys, virt, size, prot, - late_pgtable_alloc); + __create_pgd_mapping(mm->pgd, phys, virt, size, prot, + late_pgtable_alloc); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, @@ -335,8 +340,8 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, return; } - return __create_mapping(&init_mm, pgd_offset_k(virt), - phys, 
virt, size, prot, late_pgtable_alloc); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, + late_pgtable_alloc); } #ifdef CONFIG_DEBUG_RODATA -- cgit v1.2.3 From 0060e7a78b1a3b208d178b285fb3912f4fd4d9ee Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:11 +0000 Subject: arm64: ensure _stext and _etext are page-aligned Currently we have separate ALIGN_DEBUG_RO{,_MIN} directives to align _etext and __init_begin. While we ensure that __init_begin is page-aligned, we do not provide the same guarantee for _etext. This is not problematic currently as the alignment of __init_begin is sufficient to prevent issues when we modify permissions. Subsequent patches will assume page alignment of segments of the kernel we wish to map with different permissions. To ensure this, move _etext after the ALIGN_DEBUG_RO_MIN for the init section. This renders the prior ALIGN_DEBUG_RO irrelevant, and hence it is removed. Likewise, upgrade to ALIGN_DEBUG_RO_MIN(PAGE_SIZE) for _stext. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit fca082bfb543ccaaff864fc0892379ccaa1711cd) Signed-off-by: Alex Shi --- arch/arm64/kernel/vmlinux.lds.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index e3928f578891..b78a3c772294 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -95,7 +95,7 @@ SECTIONS _text = .; HEAD_TEXT } - ALIGN_DEBUG_RO + ALIGN_DEBUG_RO_MIN(PAGE_SIZE) .text : { /* Real text segment */ _stext = .; /* Text and read-only data */ __exception_text_start = .; @@ -116,10 +116,9 @@ SECTIONS RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(8) NOTES - ALIGN_DEBUG_RO - _etext = .; /* End of text and rodata section */ ALIGN_DEBUG_RO_MIN(PAGE_SIZE) + _etext = .; /* End of text and rodata section */ __init_begin = .; INIT_TEXT_SECTION(8) -- cgit v1.2.3 From a8a81d65140e0bdfe27e53ae3529546b5dce6bd4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 25 Jan 2016 11:45:12 +0000 Subject: arm64: mm: create new fine-grained mappings at boot At boot we may change the granularity of the tables mapping the kernel (by splitting or making sections). This may happen when we create the linear mapping (in __map_memblock), or at any point we try to apply fine-grained permissions to the kernel (e.g. fixup_executable, mark_rodata_ro, fixup_init). Changing the active page tables in this manner may result in multiple entries for the same address being allocated into TLBs, risking problems such as TLB conflict aborts or issues derived from the amalgamation of TLB entries. Generally, a break-before-make (BBM) approach is necessary to avoid conflicts, but we cannot do this for the kernel tables as it risks unmapping text or data being used to do so. Instead, we can create a new set of tables from scratch in the safety of the existing mappings, and subsequently migrate over to these using the new cpu_replace_ttbr1 helper, which avoids the two sets of tables being active simultaneously. To avoid issues when we later modify permissions of the page tables (e.g. in fixup_init), we must create the page tables at a granularity such that later modification does not result in splitting of tables. This patch applies this strategy, creating a new set of fine-grained page tables from scratch, and safely migrating to them. 
The existing fixmap and kasan shadow page tables are reused in the new fine-grained tables. Signed-off-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Andrey Ryabinin Tested-by: Ard Biesheuvel Reviewed-by: Ard Biesheuvel Tested-by: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit 068a17a5805dfbca4bbf03e664ca6b19709cc7a8) Signed-off-by: Alex Shi --- arch/arm64/include/asm/kasan.h | 3 + arch/arm64/mm/kasan_init.c | 15 ++++ arch/arm64/mm/mmu.c | 153 ++++++++++++++++++++++++----------------- 3 files changed, 109 insertions(+), 62 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index 2774fa384c47..de0d21211c34 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -7,6 +7,7 @@ #include #include +#include /* * KASAN_SHADOW_START: beginning of the kernel virtual addresses. @@ -28,10 +29,12 @@ #define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << (64 - 3))) void kasan_init(void); +void kasan_copy_shadow(pgd_t *pgdir); asmlinkage void kasan_early_init(void); #else static inline void kasan_init(void) { } +static inline void kasan_copy_shadow(pgd_t *pgdir) { } #endif #endif diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 263b59020500..cc569a38bc76 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -97,6 +97,21 @@ asmlinkage void __init kasan_early_init(void) kasan_map_early_shadow(); } +/* + * Copy the current shadow region into a new pgdir. + */ +void __init kasan_copy_shadow(pgd_t *pgdir) +{ + pgd_t *pgd, *pgd_new, *pgd_end; + + pgd = pgd_offset_k(KASAN_SHADOW_START); + pgd_end = pgd_offset_k(KASAN_SHADOW_END); + pgd_new = pgd_offset_raw(pgdir, KASAN_SHADOW_START); + do { + set_pgd(pgd_new, *pgd); + } while (pgd++, pgd_new++, pgd != pgd_end); +} + static void __init clear_pgds(unsigned long start, unsigned long end) { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 570ba3e3d362..4874d2fea1c9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -344,49 +345,42 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, late_pgtable_alloc); } -#ifdef CONFIG_DEBUG_RODATA -static void __init __map_memblock(phys_addr_t start, phys_addr_t end) +static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { + + unsigned long kernel_start = __pa(_stext); + unsigned long kernel_end = __pa(_end); + /* - * Set up the executable regions using the existing section mappings - * for now. This will get more fine grained later once all memory - * is mapped + * The kernel itself is mapped at page granularity. Map all other + * memory, making sure we don't overwrite the existing kernel mappings. 
*/ - unsigned long kernel_x_start = round_down(__pa(_stext), SWAPPER_BLOCK_SIZE); - unsigned long kernel_x_end = round_up(__pa(__init_end), SWAPPER_BLOCK_SIZE); - - if (end < kernel_x_start) { - create_mapping(start, __phys_to_virt(start), - end - start, PAGE_KERNEL); - } else if (start >= kernel_x_end) { - create_mapping(start, __phys_to_virt(start), - end - start, PAGE_KERNEL); - } else { - if (start < kernel_x_start) - create_mapping(start, __phys_to_virt(start), - kernel_x_start - start, - PAGE_KERNEL); - create_mapping(kernel_x_start, - __phys_to_virt(kernel_x_start), - kernel_x_end - kernel_x_start, - PAGE_KERNEL_EXEC); - if (kernel_x_end < end) - create_mapping(kernel_x_end, - __phys_to_virt(kernel_x_end), - end - kernel_x_end, - PAGE_KERNEL); + + /* No overlap with the kernel. */ + if (end < kernel_start || start >= kernel_end) { + __create_pgd_mapping(pgd, start, __phys_to_virt(start), + end - start, PAGE_KERNEL, + early_pgtable_alloc); + return; } + /* + * This block overlaps the kernel mapping. Map the portion(s) which + * don't overlap. + */ + if (start < kernel_start) + __create_pgd_mapping(pgd, start, + __phys_to_virt(start), + kernel_start - start, PAGE_KERNEL, + early_pgtable_alloc); + if (kernel_end < end) + __create_pgd_mapping(pgd, kernel_end, + __phys_to_virt(kernel_end), + end - kernel_end, PAGE_KERNEL, + early_pgtable_alloc); } -#else -static void __init __map_memblock(phys_addr_t start, phys_addr_t end) -{ - create_mapping(start, __phys_to_virt(start), end - start, - PAGE_KERNEL_EXEC); -} -#endif -static void __init map_mem(void) +static void __init map_mem(pgd_t *pgd) { struct memblock_region *reg; @@ -398,33 +392,10 @@ static void __init map_mem(void) if (start >= end) break; - __map_memblock(start, end); + __map_memblock(pgd, start, end); } } -static void __init fixup_executable(void) -{ -#ifdef CONFIG_DEBUG_RODATA - /* now that we are actually fully mapped, make the start/end more fine grained */ - if (!IS_ALIGNED((unsigned long)_stext, SWAPPER_BLOCK_SIZE)) { - unsigned long aligned_start = round_down(__pa(_stext), - SWAPPER_BLOCK_SIZE); - - create_mapping(aligned_start, __phys_to_virt(aligned_start), - __pa(_stext) - aligned_start, - PAGE_KERNEL); - } - - if (!IS_ALIGNED((unsigned long)__init_end, SWAPPER_BLOCK_SIZE)) { - unsigned long aligned_end = round_up(__pa(__init_end), - SWAPPER_BLOCK_SIZE); - create_mapping(__pa(__init_end), (unsigned long)__init_end, - aligned_end - __pa(__init_end), - PAGE_KERNEL); - } -#endif -} - #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { @@ -442,14 +413,72 @@ void fixup_init(void) PAGE_KERNEL); } +static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, + pgprot_t prot) +{ + phys_addr_t pa_start = __pa(va_start); + unsigned long size = va_end - va_start; + + BUG_ON(!PAGE_ALIGNED(pa_start)); + BUG_ON(!PAGE_ALIGNED(size)); + + __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, + early_pgtable_alloc); +} + +/* + * Create fine-grained mappings for the kernel. + */ +static void __init map_kernel(pgd_t *pgd) +{ + + map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC); + map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC); + map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL); + + /* + * The fixmap falls in a separate pgd to the kernel, and doesn't live + * in the carveout for the swapper_pg_dir. We can simply re-use the + * existing dir for the fixmap. 
+ */
+	set_pgd(pgd_offset_raw(pgd, FIXADDR_START), *pgd_offset_k(FIXADDR_START));
+
+	kasan_copy_shadow(pgd);
+}
+
 /*
  * paging_init() sets up the page tables, initialises the zone memory
  * maps and sets up the zero page.
  */
 void __init paging_init(void)
 {
-	map_mem();
-	fixup_executable();
+	phys_addr_t pgd_phys = early_pgtable_alloc();
+	pgd_t *pgd = pgd_set_fixmap(pgd_phys);
+
+	map_kernel(pgd);
+	map_mem(pgd);
+
+	/*
+	 * We want to reuse the original swapper_pg_dir so we don't have to
+	 * communicate the new address to non-coherent secondaries in
+	 * secondary_entry, and so cpu_switch_mm can generate the address with
+	 * adrp+add rather than a load from some global variable.
+	 *
+	 * To do this we need to go via a temporary pgd.
+	 */
+	cpu_replace_ttbr1(__va(pgd_phys));
+	memcpy(swapper_pg_dir, pgd, PAGE_SIZE);
+	cpu_replace_ttbr1(swapper_pg_dir);
+
+	pgd_clear_fixmap();
+	memblock_free(pgd_phys, PAGE_SIZE);
+
+	/*
+	 * We only reuse the PGD from the swapper_pg_dir, not the pud + pmd
+	 * allocated with it.
+	 */
+	memblock_free(__pa(swapper_pg_dir) + PAGE_SIZE,
+		      SWAPPER_DIR_SIZE - PAGE_SIZE);

 	bootmem_init();
 }
-- cgit v1.2.3

From f00cf2ba83ca2b1ade50b27dd60d6f4294ddeef7 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi
Date: Tue, 26 Jan 2016 11:10:38 +0000
Subject: arm64: kernel: implement ACPI parking protocol

The SBBR and ACPI specifications allow ACPI based systems that do not
implement PSCI (eg systems with no EL3) to boot through the ACPI parking
protocol specification[1].

This patch implements the ACPI parking protocol CPU operations, and adds
code that eases parsing the parking protocol data structures to the
ARM64 SMP initialization carried out at the same time as CPU enumeration.

To wake up the CPUs from the parked state, this patch implements a
wakeup IPI for ARM64 (ie arch_send_wakeup_ipi_mask()) that mirrors the
ARM one, so that a specific IPI is sent for wake-up purposes in order
to distinguish it from other IPI sources.

Given the current ACPI MADT parsing API, the patch implements a glue
layer that helps pass the MADT GICC data structure from SMP initialization
code to the parking protocol implementation, somewhat overriding the CPU
operations interfaces. This is to avoid creating a completely transparent
DT/ACPI CPU operations layer that would require creating opaque
structure handling for CPU data (DT represents CPUs through DT nodes,
ACPI through static MADT table entries), which seems overkill given that
ACPI on ARM64 mandates only two booting protocols (PSCI and parking
protocol), so there is no need for further protocol additions.
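As a rough sketch of the resulting handshake (the firmware-side loop
below is hypothetical pseudo-code for illustration only; the OS-side
counterpart is acpi_parking_protocol_cpu_boot() in the new file added
by this patch, and my_gic_cpu_id/jump() are made-up names):

	/* Firmware side: CPU parked, polling its dedicated mailbox. */
	while (readl(&mailbox->cpu_id) != my_gic_cpu_id)
		wfi();				/* woken by IPI_WAKEUP */

	ep = readq(&mailbox->entry_point);
	writeq(0, &mailbox->entry_point);	/* cleared, as the OS checks at postboot */
	jump(ep);				/* enter the kernel at secondary_entry */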
Based on the original work by Mark Salter [1] https://acpica.org/sites/acpica/files/MP%20Startup%20for%20ARM%20platforms.docx Signed-off-by: Lorenzo Pieralisi Tested-by: Loc Ho Cc: Will Deacon Cc: Hanjun Guo Cc: Sudeep Holla Cc: Mark Rutland Cc: Mark Salter Cc: Al Stone [catalin.marinas@arm.com: Added WARN_ONCE(!acpi_parking_protocol_valid() on the IPI] Signed-off-by: Catalin Marinas (cherry picked from commit 5e89c55e4ed81d7abb1ce8828db35fa389dc0e90) Signed-off-by: Alex Shi --- arch/arm64/Kconfig | 9 ++ arch/arm64/include/asm/acpi.h | 19 +++- arch/arm64/include/asm/hardirq.h | 2 +- arch/arm64/include/asm/smp.h | 9 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/acpi_parking_protocol.c | 153 ++++++++++++++++++++++++++++++ arch/arm64/kernel/cpu_ops.c | 27 +++++- arch/arm64/kernel/smp.c | 28 ++++++ 8 files changed, 242 insertions(+), 6 deletions(-) create mode 100644 arch/arm64/kernel/acpi_parking_protocol.c (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ffa3c549a4ba..4a1b665d90dc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -706,6 +706,15 @@ endmenu menu "Boot options" +config ARM64_ACPI_PARKING_PROTOCOL + bool "Enable support for the ARM64 ACPI parking protocol" + depends on ACPI + help + Enable support for the ARM64 ACPI parking protocol. If disabled + the kernel will not allow booting through the ARM64 ACPI parking + protocol even if the corresponding data is present in the ACPI + MADT table. + config CMDLINE string "Default kernel command string" default "" diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index caafd63b8092..aee323b13802 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -87,9 +87,26 @@ void __init acpi_init_cpus(void); static inline void acpi_init_cpus(void) { } #endif /* CONFIG_ACPI */ +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +bool acpi_parking_protocol_valid(int cpu); +void __init +acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor); +#else +static inline bool acpi_parking_protocol_valid(int cpu) { return false; } +static inline void +acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor) +{} +#endif + static inline const char *acpi_get_enable_method(int cpu) { - return acpi_psci_present() ? 
"psci" : NULL; + if (acpi_psci_present()) + return "psci"; + + if (acpi_parking_protocol_valid(cpu)) + return "parking-protocol"; + + return NULL; } #ifdef CONFIG_ACPI_APEI diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h index a57601f9d17c..8740297dac77 100644 --- a/arch/arm64/include/asm/hardirq.h +++ b/arch/arm64/include/asm/hardirq.h @@ -20,7 +20,7 @@ #include #include -#define NR_IPI 5 +#define NR_IPI 6 typedef struct { unsigned int __softirq_pending; diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index d9c3d6a6100a..2013a4dc5124 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -64,6 +64,15 @@ extern void secondary_entry(void); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask); +#else +static inline void arch_send_wakeup_ipi_mask(const struct cpumask *mask) +{ + BUILD_BUG(); +} +#endif + extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 474691f8b13a..c4e2f70c0aa0 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -41,6 +41,7 @@ arm64-obj-$(CONFIG_EFI) += efi.o efi-entry.stub.o arm64-obj-$(CONFIG_PCI) += pci.o arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o arm64-obj-$(CONFIG_ACPI) += acpi.o +arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c new file mode 100644 index 000000000000..4b1e5a7a98da --- /dev/null +++ b/arch/arm64/kernel/acpi_parking_protocol.c @@ -0,0 +1,153 @@ +/* + * ARM64 ACPI Parking Protocol implementation + * + * Authors: Lorenzo Pieralisi + * Mark Salter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#include +#include + +#include + +struct cpu_mailbox_entry { + phys_addr_t mailbox_addr; + u8 version; + u8 gic_cpu_id; +}; + +static struct cpu_mailbox_entry cpu_mailbox_entries[NR_CPUS]; + +void __init acpi_set_mailbox_entry(int cpu, + struct acpi_madt_generic_interrupt *p) +{ + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + + cpu_entry->mailbox_addr = p->parked_address; + cpu_entry->version = p->parking_version; + cpu_entry->gic_cpu_id = p->cpu_interface_number; +} + +bool acpi_parking_protocol_valid(int cpu) +{ + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + + return cpu_entry->mailbox_addr && cpu_entry->version; +} + +static int acpi_parking_protocol_cpu_init(unsigned int cpu) +{ + pr_debug("%s: ACPI parked addr=%llx\n", __func__, + cpu_mailbox_entries[cpu].mailbox_addr); + + return 0; +} + +static int acpi_parking_protocol_cpu_prepare(unsigned int cpu) +{ + return 0; +} + +struct parking_protocol_mailbox { + __le32 cpu_id; + __le32 reserved; + __le64 entry_point; +}; + +static int acpi_parking_protocol_cpu_boot(unsigned int cpu) +{ + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + struct parking_protocol_mailbox __iomem *mailbox; + __le32 cpu_id; + + /* + * Map mailbox memory with attribute device nGnRE (ie ioremap - + * this deviates from the parking protocol specifications since + * the mailboxes are required to be mapped nGnRnE; the attribute + * discrepancy is harmless insofar as the protocol specification + * is concerned). + * If the mailbox is mistakenly allocated in the linear mapping + * by FW ioremap will fail since the mapping will be prevented + * by the kernel (it clashes with the linear mapping attributes + * specifications). + */ + mailbox = ioremap(cpu_entry->mailbox_addr, sizeof(*mailbox)); + if (!mailbox) + return -EIO; + + cpu_id = readl_relaxed(&mailbox->cpu_id); + /* + * Check if firmware has set-up the mailbox entry properly + * before kickstarting the respective cpu. + */ + if (cpu_id != ~0U) { + iounmap(mailbox); + return -ENXIO; + } + + /* + * We write the entry point and cpu id as LE regardless of the + * native endianness of the kernel. Therefore, any boot-loaders + * that read this address need to convert this address to the + * Boot-Loader's endianness before jumping. + */ + writeq_relaxed(__pa(secondary_entry), &mailbox->entry_point); + writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id); + + arch_send_wakeup_ipi_mask(cpumask_of(cpu)); + + iounmap(mailbox); + + return 0; +} + +static void acpi_parking_protocol_cpu_postboot(void) +{ + int cpu = smp_processor_id(); + struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu]; + struct parking_protocol_mailbox __iomem *mailbox; + __le64 entry_point; + + /* + * Map mailbox memory with attribute device nGnRE (ie ioremap - + * this deviates from the parking protocol specifications since + * the mailboxes are required to be mapped nGnRnE; the attribute + * discrepancy is harmless insofar as the protocol specification + * is concerned). + * If the mailbox is mistakenly allocated in the linear mapping + * by FW ioremap will fail since the mapping will be prevented + * by the kernel (it clashes with the linear mapping attributes + * specifications). + */ + mailbox = ioremap(cpu_entry->mailbox_addr, sizeof(*mailbox)); + if (!mailbox) + return; + + entry_point = readl_relaxed(&mailbox->entry_point); + /* + * Check if firmware has cleared the entry_point as expected + * by the protocol specification. 
+ */ + WARN_ON(entry_point); + + iounmap(mailbox); +} + +const struct cpu_operations acpi_parking_protocol_ops = { + .name = "parking-protocol", + .cpu_init = acpi_parking_protocol_cpu_init, + .cpu_prepare = acpi_parking_protocol_cpu_prepare, + .cpu_boot = acpi_parking_protocol_cpu_boot, + .cpu_postboot = acpi_parking_protocol_cpu_postboot +}; diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c index b6bd7d447768..c7cfb8fe06f9 100644 --- a/arch/arm64/kernel/cpu_ops.c +++ b/arch/arm64/kernel/cpu_ops.c @@ -25,19 +25,30 @@ #include extern const struct cpu_operations smp_spin_table_ops; +extern const struct cpu_operations acpi_parking_protocol_ops; extern const struct cpu_operations cpu_psci_ops; const struct cpu_operations *cpu_ops[NR_CPUS]; -static const struct cpu_operations *supported_cpu_ops[] __initconst = { +static const struct cpu_operations *dt_supported_cpu_ops[] __initconst = { &smp_spin_table_ops, &cpu_psci_ops, NULL, }; +static const struct cpu_operations *acpi_supported_cpu_ops[] __initconst = { +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL + &acpi_parking_protocol_ops, +#endif + &cpu_psci_ops, + NULL, +}; + static const struct cpu_operations * __init cpu_get_ops(const char *name) { - const struct cpu_operations **ops = supported_cpu_ops; + const struct cpu_operations **ops; + + ops = acpi_disabled ? dt_supported_cpu_ops : acpi_supported_cpu_ops; while (*ops) { if (!strcmp(name, (*ops)->name)) @@ -75,8 +86,16 @@ static const char *__init cpu_read_enable_method(int cpu) } } else { enable_method = acpi_get_enable_method(cpu); - if (!enable_method) - pr_err("Unsupported ACPI enable-method\n"); + if (!enable_method) { + /* + * In ACPI systems the boot CPU does not require + * checking the enable method since for some + * boot protocol (ie parking protocol) it need not + * be initialized. Don't warn spuriously. + */ + if (cpu != 0) + pr_err("Unsupported ACPI enable-method\n"); + } } return enable_method; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 68e7f79630d4..24cb4f800033 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -70,6 +70,7 @@ enum ipi_msg_type { IPI_CPU_STOP, IPI_TIMER, IPI_IRQ_WORK, + IPI_WAKEUP }; /* @@ -443,6 +444,17 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) /* map the logical cpu id to cpu MPIDR */ cpu_logical_map(cpu_count) = hwid; + /* + * Set-up the ACPI parking protocol cpu entries + * while initializing the cpu_logical_map to + * avoid parsing MADT entries multiple times for + * nothing (ie a valid cpu_logical_map entry should + * contain a valid parking protocol data set to + * initialize the cpu if the parking protocol is + * the only available enable method). 
+ */ + acpi_set_mailbox_entry(cpu_count, processor); + cpu_count++; } @@ -625,6 +637,7 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = { S(IPI_CPU_STOP, "CPU stop interrupts"), S(IPI_TIMER, "Timer broadcast interrupts"), S(IPI_IRQ_WORK, "IRQ work interrupts"), + S(IPI_WAKEUP, "CPU wake-up interrupts"), }; static void smp_cross_call(const struct cpumask *target, unsigned int ipinr) @@ -668,6 +681,13 @@ void arch_send_call_function_single_ipi(int cpu) smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC); } +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL +void arch_send_wakeup_ipi_mask(const struct cpumask *mask) +{ + smp_cross_call(mask, IPI_WAKEUP); +} +#endif + #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { @@ -745,6 +765,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; #endif +#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL + case IPI_WAKEUP: + WARN_ONCE(!acpi_parking_protocol_valid(cpu), + "CPU%u: Wake-up IPI outside the ACPI parking protocol\n", + cpu); + break; +#endif + default: pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); break; -- cgit v1.2.3 From a0e40450cf255994501d3f84081f95b1fb41623d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 27 Jan 2016 10:50:19 +0100 Subject: arm64: allow vmalloc regions to be set with set_memory_* The range of set_memory_* is currently restricted to the module address range because of difficulties in breaking down larger block sizes. vmalloc maps PAGE_SIZE pages so it is safe to use as well. Update the function ranges and add a comment explaining why the range is restricted the way it is. Suggested-by: Laura Abbott Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon (cherry picked from commit 95f5c80050ad723163aa80dc8bffd48ef4afc6d5) Signed-off-by: Alex Shi --- arch/arm64/mm/pageattr.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index cf6240741134..0795c3a36d8f 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,7 @@ static int change_memory_common(unsigned long addr, int numpages, unsigned long end = start + size; int ret; struct page_change_data data; + struct vm_struct *area; if (!PAGE_ALIGNED(addr)) { start &= PAGE_MASK; @@ -51,10 +53,23 @@ static int change_memory_common(unsigned long addr, int numpages, WARN_ON_ONCE(1); } - if (start < MODULES_VADDR || start >= MODULES_END) - return -EINVAL; - - if (end < MODULES_VADDR || end >= MODULES_END) + /* + * Kernel VA mappings are always live, and splitting live section + * mappings into page mappings may cause TLB conflicts. This means + * we have to ensure that changing the permission bits of the range + * we are operating on does not result in such splitting. + * + * Let's restrict ourselves to mappings created by vmalloc (or vmap). + * Those are guaranteed to consist entirely of page mappings, and + * splitting is never needed. + * + * So check whether the [addr, addr + size) interval is entirely + * covered by precisely one VM area that has the VM_ALLOC flag set. 
+ */ + area = find_vm_area((void *)addr); + if (!area || + end > (unsigned long)area->addr + area->size || + !(area->flags & VM_ALLOC)) return -EINVAL; if (!numpages) -- cgit v1.2.3 From 41cb2829d020e4fdaeb5eb9286153f4c7dc8e7dd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Feb 2016 12:46:23 +0000 Subject: arm64: prefetch: don't provide spin_lock_prefetch with LSE The LSE atomics rely on us not dirtying data at L1 if we can avoid it, otherwise many of the potential scalability benefits are lost. This patch replaces spin_lock_prefetch with a nop when the LSE atomics are in use, so that users don't shoot themselves in the foot by causing needless coherence traffic at L1. Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas (cherry picked from commit cd5e10bdf3795d22f10787bb1991c43798c885d5) Signed-off-by: Alex Shi --- arch/arm64/include/asm/processor.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 4acb7ca94fcd..31b76fce4477 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -29,6 +29,7 @@ #include +#include #include #include #include @@ -177,9 +178,11 @@ static inline void prefetchw(const void *ptr) } #define ARCH_HAS_SPINLOCK_PREFETCH -static inline void spin_lock_prefetch(const void *x) +static inline void spin_lock_prefetch(const void *ptr) { - prefetchw(x); + asm volatile(ARM64_LSE_ATOMIC_INSN( + "prfm pstl1strm, %a0", + "nop") : : "p" (ptr)); } #define HAVE_ARCH_PICK_MMAP_LAYOUT -- cgit v1.2.3 From 742e490adaa444e9657528ef38dde35f2916c793 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Feb 2016 12:46:24 +0000 Subject: arm64: prefetch: add alternative pattern for CPUs without a prefetcher Most CPUs have a hardware prefetcher which generally performs better without explicit prefetch instructions issued by software, however some CPUs (e.g. Cavium ThunderX) rely solely on explicit prefetch instructions. This patch adds an alternative pattern (ARM64_HAS_NO_HW_PREFETCH) to allow our library code to make use of explicit prefetch instructions during things like copy routines only when the CPU does not have the capability to perform the prefetching itself. 
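Call sites can then be patched at boot with the alternatives framework,
along these lines (a sketch of the pattern; the copy_page patches later
in this series are the real users):

alternative_if_not ARM64_HAS_NO_HW_PREFETCH
	nop
alternative_else
	# Explicit software prefetch for CPUs without a HW prefetcher.
	prfm	pldl1strm, [x1, #128]
alternative_endif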
Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas (cherry picked from commit d5370f754875460662abe8561388e019d90dd0c4) Signed-off-by: Alex Shi --- arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/include/asm/cputype.h | 17 ++++++++++++++++- arch/arm64/kernel/cpu_errata.c | 18 +++--------------- arch/arm64/kernel/cpufeature.c | 17 +++++++++++++++++ 4 files changed, 38 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8f271b83f910..8d56bd8550dc 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -30,8 +30,9 @@ #define ARM64_HAS_LSE_ATOMICS 5 #define ARM64_WORKAROUND_CAVIUM_23154 6 #define ARM64_WORKAROUND_834220 7 +#define ARM64_HAS_NO_HW_PREFETCH 8 -#define ARM64_NCAPS 8 +#define ARM64_NCAPS 9 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 1a5949364ed0..7540284a17fe 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -57,11 +57,22 @@ #define MIDR_IMPLEMENTOR(midr) \ (((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT) -#define MIDR_CPU_PART(imp, partnum) \ +#define MIDR_CPU_MODEL(imp, partnum) \ (((imp) << MIDR_IMPLEMENTOR_SHIFT) | \ (0xf << MIDR_ARCHITECTURE_SHIFT) | \ ((partnum) << MIDR_PARTNUM_SHIFT)) +#define MIDR_CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \ + MIDR_ARCHITECTURE_MASK) + +#define MIDR_IS_CPU_MODEL_RANGE(midr, model, rv_min, rv_max) \ +({ \ + u32 _model = (midr) & MIDR_CPU_MODEL_MASK; \ + u32 rv = (midr) & (MIDR_REVISION_MASK | MIDR_VARIANT_MASK); \ + \ + _model == (model) && rv >= (rv_min) && rv <= (rv_max); \ + }) + #define ARM_CPU_IMP_ARM 0x41 #define ARM_CPU_IMP_APM 0x50 #define ARM_CPU_IMP_CAVIUM 0x43 @@ -75,6 +86,10 @@ #define CAVIUM_CPU_PART_THUNDERX 0x0A1 +#define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) +#define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) +#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) + #ifndef __ASSEMBLY__ /* diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index feb6b4efa641..e6bc988e8dbf 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -21,24 +21,12 @@ #include #include -#define MIDR_CORTEX_A53 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) -#define MIDR_CORTEX_A57 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) -#define MIDR_THUNDERX MIDR_CPU_PART(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) - -#define CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \ - MIDR_ARCHITECTURE_MASK) - static bool __maybe_unused is_affected_midr_range(const struct arm64_cpu_capabilities *entry) { - u32 midr = read_cpuid_id(); - - if ((midr & CPU_MODEL_MASK) != entry->midr_model) - return false; - - midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK; - - return (midr >= entry->midr_range_min && midr <= entry->midr_range_max); + return MIDR_IS_CPU_MODEL_RANGE(read_cpuid_id(), entry->midr_model, + entry->midr_range_min, + entry->midr_range_max); } #define MIDR_RANGE(model, min, max) \ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5c90aa490a2b..3615d7d7c9af 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -621,6 +621,18 @@ static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry) return has_sre; } 
+static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry) +{ + u32 midr = read_cpuid_id(); + u32 rv_min, rv_max; + + /* Cavium ThunderX pass 1.x and 2.x */ + rv_min = 0; + rv_max = (1 << MIDR_VARIANT_SHIFT) | MIDR_REVISION_MASK; + + return MIDR_IS_CPU_MODEL_RANGE(midr, MIDR_THUNDERX, rv_min, rv_max); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -651,6 +663,11 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = 2, }, #endif /* CONFIG_AS_LSE && CONFIG_ARM64_LSE_ATOMICS */ + { + .desc = "Software prefetching using PRFM", + .capability = ARM64_HAS_NO_HW_PREFETCH, + .matches = has_no_hw_prefetch, + }, {}, }; -- cgit v1.2.3 From 93c384820cf3c1db51073e746980866d2bce8af9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Feb 2016 12:46:25 +0000 Subject: arm64: lib: improve copy_page to deal with 128 bytes at a time We want to avoid lots of different copy_page implementations, settling for something that is "good enough" everywhere and hopefully easy to understand and maintain whilst we're at it. This patch reworks our copy_page implementation based on discussions with Cavium on the list and benchmarking on Cortex-A processors so that: - The loop is unrolled to copy 128 bytes per iteration - The reads are offset so that we read from the next 128-byte block in the same iteration that we store the previous block - Explicit prefetch instructions are removed for now, since they hurt performance on CPUs with hardware prefetching - The loop exit condition is calculated at the start of the loop Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas (cherry picked from commit 223e23e8aa26b0bb62c597637e77295e14f6a62c) Signed-off-by: Alex Shi --- arch/arm64/lib/copy_page.S | 46 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 512b9a7b980e..2534533ceb1d 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -27,20 +27,50 @@ * x1 - src */ ENTRY(copy_page) - /* Assume cache line size is 64 bytes. 
*/ - prfm pldl1strm, [x1, #64] -1: ldp x2, x3, [x1] + ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] ldp x8, x9, [x1, #48] - add x1, x1, #64 - prfm pldl1strm, [x1, #64] + ldp x10, x11, [x1, #64] + ldp x12, x13, [x1, #80] + ldp x14, x15, [x1, #96] + ldp x16, x17, [x1, #112] + + mov x18, #(PAGE_SIZE - 128) + add x1, x1, #128 +1: + subs x18, x18, #128 + stnp x2, x3, [x0] + ldp x2, x3, [x1] stnp x4, x5, [x0, #16] + ldp x4, x5, [x1, #16] stnp x6, x7, [x0, #32] + ldp x6, x7, [x1, #32] stnp x8, x9, [x0, #48] - add x0, x0, #64 - tst x1, #(PAGE_SIZE - 1) - b.ne 1b + ldp x8, x9, [x1, #48] + stnp x10, x11, [x0, #64] + ldp x10, x11, [x1, #64] + stnp x12, x13, [x0, #80] + ldp x12, x13, [x1, #80] + stnp x14, x15, [x0, #96] + ldp x14, x15, [x1, #96] + stnp x16, x17, [x0, #112] + ldp x16, x17, [x1, #112] + + add x0, x0, #128 + add x1, x1, #128 + + b.gt 1b + + stnp x2, x3, [x0] + stnp x4, x5, [x0, #16] + stnp x6, x7, [x0, #32] + stnp x8, x9, [x0, #48] + stnp x10, x11, [x0, #64] + stnp x12, x13, [x0, #80] + stnp x14, x15, [x0, #96] + stnp x16, x17, [x0, #112] + ret ENDPROC(copy_page) -- cgit v1.2.3 From e46018fe4fd741b5f58b6c3cf8b94ed34603c325 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Tue, 2 Feb 2016 12:46:26 +0000 Subject: arm64: lib: patch in prfm for copy_page if requested On ThunderX T88 pass 1 and pass 2, there is no hardware prefetching so we need to patch in explicit software prefetching instructions Prefetching improves this code by 60% over the original code and 2x over the code without prefetching for the affected hardware using the benchmark code at https://github.com/apinski-cavium/copy_page_benchmark Signed-off-by: Andrew Pinski Signed-off-by: Will Deacon Tested-by: Andrew Pinski Signed-off-by: Catalin Marinas (cherry picked from commit 60e0a09db24adc8809696307e5d97cc4ba7cb3e0) Signed-off-by: Alex Shi --- arch/arm64/lib/copy_page.S | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch') diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 2534533ceb1d..4c1e700840b6 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -18,6 +18,8 @@ #include #include #include +#include +#include /* * Copy a page from src to dest (both are page aligned) @@ -27,6 +29,15 @@ * x1 - src */ ENTRY(copy_page) +alternative_if_not ARM64_HAS_NO_HW_PREFETCH + nop + nop +alternative_else + # Prefetch two cache lines ahead. + prfm pldl1strm, [x1, #128] + prfm pldl1strm, [x1, #256] +alternative_endif + ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] @@ -41,6 +52,12 @@ ENTRY(copy_page) 1: subs x18, x18, #128 +alternative_if_not ARM64_HAS_NO_HW_PREFETCH + nop +alternative_else + prfm pldl1strm, [x1, #384] +alternative_endif + stnp x2, x3, [x0] ldp x2, x3, [x1] stnp x4, x5, [x0, #16] -- cgit v1.2.3 From a97b93b11bd9f76b0cba55b8ff04a7489b087caf Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 10 Feb 2016 10:07:30 +0000 Subject: arm64: prefetch: add missing #include for spin_lock_prefetch As of 52e662326e1e ("arm64: prefetch: don't provide spin_lock_prefetch with LSE"), spin_lock_prefetch is patched at runtime when the LSE atomics are in use. This relies on the ARM64_LSE_ATOMIC_INSN macro to drive the alternatives framework, but that macro is only available via asm/lse.h, which isn't explicitly included in processor.h. 
Consequently, drivers can run into build failures such as:

   In file included from include/linux/prefetch.h:14:0,
                    from drivers/net/ethernet/intel/i40e/i40e_txrx.c:27:
   arch/arm64/include/asm/processor.h: In function 'spin_lock_prefetch':
   arch/arm64/include/asm/processor.h:183:15: error: expected string
   literal before 'ARM64_LSE_ATOMIC_INSN'
     asm volatile(ARM64_LSE_ATOMIC_INSN(

This patch adds the missing include and gets things building again.

Reported-by: kbuild test robot
Signed-off-by: Will Deacon
Signed-off-by: Catalin Marinas
(cherry picked from commit afb83cc3f0e4f86ea0e1cc3db7a90f58f1abd4d5)
Signed-off-by: Alex Shi
---
 arch/arm64/include/asm/processor.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 31b76fce4477..5bb1d763d17a 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
-- cgit v1.2.3

From 65e670213029f7df5dd20407eecbe691aa078930 Mon Sep 17 00:00:00 2001
From: Laura Abbott
Date: Fri, 5 Feb 2016 16:24:46 -0800
Subject: arm64: Drop alloc function from create_mapping

create_mapping is only used in fixmap_remap_fdt. All the create_mapping
calls need to happen on existing translation table pages without
additional allocations. Rather than have an alloc function be called
and fail, just set it to NULL and catch its use. Also change the name
to create_mapping_noalloc to better capture what exactly is going on.

Reviewed-by: Ard Biesheuvel
Reviewed-by: Mark Rutland
Signed-off-by: Laura Abbott
Signed-off-by: Catalin Marinas
(cherry picked from commit 132233a759580f5ce9b1bfaac9073e47d03c460d)
Signed-off-by: Alex Shi
---
 arch/arm64/mm/mmu.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 4874d2fea1c9..3096240e6eb8 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -116,7 +116,9 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
 	pte_t *pte;

 	if (pmd_none(*pmd) || pmd_sect(*pmd)) {
-		phys_addr_t pte_phys = pgtable_alloc();
+		phys_addr_t pte_phys;
+		BUG_ON(!pgtable_alloc);
+		pte_phys = pgtable_alloc();
 		pte = pte_set_fixmap(pte_phys);
 		if (pmd_sect(*pmd))
 			split_pmd(pmd, pte);
@@ -158,7 +160,9 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
 	 * Check for initial section mappings in the pgd/pud and remove them.
 	 */
 	if (pud_none(*pud) || pud_sect(*pud)) {
-		phys_addr_t pmd_phys = pgtable_alloc();
+		phys_addr_t pmd_phys;
+		BUG_ON(!pgtable_alloc);
+		pmd_phys = pgtable_alloc();
 		pmd = pmd_set_fixmap(pmd_phys);
 		if (pud_sect(*pud)) {
 			/*
@@ -223,7 +227,9 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
 	unsigned long next;

 	if (pgd_none(*pgd)) {
-		phys_addr_t pud_phys = pgtable_alloc();
+		phys_addr_t pud_phys;
+		BUG_ON(!pgtable_alloc);
+		pud_phys = pgtable_alloc();
 		__pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE);
 	}
 	BUG_ON(pgd_bad(*pgd));
@@ -312,7 +318,12 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 	init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc);
 }

+/*
+ * This function can only be used to modify existing table entries,
+ * without allocating new levels of table. Note that this permits the
+ * creation of new section or page entries.
+ */
+static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
				  phys_addr_t size, pgprot_t prot)
 {
 	if (virt < VMALLOC_START) {
@@ -321,7 +332,7 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt,
 		return;
 	}
 	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot,
-			     early_pgtable_alloc);
+			     NULL);
 }

 void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
@@ -678,7 +689,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
 	/*
 	 * Make sure that the FDT region can be mapped without the need to
 	 * allocate additional translation table pages, so that it is safe
-	 * to call create_mapping() this early.
+	 * to call create_mapping_noalloc() this early.
 	 *
 	 * On 64k pages, the FDT will be mapped using PTEs, so we need to
 	 * be in the same PMD as the rest of the fixmap.
@@ -694,8 +705,8 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
 	dt_virt = (void *)dt_virt_base + offset;

 	/* map the first chunk so we can read the size from the header */
-	create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
-		       SWAPPER_BLOCK_SIZE, prot);
+	create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
+			dt_virt_base, SWAPPER_BLOCK_SIZE, prot);

 	if (fdt_check_header(dt_virt) != 0)
 		return NULL;
@@ -705,7 +716,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
 		return NULL;

 	if (offset + size > SWAPPER_BLOCK_SIZE)
-		create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
+		create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
 			       round_up(offset + size, SWAPPER_BLOCK_SIZE), prot);

 	memblock_reserve(dt_phys, size);
-- cgit v1.2.3

From e4d0298cdff23a21e532291df7a4fb6e0a908be4 Mon Sep 17 00:00:00 2001
From: Laura Abbott
Date: Fri, 5 Feb 2016 16:24:47 -0800
Subject: arm64: Add support for ARCH_SUPPORTS_DEBUG_PAGEALLOC

ARCH_SUPPORTS_DEBUG_PAGEALLOC provides a hook to map and unmap pages
for debugging purposes. This requires memory be mapped with PAGE_SIZE
mappings since breaking down larger mappings at runtime will lead to
TLB conflicts. Check if debug_pagealloc is enabled at runtime and if so,
map everything with PAGE_SIZE pages. Implement the functions to actually
map/unmap the pages at runtime.

Reviewed-by: Ard Biesheuvel
Reviewed-by: Mark Rutland
Tested-by: Mark Rutland
Signed-off-by: Laura Abbott
[catalin.marinas@arm.com: static annotation block_mappings_allowed() and #ifdef]
Signed-off-by: Catalin Marinas
(cherry picked from commit 83863f25e4b8214e994ef8b5647aad614d74b45d)
Signed-off-by: Alex Shi
---
 arch/arm64/Kconfig       |  3 +++
 arch/arm64/mm/mmu.c      | 26 ++++++++++++++++++++++++--
 arch/arm64/mm/pageattr.c | 46 ++++++++++++++++++++++++++++++++++++----------
 3 files changed, 63 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 4a1b665d90dc..98992dee9a29 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -507,6 +507,9 @@ config HOTPLUG_CPU
 source kernel/Kconfig.preempt
 source kernel/Kconfig.hz

+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+	def_bool y
+
 config ARCH_HAS_HOLES_MEMORYMODEL
 	def_bool y if SPARSEMEM

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 3096240e6eb8..d1fa678355c9 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -149,6 +149,26 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd)
 	} while (pmd++, i++, i < PTRS_PER_PMD);
 }

+#ifdef CONFIG_DEBUG_PAGEALLOC
+static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void))
+{
+
+	/*
+	 * If debug_page_alloc is enabled we must map the linear map
+	 * using pages.
However, other mappings created by + * create_mapping_noalloc must use sections in some cases. Allow + * sections to be used in those cases, where no pgtable_alloc + * function is provided. + */ + return !pgtable_alloc || !debug_pagealloc_enabled(); +} +#else +static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void)) +{ + return true; +} +#endif + static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) @@ -181,7 +201,8 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, do { next = pmd_addr_end(addr, end); /* try section mapping first */ - if (((addr | next | phys) & ~SECTION_MASK) == 0) { + if (((addr | next | phys) & ~SECTION_MASK) == 0 && + block_mappings_allowed(pgtable_alloc)) { pmd_t old_pmd =*pmd; set_pmd(pmd, __pmd(phys | pgprot_val(mk_sect_prot(prot)))); @@ -241,7 +262,8 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, /* * For 4K granule only, attempt to put down a 1GB block */ - if (use_1G_block(addr, next, phys)) { + if (use_1G_block(addr, next, phys) && + block_mappings_allowed(pgtable_alloc)) { pud_t old_pud = *pud; set_pud(pud, __pud(phys | pgprot_val(mk_sect_prot(prot)))); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 0795c3a36d8f..ca6d268e3313 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -37,14 +37,31 @@ static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr, return 0; } +/* + * This function assumes that the range is mapped with PAGE_SIZE pages. + */ +static int __change_memory_common(unsigned long start, unsigned long size, + pgprot_t set_mask, pgprot_t clear_mask) +{ + struct page_change_data data; + int ret; + + data.set_mask = set_mask; + data.clear_mask = clear_mask; + + ret = apply_to_page_range(&init_mm, start, size, change_page_range, + &data); + + flush_tlb_kernel_range(start, start + size); + return ret; +} + static int change_memory_common(unsigned long addr, int numpages, pgprot_t set_mask, pgprot_t clear_mask) { unsigned long start = addr; unsigned long size = PAGE_SIZE*numpages; unsigned long end = start + size; - int ret; - struct page_change_data data; struct vm_struct *area; if (!PAGE_ALIGNED(addr)) { @@ -75,14 +92,7 @@ static int change_memory_common(unsigned long addr, int numpages, if (!numpages) return 0; - data.set_mask = set_mask; - data.clear_mask = clear_mask; - - ret = apply_to_page_range(&init_mm, start, size, change_page_range, - &data); - - flush_tlb_kernel_range(start, end); - return ret; + return __change_memory_common(start, size, set_mask, clear_mask); } int set_memory_ro(unsigned long addr, int numpages) @@ -114,3 +124,19 @@ int set_memory_x(unsigned long addr, int numpages) __pgprot(PTE_PXN)); } EXPORT_SYMBOL_GPL(set_memory_x); + +#ifdef CONFIG_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long addr = (unsigned long) page_address(page); + + if (enable) + __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(PTE_VALID), + __pgprot(0)); + else + __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(0), + __pgprot(PTE_VALID)); +} +#endif -- cgit v1.2.3 From 5ca7d16080e32bb34f8d2ead86adde49d1c4c652 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Fri, 5 Feb 2016 16:24:48 -0800 Subject: arm64: ptdump: Indicate whether memory should be faulting With CONFIG_DEBUG_PAGEALLOC, pages do not have the valid bit set when free in the buddy 
allocator. Add an indication to the page table dumping code that the
valid bit is not set, 'F' for fault, to make this easier to understand.

Reviewed-by: Ard Biesheuvel
Reviewed-by: Mark Rutland
Tested-by: Mark Rutland
Signed-off-by: Laura Abbott
Signed-off-by: Catalin Marinas
(cherry picked from commit d7e9d59494a9a5d83274f5af2148b82ca22dff3f)
Signed-off-by: Alex Shi
---
 arch/arm64/mm/dump.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch')

diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
index 0adbebbc2803..0841b2bf0e6a 100644
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/dump.c
@@ -90,6 +90,11 @@ struct prot_bits {

 static const struct prot_bits pte_bits[] = {
 	{
+		.mask	= PTE_VALID,
+		.val	= PTE_VALID,
+		.set	= " ",
+		.clear	= "F",
+	}, {
 		.mask	= PTE_USER,
 		.val	= PTE_USER,
 		.set	= "USR",
-- cgit v1.2.3

From 1d1e6a82d643537e9b80cf58f887b2be616aa515 Mon Sep 17 00:00:00 2001
From: Yang Shi
Date: Fri, 5 Feb 2016 15:50:18 -0800
Subject: arm64: ubsan: select ARCH_HAS_UBSAN_SANITIZE_ALL

To enable UBSAN on arm64, ARCH_HAS_UBSAN_SANITIZE_ALL needs to be
selected. A basic kernel bootup test passes on arm64 with
CONFIG_UBSAN_SANITIZE_ALL enabled.

Signed-off-by: Yang Shi
Acked-by: Andrey Ryabinin
Tested-by: Mark Rutland
Signed-off-by: Catalin Marinas
(cherry picked from commit f0b7f8a4b44657386273a67179dd901c81cd11a6)
Signed-off-by: Alex Shi
---
 arch/arm64/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 98992dee9a29..1420102341d0 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -13,6 +13,7 @@ config ARM64
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
 	select ARCH_WANT_FRAME_POINTERS
+	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARM_AMBA
 	select ARM_ARCH_TIMER
 	select ARM_GIC
-- cgit v1.2.3

From 391a428880ad64f0ab9dcf48f4666ba2a6aa7e76 Mon Sep 17 00:00:00 2001
From: David Brown
Date: Wed, 10 Feb 2016 13:52:22 -0800
Subject: arm64: vdso: Mark vDSO code as read-only

Although the arm64 vDSO is cleanly separated by code/data with the
code being read-only in userspace mappings, the code page is still
writable from the kernel. There have been exploits (such as
http://itszn.com/blog/?p=21) that take advantage of this on x86 to go
from a bad kernel write to full root. Prevent this specific exploit on
arm64 by putting the vDSO code page in read-only memory as well.
Before the change: [ 3.138366] vdso: 2 pages (1 code @ ffffffc000a71000, 1 data @ ffffffc000a70000) ---[ Kernel Mapping ]--- 0xffffffc000000000-0xffffffc000082000 520K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000082000-0xffffffc000200000 1528K ro x SHD AF UXN MEM/NORMAL 0xffffffc000200000-0xffffffc000800000 6M ro x SHD AF BLK UXN MEM/NORMAL 0xffffffc000800000-0xffffffc0009b6000 1752K ro x SHD AF UXN MEM/NORMAL 0xffffffc0009b6000-0xffffffc000c00000 2344K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000c00000-0xffffffc008000000 116M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc00c000000-0xffffffc07f000000 1840M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc800000000-0xffffffc840000000 1G RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc840000000-0xffffffc87ae00000 942M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87ae00000-0xffffffc87ae70000 448K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af80000-0xffffffc87af8a000 40K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af8b000-0xffffffc87b000000 468K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87b000000-0xffffffc87fe00000 78M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87fe00000-0xffffffc87ff50000 1344K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87ff90000-0xffffffc87ffa0000 64K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87fff0000-0xffffffc880000000 64K RW NX SHD AF UXN MEM/NORMAL After: [ 3.138368] vdso: 2 pages (1 code @ ffffffc0006de000, 1 data @ ffffffc000a74000) ---[ Kernel Mapping ]--- 0xffffffc000000000-0xffffffc000082000 520K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000082000-0xffffffc000200000 1528K ro x SHD AF UXN MEM/NORMAL 0xffffffc000200000-0xffffffc000800000 6M ro x SHD AF BLK UXN MEM/NORMAL 0xffffffc000800000-0xffffffc0009b8000 1760K ro x SHD AF UXN MEM/NORMAL 0xffffffc0009b8000-0xffffffc000c00000 2336K RW NX SHD AF UXN MEM/NORMAL 0xffffffc000c00000-0xffffffc008000000 116M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc00c000000-0xffffffc07f000000 1840M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc800000000-0xffffffc840000000 1G RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc840000000-0xffffffc87ae00000 942M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87ae00000-0xffffffc87ae70000 448K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af80000-0xffffffc87af8a000 40K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87af8b000-0xffffffc87b000000 468K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87b000000-0xffffffc87fe00000 78M RW NX SHD AF BLK UXN MEM/NORMAL 0xffffffc87fe00000-0xffffffc87ff50000 1344K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87ff90000-0xffffffc87ffa0000 64K RW NX SHD AF UXN MEM/NORMAL 0xffffffc87fff0000-0xffffffc880000000 64K RW NX SHD AF UXN MEM/NORMAL Inspired by https://lkml.org/lkml/2016/1/19/494 based on work by the PaX Team, Brad Spengler, and Kees Cook. 
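(For reference, dumps like the above come from the arm64 page-table
dump facility. On a kernel of this vintage built with
CONFIG_ARM64_PTDUMP and with debugfs mounted in the usual place, they
can be read back with:

	# cat /sys/kernel/debug/kernel_page_tables

The config name and path are as in this kernel version and may differ
elsewhere.)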
Signed-off-by: David Brown Acked-by: Will Deacon Acked-by: Ard Biesheuvel [catalin.marinas@arm.com: removed superfluous __PAGE_ALIGNED_DATA] Signed-off-by: Catalin Marinas (cherry picked from commit 88d8a7994e564d209d4b2583496631c2357d386b) Signed-off-by: Alex Shi --- arch/arm64/kernel/vdso/vdso.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S index 60c1db54b41a..82379a70ef03 100644 --- a/arch/arm64/kernel/vdso/vdso.S +++ b/arch/arm64/kernel/vdso/vdso.S @@ -21,9 +21,8 @@ #include #include - __PAGE_ALIGNED_DATA - .globl vdso_start, vdso_end + .section .rodata .balign PAGE_SIZE vdso_start: .incbin "arch/arm64/kernel/vdso/vdso.so" -- cgit v1.2.3 From 866817f9f155879d1d28f0944958a542bb70475c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 15 Feb 2016 09:51:49 +0100 Subject: arm64: use local label prefixes for __reg_num symbols The __reg_num_xNN symbols that are used to implement the msr_s and mrs_s macros are recorded in the ELF metadata of each object file. This does not affect the size of the final binary, but it does clutter the output of tools like readelf, i.e., $ readelf -a vmlinux |grep -c __reg_num_x 50976 So let's use symbols with the .L prefix, these are strictly local, and don't end up in the object files. $ readelf -a vmlinux |grep -c __reg_num_x 0 Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 7abc7d833c9eb16efc8a59239d3771a6e30be367) Signed-off-by: Alex Shi --- arch/arm64/include/asm/sysreg.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index d48ab5b41f52..76907c94b11f 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -194,32 +194,32 @@ #ifdef __ASSEMBLY__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 - .equ __reg_num_x\num, \num + .equ .L__reg_num_x\num, \num .endr - .equ __reg_num_xzr, 31 + .equ .L__reg_num_xzr, 31 .macro mrs_s, rt, sreg - .inst 0xd5200000|(\sreg)|(__reg_num_\rt) + .inst 0xd5200000|(\sreg)|(.L__reg_num_\rt) .endm .macro msr_s, sreg, rt - .inst 0xd5000000|(\sreg)|(__reg_num_\rt) + .inst 0xd5000000|(\sreg)|(.L__reg_num_\rt) .endm #else asm( " .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" -" .equ __reg_num_x\\num, \\num\n" +" .equ .L__reg_num_x\\num, \\num\n" " .endr\n" -" .equ __reg_num_xzr, 31\n" +" .equ .L__reg_num_xzr, 31\n" "\n" " .macro mrs_s, rt, sreg\n" -" .inst 0xd5200000|(\\sreg)|(__reg_num_\\rt)\n" +" .inst 0xd5200000|(\\sreg)|(.L__reg_num_\\rt)\n" " .endm\n" "\n" " .macro msr_s, sreg, rt\n" -" .inst 0xd5000000|(\\sreg)|(__reg_num_\\rt)\n" +" .inst 0xd5000000|(\\sreg)|(.L__reg_num_\\rt)\n" " .endm\n" ); -- cgit v1.2.3 From 0b3419007e096a706ac4894a2cf28cd97b6028e1 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:46 +0000 Subject: arm64: cpufeature: Change read_cpuid() to use sysreg's mrs_s macro Older assemblers may not have support for newer feature registers. To get round this, sysreg.h provides a 'mrs_s' macro that takes a register encoding and generates the raw instruction. Change read_cpuid() to use mrs_s in all cases so that new registers don't have to be a special case. 
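For example, with the mrs_s macro from sysreg.h shown in the previous
patch, a read of a register the assembler may not know about can be
written as (a worked sketch, assuming the SYS_* encodings defined in
sysreg.h):

	mrs_s	x0, SYS_ID_AA64MMFR0_EL1
	/*
	 * Assembles to .inst 0xd5200000 | SYS_ID_AA64MMFR0_EL1 |
	 * .L__reg_num_x0, i.e. the raw MRS encoding with Rt = 0.
	 */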
Including sysreg.h means we need to move the include and definition of read_cpuid() after the #ifndef __ASSEMBLY__ to avoid syntax errors in vmlinux.lds. Signed-off-by: James Morse Acked-by: Mark Rutland Signed-off-by: Catalin Marinas (cherry picked from commit 0f54b14e76f5302afe164dc911b049b5df836ff5) Signed-off-by: Alex Shi --- arch/arm64/include/asm/cpufeature.h | 2 +- arch/arm64/include/asm/cputype.h | 20 +++++++------ arch/arm64/kernel/cpufeature.c | 58 ++++++++++++++++++------------------- arch/arm64/kernel/cpuinfo.c | 54 +++++++++++++++++----------------- arch/arm64/mm/context.c | 2 +- 5 files changed, 69 insertions(+), 67 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8d56bd8550dc..8131abfabb0a 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -177,7 +177,7 @@ u64 read_system_reg(u32 id); static inline bool cpu_supports_mixed_endian_el0(void) { - return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1)); + return id_aa64mmfr0_mixed_endian_el0(read_cpuid(SYS_ID_AA64MMFR0_EL1)); } static inline bool system_supports_mixed_endian_el0(void) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 7540284a17fe..b3a83da152a7 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -32,12 +32,6 @@ #define MPIDR_AFFINITY_LEVEL(mpidr, level) \ ((mpidr >> MPIDR_LEVEL_SHIFT(level)) & MPIDR_LEVEL_MASK) -#define read_cpuid(reg) ({ \ - u64 __val; \ - asm("mrs %0, " #reg : "=r" (__val)); \ - __val; \ -}) - #define MIDR_REVISION_MASK 0xf #define MIDR_REVISION(midr) ((midr) & MIDR_REVISION_MASK) #define MIDR_PARTNUM_SHIFT 4 @@ -92,6 +86,14 @@ #ifndef __ASSEMBLY__ +#include + +#define read_cpuid(reg) ({ \ + u64 __val; \ + asm("mrs_s %0, " __stringify(reg) : "=r" (__val)); \ + __val; \ +}) + /* * The CPU ID never changes at run time, so we might as well tell the * compiler that it's constant. 
Use this function to read the CPU ID @@ -99,12 +101,12 @@ */ static inline u32 __attribute_const__ read_cpuid_id(void) { - return read_cpuid(MIDR_EL1); + return read_cpuid(SYS_MIDR_EL1); } static inline u64 __attribute_const__ read_cpuid_mpidr(void) { - return read_cpuid(MPIDR_EL1); + return read_cpuid(SYS_MPIDR_EL1); } static inline unsigned int __attribute_const__ read_cpuid_implementor(void) @@ -119,7 +121,7 @@ static inline unsigned int __attribute_const__ read_cpuid_part_number(void) static inline u32 __attribute_const__ read_cpuid_cachetype(void) { - return read_cpuid(CTR_EL0); + return read_cpuid(SYS_CTR_EL0); } #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 3615d7d7c9af..1ef10e784031 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -808,35 +808,35 @@ static inline void set_sys_caps_initialised(void) static u64 __raw_read_system_reg(u32 sys_id) { switch (sys_id) { - case SYS_ID_PFR0_EL1: return (u64)read_cpuid(ID_PFR0_EL1); - case SYS_ID_PFR1_EL1: return (u64)read_cpuid(ID_PFR1_EL1); - case SYS_ID_DFR0_EL1: return (u64)read_cpuid(ID_DFR0_EL1); - case SYS_ID_MMFR0_EL1: return (u64)read_cpuid(ID_MMFR0_EL1); - case SYS_ID_MMFR1_EL1: return (u64)read_cpuid(ID_MMFR1_EL1); - case SYS_ID_MMFR2_EL1: return (u64)read_cpuid(ID_MMFR2_EL1); - case SYS_ID_MMFR3_EL1: return (u64)read_cpuid(ID_MMFR3_EL1); - case SYS_ID_ISAR0_EL1: return (u64)read_cpuid(ID_ISAR0_EL1); - case SYS_ID_ISAR1_EL1: return (u64)read_cpuid(ID_ISAR1_EL1); - case SYS_ID_ISAR2_EL1: return (u64)read_cpuid(ID_ISAR2_EL1); - case SYS_ID_ISAR3_EL1: return (u64)read_cpuid(ID_ISAR3_EL1); - case SYS_ID_ISAR4_EL1: return (u64)read_cpuid(ID_ISAR4_EL1); - case SYS_ID_ISAR5_EL1: return (u64)read_cpuid(ID_ISAR4_EL1); - case SYS_MVFR0_EL1: return (u64)read_cpuid(MVFR0_EL1); - case SYS_MVFR1_EL1: return (u64)read_cpuid(MVFR1_EL1); - case SYS_MVFR2_EL1: return (u64)read_cpuid(MVFR2_EL1); - - case SYS_ID_AA64PFR0_EL1: return (u64)read_cpuid(ID_AA64PFR0_EL1); - case SYS_ID_AA64PFR1_EL1: return (u64)read_cpuid(ID_AA64PFR0_EL1); - case SYS_ID_AA64DFR0_EL1: return (u64)read_cpuid(ID_AA64DFR0_EL1); - case SYS_ID_AA64DFR1_EL1: return (u64)read_cpuid(ID_AA64DFR0_EL1); - case SYS_ID_AA64MMFR0_EL1: return (u64)read_cpuid(ID_AA64MMFR0_EL1); - case SYS_ID_AA64MMFR1_EL1: return (u64)read_cpuid(ID_AA64MMFR1_EL1); - case SYS_ID_AA64ISAR0_EL1: return (u64)read_cpuid(ID_AA64ISAR0_EL1); - case SYS_ID_AA64ISAR1_EL1: return (u64)read_cpuid(ID_AA64ISAR1_EL1); - - case SYS_CNTFRQ_EL0: return (u64)read_cpuid(CNTFRQ_EL0); - case SYS_CTR_EL0: return (u64)read_cpuid(CTR_EL0); - case SYS_DCZID_EL0: return (u64)read_cpuid(DCZID_EL0); + case SYS_ID_PFR0_EL1: return read_cpuid(SYS_ID_PFR0_EL1); + case SYS_ID_PFR1_EL1: return read_cpuid(SYS_ID_PFR1_EL1); + case SYS_ID_DFR0_EL1: return read_cpuid(SYS_ID_DFR0_EL1); + case SYS_ID_MMFR0_EL1: return read_cpuid(SYS_ID_MMFR0_EL1); + case SYS_ID_MMFR1_EL1: return read_cpuid(SYS_ID_MMFR1_EL1); + case SYS_ID_MMFR2_EL1: return read_cpuid(SYS_ID_MMFR2_EL1); + case SYS_ID_MMFR3_EL1: return read_cpuid(SYS_ID_MMFR3_EL1); + case SYS_ID_ISAR0_EL1: return read_cpuid(SYS_ID_ISAR0_EL1); + case SYS_ID_ISAR1_EL1: return read_cpuid(SYS_ID_ISAR1_EL1); + case SYS_ID_ISAR2_EL1: return read_cpuid(SYS_ID_ISAR2_EL1); + case SYS_ID_ISAR3_EL1: return read_cpuid(SYS_ID_ISAR3_EL1); + case SYS_ID_ISAR4_EL1: return read_cpuid(SYS_ID_ISAR4_EL1); + case SYS_ID_ISAR5_EL1: return read_cpuid(SYS_ID_ISAR4_EL1); + case SYS_MVFR0_EL1: return read_cpuid(SYS_MVFR0_EL1); 
+ case SYS_MVFR1_EL1: return read_cpuid(SYS_MVFR1_EL1); + case SYS_MVFR2_EL1: return read_cpuid(SYS_MVFR2_EL1); + + case SYS_ID_AA64PFR0_EL1: return read_cpuid(SYS_ID_AA64PFR0_EL1); + case SYS_ID_AA64PFR1_EL1: return read_cpuid(SYS_ID_AA64PFR0_EL1); + case SYS_ID_AA64DFR0_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1); + case SYS_ID_AA64DFR1_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1); + case SYS_ID_AA64MMFR0_EL1: return read_cpuid(SYS_ID_AA64MMFR0_EL1); + case SYS_ID_AA64MMFR1_EL1: return read_cpuid(SYS_ID_AA64MMFR1_EL1); + case SYS_ID_AA64ISAR0_EL1: return read_cpuid(SYS_ID_AA64ISAR0_EL1); + case SYS_ID_AA64ISAR1_EL1: return read_cpuid(SYS_ID_AA64ISAR1_EL1); + + case SYS_CNTFRQ_EL0: return read_cpuid(SYS_CNTFRQ_EL0); + case SYS_CTR_EL0: return read_cpuid(SYS_CTR_EL0); + case SYS_DCZID_EL0: return read_cpuid(SYS_DCZID_EL0); default: BUG(); return 0; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 212ae6361d8b..76df22272804 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -201,35 +201,35 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) { info->reg_cntfrq = arch_timer_get_cntfrq(); info->reg_ctr = read_cpuid_cachetype(); - info->reg_dczid = read_cpuid(DCZID_EL0); + info->reg_dczid = read_cpuid(SYS_DCZID_EL0); info->reg_midr = read_cpuid_id(); - info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1); - info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); - info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1); - info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1); - info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); - info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); - info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1); - info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1); - - info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1); - info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1); - info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1); - info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1); - info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1); - info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1); - info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1); - info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1); - info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1); - info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1); - info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1); - info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1); - info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1); - - info->reg_mvfr0 = read_cpuid(MVFR0_EL1); - info->reg_mvfr1 = read_cpuid(MVFR1_EL1); - info->reg_mvfr2 = read_cpuid(MVFR2_EL1); + info->reg_id_aa64dfr0 = read_cpuid(SYS_ID_AA64DFR0_EL1); + info->reg_id_aa64dfr1 = read_cpuid(SYS_ID_AA64DFR1_EL1); + info->reg_id_aa64isar0 = read_cpuid(SYS_ID_AA64ISAR0_EL1); + info->reg_id_aa64isar1 = read_cpuid(SYS_ID_AA64ISAR1_EL1); + info->reg_id_aa64mmfr0 = read_cpuid(SYS_ID_AA64MMFR0_EL1); + info->reg_id_aa64mmfr1 = read_cpuid(SYS_ID_AA64MMFR1_EL1); + info->reg_id_aa64pfr0 = read_cpuid(SYS_ID_AA64PFR0_EL1); + info->reg_id_aa64pfr1 = read_cpuid(SYS_ID_AA64PFR1_EL1); + + info->reg_id_dfr0 = read_cpuid(SYS_ID_DFR0_EL1); + info->reg_id_isar0 = read_cpuid(SYS_ID_ISAR0_EL1); + info->reg_id_isar1 = read_cpuid(SYS_ID_ISAR1_EL1); + info->reg_id_isar2 = read_cpuid(SYS_ID_ISAR2_EL1); + info->reg_id_isar3 = read_cpuid(SYS_ID_ISAR3_EL1); + info->reg_id_isar4 = read_cpuid(SYS_ID_ISAR4_EL1); + info->reg_id_isar5 = read_cpuid(SYS_ID_ISAR5_EL1); + info->reg_id_mmfr0 = read_cpuid(SYS_ID_MMFR0_EL1); + info->reg_id_mmfr1 = read_cpuid(SYS_ID_MMFR1_EL1); + info->reg_id_mmfr2 = read_cpuid(SYS_ID_MMFR2_EL1); + 
info->reg_id_mmfr3 = read_cpuid(SYS_ID_MMFR3_EL1); + info->reg_id_pfr0 = read_cpuid(SYS_ID_PFR0_EL1); + info->reg_id_pfr1 = read_cpuid(SYS_ID_PFR1_EL1); + + info->reg_mvfr0 = read_cpuid(SYS_MVFR0_EL1); + info->reg_mvfr1 = read_cpuid(SYS_MVFR1_EL1); + info->reg_mvfr2 = read_cpuid(SYS_MVFR2_EL1); cpuinfo_detect_icache_policy(info); diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index e87f53ff5f58..7275628ba59f 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -187,7 +187,7 @@ switch_mm_fastpath: static int asids_init(void) { - int fld = cpuid_feature_extract_field(read_cpuid(ID_AA64MMFR0_EL1), 4); + int fld = cpuid_feature_extract_field(read_cpuid(SYS_ID_AA64MMFR0_EL1), 4); switch (fld) { default: -- cgit v1.2.3 From f6c5d808273093c8220da4163c548932d18e8e36 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:47 +0000 Subject: arm64: add ARMv8.2 id_aa64mmfr2 boiler plate ARMv8.2 adds a new feature register id_aa64mmfr2. This patch adds the cpu feature boiler plate used by the actual features in later patches. Signed-off-by: James Morse Reviewed-by: Suzuki K Poulose Signed-off-by: Catalin Marinas (cherry picked from commit 406e308770a92bd33995b2e5b681e86358328bb0) Signed-off-by: Alex Shi --- arch/arm64/include/asm/cpu.h | 1 + arch/arm64/include/asm/sysreg.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 10 ++++++++++ arch/arm64/kernel/cpuinfo.c | 1 + 4 files changed, 16 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index b5e9cee4b5f8..13a6103130cd 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -36,6 +36,7 @@ struct cpuinfo_arm64 { u64 reg_id_aa64isar1; u64 reg_id_aa64mmfr0; u64 reg_id_aa64mmfr1; + u64 reg_id_aa64mmfr2; u64 reg_id_aa64pfr0; u64 reg_id_aa64pfr1; diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 76907c94b11f..4bc8655529df 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -70,6 +70,7 @@ #define SYS_ID_AA64MMFR0_EL1 sys_reg(3, 0, 0, 7, 0) #define SYS_ID_AA64MMFR1_EL1 sys_reg(3, 0, 0, 7, 1) +#define SYS_ID_AA64MMFR2_EL1 sys_reg(3, 0, 0, 7, 2) #define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0) #define SYS_CTR_EL0 sys_reg(3, 3, 0, 0, 1) @@ -135,6 +136,9 @@ #define ID_AA64MMFR1_VMIDBITS_SHIFT 4 #define ID_AA64MMFR1_HADBS_SHIFT 0 +/* id_aa64mmfr2 */ +#define ID_AA64MMFR2_UAO_SHIFT 4 + /* id_aa64dfr0 */ #define ID_AA64DFR0_CTX_CMPS_SHIFT 28 #define ID_AA64DFR0_WRPS_SHIFT 20 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 1ef10e784031..42918c797e8e 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -123,6 +123,11 @@ static struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { ARM64_FTR_END, }; +static struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { + ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_UAO_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static struct arm64_ftr_bits ftr_ctr[] = { U_ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RAO */ ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 3, 0), @@ -284,6 +289,7 @@ static struct arm64_ftr_reg arm64_ftr_regs[] = { /* Op1 = 0, CRn = 0, CRm = 7 */ ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0), ARM64_FTR_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1), + ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2), /* Op1 = 3, CRn = 0, CRm = 0 */ ARM64_FTR_REG(SYS_CTR_EL0, ftr_ctr), @@ -408,6 +414,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) 
init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1); init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0); init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1); + init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2); init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0); init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1); init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0); @@ -517,6 +524,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64mmfr0, boot->reg_id_aa64mmfr0); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR1_EL1, cpu, info->reg_id_aa64mmfr1, boot->reg_id_aa64mmfr1); + taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu, + info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); /* * EL3 is not our concern. @@ -831,6 +840,7 @@ static u64 __raw_read_system_reg(u32 sys_id) case SYS_ID_AA64DFR1_EL1: return read_cpuid(SYS_ID_AA64DFR0_EL1); case SYS_ID_AA64MMFR0_EL1: return read_cpuid(SYS_ID_AA64MMFR0_EL1); case SYS_ID_AA64MMFR1_EL1: return read_cpuid(SYS_ID_AA64MMFR1_EL1); + case SYS_ID_AA64MMFR2_EL1: return read_cpuid(SYS_ID_AA64MMFR2_EL1); case SYS_ID_AA64ISAR0_EL1: return read_cpuid(SYS_ID_AA64ISAR0_EL1); case SYS_ID_AA64ISAR1_EL1: return read_cpuid(SYS_ID_AA64ISAR1_EL1); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 76df22272804..966fbd52550b 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -210,6 +210,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_id_aa64isar1 = read_cpuid(SYS_ID_AA64ISAR1_EL1); info->reg_id_aa64mmfr0 = read_cpuid(SYS_ID_AA64MMFR0_EL1); info->reg_id_aa64mmfr1 = read_cpuid(SYS_ID_AA64MMFR1_EL1); + info->reg_id_aa64mmfr2 = read_cpuid(SYS_ID_AA64MMFR2_EL1); info->reg_id_aa64pfr0 = read_cpuid(SYS_ID_AA64PFR0_EL1); info->reg_id_aa64pfr1 = read_cpuid(SYS_ID_AA64PFR1_EL1); -- cgit v1.2.3 From 13e05550e107e46ef982e5c4347e4986aeeee7ec Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:48 +0000 Subject: arm64: kernel: Add support for User Access Override 'User Access Override' is a new ARMv8.2 feature which allows the unprivileged load and store instructions to be overridden to behave in the normal way. This patch converts {get,put}_user() and friends to use ldtr*/sttr* instructions - so that they can only access EL0 memory, then enables UAO when fs==KERNEL_DS so that these functions can access kernel memory. 
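As an illustration (this snippet is not part of the patch; kdst, ksrc and len are placeholder names), the usual KERNEL_DS pattern keeps working unchanged, except that on a UAO-capable cpu set_fs() now also flips PSTATE.UAO so the patched-in ldtr*/sttr* instructions gain kernel permissions:

	mm_segment_t old_fs = get_fs();
	unsigned long ret;

	set_fs(KERNEL_DS);	/* also sets PSTATE.UAO on UAO cpus */
	ret = __copy_to_user((void __user *)kdst, ksrc, len);
				/* sttr*, but privileged thanks to UAO */
	set_fs(old_fs);		/* PSTATE.UAO is cleared again */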
This allows user space's read/write permissions to be checked against the page tables, instead of testing addr<limit, then using the kernel's read/write permissions. [catalin.marinas@arm.com: move uao_thread_switch() above dsb()] Signed-off-by: Catalin Marinas (cherry picked from commit 57f4959bad0a154aeca125b7d38d1d9471a12422) Signed-off-by: Alex Shi --- arch/arm64/Kconfig | 21 +++++++++++ arch/arm64/include/asm/alternative.h | 72 ++++++++++++++++++++++++++++++++++++ arch/arm64/include/asm/cpufeature.h | 3 +- arch/arm64/include/asm/processor.h | 1 + arch/arm64/include/asm/sysreg.h | 3 ++ arch/arm64/include/asm/thread_info.h | 6 +++ arch/arm64/include/asm/uaccess.h | 44 ++++++++++++++++------ arch/arm64/include/uapi/asm/ptrace.h | 1 + arch/arm64/kernel/cpufeature.c | 11 ++++++ arch/arm64/kernel/process.c | 19 ++++++++++ arch/arm64/lib/clear_user.S | 8 ++-- arch/arm64/lib/copy_from_user.S | 8 ++-- arch/arm64/lib/copy_in_user.S | 16 ++++---- arch/arm64/lib/copy_to_user.S | 8 ++-- arch/arm64/mm/fault.c | 31 +++++++++++++--- 15 files changed, 213 insertions(+), 39 deletions(-) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1420102341d0..4df85b5a2045 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -706,6 +706,27 @@ config ARM64_LSE_ATOMICS endmenu +config ARM64_UAO + bool "Enable support for User Access Override (UAO)" + default y + help + User Access Override (UAO; part of the ARMv8.2 Extensions) + causes the 'unprivileged' variant of the load/store instructions to + be overriden to be privileged. + + This option changes get_user() and friends to use the 'unprivileged' + variant of the load/store instructions. This ensures that user-space + really did have access to the supplied memory. When addr_limit is + set to kernel memory the UAO bit will be set, allowing privileged + access to kernel memory. + + Choosing this option will cause copy_to_user() et al to use user-space + memory permissions. + + The feature is detected at runtime, the kernel will use the + regular load/store instructions if the cpu does not implement the + feature. + endmenu menu "Boot options" diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index e4962f04201e..a9fc24ec1aa9 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -1,6 +1,8 @@ #ifndef __ASM_ALTERNATIVE_H #define __ASM_ALTERNATIVE_H +#include + #ifndef __ASSEMBLY__ #include @@ -63,6 +65,8 @@ void apply_alternatives(void *start, size_t length); #else +#include + .macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len .word \orig_offset - . .word \alt_offset - . @@ -136,6 +140,74 @@ void apply_alternatives(void *start, size_t length); alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) +/* + * Generate the assembly for UAO alternatives with exception table entries. + * This is complicated as there is no post-increment or pair versions of the + * unprivileged instructions, and USER() only works for single instructions.
+ */ +#ifdef CONFIG_ARM64_UAO + .macro uao_ldp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: ldp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + ldtr \reg1, [\addr]; + ldtr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + .section __ex_table,"a"; + .align 3; + .quad 8888b,\l; + .quad 8889b,\l; + .previous; + .endm + + .macro uao_stp l, reg1, reg2, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: stp \reg1, \reg2, [\addr], \post_inc; +8889: nop; + nop; + alternative_else + sttr \reg1, [\addr]; + sttr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + alternative_endif + + .section __ex_table,"a"; + .align 3; + .quad 8888b,\l; + .quad 8889b,\l; + .previous + .endm + + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + alternative_if_not ARM64_HAS_UAO +8888: \inst \reg, [\addr], \post_inc; + nop; + alternative_else + \alt_inst \reg, [\addr]; + add \addr, \addr, \post_inc; + alternative_endif + + .section __ex_table,"a"; + .align 3; + .quad 8888b,\l; + .previous + .endm +#else + .macro uao_ldp l, reg1, reg2, addr, post_inc + USER(\l, ldp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_stp l, reg1, reg2, addr, post_inc + USER(\l, stp \reg1, \reg2, [\addr], \post_inc) + .endm + .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc + USER(\l, \inst \reg, [\addr], \post_inc) + .endm +#endif + #endif /* __ASSEMBLY__ */ /* diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8131abfabb0a..a5df7cde616b 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -31,8 +31,9 @@ #define ARM64_WORKAROUND_CAVIUM_23154 6 #define ARM64_WORKAROUND_834220 7 #define ARM64_HAS_NO_HW_PREFETCH 8 +#define ARM64_HAS_UAO 9 -#define ARM64_NCAPS 9 +#define ARM64_NCAPS 10 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 5bb1d763d17a..cef1cf398356 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -191,5 +191,6 @@ static inline void spin_lock_prefetch(const void *ptr) #endif void cpu_enable_pan(void *__unused); +void cpu_enable_uao(void *__unused); #endif /* __ASM_PROCESSOR_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 4bc8655529df..b9fd8ec79033 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -77,9 +77,12 @@ #define SYS_DCZID_EL0 sys_reg(3, 3, 0, 0, 7) #define REG_PSTATE_PAN_IMM sys_reg(0, 0, 4, 0, 4) +#define REG_PSTATE_UAO_IMM sys_reg(0, 0, 4, 0, 3) #define SET_PSTATE_PAN(x) __inst_arm(0xd5000000 | REG_PSTATE_PAN_IMM |\ (!!x)<<8 | 0x1f) +#define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\ + (!!x)<<8 | 0x1f) /* SCTLR_EL1 */ #define SCTLR_EL1_CP15BEN (0x1 << 5) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index abd64bd1f6d9..eba8db6838af 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -85,6 +85,12 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)sp_el0; } +/* Access struct thread_info of another thread */ +static inline struct thread_info *get_thread_info(unsigned long thread_stack) +{ + return (struct thread_info *)(thread_stack & ~(THREAD_SIZE - 1)); +} + #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_sp(tsk) \ diff --git 
a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index b2ede967fe7d..f973bdce8410 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -64,6 +64,16 @@ extern int fixup_exception(struct pt_regs *regs); static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; + + /* + * Enable/disable UAO so that copy_to_user() etc can access + * kernel memory with the unprivileged instructions. + */ + if (IS_ENABLED(CONFIG_ARM64_UAO) && fs == KERNEL_DS) + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); + else + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, + CONFIG_ARM64_UAO)); } #define segment_eq(a, b) ((a) == (b)) @@ -113,9 +123,10 @@ static inline void set_fs(mm_segment_t fs) * The "__xxx_error" versions set the third argument to -EFAULT if an error * occurs, and leave it unchanged on success. */ -#define __get_user_asm(instr, reg, x, addr, err) \ +#define __get_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ asm volatile( \ - "1: " instr " " reg "1, [%2]\n" \ + "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ + alt_instr " " reg "1, [%2]\n", feature) \ "2:\n" \ " .section .fixup, \"ax\"\n" \ " .align 2\n" \ @@ -138,16 +149,20 @@ do { \ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_user_asm("ldrb", "%w", __gu_val, (ptr), (err)); \ + __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 2: \ - __get_user_asm("ldrh", "%w", __gu_val, (ptr), (err)); \ + __get_user_asm("ldrh", "ldtrh", "%w", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 4: \ - __get_user_asm("ldr", "%w", __gu_val, (ptr), (err)); \ + __get_user_asm("ldr", "ldtr", "%w", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 8: \ - __get_user_asm("ldr", "%", __gu_val, (ptr), (err)); \ + __get_user_asm("ldr", "ldtr", "%", __gu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ default: \ BUILD_BUG(); \ @@ -181,9 +196,10 @@ do { \ ((x) = 0, -EFAULT); \ }) -#define __put_user_asm(instr, reg, x, addr, err) \ +#define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ asm volatile( \ - "1: " instr " " reg "1, [%2]\n" \ + "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ + alt_instr " " reg "1, [%2]\n", feature) \ "2:\n" \ " .section .fixup,\"ax\"\n" \ " .align 2\n" \ @@ -205,16 +221,20 @@ do { \ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ - __put_user_asm("strb", "%w", __pu_val, (ptr), (err)); \ + __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 2: \ - __put_user_asm("strh", "%w", __pu_val, (ptr), (err)); \ + __put_user_asm("strh", "sttrh", "%w", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 4: \ - __put_user_asm("str", "%w", __pu_val, (ptr), (err)); \ + __put_user_asm("str", "sttr", "%w", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ case 8: \ - __put_user_asm("str", "%", __pu_val, (ptr), (err)); \ + __put_user_asm("str", "sttr", "%", __pu_val, (ptr), \ + (err), ARM64_HAS_UAO); \ break; \ default: \ BUILD_BUG(); \ diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 208db3df135a..b5c3933ed441 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -45,6 +45,7 @@ #define PSR_A_BIT 0x00000100 #define PSR_D_BIT 0x00000200 #define PSR_PAN_BIT 0x00400000 +#define PSR_UAO_BIT 0x00800000 #define PSR_Q_BIT 0x08000000 #define PSR_V_BIT 0x10000000 #define PSR_C_BIT 0x20000000 diff --git 
a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 42918c797e8e..ae22edf9d3c9 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -677,6 +677,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_NO_HW_PREFETCH, .matches = has_no_hw_prefetch, }, +#ifdef CONFIG_ARM64_UAO + { + .desc = "User Access Override", + .capability = ARM64_HAS_UAO, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .field_pos = ID_AA64MMFR2_UAO_SHIFT, + .min_field_value = 1, + .enable = cpu_enable_uao, + }, +#endif /* CONFIG_ARM64_UAO */ {}, }; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 88d742ba19d5..c1ca4ea065d4 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -46,6 +46,7 @@ #include #include +#include #include #include #include @@ -280,6 +281,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, } else { memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h; + if (IS_ENABLED(CONFIG_ARM64_UAO) && + cpus_have_cap(ARM64_HAS_UAO)) + childregs->pstate |= PSR_UAO_BIT; p->thread.cpu_context.x19 = stack_start; p->thread.cpu_context.x20 = stk_sz; } @@ -308,6 +312,20 @@ static void tls_thread_switch(struct task_struct *next) : : "r" (tpidr), "r" (tpidrro)); } +/* Restore the UAO state depending on next's addr_limit */ +static void uao_thread_switch(struct task_struct *next) +{ + unsigned long next_sp = next->thread.cpu_context.sp; + + if (IS_ENABLED(CONFIG_ARM64_UAO) && + get_thread_info(next_sp)->addr_limit == KERNEL_DS) + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO, + CONFIG_ARM64_UAO)); + else + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, + CONFIG_ARM64_UAO)); +} + /* * Thread switching. 
*/ @@ -320,6 +338,7 @@ struct task_struct *__switch_to(struct task_struct *prev, tls_thread_switch(next); hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); + uao_thread_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index a9723c71c52b..3f950b677c07 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -39,20 +39,20 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ subs x1, x1, #8 b.mi 2f 1: -USER(9f, str xzr, [x0], #8 ) +uao_user_alternative 9f, str, sttr, xzr, x0, 8 subs x1, x1, #8 b.pl 1b 2: adds x1, x1, #4 b.mi 3f -USER(9f, str wzr, [x0], #4 ) +uao_user_alternative 9f, str, sttr, wzr, x0, 4 sub x1, x1, #4 3: adds x1, x1, #2 b.mi 4f -USER(9f, strh wzr, [x0], #2 ) +uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 sub x1, x1, #2 4: adds x1, x1, #1 b.mi 5f -USER(9f, strb wzr, [x0] ) +uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 4699cd74f87e..1d982d64f1a7 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -34,7 +34,7 @@ */ .macro ldrb1 ptr, regB, val - USER(9998f, ldrb \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val .endm .macro strb1 ptr, regB, val @@ -42,7 +42,7 @@ .endm .macro ldrh1 ptr, regB, val - USER(9998f, ldrh \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val .endm .macro strh1 ptr, regB, val @@ -50,7 +50,7 @@ .endm .macro ldr1 ptr, regB, val - USER(9998f, ldr \ptr, [\regB], \val) + uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val .endm .macro str1 ptr, regB, val @@ -58,7 +58,7 @@ .endm .macro ldp1 ptr, regB, regC, val - USER(9998f, ldp \ptr, \regB, [\regC], \val) + uao_ldp 9998f, \ptr, \regB, \regC, \val .endm .macro stp1 ptr, regB, regC, val diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 81c8fc93c100..feaad1520dc1 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -35,35 +35,35 @@ * x0 - bytes not copied */ .macro ldrb1 ptr, regB, val - USER(9998f, ldrb \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val .endm .macro strb1 ptr, regB, val - USER(9998f, strb \ptr, [\regB], \val) + uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val .endm .macro ldrh1 ptr, regB, val - USER(9998f, ldrh \ptr, [\regB], \val) + uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val .endm .macro strh1 ptr, regB, val - USER(9998f, strh \ptr, [\regB], \val) + uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val .endm .macro ldr1 ptr, regB, val - USER(9998f, ldr \ptr, [\regB], \val) + uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val .endm .macro str1 ptr, regB, val - USER(9998f, str \ptr, [\regB], \val) + uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val .endm .macro ldp1 ptr, regB, regC, val - USER(9998f, ldp \ptr, \regB, [\regC], \val) + uao_ldp 9998f, \ptr, \regB, \regC, \val .endm .macro stp1 ptr, regB, regC, val - USER(9998f, stp \ptr, \regB, [\regC], \val) + uao_stp 9998f, \ptr, \regB, \regC, \val .endm end .req x5 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 7512bbbc07ac..2dae2cd2c481 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -37,7 +37,7 @@ 
.endm .macro strb1 ptr, regB, val - USER(9998f, strb \ptr, [\regB], \val) + uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val .endm .macro ldrh1 ptr, regB, val @@ -45,7 +45,7 @@ .endm .macro strh1 ptr, regB, val - USER(9998f, strh \ptr, [\regB], \val) + uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val .endm .macro ldr1 ptr, regB, val @@ -53,7 +53,7 @@ .endm .macro str1 ptr, regB, val - USER(9998f, str \ptr, [\regB], \val) + uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val .endm .macro ldp1 ptr, regB, regC, val @@ -61,7 +61,7 @@ .endm .macro stp1 ptr, regB, regC, val - USER(9998f, stp \ptr, \regB, [\regC], \val) + uao_stp 9998f, \ptr, \regB, \regC, \val .endm end .req x5 diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 92ddac1e8ca2..820d47353cf0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -192,6 +192,14 @@ out: return fault; } +static inline int permission_fault(unsigned int esr) +{ + unsigned int ec = (esr & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT; + unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; + + return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM); +} + static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -225,12 +233,10 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - /* - * PAN bit set implies the fault happened in kernel space, but not - * in the arch's user access functions. - */ - if (IS_ENABLED(CONFIG_ARM64_PAN) && (regs->pstate & PSR_PAN_BIT)) - goto no_context; + if (permission_fault(esr) && (addr < USER_DS)) { + if (!search_exception_tables(regs->pc)) + panic("Accessing user space memory outside uaccess.h routines"); + } /* * As per x86, we may deadlock here. However, since the kernel only @@ -561,3 +567,16 @@ void cpu_enable_pan(void *__unused) config_sctlr_el1(SCTLR_EL1_SPAN, 0); } #endif /* CONFIG_ARM64_PAN */ + +#ifdef CONFIG_ARM64_UAO +/* + * Kernel threads have fs=KERNEL_DS by default, and don't need to call + * set_fs(), devtmpfs in particular relies on this behaviour. + * We need to enable the feature at runtime (instead of adding it to + * PSR_MODE_EL1h) as the feature may not be implemented by the cpu. + */ +void cpu_enable_uao(void *__unused) +{ + asm(SET_PSTATE_UAO(1)); +} +#endif /* CONFIG_ARM64_UAO */ -- cgit v1.2.3 From 31b28ec1f1051bf82515fe38d0ae1884f40783cf Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:49 +0000 Subject: arm64: cpufeature: Test 'matches' pointer to find the end of the list CPU feature code uses the desc field as a test to find the end of the list, this means every entry must have a description. This generates noise for entries in the list that aren't really features, but combinations of them. e.g. > CPU features: detected feature: Privileged Access Never > CPU features: detected feature: PAN and not UAO These combination features are needed for corner cases with alternatives, where cpu features interact. Change all walkers of the arm64_features[] and arm64_hwcaps[] lists to test 'matches' not 'desc', and only print 'desc' if it is non-NULL. 
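An entry for such a combination feature can then omit 'desc' entirely; for illustration, the ARM64_ALT_PAN_NOT_UAO entry added later in this series looks like:

	{
		/* no .desc: the capability bit is set, but nothing is printed */
		.capability = ARM64_ALT_PAN_NOT_UAO,
		.matches = cpufeature_pan_not_uao,
	},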
Signed-off-by: James Morse Reviewed-by: Suzuki K Poulose Signed-off-by: Catalin Marinas (cherry picked from commit 644c2ae198412c956700e55a2acf80b2541f6aa5) Signed-off-by: Alex Shi --- arch/arm64/kernel/cpufeature.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index ae22edf9d3c9..9cc8186cd14b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -771,7 +771,7 @@ static void __init setup_cpu_hwcaps(void) int i; const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps; - for (i = 0; hwcaps[i].desc; i++) + for (i = 0; hwcaps[i].matches; i++) if (hwcaps[i].matches(&hwcaps[i])) cap_set_hwcap(&hwcaps[i]); } @@ -781,11 +781,11 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, { int i; - for (i = 0; caps[i].desc; i++) { + for (i = 0; caps[i].matches; i++) { if (!caps[i].matches(&caps[i])) continue; - if (!cpus_have_cap(caps[i].capability)) + if (!cpus_have_cap(caps[i].capability) && caps[i].desc) pr_info("%s %s\n", info, caps[i].desc); cpus_set_cap(caps[i].capability); } @@ -800,7 +800,7 @@ enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) { int i; - for (i = 0; caps[i].desc; i++) + for (i = 0; caps[i].matches; i++) if (caps[i].enable && cpus_have_cap(caps[i].capability)) on_each_cpu(caps[i].enable, NULL, true); } @@ -907,7 +907,7 @@ void verify_local_cpu_capabilities(void) return; caps = arm64_features; - for (i = 0; caps[i].desc; i++) { + for (i = 0; caps[i].matches; i++) { if (!cpus_have_cap(caps[i].capability) || !caps[i].sys_reg) continue; /* @@ -920,7 +920,7 @@ void verify_local_cpu_capabilities(void) caps[i].enable(NULL); } - for (i = 0, caps = arm64_hwcaps; caps[i].desc; i++) { + for (i = 0, caps = arm64_hwcaps; caps[i].matches; i++) { if (!cpus_have_hwcap(&caps[i])) continue; if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i])) -- cgit v1.2.3 From cdfec5aaf4a886521b7f54dfe2db61735558d546 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 5 Feb 2016 14:58:50 +0000 Subject: arm64: kernel: Don't toggle PAN on systems with UAO If a CPU supports both Privileged Access Never (PAN) and User Access Override (UAO), we don't need to disable/re-enable PAN round all copy_to_user() like calls. UAO alternatives cause these calls to use the 'unprivileged' load/store instructions, which are overridden to be the privileged kind when fs==KERNEL_DS. This patch changes the copy_to_user() calls to have their PAN toggling depend on a new composite 'feature' ARM64_ALT_PAN_NOT_UAO. If both features are detected, PAN will be enabled, but the copy_to_user() alternatives will not be applied. This means PAN will be enabled all the time for these functions. If only PAN is detected, the toggling will be enabled as normal. This will save the time taken to disable/re-enable PAN, and allow us to catch copy_to_user() accesses that occur with fs==KERNEL_DS. Futex and swp-emulation code continue to hang their PAN toggling code on ARM64_HAS_PAN.
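The composite 'feature' is simply the conjunction of the two detected capabilities; the helper added by the hunks below is essentially:

	static bool cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry)
	{
		/* only toggle PAN around uaccess when UAO cannot take over */
		return cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO);
	}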
Signed-off-by: James Morse Signed-off-by: Catalin Marinas (cherry picked from commit 705441960033e66b63524521f153fbb28c99ddbd) Signed-off-by: Alex Shi --- arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/include/asm/uaccess.h | 8 ++++---- arch/arm64/kernel/cpufeature.c | 16 ++++++++++++++++ arch/arm64/lib/clear_user.S | 4 ++-- arch/arm64/lib/copy_from_user.S | 4 ++-- arch/arm64/lib/copy_in_user.S | 4 ++-- arch/arm64/lib/copy_to_user.S | 4 ++-- arch/arm64/mm/fault.c | 3 +++ 8 files changed, 33 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index a5df7cde616b..37a53fc6b384 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -32,8 +32,9 @@ #define ARM64_WORKAROUND_834220 7 #define ARM64_HAS_NO_HW_PREFETCH 8 #define ARM64_HAS_UAO 9 +#define ARM64_ALT_PAN_NOT_UAO 10 -#define ARM64_NCAPS 10 +#define ARM64_NCAPS 11 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index f973bdce8410..16ba0d5c9740 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -145,7 +145,7 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -168,7 +168,7 @@ do { \ BUILD_BUG(); \ } \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ } while (0) @@ -217,7 +217,7 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -239,7 +239,7 @@ do { \ default: \ BUILD_BUG(); \ } \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ CONFIG_ARM64_PAN)); \ } while (0) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9cc8186cd14b..7566cad9fa1d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -67,6 +67,10 @@ DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); .width = 0, \ } +/* meta feature for alternatives */ +static bool __maybe_unused +cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry); + static struct arm64_ftr_bits ftr_id_aa64isar0[] = { ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0), ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_RDM_SHIFT, 4, 0), @@ -688,6 +692,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .enable = cpu_enable_uao, }, #endif /* CONFIG_ARM64_UAO */ +#ifdef CONFIG_ARM64_PAN + { + .capability = ARM64_ALT_PAN_NOT_UAO, + .matches = cpufeature_pan_not_uao, + }, +#endif /* CONFIG_ARM64_PAN */ {}, }; @@ -966,3 +976,9 @@ void __init setup_cpu_features(void) pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n", L1_CACHE_BYTES, cls); } + +static bool __maybe_unused +cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry) +{ + return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO)); +} diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 3f950b677c07..5d1cad3ce6d6 
100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -33,7 +33,7 @@ * Alignment fixed up by hardware. */ ENTRY(__clear_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x2, x1 // save the size for fixup return subs x1, x1, #8 @@ -54,7 +54,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 1d982d64f1a7..17e8306dca29 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -67,11 +67,11 @@ end .req x5 ENTRY(__copy_from_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 // Nothing to copy ret diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index feaad1520dc1..f7292dd08c84 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -68,11 +68,11 @@ end .req x5 ENTRY(__copy_in_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 ret diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 2dae2cd2c481..21faae60f988 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -66,11 +66,11 @@ end .req x5 ENTRY(__copy_to_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ +ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 ret diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 820d47353cf0..d0762a729d01 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -234,6 +234,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (permission_fault(esr) && (addr < USER_DS)) { + if (get_thread_info(regs->sp)->addr_limit == KERNEL_DS) + panic("Accessing user space memory with fs=KERNEL_DS"); + if (!search_exception_tables(regs->pc)) panic("Accessing user space memory outside uaccess.h routines"); } -- cgit v1.2.3 From 9193df45aa67b769ee8af97609537a876a81baca Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 18 Feb 2016 15:50:04 +0000 Subject: arm64: Remove the get_thread_info() function This function was introduced by previous commits implementing UAO. 
However, it can be replaced with task_thread_info() in uao_thread_switch() or get_fs() in do_page_fault() (the latter being called only on the current context, so no need for using the saved pt_regs). Signed-off-by: Catalin Marinas (cherry picked from commit e950631e84e7e38892ffbeee5e1816b270026b0e) Signed-off-by: Alex Shi --- arch/arm64/include/asm/thread_info.h | 6 ------ arch/arm64/kernel/process.c | 15 ++++++--------- arch/arm64/mm/fault.c | 2 +- 3 files changed, 7 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index eba8db6838af..abd64bd1f6d9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -85,12 +85,6 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)sp_el0; } -/* Access struct thread_info of another thread */ -static inline struct thread_info *get_thread_info(unsigned long thread_stack) -{ - return (struct thread_info *)(thread_stack & ~(THREAD_SIZE - 1)); -} - #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_sp(tsk) \ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index c1ca4ea065d4..80624829db61 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -315,15 +315,12 @@ static void tls_thread_switch(struct task_struct *next) /* Restore the UAO state depending on next's addr_limit */ static void uao_thread_switch(struct task_struct *next) { - unsigned long next_sp = next->thread.cpu_context.sp; - - if (IS_ENABLED(CONFIG_ARM64_UAO) && - get_thread_info(next_sp)->addr_limit == KERNEL_DS) - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO, - CONFIG_ARM64_UAO)); - else - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, - CONFIG_ARM64_UAO)); + if (IS_ENABLED(CONFIG_ARM64_UAO)) { + if (task_thread_info(next)->addr_limit == KERNEL_DS) + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); + else + asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO)); + } } /* diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index d0762a729d01..a8eafeceb08a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -234,7 +234,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (permission_fault(esr) && (addr < USER_DS)) { - if (get_thread_info(regs->sp)->addr_limit == KERNEL_DS) + if (get_fs() == KERNEL_DS) panic("Accessing user space memory with fs=KERNEL_DS"); if (!search_exception_tables(regs->pc)) -- cgit v1.2.3 From 2894f328e3bbb808606c82fa1e2ea300089df728 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:34 +0100 Subject: arm64: prevent potential circular header dependencies in asm/bug.h Currently, using BUG_ON() in header files is cumbersome, due to the fact that asm/bug.h transitively includes a lot of other header files, resulting in the actual BUG_ON() invocation appearing before its definition in the preprocessor input. So let's reverse the #include dependency between asm/bug.h and asm/debug-monitors.h, by moving the definition of BUG_BRK_IMM from the latter to the former. Also fix up one user of asm/debug-monitors.h which relied on a transitive include. 
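As a hypothetical example (not from this patch), a header-only helper can now use BUG_ON() without include-order problems, since asm/bug.h no longer drags in asm/debug-monitors.h and everything behind it:

	#include <asm/bug.h>

	static inline void assert_page_aligned(unsigned long addr)
	{
		BUG_ON(addr & ~PAGE_MASK);	/* definition now precedes use */
	}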
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 03336b1df9929e5d9c28fd9768948b6151cb046c) Signed-off-by: Alex Shi Conflicts: skip arch/arm64/kvm/hyp/debug-sr.c --- arch/arm64/include/asm/bug.h | 2 +- arch/arm64/include/asm/debug-monitors.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 4a748ce9ba1a..679d49221998 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -18,7 +18,7 @@ #ifndef _ARCH_ARM64_ASM_BUG_H #define _ARCH_ARM64_ASM_BUG_H -#include +#define BUG_BRK_IMM 0x800 #ifdef CONFIG_GENERIC_BUG #define HAVE_ARCH_BUG diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 279c85b5ec09..e893a1fca9c2 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -57,7 +58,6 @@ #define FAULT_BRK_IMM 0x100 #define KGDB_DYN_DBG_BRK_IMM 0x400 #define KGDB_COMPILED_DBG_BRK_IMM 0x401 -#define BUG_BRK_IMM 0x800 /* * BRK instruction encoding -- cgit v1.2.3 From 37cbc7db8e4fa9b66e15cf8661383a6b51c9a3e7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:35 +0100 Subject: arm64: add support for ioremap() block mappings This wires up the existing generic huge-vmap feature, which allows ioremap() to use PMD or PUD sized block mappings. It also adds support to the unmap path for dealing with block mappings, which will allow us to unmap the __init region using unmap_kernel_range() in a subsequent patch. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 324420bf91f60582bb481133db9547111768ef17) Signed-off-by: Alex Shi --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/memory.h | 6 ++++++ arch/arm64/mm/mmu.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4df85b5a2045..8cd8d06ece4a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -49,6 +49,7 @@ config ARM64 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE + select HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KGDB diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 853953cd1f08..c65aad7b13dc 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -100,6 +100,12 @@ #define MT_S2_NORMAL 0xf #define MT_S2_DEVICE_nGnRE 0x1 +#ifdef CONFIG_ARM64_4K_PAGES +#define IOREMAP_MAX_ORDER (PUD_SHIFT) +#else +#define IOREMAP_MAX_ORDER (PMD_SHIFT) +#endif + #ifndef __ASSEMBLY__ extern phys_addr_t memstart_addr; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d1fa678355c9..b4afa9fbb00f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -745,3 +745,44 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) return dt_virt; } + +int __init arch_ioremap_pud_supported(void) +{ + /* only 4k granule supports level 1 block mappings */ + return IS_ENABLED(CONFIG_ARM64_4K_PAGES); +} + +int __init arch_ioremap_pmd_supported(void) +{ + return 1; +} + +int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot) +{ + BUG_ON(phys & ~PUD_MASK); + set_pud(pud, __pud(phys | PUD_TYPE_SECT | pgprot_val(mk_sect_prot(prot)))); + return 1; +} + +int 
pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot) +{ + BUG_ON(phys & ~PMD_MASK); + set_pmd(pmd, __pmd(phys | PMD_TYPE_SECT | pgprot_val(mk_sect_prot(prot)))); + return 1; +} + +int pud_clear_huge(pud_t *pud) +{ + if (!pud_sect(*pud)) + return 0; + pud_clear(pud); + return 1; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + if (!pmd_sect(*pmd)) + return 0; + pmd_clear(pmd); + return 1; +} -- cgit v1.2.3 From 1dd59fe47656335cd3c913e378718eb49b7b1b38 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:36 +0100 Subject: arm64: introduce KIMAGE_VADDR as the virtual base of the kernel region This introduces the preprocessor symbol KIMAGE_VADDR which will serve as the symbolic virtual base of the kernel region, i.e., the kernel's virtual offset will be KIMAGE_VADDR + TEXT_OFFSET. For now, we define it as being equal to PAGE_OFFSET, but in the future, it will be moved below it once we move the kernel virtual mapping out of the linear mapping. Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit ab893fb9f1b17f02139bce547bb4b69e96b9ae16) Signed-off-by: Alex Shi --- arch/arm64/include/asm/memory.h | 10 ++++++++-- arch/arm64/kernel/head.S | 2 +- arch/arm64/kernel/vmlinux.lds.S | 4 ++-- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index c65aad7b13dc..aebc739f5a11 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -51,7 +51,8 @@ #define VA_BITS (CONFIG_ARM64_VA_BITS) #define VA_START (UL(0xffffffffffffffff) << VA_BITS) #define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) -#define MODULES_END (PAGE_OFFSET) +#define KIMAGE_VADDR (PAGE_OFFSET) +#define MODULES_END (KIMAGE_VADDR) #define MODULES_VADDR (MODULES_END - SZ_64M) #define PCI_IO_END (MODULES_VADDR - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) @@ -75,8 +76,13 @@ * private definitions which should NOT be used outside memory.h * files. Use virt_to_phys/phys_to_virt/__pa/__va instead. */ -#define __virt_to_phys(x) (((phys_addr_t)(x) - PAGE_OFFSET + PHYS_OFFSET)) +#define __virt_to_phys(x) ({ \ + phys_addr_t __x = (phys_addr_t)(x); \ + __x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) : \ + (__x - KIMAGE_VADDR + PHYS_OFFSET); }) + #define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) +#define __phys_to_kimg(x) ((unsigned long)((x) - PHYS_OFFSET + KIMAGE_VADDR)) /* * Convert a page to/from a physical address diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 53b9f9f128c2..04d38a058b19 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -389,7 +389,7 @@ __create_page_tables: * Map the kernel image (starting with PHYS_OFFSET). */ mov x0, x26 // swapper_pg_dir - mov x5, #PAGE_OFFSET + ldr x5, =KIMAGE_VADDR create_pgd_entry x0, x5, x3, x6 ldr x6, =KERNEL_END // __va(KERNEL_END) mov x3, x24 // phys offset diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index b78a3c772294..282e3e64a17e 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -89,7 +89,7 @@ SECTIONS *(.discard.*) } - . = PAGE_OFFSET + TEXT_OFFSET; + . = KIMAGE_VADDR + TEXT_OFFSET; .head.text : { _text = .; @@ -186,4 +186,4 @@ ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, /* * If padding is applied before .head.text, virt<->phys conversions will fail. 
*/ -ASSERT(_text == (PAGE_OFFSET + TEXT_OFFSET), "HEAD is misaligned") +ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned") -- cgit v1.2.3 From 4545faf8e5b81592ba597141d432ca7e2b52a43e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:37 +0100 Subject: arm64: pgtable: implement static [pte|pmd|pud]_offset variants The page table accessors pte_offset(), pud_offset() and pmd_offset() rely on __va translations, so they can only be used after the linear mapping has been installed. For the early fixmap and kasan init routines, whose page tables are allocated statically in the kernel image, these functions will return bogus values. So implement pte_offset_kimg(), pmd_offset_kimg() and pud_offset_kimg(), which can be used instead before any page tables have been allocated dynamically. Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 6533945a32c762c5db70d7a3ec251a040b2d9661) Signed-off-by: Alex Shi --- arch/arm64/include/asm/pgtable.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c99dfc588deb..9a560b368910 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -455,6 +455,9 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) +/* use ONLY for statically allocated translation tables */ +#define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr)))) + /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. @@ -498,6 +501,9 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pud_page(pud) pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK)) +/* use ONLY for statically allocated translation tables */ +#define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr)))) + #else #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) @@ -507,6 +513,8 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) #define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp) #define pmd_clear_fixmap() +#define pmd_offset_kimg(dir,addr) ((pmd_t *)dir) + #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -545,6 +553,9 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK)) +/* use ONLY for statically allocated translation tables */ +#define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) + #else #define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) @@ -554,6 +565,8 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) #define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp) #define pud_clear_fixmap() +#define pud_offset_kimg(dir,addr) ((pud_t *)dir) + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) -- cgit v1.2.3 From ade984b5fcdda640ca25a1606f9cdaf279e8b4c7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:38 +0100 Subject: arm64: decouple early fixmap init from linear mapping Since the early fixmap page tables are populated using pages that are part of the static footprint of the kernel, they are covered by the initial kernel mapping, and we can refer to them without using __va/__pa translations, which are tied to the linear mapping. 
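Concretely, the fixmap walk can use the _kimg accessors introduced by the previous patch, which translate through the kernel image mapping rather than through __va() (sketch of the pattern used in the hunk below):

	pud = pud_offset_kimg(pgd, addr);	/* bm_pud lives in the image, not the linear map */
	pmd = pmd_offset_kimg(pud, addr);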
Since the fixmap page tables are disjoint from the kernel mapping up to the top level pgd entry, we can refer to bm_pte[] directly, and there is no need to walk the page tables and perform __pa()/__va() translations at each step. Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 157962f5a8f236cab898b68bdaa69ce68922f0bf) Signed-off-by: Alex Shi --- arch/arm64/mm/mmu.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b4afa9fbb00f..0f58a45df1f3 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -614,7 +614,7 @@ static inline pud_t * fixmap_pud(unsigned long addr) BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd)); - return pud_offset(pgd, addr); + return pud_offset_kimg(pgd, addr); } static inline pmd_t * fixmap_pmd(unsigned long addr) @@ -623,16 +623,12 @@ static inline pmd_t * fixmap_pmd(unsigned long addr) BUG_ON(pud_none(*pud) || pud_bad(*pud)); - return pmd_offset(pud, addr); + return pmd_offset_kimg(pud, addr); } static inline pte_t * fixmap_pte(unsigned long addr) { - pmd_t *pmd = fixmap_pmd(addr); - - BUG_ON(pmd_none(*pmd) || pmd_bad(*pmd)); - - return pte_offset_kernel(pmd, addr); + return &bm_pte[pte_index(addr)]; } void __init early_fixmap_init(void) @@ -644,14 +640,14 @@ void __init early_fixmap_init(void) pgd = pgd_offset_k(addr); pgd_populate(&init_mm, pgd, bm_pud); - pud = pud_offset(pgd, addr); + pud = fixmap_pud(addr); pud_populate(&init_mm, pud, bm_pmd); - pmd = pmd_offset(pud, addr); + pmd = fixmap_pmd(addr); pmd_populate_kernel(&init_mm, pmd, bm_pte); /* * The boot-ioremap range spans multiple pmds, for which - * we are not preparted: + * we are not prepared: */ BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); -- cgit v1.2.3 From 44b9620e6822e2acb0d65507d89cd0658055843c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:39 +0100 Subject: arm64: kvm: deal with kernel symbols outside of linear mapping KVM on arm64 uses a fixed offset between the linear mapping at EL1 and the HYP mapping at EL2. Before we can move the kernel virtual mapping out of the linear mapping, we have to make sure that references to kernel symbols that are accessed via the HYP mapping are translated to their linear equivalent. 
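On arm64 the translation is a fixed offset applied to the symbol's image address, as the hunks below define:

	/* image-to-linear translation for symbols referenced from HYP */
	#define kvm_ksym_ref(sym)	((void *)&sym + kvm_ksym_shift)

	vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);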
Reviewed-by: Mark Rutland Acked-by: Marc Zyngier Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit a0bf9776cd0be4490d4675d4108e13379849fc7f) Signed-off-by: Alex Shi Conflicts: skip new funcs create_hyp_mappings(__start_rodata, in arch/arm/kvm/arm.c and keep funcs in arch/arm64/kvm/hyp.S --- arch/arm/include/asm/kvm_asm.h | 2 ++ arch/arm/kvm/arm.c | 5 +++-- arch/arm64/include/asm/kvm_asm.h | 17 +++++++++++++++++ arch/arm64/include/asm/kvm_host.h | 8 +++++--- arch/arm64/kvm/hyp.S | 6 +++--- 5 files changed, 30 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 194c91b610ff..c35c349da069 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -79,6 +79,8 @@ #define rr_lo_hi(a1, a2) a1, a2 #endif +#define kvm_ksym_ref(kva) (kva) + #ifndef __ASSEMBLY__ struct kvm; struct kvm_vcpu; diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index e06fd299de08..70e6d557c75f 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -969,7 +969,7 @@ static void cpu_init_hyp_mode(void *dummy) pgd_ptr = kvm_mmu_get_httbr(); stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); hyp_stack_ptr = stack_page + PAGE_SIZE; - vector_ptr = (unsigned long)__kvm_hyp_vector; + vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector); __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr); @@ -1061,7 +1061,8 @@ static int init_hyp_mode(void) /* * Map the Hyp-code called directly from the host */ - err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end); + err = create_hyp_mappings(kvm_ksym_ref(__kvm_hyp_code_start), + kvm_ksym_ref(__kvm_hyp_code_end)); if (err) { kvm_err("Cannot map world-switch code\n"); goto out_free_mappings; diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 5e377101f919..e95c39543629 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -102,7 +102,24 @@ #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) +#define kvm_ksym_ref(sym) ((void *)&sym + kvm_ksym_shift) + #ifndef __ASSEMBLY__ +#if __GNUC__ > 4 +#define kvm_ksym_shift (PAGE_OFFSET - KIMAGE_VADDR) +#else +/* + * GCC versions 4.9 and older will fold the constant below into the addend of + * the reference to 'sym' above if kvm_ksym_shift is declared static or if the + * constant is used directly. However, since we use the small code model for + * the core kernel, the reference to 'sym' will be emitted as a adrp/add pair, + * with a +/- 4 GB range, resulting in linker relocation errors if the shift + * is sufficiently large. So prevent the compiler from folding the shift into + * the addend, by making the shift a variable with external linkage. 
+ */ +__weak u64 kvm_ksym_shift = PAGE_OFFSET - KIMAGE_VADDR; +#endif + struct kvm; struct kvm_vcpu; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a35ce7266aac..90c6368ad7c8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -222,7 +222,7 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); -u64 kvm_call_hyp(void *hypfn, ...); +u64 __kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); @@ -243,8 +243,8 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, * Call initialization code, and switch to the full blown * HYP code. */ - kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr, - hyp_stack_ptr, vector_ptr); + __kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr, + hyp_stack_ptr, vector_ptr); } static inline void kvm_arch_hardware_disable(void) {} @@ -258,4 +258,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); +#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 86c289832272..309e3479dc2c 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -923,7 +923,7 @@ __hyp_panic_str: .align 2 /* - * u64 kvm_call_hyp(void *hypfn, ...); + * u64 __kvm_call_hyp(void *hypfn, ...); * * This is not really a variadic function in the classic C-way and care must * be taken when calling this to ensure parameters are passed in registers @@ -940,10 +940,10 @@ __hyp_panic_str: * used to implement __hyp_get_vectors in the same way as in * arch/arm64/kernel/hyp_stub.S. */ -ENTRY(kvm_call_hyp) hvc #0 ret -ENDPROC(kvm_call_hyp) +ENTRY(__kvm_call_hyp) hvc #0 ret +ENDPROC(__kvm_call_hyp) .macro invalid_vector label, target .align 2 -- cgit v1.2.3 From 49d5b2c298815fcaee00779e69a741ad1e80e740 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:40 +0100 Subject: arm64: move kernel image to base of vmalloc area This moves the module area to right before the vmalloc area, and moves the kernel image to the base of the vmalloc area. This is an intermediate step towards implementing KASLR, which allows the kernel image to be located anywhere in the vmalloc area. Since other subsystems such as hibernate may still need to refer to the kernel text or data segments via their linear addresses, both are mapped in the linear region as well. The linear alias of the text region is mapped read-only/non-executable to prevent inadvertent modification or execution.
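The linear alias mentioned above can be expressed as a one-line helper; this is a sketch only (the name lm_alias_sketch is invented here, though upstream later added an equivalent lm_alias() macro), and it assumes __pa() already copes with kernel-image addresses, as implemented later in this series:

	/*
	 * Translate a kernel-image symbol to its (read-only) alias in the
	 * linear region, e.g. so hibernate can keep using linear addresses.
	 */
	#define lm_alias_sketch(x)	__va(__pa(x))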
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit f9040773b7bbbd9e98eb6184a263512a7cfc133f) Signed-off-by: Alex Shi --- arch/arm64/include/asm/kasan.h | 2 +- arch/arm64/include/asm/memory.h | 21 +++++--- arch/arm64/include/asm/pgtable.h | 10 +--- arch/arm64/mm/dump.c | 12 ++--- arch/arm64/mm/init.c | 23 ++++---- arch/arm64/mm/kasan_init.c | 27 ++++++++-- arch/arm64/mm/mmu.c | 110 +++++++++++++++++++++++++++------------ 7 files changed, 137 insertions(+), 68 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h index de0d21211c34..71ad0f93eb71 100644 --- a/arch/arm64/include/asm/kasan.h +++ b/arch/arm64/include/asm/kasan.h @@ -14,7 +14,7 @@ * KASAN_SHADOW_END: KASAN_SHADOW_START + 1/8 of kernel virtual addresses. */ #define KASAN_SHADOW_START (VA_START) -#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1UL << (VA_BITS - 3))) +#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) /* * This value is used to map an address to the corresponding shadow diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index aebc739f5a11..4388651d1f0d 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -45,16 +45,15 @@ * VA_START - the first kernel virtual address. * TASK_SIZE - the maximum size of a user space task. * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. - * The module space lives between the addresses given by TASK_SIZE - * and PAGE_OFFSET - it must be within 128MB of the kernel text. */ #define VA_BITS (CONFIG_ARM64_VA_BITS) #define VA_START (UL(0xffffffffffffffff) << VA_BITS) #define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) -#define KIMAGE_VADDR (PAGE_OFFSET) -#define MODULES_END (KIMAGE_VADDR) -#define MODULES_VADDR (MODULES_END - SZ_64M) -#define PCI_IO_END (MODULES_VADDR - SZ_2M) +#define KIMAGE_VADDR (MODULES_END) +#define MODULES_END (MODULES_VADDR + MODULES_VSIZE) +#define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE) +#define MODULES_VSIZE (SZ_64M) +#define PCI_IO_END (PAGE_OFFSET - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) #define TASK_SIZE_64 (UL(1) << VA_BITS) @@ -71,6 +70,16 @@ #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) +/* + * The size of the KASAN shadow region. This should be 1/8th of the + * size of the entire kernel virtual address space. + */ +#ifdef CONFIG_KASAN +#define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - 3)) +#else +#define KASAN_SHADOW_SIZE (0) +#endif + /* * Physical vs virtual RAM address space conversion. These are * private definitions which should NOT be used outside memory.h diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9a560b368910..c3c2518eecfe 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -36,19 +36,13 @@ * * VMEMAP_SIZE: allows the whole linear region to be covered by a struct page array * (rounded up to PUD_SIZE). 
- * VMALLOC_START: beginning of the kernel VA space + * VMALLOC_START: beginning of the kernel vmalloc space * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ #define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) -#ifndef CONFIG_KASAN -#define VMALLOC_START (VA_START) -#else -#include -#define VMALLOC_START (KASAN_SHADOW_END + SZ_64K) -#endif - +#define VMALLOC_START (MODULES_END) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) #define VMEMMAP_START (VMALLOC_END + SZ_64K) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 0841b2bf0e6a..6be918478f85 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -35,7 +35,9 @@ struct addr_marker { }; enum address_markers_idx { - VMALLOC_START_NR = 0, + MODULES_START_NR = 0, + MODULES_END_NR, + VMALLOC_START_NR, VMALLOC_END_NR, #ifdef CONFIG_SPARSEMEM_VMEMMAP VMEMMAP_START_NR, @@ -45,12 +47,12 @@ enum address_markers_idx { FIXADDR_END_NR, PCI_START_NR, PCI_END_NR, - MODULES_START_NR, - MODULES_END_NR, KERNEL_SPACE_NR, }; static struct addr_marker address_markers[] = { + { MODULES_VADDR, "Modules start" }, + { MODULES_END, "Modules end" }, { VMALLOC_START, "vmalloc() Area" }, { VMALLOC_END, "vmalloc() End" }, #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -61,9 +63,7 @@ static struct addr_marker address_markers[] = { { FIXADDR_TOP, "Fixmap end" }, { PCI_IO_START, "PCI I/O start" }, { PCI_IO_END, "PCI I/O end" }, - { MODULES_VADDR, "Modules start" }, - { MODULES_END, "Modules end" }, - { PAGE_OFFSET, "Kernel Mapping" }, + { PAGE_OFFSET, "Linear Mapping" }, { -1, NULL }, }; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index dba32ceff17a..ac4d8159d6f3 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -302,22 +303,26 @@ void __init mem_init(void) #ifdef CONFIG_KASAN " kasan : 0x%16lx - 0x%16lx (%6ld GB)\n" #endif + " modules : 0x%16lx - 0x%16lx (%6ld MB)\n" " vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n" + " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" + " .data : 0x%p" " - 0x%p" " (%6ld KB)\n" #ifdef CONFIG_SPARSEMEM_VMEMMAP " vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n" " 0x%16lx - 0x%16lx (%6ld MB actual)\n" #endif " fixed : 0x%16lx - 0x%16lx (%6ld KB)\n" " PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n" - " modules : 0x%16lx - 0x%16lx (%6ld MB)\n" - " memory : 0x%16lx - 0x%16lx (%6ld MB)\n" - " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" - " .data : 0x%p" " - 0x%p" " (%6ld KB)\n", + " memory : 0x%16lx - 0x%16lx (%6ld MB)\n", #ifdef CONFIG_KASAN MLG(KASAN_SHADOW_START, KASAN_SHADOW_END), #endif + MLM(MODULES_VADDR, MODULES_END), MLG(VMALLOC_START, VMALLOC_END), + MLK_ROUNDUP(__init_begin, __init_end), + MLK_ROUNDUP(_text, _etext), + MLK_ROUNDUP(_sdata, _edata), #ifdef CONFIG_SPARSEMEM_VMEMMAP MLG(VMEMMAP_START, VMEMMAP_START + VMEMMAP_SIZE), @@ -326,11 +331,7 @@ void __init mem_init(void) #endif MLK(FIXADDR_START, FIXADDR_TOP), MLM(PCI_IO_START, PCI_IO_END), - MLM(MODULES_VADDR, MODULES_END), - MLM(PAGE_OFFSET, (unsigned long)high_memory), - MLK_ROUNDUP(__init_begin, __init_end), - MLK_ROUNDUP(_text, _etext), - MLK_ROUNDUP(_sdata, _edata)); + MLM(PAGE_OFFSET, (unsigned long)high_memory)); #undef MLK #undef MLM @@ -358,8 +359,8 @@ void __init mem_init(void) void free_initmem(void) { - fixup_init(); free_initmem_default(0); + fixup_init(); } #ifdef 
CONFIG_BLK_DEV_INITRD diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index cc569a38bc76..7f10cc91fa8a 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -17,9 +17,11 @@ #include #include +#include #include #include #include +#include #include static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE); @@ -33,7 +35,7 @@ static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr, if (pmd_none(*pmd)) pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); - pte = pte_offset_kernel(pmd, addr); + pte = pte_offset_kimg(pmd, addr); do { next = addr + PAGE_SIZE; set_pte(pte, pfn_pte(virt_to_pfn(kasan_zero_page), @@ -51,7 +53,7 @@ static void __init kasan_early_pmd_populate(pud_t *pud, if (pud_none(*pud)) pud_populate(&init_mm, pud, kasan_zero_pmd); - pmd = pmd_offset(pud, addr); + pmd = pmd_offset_kimg(pud, addr); do { next = pmd_addr_end(addr, end); kasan_early_pte_populate(pmd, addr, next); @@ -68,7 +70,7 @@ static void __init kasan_early_pud_populate(pgd_t *pgd, if (pgd_none(*pgd)) pgd_populate(&init_mm, pgd, kasan_zero_pud); - pud = pud_offset(pgd, addr); + pud = pud_offset_kimg(pgd, addr); do { next = pud_addr_end(addr, end); kasan_early_pmd_populate(pud, addr, next); @@ -126,9 +128,13 @@ static void __init clear_pgds(unsigned long start, void __init kasan_init(void) { + u64 kimg_shadow_start, kimg_shadow_end; struct memblock_region *reg; int i; + kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); + kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); + /* * We are going to perform proper setup of shadow memory. * At first we should unmap early shadow (clear_pgds() call bellow). @@ -142,8 +148,23 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); + vmemmap_populate(kimg_shadow_start, kimg_shadow_end, NUMA_NO_NODE); + + /* + * vmemmap_populate() has populated the shadow region that covers the + * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round + * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent + * kasan_populate_zero_shadow() from replacing the PMD block mappings + * with PMD table mappings at the edges of the shadow region for the + * kernel image. 
+ */ + if (ARM64_SWAPPER_USES_SECTION_MAPS) + kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); + kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, kasan_mem_to_shadow((void *)MODULES_VADDR)); + kasan_populate_zero_shadow((void *)kimg_shadow_end, + kasan_mem_to_shadow((void *)PAGE_OFFSET)); for_each_memblock(memory, reg) { void *start = (void *)__phys_to_virt(reg->base); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 0f58a45df1f3..895a8457259c 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -53,6 +53,10 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS); unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); +static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; +static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused; +static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused; + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -380,16 +384,15 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) { - unsigned long kernel_start = __pa(_stext); - unsigned long kernel_end = __pa(_end); + unsigned long kernel_end = __pa(_etext); /* - * The kernel itself is mapped at page granularity. Map all other - * memory, making sure we don't overwrite the existing kernel mappings. + * Take care not to create a writable alias for the + * read-only text and rodata sections of the kernel image. */ - /* No overlap with the kernel. */ + /* No overlap with the kernel text */ if (end < kernel_start || start >= kernel_end) { __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, @@ -398,8 +401,8 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end } /* - * This block overlaps the kernel mapping. Map the portion(s) which - * don't overlap. + * This block overlaps the kernel text mapping. + * Map the portion(s) which don't overlap. */ if (start < kernel_start) __create_pgd_mapping(pgd, start, @@ -411,6 +414,16 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end __phys_to_virt(kernel_end), end - kernel_end, PAGE_KERNEL, early_pgtable_alloc); + + /* + * Map the linear alias of the [_stext, _etext) interval as + * read-only/non-executable. This makes the contents of the + * region accessible to subsystems such as hibernate, but + * protects it from inadvertent modification or execution. + */ + __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), + kernel_end - kernel_start, PAGE_KERNEL_RO, + early_pgtable_alloc); } static void __init map_mem(pgd_t *pgd) @@ -429,25 +442,28 @@ static void __init map_mem(pgd_t *pgd) } } -#ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { + if (!IS_ENABLED(CONFIG_DEBUG_RODATA)) + return; + create_mapping_late(__pa(_stext), (unsigned long)_stext, (unsigned long)_etext - (unsigned long)_stext, PAGE_KERNEL_ROX); - } -#endif void fixup_init(void) { - create_mapping_late(__pa(__init_begin), (unsigned long)__init_begin, - (unsigned long)__init_end - (unsigned long)__init_begin, - PAGE_KERNEL); + /* + * Unmap the __init region but leave the VM area in place. This + * prevents the region from being reused for kernel modules, which + * is not supported by kallsyms. 
+ */ + unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin)); } static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, - pgprot_t prot) + pgprot_t prot, struct vm_struct *vma) { phys_addr_t pa_start = __pa(va_start); unsigned long size = va_end - va_start; @@ -457,6 +473,14 @@ static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, early_pgtable_alloc); + + vma->addr = va_start; + vma->phys_addr = pa_start; + vma->size = size; + vma->flags = VM_MAP; + vma->caller = __builtin_return_address(0); + + vm_area_add_early(vma); } /* @@ -464,17 +488,35 @@ static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, */ static void __init map_kernel(pgd_t *pgd) { + static struct vm_struct vmlinux_text, vmlinux_init, vmlinux_data; - map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC); - map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC); - map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL); + map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC, &vmlinux_text); + map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, + &vmlinux_init); + map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); - /* - * The fixmap falls in a separate pgd to the kernel, and doesn't live - * in the carveout for the swapper_pg_dir. We can simply re-use the - * existing dir for the fixmap. - */ - set_pgd(pgd_offset_raw(pgd, FIXADDR_START), *pgd_offset_k(FIXADDR_START)); + if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) { + /* + * The fixmap falls in a separate pgd to the kernel, and doesn't + * live in the carveout for the swapper_pg_dir. We can simply + * re-use the existing dir for the fixmap. + */ + set_pgd(pgd_offset_raw(pgd, FIXADDR_START), + *pgd_offset_k(FIXADDR_START)); + } else if (CONFIG_PGTABLE_LEVELS > 3) { + /* + * The fixmap shares its top level pgd entry with the kernel + * mapping. This can really only occur when we are running + * with 16k/4 levels, so we can simply reuse the pud level + * entry instead. + */ + BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); + set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START), + __pud(__pa(bm_pmd) | PUD_TYPE_TABLE)); + pud_clear_fixmap(); + } else { + BUG(); + } kasan_copy_shadow(pgd); } @@ -600,14 +642,6 @@ void vmemmap_free(unsigned long start, unsigned long end) } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; -#if CONFIG_PGTABLE_LEVELS > 2 -static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss; -#endif -#if CONFIG_PGTABLE_LEVELS > 3 -static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss; -#endif - static inline pud_t * fixmap_pud(unsigned long addr) { pgd_t *pgd = pgd_offset_k(addr); @@ -639,8 +673,18 @@ void __init early_fixmap_init(void) unsigned long addr = FIXADDR_START; pgd = pgd_offset_k(addr); - pgd_populate(&init_mm, pgd, bm_pud); - pud = fixmap_pud(addr); + if (CONFIG_PGTABLE_LEVELS > 3 && !pgd_none(*pgd)) { + /* + * We only end up here if the kernel mapping and the fixmap + * share the top level pgd entry, which should only happen on + * 16k/4 levels configurations. 
+ */ + BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); + pud = pud_offset_kimg(pgd, addr); + } else { + pgd_populate(&init_mm, pgd, bm_pud); + pud = fixmap_pud(addr); + } pud_populate(&init_mm, pud, bm_pmd); pmd = fixmap_pmd(addr); pmd_populate_kernel(&init_mm, pmd, bm_pte); -- cgit v1.2.3 From 368a063148f5d4e5b40e65b9954a9b98b1a7cc3c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:41 +0100 Subject: arm64: defer __va translation of initrd_start and initrd_end Before deferring the assignment of memstart_addr in a subsequent patch, to the moment where all memory has been discovered and possibly clipped based on the size of the linear region and the presence of a mem= command line parameter, we need to ensure that memstart_addr is not used to perform __va translations before it is assigned. One such use is in the generic early DT discovery of the initrd location, which is recorded as a virtual address in the globals initrd_start and initrd_end. So wire up the generic support to declare the initrd addresses, and implement it without __va() translations, and perform the translation after memstart_addr has been assigned. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit a89dea585371a9d5d85499db47c93f129be8e0c4) Signed-off-by: Alex Shi --- arch/arm64/include/asm/memory.h | 8 ++++++++ arch/arm64/mm/init.c | 13 +++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 4388651d1f0d..18b7e77c7495 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -121,6 +121,14 @@ #define IOREMAP_MAX_ORDER (PMD_SHIFT) #endif +#ifdef CONFIG_BLK_DEV_INITRD +#define __early_init_dt_declare_initrd(__start, __end) \ + do { \ + initrd_start = (__start); \ + initrd_end = (__end); \ + } while (0) +#endif + #ifndef __ASSEMBLY__ extern phys_addr_t memstart_addr; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index ac4d8159d6f3..92acbee2bb8b 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -59,8 +59,8 @@ static int __init early_initrd(char *p) if (*endp == ',') { size = memparse(endp + 1, NULL); - initrd_start = (unsigned long)__va(start); - initrd_end = (unsigned long)__va(start + size); + initrd_start = start; + initrd_end = start + size; } return 0; } @@ -168,8 +168,13 @@ void __init arm64_memblock_init(void) */ memblock_reserve(__pa(_text), _end - _text); #ifdef CONFIG_BLK_DEV_INITRD - if (initrd_start) - memblock_reserve(__virt_to_phys(initrd_start), initrd_end - initrd_start); + if (initrd_start) { + memblock_reserve(initrd_start, initrd_end - initrd_start); + + /* the generic initrd code expects virtual addresses */ + initrd_start = __phys_to_virt(initrd_start); + initrd_end = __phys_to_virt(initrd_end); + } #endif early_init_fdt_scan_reserved_mem(); -- cgit v1.2.3 From 72b991537db5f4c361b540cbf0059c7268d848c4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 16 Feb 2016 13:52:42 +0100 Subject: arm64: allow kernel Image to be loaded anywhere in physical memory This relaxes the kernel Image placement requirements, so that it may be placed at any 2 MB aligned offset in physical memory. This is accomplished by ignoring PHYS_OFFSET when installing memblocks, and accounting for the apparent virtual offset of the kernel Image. As a result, virtual address references below PAGE_OFFSET are correctly mapped onto physical references into the kernel Image regardless of where it sits in memory. 
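Accounting for that apparent virtual offset boils down to a two-way translation; the following is a simplified C rendering of the __virt_to_phys() logic introduced below (the standalone function is illustrative, while the variable names match the patch):

	extern u64 kimage_voffset;	/* KIMAGE_VADDR minus the Image's physical base */

	static inline phys_addr_t virt_to_phys_sketch(unsigned long x)
	{
		if (x >= PAGE_OFFSET)			/* linear-map address */
			return x - PAGE_OFFSET + PHYS_OFFSET;
		return x - kimage_voffset;		/* kernel-image address */
	}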
Special care needs to be taken for dealing with memory limits passed via mem=, since the generic implementation clips memory top down, which may clip the kernel image itself if it is loaded high up in memory. To deal with this case, we simply add back the memory covering the kernel image, which may result in more memory to be retained than was passed as a mem= parameter. Since mem= should not be considered a production feature, a panic notifier handler is installed that dumps the memory limit at panic time if one was set. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit a7f8de168ace487fa7b88cb154e413cf40e87fc6) Signed-off-by: Alex Shi --- arch/arm64/include/asm/boot.h | 6 ++++ arch/arm64/include/asm/kernel-pgtable.h | 12 +++++++ arch/arm64/include/asm/kvm_asm.h | 17 +-------- arch/arm64/include/asm/memory.h | 18 +++++----- arch/arm64/kernel/head.S | 6 +++- arch/arm64/kernel/image.h | 13 ++++--- arch/arm64/mm/init.c | 63 +++++++++++++++++++++++++++++++-- arch/arm64/mm/mmu.c | 3 ++ 8 files changed, 106 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h index 81151b67b26b..ebf2481889c3 100644 --- a/arch/arm64/include/asm/boot.h +++ b/arch/arm64/include/asm/boot.h @@ -11,4 +11,10 @@ #define MIN_FDT_ALIGN 8 #define MAX_FDT_SIZE SZ_2M +/* + * arm64 requires the kernel image to placed + * TEXT_OFFSET bytes beyond a 2 MB aligned base + */ +#define MIN_KIMG_ALIGN SZ_2M + #endif diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index a459714ee29e..5c6375d8528b 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -79,5 +79,17 @@ #define SWAPPER_MM_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS) #endif +/* + * To make optimal use of block mappings when laying out the linear + * mapping, round down the base of physical memory to a size that can + * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE + * (64k granule), or a multiple that can be mapped using contiguous bits + * in the page tables: 32 * PMD_SIZE (16k granule) + */ +#ifdef CONFIG_ARM64_64K_PAGES +#define ARM64_MEMSTART_ALIGN SZ_512M +#else +#define ARM64_MEMSTART_ALIGN SZ_1G +#endif #endif /* __ASM_KERNEL_PGTABLE_H */ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index e95c39543629..419bc6661b5c 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -102,24 +102,9 @@ #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) -#define kvm_ksym_ref(sym) ((void *)&sym + kvm_ksym_shift) +#define kvm_ksym_ref(sym) phys_to_virt((u64)&sym - kimage_voffset) #ifndef __ASSEMBLY__ -#if __GNUC__ > 4 -#define kvm_ksym_shift (PAGE_OFFSET - KIMAGE_VADDR) -#else -/* - * GCC versions 4.9 and older will fold the constant below into the addend of - * the reference to 'sym' above if kvm_ksym_shift is declared static or if the - * constant is used directly. However, since we use the small code model for - * the core kernel, the reference to 'sym' will be emitted as a adrp/add pair, - * with a +/- 4 GB range, resulting in linker relocation errors if the shift - * is sufficiently large. So prevent the compiler from folding the shift into - * the addend, by making the shift a variable with external linkage. 
- */ -__weak u64 kvm_ksym_shift = PAGE_OFFSET - KIMAGE_VADDR; -#endif - struct kvm; struct kvm_vcpu; diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 18b7e77c7495..3239e4d78e0d 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -24,6 +24,7 @@ #include #include #include +#include #include /* @@ -88,10 +89,10 @@ #define __virt_to_phys(x) ({ \ phys_addr_t __x = (phys_addr_t)(x); \ __x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) : \ - (__x - KIMAGE_VADDR + PHYS_OFFSET); }) + (__x - kimage_voffset); }) #define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) -#define __phys_to_kimg(x) ((unsigned long)((x) - PHYS_OFFSET + KIMAGE_VADDR)) +#define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset)) /* * Convert a page to/from a physical address @@ -133,15 +134,16 @@ extern phys_addr_t memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ -#define PHYS_OFFSET ({ memstart_addr; }) +#define PHYS_OFFSET ({ BUG_ON(memstart_addr & 1); memstart_addr; }) + +/* the offset between the kernel virtual and physical mappings */ +extern u64 kimage_voffset; /* - * The maximum physical address that the linear direct mapping - * of system RAM can cover. (PAGE_OFFSET can be interpreted as - * a 2's complement signed quantity and negated to derive the - * maximum size of the linear mapping.) + * Allow all memory at the discovery stage. We will clip it later. */ -#define MAX_MEMBLOCK_ADDR ({ memstart_addr - PAGE_OFFSET - 1; }) +#define MIN_MEMBLOCK_ADDR 0 +#define MAX_MEMBLOCK_ADDR U64_MAX /* * PFNs are used to describe any physical page; this means diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 04d38a058b19..05b98289093e 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -428,7 +428,11 @@ __mmap_switched: and x4, x4, #~(THREAD_SIZE - 1) msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer - str_l x24, memstart_addr, x6 // Save PHYS_OFFSET + + ldr x4, =KIMAGE_VADDR // Save the offset between + sub x4, x4, x24 // the kernel virtual and + str_l x4, kimage_voffset, x5 // physical mappings + mov x29, #0 #ifdef CONFIG_KASAN bl kasan_early_init diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 999633bd7294..c9c62cab25a4 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -42,15 +42,18 @@ #endif #ifdef CONFIG_CPU_BIG_ENDIAN -#define __HEAD_FLAG_BE 1 +#define __HEAD_FLAG_BE 1 #else -#define __HEAD_FLAG_BE 0 +#define __HEAD_FLAG_BE 0 #endif -#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2) +#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2) -#define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \ - (__HEAD_FLAG_PAGE_SIZE << 1)) +#define __HEAD_FLAG_PHYS_BASE 1 + +#define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \ + (__HEAD_FLAG_PAGE_SIZE << 1) | \ + (__HEAD_FLAG_PHYS_BASE << 3)) /* * These will output as part of the Image header, which should be little-endian diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 92acbee2bb8b..2c7a3c2868e4 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -35,8 +35,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -46,7 +48,13 @@ #include "mm.h" -phys_addr_t memstart_addr __read_mostly = 0; +/* + * We need to be able to catch inadvertent references to memstart_addr + * that occur (potentially in generic code) before arm64_memblock_init() + * executes, which assigns it its actual value. 
So use a default value + * that cannot be mistaken for a real physical address. + */ +phys_addr_t memstart_addr __read_mostly = ~0ULL; phys_addr_t arm64_dma_phys_limit __read_mostly; #ifdef CONFIG_BLK_DEV_INITRD @@ -160,7 +168,33 @@ early_param("mem", early_mem); void __init arm64_memblock_init(void) { - memblock_enforce_memory_limit(memory_limit); + const s64 linear_region_size = -(s64)PAGE_OFFSET; + + /* + * Select a suitable value for the base of physical memory. + */ + memstart_addr = round_down(memblock_start_of_DRAM(), + ARM64_MEMSTART_ALIGN); + + /* + * Remove the memory that we will not be able to cover with the + * linear mapping. Take care not to clip the kernel which may be + * high in memory. + */ + memblock_remove(max(memstart_addr + linear_region_size, __pa(_end)), + ULLONG_MAX); + if (memblock_end_of_DRAM() > linear_region_size) + memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); + + /* + * Apply the memory limit if it was set. Since the kernel may be loaded + * high up in memory, add back the kernel region that must be accessible + * via the linear mapping. + */ + if (memory_limit != (phys_addr_t)ULLONG_MAX) { + memblock_enforce_memory_limit(memory_limit); + memblock_add(__pa(_text), (u64)(_end - _text)); + } /* * Register the kernel text, kernel data, initrd, and initial @@ -386,3 +420,28 @@ static int __init keepinitrd_setup(char *__unused) __setup("keepinitrd", keepinitrd_setup); #endif + +/* + * Dump out memory limit information on panic. + */ +static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p) +{ + if (memory_limit != (phys_addr_t)ULLONG_MAX) { + pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20); + } else { + pr_emerg("Memory Limit: none\n"); + } + return 0; +} + +static struct notifier_block mem_limit_notifier = { + .notifier_call = dump_mem_limit, +}; + +static int __init register_mem_limit_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &mem_limit_notifier); + return 0; +} +__initcall(register_mem_limit_dumper); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 895a8457259c..fb5c872fe3d6 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -46,6 +46,9 @@ u64 idmap_t0sz = TCR_T0SZ(VA_BITS); +u64 kimage_voffset __read_mostly; +EXPORT_SYMBOL(kimage_voffset); + /* * Empty_zero_page is a special page that is used for zero-initialized data * and COW. -- cgit v1.2.3 From a67099df67dec4550a650aa3b871e2b0ecd20957 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 19 Feb 2016 14:28:58 +0000 Subject: arm64: User die() instead of panic() in do_page_fault() The former gives better error reporting on unhandled permission faults (introduced by the UAO patches). 
Signed-off-by: Catalin Marinas (cherry picked from commit 70c8abc28762d04e36c92e07eee2ce6ab41049cb) Signed-off-by: Alex Shi --- arch/arm64/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a8eafeceb08a..44e56de23f79 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -235,10 +235,10 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (permission_fault(esr) && (addr < USER_DS)) { if (get_fs() == KERNEL_DS) - panic("Accessing user space memory with fs=KERNEL_DS"); + die("Accessing user space memory with fs=KERNEL_DS", regs, esr); if (!search_exception_tables(regs->pc)) - panic("Accessing user space memory outside uaccess.h routines"); + die("Accessing user space memory outside uaccess.h routines", regs, esr); } /* -- cgit v1.2.3 From bf7cb966b2f5ef5a456a25d84d2b64c79730a07a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 22 Feb 2016 18:46:03 +0100 Subject: arm64: mm: only perform memstart_addr sanity check if DEBUG_VM Checking whether memstart_addr has been assigned every time it is referenced adds a branch instruction that may hurt performance if the reference in question occurs on a hot path. So only perform the check if CONFIG_DEBUG_VM=y. Signed-off-by: Ard Biesheuvel [catalin.marinas@arm.com: replaced #ifdef with VM_BUG_ON] Signed-off-by: Catalin Marinas (cherry picked from commit a92405f082d43267575444a6927085e4c8a69e4e) Signed-off-by: Alex Shi --- arch/arm64/include/asm/memory.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 3239e4d78e0d..460d09bf9442 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -132,9 +132,11 @@ #ifndef __ASSEMBLY__ +#include + extern phys_addr_t memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ -#define PHYS_OFFSET ({ BUG_ON(memstart_addr & 1); memstart_addr; }) +#define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) /* the offset between the kernel virtual and physical mappings */ extern u64 kimage_voffset; -- cgit v1.2.3 From d3bb0180b3ecbcff076ca7e41d4ad4fa0ee4c9d7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 22 Feb 2016 18:46:04 +0100 Subject: arm64: mm: use bit ops rather than arithmetic in pa/va translations Since PAGE_OFFSET is chosen such that it cuts the kernel VA space right in half, and since the size of the kernel VA space itself is always a power of 2, we can treat PAGE_OFFSET as a bitmask and replace the additions/subtractions with 'or' and 'and-not' operations. For the comparison against PAGE_OFFSET, a mov/cmp/branch sequence ends up getting replaced with a single tbz instruction. For the additions and subtractions, we save a mov instruction since the mask is folded into the instruction's immediate field. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 8439e62a15614e8fcd43835d57b7245cd9870dc5) Signed-off-by: Alex Shi --- arch/arm64/include/asm/memory.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 460d09bf9442..eb798156cf56 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -88,10 +88,10 @@ */ #define __virt_to_phys(x) ({ \ phys_addr_t __x = (phys_addr_t)(x); \ - __x >= PAGE_OFFSET ? 
(__x - PAGE_OFFSET + PHYS_OFFSET) : \ - (__x - kimage_voffset); }) + __x & BIT(VA_BITS - 1) ? (__x & ~PAGE_OFFSET) + PHYS_OFFSET : \ + (__x - kimage_voffset); }) -#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) +#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET) #define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset)) /* @@ -132,6 +132,7 @@ #ifndef __ASSEMBLY__ +#include #include extern phys_addr_t memstart_addr; -- cgit v1.2.3 From 11e7d3ccfae5510815c72cec4066d5d5acbfa718 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 23 Feb 2016 08:56:45 +0100 Subject: arm64: move brk immediate argument definitions to separate header Instead of reversing the header dependency between asm/bug.h and asm/debug-monitors.h, split off the brk instruction immediate value defines into a new header asm/brk-imm.h, and include it from both. This solves the circular dependency issue that prevents BUG() from being used in some header files, and keeps the definitions together. Signed-off-by: Ard Biesheuvel Acked-by: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit f98deee9a9f8c47d05a0f64d86440882dca772ff) Signed-off-by: Alex Shi --- arch/arm64/include/asm/brk-imm.h | 25 +++++++++++++++++++++++++ arch/arm64/include/asm/bug.h | 2 +- arch/arm64/include/asm/debug-monitors.h | 14 +------------- 3 files changed, 27 insertions(+), 14 deletions(-) create mode 100644 arch/arm64/include/asm/brk-imm.h (limited to 'arch') diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h new file mode 100644 index 000000000000..ed693c5bcec0 --- /dev/null +++ b/arch/arm64/include/asm/brk-imm.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __ASM_BRK_IMM_H +#define __ASM_BRK_IMM_H + +/* + * #imm16 values used for BRK instruction generation + * Allowed values for kgdb are 0x400 - 0x7ff + * 0x100: for triggering a fault on purpose (reserved) + * 0x400: for dynamic BRK instruction + * 0x401: for compile time BRK instruction + * 0x800: kernel-mode BUG() and WARN() traps + */ +#define FAULT_BRK_IMM 0x100 +#define KGDB_DYN_DBG_BRK_IMM 0x400 +#define KGDB_COMPILED_DBG_BRK_IMM 0x401 +#define BUG_BRK_IMM 0x800 + +#endif diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 679d49221998..561190d15881 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -18,7 +18,7 @@ #ifndef _ARCH_ARM64_ASM_BUG_H #define _ARCH_ARM64_ASM_BUG_H -#define BUG_BRK_IMM 0x800 +#include #ifdef CONFIG_GENERIC_BUG #define HAVE_ARCH_BUG diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index e893a1fca9c2..2fcb9b7c876c 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include @@ -47,18 +47,6 @@ */ #define BREAK_INSTR_SIZE AARCH64_INSN_SIZE -/* - * #imm16 values used for BRK instruction generation - * Allowed values for kgbd are 0x400 - 0x7ff - * 0x100: for triggering a fault on purpose (reserved) - * 0x400: for dynamic BRK instruction - * 0x401: for compile time BRK instruction - * 0x800: kernel-mode BUG() and WARN() traps - */ -#define FAULT_BRK_IMM 0x100 -#define KGDB_DYN_DBG_BRK_IMM 0x400 -#define KGDB_COMPILED_DBG_BRK_IMM 0x401 - /* * BRK instruction encoding * The #imm16 value should be placed at bits[20:5] within BRK ins -- cgit v1.2.3 From 6537906675622c231257664593ed3174b117b3ef Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 24 Nov 2015 12:37:35 +0100 Subject: arm64: add support for module PLTs This adds support for emitting PLTs at module load time for relative branches that are out of range. This is a prerequisite for KASLR, which may place the kernel and the modules anywhere in the vmalloc area, making it more likely that branch target offsets exceed the maximum range of +/- 128 MB. In this version, I removed the distinction between relocations against .init executable sections and ordinary executable sections. The reason is that it is hardly worth the trouble, given that .init.text usually does not contain that many far branches, and this version now only reserves PLT entry space for jump and call relocations against undefined symbols (since symbols defined in the same module can be assumed to be within +/- 128 MB). For example, the mac80211.ko module (which is fairly sizable at ~400 KB) built with -mcmodel=large gives the following relocation counts:

                  relocs  branches  unique  !local
  .text             3925      3347     518     219
  .init.text          11         8       7       1
  .exit.text           4         4       4       1
  .text.unlikely      81        67      36      17

('unique' means branches to unique type/symbol/addend combos, of which !local is the subset referring to undefined symbols) IOW, we are only emitting a single PLT entry for the .init sections, and we are better off just adding it to the core PLT section instead.
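The +/- 128 MB figure follows from the AArch64 B/BL encoding, which carries a signed 26-bit immediate counted in 4-byte words; a self-contained range check, written here as an illustrative sketch rather than kernel code:

	#include <stdbool.h>
	#include <stdint.h>

	/* imm26 * 4 gives a reach of +/- 2^27 bytes, i.e. +/- 128 MB */
	static bool branch_in_range(uint64_t pc, uint64_t target)
	{
		int64_t off = (int64_t)(target - pc);

		return (off & 3) == 0 &&
		       off >= -(1LL << 27) && off < (1LL << 27);
	}

A jump or call relocation whose target fails a check of this kind is the case that gets routed through a PLT veneer by module_emit_plt_entry() below.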
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit fd045f6cd98ec4953147b318418bd45e441e52a3) Signed-off-by: Alex Shi --- arch/arm64/Kconfig | 9 ++ arch/arm64/Makefile | 6 +- arch/arm64/include/asm/module.h | 11 +++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/module-plts.c | 201 ++++++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/module.c | 22 +++++ arch/arm64/kernel/module.lds | 3 + 7 files changed, 252 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kernel/module-plts.c create mode 100644 arch/arm64/kernel/module.lds (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 8cd8d06ece4a..22db20491733 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -365,6 +365,7 @@ config ARM64_ERRATUM_843419 bool "Cortex-A53: 843419: A load or store might access an incorrect address" depends on MODULES default y + select ARM64_MODULE_CMODEL_LARGE help This option builds kernel modules using the large memory model in order to avoid the use of the ADRP instruction, which can cause @@ -728,6 +729,14 @@ config ARM64_UAO regular load/store instructions if the cpu does not implement the feature. +config ARM64_MODULE_CMODEL_LARGE + bool + +config ARM64_MODULE_PLTS + bool + select ARM64_MODULE_CMODEL_LARGE + select HAVE_MOD_ARCH_SPECIFIC + endmenu menu "Boot options" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 548a2939d7e6..71054a38decf 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -43,10 +43,14 @@ endif CHECKFLAGS += -D__aarch64__ -ifeq ($(CONFIG_ARM64_ERRATUM_843419), y) +ifeq ($(CONFIG_ARM64_MODULE_CMODEL_LARGE), y) KBUILD_CFLAGS_MODULE += -mcmodel=large endif +ifeq ($(CONFIG_ARM64_MODULE_PLTS),y) +KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/arm64/kernel/module.lds +endif + # Default value head-y := arch/arm64/kernel/head.o diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index e80e232b730e..8652fb613304 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -20,4 +20,15 @@ #define MODULE_ARCH_VERMAGIC "aarch64" +#ifdef CONFIG_ARM64_MODULE_PLTS +struct mod_arch_specific { + struct elf64_shdr *plt; + int plt_num_entries; + int plt_max_entries; +}; +#endif + +u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, + Elf64_Sym *sym); + #endif /* __ASM_MODULE_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index c4e2f70c0aa0..8d971f9c6ed5 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -30,6 +30,7 @@ arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ ../../arm/kernel/opcodes.o arm64-obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o +arm64-obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o arm64-obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c new file mode 100644 index 000000000000..1ce90d8450ae --- /dev/null +++ b/arch/arm64/kernel/module-plts.c @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2014-2016 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +struct plt_entry { + /* + * A program that conforms to the AArch64 Procedure Call Standard + * (AAPCS64) must assume that a veneer that alters IP0 (x16) and/or + * IP1 (x17) may be inserted at any branch instruction that is + * exposed to a relocation that supports long branches. Since that + * is exactly what we are dealing with here, we are free to use x16 + * as a scratch register in the PLT veneers. + */ + __le32 mov0; /* movn x16, #0x.... */ + __le32 mov1; /* movk x16, #0x...., lsl #16 */ + __le32 mov2; /* movk x16, #0x...., lsl #32 */ + __le32 br; /* br x16 */ +}; + +u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, + Elf64_Sym *sym) +{ + struct plt_entry *plt = (struct plt_entry *)mod->arch.plt->sh_addr; + int i = mod->arch.plt_num_entries; + u64 val = sym->st_value + rela->r_addend; + + /* + * We only emit PLT entries against undefined (SHN_UNDEF) symbols, + * which are listed in the ELF symtab section, but without a type + * or a size. + * So, similar to how the module loader uses the Elf64_Sym::st_value + * field to store the resolved addresses of undefined symbols, let's + * borrow the Elf64_Sym::st_size field (whose value is never used by + * the module loader, even for symbols that are defined) to record + * the address of a symbol's associated PLT entry as we emit it for a + * zero addend relocation (which is the only kind we have to deal with + * in practice). This allows us to find duplicates without having to + * go through the table every time. + */ + if (rela->r_addend == 0 && sym->st_size != 0) { + BUG_ON(sym->st_size < (u64)plt || sym->st_size >= (u64)&plt[i]); + return sym->st_size; + } + + mod->arch.plt_num_entries++; + BUG_ON(mod->arch.plt_num_entries > mod->arch.plt_max_entries); + + /* + * MOVK/MOVN/MOVZ opcode: + * +--------+------------+--------+-----------+-------------+---------+ + * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] | + * +--------+------------+--------+-----------+-------------+---------+ + * + * Rd := 0x10 (x16) + * hw := 0b00 (no shift), 0b01 (lsl #16), 0b10 (lsl #32) + * opc := 0b11 (MOVK), 0b00 (MOVN), 0b10 (MOVZ) + * sf := 1 (64-bit variant) + */ + plt[i] = (struct plt_entry){ + cpu_to_le32(0x92800010 | (((~val ) & 0xffff)) << 5), + cpu_to_le32(0xf2a00010 | ((( val >> 16) & 0xffff)) << 5), + cpu_to_le32(0xf2c00010 | ((( val >> 32) & 0xffff)) << 5), + cpu_to_le32(0xd61f0200) + }; + + if (rela->r_addend == 0) + sym->st_size = (u64)&plt[i]; + + return (u64)&plt[i]; +} + +#define cmp_3way(a,b) ((a) < (b) ? -1 : (a) > (b)) + +static int cmp_rela(const void *a, const void *b) +{ + const Elf64_Rela *x = a, *y = b; + int i; + + /* sort by type, symbol index and addend */ + i = cmp_3way(ELF64_R_TYPE(x->r_info), ELF64_R_TYPE(y->r_info)); + if (i == 0) + i = cmp_3way(ELF64_R_SYM(x->r_info), ELF64_R_SYM(y->r_info)); + if (i == 0) + i = cmp_3way(x->r_addend, y->r_addend); + return i; +} + +static bool duplicate_rel(const Elf64_Rela *rela, int num) +{ + /* + * Entries are sorted by type, symbol index and addend. That means + * that, if a duplicate entry exists, it must be in the preceding + * slot. 
+ */ + return num > 0 && cmp_rela(rela + num, rela + num - 1) == 0; +} + +static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num) +{ + unsigned int ret = 0; + Elf64_Sym *s; + int i; + + for (i = 0; i < num; i++) { + switch (ELF64_R_TYPE(rela[i].r_info)) { + case R_AARCH64_JUMP26: + case R_AARCH64_CALL26: + /* + * We only have to consider branch targets that resolve + * to undefined symbols. This is not simply a heuristic, + * it is a fundamental limitation, since the PLT itself + * is part of the module, and needs to be within 128 MB + * as well, so modules can never grow beyond that limit. + */ + s = syms + ELF64_R_SYM(rela[i].r_info); + if (s->st_shndx != SHN_UNDEF) + break; + + /* + * Jump relocations with non-zero addends against + * undefined symbols are supported by the ELF spec, but + * do not occur in practice (e.g., 'jump n bytes past + * the entry point of undefined function symbol f'). + * So we need to support them, but there is no need to + * take them into consideration when trying to optimize + * this code. So let's only check for duplicates when + * the addend is zero: this allows us to record the PLT + * entry address in the symbol table itself, rather than + * having to search the list for duplicates each time we + * emit one. + */ + if (rela[i].r_addend != 0 || !duplicate_rel(rela, i)) + ret++; + break; + } + } + return ret; +} + +int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + unsigned long plt_max_entries = 0; + Elf64_Sym *syms = NULL; + int i; + + /* + * Find the empty .plt section so we can expand it to store the PLT + * entries. Record the symtab address as well. + */ + for (i = 0; i < ehdr->e_shnum; i++) { + if (strcmp(".plt", secstrings + sechdrs[i].sh_name) == 0) + mod->arch.plt = sechdrs + i; + else if (sechdrs[i].sh_type == SHT_SYMTAB) + syms = (Elf64_Sym *)sechdrs[i].sh_addr; + } + + if (!mod->arch.plt) { + pr_err("%s: module PLT section missing\n", mod->name); + return -ENOEXEC; + } + if (!syms) { + pr_err("%s: module symtab section missing\n", mod->name); + return -ENOEXEC; + } + + for (i = 0; i < ehdr->e_shnum; i++) { + Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset; + int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); + Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info; + + if (sechdrs[i].sh_type != SHT_RELA) + continue; + + /* ignore relocations that operate on non-exec sections */ + if (!(dstsec->sh_flags & SHF_EXECINSTR)) + continue; + + /* sort by type, symbol index and addend */ + sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL); + + plt_max_entries += count_plts(syms, rels, numrels); + } + + mod->arch.plt->sh_type = SHT_NOBITS; + mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.plt->sh_addralign = L1_CACHE_BYTES; + mod->arch.plt->sh_size = plt_max_entries * sizeof(struct plt_entry); + mod->arch.plt_num_entries = 0; + mod->arch.plt_max_entries = plt_max_entries; + return 0; +} diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 93e970231ca9..a9dde97f5ca5 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -38,6 +38,21 @@ void *module_alloc(unsigned long size) GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); + if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && + !IS_ENABLED(CONFIG_KASAN)) + /* + * KASAN can only deal with module allocations being served + * from the reserved module region, since the remainder of + * the vmalloc region is already backed by zero 
shadow pages, + * and punching holes into it is non-trivial. Since the module + * region is not randomized when KASAN is enabled, it is even + * less likely that the module region gets exhausted, so we + * can simply omit this fallback in that case. + */ + p = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, + VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, + NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_module_alloc(p, size) < 0)) { vfree(p); return NULL; @@ -361,6 +376,13 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_AARCH64_CALL26: ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, AARCH64_INSN_IMM_26); + + if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && + ovf == -ERANGE) { + val = module_emit_plt_entry(me, &rel[i], sym); + ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, + 26, AARCH64_INSN_IMM_26); + } break; default: diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds new file mode 100644 index 000000000000..8949f6c6f729 --- /dev/null +++ b/arch/arm64/kernel/module.lds @@ -0,0 +1,3 @@ +SECTIONS { + .plt (NOLOAD) : { BYTE(0) } +} -- cgit v1.2.3 From 9bd7f88a1dd7b6c513b6f7cf39154e69d0cbf62e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 26 Dec 2015 13:48:02 +0100 Subject: arm64: avoid R_AARCH64_ABS64 relocations for Image header fields Unfortunately, the current way of using the linker to emit build time constants into the Image header will no longer work once we switch to the use of PIE executables. The reason is that such constants are emitted into the binary using R_AARCH64_ABS64 relocations, which are resolved at runtime, not at build time, and the places targeted by those relocations will contain zeroes before that. So refactor the endian swapping linker script constant generation code so that it emits the upper and lower 32-bit words separately. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 6ad1fe5d9077a1ab40bf74b61994d2e770b00b14) Signed-off-by: Alex Shi --- arch/arm64/include/asm/assembler.h | 11 +++++++++++ arch/arm64/kernel/head.S | 6 +++--- arch/arm64/kernel/image.h | 32 +++++++++++++++++++------------- 3 files changed, 33 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index bb7b72734c24..ba5aff6c830e 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -215,4 +215,15 @@ lr .req x30 // link register .size __pi_##x, . - x; \ ENDPROC(x) + /* + * Emit a 64-bit absolute little endian symbol reference in a way that + * ensures that it will be resolved at build time, even when building a + * PIE binary. This requires cooperation from the linker script, which + * must emit the lo32/hi32 halves individually. 
+ */ + .macro le64sym, sym + .long \sym\()_lo32 + .long \sym\()_hi32 + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 05b98289093e..f076debf392d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -83,9 +83,9 @@ efi_head: b stext // branch to kernel start, magic .long 0 // reserved #endif - .quad _kernel_offset_le // Image load offset from start of RAM, little-endian - .quad _kernel_size_le // Effective size of kernel image, little-endian - .quad _kernel_flags_le // Informative flags, little-endian + le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian + le64sym _kernel_size_le // Effective size of kernel image, little-endian + le64sym _kernel_flags_le // Informative flags, little-endian .quad 0 // reserved .quad 0 // reserved .quad 0 // reserved diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index c9c62cab25a4..db1bf57948f1 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -26,21 +26,27 @@ * There aren't any ELF relocations we can use to endian-swap values known only * at link time (e.g. the subtraction of two symbol addresses), so we must get * the linker to endian-swap certain values before emitting them. + * + * Note that, in order for this to work when building the ELF64 PIE executable + * (for KASLR), these values should not be referenced via R_AARCH64_ABS64 + * relocations, since these are fixed up at runtime rather than at build time + * when PIE is in effect. So we need to split them up in 32-bit high and low + * words. */ #ifdef CONFIG_CPU_BIG_ENDIAN -#define DATA_LE64(data) \ - ((((data) & 0x00000000000000ff) << 56) | \ - (((data) & 0x000000000000ff00) << 40) | \ - (((data) & 0x0000000000ff0000) << 24) | \ - (((data) & 0x00000000ff000000) << 8) | \ - (((data) & 0x000000ff00000000) >> 8) | \ - (((data) & 0x0000ff0000000000) >> 24) | \ - (((data) & 0x00ff000000000000) >> 40) | \ - (((data) & 0xff00000000000000) >> 56)) +#define DATA_LE32(data) \ + ((((data) & 0x000000ff) << 24) | \ + (((data) & 0x0000ff00) << 8) | \ + (((data) & 0x00ff0000) >> 8) | \ + (((data) & 0xff000000) >> 24)) #else -#define DATA_LE64(data) ((data) & 0xffffffffffffffff) +#define DATA_LE32(data) ((data) & 0xffffffff) #endif +#define DEFINE_IMAGE_LE64(sym, data) \ + sym##_lo32 = DATA_LE32((data) & 0xffffffff); \ + sym##_hi32 = DATA_LE32((data) >> 32) + #ifdef CONFIG_CPU_BIG_ENDIAN #define __HEAD_FLAG_BE 1 #else @@ -61,9 +67,9 @@ * endian swapped in head.S, all are done here for consistency. */ #define HEAD_SYMBOLS \ - _kernel_size_le = DATA_LE64(_end - _text); \ - _kernel_offset_le = DATA_LE64(TEXT_OFFSET); \ - _kernel_flags_le = DATA_LE64(__HEAD_FLAGS); + DEFINE_IMAGE_LE64(_kernel_size_le, _end - _text); \ + DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \ + DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS); #ifdef CONFIG_EFI -- cgit v1.2.3 From 63f9fbe469f17deee50491bc293bb3d2843f3e4a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 26 Dec 2015 12:46:40 +0100 Subject: arm64: avoid dynamic relocations in early boot code Before implementing KASLR for arm64 by building a self-relocating PIE executable, we have to ensure that values we use before the relocation routine is executed are not subject to dynamic relocation themselves. This applies not only to virtual addresses, but also to values that are supplied by the linker at build time and relocated using R_AARCH64_ABS64 relocations. 
So instead, use assemble time constants, or force the use of static relocations by folding the constants into the instructions. Reviewed-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 2bf31a4a05f5b00f37d65ba029d36a0230286cb7) Signed-off-by: Alex Shi --- arch/arm64/kernel/efi-entry.S | 2 +- arch/arm64/kernel/head.S | 39 ++++++++++++++++++++++++++------------- 2 files changed, 27 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S index a773db92908b..f82036e02485 100644 --- a/arch/arm64/kernel/efi-entry.S +++ b/arch/arm64/kernel/efi-entry.S @@ -61,7 +61,7 @@ ENTRY(entry) */ mov x20, x0 // DTB address ldr x0, [sp, #16] // relocated _text address - ldr x21, =stext_offset + movz x21, #:abs_g0:stext_offset add x21, x0, x21 /* diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index f076debf392d..4cad8f9f2268 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -67,12 +67,11 @@ * in the entry routines. */ __HEAD - +_head: /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ #ifdef CONFIG_EFI -efi_head: /* * This add instruction has no meaningful effect except that * its opcode forms the magic "MZ" signature required by UEFI. @@ -94,14 +93,14 @@ efi_head: .byte 0x4d .byte 0x64 #ifdef CONFIG_EFI - .long pe_header - efi_head // Offset to the PE header. + .long pe_header - _head // Offset to the PE header. #else .word 0 // reserved #endif #ifdef CONFIG_EFI .globl __efistub_stext_offset - .set __efistub_stext_offset, stext - efi_head + .set __efistub_stext_offset, stext - _head .align 3 pe_header: .ascii "PE" @@ -124,7 +123,7 @@ optional_header: .long _end - stext // SizeOfCode .long 0 // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_entry - efi_head // AddressOfEntryPoint + .long __efistub_entry - _head // AddressOfEntryPoint .long __efistub_stext_offset // BaseOfCode extra_header_fields: @@ -139,7 +138,7 @@ extra_header_fields: .short 0 // MinorSubsystemVersion .long 0 // Win32VersionValue - .long _end - efi_head // SizeOfImage + .long _end - _head // SizeOfImage // Everything before the kernel image is considered part of the header .long __efistub_stext_offset // SizeOfHeaders @@ -219,11 +218,13 @@ ENTRY(stext) * On return, the CPU will be ready for the MMU to be turned on and * the TCR will have been set. */ - ldr x27, =__mmap_switched // address to jump to after + ldr x27, 0f // address to jump to after // MMU has been enabled adr_l lr, __enable_mmu // return (PIC) address b __cpu_setup // initialise processor ENDPROC(stext) + .align 3 +0: .quad __mmap_switched - (_head - TEXT_OFFSET) + KIMAGE_VADDR /* * Preserve the arguments passed by the bootloader in x0 .. 
x3 @@ -391,7 +392,8 @@ __create_page_tables: mov x0, x26 // swapper_pg_dir ldr x5, =KIMAGE_VADDR create_pgd_entry x0, x5, x3, x6 - ldr x6, =KERNEL_END // __va(KERNEL_END) + ldr w6, kernel_img_size + add x6, x6, x5 mov x3, x24 // phys offset create_block_map x0, x7, x3, x5, x6 @@ -408,6 +410,9 @@ __create_page_tables: mov lr, x27 ret ENDPROC(__create_page_tables) + +kernel_img_size: + .long _end - (_head - TEXT_OFFSET) .ltorg /* @@ -415,6 +420,10 @@ ENDPROC(__create_page_tables) */ .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: + adr_l x8, vectors // load VBAR_EL1 with virtual + msr vbar_el1, x8 // vector table address + isb + // Clear BSS adr_l x0, __bss_start mov x1, xzr @@ -610,13 +619,19 @@ ENTRY(secondary_startup) adrp x26, swapper_pg_dir bl __cpu_setup // initialise processor - ldr x21, =secondary_data - ldr x27, =__secondary_switched // address to jump to after enabling the MMU + ldr x8, =KIMAGE_VADDR + ldr w9, 0f + sub x27, x8, w9, sxtw // address to jump to after enabling the MMU b __enable_mmu ENDPROC(secondary_startup) +0: .long (_text - TEXT_OFFSET) - __secondary_switched ENTRY(__secondary_switched) - ldr x0, [x21] // get secondary_data.stack + adr_l x5, vectors + msr vbar_el1, x5 + isb + + ldr_l x0, secondary_data // get secondary_data.stack mov sp, x0 and x0, x0, #~(THREAD_SIZE - 1) msr sp_el0, x0 // save thread_info @@ -641,8 +656,6 @@ __enable_mmu: ubfx x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4 cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED b.ne __no_granule_support - ldr x5, =vectors - msr vbar_el1, x5 msr ttbr0_el1, x25 // load TTBR0 msr ttbr1_el1, x26 // load TTBR1 isb -- cgit v1.2.3 From 632fd2f00cb569f65adc7d2c67282b70a6af8634 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 17:08:26 +0100 Subject: arm64: make asm/elf.h available to asm files This reshuffles some code in asm/elf.h and puts a #ifndef __ASSEMBLY__ around its C definitions so that the CPP defines can be used in asm source files as well. Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 4a2e034e5cdadde4c712f79bdd57d1455c76a3db) Signed-off-by: Alex Shi --- arch/arm64/include/asm/elf.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index faad6df49e5b..435f55952e1f 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -24,15 +24,6 @@ #include #include -typedef unsigned long elf_greg_t; - -#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t)) -#define ELF_CORE_COPY_REGS(dest, regs) \ - *(struct user_pt_regs *)&(dest) = (regs)->user_regs; - -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; -typedef struct user_fpsimd_state elf_fpregset_t; - /* * AArch64 static relocation types. */ @@ -127,6 +118,17 @@ typedef struct user_fpsimd_state elf_fpregset_t; */ #define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) +#ifndef __ASSEMBLY__ + +typedef unsigned long elf_greg_t; + +#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t)) +#define ELF_CORE_COPY_REGS(dest, regs) \ + *(struct user_pt_regs *)&(dest) = (regs)->user_regs; + +typedef elf_greg_t elf_gregset_t[ELF_NGREG]; +typedef struct user_fpsimd_state elf_fpregset_t; + /* * When the program starts, a1 contains a pointer to a function to be * registered with atexit, as per the SVR4 ABI. 
A value of 0 means we have no @@ -186,4 +188,6 @@ extern int aarch32_setup_vectors_page(struct linux_binprm *bprm, #endif /* CONFIG_COMPAT */ +#endif /* !__ASSEMBLY__ */ + #endif -- cgit v1.2.3 From 6ef77fd5962d90de7957ef9eafddd659c7375a4f Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 2 Feb 2016 15:53:59 +0000 Subject: arm64: futex.h: Add missing PAN toggling futex.h's futex_atomic_cmpxchg_inatomic() does not use the __futex_atomic_op() macro and needs its own PAN toggling. This was missed when the feature was implemented. Fixes: 338d4f49d6f ("arm64: kernel: Add support for Privileged Access Never") Signed-off-by: James Morse Signed-off-by: Will Deacon (cherry picked from commit 811d61e384e24759372bb3f01772f3744b0a8327) Signed-off-by: Alex Shi --- arch/arm64/include/asm/futex.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 007a69fc4f40..5f3ab8c1db55 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -121,6 +121,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, return -EFAULT; asm volatile("// futex_atomic_cmpxchg_inatomic\n" +ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -137,6 +138,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, " .align 3\n" " .quad 1b, 4b, 2b, 4b\n" " .popsection\n" +ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); -- cgit v1.2.3 From 525787eea48f8c6a1630e6dab07313896cfc6b8d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 1 Jan 2016 15:02:12 +0100 Subject: arm64: switch to relative exception tables Instead of using absolute addresses for both the exception location and the fixup, use offsets relative to the exception table entry values. Not only does this cut the size of the exception table in half, it is also a prerequisite for KASLR, since absolute exception table entries are subject to dynamic relocation, which is incompatible with the sorting of the exception table that occurs at build time. This patch also introduces the _ASM_EXTABLE preprocessor macro (which exists on x86 as well) and its _asm_extable assembly counterpart, as shorthands to emit exception table entries. 
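As an aside (not part of the patch), the effect of the relative encoding can be sketched in a few lines of C: each 32-bit field stores the signed distance from its own location to the target, so entries contain no absolute addresses, need no dynamic relocations, and can still be sorted at build time. The struct layout below matches the patch; the helper name is illustrative.

struct exception_table_entry {
	int insn, fixup;	/* offsets relative to the fields themselves */
};

/* Recover the absolute fixup address, as the updated fixup_exception() does. */
static inline unsigned long ex_fixup_addr(const struct exception_table_entry *e)
{
	return (unsigned long)&e->fixup + e->fixup;
}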
Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 6c94f27ac847ff8ef15b3da5b200574923bd6287) Signed-off-by: Alex Shi --- arch/arm64/include/asm/alternative.h | 19 +++++-------------- arch/arm64/include/asm/assembler.h | 15 +++++++++++---- arch/arm64/include/asm/futex.h | 12 ++++-------- arch/arm64/include/asm/uaccess.h | 30 ++++++++++++++++-------------- arch/arm64/include/asm/word-at-a-time.h | 7 +++---- arch/arm64/kernel/armv8_deprecated.c | 7 ++----- arch/arm64/mm/extable.c | 2 +- 7 files changed, 42 insertions(+), 50 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index a9fc24ec1aa9..beccbdefa106 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -157,11 +157,8 @@ void apply_alternatives(void *start, size_t length); add \addr, \addr, \post_inc; alternative_endif - .section __ex_table,"a"; - .align 3; - .quad 8888b,\l; - .quad 8889b,\l; - .previous; + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; .endm .macro uao_stp l, reg1, reg2, addr, post_inc @@ -175,11 +172,8 @@ void apply_alternatives(void *start, size_t length); add \addr, \addr, \post_inc; alternative_endif - .section __ex_table,"a"; - .align 3; - .quad 8888b,\l; - .quad 8889b,\l; - .previous + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; .endm .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc @@ -191,10 +185,7 @@ void apply_alternatives(void *start, size_t length); add \addr, \addr, \post_inc; alternative_endif - .section __ex_table,"a"; - .align 3; - .quad 8888b,\l; - .previous + _asm_extable 8888b,\l; .endm #else .macro uao_ldp l, reg1, reg2, addr, post_inc diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index ba5aff6c830e..70f7b9e04598 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -94,12 +94,19 @@ dmb \opt .endm +/* + * Emit an entry into the exception table + */ + .macro _asm_extable, from, to + .pushsection __ex_table, "a" + .align 3 + .long (\from - .), (\to - .) + .popsection + .endm + #define USER(l, x...) \ 9999: x; \ - .section __ex_table,"a"; \ - .align 3; \ - .quad 9999b,l; \ - .previous + _asm_extable 9999b, l /* * Register aliases. 
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 5f3ab8c1db55..f2585cdd32c2 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -42,10 +42,8 @@ "4: mov %w0, %w5\n" \ " b 3b\n" \ " .popsection\n" \ -" .pushsection __ex_table,\"a\"\n" \ -" .align 3\n" \ -" .quad 1b, 4b, 2b, 4b\n" \ -" .popsection\n" \ + _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE(2b, 4b) \ ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ @@ -134,10 +132,8 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) "4: mov %w0, %w6\n" " b 3b\n" " .popsection\n" -" .pushsection __ex_table,\"a\"\n" -" .align 3\n" -" .quad 1b, 4b, 2b, 4b\n" -" .popsection\n" + _ASM_EXTABLE(1b, 4b) + _ASM_EXTABLE(2b, 4b) ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 16ba0d5c9740..0685d74572af 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -36,11 +36,11 @@ #define VERIFY_WRITE 1 /* - * The exception table consists of pairs of addresses: the first is the - * address of an instruction that is allowed to fault, and the second is - * the address at which the program should continue. No registers are - * modified, so it is entirely up to the continuation code to figure out - * what to do. + * The exception table consists of pairs of relative offsets: the first + * is the relative offset to an instruction that is allowed to fault, + * and the second is the relative offset at which the program should + * continue. No registers are modified, so it is entirely up to the + * continuation code to figure out what to do. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. 
This means when everything is well, @@ -50,9 +50,11 @@ struct exception_table_entry { - unsigned long insn, fixup; + int insn, fixup; }; +#define ARCH_HAS_RELATIVE_EXTABLE + extern int fixup_exception(struct pt_regs *regs); #define KERNEL_DS (-1UL) @@ -115,6 +117,12 @@ static inline void set_fs(mm_segment_t fs) #define access_ok(type, addr, size) __range_ok(addr, size) #define user_addr_max get_fs +#define _ASM_EXTABLE(from, to) \ + " .pushsection __ex_table, \"a\"\n" \ + " .align 3\n" \ + " .long (" #from " - .), (" #to " - .)\n" \ + " .popsection\n" + /* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" @@ -134,10 +142,7 @@ static inline void set_fs(mm_segment_t fs) " mov %1, #0\n" \ " b 2b\n" \ " .previous\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .quad 1b, 3b\n" \ - " .previous" \ + _ASM_EXTABLE(1b, 3b) \ : "+r" (err), "=&r" (x) \ : "r" (addr), "i" (-EFAULT)) @@ -206,10 +211,7 @@ do { \ "3: mov %w0, %3\n" \ " b 2b\n" \ " .previous\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .quad 1b, 3b\n" \ - " .previous" \ + _ASM_EXTABLE(1b, 3b) \ : "+r" (err) \ : "r" (x), "r" (addr), "i" (-EFAULT)) diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h index aab5bf09e9d9..2b79b8a89457 100644 --- a/arch/arm64/include/asm/word-at-a-time.h +++ b/arch/arm64/include/asm/word-at-a-time.h @@ -16,6 +16,8 @@ #ifndef __ASM_WORD_AT_A_TIME_H #define __ASM_WORD_AT_A_TIME_H +#include + #ifndef __AARCH64EB__ #include @@ -81,10 +83,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr) #endif " b 2b\n" " .popsection\n" - " .pushsection __ex_table,\"a\"\n" - " .align 3\n" - " .quad 1b, 3b\n" - " .popsection" + _ASM_EXTABLE(1b, 3b) : "=&r" (ret), "=&r" (offset) : "r" (addr), "Q" (*(unsigned long *)addr)); diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 3e01207917b1..c37202c0c838 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -297,11 +297,8 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) "4: mov %w0, %w5\n" \ " b 3b\n" \ " .popsection" \ - " .pushsection __ex_table,\"a\"\n" \ - " .align 3\n" \ - " .quad 0b, 4b\n" \ - " .quad 1b, 4b\n" \ - " .popsection\n" \ + _ASM_EXTABLE(0b, 4b) \ + _ASM_EXTABLE(1b, 4b) \ ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp) \ diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index 79444279ba8c..81acd4706878 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs) fixup = search_exception_tables(instruction_pointer(regs)); if (fixup) - regs->pc = fixup->fixup; + regs->pc = (unsigned long)&fixup->fixup + fixup->fixup; return fixup != NULL; } -- cgit v1.2.3 From 89328d41aa99df071dadb43c722cd88ffafc77e2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 26 Jan 2016 09:13:44 +0100 Subject: arm64: add support for building vmlinux as a relocatable PIE binary This implements CONFIG_RELOCATABLE, which links the final vmlinux image with a dynamic relocation section, allowing the early boot code to perform a relocation to a different virtual address at runtime. This is a prerequisite for KASLR (CONFIG_RANDOMIZE_BASE). 
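For illustration only, the relocation pass that the head.S hunk below performs in assembly looks roughly like this in C. This is a sketch assuming glibc-style <elf.h> definitions; only the R_AARCH64_RELATIVE case is shown, and the real code necessarily runs in assembly before any C environment exists.

#include <elf.h>
#include <stdint.h>

/*
 * "delta" is the difference between the runtime and link-time virtual
 * base of the image. A RELATIVE entry's addend is a link-time virtual
 * address, so the fixup is a simple addition; no symbol lookup needed.
 */
static void apply_relative_relocs(Elf64_Rela *rela, Elf64_Rela *end,
				  uint64_t delta)
{
	for (; rela < end; rela++) {
		if (ELF64_R_TYPE(rela->r_info) != R_AARCH64_RELATIVE)
			continue;
		/* r_offset is a link-time address too; rebase it before storing. */
		*(uint64_t *)(rela->r_offset + delta) = rela->r_addend + delta;
	}
}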
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 1e48ef7fcc374051730381a2a05da77eb4eafdb0) Signed-off-by: Alex Shi --- arch/arm64/Kconfig | 11 +++++++++++ arch/arm64/Makefile | 4 ++++ arch/arm64/include/asm/elf.h | 2 ++ arch/arm64/kernel/head.S | 32 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/vmlinux.lds.S | 16 ++++++++++++++++ 5 files changed, 65 insertions(+) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 22db20491733..ac1475f559e6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -737,6 +737,17 @@ config ARM64_MODULE_PLTS select ARM64_MODULE_CMODEL_LARGE select HAVE_MOD_ARCH_SPECIFIC +config RELOCATABLE + bool + help + This builds the kernel as a Position Independent Executable (PIE), + which retains all relocation metadata required to relocate the + kernel binary at runtime to a different virtual address than the + address it was linked at. + Since AArch64 uses the RELA relocation format, this requires a + relocation pass at runtime even if the kernel is loaded at the + same address it was linked at. + endmenu menu "Boot options" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 71054a38decf..304dcc3da06f 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -15,6 +15,10 @@ CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET) OBJCOPYFLAGS :=-O binary -R .note -R .note.gnu.build-id -R .comment -S GZFLAGS :=-9 +ifneq ($(CONFIG_RELOCATABLE),) +LDFLAGS_vmlinux += -pie +endif + KBUILD_DEFCONFIG := defconfig # Check for binutils support for specific extensions diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 435f55952e1f..24ed037f09fd 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -77,6 +77,8 @@ #define R_AARCH64_MOVW_PREL_G2_NC 292 #define R_AARCH64_MOVW_PREL_G3 293 +#define R_AARCH64_RELATIVE 1027 + /* * These are used to set parameters in the core dumps. */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 4cad8f9f2268..4e69412a7323 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -432,6 +433,37 @@ __mmap_switched: bl __pi_memset dsb ishst // Make zero page visible to PTW +#ifdef CONFIG_RELOCATABLE + + /* + * Iterate over each entry in the relocation table, and apply the + * relocations in place. + */ + adr_l x8, __dynsym_start // start of symbol table + adr_l x9, __reloc_start // start of reloc table + adr_l x10, __reloc_end // end of reloc table + +0: cmp x9, x10 + b.hs 2f + ldp x11, x12, [x9], #24 + ldr x13, [x9, #-8] + cmp w12, #R_AARCH64_RELATIVE + b.ne 1f + str x13, [x11] + b 0b + +1: cmp w12, #R_AARCH64_ABS64 + b.ne 0b + add x12, x12, x12, lsl #1 // symtab offset: 24x top word + add x12, x8, x12, lsr #(32 - 3) // ... shifted into bottom word + ldr x15, [x12, #8] // Elf64_Sym::st_value + add x15, x13, x15 + str x15, [x11] + b 0b + +2: +#endif + adr_l sp, initial_sp, x4 mov x4, sp and x4, x4, #~(THREAD_SIZE - 1) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 282e3e64a17e..e3f6cd740ea3 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -87,6 +87,7 @@ SECTIONS EXIT_CALL *(.discard) *(.discard.*) + *(.interp .dynamic) } . 
= KIMAGE_VADDR + TEXT_OFFSET; @@ -149,6 +150,21 @@ SECTIONS .altinstr_replacement : { *(.altinstr_replacement) } + .rela : ALIGN(8) { + __reloc_start = .; + *(.rela .rela*) + __reloc_end = .; + } + .dynsym : ALIGN(8) { + __dynsym_start = .; + *(.dynsym) + } + .dynstr : { + *(.dynstr) + } + .hash : { + *(.hash) + } . = ALIGN(PAGE_SIZE); __init_end = .; -- cgit v1.2.3 From d0a12e9199c75cad71361f746ac40e4612945a43 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 26 Jan 2016 14:12:01 +0100 Subject: arm64: add support for kernel ASLR This adds support for KASLR, implemented based on entropy provided by the bootloader in the /chosen/kaslr-seed DT property. Depending on the size of the address space (VA_BITS) and the page size, the entropy in the virtual displacement is up to 13 bits (16k/2 levels) and up to 25 bits (all 4 levels), with the sidenote that displacements that result in the kernel image straddling a 1GB/32MB/512MB alignment boundary (for 4KB/16KB/64KB granule kernels, respectively) are not allowed, and will be rounded up to an acceptable value. If CONFIG_RANDOMIZE_MODULE_REGION_FULL is enabled, the module region is randomized independently from the core kernel. This makes it less likely that the location of core kernel data structures can be determined by an adversary, but causes all function calls from modules into the core kernel to be resolved via entries in the module PLTs. If CONFIG_RANDOMIZE_MODULE_REGION_FULL is not enabled, the module region is randomized by choosing a page-aligned 128 MB region inside the interval [_etext - 128 MB, _stext + 128 MB). This gives between 10 and 14 bits of entropy (depending on page size), independently of the kernel randomization, but still guarantees that modules are within the range of relative branch and jump instructions (with the caveat that, since the module region is shared with other uses of the vmalloc area, modules may need to be loaded further away if the module region is exhausted). Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit f80fb3a3d50843a401dac4b566b3b131da8077a2) Signed-off-by: Alex Shi --- arch/arm64/Kconfig | 29 +++++++ arch/arm64/include/asm/memory.h | 5 +- arch/arm64/include/asm/module.h | 6 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/head.S | 59 ++++++++++++-- arch/arm64/kernel/kaslr.c | 173 ++++++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/module.c | 3 +- arch/arm64/kernel/setup.c | 29 +++++++ arch/arm64/mm/kasan_init.c | 17 +++- arch/arm64/mm/mmu.c | 29 ++++--- 10 files changed, 329 insertions(+), 22 deletions(-) create mode 100644 arch/arm64/kernel/kaslr.c (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ac1475f559e6..b311ac23c989 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -748,6 +748,35 @@ config RELOCATABLE relocation pass at runtime even if the kernel is loaded at the same address it was linked at. +config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + select ARM64_MODULE_PLTS + select RELOCATABLE + help + Randomizes the virtual address at which the kernel image is + loaded, as a security feature that deters exploit attempts + relying on knowledge of the location of kernel internals. + + It is the bootloader's job to provide entropy, by passing a + random u64 value in /chosen/kaslr-seed at kernel entry. + + If unsure, say N. 
+ +config RANDOMIZE_MODULE_REGION_FULL + bool "Randomize the module region independently from the core kernel" + depends on RANDOMIZE_BASE + default y + help + Randomizes the location of the module region without considering the + location of the core kernel. This way, it is impossible for modules + to leak information about the location of core kernel data structures + but it does imply that function calls between modules and the core + kernel will need to be resolved via veneers in the module PLT. + + When this option is not set, the module region will be randomized over + a limited range that contains the [_stext, _etext] interval of the + core kernel, so branch relocations are always in range. + endmenu menu "Boot options" diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index eb798156cf56..5f8667a99e41 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -53,7 +53,7 @@ #define KIMAGE_VADDR (MODULES_END) #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) #define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE) -#define MODULES_VSIZE (SZ_64M) +#define MODULES_VSIZE (SZ_128M) #define PCI_IO_END (PAGE_OFFSET - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) @@ -139,6 +139,9 @@ extern phys_addr_t memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ #define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) +/* the virtual base of the kernel image (minus TEXT_OFFSET) */ +extern u64 kimage_vaddr; + /* the offset between the kernel virtual and physical mappings */ extern u64 kimage_voffset; diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index 8652fb613304..e12af6754634 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -31,4 +31,10 @@ struct mod_arch_specific { u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela, Elf64_Sym *sym); +#ifdef CONFIG_RANDOMIZE_BASE +extern u64 module_alloc_base; +#else +#define module_alloc_base ((u64)_etext - MODULES_VSIZE) +#endif + #endif /* __ASM_MODULE_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 8d971f9c6ed5..49a2430b0786 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -43,6 +43,7 @@ arm64-obj-$(CONFIG_PCI) += pci.o arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o arm64-obj-$(CONFIG_ACPI) += acpi.o arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o +arm64-obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 4e69412a7323..319f896c6e74 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -210,6 +210,7 @@ section_table: ENTRY(stext) bl preserve_boot_args bl el2_setup // Drop to EL1, w20=cpu_boot_mode + mov x23, xzr // KASLR offset, defaults to 0 adrp x24, __PHYS_OFFSET bl set_cpu_boot_mode_flag bl __create_page_tables // x25=TTBR0, x26=TTBR1 @@ -313,7 +314,7 @@ ENDPROC(preserve_boot_args) __create_page_tables: adrp x25, idmap_pg_dir adrp x26, swapper_pg_dir - mov x27, lr + mov x28, lr /* * Invalidate the idmap and swapper page tables to avoid potential @@ -392,6 +393,7 @@ __create_page_tables: */ mov x0, x26 // swapper_pg_dir ldr x5, =KIMAGE_VADDR + add x5, x5, x23 // add KASLR displacement create_pgd_entry x0, x5, x3, x6 ldr w6, kernel_img_size add x6, x6, x5 @@ -408,8 +410,7 @@ __create_page_tables: dmb sy 
bl __inval_cache_range - mov lr, x27 - ret + ret x28 ENDPROC(__create_page_tables) kernel_img_size: @@ -421,6 +422,7 @@ kernel_img_size: */ .set initial_sp, init_thread_union + THREAD_START_SP __mmap_switched: + mov x28, lr // preserve LR adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address isb @@ -449,19 +451,26 @@ __mmap_switched: ldr x13, [x9, #-8] cmp w12, #R_AARCH64_RELATIVE b.ne 1f - str x13, [x11] + add x13, x13, x23 // relocate + str x13, [x11, x23] b 0b 1: cmp w12, #R_AARCH64_ABS64 b.ne 0b add x12, x12, x12, lsl #1 // symtab offset: 24x top word add x12, x8, x12, lsr #(32 - 3) // ... shifted into bottom word + ldrsh w14, [x12, #6] // Elf64_Sym::st_shndx ldr x15, [x12, #8] // Elf64_Sym::st_value + cmp w14, #-0xf // SHN_ABS (0xfff1) ? + add x14, x15, x23 // relocate + csel x15, x14, x15, ne add x15, x13, x15 - str x15, [x11] + str x15, [x11, x23] b 0b -2: +2: adr_l x8, kimage_vaddr // make relocated kimage_vaddr + dc cvac, x8 // value visible to secondaries + dsb sy // with MMU off #endif adr_l sp, initial_sp, x4 @@ -470,13 +479,23 @@ __mmap_switched: msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer - ldr x4, =KIMAGE_VADDR // Save the offset between + ldr_l x4, kimage_vaddr // Save the offset between sub x4, x4, x24 // the kernel virtual and str_l x4, kimage_voffset, x5 // physical mappings mov x29, #0 #ifdef CONFIG_KASAN bl kasan_early_init +#endif +#ifdef CONFIG_RANDOMIZE_BASE + cbnz x23, 0f // already running randomized? + mov x0, x21 // pass FDT address in x0 + bl kaslr_early_init // parse FDT for KASLR options + cbz x0, 0f // KASLR disabled? just proceed + mov x23, x0 // record KASLR offset + ret x28 // we must enable KASLR, return + // to __enable_mmu() +0: #endif b start_kernel ENDPROC(__mmap_switched) @@ -486,6 +505,10 @@ ENDPROC(__mmap_switched) * hotplug and needs to have the same protections as the text region */ .section ".text","ax" + +ENTRY(kimage_vaddr) + .quad _text - TEXT_OFFSET + /* * If we're fortunate enough to boot at EL2, ensure that the world is * sane before dropping to EL1. @@ -651,7 +674,7 @@ ENTRY(secondary_startup) adrp x26, swapper_pg_dir bl __cpu_setup // initialise processor - ldr x8, =KIMAGE_VADDR + ldr x8, kimage_vaddr ldr w9, 0f sub x27, x8, w9, sxtw // address to jump to after enabling the MMU b __enable_mmu @@ -684,6 +707,7 @@ ENDPROC(__secondary_switched) */ .section ".idmap.text", "ax" __enable_mmu: + mrs x18, sctlr_el1 // preserve old SCTLR_EL1 value mrs x1, ID_AA64MMFR0_EL1 ubfx x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4 cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED @@ -701,6 +725,25 @@ __enable_mmu: ic iallu dsb nsh isb +#ifdef CONFIG_RANDOMIZE_BASE + mov x19, x0 // preserve new SCTLR_EL1 value + blr x27 + + /* + * If we return here, we have a KASLR displacement in x23 which we need + * to take into account by discarding the current kernel mapping and + * creating a new one. 
+ */ + msr sctlr_el1, x18 // disable the MMU + isb + bl __create_page_tables // recreate kernel mapping + + msr sctlr_el1, x19 // re-enable the MMU + isb + ic ialluis // flush instructions fetched + isb // via old mapping + add x27, x27, x23 // relocated __mmap_switched +#endif br x27 ENDPROC(__enable_mmu) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c new file mode 100644 index 000000000000..8b32a1f8f09f --- /dev/null +++ b/arch/arm64/kernel/kaslr.c @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2016 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +u64 __read_mostly module_alloc_base; + +static __init u64 get_kaslr_seed(void *fdt) +{ + int node, len; + u64 *prop; + u64 ret; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return 0; + + prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} + +static __init const u8 *get_cmdline(void *fdt) +{ + static __initconst const u8 default_cmdline[] = CONFIG_CMDLINE; + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { + int node; + const u8 *prop; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + goto out; + + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + goto out; + return prop; + } +out: + return default_cmdline; +} + +extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, + pgprot_t prot); + +/* + * This routine will be executed with the kernel mapped at its default virtual + * address, and if it returns successfully, the kernel will be remapped, and + * start_kernel() will be executed from a randomized virtual offset. The + * relocation will result in all absolute references (e.g., static variables + * containing function pointers) to be reinitialized, and zero-initialized + * .bss variables will be reset to 0. + */ +u64 __init kaslr_early_init(u64 dt_phys) +{ + void *fdt; + u64 seed, offset, mask, module_range; + const u8 *cmdline, *str; + int size; + + /* + * Set a reasonable default for module_alloc_base in case + * we end up running with module randomization disabled. + */ + module_alloc_base = (u64)_etext - MODULES_VSIZE; + + /* + * Try to map the FDT early. If this fails, we simply bail, + * and proceed with KASLR disabled. We will make another + * attempt at mapping the FDT in setup_machine() + */ + early_fixmap_init(); + fdt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); + if (!fdt) + return 0; + + /* + * Retrieve (and wipe) the seed from the FDT + */ + seed = get_kaslr_seed(fdt); + if (!seed) + return 0; + + /* + * Check if 'nokaslr' appears on the command line, and + * return 0 if that is the case. + */ + cmdline = get_cmdline(fdt); + str = strstr(cmdline, "nokaslr"); + if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) + return 0; + + /* + * OK, so we are proceeding with KASLR enabled. Calculate a suitable + * kernel image offset from the seed. Let's place the kernel in the + * lower half of the VMALLOC area (VA_BITS - 2). 
+ * Even if we could randomize at page granularity for 16k and 64k pages, + * let's always round to 2 MB so we don't interfere with the ability to + * map using contiguous PTEs + */ + mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1); + offset = seed & mask; + + /* + * The kernel Image should not extend across a 1GB/32MB/512MB alignment + * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this + * happens, increase the KASLR offset by the size of the kernel image. + */ + if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) != + (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) + offset = (offset + (u64)(_end - _text)) & mask; + + if (IS_ENABLED(CONFIG_KASAN)) + /* + * KASAN does not expect the module region to intersect the + * vmalloc region, since shadow memory is allocated for each + * module at load time, whereas the vmalloc region is shadowed + * by KASAN zero pages. So keep modules out of the vmalloc + * region if KASAN is enabled. + */ + return offset; + + if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { + /* + * Randomize the module region independently from the core + * kernel. This prevents modules from leaking any information + * about the address of the kernel itself, but results in + * branches between modules and the core kernel that are + * resolved via PLTs. (Branches between modules will be + * resolved normally.) + */ + module_range = VMALLOC_END - VMALLOC_START - MODULES_VSIZE; + module_alloc_base = VMALLOC_START; + } else { + /* + * Randomize the module region by setting module_alloc_base to + * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE, + * _stext) . This guarantees that the resulting region still + * covers [_stext, _etext], and that all relative branches can + * be resolved without veneers. + */ + module_range = MODULES_VSIZE - (u64)(_etext - _stext); + module_alloc_base = (u64)_etext + offset - MODULES_VSIZE; + } + + /* use the lower 21 bits to randomize the base of the module region */ + module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21; + module_alloc_base &= PAGE_MASK; + + return offset; +} diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index a9dde97f5ca5..7f316982ce00 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -34,7 +34,8 @@ void *module_alloc(unsigned long size) { void *p; - p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, + p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, + module_alloc_base + MODULES_VSIZE, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index cfed56f0ad26..42371f69def3 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -388,3 +388,32 @@ static int __init topology_init(void) return 0; } subsys_initcall(topology_init); + +/* + * Dump out kernel offset information on panic. 
+ */ +static int dump_kernel_offset(struct notifier_block *self, unsigned long v, + void *p) +{ + u64 const kaslr_offset = kimage_vaddr - KIMAGE_VADDR; + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset > 0) { + pr_emerg("Kernel Offset: 0x%llx from 0x%lx\n", + kaslr_offset, KIMAGE_VADDR); + } else { + pr_emerg("Kernel Offset: disabled\n"); + } + return 0; +} + +static struct notifier_block kernel_offset_notifier = { + .notifier_call = dump_kernel_offset +}; + +static int __init register_kernel_offset_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &kernel_offset_notifier); + return 0; +} +__initcall(register_kernel_offset_dumper); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 7f10cc91fa8a..56e19d150c21 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -129,12 +129,16 @@ static void __init clear_pgds(unsigned long start, void __init kasan_init(void) { u64 kimg_shadow_start, kimg_shadow_end; + u64 mod_shadow_start, mod_shadow_end; struct memblock_region *reg; int i; kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); + mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR); + mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END); + /* * We are going to perform proper setup of shadow memory. * At first we should unmap early shadow (clear_pgds() call bellow). @@ -158,13 +162,20 @@ void __init kasan_init(void) * with PMD table mappings at the edges of the shadow region for the * kernel image. */ - if (ARM64_SWAPPER_USES_SECTION_MAPS) + if (ARM64_SWAPPER_USES_SECTION_MAPS) { + kimg_shadow_start = round_down(kimg_shadow_start, + SWAPPER_BLOCK_SIZE); kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); + } kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, - kasan_mem_to_shadow((void *)MODULES_VADDR)); + (void *)mod_shadow_start); kasan_populate_zero_shadow((void *)kimg_shadow_end, - kasan_mem_to_shadow((void *)PAGE_OFFSET)); + kasan_mem_to_shadow((void *)PAGE_OFFSET)); + + if (kimg_shadow_start > mod_shadow_end) + kasan_populate_zero_shadow((void *)mod_shadow_end, + (void *)kimg_shadow_start); for_each_memblock(memory, reg) { void *start = (void *)__phys_to_virt(reg->base); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index fb5c872fe3d6..ff0f5a46b552 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -676,7 +676,8 @@ void __init early_fixmap_init(void) unsigned long addr = FIXADDR_START; pgd = pgd_offset_k(addr); - if (CONFIG_PGTABLE_LEVELS > 3 && !pgd_none(*pgd)) { + if (CONFIG_PGTABLE_LEVELS > 3 && + !(pgd_none(*pgd) || pgd_page_paddr(*pgd) == __pa(bm_pud))) { /* * We only end up here if the kernel mapping and the fixmap * share the top level pgd entry, which should only happen on @@ -733,11 +734,10 @@ void __set_fixmap(enum fixed_addresses idx, } } -void *__init fixmap_remap_fdt(phys_addr_t dt_phys) +void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) { const u64 dt_virt_base = __fix_to_virt(FIX_FDT); - pgprot_t prot = PAGE_KERNEL_RO; - int size, offset; + int offset; void *dt_virt; /* @@ -776,16 +776,27 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) if (fdt_check_header(dt_virt) != 0) return NULL; - size = fdt_totalsize(dt_virt); - if (size > MAX_FDT_SIZE) + *size = fdt_totalsize(dt_virt); + if (*size > MAX_FDT_SIZE) return NULL; - if (offset + size > SWAPPER_BLOCK_SIZE) + if (offset + *size > SWAPPER_BLOCK_SIZE) create_mapping_noalloc(round_down(dt_phys, 
SWAPPER_BLOCK_SIZE), dt_virt_base, - round_up(offset + size, SWAPPER_BLOCK_SIZE), prot); + round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot); - memblock_reserve(dt_phys, size); + return dt_virt; +} +void *__init fixmap_remap_fdt(phys_addr_t dt_phys) +{ + void *dt_virt; + int size; + + dt_virt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO); + if (!dt_virt) + return NULL; + + memblock_reserve(dt_phys, size); return dt_virt; } -- cgit v1.2.3 From 98e23ea3a3dd23269a69282291f9bef53e262bc2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 29 Jan 2016 11:59:03 +0100 Subject: arm64: kaslr: randomize the linear region When KASLR is enabled (CONFIG_RANDOMIZE_BASE=y), and entropy has been provided by the bootloader, randomize the placement of RAM inside the linear region if sufficient space is available. For instance, on a 4KB granule/3 levels kernel, the linear region is 256 GB in size, and we can choose any 1 GB aligned offset that is far enough from the top of the address space to fit the distance between the start of the lowest memblock and the top of the highest memblock. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit c031a4213c11a5db475f528c182f7b3858df11db) Signed-off-by: Alex Shi --- arch/arm64/kernel/kaslr.c | 4 ++++ arch/arm64/mm/init.c | 22 ++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 8b32a1f8f09f..582983920054 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -21,6 +21,7 @@ #include u64 __read_mostly module_alloc_base; +u16 __initdata memstart_offset_seed; static __init u64 get_kaslr_seed(void *fdt) { @@ -123,6 +124,9 @@ u64 __init kaslr_early_init(u64 dt_phys) mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1); offset = seed & mask; + /* use the top 16 bits to randomize the linear region */ + memstart_offset_seed = seed >> 48; + /* * The kernel Image should not extend across a 1GB/32MB/512MB alignment * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 2c7a3c2868e4..58a6d3f7525c 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -196,6 +196,23 @@ void __init arm64_memblock_init(void) memblock_add(__pa(_text), (u64)(_end - _text)); } + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + extern u16 memstart_offset_seed; + u64 range = linear_region_size - + (memblock_end_of_DRAM() - memblock_start_of_DRAM()); + + /* + * If the size of the linear region exceeds, by a sufficient + * margin, the size of the region that the available physical + * memory spans, randomize the linear region as well. + */ + if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) { + range = range / ARM64_MEMSTART_ALIGN + 1; + memstart_addr -= ARM64_MEMSTART_ALIGN * + ((range * memstart_offset_seed) >> 16); + } + } + /* * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. 
@@ -365,12 +382,13 @@ void __init mem_init(void) #ifdef CONFIG_SPARSEMEM_VMEMMAP MLG(VMEMMAP_START, VMEMMAP_START + VMEMMAP_SIZE), - MLM((unsigned long)virt_to_page(PAGE_OFFSET), + MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()), (unsigned long)virt_to_page(high_memory)), #endif MLK(FIXADDR_START, FIXADDR_TOP), MLM(PCI_IO_START, PCI_IO_END), - MLM(PAGE_OFFSET, (unsigned long)high_memory)); + MLM(__phys_to_virt(memblock_start_of_DRAM()), + (unsigned long)high_memory)); #undef MLK #undef MLM -- cgit v1.2.3 From 4a9c1460b2b904c4a9b6438a14d10c56e0e9ab78 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 11 Jan 2016 11:47:49 +0100 Subject: efi: stub: use high allocation for converted command line Before we can move the command line processing before the allocation of the kernel, which is required for detecting the 'nokaslr' option which controls that allocation, move the converted command line higher up in memory, to prevent it from interfering with the kernel itself. Since x86 needs the address to fit in 32 bits, use UINT_MAX as the upper bound there. Otherwise, use ULONG_MAX (i.e., no limit) Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 48fcb2d0216103d15306caa4814e2381104df6d8) Signed-off-by: Alex Shi --- arch/x86/include/asm/efi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0010c78c4998..08b1f2f6ea50 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -25,6 +25,8 @@ #define EFI32_LOADER_SIGNATURE "EL32" #define EFI64_LOADER_SIGNATURE "EL64" +#define MAX_CMDLINE_ADDRESS UINT_MAX + #ifdef CONFIG_X86_32 -- cgit v1.2.3 From e009472925ee90986397518ef6796e6f8d12e1da Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 26 Jan 2016 14:48:29 +0100 Subject: arm64: efi: invoke EFI_RNG_PROTOCOL to supply KASLR randomness Since arm64 does not use a decompressor that supplies an execution environment where it is feasible to some extent to provide a source of randomness, the arm64 KASLR kernel depends on the bootloader to supply some random bits in the /chosen/kaslr-seed DT property upon kernel entry. On UEFI systems, we can use the EFI_RNG_PROTOCOL, if supplied, to obtain some random bits. At the same time, use it to randomize the offset of the kernel Image in physical memory. Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 2b5fe07a78a09a32002642b8a823428ade611f16) Signed-off-by: Alex Shi --- arch/arm64/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b311ac23c989..97583a1878db 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -760,6 +760,11 @@ config RANDOMIZE_BASE It is the bootloader's job to provide entropy, by passing a random u64 value in /chosen/kaslr-seed at kernel entry. + When booting via the UEFI stub, it will invoke the firmware's + EFI_RNG_PROTOCOL implementation (if available) to supply entropy + to the kernel proper. In addition, it will randomise the physical + location of the kernel Image as well. + If unsure, say N. 
config RANDOMIZE_MODULE_REGION_FULL -- cgit v1.2.3 From 885af350336f3e3256999234cd4948210df6c946 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 11 Feb 2016 13:53:10 -0800 Subject: arm64: make irq_stack_ptr more robust Switching between stacks is only valid if we are tracing ourselves while on the irq_stack, so it is only valid when in current and non-preemptible context, otherwise it is just zeroed. Fixes: 132cd887b5c5 ("arm64: Modify stack trace and dump for use with irq_stack") Acked-by: James Morse Tested-by: James Morse Signed-off-by: Yang Shi Signed-off-by: Will Deacon (cherry picked from commit a80a0eb70c358f8c7dda4bb62b2278dc6285217b) Signed-off-by: Alex Shi --- arch/arm64/kernel/stacktrace.c | 13 ++++++------- arch/arm64/kernel/traps.c | 11 ++++++++++- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 4fad9787ab46..cfd46c227c8c 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -44,14 +44,13 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) unsigned long irq_stack_ptr; /* - * Use raw_smp_processor_id() to avoid false-positives from - * CONFIG_DEBUG_PREEMPT. get_wchan() calls unwind_frame() on sleeping - * task stacks, we can be pre-empted in this case, so - * {raw_,}smp_processor_id() may give us the wrong value. Sleeping - * tasks can't ever be on an interrupt stack, so regardless of cpu, - * the checks will always fail. + * Switching between stacks is valid when tracing current and in + * non-preemptible context. */ - irq_stack_ptr = IRQ_STACK_PTR(raw_smp_processor_id()); + if (tsk == current && !preemptible()) + irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + else + irq_stack_ptr = 0; low = frame->sp; /* irq stacks are not THREAD_SIZE aligned */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index cbedd724f48e..c5392081b49b 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -146,9 +146,18 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; - unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + unsigned long irq_stack_ptr; int skip; + /* + * Switching between stacks is valid when tracing current and in + * non-preemptible context. + */ + if (tsk == current && !preemptible()) + irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + else + irq_stack_ptr = 0; + pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); if (!tsk) -- cgit v1.2.3 From 37829fdb8c27a5a506cb535db156c917a6e0061a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Mar 2016 15:22:55 +0000 Subject: arm64: hugetlb: partial revert of 66b3923a1a0f Commit 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") introduced support for huge pages using the contiguous bit in the PTE as opposed to block mappings, which may be slightly unwieldy (512M) in 64k page configurations. Unfortunately, this support has resulted in some late regressions when running the libhugetlbfs test suite with 64k pages and CONFIG_DEBUG_VM as a result of a BUG: | readback (2M: 64): ------------[ cut here ]------------ | kernel BUG at fs/hugetlbfs/inode.c:446! 
| Internal error: Oops - BUG: 0 [#1] SMP | Modules linked in: | CPU: 7 PID: 1448 Comm: readback Not tainted 4.5.0-rc7 #148 | Hardware name: linux,dummy-virt (DT) | task: fffffe0040964b00 ti: fffffe00c2668000 task.ti: fffffe00c2668000 | PC is at remove_inode_hugepages+0x44c/0x480 | LR is at remove_inode_hugepages+0x264/0x480 Rather than revert the entire patch, simply avoid advertising the contiguous huge page sizes for now while people are actively working on a fix. This patch can then be reverted once things have been sorted out. Cc: David Woods Reported-by: Steve Capper Signed-off-by: Will Deacon (cherry picked from commit ff7925848b50050732ac0401e0acf27e8b241d7b) Signed-off-by: Alex Shi --- arch/arm64/mm/hugetlbpage.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 82d607c3614e..da30529bb1f6 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -306,10 +306,6 @@ static __init int setup_hugepagesz(char *opt) hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); } else if (ps == PUD_SIZE) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else if (ps == (PAGE_SIZE * CONT_PTES)) { - hugetlb_add_hstate(CONT_PTE_SHIFT); - } else if (ps == (PMD_SIZE * CONT_PMDS)) { - hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT); } else { pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); return 0; @@ -317,13 +313,3 @@ static __init int setup_hugepagesz(char *opt) return 1; } __setup("hugepagesz=", setup_hugepagesz); - -#ifdef CONFIG_ARM64_64K_PAGES -static __init int add_default_hugepagesz(void) -{ - if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) - hugetlb_add_hstate(CONT_PMD_SHIFT); - return 0; -} -arch_initcall(add_default_hugepagesz); -#endif -- cgit v1.2.3 From f2971e0e6c42c2b0197e43280ef6a48d8a46097e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 15 Mar 2016 11:22:57 +0000 Subject: arm64: fix KASLR boot-time I-cache maintenance Commit f80fb3a3d50843a4 ("arm64: add support for kernel ASLR") missed a DSB necessary to complete I-cache maintenance in the primary boot path, and hence stale instructions may still be present in the I-cache and may be executed until the I-cache maintenance naturally completes. Since commit 8ec41987436d566f ("arm64: mm: ensure patched kernel text is fetched from PoU"), all CPUs invalidate their I-caches after their MMU is enabled. Prior to a CPU's MMU having been enabled, arbitrary lines may have been fetched from the PoC into I-caches. We never patch text expected to be executed with the MMU off. Thus, it is unnecessary to perform broadcast I-cache maintenance in the primary boot path. This patch reduces the scope of the I-cache maintenance to the local CPU, and adds the missing DSB with similar scope, matching prior maintenance in the primary boot path. 
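For reference, the local-scope sequence this patch adopts can be sketched as a small C helper (GCC-style inline asm; the wrapper name is made up here, only the three instructions come from the patch):

/*
 * "ic iallu" invalidates only the executing CPU's I-cache, so a
 * non-shareable DSB suffices to wait for completion; the ISB then
 * forces instructions to be refetched via the new mapping.
 */
static inline void local_icache_inval_all(void)
{
	asm volatile("ic	iallu\n\t"
		     "dsb	nsh\n\t"
		     "isb"
		     : : : "memory");
}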
Signed-off-by: Mark Rutland Acked-by: Ard Biesheuvel Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit b90b4a608ea2401cc491828f7a385edd2e236e37) Signed-off-by: Alex Shi --- arch/arm64/kernel/head.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 319f896c6e74..a88a15447c3b 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -740,8 +740,9 @@ __enable_mmu: msr sctlr_el1, x19 // re-enable the MMU isb - ic ialluis // flush instructions fetched - isb // via old mapping + ic iallu // flush instructions fetched + dsb nsh // via old mapping + isb add x27, x27, x23 // relocated __mmap_switched #endif br x27 -- cgit v1.2.3 From 2426266ca6318722160645a720bd02bece8400c7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 22 Mar 2016 10:11:45 +0000 Subject: arm64: consistently use p?d_set_huge Commit 324420bf91f60582 ("arm64: add support for ioremap() block mappings") added new p?d_set_huge functions which do the hard work to generate and set a correct block entry. These differ from open-coded huge page creation in the early page table code by explicitly setting the P?D_TYPE_SECT bits (which are implicitly retained by mk_sect_prot() for any valid prot), but are otherwise identical (and cannot fail on arm64). For simplicity and consistency, make use of these in the initial page table creation code. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Will Deacon Signed-off-by: Catalin Marinas (cherry picked from commit c661cb1c537e2364bfdabb298fb934fd77445e98) Signed-off-by: Alex Shi --- arch/arm64/mm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ff0f5a46b552..41421c724fb9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -211,8 +211,7 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, if (((addr | next | phys) & ~SECTION_MASK) == 0 && block_mappings_allowed(pgtable_alloc)) { pmd_t old_pmd =*pmd; - set_pmd(pmd, __pmd(phys | - pgprot_val(mk_sect_prot(prot)))); + pmd_set_huge(pmd, phys, prot); /* * Check for previous table entries created during * boot (__create_page_tables) and flush them. @@ -272,8 +271,7 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, if (use_1G_block(addr, next, phys) && block_mappings_allowed(pgtable_alloc)) { pud_t old_pud = *pud; - set_pud(pud, __pud(phys | - pgprot_val(mk_sect_prot(prot)))); + pud_set_huge(pud, phys, prot); /* * If we have an old value for a pud, it will -- cgit v1.2.3 From 9ca29910090bc04686fbed05306131093da667f1 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 10 Mar 2016 18:41:16 +0000 Subject: arm64: kasan: Fix zero shadow mapping overriding kernel image shadow With the 16KB and 64KB page size configurations, SWAPPER_BLOCK_SIZE is PAGE_SIZE and ARM64_SWAPPER_USES_SECTION_MAPS is 0. 
Since kimg_shadow_end is not page aligned (_end shifted by KASAN_SHADOW_SCALE_SHIFT), the edges of previously mapped kernel image shadow via vmemmap_populate() may be overridden by subsequent calls to kasan_populate_zero_shadow(), leading to kernel panics like below: ------------------------------------------------------------------------------ Unable to handle kernel paging request at virtual address fffffc100135068c pgd = fffffc8009ac0000 [fffffc100135068c] *pgd=00000009ffee0003, *pud=00000009ffee0003, *pmd=00000009ffee0003, *pte=00e0000081a00793 Internal error: Oops: 9600004f [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.5.0-rc4+ #1984 Hardware name: Juno (DT) task: fffffe09001a0000 ti: fffffe0900200000 task.ti: fffffe0900200000 PC is at __memset+0x4c/0x200 LR is at kasan_unpoison_shadow+0x34/0x50 pc : [] lr : [] pstate: 00000245 sp : fffffe0900203db0 x29: fffffe0900203db0 x28: 0000000000000000 x27: 0000000000000000 x26: 0000000000000000 x25: fffffc80099b69d0 x24: 0000000000000001 x23: 0000000000000000 x22: 0000000000002000 x21: dffffc8000000000 x20: 1fffff9001350a8c x19: 0000000000002000 x18: 0000000000000008 x17: 0000000000000147 x16: ffffffffffffffff x15: 79746972100e041d x14: ffffff0000000000 x13: ffff000000000000 x12: 0000000000000000 x11: 0101010101010101 x10: 1fffffc11c000000 x9 : 0000000000000000 x8 : fffffc100135068c x7 : 0000000000000000 x6 : 000000000000003f x5 : 0000000000000040 x4 : 0000000000000004 x3 : fffffc100134f651 x2 : 0000000000000400 x1 : 0000000000000000 x0 : fffffc100135068c Process swapper/0 (pid: 1, stack limit = 0xfffffe0900200020) Call trace: [] __memset+0x4c/0x200 [] __asan_register_globals+0x5c/0xb0 [] _GLOBAL__sub_I_65535_1_sunrpc_cache_lookup+0x1c/0x28 [] kernel_init_freeable+0x104/0x274 [] kernel_init+0x10/0xf8 [] ret_from_fork+0x10/0x50 ------------------------------------------------------------------------------ This patch aligns kimg_shadow_start and kimg_shadow_end to SWAPPER_BLOCK_SIZE in all configurations. Fixes: f9040773b7bb ("arm64: move kernel image to base of vmalloc area") Signed-off-by: Catalin Marinas Acked-by: Mark Rutland Acked-by: Ard Biesheuvel (cherry picked from commit 2776e0e8ef683a42fe3e9a5facf576b73579700e) Signed-off-by: Alex Shi --- arch/arm64/mm/kasan_init.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 56e19d150c21..206dd95ea292 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -158,15 +158,12 @@ void __init kasan_init(void) * vmemmap_populate() has populated the shadow region that covers the * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent - * kasan_populate_zero_shadow() from replacing the PMD block mappings - * with PMD table mappings at the edges of the shadow region for the - * kernel image. + * kasan_populate_zero_shadow() from replacing the page table entries + * (PMD or PTE) at the edges of the shadow region for the kernel + * image. 
*/ - if (ARM64_SWAPPER_USES_SECTION_MAPS) { - kimg_shadow_start = round_down(kimg_shadow_start, - SWAPPER_BLOCK_SIZE); - kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); - } + kimg_shadow_start = round_down(kimg_shadow_start, SWAPPER_BLOCK_SIZE); + kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, (void *)mod_shadow_start); -- cgit v1.2.3 From 3d3fe7cf1fbc97ea285fd1ee3ed55b7527e4d1ae Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 2 Mar 2016 09:47:13 +0100 Subject: arm64: mm: check at build time that PAGE_OFFSET divides the VA space evenly Commit 8439e62a1561 ("arm64: mm: use bit ops rather than arithmetic in pa/va translations") changed the boundary check against PAGE_OFFSET from an arithmetic comparison to a bit test. This means we now silently assume that PAGE_OFFSET is a power of 2 that divides the kernel virtual address space into two equal halves. So make that assumption explicit. Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas (cherry picked from commit 6d2aa549de1fc998581d216de3853aa131aa4446) Signed-off-by: Alex Shi --- arch/arm64/mm/init.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 58a6d3f7525c..19ccdb73c680 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -170,6 +170,13 @@ void __init arm64_memblock_init(void) { const s64 linear_region_size = -(s64)PAGE_OFFSET; + /* + * Ensure that the linear region takes up exactly half of the kernel + * virtual address space. This way, we can distinguish a linear address + * from a kernel/module/vmalloc address by testing a single bit. + */ + BUILD_BUG_ON(linear_region_size != BIT(VA_BITS - 1)); + /* * Select a suitable value for the base of physical memory. */ -- cgit v1.2.3 From 27fa6e51b8ddfd0224dad957b85fd8097caa5978 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 25 Feb 2016 20:48:53 +0100 Subject: arm64: lse: deal with clobbered IP registers after branch via PLT The LSE atomics implementation uses runtime patching to patch in calls to out of line non-LSE atomics implementations on cores that lack hardware support for LSE. To avoid paying the overhead cost of a function call even if no call ends up being made, the bl instruction is kept invisible to the compiler, and the out of line implementations preserve all registers, not just the ones that they are required to preserve as per the AAPCS64. However, commit fd045f6cd98e ("arm64: add support for module PLTs") added support for routing branch instructions via veneers if the branch target offset exceeds the range of the ordinary relative branch instructions. Since this deals with jump and call instructions that are exposed to ELF relocations, the PLT code uses x16 to hold the address of the branch target when it performs an indirect branch-to-register, something which is explicitly allowed by the AAPCS64 (and ordinary compiler generated code does not expect register x16 or x17 to retain their values across a bl instruction). Since the lse runtime patched bl instructions don't adhere to the AAPCS64, they don't deal with this clobbering of registers x16 and x17. So add them to the clobber list of the asm() statements that perform the call instructions, and drop x16 and x17 from the list of registers that are callee saved in the out of line non-LSE implementations. 
From 27fa6e51b8ddfd0224dad957b85fd8097caa5978 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Thu, 25 Feb 2016 20:48:53 +0100
Subject: arm64: lse: deal with clobbered IP registers after branch via PLT

The LSE atomics implementation uses runtime patching to patch in calls
to out of line non-LSE atomics implementations on cores that lack
hardware support for LSE. To avoid paying the overhead cost of a
function call even if no call ends up being made, the bl instruction is
kept invisible to the compiler, and the out of line implementations
preserve all registers, not just the ones that they are required to
preserve as per the AAPCS64.

However, commit fd045f6cd98e ("arm64: add support for module PLTs")
added support for routing branch instructions via veneers if the branch
target offset exceeds the range of the ordinary relative branch
instructions. Since this deals with jump and call instructions that are
exposed to ELF relocations, the PLT code uses x16 to hold the address
of the branch target when it performs an indirect branch-to-register,
something which is explicitly allowed by the AAPCS64 (and ordinary
compiler generated code does not expect register x16 or x17 to retain
their values across a bl instruction).

Since the lse runtime patched bl instructions don't adhere to the
AAPCS64, they don't deal with this clobbering of registers x16 and x17.
So add them to the clobber list of the asm() statements that perform
the call instructions, and drop x16 and x17 from the list of registers
that are callee saved in the out of line non-LSE implementations.

In addition, since we have given these functions two scratch registers,
they no longer need to stack/unstack temp registers.
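The clobber-list requirement can be seen in isolation with a small
self-contained sketch (arm64 user space only, hypothetical names; this is
not the kernel's ARM64_LSE_ATOMIC_INSN machinery). An asm() statement that
may overwrite the IP registers has to declare them, otherwise the compiler
is free to keep live values there across the statement:

#include <stdio.h>

/* Mirrors the idea of __LL_SC_CLOBBERS: x16/x17 may be written by a PLT
 * veneer on the way to the out-of-line atomics, x30 by the bl itself. */
#define SKETCH_CLOBBERS	"x16", "x17", "x30"

static long touch_ip_regs(long v)
{
	asm volatile(
	"	mov	x16, %0\n"	/* pretend a veneer used x16... */
	"	mov	x17, x16\n"	/* ...and x17 */
	"	mov	%0, x17\n"
	: "+r" (v)
	:
	: SKETCH_CLOBBERS);
	return v;
}

int main(void)
{
	printf("%ld\n", touch_ip_regs(42));	/* prints 42 */
	return 0;
}

This tiny program works either way; the point is that in a larger function
the compiler may have cached some other live value in x16 or x17, and
without the clobber declarations the undeclared writes would silently
corrupt it - the same failure mode the patch fixes for the runtime-patched
bl instructions.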
Signed-off-by: Ard Biesheuvel
[will: factored clobber list into #define, updated Makefile comment]
Signed-off-by: Will Deacon
Signed-off-by: Catalin Marinas
(cherry picked from commit 5be8b70af1ca78cefb8b756d157532360a5fd663)
Signed-off-by: Alex Shi
---
 arch/arm64/include/asm/atomic_lse.h | 38 ++++++++++++++++++-------------------
 arch/arm64/include/asm/lse.h        |  1 +
 arch/arm64/lib/Makefile             | 13 +++++++------
 3 files changed, 27 insertions(+), 25 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 197e06afbf71..39c1d340fec5 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -36,7 +36,7 @@ static inline void atomic_andnot(int i, atomic_t *v)
	"	stclr	%w[i], %[v]\n")
	: [i] "+r" (w0), [v] "+Q" (v->counter)
	: "r" (x1)
-	: "x30");
+	: __LL_SC_CLOBBERS);
 }

@@ -48,7 +48,7 @@ static inline void atomic_or(int i, atomic_t *v)
	"	stset	%w[i], %[v]\n")
	: [i] "+r" (w0), [v] "+Q" (v->counter)
	: "r" (x1)
-	: "x30");
+	: __LL_SC_CLOBBERS);
 }

@@ -60,7 +60,7 @@ static inline void atomic_xor(int i, atomic_t *v)
	"	steor	%w[i], %[v]\n")
	: [i] "+r" (w0), [v] "+Q" (v->counter)
	: "r" (x1)
-	: "x30");
+	: __LL_SC_CLOBBERS);
 }

@@ -72,7 +72,7 @@ static inline void atomic_add(int i, atomic_t *v)
	"	stadd	%w[i], %[v]\n")
	: [i] "+r" (w0), [v] "+Q" (v->counter)
	: "r" (x1)
-	: "x30");
+	: __LL_SC_CLOBBERS);
 }

 #define ATOMIC_OP_ADD_RETURN(name, mb, cl...)				\
@@ -90,7 +90,7 @@ static inline int atomic_add_return##name(int i, atomic_t *v)	\
	"	add	%w[i], %w[i], w30")				\
	: [i] "+r" (w0), [v] "+Q" (v->counter)				\
	: "r" (x1)							\
-	: "x30" , ##cl);						\
+	: __LL_SC_CLOBBERS, ##cl);					\
									\
	return w0;							\
 }

@@ -116,7 +116,7 @@ static inline void atomic_and(int i, atomic_t *v)
	"	stclr	%w[i], %[v]")
	: [i] "+r" (w0), [v] "+Q" (v->counter)
	: "r" (x1)
-	: "x30");
+	: __LL_SC_CLOBBERS);
 }

@@ -133,7 +133,7 @@ static inline void atomic_sub(int i, atomic_t *v)
	"	stadd	%w[i], %[v]")
	: [i] "+r" (w0), [v] "+Q" (v->counter)
	: "r" (x1)
-	: "x30");
+	: __LL_SC_CLOBBERS);
 }

 #define ATOMIC_OP_SUB_RETURN(name, mb, cl...)				\
@@ -153,7 +153,7 @@ static inline int atomic_sub_return##name(int i, atomic_t *v)	\
	"	add	%w[i], %w[i], w30")				\
	: [i] "+r" (w0), [v] "+Q" (v->counter)				\
	: "r" (x1)							\
-	: "x30" , ##cl);						\
+	: __LL_SC_CLOBBERS , ##cl);					\
									\
	return w0;							\
 }

@@ -177,7 +177,7 @@ static inline void atomic64_andnot(long i, atomic64_t *v)
	"	stclr	%[i], %[v]\n")
	: [i] "+r" (x0), [v] "+Q" (v->counter)
	: "r" (x1)
-	: "x30");
+	: __LL_SC_CLOBBERS);
 }

@@ -189,7 +189,7 @@ static inline void atomic64_or(long i, atomic64_t *v)
	"	stset	%[i], %[v]\n")
	: [i] "+r" (x0), [v] "+Q" (v->counter)
	: "r" (x1)
-	: "x30");
+	: __LL_SC_CLOBBERS);
 }

@@ -201,7 +201,7 @@ static inline void atomic64_xor(long i, atomic64_t *v)
	"	steor	%[i], %[v]\n")
	: [i] "+r" (x0), [v] "+Q" (v->counter)
	: "r" (x1)
-	: "x30");
+	: __LL_SC_CLOBBERS);
 }

@@ -213,7 +213,7 @@ static inline void atomic64_add(long i, atomic64_t *v)
	"	stadd	%[i], %[v]\n")
	: [i] "+r" (x0), [v] "+Q" (v->counter)
	: "r" (x1)
-	: "x30");
+	: __LL_SC_CLOBBERS);
 }

 #define ATOMIC64_OP_ADD_RETURN(name, mb, cl...)			\
@@ -231,7 +231,7 @@ static inline long atomic64_add_return##name(long i, atomic64_t *v)	\
	"	add	%[i], %[i], x30")				\
	: [i] "+r" (x0), [v] "+Q" (v->counter)				\
	: "r" (x1)							\
-	: "x30" , ##cl);						\
+	: __LL_SC_CLOBBERS, ##cl);					\
									\
	return x0;							\
 }

@@ -257,7 +257,7 @@ static inline void atomic64_and(long i, atomic64_t *v)
	"	stclr	%[i], %[v]")
	: [i] "+r" (x0), [v] "+Q" (v->counter)
	: "r" (x1)
-	: "x30");
+	: __LL_SC_CLOBBERS);
 }

@@ -274,7 +274,7 @@ static inline void atomic64_sub(long i, atomic64_t *v)
	"	stadd	%[i], %[v]")
	: [i] "+r" (x0), [v] "+Q" (v->counter)
	: "r" (x1)
-	: "x30");
+	: __LL_SC_CLOBBERS);
 }

 #define ATOMIC64_OP_SUB_RETURN(name, mb, cl...)			\
@@ -294,7 +294,7 @@ static inline long atomic64_sub_return##name(long i, atomic64_t *v)	\
	"	add	%[i], %[i], x30")				\
	: [i] "+r" (x0), [v] "+Q" (v->counter)				\
	: "r" (x1)							\
-	: "x30" , ##cl);						\
+	: __LL_SC_CLOBBERS, ##cl);					\
									\
	return x0;							\
 }

@@ -330,7 +330,7 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
	"2:")
	: [ret] "+&r" (x0), [v] "+Q" (v->counter)
	:
-	: "x30", "cc", "memory");
+	: __LL_SC_CLOBBERS, "cc", "memory");

	return x0;
 }

@@ -359,7 +359,7 @@ static inline unsigned long __cmpxchg_case_##name(volatile void *ptr,	\
	"	mov	%" #w "[ret], " #w "30")			\
	: [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr)		\
	: [old] "r" (x1), [new] "r" (x2)				\
-	: "x30" , ##cl);						\
+	: __LL_SC_CLOBBERS, ##cl);					\
									\
	return x0;							\
 }

@@ -416,7 +416,7 @@ static inline long __cmpxchg_double##name(unsigned long old1,	\
	  [v] "+Q" (*(unsigned long *)ptr)				\
	: [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),		\
	  [oldval1] "r" (oldval1), [oldval2] "r" (oldval2)		\
-	: "x30" , ##cl);						\
+	: __LL_SC_CLOBBERS, ##cl);					\
									\
	return x0;							\
 }

diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h
index 3de42d68611d..23acc00be32d 100644
--- a/arch/arm64/include/asm/lse.h
+++ b/arch/arm64/include/asm/lse.h
@@ -26,6 +26,7 @@ __asm__(".arch_extension lse");

 /* Macro for constructing calls to out-of-line ll/sc atomics */
 #define __LL_SC_CALL(op)	"bl\t" __stringify(__LL_SC_PREFIX(op)) "\n"
+#define __LL_SC_CLOBBERS	"x16", "x17", "x30"

 /* In-line patching at runtime */
 #define ARM64_LSE_ATOMIC_INSN(llsc, lse)				\

diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 1a811ecf71da..c86b7909ef31 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -4,15 +4,16 @@ lib-y		:= bitops.o clear_user.o delay.o copy_from_user.o	\
		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
		   strchr.o strrchr.o

-# Tell the compiler to treat all general purpose registers as
-# callee-saved, which allows for efficient runtime patching of the bl
-# instruction in the caller with an atomic instruction when supported by
-# the CPU. Result and argument registers are handled correctly, based on
-# the function prototype.
+# Tell the compiler to treat all general purpose registers (with the
+# exception of the IP registers, which are already handled by the caller
+# in case of a PLT) as callee-saved, which allows for efficient runtime
+# patching of the bl instruction in the caller with an atomic instruction
+# when supported by the CPU. Result and argument registers are handled
+# correctly, based on the function prototype.
 lib-$(CONFIG_ARM64_LSE_ATOMICS) += atomic_ll_sc.o
 CFLAGS_atomic_ll_sc.o	:= -fcall-used-x0 -ffixed-x1 -ffixed-x2		\
		   -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6		\
		   -ffixed-x7 -fcall-saved-x8 -fcall-saved-x9		\
		   -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12	\
		   -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15	\
-		   -fcall-saved-x16 -fcall-saved-x17 -fcall-saved-x18
+		   -fcall-saved-x18
-- cgit v1.2.3
From b78c702db9fefface6f68cfade7a1afbadd829ab Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Fri, 26 Feb 2016 17:57:14 +0100
Subject: arm64: mm: treat memstart_addr as a signed quantity

Commit c031a4213c11 ("arm64: kaslr: randomize the linear region")
implements randomization of the linear region by subtracting a random
multiple of PUD_SIZE from memstart_addr. This causes the virtual
mapping of system RAM to move upwards in the linear region, and at the
same time causes memstart_addr to assume a value which may be negative
if the offset of system RAM in the physical space is smaller than its
offset relative to PAGE_OFFSET in the virtual space.

Since memstart_addr is effectively an offset now, redefine its type as
s64 so that expressions involving shifting or division preserve its
sign.
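The hazard being guarded against is easy to reproduce in a standalone
sketch (made-up constant; strictly speaking, right-shifting a negative
value is implementation-defined in C, but it is an arithmetic shift on the
compilers the kernel supports):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	/* A negative offset, as a randomized memstart_addr can now be */
	int64_t  signed_off   = -0x40000000;
	uint64_t unsigned_off = (uint64_t)signed_off;

	/* Arithmetic shift keeps the sign... */
	printf("s64 >> 12: %" PRId64 "\n", signed_off >> 12);
	/* ...logical shift turns it into a huge bogus "address" */
	printf("u64 >> 12: 0x%" PRIx64 "\n", unsigned_off >> 12);
	return 0;
}

This prints -262144 and then 0xffffffffc0000: with an unsigned
phys_addr_t, every shift or division of a negative memstart_addr would
produce the second, meaningless result, which is why the type itself is
changed rather than each call site.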
Signed-off-by: Ard Biesheuvel
Signed-off-by: Catalin Marinas
(cherry picked from commit 020d044f66874eba058ce8264fc550f3eca67879)
Signed-off-by: Alex Shi
---
 arch/arm64/include/asm/memory.h | 2 +-
 arch/arm64/mm/init.c            | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 5f8667a99e41..12f8a00fb3f1 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -135,7 +135,7 @@
 #include
 #include

-extern phys_addr_t memstart_addr;
+extern s64 memstart_addr;
 /* PHYS_OFFSET - the physical address of the start of memory. */
 #define PHYS_OFFSET		({ VM_BUG_ON(memstart_addr & 1); memstart_addr; })

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 19ccdb73c680..9db46dfb6afb 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -54,7 +54,7 @@
  * executes, which assigns it its actual value. So use a default value
  * that cannot be mistaken for a real physical address.
  */
-phys_addr_t memstart_addr __read_mostly = ~0ULL;
+s64 memstart_addr __read_mostly = -1;
 phys_addr_t arm64_dma_phys_limit __read_mostly;

 #ifdef CONFIG_BLK_DEV_INITRD
@@ -188,7 +188,7 @@ void __init arm64_memblock_init(void)
	 * linear mapping. Take care not to clip the kernel which may be
	 * high in memory.
	 */
-	memblock_remove(max(memstart_addr + linear_region_size, __pa(_end)),
+	memblock_remove(max_t(u64, memstart_addr + linear_region_size, __pa(_end)),
			ULLONG_MAX);
	if (memblock_end_of_DRAM() > linear_region_size)
		memblock_remove(0, memblock_end_of_DRAM() - linear_region_size);
-- cgit v1.2.3

From 200d9e78dba04ae2a5ee4b847f389758db5152dd Mon Sep 17 00:00:00 2001
From: Catalin Marinas
Date: Thu, 10 Mar 2016 18:30:56 +0000
Subject: arm64: kasan: Use actual memory node when populating the kernel image shadow

With the 16KB or 64KB page configurations, the generic
vmemmap_populate() implementation warns on potential offnode
page_structs via vmemmap_verify() because the arm64 kasan_init() passes
NUMA_NO_NODE instead of the actual node for the kernel image memory.
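A toy userspace model of the mismatch (hypothetical names and node
layout; pfn_to_nid(), virt_to_pfn() and vmemmap_verify() are
kernel-internal and not available here):

#include <stdio.h>

#define MY_NUMA_NO_NODE	(-1)

/* Toy two-node layout: pfns below 0x80000 on node 0, the rest on node 1 */
static int my_pfn_to_nid(unsigned long pfn)
{
	return pfn >= 0x80000;
}

/* Model of vmemmap_populate(): with MY_NUMA_NO_NODE the allocator may
 * pick any node, so assume the worst case; the verify step then warns
 * when the backing pages are offnode relative to the covered memory. */
static void my_populate(unsigned long covered_pfn, int nid)
{
	int got = (nid == MY_NUMA_NO_NODE) ? !my_pfn_to_nid(covered_pfn) : nid;

	if (got != my_pfn_to_nid(covered_pfn))
		printf("warning: potential offnode page_structs\n");
	else
		printf("populated on node %d\n", got);
}

int main(void)
{
	unsigned long kimg_pfn = 0x90000;	/* pretend kernel image pfn */

	my_populate(kimg_pfn, MY_NUMA_NO_NODE);		/* old behaviour */
	my_populate(kimg_pfn, my_pfn_to_nid(kimg_pfn));	/* this patch */
	return 0;
}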
Fixes: f9040773b7bb ("arm64: move kernel image to base of vmalloc area")
Signed-off-by: Catalin Marinas
Reported-by: James Morse
Acked-by: Ard Biesheuvel
Acked-by: Mark Rutland
(cherry picked from commit 2f76969f2eef051bdd63d38b08d78e790440b0ad)
Signed-off-by: Alex Shi
---
 arch/arm64/mm/kasan_init.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 206dd95ea292..757009daa9ed 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -152,7 +152,8 @@ void __init kasan_init(void)

	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);

-	vmemmap_populate(kimg_shadow_start, kimg_shadow_end, NUMA_NO_NODE);
+	vmemmap_populate(kimg_shadow_start, kimg_shadow_end,
+			 pfn_to_nid(virt_to_pfn(_text)));

	/*
	 * vmemmap_populate() has populated the shadow region that covers the
-- cgit v1.2.3

From 5dd612ebfad71f5463d89ff92d1bc307cd286b5d Mon Sep 17 00:00:00 2001
From: Helge Deller
Date: Wed, 23 Mar 2016 16:00:46 +0100
Subject: parisc: Use generic extable search and sort routines

Switch to the generic extable search and sort routines, which were
introduced with commit a272858 from Ard Biesheuvel. This saves a
noticeable amount of memory in the vmlinux binary of the 64-bit kernel.

Signed-off-by: Helge Deller
(cherry picked from commit 0de798584bdedfdad19db21e3c7aec84f252f4f3)
Signed-off-by: Alex Shi
---
 arch/parisc/Kconfig                | 1 +
 arch/parisc/include/asm/assembly.h | 2 +-
 arch/parisc/include/asm/uaccess.h  | 7 ++++---
 arch/parisc/mm/fault.c             | 9 ++-------
 4 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 729f89163bc3..d2256fa97ea0 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -11,6 +11,7 @@ config PARISC
	select RTC_DRV_GENERIC
	select INIT_ALL_POSSIBLE
	select BUG
+	select BUILDTIME_EXTABLE_SORT
	select HAVE_PERF_EVENTS
	select GENERIC_ATOMIC64 if !64BIT
	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE

diff --git a/arch/parisc/include/asm/assembly.h b/arch/parisc/include/asm/assembly.h
index b3069fd83468..60e6f07b7e32 100644
--- a/arch/parisc/include/asm/assembly.h
+++ b/arch/parisc/include/asm/assembly.h
@@ -523,7 +523,7 @@
	 */
 #define ASM_EXCEPTIONTABLE_ENTRY(fault_addr, except_addr)	\
	.section __ex_table,"aw"			!	\
-	ASM_ULONG_INSN	fault_addr, except_addr		!	\
+	.word (fault_addr - .), (except_addr - .)	!	\
	.previous

diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h
index 1960b87c1c8b..6f893d29f1b2 100644
--- a/arch/parisc/include/asm/uaccess.h
+++ b/arch/parisc/include/asm/uaccess.h
@@ -60,14 +60,15 @@ static inline long access_ok(int type, const void __user * addr,
  * use a 32bit (unsigned int) address here.
  */

+#define ARCH_HAS_RELATIVE_EXTABLE
 struct exception_table_entry {
-	unsigned long insn;	/* address of insn that is allowed to fault. */
-	unsigned long fixup;	/* fixup routine */
+	int insn;	/* relative address of insn that is allowed to fault. */
+	int fixup;	/* relative address of fixup routine */
 };

 #define ASM_EXCEPTIONTABLE_ENTRY( fault_addr, except_addr )\
	".section __ex_table,\"aw\"\n"			   \
-	ASM_WORD_INSN #fault_addr ", " #except_addr "\n\t" \
+	".word (" #fault_addr " - .), (" #except_addr " - .)\n\t" \
	".previous\n"

 /*

diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index f9064449908a..16dbe81c97c9 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -140,12 +140,6 @@ int fixup_exception(struct pt_regs *regs)
 {
	const struct exception_table_entry *fix;

-	/* If we only stored 32bit addresses in the exception table we can drop
-	 * out if we faulted on a 64bit address. */
-	if ((sizeof(regs->iaoq[0]) > sizeof(fix->insn))
-	    && (regs->iaoq[0] >> 32))
-		return 0;
-
	fix = search_exception_tables(regs->iaoq[0]);
	if (fix) {
		struct exception_data *d;
@@ -155,7 +149,8 @@ int fixup_exception(struct pt_regs *regs)
		d->fault_space = regs->isr;
		d->fault_addr = regs->ior;

-		regs->iaoq[0] = ((fix->fixup) & ~3);
+		regs->iaoq[0] = (unsigned long)&fix->fixup + fix->fixup;
+		regs->iaoq[0] &= ~3;

		/*
		 * NOTE: In some cases the faulting instruction
		 * may be in the delay slot of a branch. We
-- cgit v1.2.3
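The decode step that fixup_exception() now performs, and the size saving
that motivates the switch, can be modelled in a standalone sketch
(hypothetical names; like the kernel, it assumes the table sits within
+/-2GB of the code it points at, so the deltas fit in 32 bits):

#include <stdio.h>
#include <stdint.h>

/* A relative extable entry stores 32-bit self-relative offsets instead
 * of two absolute addresses, halving the table size on 64-bit. The
 * absolute address is recovered by adding the offset to the address of
 * the field itself, as fixup_exception() does above. */
struct rel_extable_entry {
	int32_t insn;	/* faulting insn, relative to &entry->insn */
	int32_t fixup;	/* fixup code, relative to &entry->fixup */
};

static uintptr_t ex_insn_addr(const struct rel_extable_entry *e)
{
	return (uintptr_t)&e->insn + e->insn;
}

static uintptr_t ex_fixup_addr(const struct rel_extable_entry *e)
{
	return (uintptr_t)&e->fixup + e->fixup;
}

int main(void)
{
	static char insn_stub, fixup_stub;	/* stand-ins for code labels */
	static struct rel_extable_entry e;

	/* Encode: store self-relative deltas, as ".word (addr - .)" does */
	e.insn  = (int32_t)((uintptr_t)&insn_stub  - (uintptr_t)&e.insn);
	e.fixup = (int32_t)((uintptr_t)&fixup_stub - (uintptr_t)&e.fixup);

	printf("decoded insn  ok: %d\n", ex_insn_addr(&e)  == (uintptr_t)&insn_stub);
	printf("decoded fixup ok: %d\n", ex_fixup_addr(&e) == (uintptr_t)&fixup_stub);
	printf("entry size: %zu bytes\n", sizeof(e));	/* 8, not 16 */
	return 0;
}

Each entry shrinks from two unsigned longs (16 bytes) to two ints
(8 bytes) on 64-bit, which is where the vmlinux memory savings mentioned
in the commit message come from.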