| author | Linux Build Service Account <lnxbuild@quicinc.com> | 2017-09-12 14:41:59 -0700 |
|---|---|---|
| committer | Gerrit - the friendly Code Review server <code-review@localhost> | 2017-09-12 14:41:58 -0700 |
| commit | 14f6bfeeebb97c02cbf0c43818f7998e3bdb3cae (patch) | |
| tree | 055fbddd1e98f289dc9c65cb65a8e37fc2dbc151 /arch | |
| parent | 043f52c20a63011ae14175e56926076965d59353 (diff) | |
| parent | 03f50f905f9f7cadbb1b3cd82c078806bfd77703 (diff) | |
Merge "Merge android-4.4@610af85 (v4.4.85) into msm-4.4"
Diffstat (limited to 'arch')
| Mode | Path | Lines changed |
|---|---|---|
| -rw-r--r-- | arch/arc/include/asm/cache.h | 2 |
| -rw-r--r-- | arch/arc/mm/cache.c | 13 |
| -rw-r--r-- | arch/arm64/include/asm/current.h | 10 |
| -rw-r--r-- | arch/arm64/include/asm/elf.h | 4 |
| -rw-r--r-- | arch/arm64/include/asm/smp.h | 4 |
| -rw-r--r-- | arch/arm64/kernel/setup.c | 2 |
| -rw-r--r-- | arch/x86/crypto/sha1_avx2_x86_64_asm.S | 67 |
| -rw-r--r-- | arch/x86/crypto/sha1_ssse3_glue.c | 2 |
| -rw-r--r-- | arch/x86/entry/entry_64.S | 2 |
| -rw-r--r-- | arch/x86/include/asm/elf.h | 4 |
| -rw-r--r-- | arch/x86/kernel/cpu/perf_event_intel_lbr.c | 8 |
11 files changed, 74 insertions, 44 deletions
diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h
index 210ef3e72332..0ddd7144c492 100644
--- a/arch/arc/include/asm/cache.h
+++ b/arch/arc/include/asm/cache.h
@@ -88,7 +88,9 @@ extern int ioc_exists;
 #define ARC_REG_SLC_FLUSH 0x904
 #define ARC_REG_SLC_INVALIDATE 0x905
 #define ARC_REG_SLC_RGN_START 0x914
+#define ARC_REG_SLC_RGN_START1 0x915
 #define ARC_REG_SLC_RGN_END 0x916
+#define ARC_REG_SLC_RGN_END1 0x917
 
 /* Bit val in SLC_CONTROL */
 #define SLC_CTRL_IM 0x040
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index d81b6d7e11e7..9a84cbdd44b0 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -543,6 +543,7 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op)
 	static DEFINE_SPINLOCK(lock);
 	unsigned long flags;
 	unsigned int ctrl;
+	phys_addr_t end;
 
 	spin_lock_irqsave(&lock, flags);
 
@@ -572,8 +573,16 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op)
 	 * END needs to be setup before START (latter triggers the operation)
 	 * END can't be same as START, so add (l2_line_sz - 1) to sz
 	 */
-	write_aux_reg(ARC_REG_SLC_RGN_END, (paddr + sz + l2_line_sz - 1));
-	write_aux_reg(ARC_REG_SLC_RGN_START, paddr);
+	end = paddr + sz + l2_line_sz - 1;
+	if (is_pae40_enabled())
+		write_aux_reg(ARC_REG_SLC_RGN_END1, upper_32_bits(end));
+
+	write_aux_reg(ARC_REG_SLC_RGN_END, lower_32_bits(end));
+
+	if (is_pae40_enabled())
+		write_aux_reg(ARC_REG_SLC_RGN_START1, upper_32_bits(paddr));
+
+	write_aux_reg(ARC_REG_SLC_RGN_START, lower_32_bits(paddr));
 
 	while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY);
 
diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h
index 2e61d21294ba..483a6c9d3e10 100644
--- a/arch/arm64/include/asm/current.h
+++ b/arch/arm64/include/asm/current.h
@@ -10,9 +10,17 @@
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 struct task_struct;
 
+/*
+ * We don't use read_sysreg() as we want the compiler to cache the value where
+ * possible.
+ */
 static __always_inline struct task_struct *get_current(void)
 {
-	return (struct task_struct *)read_sysreg(sp_el0);
+	unsigned long sp_el0;
+
+	asm ("mrs %0, sp_el0" : "=r" (sp_el0));
+
+	return (struct task_struct *)sp_el0;
 }
 #define current get_current()
 #else
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index b98332269462..9d9287277201 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -115,10 +115,10 @@
 
 /*
  * This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
  * space open for things that want to use the area for 32-bit pointers.
  */
-#define ELF_ET_DYN_BASE 0x100000000UL
+#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index a05033beb2a2..4325b3622a92 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -28,8 +28,10 @@ DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
 * We don't use this_cpu_read(cpu_number) as that has implicit writes to
 * preempt_count, and associated (compiler) barriers, that we'd like to avoid
 * the expense of. If we're preemptible, the value can be stale at use anyway.
+ * And we can't use this_cpu_ptr() either, as that winds up recursing back
+ * here under CONFIG_DEBUG_PREEMPT=y.
 */
-#define raw_smp_processor_id() (*this_cpu_ptr(&cpu_number))
+#define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number))
 
 struct seq_file;
 
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index f592e7406fc5..b8b40d95ebef 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -369,7 +369,7 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	init_task.thread_info.ttbr0 = virt_to_phys(empty_zero_page);
 #else
-	init_thread_info.ttbr0 = (init_thread_union.thread_info);
+	init_thread_info.ttbr0 = virt_to_phys(empty_zero_page);
 #endif
 #endif
 
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1cd792db15ef..1eab79c9ac48 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -117,11 +117,10 @@
 	.set T1, REG_T1
 .endm
 
-#define K_BASE %r8
 #define HASH_PTR %r9
+#define BLOCKS_CTR %r8
 #define BUFFER_PTR %r10
 #define BUFFER_PTR2 %r13
-#define BUFFER_END %r11
 
 #define PRECALC_BUF %r14
 #define WK_BUF %r15
@@ -205,14 +204,14 @@
 		 * blended AVX2 and ALU instruction scheduling
 		 * 1 vector iteration per 8 rounds
 		 */
-		vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+		vmovdqu (i * 2)(BUFFER_PTR), W_TMP
 	.elseif ((i & 7) == 1)
-		vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
+		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
 			 WY_TMP, WY_TMP
 	.elseif ((i & 7) == 2)
 		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
 	.elseif ((i & 7) == 4)
-		vpaddd K_XMM(K_BASE), WY, WY_TMP
+		vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 	.elseif ((i & 7) == 7)
 		vmovdqu WY_TMP, PRECALC_WK(i&~7)
 
@@ -255,7 +254,7 @@
 		vpxor WY, WY_TMP, WY_TMP
 	.elseif ((i & 7) == 7)
 		vpxor WY_TMP2, WY_TMP, WY
-		vpaddd K_XMM(K_BASE), WY, WY_TMP
+		vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 		vmovdqu WY_TMP, PRECALC_WK(i&~7)
 
 		PRECALC_ROTATE_WY
@@ -291,7 +290,7 @@
 		vpsrld $30, WY, WY
 		vpor WY, WY_TMP, WY
 	.elseif ((i & 7) == 7)
-		vpaddd K_XMM(K_BASE), WY, WY_TMP
+		vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 		vmovdqu WY_TMP, PRECALC_WK(i&~7)
 
 		PRECALC_ROTATE_WY
@@ -446,6 +445,16 @@
 
 .endm
 
+/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
+ * %1 + %2 >= %3 ? %4 : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+	mov \a, RTA
+	add $\d, RTA
+	cmp $\c, \b
+	cmovge RTA, \a
+.endm
+
 /*
  * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
  */
@@ -463,13 +472,16 @@
 	lea (2*4*80+32)(%rsp), WK_BUF
 
 	# Precalc WK for first 2 blocks
-	PRECALC_OFFSET = 0
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
 	.set i, 0
 	.rept 160
 		PRECALC i
 		.set i, i + 1
 	.endr
-	PRECALC_OFFSET = 128
+
+	/* Go to next block if needed */
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
 	xchg WK_BUF, PRECALC_BUF
 
 	.align 32
@@ -479,8 +491,8 @@ _loop:
 	 * we use K_BASE value as a signal of a last block,
 	 * it is set below by: cmovae BUFFER_PTR, K_BASE
 	 */
-	cmp K_BASE, BUFFER_PTR
-	jne _begin
+	test BLOCKS_CTR, BLOCKS_CTR
+	jnz _begin
 	.align 32
 	jmp _end
 	.align 32
@@ -512,10 +524,10 @@ _loop0:
 		.set j, j+2
 	.endr
 
-	add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */
-	cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */
-	cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */
-
+	/* Update Counter */
+	sub $1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
 	/*
 	 * rounds
 	 * 60,62,64,66,68
@@ -532,8 +544,8 @@ _loop0:
 	UPDATE_HASH 12(HASH_PTR), D
 	UPDATE_HASH 16(HASH_PTR), E
 
-	cmp K_BASE, BUFFER_PTR /* is current block the last one? */
-	je _loop
+	test BLOCKS_CTR, BLOCKS_CTR
+	jz _loop
 
 	mov TB, B
 
@@ -575,10 +587,10 @@ _loop2:
 		.set j, j+2
 	.endr
 
-	add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */
-
-	cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */
-	cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */
+	/* update counter */
+	sub $1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
 
 	jmp _loop3
 _loop3:
@@ -641,19 +653,12 @@ _loop3:
 
 	avx2_zeroupper
 
-	lea K_XMM_AR(%rip), K_BASE
-
+	/* Setup initial values */
 	mov CTX, HASH_PTR
 	mov BUF, BUFFER_PTR
 
-	lea 64(BUF), BUFFER_PTR2
-
-	shl $6, CNT /* mul by 64 */
-	add BUF, CNT
-	add $64, CNT
-	mov CNT, BUFFER_END
-	cmp BUFFER_END, BUFFER_PTR2
-	cmovae K_BASE, BUFFER_PTR2
+	mov BUF, BUFFER_PTR2
+	mov CNT, BLOCKS_CTR
 
 	xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
 
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 7de207a11014..dd14616b7739 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
 
 static bool avx2_usable(void)
 {
-	if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
+	if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
 		&& boot_cpu_has(X86_FEATURE_BMI1)
 		&& boot_cpu_has(X86_FEATURE_BMI2))
 		return true;
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a55697d19824..cc0f2f5da19b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1190,6 +1190,8 @@ ENTRY(nmi)
 	 * other IST entries.
 	 */
 
+	ASM_CLAC
+
 	/* Use %rdx as our temp variable throughout */
 	pushq %rdx
 
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 07cf288b692e..bcd3d6199464 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -247,11 +247,11 @@ extern int force_personality32;
 
 /*
  * This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
  * space open for things that want to use the area for 32-bit pointers.
 */
 #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \
-						0x100000000UL)
+						(TASK_SIZE / 3 * 2))
 
 /* This yields a mask that user programs can use to figure out what
    instruction set this CPU supports. This could be done in user space,
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 8900400230c6..2cdae69d7e0b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -153,7 +153,7 @@ static void __intel_pmu_lbr_enable(bool pmi)
 	 */
 	if (cpuc->lbr_sel)
 		lbr_select = cpuc->lbr_sel->config;
-	if (!pmi)
+	if (!pmi && cpuc->lbr_sel)
 		wrmsrl(MSR_LBR_SELECT, lbr_select);
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
@@ -432,8 +432,10 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 	int out = 0;
 	int num = x86_pmu.lbr_nr;
 
-	if (cpuc->lbr_sel->config & LBR_CALL_STACK)
-		num = tos;
+	if (cpuc->lbr_sel) {
+		if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+			num = tos;
+	}
 
 	for (i = 0; i < num; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
