| author | Will Deacon <will.deacon@arm.com> | 2016-02-02 12:46:25 +0000 |
|---|---|---|
| committer | Jeff Vander Stoep <jeffv@google.com> | 2016-09-22 13:38:22 -0700 |
| commit | 297bf83fa4e7804c28c6d950f39dbd3860552cb9 (patch) | |
| tree | c6c026fa6f112973d0ca708c3b3b4276dceeab21 | |
| parent | b5924f376a221ffa0ee64a4410aff54416c2fd2d (diff) | |
UPSTREAM: arm64: lib: improve copy_page to deal with 128 bytes at a time
We want to avoid lots of different copy_page implementations, settling
for something that is "good enough" everywhere and hopefully easy to
understand and maintain whilst we're at it.
This patch reworks our copy_page implementation based on discussions
with Cavium on the list and benchmarking on Cortex-A processors so that:
- The loop is unrolled to copy 128 bytes per iteration
- The reads are offset so that we read from the next 128-byte block
in the same iteration that we store the previous block
- Explicit prefetch instructions are removed for now, since they hurt
performance on CPUs with hardware prefetching
- The loop exit condition is calculated at the start of the loop
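For readers who do not follow the assembly, the structure of the reworked loop can be modelled roughly in C. This is only an illustrative sketch, not the kernel code: the function name `copy_page_sketch`, the `BLOCK` constant and the hard-coded 4 KiB `PAGE_SIZE` are assumptions made for the example, and the real implementation keeps the in-flight 128-byte block in registers x2-x17 and uses non-temporal stores (stnp) rather than a stack buffer and memcpy.

```c
#include <stddef.h>
#include <string.h>

#define PAGE_SIZE 4096		/* assumed 4 KiB pages for the example */
#define BLOCK     128		/* bytes handled per loop iteration */

/*
 * Conceptual model of the pipelined copy: each iteration stores the
 * block loaded on the previous iteration and loads the next one, so
 * loads run ahead of stores. The final block is stored after the loop.
 */
static void copy_page_sketch(unsigned char *dst, const unsigned char *src)
{
	unsigned char buf[BLOCK];
	size_t remaining = PAGE_SIZE - BLOCK;	/* loop bound set up front */

	memcpy(buf, src, BLOCK);		/* prime the pipeline: load block 0 */
	src += BLOCK;

	while (remaining) {
		remaining -= BLOCK;
		memcpy(dst, buf, BLOCK);	/* store the previously loaded block */
		memcpy(buf, src, BLOCK);	/* load the next 128-byte block */
		dst += BLOCK;
		src += BLOCK;
	}

	memcpy(dst, buf, BLOCK);		/* drain: store the last block */
}
```

The point of this structure is that every store of block N is paired with the load of block N+1, so the loads stay one block ahead of the stores without explicit prefetch instructions.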
Signed-off-by: Will Deacon <will.deacon@arm.com>
Tested-by: Andrew Pinski <apinski@cavium.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Bug: 30369029
Patchset: kaslr-arm64-4.4
(cherry picked from commit 223e23e8aa26b0bb62c597637e77295e14f6a62c)
Signed-off-by: Jeff Vander Stoep <jeffv@google.com>
Change-Id: Icabd86bbecc60ad0d730ab796e33b8762cecb1fb
| -rw-r--r-- | arch/arm64/lib/copy_page.S | 46 |
1 file changed, 38 insertions, 8 deletions
```diff
diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S
index 512b9a7b980e..2534533ceb1d 100644
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -27,20 +27,50 @@
  *	x1 - src
  */
 ENTRY(copy_page)
-	/* Assume cache line size is 64 bytes. */
-	prfm	pldl1strm, [x1, #64]
-1:	ldp	x2, x3, [x1]
+	ldp	x2, x3, [x1]
 	ldp	x4, x5, [x1, #16]
 	ldp	x6, x7, [x1, #32]
 	ldp	x8, x9, [x1, #48]
-	add	x1, x1, #64
-	prfm	pldl1strm, [x1, #64]
+	ldp	x10, x11, [x1, #64]
+	ldp	x12, x13, [x1, #80]
+	ldp	x14, x15, [x1, #96]
+	ldp	x16, x17, [x1, #112]
+
+	mov	x18, #(PAGE_SIZE - 128)
+	add	x1, x1, #128
+1:
+	subs	x18, x18, #128
+
 	stnp	x2, x3, [x0]
+	ldp	x2, x3, [x1]
 	stnp	x4, x5, [x0, #16]
+	ldp	x4, x5, [x1, #16]
 	stnp	x6, x7, [x0, #32]
+	ldp	x6, x7, [x1, #32]
 	stnp	x8, x9, [x0, #48]
-	add	x0, x0, #64
-	tst	x1, #(PAGE_SIZE - 1)
-	b.ne	1b
+	ldp	x8, x9, [x1, #48]
+	stnp	x10, x11, [x0, #64]
+	ldp	x10, x11, [x1, #64]
+	stnp	x12, x13, [x0, #80]
+	ldp	x12, x13, [x1, #80]
+	stnp	x14, x15, [x0, #96]
+	ldp	x14, x15, [x1, #96]
+	stnp	x16, x17, [x0, #112]
+	ldp	x16, x17, [x1, #112]
+
+	add	x0, x0, #128
+	add	x1, x1, #128
+
+	b.gt	1b
+
+	stnp	x2, x3, [x0]
+	stnp	x4, x5, [x0, #16]
+	stnp	x6, x7, [x0, #32]
+	stnp	x8, x9, [x0, #48]
+	stnp	x10, x11, [x0, #64]
+	stnp	x12, x13, [x0, #80]
+	stnp	x14, x15, [x0, #96]
+	stnp	x16, x17, [x0, #112]
+
 	ret
 ENDPROC(copy_page)
```
