author    | Deepak Kumar <dkumar@codeaurora.org>     | 2016-04-29 09:16:51 -0600
committer | Jeevan Shriram <jshriram@codeaurora.org> | 2016-05-05 15:05:55 -0700
commit    | 658f4b0454659b6a190a3e34a81b2a81d3341afd
tree      | a3263d7bd69b11fb205a425258686f7c9a21ee67 /drivers/gpu/msm/kgsl_sharedmem.c
parent    | 436e7913d3e30609cbd04cfaac4d69c7b7f05838
msm: kgsl: Use a page pool to reduce allocation time
Use a mempool to reduce the allocation time. When memory allocated
from pages is freed, put it in a page pool. At allocation time, try
to allocate from the page pool before getting pages from the system.
Make sure that the pool does not grow too big by enforcing a
maximum limit.
Change-Id: Icac7fb4355ee1fd07e7127ea5c2721665e279272
Signed-off-by: Deepak Kumar <dkumar@codeaurora.org>
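The sketch below illustrates the general pattern the commit message describes: keep freed pages on a capped free list and satisfy new requests from that list before falling back to the system allocator. It is a simplified, userspace-only illustration of the idea, not the kgsl_pool code added by this series; the names (demo_pool, demo_pool_get_page, demo_pool_put_page) and the cap value are invented for the example.

    /*
     * Simplified, userspace-only illustration of a capped page pool.
     * Not the kgsl_pool implementation; names and sizes are made up.
     */
    #include <stdio.h>
    #include <stdlib.h>

    #define DEMO_PAGE_SIZE  4096
    #define DEMO_POOL_MAX   64      /* cap so the pool cannot grow unbounded */

    struct demo_pool {
            void *pages[DEMO_POOL_MAX];
            int count;
    };

    /* Allocation path: try the pool first, fall back to the system allocator. */
    static void *demo_pool_get_page(struct demo_pool *pool)
    {
            if (pool->count > 0)
                    return pool->pages[--pool->count];   /* fast path: reuse a pooled page */

            return aligned_alloc(DEMO_PAGE_SIZE, DEMO_PAGE_SIZE);   /* slow path */
    }

    /* Free path: return the page to the pool unless the cap has been reached. */
    static void demo_pool_put_page(struct demo_pool *pool, void *page)
    {
            if (pool->count < DEMO_POOL_MAX)
                    pool->pages[pool->count++] = page;
            else
                    free(page);                          /* pool full: hand it back to the system */
    }

    int main(void)
    {
            struct demo_pool pool = { .count = 0 };

            void *a = demo_pool_get_page(&pool);         /* comes from the system allocator */
            demo_pool_put_page(&pool, a);                /* goes into the pool */
            void *b = demo_pool_get_page(&pool);         /* reused from the pool, no allocator call */

            printf("reused pooled page: %s\n", (a == b) ? "yes" : "no");
            demo_pool_put_page(&pool, b);

            /* drain the pool before exit */
            while (pool.count > 0)
                    free(pool.pages[--pool.count]);

            return 0;
    }

The real driver deals in struct page pointers and multiple page orders, but the allocate-from-pool-first / free-to-pool-with-a-cap structure is the same basic idea.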
Diffstat (limited to 'drivers/gpu/msm/kgsl_sharedmem.c')
-rw-r--r-- | drivers/gpu/msm/kgsl_sharedmem.c | 171
1 file changed, 77 insertions(+), 94 deletions(-)
diff --git a/drivers/gpu/msm/kgsl_sharedmem.c b/drivers/gpu/msm/kgsl_sharedmem.c
index 893e31852ccb..cc5e921ce920 100644
--- a/drivers/gpu/msm/kgsl_sharedmem.c
+++ b/drivers/gpu/msm/kgsl_sharedmem.c
@@ -27,6 +27,7 @@
 #include "kgsl_device.h"
 #include "kgsl_log.h"
 #include "kgsl_mmu.h"
+#include "kgsl_pool.h"

 /*
  * The user can set this from debugfs to force failed memory allocations to
@@ -424,9 +425,6 @@ done:

 static void kgsl_page_alloc_free(struct kgsl_memdesc *memdesc)
 {
-        unsigned int i = 0;
-        struct scatterlist *sg;
-
         kgsl_page_alloc_unmap_kernel(memdesc);
         /* we certainly do not expect the hostptr to still be mapped */
         BUG_ON(memdesc->hostptr);
@@ -451,28 +449,15 @@ static void kgsl_page_alloc_free(struct kgsl_memdesc *memdesc)
                 atomic_long_sub(memdesc->size, &kgsl_driver.stats.page_alloc);
         }

-        for_each_sg(memdesc->sgt->sgl, sg, memdesc->sgt->nents, i) {
-                /*
-                 * sg_alloc_table_from_pages() will collapse any physically
-                 * adjacent pages into a single scatterlist entry. We cannot
-                 * just call __free_pages() on the entire set since we cannot
-                 * ensure that the size is a whole order. Instead, free each
-                 * page or compound page group individually.
-                 */
-                struct page *p = sg_page(sg), *next;
-                unsigned int j = 0, count;
-                while (j < (sg->length/PAGE_SIZE)) {
-                        if (memdesc->priv & KGSL_MEMDESC_TZ_LOCKED)
-                                ClearPagePrivate(p);
-
-                        count = 1 << compound_order(p);
-                        next = nth_page(p, count);
-                        __free_pages(p, compound_order(p));
-                        p = next;
-                        j += count;
+        if (memdesc->priv & KGSL_MEMDESC_TZ_LOCKED) {
+                struct sg_page_iter sg_iter;

-                }
+                for_each_sg_page(memdesc->sgt->sgl, &sg_iter,
+                                memdesc->sgt->nents, 0)
+                        ClearPagePrivate(sg_page_iter_page(&sg_iter));
         }
+
+        kgsl_pool_free_sgt(memdesc->sgt);
 }

 /*
@@ -681,19 +666,67 @@ static inline int get_page_size(size_t size, unsigned int align)
 }
 #endif

+static void kgsl_zero_pages(struct page **pages, unsigned int pcount)
+{
+        unsigned int j;
+        unsigned int step = ((VMALLOC_END - VMALLOC_START)/8) >> PAGE_SHIFT;
+        pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL);
+        void *ptr;
+
+        /*
+         * All memory that goes to the user has to be zeroed out before it gets
+         * exposed to userspace. This means that the memory has to be mapped in
+         * the kernel, zeroed (memset) and then unmapped. This also means that
+         * the dcache has to be flushed to ensure coherency between the kernel
+         * and user pages. We used to pass __GFP_ZERO to alloc_page which mapped
+         * zeroed and unmaped each individual page, and then we had to turn
+         * around and call flush_dcache_page() on that page to clear the caches.
+         * This was killing us for performance. Instead, we found it is much
+         * faster to allocate the pages without GFP_ZERO, map a chunk of the
+         * range ('step' pages), memset it, flush it and then unmap
+         * - this results in a factor of 4 improvement for speed for large
+         * buffers. There is a small decrease in speed for small buffers,
+         * but only on the order of a few microseconds at best. The 'step'
+         * size is based on a guess at the amount of free vmalloc space, but
+         * will scale down if there's not enough free space.
+         */
+        for (j = 0; j < pcount; j += step) {
+                step = min(step, pcount - j);
+
+                ptr = vmap(&pages[j], step, VM_IOREMAP, page_prot);
+
+                if (ptr != NULL) {
+                        memset(ptr, 0, step * PAGE_SIZE);
+                        dmac_flush_range(ptr, ptr + step * PAGE_SIZE);
+                        vunmap(ptr);
+                } else {
+                        int k;
+                        /* Very, very, very slow path */
+
+                        for (k = j; k < j + step; k++) {
+                                ptr = kmap_atomic(pages[k]);
+                                memset(ptr, 0, PAGE_SIZE);
+                                dmac_flush_range(ptr, ptr + PAGE_SIZE);
+                                kunmap_atomic(ptr);
+                        }
+                        /* scale down the step size to avoid this path */
+                        if (step > 1)
+                                step >>= 1;
+                }
+        }
+}
+
 static int
 kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc,
                         struct kgsl_pagetable *pagetable,
                         uint64_t size)
 {
         int ret = 0;
-        unsigned int j, pcount = 0, page_size, len_alloc;
+        unsigned int j, page_size, len_alloc;
+        unsigned int pcount = 0;
         size_t len;
         struct page **pages = NULL;
-        pgprot_t page_prot = pgprot_writecombine(PAGE_KERNEL);
-        void *ptr;
         unsigned int align;
-        unsigned int step = ((VMALLOC_END - VMALLOC_START)/8) >> PAGE_SHIFT;

         size = PAGE_ALIGN(size);
         if (size == 0 || size > UINT_MAX)
@@ -740,30 +773,16 @@ kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc,

         len = size;
         while (len > 0) {
-                struct page *page;
-                gfp_t gfp_mask = __GFP_HIGHMEM;
-                int j;
+                int page_count;

                 /* don't waste space at the end of the allocation*/
                 if (len < page_size)
                         page_size = PAGE_SIZE;

-                /*
-                 * Don't do some of the more aggressive memory recovery
-                 * techniques for large order allocations
-                 */
-                if (page_size != PAGE_SIZE) {
-                        gfp_mask |= __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN;
-                        gfp_mask &= ~__GFP_RECLAIM;
-                } else
-                        gfp_mask |= GFP_KERNEL;
-
-                if (sharedmem_noretry_flag == true)
-                        gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
+                page_count = kgsl_pool_alloc_page(page_size,
+                                        pages + pcount, len_alloc - pcount);

-                page = alloc_pages(gfp_mask, get_order(page_size));
-
-                if (page == NULL) {
+                if (page_count <= 0) {
                         if (page_size != PAGE_SIZE) {
                                 page_size = PAGE_SIZE;
                                 continue;
@@ -785,9 +804,7 @@ kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc,
                         goto done;
                 }

-                for (j = 0; j < page_size >> PAGE_SHIFT; j++)
-                        pages[pcount++] = nth_page(page, j);
-
+                pcount += page_count;
                 len -= page_size;
                 memdesc->size += page_size;
         }
@@ -824,57 +841,23 @@ kgsl_sharedmem_page_alloc_user(struct kgsl_memdesc *memdesc,
                 goto done;
         }

-        /*
-         * All memory that goes to the user has to be zeroed out before it gets
-         * exposed to userspace. This means that the memory has to be mapped in
-         * the kernel, zeroed (memset) and then unmapped. This also means that
-         * the dcache has to be flushed to ensure coherency between the kernel
-         * and user pages. We used to pass __GFP_ZERO to alloc_page which mapped
-         * zeroed and unmaped each individual page, and then we had to turn
-         * around and call flush_dcache_page() on that page to clear the caches.
-         * This was killing us for performance. Instead, we found it is much
-         * faster to allocate the pages without GFP_ZERO, map a chunk of the
-         * range ('step' pages), memset it, flush it and then unmap
-         * - this results in a factor of 4 improvement for speed for large
-         * buffers. There is a small decrease in speed for small buffers,
-         * but only on the order of a few microseconds at best. The 'step'
-         * size is based on a guess at the amount of free vmalloc space, but
-         * will scale down if there's not enough free space.
-         */
-        for (j = 0; j < pcount; j += step) {
-                step = min(step, pcount - j);
-
-                ptr = vmap(&pages[j], step, VM_IOREMAP, page_prot);
-
-                if (ptr != NULL) {
-                        memset(ptr, 0, step * PAGE_SIZE);
-                        dmac_flush_range(ptr, ptr + step * PAGE_SIZE);
-                        vunmap(ptr);
-                } else {
-                        int k;
-                        /* Very, very, very slow path */
-
-                        for (k = j; k < j + step; k++) {
-                                ptr = kmap_atomic(pages[k]);
-                                memset(ptr, 0, PAGE_SIZE);
-                                dmac_flush_range(ptr, ptr + PAGE_SIZE);
-                                kunmap_atomic(ptr);
-                        }
-                        /* scale down the step size to avoid this path */
-                        if (step > 1)
-                                step >>= 1;
-                }
-        }
-
         KGSL_STATS_ADD(memdesc->size, &kgsl_driver.stats.page_alloc,
                 &kgsl_driver.stats.page_alloc_max);

+        /*
+         * Zero out the pages.
+         */
+        kgsl_zero_pages(pages, pcount);
+
 done:
         if (ret) {
-                unsigned int count = 1;
-                for (j = 0; j < pcount; j += count) {
-                        count = 1 << compound_order(pages[j]);
-                        __free_pages(pages[j], compound_order(pages[j]));
+                if (pages) {
+                        unsigned int count = 1;
+
+                        for (j = 0; j < pcount; j += count) {
+                                count = 1 << compound_order(pages[j]);
+                                kgsl_pool_free_page(pages[j]);
+                        }
                 }

                 kfree(memdesc->sgt);
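The long comment moved into kgsl_zero_pages() above explains the zeroing strategy: map a chunk of 'step' pages at once, memset and flush the chunk, and halve 'step' whenever the chunk mapping fails so later iterations can still take the fast path. The standalone sketch below models only that control flow in plain userspace C so the step-scaling logic is easier to follow; the names (zero_pages, map_chunk_ok), the initial step, and the simulated failure threshold are invented for the illustration and do not come from the driver.

    /*
     * Userspace model of the chunk-then-fallback zeroing loop: clear pages
     * in chunks of 'step', and halve 'step' whenever a chunk "mapping"
     * fails. In the driver the failure is vmap() returning NULL when
     * vmalloc space is scarce; here it is simulated by an arbitrary limit.
     */
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096u

    /* Stand-in for vmap() failing: pretend mappings larger than 8 pages fail. */
    static bool map_chunk_ok(unsigned int step)
    {
            return step <= 8;
    }

    static void zero_pages(unsigned char pages[][PAGE_SIZE], unsigned int pcount)
    {
            unsigned int step = 32;   /* initial guess, like the vmalloc-based guess */
            unsigned int j;

            for (j = 0; j < pcount; j += step) {
                    if (step > pcount - j)
                            step = pcount - j;

                    printf("clearing %u page(s) starting at page %u\n", step, j);

                    if (map_chunk_ok(step)) {
                            /* fast path: clear the whole chunk at once */
                            memset(pages[j], 0, (size_t)step * PAGE_SIZE);
                    } else {
                            unsigned int k;

                            /* slow path: clear page by page ... */
                            for (k = j; k < j + step; k++)
                                    memset(pages[k], 0, PAGE_SIZE);

                            /* ... and shrink the step so later chunks can use the fast path */
                            if (step > 1)
                                    step >>= 1;
                    }
            }
    }

    int main(void)
    {
            static unsigned char pages[40][PAGE_SIZE];

            zero_pages(pages, 40);
            return 0;
    }

As in the kernel code, the step is only ever shrunk, never grown back, so a single shortage of mapping space makes the rest of the buffer proceed in smaller, safer chunks.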