summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/gpu/drm/msm/adreno/a5xx_gpu.c62
-rw-r--r--drivers/gpu/drm/msm/adreno/adreno_gpu.c1
-rw-r--r--drivers/gpu/drm/msm/msm_gem.h2
-rw-r--r--drivers/gpu/drm/msm/msm_gem_submit.c10
-rw-r--r--include/uapi/drm/msm_drm.h11
5 files changed, 86 insertions, 0 deletions
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index 32b2c7fab839..9ceef8f437b5 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -133,10 +133,30 @@ static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
OUT_PKT7(ring, CP_YIELD_ENABLE, 1);
OUT_RING(ring, 0x02);
+ /* Record the always on counter before command execution */
+ if (submit->profile_buf_iova) {
+ uint64_t gpuaddr = submit->profile_buf_iova +
+ offsetof(struct drm_msm_gem_submit_profile_buffer,
+ ticks_submitted);
+
+ /*
+ * Set bit[30] to make this command a 64-bit write operation.
+ * Bits[18-29] specify the number of consecutive registers to
+ * copy; set that field to 2, since we want to copy data from
+ * both REG_A5XX_RBBM_ALWAYSON_COUNTER_LO and _HI.
+ */
+ OUT_PKT7(ring, CP_REG_TO_MEM, 3);
+ OUT_RING(ring, REG_A5XX_RBBM_ALWAYSON_COUNTER_LO |
+ (1 << 30) | (2 << 18));
+ OUT_RING(ring, lower_32_bits(gpuaddr));
+ OUT_RING(ring, upper_32_bits(gpuaddr));
+ }
+
/* Submit the commands */
for (i = 0; i < submit->nr_cmds; i++) {
switch (submit->cmd[i].type) {
case MSM_SUBMIT_CMD_IB_TARGET_BUF:
+ case MSM_SUBMIT_CMD_PROFILE_BUF:
break;
case MSM_SUBMIT_CMD_BUF:
OUT_PKT7(ring, CP_INDIRECT_BUFFER_PFE, 3);
@@ -164,6 +184,19 @@ static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
OUT_PKT7(ring, CP_YIELD_ENABLE, 1);
OUT_RING(ring, 0x01);
+ /* Record the always on counter after command execution */
+ if (submit->profile_buf_iova) {
+ uint64_t gpuaddr = submit->profile_buf_iova +
+ offsetof(struct drm_msm_gem_submit_profile_buffer,
+ ticks_retired);
+
+ OUT_PKT7(ring, CP_REG_TO_MEM, 3);
+ OUT_RING(ring, REG_A5XX_RBBM_ALWAYSON_COUNTER_LO |
+ (1 << 30) | (2 << 18));
+ OUT_RING(ring, lower_32_bits(gpuaddr));
+ OUT_RING(ring, upper_32_bits(gpuaddr));
+ }
+
/* Write the fence to the scratch register */
OUT_PKT4(ring, REG_A5XX_CP_SCRATCH_REG(2), 1);
OUT_RING(ring, submit->fence);
@@ -193,6 +226,35 @@ static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
/* Set bit 0 to trigger an interrupt on preempt complete */
OUT_RING(ring, 0x01);
+ if (submit->profile_buf_iova) {
+ unsigned long flags;
+ uint64_t ktime;
+ struct drm_msm_gem_submit_profile_buffer *profile_buf =
+ submit->profile_buf_vaddr;
+
+ /*
+ * With this profiling, we are trying to create the closest
+ * possible mapping between the CPU time domain (raw monotonic
+ * clock) and the GPU time domain (ticks). To achieve this, we
+ * briefly turn off local interrupts so that no interrupt handler
+ * runs between collecting these two samples.
+ */
+ local_irq_save(flags);
+
+ profile_buf->ticks_queued = gpu_read64(gpu,
+ REG_A5XX_RBBM_ALWAYSON_COUNTER_LO,
+ REG_A5XX_RBBM_ALWAYSON_COUNTER_HI);
+
+ ktime = ktime_get_raw_ns();
+
+ local_irq_restore(flags);
+
+ do_div(ktime, NSEC_PER_SEC);
+
+ profile_buf->queue_time = ktime;
+ profile_buf->submit_time = ktime;
+ }
+
a5xx_flush(gpu, ring);
/* Check to see if we need to start preemption */
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index 969ed810ce9d..19267b2a3b49 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -183,6 +183,7 @@ int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
case MSM_SUBMIT_CMD_IB_TARGET_BUF:
/* ignore IB-targets */
break;
+ case MSM_SUBMIT_CMD_PROFILE_BUF:
case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
break;
case MSM_SUBMIT_CMD_BUF:
diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h
index ac46c473791f..2045dc34c20a 100644
--- a/drivers/gpu/drm/msm/msm_gem.h
+++ b/drivers/gpu/drm/msm/msm_gem.h
@@ -125,6 +125,8 @@ struct msm_gem_submit {
uint32_t fence;
int ring;
bool valid;
+ uint64_t profile_buf_iova;
+ void *profile_buf_vaddr;
unsigned int nr_cmds;
unsigned int nr_bos;
struct {
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index 0566cefaae81..52fc81420690 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -48,6 +48,9 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
submit->nr_bos = 0;
submit->nr_cmds = 0;
+ submit->profile_buf_vaddr = NULL;
+ submit->profile_buf_iova = 0;
+
INIT_LIST_HEAD(&submit->bo_list);
ww_acquire_init(&submit->ticket, &reservation_ww_class);
}
@@ -393,6 +396,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
case MSM_SUBMIT_CMD_BUF:
case MSM_SUBMIT_CMD_IB_TARGET_BUF:
case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
+ case MSM_SUBMIT_CMD_PROFILE_BUF:
break;
default:
DRM_ERROR("invalid type: %08x\n", submit_cmd.type);
@@ -425,6 +429,12 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
submit->cmd[i].iova = iova + submit_cmd.submit_offset;
submit->cmd[i].idx = submit_cmd.submit_idx;
+ if (submit_cmd.type == MSM_SUBMIT_CMD_PROFILE_BUF) {
+ submit->profile_buf_iova = submit->cmd[i].iova;
+ submit->profile_buf_vaddr =
+ msm_gem_vaddr_locked(&msm_obj->base);
+ }
+
if (submit->valid)
continue;
diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
index 8baf2bf6df2e..c974714a9abe 100644
--- a/include/uapi/drm/msm_drm.h
+++ b/include/uapi/drm/msm_drm.h
@@ -152,10 +152,13 @@ struct drm_msm_gem_submit_reloc {
* this buffer in the first-level ringbuffer
* CTX_RESTORE_BUF - only executed if there has been a GPU context
* switch since the last SUBMIT ioctl
+ * PROFILE_BUF - A profiling buffer written to by both GPU and CPU.
*/
#define MSM_SUBMIT_CMD_BUF 0x0001
#define MSM_SUBMIT_CMD_IB_TARGET_BUF 0x0002
#define MSM_SUBMIT_CMD_CTX_RESTORE_BUF 0x0003
+#define MSM_SUBMIT_CMD_PROFILE_BUF 0x0004
+
struct drm_msm_gem_submit_cmd {
__u32 type; /* in, one of MSM_SUBMIT_CMD_x */
__u32 submit_idx; /* in, index of submit_bo cmdstream buffer */
@@ -207,6 +210,14 @@ struct drm_msm_gem_submit {
__u64 __user cmds; /* in, ptr to array of submit_cmd's */
};
+struct drm_msm_gem_submit_profile_buffer {
+ __s64 queue_time; /* out, Ringbuffer queue time (seconds) */
+ __s64 submit_time; /* out, Ringbuffer submission time (seconds) */
+ __u64 ticks_queued; /* out, GPU ticks at ringbuffer submission */
+ __u64 ticks_submitted; /* out, GPU ticks before cmdstream execution */
+ __u64 ticks_retired; /* out, GPU ticks after cmdstream execution */
+};
+
/* The normal way to synchronize with the GPU is just to CPU_PREP on
* a buffer if you need to access it from the CPU (other cmdstream
* submission from same or other contexts, PAGE_FLIP ioctl, etc, all