/* Copyright (c) 2016-2017 The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 */

#include "msm_gpu.h"
#include "msm_gem.h"
#include "a5xx_gpu.h"
#include "msm_snapshot_api.h"

#define A5XX_NR_SHADER_BANKS 4

/*
 * This is a list of the registers that need to be read through the HLSQ
 * aperture by the crashdumper. These are not normally accessible from the
 * CPU on a secure platform.
 */
static const struct {
	u32 type;
	u32 regoffset;
	u32 count;
} a5xx_hlsq_aperture_regs[] = {
	{ 0x35, 0xE00, 0x32 },   /* HLSQ non-context */
	{ 0x31, 0x2080, 0x1 },   /* HLSQ 2D context 0 */
	{ 0x33, 0x2480, 0x1 },   /* HLSQ 2D context 1 */
	{ 0x32, 0xE780, 0x62 },  /* HLSQ 3D context 0 */
	{ 0x34, 0xEF80, 0x62 },  /* HLSQ 3D context 1 */
	{ 0x3f, 0x0EC0, 0x40 },  /* SP non-context */
	{ 0x3d, 0x2040, 0x1 },   /* SP 2D context 0 */
	{ 0x3b, 0x2440, 0x1 },   /* SP 2D context 1 */
	{ 0x3e, 0xE580, 0x180 }, /* SP 3D context 0 */
	{ 0x3c, 0xED80, 0x180 }, /* SP 3D context 1 */
	{ 0x3a, 0x0F00, 0x1c },  /* TP non-context */
	{ 0x38, 0x2000, 0xa },   /* TP 2D context 0 */
	{ 0x36, 0x2400, 0xa },   /* TP 2D context 1 */
	{ 0x39, 0xE700, 0x80 },  /* TP 3D context 0 */
	{ 0x37, 0xEF00, 0x80 },  /* TP 3D context 1 */
};
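/*
 * Each entry above is consumed by _crashdump_read_hlsq_aperture() below:
 * 'type' is written into the HLSQ_DBG_READ_SEL statetype field and 'count'
 * dwords are then read back through the AHB read aperture, while
 * 'regoffset' records where the group of registers nominally lives so the
 * snapshot section can label it. For example, the first entry selects
 * statetype 0x35 (HLSQ non-context) and reads 0x32 dwords.
 */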
/*
 * The debugbus registers contain device state that presumably makes sense to
 * the hardware designers. 'count' is the number of indexes to read; each
 * index produces one 64 bit value.
 */
static const struct {
	enum a5xx_debugbus id;
	u32 count;
} a5xx_debugbus_blocks[] = {
	{ A5XX_RBBM_DBGBUS_CP, 0x100, },
	{ A5XX_RBBM_DBGBUS_RBBM, 0x100, },
	{ A5XX_RBBM_DBGBUS_HLSQ, 0x100, },
	{ A5XX_RBBM_DBGBUS_UCHE, 0x100, },
	{ A5XX_RBBM_DBGBUS_DPM, 0x100, },
	{ A5XX_RBBM_DBGBUS_TESS, 0x100, },
	{ A5XX_RBBM_DBGBUS_PC, 0x100, },
	{ A5XX_RBBM_DBGBUS_VFDP, 0x100, },
	{ A5XX_RBBM_DBGBUS_VPC, 0x100, },
	{ A5XX_RBBM_DBGBUS_TSE, 0x100, },
	{ A5XX_RBBM_DBGBUS_RAS, 0x100, },
	{ A5XX_RBBM_DBGBUS_VSC, 0x100, },
	{ A5XX_RBBM_DBGBUS_COM, 0x100, },
	{ A5XX_RBBM_DBGBUS_DCOM, 0x100, },
	{ A5XX_RBBM_DBGBUS_LRZ, 0x100, },
	{ A5XX_RBBM_DBGBUS_A2D_DSP, 0x100, },
	{ A5XX_RBBM_DBGBUS_CCUFCHE, 0x100, },
	{ A5XX_RBBM_DBGBUS_GPMU, 0x100, },
	{ A5XX_RBBM_DBGBUS_RBP, 0x100, },
	{ A5XX_RBBM_DBGBUS_HM, 0x100, },
	{ A5XX_RBBM_DBGBUS_RBBM_CFG, 0x100, },
	{ A5XX_RBBM_DBGBUS_VBIF_CX, 0x100, },
	{ A5XX_RBBM_DBGBUS_GPC, 0x100, },
	{ A5XX_RBBM_DBGBUS_LARC, 0x100, },
	{ A5XX_RBBM_DBGBUS_HLSQ_SPTP, 0x100, },
	{ A5XX_RBBM_DBGBUS_RB_0, 0x100, },
	{ A5XX_RBBM_DBGBUS_RB_1, 0x100, },
	{ A5XX_RBBM_DBGBUS_RB_2, 0x100, },
	{ A5XX_RBBM_DBGBUS_RB_3, 0x100, },
	{ A5XX_RBBM_DBGBUS_CCU_0, 0x100, },
	{ A5XX_RBBM_DBGBUS_CCU_1, 0x100, },
	{ A5XX_RBBM_DBGBUS_CCU_2, 0x100, },
	{ A5XX_RBBM_DBGBUS_CCU_3, 0x100, },
	{ A5XX_RBBM_DBGBUS_A2D_RAS_0, 0x100, },
	{ A5XX_RBBM_DBGBUS_A2D_RAS_1, 0x100, },
	{ A5XX_RBBM_DBGBUS_A2D_RAS_2, 0x100, },
	{ A5XX_RBBM_DBGBUS_A2D_RAS_3, 0x100, },
	{ A5XX_RBBM_DBGBUS_VFD_0, 0x100, },
	{ A5XX_RBBM_DBGBUS_VFD_1, 0x100, },
	{ A5XX_RBBM_DBGBUS_VFD_2, 0x100, },
	{ A5XX_RBBM_DBGBUS_VFD_3, 0x100, },
	{ A5XX_RBBM_DBGBUS_SP_0, 0x100, },
	{ A5XX_RBBM_DBGBUS_SP_1, 0x100, },
	{ A5XX_RBBM_DBGBUS_SP_2, 0x100, },
	{ A5XX_RBBM_DBGBUS_SP_3, 0x100, },
	{ A5XX_RBBM_DBGBUS_TPL1_0, 0x100, },
	{ A5XX_RBBM_DBGBUS_TPL1_1, 0x100, },
	{ A5XX_RBBM_DBGBUS_TPL1_2, 0x100, },
	{ A5XX_RBBM_DBGBUS_TPL1_3, 0x100, },
};

/*
 * The shader blocks are read from the HLSQ aperture - each one has its own
 * identifier for the aperture read
 */
static const struct {
	enum a5xx_shader_blocks id;
	u32 size;
} a5xx_shader_blocks[] = {
	{ A5XX_TP_W_MEMOBJ, 0x200 },
	{ A5XX_TP_W_MIPMAP_BASE, 0x3C0 },
	{ A5XX_TP_W_SAMPLER_TAG, 0x40 },
	{ A5XX_TP_S_3D_SAMPLER, 0x80 },
	{ A5XX_TP_S_3D_SAMPLER_TAG, 0x20 },
	{ A5XX_TP_S_CS_SAMPLER, 0x40 },
	{ A5XX_TP_S_CS_SAMPLER_TAG, 0x10 },
	{ A5XX_SP_W_CONST, 0x800 },
	{ A5XX_SP_W_CB_SIZE, 0x30 },
	{ A5XX_SP_W_CB_BASE, 0xF0 },
	{ A5XX_SP_W_STATE, 0x1 },
	{ A5XX_SP_S_3D_CONST, 0x800 },
	{ A5XX_SP_S_3D_CB_SIZE, 0x28 },
	{ A5XX_SP_S_3D_UAV_SIZE, 0x80 },
	{ A5XX_SP_S_CS_CONST, 0x400 },
	{ A5XX_SP_S_CS_CB_SIZE, 0x8 },
	{ A5XX_SP_S_CS_UAV_SIZE, 0x80 },
	{ A5XX_SP_S_3D_CONST_DIRTY, 0x12 },
	{ A5XX_SP_S_3D_CB_SIZE_DIRTY, 0x1 },
	{ A5XX_SP_S_3D_UAV_SIZE_DIRTY, 0x2 },
	{ A5XX_SP_S_CS_CONST_DIRTY, 0xA },
	{ A5XX_SP_S_CS_CB_SIZE_DIRTY, 0x1 },
	{ A5XX_SP_S_CS_UAV_SIZE_DIRTY, 0x2 },
	{ A5XX_HLSQ_ICB_DIRTY, 0xB },
	{ A5XX_SP_POWER_RESTORE_RAM_TAG, 0xA },
	{ A5XX_TP_POWER_RESTORE_RAM_TAG, 0xA },
	{ A5XX_TP_W_SAMPLER, 0x80 },
	{ A5XX_TP_W_MEMOBJ_TAG, 0x40 },
	{ A5XX_TP_S_3D_MEMOBJ, 0x200 },
	{ A5XX_TP_S_3D_MEMOBJ_TAG, 0x20 },
	{ A5XX_TP_S_CS_MEMOBJ, 0x100 },
	{ A5XX_TP_S_CS_MEMOBJ_TAG, 0x10 },
	{ A5XX_SP_W_INSTR, 0x800 },
	{ A5XX_SP_W_UAV_SIZE, 0x80 },
	{ A5XX_SP_W_UAV_BASE, 0x80 },
	{ A5XX_SP_W_INST_TAG, 0x40 },
	{ A5XX_SP_S_3D_INSTR, 0x800 },
	{ A5XX_SP_S_3D_CB_BASE, 0xC8 },
	{ A5XX_SP_S_3D_UAV_BASE, 0x80 },
	{ A5XX_SP_S_CS_INSTR, 0x400 },
	{ A5XX_SP_S_CS_CB_BASE, 0x28 },
	{ A5XX_SP_S_CS_UAV_BASE, 0x80 },
	{ A5XX_SP_S_3D_INSTR_DIRTY, 0x1 },
	{ A5XX_SP_S_3D_CB_BASE_DIRTY, 0x5 },
	{ A5XX_SP_S_3D_UAV_BASE_DIRTY, 0x2 },
	{ A5XX_SP_S_CS_INSTR_DIRTY, 0x1 },
	{ A5XX_SP_S_CS_CB_BASE_DIRTY, 0x1 },
	{ A5XX_SP_S_CS_UAV_BASE_DIRTY, 0x2 },
	{ A5XX_HLSQ_ICB, 0x200 },
	{ A5XX_HLSQ_ICB_CB_BASE_DIRTY, 0x4 },
	{ A5XX_SP_POWER_RESTORE_RAM, 0x140 },
	{ A5XX_TP_POWER_RESTORE_RAM, 0x40 },
};

/*
 * The A5XX architecture has a built-in engine to asynchronously dump
 * registers from the GPU. It is used to accelerate the copy of hundreds
 * (thousands) of registers and as a safe way to access registers that might
 * have secure data in them (if the GPU is in secure mode, the crashdumper
 * returns bogus values for those registers). On a fully secured device the
 * CPU will be blocked from accessing those registers directly, so the
 * crashdumper is the only way that we can access context registers and the
 * shader banks for debug purposes.
 *
 * The downside of the crashdumper is that it requires access to GPU
 * accessible memory (so the VBIF, the bus and the SMMU need to be up and
 * working) and enough memory to hold both the crashdumper script and the
 * data being dumped, so there is a balancing act between the work to set up
 * a crashdumper and the value we get out of it.
 */

/*
 * The crashdumper uses a pseudo-script format to read and write registers.
 * Each operation is two 64 bit values:
 *
 * READ:
 *  [qword 0] [63:00] - The absolute IOVA address target for the register
 *                      values
 *  [qword 1] [63:44] - The dword address of the register offset to read
 *            [15:00] - Number of dwords to read at once
 *
 * WRITE:
 *  [qword 0] [31:00] - 32 bit value to write to the register
 *  [qword 1] [63:44] - The dword address of the register offset to write
 *               [21] - Set to 1 to trigger a write
 *            [15:00] - Number of dwords to write (usually 1)
 *
 * At the bottom of the script, write two quadwords of zeros to mark the end.
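 *
 * For example (illustrative values only, not captured from real hardware):
 * a READ entry that copies 0x10 dwords starting at dword register offset
 * 0x800 into a buffer at IOVA 0x10001000 would be encoded as:
 *
 *   ptr[0] = 0x10001000;
 *   ptr[1] = ((u64) 0x800 << 44) | 0x10;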
 */
struct crashdump {
	struct drm_gem_object *bo;
	void *ptr;
	u64 iova;
	u32 index;
};

#define CRASHDUMP_BO_SIZE (SZ_1M)
#define CRASHDUMP_SCRIPT_SIZE (256 * SZ_1K)
#define CRASHDUMP_DATA_SIZE (CRASHDUMP_BO_SIZE - CRASHDUMP_SCRIPT_SIZE)

static int crashdump_init(struct msm_gpu *gpu, struct crashdump *crashdump)
{
	int ret = 0;

	crashdump->ptr = msm_gem_kernel_new_locked(gpu->dev,
		CRASHDUMP_BO_SIZE, MSM_BO_UNCACHED, gpu->aspace,
		&crashdump->bo, &crashdump->iova);

	if (IS_ERR(crashdump->ptr)) {
		ret = PTR_ERR(crashdump->ptr);
		crashdump->ptr = NULL;
	}

	return ret;
}

static int crashdump_run(struct msm_gpu *gpu, struct crashdump *crashdump)
{
	if (!crashdump->ptr || !crashdump->index)
		return -EINVAL;

	gpu_write(gpu, REG_A5XX_CP_CRASH_SCRIPT_BASE_LO,
		lower_32_bits(crashdump->iova));
	gpu_write(gpu, REG_A5XX_CP_CRASH_SCRIPT_BASE_HI,
		upper_32_bits(crashdump->iova));

	gpu_write(gpu, REG_A5XX_CP_CRASH_DUMP_CNTL, 1);

	return spin_until(gpu_read(gpu, REG_A5XX_CP_CRASH_DUMP_CNTL) & 0x04);
}

static void crashdump_destroy(struct msm_gpu *gpu, struct crashdump *crashdump)
{
	if (!crashdump->bo)
		return;

	if (crashdump->iova)
		msm_gem_put_iova(crashdump->bo, gpu->aspace);

	drm_gem_object_unreference(crashdump->bo);

	memset(crashdump, 0, sizeof(*crashdump));
}

static inline void CRASHDUMP_SCRIPT_WRITE(struct crashdump *crashdump,
		u32 reg, u32 val)
{
	u64 *ptr = crashdump->ptr + crashdump->index;

	if (WARN_ON(crashdump->index + (2 * sizeof(u64)) >=
		CRASHDUMP_SCRIPT_SIZE))
		return;

	/* This is the value to write */
	ptr[0] = (u64) val;

	/*
	 * This triggers a write to the specified register. 1 is the size of
	 * the write in dwords
	 */
	ptr[1] = (((u64) reg) << 44) | (1 << 21) | 1;

	crashdump->index += 2 * sizeof(u64);
}

static inline void CRASHDUMP_SCRIPT_READ(struct crashdump *crashdump,
		u32 reg, u32 count, u32 offset)
{
	u64 *ptr = crashdump->ptr + crashdump->index;

	if (WARN_ON(crashdump->index + (2 * sizeof(u64)) >=
		CRASHDUMP_SCRIPT_SIZE))
		return;

	if (WARN_ON(offset + (count * sizeof(u32)) >= CRASHDUMP_DATA_SIZE))
		return;

	ptr[0] = (u64) crashdump->iova + CRASHDUMP_SCRIPT_SIZE + offset;
	ptr[1] = (((u64) reg) << 44) | count;

	crashdump->index += 2 * sizeof(u64);
}

static inline void *CRASHDUMP_DATA_PTR(struct crashdump *crashdump, u32 offset)
{
	if (WARN_ON(!crashdump->ptr || offset >= CRASHDUMP_DATA_SIZE))
		return NULL;

	return crashdump->ptr + CRASHDUMP_SCRIPT_SIZE + offset;
}

static inline u32 CRASHDUMP_DATA_READ(struct crashdump *crashdump, u32 offset)
{
	return *((u32 *) CRASHDUMP_DATA_PTR(crashdump, offset));
}

static inline void CRASHDUMP_RESET(struct crashdump *crashdump)
{
	crashdump->index = 0;
}

static inline void CRASHDUMP_END(struct crashdump *crashdump)
{
	u64 *ptr = crashdump->ptr + crashdump->index;

	if (WARN_ON((crashdump->index + (2 * sizeof(u64))) >=
		CRASHDUMP_SCRIPT_SIZE))
		return;

	ptr[0] = 0;
	ptr[1] = 0;

	crashdump->index += 2 * sizeof(u64);
}

static u32 _crashdump_read_hlsq_aperture(struct crashdump *crashdump,
		u32 offset, u32 statetype, u32 bank, u32 count)
{
	CRASHDUMP_SCRIPT_WRITE(crashdump, REG_A5XX_HLSQ_DBG_READ_SEL,
		A5XX_HLSQ_DBG_READ_SEL_STATETYPE(statetype) | bank);

	CRASHDUMP_SCRIPT_READ(crashdump, REG_A5XX_HLSQ_DBG_AHB_READ_APERTURE,
		count, offset);

	return count * sizeof(u32);
}
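/*
 * Typical crashdumper usage, as seen in the snapshot functions below:
 * CRASHDUMP_RESET() rewinds the script, a series of CRASHDUMP_SCRIPT_WRITE()
 * and CRASHDUMP_SCRIPT_READ() calls builds the script, CRASHDUMP_END()
 * terminates it, and crashdump_run() points the hardware at it and waits for
 * completion. Results are then pulled out of the data region with
 * CRASHDUMP_DATA_PTR() / CRASHDUMP_DATA_READ().
 */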
static u32 _copy_registers(struct msm_snapshot *snapshot,
		struct crashdump *crashdump, u32 reg, u32 count, u32 offset)
{
	int i;
	u32 *ptr = (u32 *) (crashdump->ptr + CRASHDUMP_SCRIPT_SIZE + offset);

	/*
	 * Write the offset of the first register of the group and the number
	 * of registers in the group
	 */
	SNAPSHOT_WRITE_U32(snapshot, ((count << 16) | reg));

	/* Followed by each register value in the group */
	for (i = 0; i < count; i++)
		SNAPSHOT_WRITE_U32(snapshot, ptr[i]);

	return count * sizeof(u32);
}

/*
 * Return the number of registers in each register group from
 * adreno_gpu->registers
 */
static inline u32 REG_COUNT(const unsigned int *ptr)
{
	return (ptr[1] - ptr[0]) + 1;
}

/*
 * Capture what registers we can from the CPU in case the crashdumper is
 * unavailable or broken. This will omit the SP, TP and HLSQ registers, but
 * you'll get everything else and that ain't bad
 */
static void a5xx_snapshot_registers_cpu(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot)
{
	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
	struct msm_snapshot_regs header;
	u32 regcount = 0, groups = 0;
	int i;

	/*
	 * Before we write the section we need to figure out how big our data
	 * section will be
	 */
	for (i = 0; adreno_gpu->registers[i] != ~0; i += 2) {
		regcount += REG_COUNT(&(adreno_gpu->registers[i]));
		groups++;
	}

	header.count = groups;

	/*
	 * We need one dword for each group and then one dword for each
	 * register value in that group
	 */
	if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_REGS_V2,
		regcount + groups))
		return;

	for (i = 0; adreno_gpu->registers[i] != ~0; i += 2) {
		u32 count = REG_COUNT(&(adreno_gpu->registers[i]));
		u32 reg = adreno_gpu->registers[i];
		int j;

		/* Write the offset and count for the group */
		SNAPSHOT_WRITE_U32(snapshot, (count << 16) | reg);

		/* Write each value in the group */
		for (j = 0; j < count; j++)
			SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, reg++));
	}
}
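/*
 * Both the CPU path above and the crashdumper path below emit the same
 * packed REGS_V2 layout: each group is a ((count << 16) | reg) header dword
 * followed by 'count' register values. For example (illustrative numbers),
 * a group of 3 registers starting at dword offset 0x100 is emitted as
 * 0x00030100 followed by the three values.
 */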
static void a5xx_snapshot_registers(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot)
{
	struct msm_snapshot_regs header;
	struct crashdump *crashdump = snapshot->priv;
	u32 offset = 0, regcount = 0, groups = 0;
	int i;

	/*
	 * First snapshot all the registers that we can from the CPU. Do this
	 * because the crashdumper has a tendency to "taint" the value of some
	 * of the registers (because the GPU implements the crashdumper) so we
	 * only want to use the crashdump facility if we have to
	 */
	a5xx_snapshot_registers_cpu(gpu, snapshot);

	if (!crashdump)
		return;

	CRASHDUMP_RESET(crashdump);

	/* HLSQ and context registers behind the aperture */
	for (i = 0; i < ARRAY_SIZE(a5xx_hlsq_aperture_regs); i++) {
		u32 count = a5xx_hlsq_aperture_regs[i].count;

		offset += _crashdump_read_hlsq_aperture(crashdump, offset,
			a5xx_hlsq_aperture_regs[i].type, 0, count);
		regcount += count;
		groups++;
	}

	CRASHDUMP_END(crashdump);

	if (crashdump_run(gpu, crashdump))
		return;

	header.count = groups;

	/*
	 * The size of the data will be one dword for each "group" of
	 * registers, and then one dword for each of the registers in that
	 * group
	 */
	if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_REGS_V2,
		groups + regcount))
		return;

	/* Copy the registers to the snapshot */
	for (i = 0; i < ARRAY_SIZE(a5xx_hlsq_aperture_regs); i++)
		offset += _copy_registers(snapshot, crashdump,
			a5xx_hlsq_aperture_regs[i].regoffset,
			a5xx_hlsq_aperture_regs[i].count, offset);
}

static void _a5xx_snapshot_shader_bank(struct msm_snapshot *snapshot,
		struct crashdump *crashdump, u32 block, u32 bank,
		u32 size, u32 offset)
{
	void *src;
	struct msm_snapshot_shader header = {
		.type = block,
		.index = bank,
		.size = size,
	};

	if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_SHADER, size))
		return;

	src = CRASHDUMP_DATA_PTR(crashdump, offset);

	if (src)
		SNAPSHOT_MEMCPY(snapshot, src, size * sizeof(u32));
}

static void a5xx_snapshot_shader_memory(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot)
{
	struct crashdump *crashdump = snapshot->priv;
	u32 offset = 0;
	int i;

	/* We can only get shader memory through the crashdumper */
	if (!crashdump)
		return;

	CRASHDUMP_RESET(crashdump);

	/* For each shader block */
	for (i = 0; i < ARRAY_SIZE(a5xx_shader_blocks); i++) {
		int j;

		/* For each block, dump 4 banks */
		for (j = 0; j < A5XX_NR_SHADER_BANKS; j++)
			offset += _crashdump_read_hlsq_aperture(crashdump,
				offset, a5xx_shader_blocks[i].id, j,
				a5xx_shader_blocks[i].size);
	}

	CRASHDUMP_END(crashdump);

	/* If the crashdumper fails we can't get shader memory any other way */
	if (crashdump_run(gpu, crashdump))
		return;

	/* Each bank of each shader gets its own snapshot section */
	for (offset = 0, i = 0; i < ARRAY_SIZE(a5xx_shader_blocks); i++) {
		int j;

		for (j = 0; j < A5XX_NR_SHADER_BANKS; j++) {
			_a5xx_snapshot_shader_bank(snapshot, crashdump,
				a5xx_shader_blocks[i].id, j,
				a5xx_shader_blocks[i].size, offset);
			offset += a5xx_shader_blocks[i].size * sizeof(u32);
		}
	}
}
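/*
 * VBIF test bus data sizes: 16 dwords for each of the two AXI arbiter
 * blocks, plus 18 dwords (test bus 2) and 12 dwords (test bus 1) for each
 * of the four XIN blocks, so VBIF_DATA_SIZE works out to
 * (16 * 2) + (18 * 4) + (12 * 4) = 152 dwords.
 */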
#define A5XX_NUM_AXI_ARB_BLOCKS 2
#define A5XX_NUM_XIN_BLOCKS 4

#define VBIF_DATA_SIZE ((16 * A5XX_NUM_AXI_ARB_BLOCKS) + \
	(18 * A5XX_NUM_XIN_BLOCKS) + (12 * A5XX_NUM_XIN_BLOCKS))

static void a5xx_snapshot_debugbus_vbif(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot)
{
	int i;
	struct msm_snapshot_debugbus header = {
		.id = A5XX_RBBM_DBGBUS_VBIF,
		.count = VBIF_DATA_SIZE,
	};

	if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUGBUS,
		VBIF_DATA_SIZE))
		return;

	gpu_rmw(gpu, REG_A5XX_VBIF_CLKON, A5XX_VBIF_CLKON_FORCE_ON_TESTBUS,
		A5XX_VBIF_CLKON_FORCE_ON_TESTBUS);

	gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS1_CTRL0, 0);
	gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS_OUT_CTRL,
		A5XX_VBIF_TEST_BUS_OUT_CTRL_TEST_BUS_CTRL_EN);

	for (i = 0; i < A5XX_NUM_AXI_ARB_BLOCKS; i++) {
		int j;

		gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL0, 1 << (i + 16));

		for (j = 0; j < 16; j++) {
			gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL1,
				A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL(j));
			SNAPSHOT_WRITE_U32(snapshot,
				gpu_read(gpu, REG_A5XX_VBIF_TEST_BUS_OUT));
		}
	}

	for (i = 0; i < A5XX_NUM_XIN_BLOCKS; i++) {
		int j;

		gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL0, 1 << i);

		for (j = 0; j < 18; j++) {
			gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL1,
				A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL(j));
			SNAPSHOT_WRITE_U32(snapshot,
				gpu_read(gpu, REG_A5XX_VBIF_TEST_BUS_OUT));
		}
	}

	for (i = 0; i < A5XX_NUM_XIN_BLOCKS; i++) {
		int j;

		gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS1_CTRL0, 1 << i);

		for (j = 0; j < 12; j++) {
			gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS1_CTRL1,
				A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL(j));
			SNAPSHOT_WRITE_U32(snapshot,
				gpu_read(gpu, REG_A5XX_VBIF_TEST_BUS_OUT));
		}
	}
}

static void a5xx_snapshot_debugbus_block(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot, u32 block, u32 count)
{
	int i;
	struct msm_snapshot_debugbus header = {
		.id = block,
		.count = count * 2, /* Each value is 2 dwords */
	};

	if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUGBUS,
		(count * 2)))
		return;

	for (i = 0; i < count; i++) {
		u32 reg = A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX(i) |
			A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL(block);

		gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_A, reg);
		gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_B, reg);
		gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_C, reg);
		gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_D, reg);

		/* Each debugbus entry is a quad word */
		SNAPSHOT_WRITE_U32(snapshot,
			gpu_read(gpu, REG_A5XX_RBBM_CFG_DBGBUS_TRACE_BUF2));
		SNAPSHOT_WRITE_U32(snapshot,
			gpu_read(gpu, REG_A5XX_RBBM_CFG_DBGBUS_TRACE_BUF1));
	}
}

static void a5xx_snapshot_debugbus(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot)
{
	int i;

	gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_CNTLM,
		A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE(0xF));

	for (i = 0; i < ARRAY_SIZE(a5xx_debugbus_blocks); i++)
		a5xx_snapshot_debugbus_block(gpu, snapshot,
			a5xx_debugbus_blocks[i].id,
			a5xx_debugbus_blocks[i].count);

	/* VBIF is special and not in a good way */
	a5xx_snapshot_debugbus_vbif(gpu, snapshot);
}

static void a5xx_snapshot_cp_merciu(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot)
{
	unsigned int i;
	struct msm_snapshot_debug header = {
		.type = SNAPSHOT_DEBUG_CP_MERCIU,
		.size = 64 << 1, /* Data size is 2 dwords per entry */
	};

	if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUG,
		64 << 1))
		return;

	gpu_write(gpu, REG_A5XX_CP_MERCIU_DBG_ADDR, 0);

	for (i = 0; i < 64; i++) {
		SNAPSHOT_WRITE_U32(snapshot,
			gpu_read(gpu, REG_A5XX_CP_MERCIU_DBG_DATA_1));
		SNAPSHOT_WRITE_U32(snapshot,
			gpu_read(gpu, REG_A5XX_CP_MERCIU_DBG_DATA_2));
	}
}

static void a5xx_snapshot_cp_roq(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot)
{
	int i;
	struct msm_snapshot_debug header = {
		.type = SNAPSHOT_DEBUG_CP_ROQ,
		.size = 512,
	};

	if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUG, 512))
		return;

	gpu_write(gpu, REG_A5XX_CP_ROQ_DBG_ADDR, 0);
	for (i = 0; i < 512; i++)
		SNAPSHOT_WRITE_U32(snapshot,
			gpu_read(gpu, REG_A5XX_CP_ROQ_DBG_DATA));
}

static void a5xx_snapshot_cp_meq(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot)
{
	int i;
	struct msm_snapshot_debug header = {
		.type = SNAPSHOT_DEBUG_CP_MEQ,
		.size = 64,
	};

	if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUG, 64))
		return;

	gpu_write(gpu, REG_A5XX_CP_MEQ_DBG_ADDR, 0);
	for (i = 0; i < 64; i++)
		SNAPSHOT_WRITE_U32(snapshot,
			gpu_read(gpu, REG_A5XX_CP_MEQ_DBG_DATA));
}
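/*
 * Indexed registers are read through an (address, data) register pair:
 * write the index to the address register and then read the corresponding
 * value back from the data register, once per index.
 */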
static void a5xx_snapshot_indexed_registers(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot, u32 addr, u32 data, u32 count)
{
	unsigned int i;
	struct msm_snapshot_indexed_regs header = {
		.index_reg = addr,
		.data_reg = data,
		.start = 0,
		.count = count,
	};

	if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_INDEXED_REGS,
		count))
		return;

	for (i = 0; i < count; i++) {
		gpu_write(gpu, addr, i);
		SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, data));
	}
}

static void a5xx_snapshot_preemption(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot)
{
	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
	struct msm_snapshot_gpu_object header = {
		.type = SNAPSHOT_GPU_OBJECT_GLOBAL,
		.size = A5XX_PREEMPT_RECORD_SIZE >> 2,
		.pt_base = 0,
	};
	int index;

	if (gpu->nr_rings <= 1)
		return;

	for (index = 0; index < gpu->nr_rings; index++) {
		header.gpuaddr = a5xx_gpu->preempt_iova[index];

		if (!SNAPSHOT_HEADER(snapshot, header,
			SNAPSHOT_SECTION_GPU_OBJECT_V2,
			A5XX_PREEMPT_RECORD_SIZE >> 2))
			return;

		SNAPSHOT_MEMCPY(snapshot, a5xx_gpu->preempt[index],
			A5XX_PREEMPT_RECORD_SIZE);
	}
}

int a5xx_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot)
{
	struct crashdump crashdump = { 0 };

	if (!crashdump_init(gpu, &crashdump))
		snapshot->priv = &crashdump;

	/* To accurately read all registers, disable hardware clock gating */
	a5xx_set_hwcg(gpu, false);

	/* Kick it up to the generic level */
	adreno_snapshot(gpu, snapshot);

	/* Read the GPU registers */
	a5xx_snapshot_registers(gpu, snapshot);

	/* Read the shader memory banks */
	a5xx_snapshot_shader_memory(gpu, snapshot);

	/* Read the debugbus registers */
	a5xx_snapshot_debugbus(gpu, snapshot);

	/* PFP data */
	a5xx_snapshot_indexed_registers(gpu, snapshot,
		REG_A5XX_CP_PFP_STAT_ADDR, REG_A5XX_CP_PFP_STAT_DATA, 36);

	/* ME data */
	a5xx_snapshot_indexed_registers(gpu, snapshot,
		REG_A5XX_CP_ME_STAT_ADDR, REG_A5XX_CP_ME_STAT_DATA, 29);

	/* DRAW_STATE data */
	a5xx_snapshot_indexed_registers(gpu, snapshot,
		REG_A5XX_CP_DRAW_STATE_ADDR, REG_A5XX_CP_DRAW_STATE_DATA,
		256);

	/* ME cache */
	a5xx_snapshot_indexed_registers(gpu, snapshot,
		REG_A5XX_CP_ME_UCODE_DBG_ADDR, REG_A5XX_CP_ME_UCODE_DBG_DATA,
		0x53F);

	/* PFP cache */
	a5xx_snapshot_indexed_registers(gpu, snapshot,
		REG_A5XX_CP_PFP_UCODE_DBG_ADDR,
		REG_A5XX_CP_PFP_UCODE_DBG_DATA, 0x53F);

	/* ME queue */
	a5xx_snapshot_cp_meq(gpu, snapshot);

	/* CP ROQ */
	a5xx_snapshot_cp_roq(gpu, snapshot);

	/* CP MERCIU */
	a5xx_snapshot_cp_merciu(gpu, snapshot);

	/* Preemption records */
	a5xx_snapshot_preemption(gpu, snapshot);

	crashdump_destroy(gpu, &crashdump);
	snapshot->priv = NULL;

	/* Re-enable HWCG */
	a5xx_set_hwcg(gpu, true);

	return 0;
}