From 7e43022e8c875587b4ffe447ef84eae0fffb657b Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Tue, 25 Jun 2019 17:19:11 +0200
Subject: [PATCH] radv/gfx10: add gfx10_cs_emit_cache_flush

The cache flush logic on GFX10 is quite different and it's implemented
with a new function.

Signed-off-by: Samuel Pitoiset
Reviewed-by: Bas Nieuwenhuizen
---
 src/amd/vulkan/si_cmd_buffer.c | 175 ++++++++++++++++++++++++++++++++-
 1 file changed, 174 insertions(+), 1 deletion(-)

diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index 91a1db4fc84..7a2099d7d1a 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -823,6 +823,172 @@ si_emit_acquire_mem(struct radeon_cmdbuf *cs,
 	}
 }
 
+static void
+gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
+			  enum chip_class chip_class,
+			  uint32_t *flush_cnt,
+			  uint64_t flush_va,
+			  bool is_mec,
+			  enum radv_cmd_flush_bits flush_bits,
+			  uint64_t gfx9_eop_bug_va)
+{
+	uint32_t gcr_cntl = 0;
+	unsigned cb_db_event = 0;
+
+	/* We don't need these. */
+	assert(!(flush_bits & (RADV_CMD_FLAG_VGT_FLUSH |
+			       RADV_CMD_FLAG_VGT_STREAMOUT_SYNC)));
+
+	if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
+		gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
+	if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) {
+		/* TODO: When writing to the SMEM L1 cache, we need to set SEQ
+		 * to FORWARD when both L1 and L2 are written out (WB or INV).
+		 */
+		gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
+	}
+	if (flush_bits & RADV_CMD_FLAG_INV_VCACHE)
+		gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
+	if (flush_bits & RADV_CMD_FLAG_INV_L2) {
+		/* Writeback and invalidate everything in L2. */
+		gcr_cntl |= S_586_GL2_INV(1) | S_586_GLM_INV(1);
+	} else if (flush_bits & RADV_CMD_FLAG_WB_L2) {
+		/* Writeback but do not invalidate. */
+		gcr_cntl |= S_586_GL2_WB(1);
+	}
+
+	/* TODO: Implement this new flag for GFX9+.
+	if (flush_bits & RADV_CMD_FLAG_INV_L2_METADATA)
+		gcr_cntl |= S_586_GLM_INV(1);
+	*/
+
+	if (flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB)) {
+		/* TODO: trigger on RADV_CMD_FLAG_FLUSH_AND_INV_CB_META */
+		if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
+			/* Flush CMASK/FMASK/DCC. Will wait for idle later. */
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+			radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) |
+					EVENT_INDEX(0));
+		}
+
+		/* TODO: trigger on RADV_CMD_FLAG_FLUSH_AND_INV_DB_META ? */
+		if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
+			/* Flush HTILE. Will wait for idle later. */
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+			radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) |
+					EVENT_INDEX(0));
+		}
+
+		/* First flush CB/DB, then L1/L2. */
+		gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);
+
+		if ((flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB)) ==
+		    (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB)) {
+			cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+		} else if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
+			cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+		} else if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
+			cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
+		} else {
+			assert(0);
+		}
+	} else {
+		/* Wait for graphics shaders to go idle if requested. */
+		if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+			radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+		} else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+			radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+		}
+	}
+
+	if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+	}
+
+	if (cb_db_event) {
+		/* CB/DB flush and invalidate (or possibly just a wait for a
+		 * meta flush) via RELEASE_MEM.
+		 *
+		 * Combine this with other cache flushes when possible; this
+		 * requires affected shaders to be idle, so do it after the
+		 * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always
+		 * implied).
+		 */
+		/* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
+		unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
+		unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
+		unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
+		unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
+		assert(G_586_GL2_US(gcr_cntl) == 0);
+		assert(G_586_GL2_RANGE(gcr_cntl) == 0);
+		assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
+		unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
+		unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
+		unsigned gcr_seq = G_586_SEQ(gcr_cntl);
+
+		gcr_cntl &= C_586_GLM_WB &
+			    C_586_GLM_INV &
+			    C_586_GLV_INV &
+			    C_586_GL1_INV &
+			    C_586_GL2_INV &
+			    C_586_GL2_WB; /* keep SEQ */
+
+		assert(flush_cnt);
+		(*flush_cnt)++;
+
+		si_cs_emit_write_event_eop(cs, chip_class, false, cb_db_event,
+					   S_490_GLM_WB(glm_wb) |
+					   S_490_GLM_INV(glm_inv) |
+					   S_490_GLV_INV(glv_inv) |
+					   S_490_GL1_INV(gl1_inv) |
+					   S_490_GL2_INV(gl2_inv) |
+					   S_490_GL2_WB(gl2_wb) |
+					   S_490_SEQ(gcr_seq),
+					   EOP_DATA_SEL_VALUE_32BIT,
+					   flush_va, *flush_cnt,
+					   gfx9_eop_bug_va);
+
+		radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, flush_va,
+				 *flush_cnt, 0xffffffff);
+	}
+
+	/* Ignore fields that only modify the behavior of other fields. */
+	if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
+		/* Flush caches and wait for the caches to assert idle.
+		 * The cache flush is executed in the ME, but the PFP waits
+		 * for completion.
+		 */
+		radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
+		radeon_emit(cs, 0);		/* CP_COHER_CNTL */
+		radeon_emit(cs, 0xffffffff);	/* CP_COHER_SIZE */
+		radeon_emit(cs, 0xffffff);	/* CP_COHER_SIZE_HI */
+		radeon_emit(cs, 0);		/* CP_COHER_BASE */
+		radeon_emit(cs, 0);		/* CP_COHER_BASE_HI */
+		radeon_emit(cs, 0x0000000A);	/* POLL_INTERVAL */
+		radeon_emit(cs, gcr_cntl);	/* GCR_CNTL */
+	} else if (cb_db_event ||
+		   (flush_bits & (RADV_CMD_FLAG_VS_PARTIAL_FLUSH |
+				  RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+				  RADV_CMD_FLAG_CS_PARTIAL_FLUSH))) {
+		/* We need to ensure that PFP waits as well. */
+		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+		radeon_emit(cs, 0);
+	}
+
+	if (flush_bits & RADV_CMD_FLAG_START_PIPELINE_STATS) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
+				EVENT_INDEX(0));
+	} else if (flush_bits & RADV_CMD_FLAG_STOP_PIPELINE_STATS) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) |
+				EVENT_INDEX(0));
+	}
+}
+
 void
 si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
 		       enum chip_class chip_class,
@@ -835,7 +1001,14 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
 	unsigned cp_coher_cntl = 0;
 	uint32_t flush_cb_db = flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
 					     RADV_CMD_FLAG_FLUSH_AND_INV_DB);
-
+
+	if (chip_class >= GFX10) {
+		/* GFX10 cache flush handling is quite different. */
+		gfx10_cs_emit_cache_flush(cs, chip_class, flush_cnt, flush_va,
+					  is_mec, flush_bits, gfx9_eop_bug_va);
+		return;
+	}
+
 	if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
 		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
 	if (flush_bits & RADV_CMD_FLAG_INV_SCACHE)