From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: duggasco
Date: Mon, 18 May 2026 20:00:00 +0000
Subject: [PATCH] drm/amdgpu/gfx10: BC-250 40 CU unlock via CC + SPI register writes

Re-enable the harvested CUs so that all 40 CUs are usable on the AMD
BC-250 (gfx1013 / Cyan Skillfish / salvaged PS5 APU) by writing three
hardware registers during CU enumeration in gfx_v10_0_get_cu_info():

1. CC_GC_SHADER_ARRAY_CONFIG — clears the harvest enumeration mask so
   the driver, RADV, and KFD see all 40 CUs.

2. SPI_PG_ENABLE_STATIC_WGP_MASK — enables the SPI (Shader Processor
   Input) to dispatch wavefronts to all 5 WGPs per shader array.
   Without this, CC alone changes reporting but SPI still dispatches
   to only 3 WGPs (24 CUs).

3. RLC_PG_ALWAYS_ON_WGP_MASK — keeps all WGPs powered.

Both the CC and SPI writes are required — neither alone produces
compute scaling.

Controlled via module parameter amdgpu.bc250_cc_write_mode
(default 0 = off, 3 = enable all). Guarded by PCI device ID 0x13FE.

Verified: pp512 302 tok/s (24 CU) -> 466 tok/s (40 CU) = 1.54x at 2GHz.
At 1500MHz/900mV: 230 -> 372 tok/s = 1.61x with sustainable thermals.

4-state A/B test confirmed neither register alone has any effect:

  CC=0 SPI=0x07 (stock):    302 tok/s
  CC=0 SPI=0x1F (SPI only): 302 tok/s (no gain)
  CC=3 SPI=0x07 (CC only):  302 tok/s (no gain)
  CC=3 SPI=0x1F (both):     466 tok/s (1.54x)

Reference: https://github.com/duggasco/bc250-40cu-unlock

Signed-off-by: duggasco
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 57 ++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -26,6 +26,18 @@
 #include <linux/firmware.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+
+/*
+ * BC-250 40 CU unlock: clears the harvest mask and enables SPI dispatch to
+ * all WGPs.  Probe modes (1, 4) write CC and SPI, log the read-back and
+ * restore the original values; clear modes (2, 3) leave the new values in
+ * place.  Modes 1 and 2 touch only SE0/SH0, modes 3 and 4 every shader array.
+ */
+static int bc250_cc_write_mode;
+module_param(bc250_cc_write_mode, int, 0444);
+MODULE_PARM_DESC(bc250_cc_write_mode,
+	"BC-250: 0=off 1=probe-SE0SH0 2=clear-SE0SH0 3=clear-all-SAs 4=probe-all-SAs");
+#define BC250_PCI_DEVICE_ID 0x13FE
 #include "amdgpu.h"
 #include "amdgpu_gfx.h"
 #include "amdgpu_psp.h"
@@ -10127,6 +10139,51 @@
 	amdgpu_gfx_parse_disable_cu(disable_masks, 4, 2);
 
 	mutex_lock(&adev->grbm_idx_mutex);
+
+	/*
+	 * BC-250: unlock harvested CUs -- CC (enumeration) + SPI (dispatch) +
+	 * RLC (power).  Placed after mutex_lock(&adev->grbm_idx_mutex); the
+	 * SE/SH selection is reset to 0xffffffff once the loop is done.
+	 */
+	if (bc250_cc_write_mode > 0 && adev->pdev->device == BC250_PCI_DEVICE_ID) {
+		/* Modes 1 and 4 only test writability and must leave no trace. */
+		bool bc_probe = bc250_cc_write_mode == 1 || bc250_cc_write_mode == 4;
+		/* Modes 1 and 2 are documented as SE0/SH0 only. */
+		bool bc_first_sa_only = bc250_cc_write_mode == 1 || bc250_cc_write_mode == 2;
+		int bc_se, bc_sh;
+
+		for (bc_se = 0; bc_se < adev->gfx.config.max_shader_engines; bc_se++) {
+			for (bc_sh = 0; bc_sh < adev->gfx.config.max_sh_per_se; bc_sh++) {
+				u32 bc_cc_orig, bc_cc_after, bc_spi_orig, bc_spi_after;
+
+				if (bc_first_sa_only && (bc_se > 0 || bc_sh > 0))
+					continue;
+				gfx_v10_0_select_se_sh(adev, bc_se, bc_sh, 0xffffffff, 0);
+				bc_cc_orig = RREG32_SOC15(GC, 0, mmCC_GC_SHADER_ARRAY_CONFIG);
+				WREG32_SOC15(GC, 0, mmCC_GC_SHADER_ARRAY_CONFIG, 0);
+				bc_cc_after = RREG32_SOC15(GC, 0, mmCC_GC_SHADER_ARRAY_CONFIG);
+				bc_spi_orig = RREG32_SOC15(GC, 0, mmSPI_PG_ENABLE_STATIC_WGP_MASK);
+				WREG32_SOC15(GC, 0, mmSPI_PG_ENABLE_STATIC_WGP_MASK, 0x1f);
+				bc_spi_after = RREG32_SOC15(GC, 0, mmSPI_PG_ENABLE_STATIC_WGP_MASK);
+				if (bc_probe) {
+					/* Put back CC and SPI; RLC is never written when probing. */
+					WREG32_SOC15(GC, 0, mmCC_GC_SHADER_ARRAY_CONFIG, bc_cc_orig);
+					WREG32_SOC15(GC, 0, mmSPI_PG_ENABLE_STATIC_WGP_MASK, bc_spi_orig);
+					dev_info(adev->dev,
+						"bc250-40cu-probe: se=%d sh=%d CC=0x%08x->0x%08x SPI=0x%08x->0x%08x (restored)\n",
+						bc_se, bc_sh, bc_cc_orig, bc_cc_after, bc_spi_orig, bc_spi_after);
+				} else {
+					WREG32_SOC15(GC, 0, mmRLC_PG_ALWAYS_ON_WGP_MASK, 0x1f);
+					dev_info(adev->dev,
+						"bc250-40cu-enable: mode=%d se=%d sh=%d CC=0x%08x->0x%08x SPI=0x%08x->0x%08x\n",
+						bc250_cc_write_mode, bc_se, bc_sh,
+						bc_cc_orig, bc_cc_after, bc_spi_orig, bc_spi_after);
+				}
+			}
+		}
+		gfx_v10_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, 0);
+	}
+
 	for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
 		for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
 			bitmap = i * adev->gfx.config.max_sh_per_se + j;
-- 
2.53.0