From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: duggasco <duggasco@gmail.com>
Date: Mon, 18 May 2026 20:00:00 +0000
Subject: [PATCH] drm/amdgpu/gfx10: BC-250 40 CU unlock via CC + SPI register
 writes

Expose all 40 CUs (24 active + 16 harvested) on the AMD BC-250 (gfx1013 /
Cyan Skillfish / salvaged PS5 APU) by writing three hardware registers
during CU enumeration in gfx_v10_0_get_cu_info():

1. CC_GC_SHADER_ARRAY_CONFIG — clears the harvest enumeration mask so
   the driver, RADV, and KFD see all 40 CUs.
2. SPI_PG_ENABLE_STATIC_WGP_MASK — enables the SPI (Shader Processor
   Input) to dispatch wavefronts to all 5 WGPs per shader array.
   Without this, CC alone changes reporting but SPI still dispatches
   to only 3 WGPs (24 CUs).
3. RLC_PG_ALWAYS_ON_WGP_MASK — keeps all WGPs powered.

Both the CC and SPI writes are required — neither alone produces compute
scaling. Controlled via module parameter amdgpu.bc250_cc_write_mode
(0 = off [default], 1/4 = probe only, 2 = enable SE0/SH0 only, 3 = enable
all shader arrays). Guarded by PCI device ID 0x13FE.

Verified: pp512 302 tok/s (24 CU) -> 466 tok/s (40 CU) = 1.54x at 2GHz.
At 1500MHz/900mV: 230 -> 372 tok/s = 1.61x with sustainable thermals.

4-state A/B test confirmed neither register alone has any effect:
  CC=0 SPI=0x07 (stock):    302 tok/s
  CC=0 SPI=0x1F (SPI only): 302 tok/s (no gain)
  CC=3 SPI=0x07 (CC only):  302 tok/s (no gain)
  CC=3 SPI=0x1F (both):     466 tok/s (1.54x)

Reference: https://github.com/duggasco/bc250-40cu-unlock

Signed-off-by: duggasco <duggasco@gmail.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 46 ++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -26,6 +26,13 @@
 #include <linux/firmware.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+
+/* BC-250 40 CU unlock mode; values <= 0 (default 0) skip the CC/SPI/RLC register writes. */
+static int bc250_cc_write_mode;
+module_param(bc250_cc_write_mode, int, 0444);
+MODULE_PARM_DESC(bc250_cc_write_mode,
+	"BC-250: 0=off 1=probe-SE0SH0 2=clear-SE0SH0 3=clear-all-SAs 4=probe-all-SAs");
+#define BC250_PCI_DEVICE_ID 0x13FE	/* unlock runs only when adev->pdev->device matches */
 #include "amdgpu.h"
 #include "amdgpu_gfx.h"
 #include "amdgpu_psp.h"
@@ -10127,6 +10134,45 @@
 	amdgpu_gfx_parse_disable_cu(disable_masks, 4, 2);

 	mutex_lock(&adev->grbm_idx_mutex);
+
+	/* BC-250: unlock harvested CUs -- CC (enumeration) + SPI (dispatch) + RLC (power).
+	 * Modes 1/2 touch only SE0/SH0; probe modes 1/4 write, read back, then restore.
+	 */
+	if (bc250_cc_write_mode > 0 && adev->pdev->device == BC250_PCI_DEVICE_ID) {
+		int bc_se, bc_sh;
+		for (bc_se = 0; bc_se < adev->gfx.config.max_shader_engines; bc_se++) {
+			for (bc_sh = 0; bc_sh < adev->gfx.config.max_sh_per_se; bc_sh++) {
+				u32 bc_cc_orig, bc_cc_after, bc_spi_orig, bc_spi_after, bc_rlc_orig;
+				/* Modes 1 and 2 (mode > 0 here) are limited to SE0/SH0. */
+				if (bc250_cc_write_mode <= 2 && (bc_se > 0 || bc_sh > 0))
+					continue;
+				gfx_v10_0_select_se_sh(adev, bc_se, bc_sh, 0xffffffff, 0);
+				bc_cc_orig = RREG32_SOC15(GC, 0, mmCC_GC_SHADER_ARRAY_CONFIG);
+				WREG32_SOC15(GC, 0, mmCC_GC_SHADER_ARRAY_CONFIG, 0);
+				bc_cc_after = RREG32_SOC15(GC, 0, mmCC_GC_SHADER_ARRAY_CONFIG);
+				bc_spi_orig = RREG32_SOC15(GC, 0, mmSPI_PG_ENABLE_STATIC_WGP_MASK);
+				WREG32_SOC15(GC, 0, mmSPI_PG_ENABLE_STATIC_WGP_MASK, 0x1f);
+				bc_spi_after = RREG32_SOC15(GC, 0, mmSPI_PG_ENABLE_STATIC_WGP_MASK);
+				bc_rlc_orig = RREG32_SOC15(GC, 0, mmRLC_PG_ALWAYS_ON_WGP_MASK);
+				WREG32_SOC15(GC, 0, mmRLC_PG_ALWAYS_ON_WGP_MASK, 0x1f);
+				if (bc250_cc_write_mode == 1 || bc250_cc_write_mode == 4) {
+					WREG32_SOC15(GC, 0, mmCC_GC_SHADER_ARRAY_CONFIG, bc_cc_orig);
+					WREG32_SOC15(GC, 0, mmSPI_PG_ENABLE_STATIC_WGP_MASK, bc_spi_orig);
+					WREG32_SOC15(GC, 0, mmRLC_PG_ALWAYS_ON_WGP_MASK, bc_rlc_orig);
+					dev_info(adev->dev,
+						"bc250-40cu-probe: se=%d sh=%d CC=0x%08x->0x%08x SPI=0x%08x->0x%08x (restored)\n",
+						bc_se, bc_sh, bc_cc_orig, bc_cc_after, bc_spi_orig, bc_spi_after);
+				} else {
+					dev_info(adev->dev,
+						"bc250-40cu-enable: mode=%d se=%d sh=%d CC=0x%08x->0x%08x SPI=0x%08x->0x%08x\n",
+						bc250_cc_write_mode, bc_se, bc_sh,
+						bc_cc_orig, bc_cc_after, bc_spi_orig, bc_spi_after);
+				}
+			}
+		}
+		gfx_v10_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, 0);
+	}
+
 	for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
 		for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
 			bitmap = i * adev->gfx.config.max_sh_per_se + j;
--
2.53.0