leonardlin committed
Commit 5ce4c31 · 1 Parent(s): 867401e

Teach HIP grouped_gemm about autograd


- wrap the ROCm grouped GEMM call in a torch.autograd.Function so hidden states and expert weights receive gradients

- reuse the backend kernel for backward matmuls and normalize batch size tensors on the host

- note the hipBLASLt opt-in flag in grouped_gemm.hip while keeping it off by default

Tests: python -m pytest axolotl.shisa/tests/e2e/test_ring_moe_grouped.py -k megablocks_gradient_parity -s (a rough sketch of this kind of parity check is shown below)
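
For orientation, here is a minimal sketch of the kind of gradient-parity check referenced above, assuming the module path from the torch-ext layout and the usual grouped-GEMM shape convention (a: [tokens, k], b: [num_experts, k, n], trans_a=trans_b=False); the shapes, dtypes, and tolerances are illustrative, not copied from the actual test:

import torch

from megablocks.grouped_gemm.backend import gmm  # import path is an assumption

def dense_reference(a, b, batch_sizes):
    # Per-expert dense matmul: rows of `a` are grouped by `batch_sizes`,
    # and group e is multiplied by the expert weight b[e].
    outs, start = [], 0
    for e, rows in enumerate(batch_sizes.tolist()):
        outs.append(a[start:start + rows] @ b[e])
        start += rows
    return torch.cat(outs, dim=0)

num_experts, k, n = 4, 64, 32
batch_sizes = torch.tensor([8, 4, 16, 8])  # host int64, sums to the token count
a = torch.randn(int(batch_sizes.sum()), k, device="cuda", dtype=torch.bfloat16, requires_grad=True)
b = torch.randn(num_experts, k, n, device="cuda", dtype=torch.bfloat16, requires_grad=True)

gmm(a, b, batch_sizes).sum().backward()  # grouped kernel plus the new autograd path

a_ref = a.detach().clone().requires_grad_(True)
b_ref = b.detach().clone().requires_grad_(True)
dense_reference(a_ref, b_ref, batch_sizes).sum().backward()

torch.testing.assert_close(a.grad, a_ref.grad, atol=1e-2, rtol=1e-2)  # tolerances are illustrative
torch.testing.assert_close(b.grad, b_ref.grad, atol=1e-2, rtol=1e-2)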

csrc/grouped_gemm/grouped_gemm.hip CHANGED
@@ -17,6 +17,9 @@
 namespace grouped_gemm {
 namespace {
 
+// Experimental: toggled via MEGABLOCKS_GG_USE_HIPBLASLT=1. This flag is
+// intentionally off by default because the hipBLASLt path still fails on the
+// largest `tests/ops_test.py` configurations.
 bool use_hipblaslt_backend() {
   static int cached = [] {
     const char* raw = std::getenv("MEGABLOCKS_GG_USE_HIPBLASLT");
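
Since use_hipblaslt_backend() caches the result of the environment lookup in a function-local static, the opt-in has to be in place before the first grouped GEMM call reads it. A minimal sketch of enabling the experimental path from Python (the import path is an assumption from the torch-ext layout; exporting the variable when launching the process works equally well):

import os

os.environ["MEGABLOCKS_GG_USE_HIPBLASLT"] = "1"  # must be set before the first gmm() call caches the flag

import torch
from megablocks.grouped_gemm.backend import gmm  # import path is an assumption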
torch-ext/megablocks/grouped_gemm/backend.py CHANGED
@@ -1,5 +1,9 @@
 # NOTE: Torch needs to be imported before the custom
 # extensions. Otherwise libc10.so cannot be found.
+from __future__ import annotations
+
+from typing import Optional
+
 import torch
 
 # # TODO(tgale): Wrap this in a try-block with better
@@ -13,6 +17,7 @@ import torch
 # from megablocks._ops import ops as backend # type: ignore
 from .._ops import ops as backend # type: ignore
 
+
 def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
     assert not (trans_a and trans_b)
     assert batch_sizes.ndim == 1, "Expected 1d tensor for batch_sizes"
@@ -32,8 +37,99 @@ def _allocate_output(a, b, batch_sizes, trans_a, trans_b):
     # reproduced by `_dev/debug-gg-small.py`.
     return torch.zeros(*shape, device=a.device, dtype=a.dtype)
 
-def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+
+def _normalize_batch_sizes(batch_sizes: torch.Tensor) -> torch.Tensor:
+    if batch_sizes.device.type != "cpu":
+        batch_sizes = batch_sizes.to(device="cpu", dtype=torch.int64)
+    else:
+        batch_sizes = batch_sizes.to(dtype=torch.int64)
+    return batch_sizes
+
+
+def _run_backend(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    batch_sizes: torch.Tensor,
+    trans_a: bool,
+    trans_b: bool,
+    c: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    batch_sizes_cpu = _normalize_batch_sizes(batch_sizes)
     if c is None:
-        c = _allocate_output(a, b, batch_sizes, trans_a, trans_b)
-    backend.gmm(a, b, c, batch_sizes, trans_a, trans_b)
+        c = _allocate_output(a, b, batch_sizes_cpu, trans_a, trans_b)
+    backend.gmm(a, b, c, batch_sizes_cpu, trans_a, trans_b)
     return c
+
+
+class _GroupedGemmFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(  # type: ignore[override]
+        ctx,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        batch_sizes: torch.Tensor,
+        trans_a: bool,
+        trans_b: bool,
+        c: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        if trans_a:
+            raise NotImplementedError("Grouped GEMM autograd currently requires trans_a=False.")
+
+        batch_sizes_cpu = _normalize_batch_sizes(batch_sizes)
+        output = _run_backend(a, b, batch_sizes_cpu, trans_a, trans_b, c)
+
+        ctx.save_for_backward(a, b, batch_sizes_cpu)
+        ctx.trans_a = trans_a
+        ctx.trans_b = trans_b
+
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor):  # type: ignore[override]
+        a, b, batch_sizes_cpu = ctx.saved_tensors
+        trans_a = ctx.trans_a
+        trans_b = ctx.trans_b
+
+        if trans_a:
+            raise NotImplementedError("Grouped GEMM backward currently requires trans_a=False.")
+
+        grad_output = grad_output.contiguous()
+        grad_output_cast = grad_output
+        if grad_output_cast.dtype != a.dtype:
+            grad_output_cast = grad_output_cast.to(dtype=a.dtype)
+
+        grad_a = grad_b = None
+
+        with torch.no_grad():
+            if ctx.needs_input_grad[0]:
+                grad_a = _run_backend(
+                    grad_output_cast,
+                    b.detach(),
+                    batch_sizes_cpu,
+                    trans_a=False,
+                    trans_b=not trans_b,
+                )
+
+            if ctx.needs_input_grad[1]:
+                grad_b_eff = _run_backend(
+                    a.detach(),
+                    grad_output_cast,
+                    batch_sizes_cpu,
+                    trans_a=True,
+                    trans_b=False,
+                )
+                grad_b = (
+                    grad_b_eff.transpose(-2, -1) if trans_b else grad_b_eff
+                )
+
+        if grad_a is not None and grad_a.dtype != a.dtype:
+            grad_a = grad_a.to(dtype=a.dtype)
+        if grad_b is not None and grad_b.dtype != b.dtype:
+            grad_b = grad_b.to(dtype=b.dtype)
+
+        # None returned for batch_sizes / trans flags / optional c.
+        return grad_a, grad_b, None, None, None, None
+
+
+def gmm(a, b, batch_sizes, trans_a=False, trans_b=False, c=None):
+    return _GroupedGemmFunction.apply(a, b, batch_sizes, trans_a, trans_b, c)
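
Per expert group, the backward matmuls that _GroupedGemmFunction routes back through the same grouped kernel are dA_e = dY_e @ B_e^T (the call with trans_b flipped) and dB_e = A_e^T @ dY_e (the call with trans_a=True). A small dense reference sketch, assuming the usual layout (a: [tokens, k], b: [num_experts, k, n]) and trans_a=trans_b=False:

import torch

def grouped_gemm_backward_reference(a, b, grad_out, batch_sizes):
    # Dense per-expert equivalent of the two _run_backend calls in backward():
    #   grad_a rows for group e:   dY_e @ B_e^T   (kernel call with trans_b flipped)
    #   grad_b slice for expert e: A_e^T @ dY_e   (kernel call with trans_a=True)
    grad_a = torch.empty_like(a)
    grad_b = torch.empty_like(b)
    start = 0
    for e, rows in enumerate(batch_sizes.tolist()):
        sl = slice(start, start + rows)
        grad_a[sl] = grad_out[sl] @ b[e].transpose(-2, -1)
        grad_b[e] = a[sl].transpose(-2, -1) @ grad_out[sl]
        start += rows
    return grad_a, grad_b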