#!/usr/bin/env bash
set -euo pipefail

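# Run the MegaBlocks ROCm kernel test suite against the locally staged build:
# resolve the build variant under build/, rebuild via build.py if the staged
# extension is missing, then drive single- and multi-GPU pytest runs.
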
KERNEL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd "$KERNEL_DIR"

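# detect_variant's embedded Python reads KERNEL_DIR from the environment.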
export KERNEL_DIR

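# Print the name of the staged build-variant directory under build/.
# Prefer kernels.utils.build_variant() when importable; otherwise glob build/
# for a ROCm (or, failing that, CUDA) variant directory.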
detect_variant() {
    python - <<'PY'
import os
import pathlib

root = pathlib.Path(os.environ["KERNEL_DIR"])
build_dir = root / "build"
variant = None

try:
    from kernels.utils import build_variant as _build_variant
except Exception:
    _build_variant = None

if _build_variant is not None:
    try:
        variant = _build_variant()
    except Exception:
        variant = None

if variant is None:
    # glob() returns a generator, which is always truthy; materialize each
    # pattern so the CUDA fallback is actually reachable.
    candidates = sorted(build_dir.glob("torch*-rocm64-*")) or sorted(build_dir.glob("torch*-cu*"))
    if candidates:
        variant = candidates[0].name

if variant is None:
    raise SystemExit("Could not determine MegaBlocks build variant. Run build.py first.")

print(variant)
PY
}

VARIANT=$(detect_variant)
STAGED_DIR="$KERNEL_DIR/build/$VARIANT"

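# Locate the staged ROCm extension (.so) inside a build-variant directory.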
find_staged_lib() {
    local base="$1"
    local candidates=(
        "$base/_megablocks_rocm.so"
        "$base/megablocks/_megablocks_rocm.so"
    )
    for path in "${candidates[@]}"; do
        if [[ -f "$path" ]]; then
            echo "$path"
            return 0
        fi
    done
    return 1
}

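# find_staged_lib returns non-zero when nothing is staged; "|| true" keeps that
# from aborting the script under "set -e" so we can rebuild below.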
STAGED_LIB=$(find_staged_lib "$STAGED_DIR") || true

if [[ -z "${STAGED_LIB:-}" ]]; then
    echo "Staged ROCm extension not found under $STAGED_DIR; rebuilding kernels..."
    python build.py
    VARIANT=$(detect_variant)
    STAGED_DIR="$KERNEL_DIR/build/$VARIANT"
    STAGED_LIB=$(find_staged_lib "$STAGED_DIR") || true
    if [[ -z "${STAGED_LIB:-}" ]]; then
        echo "ERROR: build.py completed but no extension was found under $STAGED_DIR" >&2
        exit 1
    fi
fi

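# Put the staged package ahead of anything already installed on PYTHONPATH.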
export PYTHONPATH="$STAGED_DIR:${PYTHONPATH:-}"

echo "Using MegaBlocks build variant: $VARIANT"

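# Count visible accelerators; torch.cuda covers HIP devices on ROCm builds of PyTorch.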
declare -i GPU_COUNT
GPU_COUNT=$(python - <<'PY'
import torch
print(torch.cuda.device_count() if torch.cuda.is_available() else 0)
PY
)

if (( GPU_COUNT == 0 )); then
    echo "ERROR: No HIP/CUDA GPUs detected. Tests require at least one visible accelerator." >&2
    exit 1
fi

echo "Detected $GPU_COUNT visible GPU(s)."

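# log prints a section banner; run_pytest labels a command, echoes it with
# "set -x", and runs it.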
log() {
    echo
    echo "==> $1"
}

run_pytest() {
    local label="$1"
    shift
    log "$label"
    set -x
    "$@"
    { set +x; } 2>/dev/null || true
}

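# Per-world-size device visibility and WORLD_SIZE settings, passed with env(1).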
SINGLE_GPU_ENV=(HIP_VISIBLE_DEVICES=0 CUDA_VISIBLE_DEVICES=0 WORLD_SIZE=1)
MULTI2_GPU_ENV=(HIP_VISIBLE_DEVICES=0,1 CUDA_VISIBLE_DEVICES=0,1 WORLD_SIZE=2)
MULTI8_GPU_ENV=(HIP_VISIBLE_DEVICES="$(seq -s, 0 7)" CUDA_VISIBLE_DEVICES="$(seq -s, 0 7)" WORLD_SIZE=8)

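# Tests that need only a single visible GPU.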
SINGLE_TESTS=(
    "test_mb_moe.py"
    "test_mb_moe_shared_expert.py"
    "layer_test.py"
    "test_gg.py"
    "ops_test.py"
)

for test in "${SINGLE_TESTS[@]}"; do
    run_pytest "Single-GPU pytest ${test}" env "${SINGLE_GPU_ENV[@]}" python -m pytest "tests/${test}" -q
done

if (( GPU_COUNT >= 2 )); then
    run_pytest "Distributed layer smoke (2 GPUs)" env "${MULTI2_GPU_ENV[@]}" python -m pytest "tests/parallel_layer_test.py::test_megablocks_moe_mlp_functionality" -q
else
    log "Skipping 2-GPU distributed layer test (requires >=2 GPUs, detected ${GPU_COUNT})."
fi

run_pytest "Shared expert functionality (world_size=1)" env "${SINGLE_GPU_ENV[@]}" python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_functionality[1]' -q
run_pytest "Shared expert weighted sum (world_size=1)" env "${SINGLE_GPU_ENV[@]}" python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_weighted_sum[1]' -q

if (( GPU_COUNT >= 8 )); then
    run_pytest "Shared expert functionality (world_size=8)" env "${MULTI8_GPU_ENV[@]}" python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_functionality[8]' -q
    run_pytest "Shared expert weighted sum (world_size=8)" env "${MULTI8_GPU_ENV[@]}" python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_weighted_sum[8]' -q
else
    log "Skipping 8-GPU shared expert tests (requires >=8 GPUs, detected ${GPU_COUNT})."
fi

echo
echo "All requested tests completed."