#!/usr/bin/env bash
# Print a summary of the GPU/CUDA runtime and the availability of key ML libraries.
set -euo pipefail

echo "================= RUNTIME CAPABILITIES ================="
nvidia-smi || true
echo "CUDA_HOME: ${CUDA_HOME:-/usr/local/cuda}"
echo "NVCC: $(nvcc --version 2>/dev/null | tail -n1 || echo 'N/A')"

echo
echo "[PyTorch / CUDA backend]"
python3 - <<'PY'
import json

try:
    import torch
    # SDP backend flags are probed with hasattr so the report also works on older torch versions.
    info = {
        "torch": torch.__version__,
        "cuda_available": torch.cuda.is_available(),
        "cuda_device_count": torch.cuda.device_count(),
        "cuda_runtime_version": getattr(torch.version, "cuda", None),
        "cudnn_version": (torch.backends.cudnn.version() if torch.cuda.is_available() else None),
        "tf32": (torch.backends.cuda.matmul.allow_tf32 if torch.cuda.is_available() else None),
        "flash_sdp": (torch.backends.cuda.flash_sdp_enabled()
                      if hasattr(torch.backends.cuda, "flash_sdp_enabled") else None),
        "mem_efficient_sdp": (torch.backends.cuda.mem_efficient_sdp_enabled()
                              if hasattr(torch.backends.cuda, "mem_efficient_sdp_enabled") else None),
        "math_sdp": (torch.backends.cuda.math_sdp_enabled()
                     if hasattr(torch.backends.cuda, "math_sdp_enabled") else None),
    }
    print(json.dumps(info, indent=2))
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
except Exception as e:
    print(f"[ERR torch] {type(e).__name__}: {e}")
PY

echo
echo "[Apex]"
python3 - <<'PY'
try:
    import importlib
    importlib.import_module("apex.normalization")
    print("apex.normalization: OK")
except Exception as e:
    print(f"Apex: ERR {type(e).__name__}: {e}")
PY

echo
echo "[FlashAttention]"
python3 - <<'PY'
try:
    import flash_attn
    print(f"flash_attn: OK (version={getattr(flash_attn, '__version__', 'unknown')})")
    try:
        import flash_attn_2_cuda
        print("flash_attn_2_cuda: OK")
    except Exception as e:
        print(f"flash_attn_2_cuda: ERR {type(e).__name__}: {e}")
except Exception as e:
    print(f"flash_attn: ERR {type(e).__name__}: {e}")
PY

echo
echo "[Triton]"
python3 - <<'PY'
try:
    import triton
    print(f"triton: OK (version={getattr(triton, '__version__', 'unknown')})")
    try:
        import triton.ops
        print("triton.ops: legacy module present")
    except ModuleNotFoundError:
        print("triton.ops: not present (ok on Triton>=3.x)")
    except Exception as e:
        print(f"triton.ops: WARN {type(e).__name__}: {e}")
except Exception as e:
    print(f"triton: ERR {type(e).__name__}: {e}")
PY

echo
echo "[BitsAndBytes (Q8/Q4)]"
python3 - <<'PY'
try:
    import bitsandbytes as bnb
    v = getattr(bnb, "__version__", "unknown")
    print(f"bitsandbytes: OK (version={v})")
    try:
        # Import only checks availability of the int8 Triton kernel; the module itself is unused.
        import bitsandbytes.triton.int8_matmul_mixed_dequantize as q8
        print("bnb.triton.int8_matmul_mixed_dequantize: OK")
    except ModuleNotFoundError:
        print("bnb.q8.triton: not present (disabled or no GPU build)")
    except Exception as e:
        print(f"bnb.q8.triton: WARN {type(e).__name__}: {e}")
except Exception as e:
    print(f"bitsandbytes: ERR {type(e).__name__}: {e}")
PY

echo
echo "[Transformers / Diffusers / XFormers]"
python3 - <<'PY'
import importlib

def ver(name):
    try:
        m = importlib.import_module(name)
        return getattr(m, "__version__", "unknown")
    except Exception as e:
        return f"ERR:{type(e).__name__}"

print("transformers:", ver("transformers"))
print("diffusers:", ver("diffusers"))
print("xformers:", ver("xformers"))
PY

echo
echo "[Distributed / NCCL Env]"
# '|| true' keeps 'set -e -o pipefail' from aborting the script when no matching variables are set.
env | grep -E 'MASTER_|NCCL|CUDA_VISIBLE_DEVICES|TORCH_|ENABLE_' | sort || true

echo "================= END CAPABILITIES ================="