#!/usr/bin/env bash
set -euo pipefail

echo "================= RUNTIME CAPABILITIES ================="
date

# GPU driver / device inventory
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi
else
  echo "nvidia-smi: not available"
fi

echo
echo "CUDA_HOME: ${CUDA_HOME:-/usr/local/cuda}"
if command -v nvcc >/dev/null 2>&1; then
  nvcc --version || true
else
  echo "nvcc: not available"
fi

echo
echo "[PyTorch / CUDA backend]"
python - <<'PY'
import inspect
import json

import torch

def to_bool(x):
    """Coerce a backend flag to bool. Zero-arg callables (the *_enabled()
    query API) are invoked; other callables merely report presence."""
    try:
        if callable(x):
            try:
                sig = inspect.signature(x)
                if len(sig.parameters) == 0:
                    return bool(x())
            except Exception:
                pass
            return True
        return bool(x)
    except Exception:
        return None

def sdp_flag(name):
    # Prefer the *_enabled() query API (PyTorch >= 2.0), which returns the
    # actual state; fall back to the legacy enable_* setter, whose presence
    # alone reports True.
    attr = getattr(torch.backends.cuda, f"{name}_enabled",
                   getattr(torch.backends.cuda, f"enable_{name}", None))
    return to_bool(attr)

cuda_ok = torch.cuda.is_available()
info = {
    "torch": getattr(torch, "__version__", None),
    "cuda_available": cuda_ok,
    "cuda_device_count": torch.cuda.device_count(),
    "cuda_runtime_version": getattr(torch.version, "cuda", None),
    "cudnn_version": torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else None,
    "tf32": torch.backends.cuda.matmul.allow_tf32 if cuda_ok else None,
    "flash_sdp": sdp_flag("flash_sdp") if cuda_ok else None,
    "mem_efficient_sdp": sdp_flag("mem_efficient_sdp") if cuda_ok else None,
    "math_sdp": sdp_flag("math_sdp") if cuda_ok else None,
}
print(json.dumps(info, indent=2))
for i in range(min(torch.cuda.device_count(), 8)):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
PY

echo
echo "[Apex]"
python - <<'PY'
# Check both the Python wrappers and the compiled CUDA extension.
try:
    from apex.normalization import FusedLayerNorm, FusedRMSNorm  # noqa: F401
    import importlib
    importlib.import_module("fused_layer_norm_cuda")
    print("apex.normalization: OK")
except Exception as e:
    print("apex.normalization: FAIL ->", e)
PY

echo
echo "[FlashAttention]"
python - <<'PY'
import importlib

for m in ("flash_attn", "flash_attn_2_cuda"):
    try:
        importlib.import_module(m)
        print(f"{m}: OK")
    except Exception as e:
        print(f"{m}: FAIL -> {e}")
PY

echo
echo "[FlashAttention LN test]"
python - <<'PY'
import importlib
import os
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

def ok_import(names):
    # Succeeds on the first importable module in `names`.
    for n in names:
        try:
            importlib.import_module(n)
            print(f"  [+] import '{n}' OK")
            return True
        except Exception as e:
            print(f"  [-] import '{n}' fail: {e}")
    return False

fa_ver = None
try:
    import flash_attn
    fa_ver = getattr(flash_attn, "__version__", None)
except Exception:
    pass

try:
    import torch
    tv = torch.__version__
    cu = getattr(torch.version, "cuda", None)
except Exception:
    tv, cu = "unknown", "unknown"

print(f"  flash_attn version: {fa_ver}")
print(f"  torch: {tv} | cuda: {cu} | TORCH_CUDA_ARCH_LIST={os.getenv('TORCH_CUDA_ARCH_LIST')}")

names_to_try = [
    "flash_attn_2_cuda",
    "flash_attn.ops.layer_norm",
    "flash_attn.layers.layer_norm",
]
ok = ok_import(names_to_try)
if not ok:
    print("  Hint: FlashAttention LN/RMSNorm kernels are missing (reduced performance).")
    print("  Use builder.sh to compile flash_attn and reuse the wheel.")
PY

echo
echo "[Triton]"
python - <<'PY'
try:
    import triton
    print("triton:", triton.__version__)
    try:
        import triton.ops as _  # noqa: F401
        print("triton.ops: OK")
    except Exception:
        print("triton.ops: not present (expected on Triton >= 3.x)")
except Exception as e:
    print("triton: FAIL ->", e)
PY

echo
echo "[BitsAndBytes (Q8/Q4)]"
python - <<'PY'
try:
    import bitsandbytes as bnb
    print("bitsandbytes:", bnb.__version__)
    try:
        from bitsandbytes.triton import _custom_ops as _  # noqa: F401
        print("bnb.triton._custom_ops: OK")
    except Exception as e:
        print("bnb.triton: partial ->", e)
except Exception as e:
    print("bitsandbytes: FAIL ->", e)
PY

echo
echo "[Transformers / Diffusers / XFormers]"
python - <<'PY'
def _v(m):
    try:
        mod = __import__(m)
        print(f"{m}:", getattr(mod, "__version__", "unknown"))
    except Exception as e:
        print(f"{m}: FAIL -> {e}")

for m in ("transformers", "diffusers", "xformers"):
    _v(m)
PY

echo
echo "[Distributed / NCCL env]"
# Match by variable-name prefix; '|| true' keeps 'set -euo pipefail' from
# aborting the script when grep finds no matching variables.
env | grep -E '^(CUDA_VISIBLE_DEVICES|NCCL_|TORCH_|ENABLE_.*SDP|HF_HUB_|CUDA_|NV_.*NCCL|PYTORCH_CUDA_ALLOC_CONF)' | sort || true

echo
echo "[Output paths and permissions]"
OUT="/app/outputs"
echo "OUT dir: $OUT"
mkdir -p "$OUT"
ls -la "$OUT" || true
echo "================= END CAPABILITIES ================="
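
# ----------------------------------------------------------------------
# Usage sketch (assumptions: the filename capabilities.sh and the log
# path below are illustrative, not fixed by this repo):
#   chmod +x capabilities.sh
#   ./capabilities.sh 2>&1 | tee /app/outputs/capabilities.log
# Piping through tee keeps a copy of the report alongside run outputs.
# ----------------------------------------------------------------------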