#!/bin/bash
#
# FlashAttention wheel builder (GPU mode, tuned for a large build host).
#
# Flow:
#   1. If a flash_attn-*.whl already exists under WHEEL_DIR, install it and stop
#      (no build, no upload).
#   2. Otherwise build the wheel with `pip wheel` and the CUDA flags exported;
#      if that fails, retry once with the CUDA flags set to 0.
#   3. Install the resulting wheel, or exit 1 if none was produced.
#   4. Optionally upload WHEEL_DIR to the Hugging Face Hub.
#
# Environment:
#   HF_UPLOAD_WHEELS   "1" enables the upload step (default: disabled).
#   HF_TOKEN           Hub token; if unset, the locally stored token is used.
#   SELF_HF_REPO_ID    Target Hub repo (default: caarleexx/Flash).
#   MAX_JOBS, OMP_NUM_THREADS, CUDA_VISIBLE_DEVICES
#                      Only reported here; they are inherited by the build.

set -euo pipefail

readonly FLASH_ATTN_SPEC="flash-attn==2.8.0.post2"
readonly WHEEL_DIR="/data/wheel"
readonly TMP_DIR="/data/tmp"
readonly BUILD_LOG="/tmp/flash_build.log"

#######################################
# Prints the path of the first flash_attn wheel found under WHEEL_DIR.
# Globals:   WHEEL_DIR (read)
# Outputs:   the wheel path on stdout, or nothing when there is no match
# Returns:   always 0; callers decide based on the printed path
#######################################
find_wheel() {
  # -quit stops after the first match, so no `| head` is needed and the
  # lookup cannot die from SIGPIPE under `pipefail`.
  # A failing lookup is deliberately treated the same as "no wheel found".
  find "$WHEEL_DIR" -type f -name 'flash_attn-*.whl' -print -quit || true
}

#######################################
# Builds the wheel into WHEEL_DIR: first with the CUDA flags as exported by
# the caller, then (on failure) once more with those flags set to 0.
# Globals:   FLASH_ATTN_SPEC, WHEEL_DIR, BUILD_LOG (read)
#            USE_FLASH_ATTENTION_WITH_CUDA, FORCE_CUDA (written on fallback)
# Outputs:   build output on stdout, mirrored into BUILD_LOG
# Returns:   always 0; the caller checks whether a wheel was produced, so
#            that the final diagnostic is printed even when both builds fail
#######################################
build_wheel() {
  # `pipefail` makes the pipeline status reflect pip, not tee.
  if pip wheel "$FLASH_ATTN_SPEC" --no-build-isolation -w "$WHEEL_DIR" 2>&1 \
    | tee "$BUILD_LOG"; then
    echo "✅ Build com CUDA concluído."
    return 0
  fi

  echo "⚠️ Build CUDA falhou; tentando fallback CPU-only (mais lento). Verifique $BUILD_LOG"
  # NOTE(review): whether the flash-attn build honours these flags as a
  # CPU-only switch is not verifiable from this script — confirm upstream.
  export USE_FLASH_ATTENTION_WITH_CUDA=0
  export FORCE_CUDA=0
  # Append so the log referenced by the final error message covers both attempts.
  if ! pip wheel "$FLASH_ATTN_SPEC" --no-build-isolation -w "$WHEEL_DIR" 2>&1 \
    | tee -a "$BUILD_LOG"; then
    echo "⚠️ Build fallback também falhou." >&2
  fi
  return 0
}

#######################################
# Uploads the wheels in WHEEL_DIR to the Hugging Face Hub.
# Globals:   WHEEL_DIR (read); HF_TOKEN, SELF_HF_REPO_ID (read by Python)
# Returns:   non-zero when no token is available or the upload fails
#######################################
upload_wheels() {
  # The heredoc delimiter is quoted, so the shell expands nothing inside it;
  # the wheel directory is handed over through the environment instead.
  WHEEL_UPLOAD_DIR="$WHEEL_DIR" python3 - <<'PY'
import os

from huggingface_hub import HfApi

try:
    # Available in recent huggingface_hub releases.
    from huggingface_hub import get_token
except ImportError:
    # Older releases only provide the HfFolder helper.
    from huggingface_hub import HfFolder
    get_token = HfFolder.get_token

repo = os.environ.get("SELF_HF_REPO_ID", "caarleexx/Flash")
folder = os.environ.get("WHEEL_UPLOAD_DIR", "/data/wheel")
token = os.getenv("HF_TOKEN") or get_token()
if not token:
    # Exits with a non-zero status, which fails the calling script.
    raise SystemExit("HF_TOKEN ausente; upload desabilitado")

api = HfApi(token=token)
api.upload_folder(
    folder_path=folder,
    repo_id=repo,
    repo_type="model",
    allow_patterns=["*.whl", "NGC-DL-CONTAINER-LICENSE"],
    ignore_patterns=["**/src/**", "**/*.log", "**/logs/**", ".git/**"],
)
print("Upload concluído.")
PY
}

#######################################
# Entry point: reuse or build the wheel, install it, optionally upload it.
#######################################
main() {
  local wheel

  echo "🔧 Compilador FlashAttention — modo GPU (configurado para infra grande)"

  mkdir -p "$WHEEL_DIR" "$TMP_DIR"
  # Keep build scratch files on the data volume.
  export TMPDIR="$TMP_DIR"

  wheel=$(find_wheel)
  if [[ -f "$wheel" ]]; then
    echo "📦 Wheel já existe: $wheel"
    pip install --no-cache-dir "$wheel"
    # A pre-existing wheel short-circuits the script: no build, no upload.
    return 0
  fi

  echo "⚙️ Nenhum wheel encontrado — iniciando compilação controlada com CUDA..."

  # ----- Safe settings for a large environment (8x L40S) -----
  # Currently disabled; uncomment to pin the build resources.
  #export MAX_JOBS=${MAX_JOBS:-128}                 # compile jobs (ninja/cmake)
  #export OMP_NUM_THREADS=${OMP_NUM_THREADS:-8}
  #export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}  # use a single GPU for the build
  #export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST:-"8.9"}
  #export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-"max_split_size_mb:512"}
  export USE_FLASH_ATTENTION_WITH_CUDA=1
  export FORCE_CUDA=1

  # These variables may be unset (their defaults above are commented out);
  # show an explicit placeholder rather than an empty value.
  echo " > MAX_JOBS=${MAX_JOBS:-(não definido)}, OMP_NUM_THREADS=${OMP_NUM_THREADS:-(não definido)}, CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-(não definido)}"
  echo " > TMPDIR=$TMPDIR, WHEEL_DIR=$WHEEL_DIR"

  # Try the CUDA build (with fallback).
  build_wheel

  wheel=$(find_wheel)
  if [[ -f "$wheel" ]]; then
    echo "✅ Wheel gerado: $wheel"
    pip install --no-cache-dir "$wheel"
    echo "📁 Wheel salvo em $WHEEL_DIR"
  else
    echo "❌ Erro: não foi possível gerar o wheel. Ver logs em $BUILD_LOG" >&2
    exit 1
  fi

  # ----- Optional upload to Hugging Face -----
  if [[ "${HF_UPLOAD_WHEELS:-0}" == "1" ]]; then
    echo "⬆️ Upload habilitado. Verificando HF_TOKEN..."
    upload_wheels
  else
    echo "ℹ️ Upload HF desabilitado (HF_UPLOAD_WHEELS!=1)"
  fi
}

main "$@"