#!/usr/bin/env bash
# ADUC-SDR startup script, first stage: enables strict mode, prints the
# banner, exports the GPU/NCCL runtime environment and selects the
# Hugging Face / Torch cache locations used by the rest of the script.
set -euo pipefail

echo "======================================================="
echo " ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)"
echo "======================================================="

# ---------------------- Base env ----------------------
# Values written as "${VAR:-default}" can be overridden from the caller's
# environment; every other value is forced here.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
export TORCH_DTYPE="${TORCH_DTYPE:-bfloat16}"

# SDPA/FA toggles
export ENABLE_FLASH_SDP="${ENABLE_FLASH_SDP:-1}"
export ENABLE_MEMORY_EFFICIENT_SDP="${ENABLE_MEMORY_EFFICIENT_SDP:-1}"
export ENABLE_MATH_SDP="${ENABLE_MATH_SDP:-0}"
export FLASH_ATTENTION_DISABLE="${FLASH_ATTENTION_DISABLE:-0}"
export XFORMERS_FORCE_DISABLE="${XFORMERS_FORCE_DISABLE:-1}"

# CUDA/NCCL/perf settings for a robust single-node run
export CUDA_MODULE_LOADING="LAZY"
export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-32}"
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:512,garbage_collection_threshold:0.8"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-8}"
export NCCL_DEBUG="INFO"
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_P2P_DISABLE=0
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME="lo"
export NCCL_BLOCKING_WAIT=1
export TORCH_NCCL_BLOCKING_WAIT=1
export NCCL_TIMEOUT=600

# ---------------------- Persistence: /data ----------------------
# Prefer /data for the caches when that directory exists; otherwise fall
# back to /app.
if [[ -d /data ]]; then
  export HF_HOME="/data/.cache/huggingface"
  export TORCH_HOME="/data/.cache/torch"
else
  export HF_HOME="/app/.cache/huggingface"
  export TORCH_HOME="/app/.cache/torch"
fi
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
# HF_HOME is created explicitly so the compatibility symlink below never
# dangles when HF_HUB_CACHE was overridden to live somewhere else.
mkdir -p "$HF_HOME" "$HF_HUB_CACHE" "$TORCH_HOME"

# Compatibility symlink so code that looks under /app finds the cache.
mkdir -p /app/.cache
compat_link="/app/.cache/huggingface"
if [[ "$HF_HOME" == "$compat_link" ]]; then
  # The cache already lives at the compat path; linking a directory onto
  # itself would only drop a self-referential link inside it.
  :
elif [[ -d "$compat_link" && ! -L "$compat_link" ]]; then
  # A real directory is in the way; ln would nest the link inside it.
  echo "Aviso: $compat_link já existe como diretório; symlink não criado." >&2
else
  # -n replaces an existing symlink instead of following it into its target.
  ln -sfn "$HF_HOME" "$compat_link"
fi

unset TRANSFORMERS_CACHE
# NOTE(review): HF_HUB_ENABLE_HF_TRANSFER=1 presumably needs the hf_transfer
# Python package in the image — confirm it is installed.
export HF_HUB_ENABLE_HF_TRANSFER=1
export HF_HUB_DOWNLOAD_TIMEOUT=60

# ---------------------- Model pre-download (cache-first) ----------------------
MODEL_REPO="ByteDance-Seed/VINCIE-3B"
# Hub cache directory for MODEL_REPO: "/" in the repo id becomes "--".
CACHE_MODEL_DIR="$HF_HUB_CACHE/models--${MODEL_REPO//\//--}"

# Runs one snapshot_download attempt for MODEL_REPO into HF_HUB_CACHE.
# Globals: MODEL_REPO, HF_HUB_CACHE, HF_HOME, HF_TOKEN (all read).
# Returns: the Python interpreter's exit status (0 on success).
download_model() {
  MODEL_REPO="$MODEL_REPO" python - <<'PY'
import os

from huggingface_hub import snapshot_download

# Use the same directory the shell-side cache check looks at; fall back to
# $HF_HOME/hub only when HF_HUB_CACHE is not present in the environment.
cache_dir = os.environ.get("HF_HUB_CACHE") or os.path.join(
    os.environ.get("HF_HOME", "/app/.cache/huggingface"), "hub"
)
os.makedirs(cache_dir, exist_ok=True)
snapshot_download(
    repo_id=os.environ["MODEL_REPO"],
    cache_dir=cache_dir,
    resume_download=True,
    token=os.getenv("HF_TOKEN"),
)
print("Download concluído.")
PY
}

# NOTE(review): a non-empty directory is treated as "already downloaded";
# that does not prove the previous download completed — confirm acceptable.
if [ -d "$CACHE_MODEL_DIR" ] && [ "$(ls -A "$CACHE_MODEL_DIR" 2>/dev/null)" ]; then
  echo "Cache do modelo já existe em $CACHE_MODEL_DIR. Não será feito novo download."
else
  echo "Baixando o modelo ${MODEL_REPO} para cache persistente em $HF_HUB_CACHE..."
  max_retries=3
  for ((attempt = 1; attempt <= max_retries; attempt++)); do
    echo "Tentativa ${attempt} de snapshot_download..."
    # Test the command directly: appending "|| true" and then reading $?
    # would always report success and the loop would never retry.
    if download_model; then
      break
    fi
    if ((attempt == max_retries)); then
      echo "Erro: Falha ao baixar o modelo após $max_retries tentativas." >&2
      exit 1
    fi
    echo "Falha na tentativa ${attempt}. Tentando novamente em 10s..."
    sleep 10
  done
fi

# ---------------------- Apex/Q8 builder ----------------------
# Best effort: runs only when nvidia-smi succeeds and DISABLE_BUILDER is 0;
# a builder failure or timeout is reported and startup continues.
if nvidia-smi >/dev/null 2>&1; then
  if [ "${DISABLE_BUILDER:-0}" -eq 0 ]; then
    echo "Executando builder Apex/Q8..."
    chmod +x /app/builder.sh || true
    timeout "${BUILDER_TIMEOUT_SEC:-7200}" bash -lc /app/builder.sh \
      || echo "Builder excedeu tempo/erro, prosseguindo."
  else
    echo "Builder desabilitado por DISABLE_BUILDER=1"
  fi
else
  echo "GPU não visível, pulando builder Apex/Q8."
fi

# ---------------------- Diagnostics ----------------------
# Purely informational; nothing here may abort startup.
/app/info.sh || true
if ls -la /app; then
  # head exits after 2000 lines, which can kill ls with SIGPIPE; under
  # "set -e -o pipefail" that status would otherwise terminate the script.
  ls -R /app | head -n 2000 || true
fi

# ---------------------- Starting the service ----------------------
echo "🚀 Subindo serviços..."
# exec replaces this shell with the service so it receives signals directly.
exec python /app/app_vince.py