Spaces:
Paused
Paused
# Fail fast: abort on command errors, unset variables and failed pipeline stages.
set -euo pipefail

banner_rule="======================================================="
echo "$banner_rule"
echo " ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)"
echo "$banner_rule"

# ---------------------- Env base ----------------------
# Values written as ${VAR:-default} can be overridden from the caller's
# environment; the remaining ones are pinned by this script.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
export TORCH_DTYPE="${TORCH_DTYPE:-bfloat16}"

# SDPA / FA toggles
# NOTE(review): presumably consumed by the application code — confirm where
# each of these switches is read.
export ENABLE_FLASH_SDP="${ENABLE_FLASH_SDP:-1}"
export ENABLE_MEMORY_EFFICIENT_SDP="${ENABLE_MEMORY_EFFICIENT_SDP:-1}"
export ENABLE_MATH_SDP="${ENABLE_MATH_SDP:-0}"
export FLASH_ATTENTION_DISABLE="${FLASH_ATTENTION_DISABLE:-0}"
export XFORMERS_FORCE_DISABLE="${XFORMERS_FORCE_DISABLE:-1}"

# CUDA / NCCL baseline
export CUDA_MODULE_LOADING="LAZY"
export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-32}"
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:512,garbage_collection_threshold:0.8"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-8}"

# INFO stays the default, but the level can now be changed from outside
# like the other tunables above.
export NCCL_DEBUG="${NCCL_DEBUG:-INFO}"

# Both the legacy NCCL_* and the TORCH_NCCL_* spellings are exported so the
# settings are picked up whichever name the installed PyTorch reads.
export NCCL_ASYNC_ERROR_HANDLING=1
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_BLOCKING_WAIT=1
export TORCH_NCCL_BLOCKING_WAIT=1

# Single-node transport setup: P2P left enabled, InfiniBand disabled,
# socket traffic bound to the loopback interface.
export NCCL_P2P_DISABLE=0
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME="lo"

# NOTE(review): NCCL_TIMEOUT does not look like a documented NCCL/PyTorch
# variable — presumably read by the application; confirm.
export NCCL_TIMEOUT=600
# ---------------------- HF/torch persistence ----------------------
# Keep the Hugging Face and torch caches under /data when that directory
# exists; otherwise fall back to /app/.cache.
if [[ -d /data ]]; then
  cache_root="/data/.cache"
else
  cache_root="/app/.cache"
fi
export HF_HOME="${cache_root}/huggingface"
export TORCH_HOME="${cache_root}/torch"
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
mkdir -p "$HF_HUB_CACHE" "$TORCH_HOME"
mkdir -p /app/.cache

# Make /app/.cache/huggingface resolve to HF_HOME.
#  - When HF_HOME already *is* that path there is nothing to link; linking
#    anyway would drop a self-referential "huggingface/huggingface" symlink
#    inside the cache directory.
#  - -n replaces an existing symlink in place instead of following it and
#    creating the new link inside its target.
#  - A real directory at the link path is left alone (ln would otherwise
#    create the link inside it).
app_hf_link="/app/.cache/huggingface"
if [[ "$HF_HOME" != "$app_hf_link" ]]; then
  if [[ -d "$app_hf_link" && ! -L "$app_hf_link" ]]; then
    echo "[warn] $app_hf_link já existe como diretório real; link para $HF_HOME não criado." >&2
  else
    ln -sfn "$HF_HOME" "$app_hf_link"
  fi
fi

# Drop TRANSFORMERS_CACHE so it cannot redirect the cache away from HF_HOME.
unset TRANSFORMERS_CACHE
export HF_HUB_ENABLE_HF_TRANSFER=1
export HF_HUB_DOWNLOAD_TIMEOUT=60

# Model coordinates shared by the download and diagnostic steps.
MODEL_REPO="ByteDance-Seed/VINCIE-3B"
CACHE_MODEL_DIR="$HF_HUB_CACHE/models--ByteDance-Seed--VINCIE-3B"
CKPT_DIR="/app/ckpt/VINCIE-3B"
mkdir -p "$CKPT_DIR"
# ---------------------- Download: structured cache or direct ----------------------
# Fetch the model snapshot in one of two layouts:
#   DIRECT_TO_CKPT=1 -> plain files materialized directly under $CKPT_DIR
#   anything else    -> the Hugging Face hub cache layout under $HF_HUB_CACHE
# Authentication uses HF_TOKEN, falling back to HUGGING_FACE_TOKEN.
#
# The flag is compared as a string so a non-numeric value (e.g. "true")
# selects the cache branch quietly instead of printing a `[: integer
# expression expected` error. MODEL_REPO / CKPT_DIR are handed to Python via
# the environment because the heredocs are quoted (no shell expansion) and
# those variables are not exported.
#
# NOTE(review): resume_download / local_dir_use_symlinks are deprecated in
# recent huggingface_hub releases — confirm against the installed version.
if [[ "${DIRECT_TO_CKPT:-0}" == "1" ]]; then
  echo "[direct] Baixando ${MODEL_REPO} diretamente para $CKPT_DIR"
  MODEL_REPO="$MODEL_REPO" CKPT_DIR="$CKPT_DIR" python - <<'PY'
import os
from pathlib import Path

from huggingface_hub import snapshot_download

repo_id = os.environ.get("MODEL_REPO", "ByteDance-Seed/VINCIE-3B")
ckpt_dir = Path(os.environ.get("CKPT_DIR", "/app/ckpt/VINCIE-3B"))
ckpt_dir.mkdir(parents=True, exist_ok=True)
token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_TOKEN")

snapshot_download(
    repo_id=repo_id,
    local_dir=str(ckpt_dir),
    local_dir_use_symlinks=False,
    resume_download=True,
    token=token,
)
print("[direct] Snapshot materializado em", ckpt_dir)
PY
else
  echo "Verificando snapshot do ${MODEL_REPO} no cache..."
  MODEL_REPO="$MODEL_REPO" python - <<'PY'
import os

from huggingface_hub import snapshot_download

hf_home = os.environ.get("HF_HOME", "/app/.cache/huggingface")
# Honour an explicit HF_HUB_CACHE so Python downloads into the same
# directory the shell created; default to <HF_HOME>/hub otherwise.
cache_dir = os.environ.get("HF_HUB_CACHE") or os.path.join(hf_home, "hub")
os.makedirs(cache_dir, exist_ok=True)

repo_id = os.environ.get("MODEL_REPO", "ByteDance-Seed/VINCIE-3B")
token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_TOKEN")

snapshot_download(
    repo_id=repo_id,
    cache_dir=cache_dir,
    resume_download=True,
    local_dir_use_symlinks=False,
    token=token,
)

# Optional diagnostics (disabled; require `from pathlib import Path`):
# mcache = Path(cache_dir) / ("models--" + repo_id.replace("/", "--"))
# print("[cache] Estrutura em:", mcache)
# print("[cache] refs:", list((mcache / "refs").glob("*")))
# print("[cache] snapshots:", [p.name for p in (mcache / "snapshots").glob("*") if p.is_dir()])
PY
fi
# Materialize the checkpoint as plain files under /app/ckpt/VINCIE-3B.
# Best-effort: a failure is reported on stdout and startup continues.
python3 - <<'PY'
import os

from huggingface_hub import snapshot_download

save_dir = '/app/ckpt/VINCIE-3B'
os.makedirs(save_dir, exist_ok=True)

# Each heredoc runs in its own Python process, so cache_dir has to be
# resolved here; leaving it undefined raises NameError, which the broad
# except below would report as a failed download on every start.
hf_home = os.environ.get('HF_HOME', '/app/.cache/huggingface')
cache_dir = os.environ.get('HF_HUB_CACHE') or os.path.join(hf_home, 'hub')

# Same token lookup as the other download snippets in this script.
token = os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_TOKEN')

try:
    print('📥 Baixando VINCIE-3B...')
    snapshot_download(
        repo_id='ByteDance-Seed/VINCIE-3B',
        local_dir=save_dir,
        cache_dir=cache_dir,
        token=token,
        # resume_download=True,
        # local_dir_use_symlinks=False,
    )
    print('✅ Modelo ok')
except Exception as e:
    # Deliberately non-fatal: the service is still started afterwards.
    print(f'⚠️ Download falhou: {e}')
PY
# Expose the checkpoint directory inside the VINCIE tree as ckpt/VINCIE-3B.
vincie_ckpt_parent="/app/VINCIE/ckpt"
mkdir -p "$vincie_ckpt_parent"
# -n: replace an existing link in place rather than creating the new link
# inside the directory the old one points to.
ln -sfn /app/ckpt/VINCIE-3B "${vincie_ckpt_parent}/VINCIE-3B"

# Optional diagnostics (disabled):
#echo "[diag] Cache model dir: $CACHE_MODEL_DIR"
#ls -la "$CACHE_MODEL_DIR" || true
#echo "[diag] refs:"; ls -la "$CACHE_MODEL_DIR/refs" || true
#echo "[diag] snapshots:"; ls -la "$CACHE_MODEL_DIR/snapshots" || true
#echo "[diag] CKPT_DIR: $CKPT_DIR"; ls -la "$CKPT_DIR" || true
# ---------------------- Builder Apex/Q8 ----------------------
# Run /app/builder.sh only when nvidia-smi succeeds and the builder has not
# been switched off via DISABLE_BUILDER. The run is capped by
# BUILDER_TIMEOUT_SEC (default 7200 s); a timeout or build error is logged
# and startup continues.
if ! nvidia-smi >/dev/null 2>&1; then
  echo "GPU não visível, pulando builder Apex/Q8."
elif [ "${DISABLE_BUILDER:-0}" -eq 0 ]; then
  echo "Executando builder Apex/Q8..."
  # Best-effort: the script may already be executable or live on a
  # read-only layer.
  chmod +x /app/builder.sh || true
  builder_timeout="${BUILDER_TIMEOUT_SEC:-7200}"
  if ! timeout "$builder_timeout" bash -lc /app/builder.sh; then
    echo "Builder excedeu tempo/erro, prosseguindo."
  fi
else
  echo "Builder desabilitado por DISABLE_BUILDER=1"
fi
#######################################
# Remove any installed copy of a package, then install a pinned one.
# Arguments: $1 - package name to uninstall
#            $2.. - arguments passed verbatim to `pip install`
# Returns:   status of `pip install`; an uninstall failure is ignored
#######################################
reinstall_pinned() {
  local pkg=$1
  shift
  pip uninstall -y "$pkg" || true
  pip install "$@"
}

reinstall_pinned triton -v --no-build-isolation triton==3.1.0
reinstall_pinned bitsandbytes bitsandbytes==0.43.1
# ---------------------- Diagnostics ----------------------
# Best-effort environment report; a missing or failing info.sh must not
# block startup.
/app/info.sh || :
#ls -la /app || true
#ls -R /app | head -n 2000 || true

# ---------------------- Starting the service ----------------------
printf '%s\n' "🚀 Subindo serviços..."
# Tip: VINCIE_DIRECT_TO_CKPT=1 can be exported for an internal fallback.
# NOTE(review): this script itself reads DIRECT_TO_CKPT, not
# VINCIE_DIRECT_TO_CKPT — presumably the latter is read by app_vince.py;
# confirm.
app_entry="/app/app_vince.py"
python "$app_entry"