#!/usr/bin/env bash
#
# ADUC-SDR start script.
#
# Sequence: export the CUDA/NCCL/attention environment, pick a persistent
# Hugging Face / torch cache location, fetch the VINCIE-3B snapshot, run the
# optional Apex/Q8 builder, pin triton/bitsandbytes, print diagnostics and
# finally launch /app/app_vince.py.
#
# Environment toggles read by this script:
#   DIRECT_TO_CKPT=1        download the snapshot straight into /app/ckpt
#   DISABLE_BUILDER=<non-0> skip /app/builder.sh
#   BUILDER_TIMEOUT_SEC     builder timeout in seconds (default 7200)
#   HF_TOKEN / HUGGING_FACE_TOKEN  token passed to snapshot_download
set -euo pipefail

echo "======================================================="
echo " ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)"
echo "======================================================="

# ---------------------- Base env ----------------------
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
export TORCH_DTYPE="${TORCH_DTYPE:-bfloat16}"

# SDPA / FA toggles
export ENABLE_FLASH_SDP="${ENABLE_FLASH_SDP:-1}"
export ENABLE_MEMORY_EFFICIENT_SDP="${ENABLE_MEMORY_EFFICIENT_SDP:-1}"
export ENABLE_MATH_SDP="${ENABLE_MATH_SDP:-0}"
export FLASH_ATTENTION_DISABLE="${FLASH_ATTENTION_DISABLE:-0}"
export XFORMERS_FORCE_DISABLE="${XFORMERS_FORCE_DISABLE:-1}"

# CUDA / NCCL baseline
export CUDA_MODULE_LOADING="LAZY"
export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-32}"
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:512,garbage_collection_threshold:0.8"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-8}"
export NCCL_DEBUG="INFO"
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_P2P_DISABLE=0
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME="lo"
export NCCL_BLOCKING_WAIT=1
export TORCH_NCCL_BLOCKING_WAIT=1
export NCCL_TIMEOUT=600

# ---------------------- HF/torch persistence ----------------------
# Prefer /data for the caches when that directory exists; otherwise fall back
# to /app/.cache.
if [[ -d /data ]]; then
  export HF_HOME="/data/.cache/huggingface"
  export TORCH_HOME="/data/.cache/torch"
else
  export HF_HOME="/app/.cache/huggingface"
  export TORCH_HOME="/app/.cache/torch"
fi
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
mkdir -p "$HF_HOME" "$HF_HUB_CACHE" "$TORCH_HOME"

# Expose the cache at /app/.cache/huggingface. When HF_HOME already *is* that
# path, linking would drop a self-referential "huggingface" symlink inside the
# directory, so the link is only created when the two paths differ. -n keeps
# ln from descending into an existing symlink-to-directory on re-runs.
APP_HF_LINK="/app/.cache/huggingface"
mkdir -p /app/.cache
if [[ "$HF_HOME" != "$APP_HF_LINK" ]]; then
  ln -sfn "$HF_HOME" "$APP_HF_LINK"
fi

unset TRANSFORMERS_CACHE
export HF_HUB_ENABLE_HF_TRANSFER=1
export HF_HUB_DOWNLOAD_TIMEOUT=60

MODEL_REPO="ByteDance-Seed/VINCIE-3B"
# Only referenced by the commented-out diagnostics further down.
CACHE_MODEL_DIR="$HF_HUB_CACHE/models--ByteDance-Seed--VINCIE-3B"
CKPT_DIR="/app/ckpt/VINCIE-3B"
mkdir -p "$CKPT_DIR"

# ---------------------- Download: structured cache or direct ----------------------
# String comparison (not -eq) so a non-numeric value quietly selects the
# cache branch instead of printing a test(1) error first.
if [[ "${DIRECT_TO_CKPT:-0}" == "1" ]]; then
  echo "[direct] Baixando ${MODEL_REPO} diretamente para $CKPT_DIR"
  python - <<'PY'
import os
from pathlib import Path
from huggingface_hub import snapshot_download

repo_id = "ByteDance-Seed/VINCIE-3B"
ckpt_dir = Path("/app/ckpt/VINCIE-3B")
ckpt_dir.mkdir(parents=True, exist_ok=True)
token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_TOKEN")

snapshot_download(
    repo_id=repo_id,
    local_dir=str(ckpt_dir),
    local_dir_use_symlinks=False,
    resume_download=True,
    token=token,
)
print("[direct] Snapshot materializado em", ckpt_dir)
PY
else
  echo "Verificando snapshot do ${MODEL_REPO} no cache..."
  python - <<'PY'
import os
from pathlib import Path
from huggingface_hub import snapshot_download

hf_home = os.environ.get("HF_HOME", "/app/.cache/huggingface")
# Honor the HF_HUB_CACHE exported by the shell so both sides agree on the
# cache location even when the caller overrides it.
cache_dir = os.environ.get("HF_HUB_CACHE") or os.path.join(hf_home, "hub")
os.makedirs(cache_dir, exist_ok=True)

repo_id = "ByteDance-Seed/VINCIE-3B"
token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_TOKEN")

snapshot_download(
    repo_id=repo_id,
    cache_dir=cache_dir,
    resume_download=True,
    local_dir_use_symlinks=False,
    token=token,
)

mcache = Path(cache_dir) / ("models--" + repo_id.replace("/", "--"))
#print("[cache] Estrutura em:", mcache)
#print("[cache] refs:", list((mcache/"refs").glob("*")))
#print("[cache] snapshots:", [p.name for p in (mcache/"snapshots").glob("*") if p.is_dir()])
PY
fi

# Best-effort materialization of the snapshot into /app/ckpt/VINCIE-3B.
# A failure is reported but does not abort the start-up.
python3 - <<'PY'
from huggingface_hub import snapshot_download
import os

save_dir = '/app/ckpt/VINCIE-3B'
os.makedirs(save_dir, exist_ok=True)

# cache_dir must be defined in this interpreter; it was previously an
# undefined name, so every run ended in the except branch with a NameError.
cache_dir = os.environ.get('HF_HUB_CACHE') or os.path.join(
    os.environ.get('HF_HOME', '/app/.cache/huggingface'), 'hub'
)

try:
    print('📥 Baixando VINCIE-3B...')
    snapshot_download(
        repo_id='ByteDance-Seed/VINCIE-3B',
        local_dir=save_dir,
        cache_dir=cache_dir,
        #resume_download=True,
        #local_dir_use_symlinks=False
    )
    print('✅ Modelo ok')
except Exception as e:
    print(f'⚠️ Download falhou: {e}')
PY

# Make the checkpoint visible under the VINCIE source tree.
mkdir -p /app/VINCIE/ckpt
ln -sfn /app/ckpt/VINCIE-3B /app/VINCIE/ckpt/VINCIE-3B

#echo "[diag] Cache model dir: $CACHE_MODEL_DIR"
#ls -la "$CACHE_MODEL_DIR" || true
#echo "[diag] refs:"; ls -la "$CACHE_MODEL_DIR/refs" || true
#echo "[diag] snapshots:"; ls -la "$CACHE_MODEL_DIR/snapshots" || true
#echo "[diag] CKPT_DIR: $CKPT_DIR"; ls -la "$CKPT_DIR" || true

# ---------------------- Apex/Q8 builder ----------------------
# The builder only runs when nvidia-smi succeeds and DISABLE_BUILDER is "0"
# (the default). A builder failure or timeout is logged and ignored.
if nvidia-smi >/dev/null 2>&1; then
  if [[ "${DISABLE_BUILDER:-0}" == "0" ]]; then
    echo "Executando builder Apex/Q8..."
    chmod +x /app/builder.sh || true
    timeout "${BUILDER_TIMEOUT_SEC:-7200}" bash -lc /app/builder.sh \
      || echo "Builder excedeu tempo/erro, prosseguindo."
  else
    echo "Builder desabilitado por DISABLE_BUILDER=1"
  fi
else
  echo "GPU não visível, pulando builder Apex/Q8."
fi

# Pin triton and bitsandbytes. The uninstall step is allowed to fail (for
# example when the package is not installed); the install step is not.
pip uninstall -y triton || true
pip install -v --no-build-isolation triton==3.1.0

pip uninstall -y bitsandbytes || true
pip install bitsandbytes==0.43.1

# ---------------------- Diagnostics ----------------------
/app/info.sh || true
#ls -la /app || true
#ls -R /app | head -n 2000 || true

# ---------------------- Starting the service ----------------------
echo "🚀 Subindo serviços..."
# Tip: VINCIE_DIRECT_TO_CKPT=1 can be exported for an internal fallback.
# exec replaces this shell with the Python process, so signals sent to the
# script's PID are delivered to the application itself.
exec python /app/app_vince.py