Aduc-sdr-2_5 / start.sh
carlex3321's picture
Update start.sh
6dfd96b verified
#!/usr/bin/env bash
set -euo pipefail

# Startup banner (three lines on stdout).
printf '%s\n' \
  "=======================================================" \
  " ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)" \
  "======================================================="

# ---------------------- Base environment ----------------------
# Overridable settings: a non-empty value already present in the environment
# wins; otherwise the default written here is assigned. They are exported
# together further down.
: "${CUDA_VISIBLE_DEVICES:=0,1,2,3,4,5,6,7}"
: "${TORCH_DTYPE:=bfloat16}"

# SDPA / FlashAttention / xFormers toggles.
: "${ENABLE_FLASH_SDP:=1}"
: "${ENABLE_MEMORY_EFFICIENT_SDP:=1}"
: "${ENABLE_MATH_SDP:=0}"
: "${FLASH_ATTENTION_DISABLE:=0}"
: "${XFORMERS_FORCE_DISABLE:=1}"

# CUDA connection count and CPU thread pools.
: "${CUDA_DEVICE_MAX_CONNECTIONS:=32}"
: "${OMP_NUM_THREADS:=8}"
: "${MKL_NUM_THREADS:=8}"

export CUDA_VISIBLE_DEVICES TORCH_DTYPE
export ENABLE_FLASH_SDP ENABLE_MEMORY_EFFICIENT_SDP ENABLE_MATH_SDP
export FLASH_ATTENTION_DISABLE XFORMERS_FORCE_DISABLE
export CUDA_DEVICE_MAX_CONNECTIONS OMP_NUM_THREADS MKL_NUM_THREADS

# Fixed CUDA / allocator baseline: always set, regardless of what the caller
# had in the environment.
export CUDA_MODULE_LOADING="LAZY"
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:512,garbage_collection_threshold:0.8"

# Fixed NCCL baseline: verbose logging, P2P left enabled, InfiniBand disabled,
# socket interface pinned to loopback, blocking wait with a 600 timeout value.
export NCCL_DEBUG="INFO"
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_P2P_DISABLE=0
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME="lo"
export NCCL_BLOCKING_WAIT=1
export TORCH_NCCL_BLOCKING_WAIT=1
export NCCL_TIMEOUT=600
# ---------------------- HF/torch cache persistence ----------------------
# Use the /data volume for caches when that directory exists; otherwise keep
# them under /app.
if [ -d /data ]; then
  export HF_HOME="/data/.cache/huggingface"
  export TORCH_HOME="/data/.cache/torch"
else
  export HF_HOME="/app/.cache/huggingface"
  export TORCH_HOME="/app/.cache/torch"
fi
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
mkdir -p "$HF_HUB_CACHE" "$TORCH_HOME"
mkdir -p /app/.cache

#######################################
# Point a symlink at the Hugging Face home directory without ever nesting a
# link inside an existing directory.
# Arguments:
#   $1 - directory the link should resolve to
#   $2 - path of the symlink to create or refresh
# Outputs:
#   A warning on stderr when $2 is a real (non-symlink) path and is left alone.
# Returns:
#   0 when the link is in place or was deliberately skipped; otherwise the
#   exit status of ln.
#######################################
link_hf_home() {
  local target=$1
  local link=$2

  # Same path (or already resolving to the same directory): a plain
  # `ln -sf` here would drop a self-referential link inside the directory.
  if [ "$target" = "$link" ] || [ "$target" -ef "$link" ]; then
    return 0
  fi

  # A real file/directory at the link path would make ln create the link
  # inside it; leave the existing data untouched and report it instead.
  if [ -e "$link" ] && [ ! -L "$link" ]; then
    printf 'warn: %s exists and is not a symlink; not linking it to %s\n' \
      "$link" "$target" >&2
    return 0
  fi

  # -n replaces an existing symlink-to-directory instead of descending into it.
  ln -sfn -- "$target" "$link"
}

link_hf_home "$HF_HOME" /app/.cache/huggingface

unset TRANSFORMERS_CACHE
export HF_HUB_ENABLE_HF_TRANSFER=1
export HF_HUB_DOWNLOAD_TIMEOUT=60

# Model identity and on-disk locations used by the download steps.
MODEL_REPO="ByteDance-Seed/VINCIE-3B"
CACHE_MODEL_DIR="$HF_HUB_CACHE/models--ByteDance-Seed--VINCIE-3B"
CKPT_DIR="/app/ckpt/VINCIE-3B"
mkdir -p "$CKPT_DIR"
# ---------------------- Download: structured cache or direct ----------------------
# DIRECT_TO_CKPT=1 downloads the snapshot straight into $CKPT_DIR.
# Any other value makes sure the snapshot is present in the HF hub cache.
# MODEL_REPO / CKPT_DIR are handed to Python through the environment so the
# shell variables stay the single source of truth; the literals in the Python
# code are only fallbacks for when those variables are empty.
if [ "${DIRECT_TO_CKPT:-0}" -eq 1 ]; then
  echo "[direct] Baixando ${MODEL_REPO} diretamente para $CKPT_DIR"
  MODEL_REPO="$MODEL_REPO" CKPT_DIR="$CKPT_DIR" python - <<'PY'
import os
from pathlib import Path

from huggingface_hub import snapshot_download

repo_id = os.environ.get("MODEL_REPO") or "ByteDance-Seed/VINCIE-3B"
ckpt_dir = Path(os.environ.get("CKPT_DIR") or "/app/ckpt/VINCIE-3B")
ckpt_dir.mkdir(parents=True, exist_ok=True)

# Either variable name may carry the access token; None means anonymous.
token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_TOKEN")

snapshot_download(
    repo_id=repo_id,
    local_dir=str(ckpt_dir),
    local_dir_use_symlinks=False,
    resume_download=True,
    token=token,
)
print("[direct] Snapshot materializado em", ckpt_dir)
PY
else
  echo "Verificando snapshot do ${MODEL_REPO} no cache..."
  MODEL_REPO="$MODEL_REPO" python - <<'PY'
import os

from huggingface_hub import snapshot_download

# Honor the HF_HUB_CACHE exported by the shell; fall back to HF_HOME/hub only
# when it is missing, so both sides agree on where the cache lives.
hf_home = os.environ.get("HF_HOME", "/app/.cache/huggingface")
cache_dir = os.environ.get("HF_HUB_CACHE") or os.path.join(hf_home, "hub")
os.makedirs(cache_dir, exist_ok=True)

repo_id = os.environ.get("MODEL_REPO") or "ByteDance-Seed/VINCIE-3B"
token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_TOKEN")

snapshot_download(
    repo_id=repo_id,
    cache_dir=cache_dir,
    resume_download=True,
    local_dir_use_symlinks=False,
    token=token,
)
PY
fi
# Best-effort materialization of the checkpoint under /app/ckpt/VINCIE-3B.
# A failed download is reported but does not stop the startup sequence.
python3 - <<'PY'
import os

from huggingface_hub import snapshot_download

save_dir = '/app/ckpt/VINCIE-3B'
os.makedirs(save_dir, exist_ok=True)

# cache_dir was previously referenced without being defined, so every run
# ended in a swallowed NameError. Resolve it from the exported environment.
cache_dir = os.environ.get('HF_HUB_CACHE') or os.path.join(
    os.environ.get('HF_HOME', '/app/.cache/huggingface'), 'hub'
)
# Same token lookup as the other download steps of this script.
token = os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_TOKEN')

try:
    print('📥 Baixando VINCIE-3B...')
    snapshot_download(
        repo_id='ByteDance-Seed/VINCIE-3B',
        local_dir=save_dir,
        cache_dir=cache_dir,
        token=token,
    )
    print('✅ Modelo ok')
except Exception as e:
    # Deliberately non-fatal: later steps still run when the download fails.
    print(f'⚠️ Download falhou: {e}')
PY

# Expose the checkpoint at the path inside the VINCIE tree.
mkdir -p /app/VINCIE/ckpt
ln -sfn /app/ckpt/VINCIE-3B /app/VINCIE/ckpt/VINCIE-3B
#echo "[diag] Cache model dir: $CACHE_MODEL_DIR"
#ls -la "$CACHE_MODEL_DIR" || true
#echo "[diag] refs:"; ls -la "$CACHE_MODEL_DIR/refs" || true
#echo "[diag] snapshots:"; ls -la "$CACHE_MODEL_DIR/snapshots" || true
#echo "[diag] CKPT_DIR: $CKPT_DIR"; ls -la "$CKPT_DIR" || true
# ---------------------- Apex/Q8 builder ----------------------
# Runs /app/builder.sh only when nvidia-smi succeeds and DISABLE_BUILDER
# compares equal to 0. The run is capped by BUILDER_TIMEOUT_SEC (default 7200)
# and a failure or timeout is logged without aborting the script.
if ! nvidia-smi >/dev/null 2>&1; then
  echo "GPU não visível, pulando builder Apex/Q8."
elif ! [ "${DISABLE_BUILDER:-0}" -eq 0 ]; then
  echo "Builder desabilitado por DISABLE_BUILDER=1"
else
  echo "Executando builder Apex/Q8..."
  # Best effort: a chmod failure must not stop the startup.
  chmod +x /app/builder.sh || true
  if ! timeout "${BUILDER_TIMEOUT_SEC:-7200}" bash -lc /app/builder.sh; then
    echo "Builder excedeu tempo/erro, prosseguindo."
  fi
fi
# Pin triton to 3.1.0: removing an existing copy is best effort, the install
# itself is not.
pip uninstall -y triton || true
pip install -v --no-build-isolation triton==3.1.0

# Pin bitsandbytes to 0.43.1 in the same way.
pip uninstall -y bitsandbytes || true
pip install bitsandbytes==0.43.1
# ---------------------- Diagnostics ----------------------
# Best effort: a failing info script never blocks the launch.
/app/info.sh || :
#ls -la /app || true
#ls -R /app | head -n 2000 || true

# ---------------------- Starting the service ----------------------
printf '%s\n' "🚀 Subindo serviços..."
# Tip: VINCIE_DIRECT_TO_CKPT=1 can be exported for an internal fallback.
python /app/app_vince.py