Spaces:
Paused
Paused
File size: 5,267 Bytes
545445d bdec336 1c31d0f bdec336 1c31d0f 545445d bdec336 545445d bdec336 545445d bdec336 545445d 1c31d0f 545445d 316b67d b1804d0 1c31d0f 14c5fa4 b1804d0 1c31d0f b1804d0 1c31d0f 545445d 316b67d 545445d a817dc4 316b67d 545445d 316b67d 545445d cf55b31 fb56537 f21bddb e445ead 6dfd96b f21bddb a817dc4 0834a4b 1c31d0f bcf1a11 0834a4b 545445d 1c31d0f a817dc4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
#!/usr/bin/env bash
# Strict mode: stop on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Startup banner (three lines on stdout).
printf '%s\n' \
  "=======================================================" \
  " ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)" \
  "======================================================="
# ---------------------- Base environment ----------------------
# Overridable settings: a value already present (and non-empty) in the
# caller's environment is kept; otherwise the default written here is used.
: "${CUDA_VISIBLE_DEVICES:=0,1,2,3,4,5,6,7}"
: "${TORCH_DTYPE:=bfloat16}"
# SDPA / FlashAttention / xFormers toggles
: "${ENABLE_FLASH_SDP:=1}"
: "${ENABLE_MEMORY_EFFICIENT_SDP:=1}"
: "${ENABLE_MATH_SDP:=0}"
: "${FLASH_ATTENTION_DISABLE:=0}"
: "${XFORMERS_FORCE_DISABLE:=1}"
# CUDA connections and CPU thread counts
: "${CUDA_DEVICE_MAX_CONNECTIONS:=32}"
: "${OMP_NUM_THREADS:=8}"
: "${MKL_NUM_THREADS:=8}"
export CUDA_VISIBLE_DEVICES TORCH_DTYPE
export ENABLE_FLASH_SDP ENABLE_MEMORY_EFFICIENT_SDP ENABLE_MATH_SDP
export FLASH_ATTENTION_DISABLE XFORMERS_FORCE_DISABLE
export CUDA_DEVICE_MAX_CONNECTIONS OMP_NUM_THREADS MKL_NUM_THREADS

# Fixed settings: always overwritten, regardless of the caller's environment.
# CUDA baseline
export CUDA_MODULE_LOADING="LAZY"
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:512,garbage_collection_threshold:0.8"
# NCCL baseline
export NCCL_DEBUG="INFO"
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_P2P_DISABLE=0
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME="lo"
export NCCL_BLOCKING_WAIT=1
export TORCH_NCCL_BLOCKING_WAIT=1
export NCCL_TIMEOUT=600
# ---------------------- HF/torch cache persistence ----------------------
# Keep the Hugging Face and torch caches under /data when that directory
# exists; otherwise keep them under /app.
if [[ -d /data ]]; then
  export HF_HOME="/data/.cache/huggingface"
  export TORCH_HOME="/data/.cache/torch"
else
  export HF_HOME="/app/.cache/huggingface"
  export TORCH_HOME="/app/.cache/torch"
fi
# A caller-provided HF_HUB_CACHE wins; the default is "$HF_HOME/hub".
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
mkdir -p "$HF_HUB_CACHE" "$TORCH_HOME"
mkdir -p /app/.cache

# Make the cache reachable at the fixed path /app/.cache/huggingface.
#  - When HF_HOME already *is* that path there is nothing to link. A plain
#    `ln -sf` would follow the existing directory and drop a self-referential
#    link inside it (<dir>/huggingface -> <dir>).
#  - `-n` replaces an existing symlink in place instead of following it and
#    nesting a new link inside its target on every start.
#  - A real directory at the link path is left alone.
HF_COMPAT_LINK="/app/.cache/huggingface"
# Remove the nested self-link that a plain `ln -sf` leaves behind.
stale_hf_link="$HF_HOME/huggingface"
if [[ -L "$stale_hf_link" && "$(readlink -- "$stale_hf_link")" == "$HF_HOME" ]]; then
  rm -f -- "$stale_hf_link"
fi
if [[ "$HF_HOME" != "$HF_COMPAT_LINK" ]]; then
  if [[ -L "$HF_COMPAT_LINK" || ! -e "$HF_COMPAT_LINK" ]]; then
    ln -sfn "$HF_HOME" "$HF_COMPAT_LINK"
  else
    echo "[cache] $HF_COMPAT_LINK já existe como diretório real; link para $HF_HOME não criado." >&2
  fi
fi

# NOTE(review): presumably unset so the legacy variable cannot redirect the
# cache away from HF_HOME — confirm.
unset TRANSFORMERS_CACHE
export HF_HUB_ENABLE_HF_TRANSFER=1
export HF_HUB_DOWNLOAD_TIMEOUT=60

# Model coordinates used by the download steps below.
MODEL_REPO="ByteDance-Seed/VINCIE-3B"
# Hub cache layout: "models--<org>--<name>" (every "/" becomes "--").
CACHE_MODEL_DIR="$HF_HUB_CACHE/models--${MODEL_REPO//\//--}"
CKPT_DIR="/app/ckpt/VINCIE-3B"
mkdir -p "$CKPT_DIR"
# ---------------------- Download: structured cache or direct ----------------------
# DIRECT_TO_CKPT=1 -> download the snapshot straight into $CKPT_DIR.
# anything else    -> populate the structured hub cache instead (default).
# The flag is compared as a string so a non-numeric value (e.g. "true") takes
# the default branch without a "[: integer expression expected" error.
# MODEL_REPO / CKPT_DIR are handed to Python through the environment because
# the heredocs are quoted (no shell expansion inside them); this keeps the
# Python side in agreement with the messages printed here.
if [[ "${DIRECT_TO_CKPT:-0}" == "1" ]]; then
  echo "[direct] Baixando ${MODEL_REPO} diretamente para $CKPT_DIR"
  MODEL_REPO="$MODEL_REPO" CKPT_DIR="$CKPT_DIR" python - <<'PY'
import os
from pathlib import Path

from huggingface_hub import snapshot_download

repo_id = os.environ.get("MODEL_REPO", "ByteDance-Seed/VINCIE-3B")
ckpt_dir = Path(os.environ.get("CKPT_DIR", "/app/ckpt/VINCIE-3B"))
ckpt_dir.mkdir(parents=True, exist_ok=True)
# Either variable may carry the access token; None means anonymous access.
token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_TOKEN")
snapshot_download(
    repo_id=repo_id,
    local_dir=str(ckpt_dir),
    local_dir_use_symlinks=False,
    resume_download=True,
    token=token,
)
print("[direct] Snapshot materializado em", ckpt_dir)
PY
else
  echo "Verificando snapshot do ${MODEL_REPO} no cache..."
  MODEL_REPO="$MODEL_REPO" python - <<'PY'
import os
from pathlib import Path

from huggingface_hub import snapshot_download

hf_home = os.environ.get("HF_HOME", "/app/.cache/huggingface")
# Honour the HF_HUB_CACHE exported by the shell so the download lands in the
# same directory the script uses for CACHE_MODEL_DIR; fall back to
# "$HF_HOME/hub" when it is not set.
cache_dir = os.environ.get("HF_HUB_CACHE") or os.path.join(hf_home, "hub")
os.makedirs(cache_dir, exist_ok=True)
repo_id = os.environ.get("MODEL_REPO", "ByteDance-Seed/VINCIE-3B")
# Either variable may carry the access token; None means anonymous access.
token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_TOKEN")
snapshot_download(
    repo_id=repo_id,
    cache_dir=cache_dir,
    resume_download=True,
    local_dir_use_symlinks=False,
    token=token,
)
# Cache folder of this repo; only used by the optional diagnostics below.
mcache = Path(cache_dir) / ("models--" + repo_id.replace("/", "--"))
#print("[cache] Estrutura em:", mcache)
#print("[cache] refs:", list((mcache/"refs").glob("*")))
#print("[cache] snapshots:", [p.name for p in (mcache/"snapshots").glob("*") if p.is_dir()])
PY
fi
# Materialise the checkpoint at the local checkpoint directory and expose it
# under /app/VINCIE/ckpt. The download is best-effort: a failure is reported
# and the script carries on.
#
# `cache_dir` must be defined before it is passed to snapshot_download;
# otherwise the call raises NameError, which the broad `except` reports as a
# download failure, and nothing is ever downloaded by this step.
ckpt_target="${CKPT_DIR:-/app/ckpt/VINCIE-3B}"
MODEL_REPO="${MODEL_REPO:-ByteDance-Seed/VINCIE-3B}" \
CKPT_DIR="$ckpt_target" \
python3 - <<'PY'
import os

from huggingface_hub import snapshot_download

repo_id = os.environ.get('MODEL_REPO', 'ByteDance-Seed/VINCIE-3B')
save_dir = os.environ.get('CKPT_DIR', '/app/ckpt/VINCIE-3B')
# Same cache location as the rest of the script: HF_HUB_CACHE when set,
# otherwise "$HF_HOME/hub".
hf_home = os.environ.get('HF_HOME', '/app/.cache/huggingface')
cache_dir = os.environ.get('HF_HUB_CACHE') or os.path.join(hf_home, 'hub')
# Either variable may carry the access token; None means anonymous access.
token = os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_TOKEN')
os.makedirs(save_dir, exist_ok=True)
try:
    print('📥 Baixando VINCIE-3B...')
    snapshot_download(
        repo_id=repo_id,
        local_dir=save_dir,
        cache_dir=cache_dir,
        token=token,
        #resume_download=True,
        #local_dir_use_symlinks=False
    )
    print('✅ Modelo ok')
except Exception as e:
    # Deliberate best-effort: report and let the startup continue.
    print(f'⚠️ Download falhou: {e}')
PY
mkdir -p /app/VINCIE/ckpt
# -n: replace an existing link in place instead of following it.
ln -sfn "$ckpt_target" /app/VINCIE/ckpt/VINCIE-3B
#echo "[diag] Cache model dir: $CACHE_MODEL_DIR"
#ls -la "$CACHE_MODEL_DIR" || true
#echo "[diag] refs:"; ls -la "$CACHE_MODEL_DIR/refs" || true
#echo "[diag] snapshots:"; ls -la "$CACHE_MODEL_DIR/snapshots" || true
#echo "[diag] CKPT_DIR: $CKPT_DIR"; ls -la "$CKPT_DIR" || true
# ---------------------- Apex/Q8 builder ----------------------
# Runs /app/builder.sh under a time limit (BUILDER_TIMEOUT_SEC, default 7200)
# only when nvidia-smi succeeds and DISABLE_BUILDER is 0 (the default).
# A builder failure or timeout is reported and startup continues.
if ! nvidia-smi >/dev/null 2>&1; then
  echo "GPU não visível, pulando builder Apex/Q8."
elif ! [ "${DISABLE_BUILDER:-0}" -eq 0 ]; then
  echo "Builder desabilitado por DISABLE_BUILDER=1"
else
  echo "Executando builder Apex/Q8..."
  # Best-effort: the script is invoked through bash below either way.
  chmod +x /app/builder.sh || true
  timeout "${BUILDER_TIMEOUT_SEC:-7200}" bash -lc /app/builder.sh ||
    echo "Builder excedeu tempo/erro, prosseguindo."
fi
# Force exact versions of triton and bitsandbytes.
# pin_package NAME VERSION [PIP_INSTALL_FLAGS...]
#   Uninstalls NAME (ignoring a failed uninstall), then installs NAME==VERSION
#   with any extra flags placed before the requirement. A failed install is
#   not masked.
pin_package() {
  local name="$1" version="$2"
  shift 2
  pip uninstall -y "$name" || true
  pip install "$@" "${name}==${version}"
}
pin_package triton 3.1.0 -v --no-build-isolation
pin_package bitsandbytes 0.43.1
# ---------------------- Diagnostics ----------------------
# Best-effort: a missing or failing info script must not block startup.
/app/info.sh || true
#ls -la /app || true
#ls -R /app | head -n 2000 || true
# ---------------------- Starting the service ----------------------
echo "🚀 Subindo serviços..."
# Tip: VINCIE_DIRECT_TO_CKPT=1 may be exported for the internal fallback.
# `exec` replaces this shell with the service process, so signals sent to the
# script (e.g. SIGTERM on shutdown) reach the service directly and its exit
# status becomes the script's exit status.
exec python /app/app_vince.py
|