# kyrgyz_spm_tokenizer / upload_models.py
import os
import json
from huggingface_hub import create_repo, upload_file
from tokenizers import pre_tokenizers, decoders, processors
from tokenizers.implementations import SentencePieceUnigramTokenizer
import sentencepiece as spm
username = "Hanbiike"
model_folder = "models"
graph_file = "graph.jpg"
readme_file = "README.md"
special_tokens_file = "special_tokens_map.json"
def generate_tokenizer_config(model_type: str, model_file: str) -> dict:
    """Build a minimal tokenizer_config.json payload for one model."""
    return {
        "model_type": model_type,
        "unk_token": "<unk>",
        "bos_token": "<s>",
        "eos_token": "</s>",
        "pad_token": "<pad>",
        "tokenizer_class": "PreTrainedTokenizerFast",
        "tokenizer_file": model_file
    }
# 📁 Collect all .model files
model_files = [f for f in os.listdir(model_folder) if f.endswith(".model")]
# 📖 Load the special tokens map
special_tokens = {}
special_token_ids = {}
if os.path.exists(special_tokens_file):
    with open(special_tokens_file, "r", encoding="utf-8") as f:
        special_tokens = json.load(f)
    for token_type, token in special_tokens.items():
        special_token_ids[token] = None  # the ID is resolved later via spm
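# For reference, a special_tokens_map.json matching the defaults assumed elsewhere in this
# script might look roughly like this (illustrative sketch, not the actual file contents):
# {
#   "unk_token": "<unk>",
#   "bos_token": "<s>",
#   "eos_token": "</s>",
#   "pad_token": "<pad>"
# }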
for model_file in model_files:
    model_name = model_file.replace(".model", "")
    vocab_file = model_name + ".vocab"
    repo_id = f"{username}/{model_name}"
    print(f"\n📦 Creating repository: {repo_id}")
    create_repo(repo_id, repo_type="model", exist_ok=True)

    # ✅ Upload the .model file
    upload_file(
        path_or_fileobj=os.path.join(model_folder, model_file),
        path_in_repo=model_file,
        repo_id=repo_id,
        repo_type="model"
    )
    # ✅ Upload the .vocab file (if present)
    vocab_path = os.path.join(model_folder, vocab_file)
    if os.path.exists(vocab_path):
        upload_file(
            path_or_fileobj=vocab_path,
            path_in_repo=vocab_file,
            repo_id=repo_id,
            repo_type="model"
        )
    # ✅ Upload graph.jpg
    if os.path.exists(graph_file):
        upload_file(
            path_or_fileobj=graph_file,
            path_in_repo="graph.jpg",
            repo_id=repo_id,
            repo_type="model"
        )
    # ✅ Upload special_tokens_map.json
    if os.path.exists(special_tokens_file):
        upload_file(
            path_or_fileobj=special_tokens_file,
            path_in_repo="special_tokens_map.json",
            repo_id=repo_id,
            repo_type="model"
        )
    # ✅ Generate and upload tokenizer_config.json
    model_type = "bpe" if "bpe" in model_name.lower() else "unigram"
    tokenizer_config = generate_tokenizer_config(model_type, model_file)
    config_path = "tokenizer_config.json"
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer_config, f, indent=2, ensure_ascii=False)
    upload_file(
        path_or_fileobj=config_path,
        path_in_repo="tokenizer_config.json",
        repo_id=repo_id,
        repo_type="model"
    )
    # ✅ Generate tokenizer.json. The tokenizers library has no generic SentencePiece model
    # class, so the .model file is converted via SentencePieceUnigramTokenizer.from_spm();
    # from_spm() only handles unigram-trained models, so BPE .model files fall through to
    # the warning below.
    try:
        sp_model_path = os.path.join(model_folder, model_file)
        sp = spm.SentencePieceProcessor()
        sp.load(sp_model_path)

        # Resolve the IDs of the special tokens from the SentencePiece model
        for token in special_token_ids:
            try:
                special_token_ids[token] = sp.piece_to_id(token)
            except Exception:
                special_token_ids[token] = 0  # fallback

        # Underlying tokenizers.Tokenizer built from the SentencePiece unigram model
        tokenizer = SentencePieceUnigramTokenizer.from_spm(sp_model_path)._tokenizer
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        tokenizer.decoder = decoders.Replace("▁", " ")

        bos = special_tokens.get("bos_token", "<s>")
        eos = special_tokens.get("eos_token", "</s>")
        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{bos} $A {eos}",
            pair=f"{bos} $A {eos} {bos} $B {eos}",
            special_tokens=[
                (bos, special_token_ids.get(bos, 1)),
                (eos, special_token_ids.get(eos, 2))
            ]
        )
        tokenizer.enable_truncation(max_length=512)

        tokenizer_path = "tokenizer.json"
        tokenizer.save(tokenizer_path)
        upload_file(
            path_or_fileobj=tokenizer_path,
            path_in_repo="tokenizer.json",
            repo_id=repo_id,
            repo_type="model"
        )
    except Exception as e:
        print(f"⚠️ Could not build tokenizer.json for {model_name}: {e}")

    print(f"✅ Uploaded: {repo_id}")