# Upload the SentencePiece tokenizer models in `models/` to the Hugging Face Hub,
# generating a tokenizer_config.json and a fast tokenizer.json for each one.
import os
import json

from huggingface_hub import create_repo, upload_file
from tokenizers import Tokenizer, pre_tokenizers, decoders, processors
# tokenizers.models does not provide a SentencePiece model; the SentencePiece
# files are converted to a Unigram model further down.
from tokenizers.models import Unigram
import sentencepiece as spm
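
# Hub namespace and local artifact paths. Note that readme_file is declared
# here but not referenced further down.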
username = "Hanbiike"
model_folder = "models"
graph_file = "graph.jpg"
readme_file = "README.md"
special_tokens_file = "special_tokens_map.json"
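
# Minimal tokenizer_config.json payload: it tells transformers to load the
# uploaded tokenizer.json through PreTrainedTokenizerFast and which special
# tokens to use.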
def generate_tokenizer_config(model_type: str, model_file: str) -> dict:
    return {
        "model_type": model_type,
        "unk_token": "<unk>",
        "bos_token": "<s>",
        "eos_token": "</s>",
        "pad_token": "<pad>",
        "tokenizer_class": "PreTrainedTokenizerFast",
        "tokenizer_file": model_file
    }

model_files = [f for f in os.listdir(model_folder) if f.endswith(".model")]
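
# Load the special-token map, if present; the token ids are resolved against
# each SentencePiece model inside the loop below.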
# special_tokens stays an empty dict when no map file exists, so the
# post-processing template below falls back to the default <s>/</s> markers.
special_tokens = {}
special_token_ids = {}
if os.path.exists(special_tokens_file):
    with open(special_tokens_file, "r", encoding="utf-8") as f:
        special_tokens = json.load(f)
    for token_type, token in special_tokens.items():
        special_token_ids[token] = None
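
# One Hub repository per model: upload the raw SentencePiece artifacts first,
# then the generated transformers-compatible files.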
for model_file in model_files:
    model_name = model_file.replace(".model", "")
    vocab_file = model_name + ".vocab"
    repo_id = f"{username}/{model_name}"

    print(f"\n📦 Creating repository: {repo_id}")
    create_repo(repo_id, repo_type="model", exist_ok=True)
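
    # Upload the raw .model file, plus its .vocab listing when it exists.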
    upload_file(
        path_or_fileobj=os.path.join(model_folder, model_file),
        path_in_repo=model_file,
        repo_id=repo_id,
        repo_type="model"
    )

    vocab_path = os.path.join(model_folder, vocab_file)
    if os.path.exists(vocab_path):
        upload_file(
            path_or_fileobj=vocab_path,
            path_in_repo=vocab_file,
            repo_id=repo_id,
            repo_type="model"
        )
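
    # Shared extras: attach the training graph and the special-token map to
    # every repository when they exist locally.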
    if os.path.exists(graph_file):
        upload_file(
            path_or_fileobj=graph_file,
            path_in_repo="graph.jpg",
            repo_id=repo_id,
            repo_type="model"
        )

    if os.path.exists(special_tokens_file):
        upload_file(
            path_or_fileobj=special_tokens_file,
            path_in_repo="special_tokens_map.json",
            repo_id=repo_id,
            repo_type="model"
        )
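
    # Infer the training algorithm from the file name, then write and upload
    # tokenizer_config.json.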
    model_type = "bpe" if "bpe" in model_name.lower() else "unigram"
    tokenizer_config = generate_tokenizer_config(model_type, model_file)

    config_path = "tokenizer_config.json"
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer_config, f, indent=2, ensure_ascii=False)

    upload_file(
        path_or_fileobj=config_path,
        path_in_repo="tokenizer_config.json",
        repo_id=repo_id,
        repo_type="model"
    )
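
    # Convert the SentencePiece model into a fast-tokenizer tokenizer.json.
    # Failures here are non-fatal: the raw .model file was already uploaded.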
    try:
        sp_model_path = os.path.join(model_folder, model_file)
        sp = spm.SentencePieceProcessor()
        sp.load(sp_model_path)

        # Resolve special tokens to this model's piece ids; fall back to 0
        # (normally <unk>) when a piece cannot be looked up.
        for token in special_token_ids:
            try:
                special_token_ids[token] = sp.piece_to_id(token)
            except Exception:
                special_token_ids[token] = 0

        # Rebuild the vocabulary as (piece, score) pairs and wrap it in a
        # Unigram model; this assumes the model was trained with SentencePiece's
        # default unigram algorithm (a BPE-trained model would need a different
        # conversion). Metaspace mirrors SentencePiece's "▁" whitespace
        # convention on both the encoding and decoding side.
        vocab = [(sp.id_to_piece(i), sp.get_score(i)) for i in range(sp.get_piece_size())]
        tokenizer = Tokenizer(Unigram(vocab, unk_id=sp.unk_id()))
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()
        tokenizer.decoder = decoders.Metaspace()
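
        # Add BOS/EOS around single sequences and pairs, using the ids resolved
        # above (falling back to the conventional 1/2 when unresolved).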
        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{special_tokens.get('bos_token', '<s>')} $A {special_tokens.get('eos_token', '</s>')}",
            pair=f"{special_tokens.get('bos_token', '<s>')} $A {special_tokens.get('eos_token', '</s>')} {special_tokens.get('bos_token', '<s>')} $B {special_tokens.get('eos_token', '</s>')}",
            special_tokens=[
                (special_tokens.get("bos_token", "<s>"), special_token_ids.get(special_tokens.get("bos_token", "<s>"), 1)),
                (special_tokens.get("eos_token", "</s>"), special_token_ids.get(special_tokens.get("eos_token", "</s>"), 2))
            ]
        )

        tokenizer.enable_truncation(max_length=512)
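
        # Serialize the tokenizer and push it to the same repository.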
        tokenizer_path = "tokenizer.json"
        tokenizer.save(tokenizer_path)

        upload_file(
            path_or_fileobj=tokenizer_path,
            path_in_repo="tokenizer.json",
            repo_id=repo_id,
            repo_type="model"
        )
    except Exception as e:
        print(f"⚠️ Could not create tokenizer.json for {model_name}: {e}")

    print(f"✅ Uploaded: {repo_id}")
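
# Example (illustrative sketch, not executed by this script): a consumer can
# load the uploaded files back through transformers' fast-tokenizer wrapper:
#
#   from transformers import PreTrainedTokenizerFast
#   tok = PreTrainedTokenizerFast.from_pretrained("Hanbiike/<model_name>")
#   print(tok.tokenize("example text"))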