import os
import json

from huggingface_hub import create_repo, upload_file
from tokenizers import Tokenizer, pre_tokenizers, decoders, processors
# tokenizers has no SentencePiece model class; the spm vocab is rebuilt below
# as a Unigram model instead (requires a reasonably recent tokenizers release).
from tokenizers.models import Unigram
import sentencepiece as spm

username = "Hanbiike"
model_folder = "models"
graph_file = "graph.jpg"
readme_file = "README.md"
special_tokens_file = "special_tokens_map.json"


def generate_tokenizer_config(model_type: str, tokenizer_file: str) -> dict:
    return {
        "model_type": model_type,
        # SentencePiece default special tokens
        "unk_token": "<unk>",
        "bos_token": "<s>",
        "eos_token": "</s>",
        "pad_token": "<pad>",
        "tokenizer_class": "PreTrainedTokenizerFast",
        "tokenizer_file": tokenizer_file
    }


# 📁 Collect all .model files
model_files = [f for f in os.listdir(model_folder) if f.endswith(".model")]

# 📖 Load the special-tokens map
special_tokens = {}
special_token_ids = {}
if os.path.exists(special_tokens_file):
    with open(special_tokens_file, "r", encoding="utf-8") as f:
        special_tokens = json.load(f)
    for token_type, token in special_tokens.items():
        special_token_ids[token] = None  # IDs are resolved later via spm

for model_file in model_files:
    model_name = model_file.replace(".model", "")
    vocab_file = model_name + ".vocab"
    repo_id = f"{username}/{model_name}"

    print(f"\n📦 Creating repository: {repo_id}")
    create_repo(repo_id, repo_type="model", exist_ok=True)

    # ✅ Upload the .model file
    upload_file(
        path_or_fileobj=os.path.join(model_folder, model_file),
        path_in_repo=model_file,
        repo_id=repo_id,
        repo_type="model"
    )

    # ✅ Upload the .vocab file (if present)
    vocab_path = os.path.join(model_folder, vocab_file)
    if os.path.exists(vocab_path):
        upload_file(
            path_or_fileobj=vocab_path,
            path_in_repo=vocab_file,
            repo_id=repo_id,
            repo_type="model"
        )

    # ✅ Upload graph.jpg
    if os.path.exists(graph_file):
        upload_file(
            path_or_fileobj=graph_file,
            path_in_repo="graph.jpg",
            repo_id=repo_id,
            repo_type="model"
        )

    # ✅ Upload special_tokens_map.json
    if os.path.exists(special_tokens_file):
        upload_file(
            path_or_fileobj=special_tokens_file,
            path_in_repo="special_tokens_map.json",
            repo_id=repo_id,
            repo_type="model"
        )

    # ✅ Generate tokenizer_config.json
    model_type = "bpe" if "bpe" in model_name.lower() else "unigram"
    # PreTrainedTokenizerFast loads the converted tokenizer.json, not the raw .model
    tokenizer_config = generate_tokenizer_config(model_type, "tokenizer.json")
    config_path = "tokenizer_config.json"
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(tokenizer_config, f, indent=2, ensure_ascii=False)
    upload_file(
        path_or_fileobj=config_path,
        path_in_repo="tokenizer_config.json",
        repo_id=repo_id,
        repo_type="model"
    )

    # ✅ Generate tokenizer.json (meant to cover both BPE and Unigram models)
    try:
        sp_model_path = os.path.join(model_folder, model_file)
        sp = spm.SentencePieceProcessor()
        sp.load(sp_model_path)

        # Resolve the IDs of the special tokens
        for token in special_token_ids:
            try:
                special_token_ids[token] = sp.piece_to_id(token)
            except Exception:
                special_token_ids[token] = 0  # fallback

        # Rebuild the spm vocab as a Unigram model (pieces + scores). This is
        # faithful for unigram-trained .model files; BPE-trained files get only
        # an approximation and would need a dedicated converter.
        vocab = [(sp.id_to_piece(i), sp.get_score(i)) for i in range(sp.get_piece_size())]
        tokenizer = Tokenizer(Unigram(vocab, unk_id=sp.unk_id()))
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        tokenizer.decoder = decoders.Replace("▁", " ")

        bos = special_tokens.get("bos_token", "<s>")
        eos = special_tokens.get("eos_token", "</s>")
        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{bos} $A {eos}",
            pair=f"{bos} $A {eos} {bos} $B {eos}",
            special_tokens=[
                (bos, special_token_ids.get(bos, 1)),
                (eos, special_token_ids.get(eos, 2))
            ]
        )
        tokenizer.enable_truncation(max_length=512)

        tokenizer_path = "tokenizer.json"
        tokenizer.save(tokenizer_path)
        upload_file(
            path_or_fileobj=tokenizer_path,
            path_in_repo="tokenizer.json",
            repo_id=repo_id,
            repo_type="model"
        )
    except Exception as e:
        print(f"⚠️ Failed to build tokenizer.json for {model_name}: {e}")

    print(f"✅ Uploaded: {repo_id}")
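
# Usage sketch (assumption, not part of the upload flow): once a repo contains
# tokenizer.json and tokenizer_config.json, it should be loadable downstream
# with transformers' PreTrainedTokenizerFast. The repo id below is illustrative;
# replace <model_name> with the stem of one of the uploaded .model files.
#
#   from transformers import PreTrainedTokenizerFast
#   tok = PreTrainedTokenizerFast.from_pretrained("Hanbiike/<model_name>")
#   print(tok.tokenize("example text"))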