---
license: mit
datasets:
- starhopp3r/TinyChat
- agentlans/multiturn-chat
language:
- en
base_model:
- SubhrajitSain/anwgpt2-355m
pipeline_tag: question-answering
library_name: transformers
tags:
- conversational
- anwgpt
- anw
---

# ANWGPT3 (anwgpt3-355m)

Conversational version of ANWGPT2 ([anwgpt2-355m](https://huggingface.co/SubhrajitSain/anwgpt2-355m)).

- **Developed by:** Subhrajit Sain (a.k.a. ANW)
- **Funded by:** no one
- **Contributors:** [FlameF0X](https://huggingface.co/FlameF0X)
- **Model type:** text generator / question answering
- **Language (NLP):** English
- **License:** MIT
- **Finetuned from model:** SubhrajitSain/anwgpt2-355m

---

### Requirements (Python)

```requirements
torch==2.3.1
torchvision==0.18.1
torchaudio==2.3.1
transformers==4.41.2
peft==0.10.0
accelerate==0.29.3
datasets==2.19.0
trl==0.8.6
bitsandbytes==0.43.1
```

---

### Custom Inference Code

```python
# =================================================================================================
# ANWGPT3 Inference Code - by ANW
# =================================================================================================

import torch
import gc
import time

from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate.utils import load_checkpoint_in_model
from huggingface_hub import snapshot_download

model_hub_id = "SubhrajitSain/anwgpt3-355m"
base_model_name = "SubhrajitSain/anwgpt2-355m"

if torch.cuda.is_available():
    device = "cuda"
    print("Using GPU. Loading model structure (FP16).")
    load_kwargs = {}
else:
    device = "cpu"
    print("Using CPU. Inference will be slow.")
    load_kwargs = {}

# Download the fine-tuned checkpoint files into the local Hugging Face cache.
print(f"Downloading checkpoint files for {model_hub_id} to local cache...")
local_checkpoint_path = snapshot_download(repo_id=model_hub_id)

print(f"Loading tokenizer from: {model_hub_id}...")
tokenizer = AutoTokenizer.from_pretrained(model_hub_id, use_fast=False)
vocab_size = len(tokenizer)

# Simplified chat template: concatenate message contents without role tags.
clean_template = (
    "{% for message in messages %}"
    "{{ message['content'] | trim }}\n"
    "{% endfor %}"
)
tokenizer.chat_template = clean_template
print("Applied chat template modification: Removed role tags in input.")

# Stop generation at either the EOS token or the <|im_end|> marker.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|im_end|>")
]

print(f"Loading base model structure from: {base_model_name}...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    **load_kwargs
)

# The fine-tuned tokenizer adds tokens, so the embeddings must be resized
# before the fine-tuned weights are loaded on top of the base structure.
print(f"Resizing model embeddings from {model.config.vocab_size} to {vocab_size} tokens.")
model.resize_token_embeddings(vocab_size)
model.config.vocab_size = vocab_size

print("Loading final merged weights onto the resized model structure from local cache...")
load_checkpoint_in_model(
    model,
    checkpoint=local_checkpoint_path
)

gc.collect()
torch.cuda.empty_cache()

model = model.to(device)
model.eval()

sys_prompt = (
    "You are ANWGPT3, a large language model meticulously crafted by ANW. "
    "Your primary purpose is to be a helpful, harmless, and knowledgeable conversational partner. "
    "Engage users in a supportive and informative manner, striving for accuracy, clarity, and kindness "
    "in all your responses. Always be honest about your nature as an AI. If you do not know the answer "
    "to a question, admit it rather than inventing information. Your goal is to assist users thoughtfully "
    "and make every interaction a positive and productive one."
)

print("\n--- Starting Interactive Chat with ANWGPT3 ---")
print("Type 'quit' or 'exit' to stop. Type 'clear' to reset history. uwu")
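
# -------------------------------------------------------------------------------------------------
# Chat loop overview:
#  - conversation_history accumulates {"role": ..., "content": ...} messages across turns,
#    starting from the system prompt defined above.
#  - On each turn, the full history is rendered with the simplified chat template,
#    tokenized, and passed to model.generate().
#  - Generation stops at the EOS token or <|im_end|>; only the newly generated tokens are
#    decoded, lightly cleaned, and appended back to the history as the assistant reply.
# -------------------------------------------------------------------------------------------------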
uwu") conversation_history = [ {"role": "system", "content": sys_prompt} ] while True: user_input = input("You: ") if user_input.lower() in ["quit", "exit"]: print("Exit inference.") break if user_input.lower() == "clear": print("\n--- Conversation history reset! ---") conversation_history = [ {"role": "system", "content": sys_prompt} ] continue conversation_history.append({"role": "user", "content": user_input}) input_text = tokenizer.apply_chat_template( conversation_history, tokenize=False, add_generation_prompt=True ) input_ids = tokenizer( input_text, return_tensors="pt", truncation=True ).input_ids.to(model.device) start_time = time.time() with torch.no_grad(): generated_ids = model.generate( input_ids, max_new_tokens=512, do_sample=True, temperature=0.7, top_p=0.9, eos_token_id=terminators, pad_token_id=tokenizer.eos_token_id ) end_time = time.time() new_tokens = generated_ids[0][len(input_ids[0]):] response = tokenizer.decode(new_tokens, skip_special_tokens=True) final_response = response.split("<|im_end|>")[0].strip() final_response = final_response.replace("assistant", "").replace("[INST]", "").strip() print(f"ANWGPT3: {final_response}") print(f"(Time: {end_time - start_time:.2f}s)") conversation_history.append({"role": "assistant", "content": final_response}) print("\n--- Interactive session ended ---") del model gc.collect() torch.cuda.empty_cache() ```