# Trouter-20B Usage Guide

## Installation

```bash
pip install transformers torch accelerate bitsandbytes
```

## Quick Start

### Basic Text Generation

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model and tokenizer
model_name = "your-username/Trouter-20B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Generate text
prompt = "Explain quantum computing in simple terms:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.95,
    do_sample=True
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
```

### Chat Interface

```python
def chat(messages, max_new_tokens=512):
    """
    Chat with the model using a conversation history.

    Args:
        messages: List of dicts with 'role' and 'content' keys
        max_new_tokens: Maximum tokens to generate

    Example:
        messages = [
            {"role": "user", "content": "What is machine learning?"}
        ]
    """
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    # Decode only the newly generated tokens, not the prompt
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return response

# Example usage
conversation = [
    {"role": "user", "content": "Hello! Can you help me with Python?"}
]
response = chat(conversation)
print(response)

# Continue the conversation
conversation.append({"role": "assistant", "content": response})
conversation.append({"role": "user", "content": "Show me how to read a CSV file."})
response = chat(conversation)
print(response)
```

## Memory-Efficient Loading (8-bit Quantization)

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "your-username/Trouter-20B"

# Load in 8-bit for reduced memory usage
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
```

## 4-bit Quantization (Even Lower Memory)

```python
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_name = "your-username/Trouter-20B"

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
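
To verify how much GPU memory the quantized weights actually occupy, you can check the model's footprint. A minimal sketch using the `get_memory_footprint()` helper that `transformers` models expose:

```python
# Report the memory occupied by the loaded (quantized) weights, in GB
footprint_gb = model.get_memory_footprint() / 1024**3
print(f"Model weight footprint: {footprint_gb:.1f} GB")
```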

## Advanced Usage

### Batch Generation

```python
# Decoder-only models need a pad token and left padding for batched generation
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

prompts = [
    "Write a poem about AI:",
    "Explain neural networks:",
    "What is reinforcement learning?"
]

inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=128,
    temperature=0.8,
    top_p=0.95,
    num_return_sequences=1,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

responses = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for prompt, response in zip(prompts, responses):
    print(f"Prompt: {prompt}")
    print(f"Response: {response}\n")
```

### Streaming Generation

```python
from transformers import TextIteratorStreamer
from threading import Thread

streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

prompt = "Write a story about a robot:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

generation_kwargs = {
    **inputs,
    "max_new_tokens": 256,
    "temperature": 0.7,
    "do_sample": True,
    "streamer": streamer
}

thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

print("Generated text: ", end="")
for new_text in streamer:
    print(new_text, end="", flush=True)
print()
```

### Custom Generation Parameters

```python
# Creative generation (sampling)
creative_output = model.generate(
    **inputs,
    max_new_tokens=256,
    temperature=1.0,  # Higher = more creative
    top_p=0.95,
    top_k=50,
    repetition_penalty=1.2,
    do_sample=True
)

# Deterministic generation (temperature is ignored when do_sample=False)
deterministic_output = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=False,
    num_beams=4  # Beam search for quality
)
```

## Fine-tuning

### Using PEFT (Parameter-Efficient Fine-Tuning)

```python
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer

# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir="./trouter-finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    fp16=True
)

# Train (train_dataset is a tokenized dataset you have prepared beforehand)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)
trainer.train()
```
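
After training, only the LoRA adapter weights need to be saved; they can later be attached to a freshly loaded base model. A minimal sketch using PEFT's `save_pretrained` / `PeftModel.from_pretrained` (the adapter directory name here is an arbitrary example):

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM
import torch

# Save just the LoRA adapter weights (small compared to the base model)
model.save_pretrained("./trouter-finetuned/adapter")

# Later: reload the base model and attach the trained adapter
base_model = AutoModelForCausalLM.from_pretrained(
    "your-username/Trouter-20B",
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
finetuned_model = PeftModel.from_pretrained(base_model, "./trouter-finetuned/adapter")
```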

## Performance Optimization

### GPU Memory Requirements

- Full precision (bfloat16): ~40GB VRAM
- 8-bit quantization: ~20GB VRAM
- 4-bit quantization: ~10GB VRAM

### Recommendations

- Use `device_map="auto"` for automatic multi-GPU distribution
- Enable `torch.compile()` on PyTorch 2.0+ for faster inference (see the sketch after the Flash Attention example below)
- Use Flash Attention 2 if available for better performance

```python
# Enable Flash Attention 2
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2"
)
```
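
The `torch.compile()` recommendation above can be applied to the already-loaded model. A minimal sketch for PyTorch 2.0+; the first generation call pays the compilation cost, and speedups vary by hardware:

```python
import torch

# Compile the model's forward pass; the first call triggers (slow) compilation
model = torch.compile(model)

inputs = tokenizer("Explain quantum computing in simple terms:", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```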

## Troubleshooting

### Out of Memory Errors

- Use quantization (8-bit or 4-bit)
- Reduce `max_new_tokens`
- Decrease batch size
- Enable gradient checkpointing for fine-tuning (see the sketch below)
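
Gradient checkpointing trades extra compute for a much smaller activation memory footprint during fine-tuning. A minimal sketch using the standard `transformers` hooks (either call works; the values shown are illustrative):

```python
from transformers import TrainingArguments

# Recompute activations during the backward pass instead of storing them
model.gradient_checkpointing_enable()

# Or request it via TrainingArguments when using the Trainer
training_args = TrainingArguments(
    output_dir="./trouter-finetuned",
    gradient_checkpointing=True
)
```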

### Slow Generation

- Use a smaller `max_new_tokens`
- Disable `do_sample` for greedy decoding
- Use Flash Attention 2
- Consider model quantization

### Poor Quality Outputs

- Adjust `temperature` (0.7-0.9 recommended)
- Tune `top_p` and `top_k` values
- Add `repetition_penalty` (1.1-1.3)
- Ensure proper prompt formatting

## Community and Support

- Issues: GitHub Issues
- Discussions: Hugging Face Discussions
- Discord: Community Discord

## Citation

If you use Trouter-20B in your research, please cite:

```bibtex
@software{trouter20b2025,
  title={Trouter-20B: A 20 Billion Parameter Language Model},
  author={Your Name},
  year={2025},
  url={https://huggingface.co/your-username/Trouter-20B}
}
```